Example #1
def main():
    # In the full script (see Example #3), `file` is the tweet-dump path read
    # from the command line and process_data is imported at module level.
    data = process_data.read_file(file)
    print("%%%%%%%%% BEGIN RUN " + str(len(data)) + " %%%%%%%%% " + file)

    if False:  # disabled debug dump of the raw tweets
        print_tw(data)

    data = process_data.preprocess_data(data)
    print("%%%%%%%%% Pre-precessing completed %%%%%%%%% ")
    out = {}
    ideas = {
        'winners': {},  # Vote tallies for candidate winner names, keyed by award
        'awards': {},   # Vote tallies for candidate award names
        'links': {},    # Vote tallies for candidate winner/award pairings
        'xx': {}        # Unused in this excerpt
    }

    award_names = [
        'best motion picture', 'best director', 'best actor', 'best actress',
        'best supporting actor', 'best supporting actress', 'best screenplay',
        'best original score', 'best original song', 'cecil b demille'
    ]

    for award_name in award_names:
        ideas['winners'][award_name] = {}

    for tw in data:
        # eval_tw(tw, ideas)
        guess_award_names(tw, ideas)
        ideas['winners'] = guess_winners(tw, ideas, award_names)

    answers = {}
    answers['awards'] = []
    answers['winners'] = []

    # print_votes is assumed to return candidates ordered from fewest to most
    # votes, so the last ten entries are the ten strongest award-name guesses
    award_candidates = print_votes(ideas['awards'], detailed=False)
    for i in range(1, 11):
        answers['awards'].append(award_candidates[-i])

    for award in award_names:
        candidates = print_votes(ideas['winners'][award], detailed=False)
        if len(candidates) < 1:
            answers['winners'].append("None Found")
        else:
            answers['winners'].append(candidates[-1])

    output_readable(answers, award_names)
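
# guess_award_names, guess_winners, print_votes and output_readable are
# defined elsewhere in this project and are not part of this excerpt.
# A minimal sketch of print_votes, assuming it tallies votes in a dict and
# returns the candidate names sorted by ascending vote count (which is why
# main() reads its answers from the end of the returned list):
def print_votes(votes, detailed=False):
    ranked = sorted(votes, key=votes.get)  # fewest votes first
    if detailed:
        for name in ranked:
            print(name, votes[name])
    return ranked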
Example #2
import sys
sys.path.append('./')
import process_data
import reformat_data
import find_traffic_speed
from datetime import datetime
from pathlib import Path
from helper.global_var import SAVE_TYPE_PICKLE
from helper.graph_reader import graph_reader

if __name__ == '__main__':
    date_str = datetime.today().strftime('%Y%m%d')
    # date_str = "20220131"
    data_root = Path(".") / 'data'
    process_data.preprocess_data(date_str, overwrite=True, min_file_size=10)
    reformat_data.reformat_by_bus(date_str)
    reformat_data.sort_reformat_data(date_str)

    save_filename_list = [
        "final_node_table", "final_way_table", "final_relation_table"
    ]
    map_dates = graph_reader(Path("graph"), SAVE_TYPE_PICKLE,
                             save_filename_list)
    final_node_table = map_dates[0]
    final_way_table = map_dates[1]
    final_relation_table = map_dates[2]

    time_slot_intervals = [5, 15]
    for interval in time_slot_intervals:
        print("Processing interval:", interval)
        # remaining arguments follow the same call shown in Example #4; the
        # final interval argument is assumed
        find_traffic_speed.find_traffic_speed(date_str,
                                              final_node_table,
                                              final_way_table,
                                              final_relation_table,
                                              interval)
Example #3
import json
import nltk
import re
import sys

import process_data

file = sys.argv[1]
data = process_data.read_file(file)

data = process_data.preprocess_data(data)
print("%%%%%%%%% Pre-precessing completed %%%%%%%%% ")

while True:
    query = input("Search tweets using RegEx. To quit, enter 'quit'.>>")
    if query == "quit":
        break

    else:
        count = 0
        for tw in data:
            '''
            for tk in tw['tokens']:
                if re.match(query, tk):
                    try:
                        print(tw['text'] + "\n")
                        count += 1
                    except OSError:
                        print("OSError occured")
            '''
            try:
                # Reconstructed continuation: matching the query against the
                # full tweet text is an assumption (the disabled block above
                # matches token by token with re.match, which only anchors at
                # the start of a string)
                if re.search(query, tw['text']):
                    print(tw['text'] + "\n")
                    count += 1
            except OSError:
                print("OSError occurred")
        print("Matched " + str(count) + " tweets")
Example #4
    map_dates = graph_reader(Path("graph"), SAVE_TYPE_PICKLE,
                             save_filename_list)
    final_node_table = map_dates[0]
    final_way_table = map_dates[1]
    final_relation_table = map_dates[2]

    today_str = (datetime.today()).strftime('%Y%m%d')
    for date_str in tqdm(os.listdir(data_root), unit="folder", position=-1):
        # only handle folders whose names are 8-digit dates, and skip today's
        # folder
        re_result = re.match(r"[0-9]{8}", str(date_str))
        if re_result is not None:
            if date_str != today_str:
                # print("python3.6 find_traffic_speed.py {}".format(date_str))
                # process_data part
                process_data.preprocess_data(date_str,
                                             overwrite=True,
                                             min_file_size=10,
                                             archive_after_preprocess=True,
                                             skip_if_archived=False)

                # reformat_data part
                reformat_data.reformat_by_bus(date_str)
                reformat_data.sort_reformat_data(date_str)

                # find_traffic_speed part
                time_slot_intervals = [5, 15]
                for interval in time_slot_intervals:
                    find_traffic_speed.find_traffic_speed(
                        date_str,
                        final_node_table,
                        final_way_table,
                        final_relation_table,
                        interval)  # interval is assumed as the remaining argument
Example #5
from pymongo import MongoClient
from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import radviz
import pandas as pd
import process_data as pro
from ml_scripts import train_models
import matplotlib.pyplot as plt
# creating a mongo client object and setting up the db
client = MongoClient()
db = client.precog

# get a Mongo cursor over the documents we need
# PLEASE REPLACE THE COLLECTION NAME USED BELOW ('precog') WITH THE NAME OF
# YOUR MONGO COLLECTION
dic = db.precog.find()

# process the data using preprocess_data in process_data
# this function returns a pandas dataframe
df = pro.preprocess_data(dic)
# now we will normalize the data using a min-max scaler
# mms = MinMaxScaler()
# for i in df.columns:
#     df[i] = pd.DataFrame(mms.fit_transform(df[i].values))
# df.describe()
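
# A minimal sketch of the min-max normalization described above, assuming all
# feature columns in df are numeric (this helper is not part of the original
# script):
def min_max_normalize(frame):
    mms = MinMaxScaler()
    # fit_transform expects a 2-D array, so scale the whole frame at once and
    # rebuild a dataframe with the original column names and index
    return pd.DataFrame(mms.fit_transform(frame.values),
                        columns=frame.columns, index=frame.index)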

# train_models(df.drop('likes_count', axis=1), df['likes_count'])


def plot_all(df):
    # scatter likes_count against each of the first 13 feature columns in a
    # 13 x 2 grid of subplots
    f, axarr = plt.subplots(13, 2)
    for i in range(13):
        axarr[i, 0].scatter(df['likes_count'], df[df.columns[i]])
        axarr[i, 0].set_title(df.columns[i])
        if i == 0: