def main():
    """Run the full award-extraction pipeline over a tweet dump.

    Reads tweets from the module-level ``file`` path, preprocesses them,
    accumulates candidate award names and winners per tweet, then prints a
    human-readable summary via ``output_readable``.

    Relies on module-level collaborators: ``process_data``,
    ``guess_award_names``, ``guess_winners``, ``print_votes``,
    ``output_readable``, and the global ``file``.
    """
    data = process_data.read_file(file)
    print("%%%%%%%%% BEGIN RUN " + str(len(data)) + " %%%%%%%%% " + file)

    data = process_data.preprocess_data(data)
    print("%%%%%%%%% Pre-precessing completed %%%%%%%%% ")

    # Vote accumulators, keyed by category of guess.
    ideas = {
        'winners': {},  # per-award dict of candidate winner names -> votes
        'awards': {},   # candidate award names -> votes
        'links': {},    # candidate winner=award pairings
        'xx': {}
    }
    award_names = [
        'best motion picture',
        'best director',
        'best actor',
        'best actress',
        'best supporting actor',
        'best supporting actress',
        'best screenplay',
        'best original score',
        'best original song',
        'cecil b demille'
    ]
    for award_name in award_names:
        ideas['winners'][award_name] = {}

    # Single pass over the tweets: each guesser mutates/returns the vote
    # tables. NOTE(review): guess_winners returns a replacement 'winners'
    # table rather than mutating in place — presumably intentional; confirm.
    for tw in data:
        guess_award_names(tw, ideas)
        ideas['winners'] = guess_winners(tw, ideas, award_names)

    answers = {'awards': [], 'winners': []}

    # print_votes returns candidates sorted ascending by votes, so the
    # strongest candidates are at the END of the list; take the top ten.
    award_candidates = print_votes(ideas['awards'], detailed=False)
    for i in range(1, 11):
        answers['awards'].append(award_candidates[-i])

    # For each known award, pick the single highest-voted winner candidate.
    for award in award_names:
        candidates = print_votes(ideas['winners'][award], detailed=False)
        if not candidates:
            answers['winners'].append("None Found")
        else:
            answers['winners'].append(candidates[-1])

    output_readable(answers, award_names)
import sys sys.path.append('./') import process_data import reformat_data import find_traffic_speed from datetime import datetime from pathlib import Path from helper.global_var import SAVE_TYPE_PICKLE from helper.graph_reader import graph_reader if __name__ == '__main__': date_str = datetime.today().strftime('%Y%m%d') # date_str = "20220131" data_root = Path(".") / 'data' process_data.preprocess_data(date_str, overwrite=True, min_file_size=10) reformat_data.reformat_by_bus(date_str) reformat_data.sort_reformat_data(date_str) save_filename_list = [ "final_node_table", "final_way_table", "final_relation_table" ] map_dates = graph_reader(Path("graph"), SAVE_TYPE_PICKLE, save_filename_list) final_node_table = map_dates[0] final_way_table = map_dates[1] final_relation_table = map_dates[2] time_slot_intervals = [5, 15] for interval in time_slot_intervals: print("Processing interval:", interval) find_traffic_speed.find_traffic_speed(date_str,
import json import nltk import re import sys import process_data file = sys.argv[1] data = process_data.read_file(file) data = process_data.preprocess_data(data) print("%%%%%%%%% Pre-precessing completed %%%%%%%%% ") while True: query = input("Search tweets using RegEx. To quit, enter 'quit'.>>") if query == "quit": break else: count = 0 for tw in data: ''' for tk in tw['tokens']: if re.match(query, tk): try: print(tw['text'] + "\n") count += 1 except OSError: print("OSError occured") ''' try:
map_dates = graph_reader(Path("graph"), SAVE_TYPE_PICKLE, save_filename_list) final_node_table = map_dates[0] final_way_table = map_dates[1] final_relation_table = map_dates[2] today_str = (datetime.today()).strftime('%Y%m%d') for date_str in tqdm(os.listdir(data_root), unit="folder", position=-1): re_result = re.match(r"[0-9]{8}", str(date_str)) if re_result is not None: if date_str != today_str: # print("python3.6 find_traffic_speed.py {}".format(date_str)) # process_data part process_data.preprocess_data(date_str, overwrite=True, min_file_size=10, archive_after_preprocess=True, skip_if_archived=False) # reformat_data part reformat_data.reformat_by_bus(date_str) reformat_data.sort_reformat_data(date_str) # find_traffic_speed part time_slot_intervals = [5, 15] for interval in time_slot_intervals: find_traffic_speed.find_traffic_speed( date_str, final_node_table, final_way_table, final_relation_table,
from sklearn.preprocessing import MinMaxScaler import process_data as pro from ml_scripts import train_models from pandas.tools.plotting import radviz import matplotlib.pyplot as plt # creating a mongo client object and setting up the db client = MongoClient() db = client.precog # creating a getting a mongo curser with the data we need # PLEASE REPLACE complete_remove WITH THE NAME OF YOUR MONGO COLLECTION dic = db.precog.find() # process the data using preprocess_data in process_data # this function returns a pandas dataframe df = pro.preprocess_data(dic) # now we will normalize the data using mix max scaler # mms = MinMaxScaler() # for i in df.columns: # df[i] = pd.DataFrame(mms.fit_transform(df[i].values)) # df.describe() # train_models(df.drop('likes_count', axis=1), df['likes_count']) def plot_all(df): f, axarr = plt.subplots(13, 2) for i in range(13): axarr[i, 0].scatter(df['likes_count'], df[df.columns[i]]) axarr[i, 0].set_title(df.columns[i]) if i == 0: