def sampling():
    """Sample every input file and write one new pickle per file.

    For each name returned by ``get_file_names()`` the file is loaded with
    ``read_pickle_files``, passed through ``divide`` and the result is
    written back with ``write_new_pickle`` under the same name.

    NOTE(review): this file defines another function named ``sampling``
    further down; that later definition rebinds the name, so this version
    appears to be unreachable after import -- confirm before relying on it.
    """
    for name in get_file_names():
        print("reading file")
        loaded = read_pickle_files(name)

        print("sampling")
        sampled = divide(loaded)

        print("to pickle")
        write_new_pickle(sampled, name)
def sampling_estupido():
    """Build one combined sample from all raw files and pickle it.

    Every name returned by ``get_file_names()`` is read from the ``"raw"``
    location via ``read_pickle``, reduced with ``divide_estupido`` and the
    per-file results are accumulated into a single list, which is finally
    written with ``write_new_pickle`` as ``"lexicon_dataset_smaller"``.
    """
    final = []
    for file in get_file_names():
        print("File: " + file)
        out = read_pickle("raw", file)
        print("sampling")
        # extend() appends in place; the previous `final = final + stupid`
        # re-copied the whole accumulated list on every file.
        final.extend(divide_estupido(out))
    print("to pickle")
    write_new_pickle(final, "lexicon_dataset_smaller")
def sampling():
    """Split every raw file into training/testing parts and pickle both sets.

    Every name returned by ``get_file_names()`` is read from the ``"raw"``
    location via ``read_pickle``; ``divide`` is expected to return a
    ``(training_sample, testing_sample)`` pair for it. The per-file parts
    are accumulated across all files and written with ``write_new_pickle``
    as ``"training"`` and ``"testing"``.

    NOTE(review): an earlier function in this file is also named
    ``sampling``; this later definition is the one the name ends up bound to.
    """
    training = []
    testing = []
    for file in get_file_names():
        print("File: " + file)
        out = read_pickle("raw", file)
        print("sampling")
        training_sample, testing_sample = divide(out)
        # extend() grows the accumulators in place instead of rebuilding
        # them with `+` (a full copy) once per file.
        training.extend(training_sample)
        testing.extend(testing_sample)
    print("to pickle")
    write_new_pickle(training, "training")
    write_new_pickle(testing, "testing")
def prepare_dataframe():
    """Return a table of per-category means for columns 0..9.

    Loads the pickled frame found at ``get_file_path("dataframe.pkl")``,
    then, for every name returned by ``get_file_names()``, selects the rows
    whose ``categories`` value equals that name and stores the mean of each
    of the columns ``0`` .. ``9`` in the result.

    Returns:
        A ``pandas.DataFrame`` indexed by category name with columns 0..9.

    NOTE(review): the result index is a hard-coded list of four category
    names; presumably it matches what ``get_file_names()`` yields -- confirm.
    NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    with open(get_file_path("dataframe.pkl"), 'rb') as f:
        df = pickle.load(f)

    columns = list(range(10))
    new_df = pd.DataFrame(
        index=[
            'reviews_Automotive',
            'reviews_Cell_Phones_and_Accessories',
            'reviews_Video_Games',
            'reviews_Movies_and_TV',
        ],
        columns=columns,
    )

    for file in get_file_names():
        # The row filter depends only on the category, so compute it once
        # per category rather than once per (category, column) pair.
        df_temp = df[df.categories == file]
        for i in columns:
            new_df.at[file, i] = df_temp[i].mean()
    return new_df
def import_and_divide(training_size=70000):
    """Randomly split each sampled file into training and testing records.

    For every name returned by ``get_file_names()`` the pickle
    ``interim\\sample_<name>.pkl`` is loaded, ``training_size`` records are
    drawn without replacement for training and the remaining records go to
    testing. Each record gets a ``'category'`` key set to the file name.
    The combined lists are handed to ``export_training_testing``.

    Args:
        training_size: Number of records drawn per file for training
            (default 70000, the previously hard-coded value).

    Raises:
        Whatever ``choice`` raises when a file holds fewer than
        ``training_size`` records (a ``ValueError`` with numpy's
        ``choice`` -- assumed from the ``size=``/``replace=`` keywords).
    """
    training = []
    testing = []
    for file in get_file_names():
        with open(get_file_path('interim\\sample_' + file + '.pkl'), 'rb') as f:
            lines = pickle.load(f)

        # Sample positions rather than records. The old code sampled the
        # records and called lines.remove(record), which removes the first
        # *equal* dict: with duplicate records the sampled object could stay
        # in `lines` and end up in both training and testing. It also cost
        # a linear scan per sampled record.
        chosen = [
            int(i) for i in choice(len(lines), size=training_size, replace=False)
        ]
        chosen_positions = set(chosen)

        for position in tqdm(chosen):
            record = lines[position]
            record['category'] = file
            training.append(record)

        for position, record in enumerate(lines):
            if position in chosen_positions:
                continue
            record['category'] = file
            testing.append(record)

    export_training_testing(training, testing)
def file_pickling():
    """Import each dataset and export its sampled version.

    Iterates, with a ``tqdm`` progress bar, over the names returned by
    ``get_file_names()``; each one is loaded with ``import_dataset`` and
    passed, together with its name, to ``export_sampled_datasets``.
    """
    for name in tqdm(get_file_names()):
        dataset = import_dataset(name)
        export_sampled_datasets(dataset, name)