コード例 #1
0
def sampling():
    """Read, divide, and re-pickle each file listed by ``get_file_names``.

    Every name is loaded through ``read_pickle_files``, the loaded data is
    handed to ``divide``, and whatever ``divide`` returns is passed to
    ``write_new_pickle`` together with the same name. A short progress
    message is printed before each of the three steps.
    """
    for name in get_file_names():
        print("reading file")
        loaded = read_pickle_files(name)

        print("sampling")
        divided = divide(loaded)

        print("to pickle")
        write_new_pickle(divided, name)
def sampling_estupido():
    """Merge the ``divide_estupido`` sample of every raw file into one pickle.

    Each name from ``get_file_names()`` is loaded with
    ``read_pickle("raw", name)`` and passed to ``divide_estupido``; the
    items it returns are accumulated in a single list, which is finally
    handed to ``write_new_pickle`` under the name
    ``"lexicon_dataset_smaller"``.
    """
    files = get_file_names()
    final = []
    for file in files:
        print("File: " + file)
        out = read_pickle("raw", file)
        print("sampling")
        # extend() grows the accumulator in place instead of re-copying
        # everything collected so far on each iteration.
        final.extend(divide_estupido(out))

    # Pickling happens exactly once, after all files are sampled, so the
    # progress message is emitted here rather than once per file.
    print("to pickle")
    write_new_pickle(final, "lexicon_dataset_smaller")
def sampling():
    """Build combined training and testing pickles from all raw files.

    Each name from ``get_file_names()`` is loaded with
    ``read_pickle("raw", name)`` and passed to ``divide``, which is
    unpacked as a ``(training_sample, testing_sample)`` pair. The samples
    of all files are accumulated and written with ``write_new_pickle``
    under the names ``"training"`` and ``"testing"``.

    NOTE(review): an earlier function in this listing is also called
    ``sampling``; if both live in the same module, this later definition
    replaces it. Confirm which one callers expect.
    """
    files = get_file_names()
    training = []
    testing = []
    for file in files:
        print("File: " + file)
        out = read_pickle("raw", file)
        print("sampling")
        training_sample, testing_sample = divide(out)
        # extend() grows the accumulators in place instead of re-copying
        # everything collected so far on each iteration.
        training.extend(training_sample)
        testing.extend(testing_sample)

    # Pickling happens once, after every file has been sampled, so the
    # progress message is emitted here rather than once per file.
    print("to pickle")
    write_new_pickle(training, "training")
    write_new_pickle(testing, "testing")
コード例 #4
0
def prepare_dataframe():
    """Summarise the pickled dataframe as per-category column means.

    Loads the object pickled at ``get_file_path("dataframe.pkl")`` and, for
    every name returned by ``get_file_names()``, takes the rows whose
    ``categories`` value equals that name and stores the mean of each of
    the columns ``0`` .. ``9`` in the result.

    Returns:
        A ``pd.DataFrame`` whose columns are ``0`` .. ``9`` and whose index
        starts as the four review-category names listed below. Cells that
        are never assigned stay empty (NaN).

    NOTE(review): the index is hard-coded while the loop iterates
    ``get_file_names()``; presumably both list the same four categories --
    confirm, since assigning with ``.at`` for an unknown label would add a
    new row.
    """
    with open(get_file_path("dataframe.pkl"), 'rb') as f:
        # NOTE(review): unpickling runs code embedded in the file; this is
        # only safe if dataframe.pkl is produced by this project.
        df = pickle.load(f)

    columns = list(range(10))
    new_df = pd.DataFrame(
        index=[
            'reviews_Automotive',
            'reviews_Cell_Phones_and_Accessories',
            'reviews_Video_Games',
            'reviews_Movies_and_TV',
        ],
        columns=columns,
    )

    for file in get_file_names():
        # Filter once per category; the row mask does not depend on the
        # column, so there is no need to rebuild it for each of them.
        df_temp = df[df.categories == file]
        for i in columns:
            new_df.at[file, i] = df_temp[i].mean()

    return new_df
コード例 #5
0
def import_and_divide(training_size=70000):
    """Split each sampled pickle into training and testing records.

    For every name from ``get_file_names()`` the list pickled in the
    ``interim`` sample file for that name is loaded, ``training_size``
    records are drawn without replacement for training, and all remaining
    records go to testing. Each record gets its ``'category'`` key set to
    the file name. The two combined lists are finally passed to
    ``export_training_testing``.

    Args:
        training_size: Number of records drawn per file for the training
            set. Defaults to 70000, the value previously hard-coded.

    NOTE(review): ``choice`` is presumably ``numpy.random.choice`` (it is
    called with ``size=`` and ``replace=``); if so it raises ``ValueError``
    when a file holds fewer than ``training_size`` records -- confirm.
    NOTE(review): the path fragment uses a backslash separator, which looks
    Windows-specific; left unchanged because ``get_file_path`` is not
    visible here.
    """
    files = get_file_names()
    training = []
    testing = []
    for file in files:
        with open(get_file_path('interim\\sample_' + file + '.pkl'), 'rb') as f:
            lines = pickle.load(f)

        # Draw positions instead of records. Removing sampled records with
        # list.remove() scans the list for every draw and matches by
        # equality, so a different-but-equal record could be dropped while
        # the sampled one stayed behind and ended up in both sets.
        sampled_positions = choice(len(lines), size=training_size, replace=False)
        sampled_set = {int(position) for position in sampled_positions}

        # Training keeps the order in which the positions were drawn.
        for position in tqdm(sampled_positions):
            record = lines[position]
            record['category'] = file
            training.append(record)

        # Everything not drawn goes to testing, in its original order.
        for position, record in enumerate(lines):
            if position not in sampled_set:
                record['category'] = file
                testing.append(record)

    export_training_testing(training, testing)
コード例 #6
0
def file_pickling():
    """Import every dataset listed by ``get_file_names`` and export it.

    Each name is passed to ``import_dataset`` and the result is handed to
    ``export_sampled_datasets`` together with that name. The iteration is
    wrapped in ``tqdm``.
    """
    for name in tqdm(get_file_names()):
        dataset = import_dataset(name)
        export_sampled_datasets(dataset, name)