def save_csv(save_dir):
    """Build a sink that persists a (train, test) split as CSV files.

    Returns a one-argument callable suitable as the last stage of a
    pipeline: it expects ``((train_x, train_y), (test_x, test_y))`` and
    writes ``train.csv`` and ``test.csv`` under ``save_dir`` via
    ``make_csv``. The callable returns ``None`` (pure side effect).
    """
    def write_splits(split):
        train_pair, test_pair = split
        # Same make_csv call for both halves; only the target filename differs.
        for (features, labels), fname in ((train_pair, 'train.csv'),
                                          (test_pair, 'test.csv')):
            make_csv(features, labels, filename=fname, data_dir=save_dir)
    return write_splits


# Pure functional part: the transformation pipeline, free of I/O.
# Stage order matters — each stage feeds the next.
program = pipe([
    make_class,
    make_category,
    create_text_column,
    train_test_dataframe(1),  # curried with random seed 1
    create_features_df,
    split_data,
])


# Dirty IO part: load from disk, run the pure pipeline, write results.
def main(data_dir, save_dir):
    """Run the full job: read raw data from ``data_dir``, transform it,
    and write train/test CSVs into ``save_dir``."""
    workflow = pipe([load_df, program, save_csv(save_dir)])
    return workflow(data_dir)


if __name__ == '__main__':
    # Directories come from the environment; either may be None if unset.
    data_dir = os.environ.get('DATA_DIR')
    save_dir = os.environ.get('SAVE_DIR')
    main(data_dir, save_dir)
text_df = helpers.create_text_column(transformed_df) # Check out the processed text for a single file, by row index row_idx = 0 sample_text = text_df.iloc[0]['Text'] # TESTING: Print out the sample processed text # print('Sample processed text:\n', sample_text) # STRATIFIED SAMPLING random_seed = 1 # Create new dataframe with Datatype (train, test, orig) column # Pass `text_df` from above to create a complete dataframe, with all the information you need complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed) # TESTING: Check results of the complete dataframe # print(complete_df.head(10)) # CONTAINMENT CALCULATION n = 3 # Select a value for n test_indices = range(5) # Indices for first few files # Iterate through files and calculate containment category_vals = [] containment_vals = [] for i in test_indices: # Get level of plagiarism for a given file index category_vals.append(complete_df.loc[i, 'Category'])