def save_csv(save_dir):
    """Return a sink callback that persists a train/test split as CSV files.

    The returned function expects ``((train_x, train_y), (test_x, test_y))``
    and writes ``train.csv`` and ``test.csv`` into *save_dir* via ``make_csv``.
    """
    def _write_split(split):
        # Unpack the two halves first, then emit one CSV per half.
        train_pair, test_pair = split
        make_csv(train_pair[0], train_pair[1], filename='train.csv', data_dir=save_dir)
        make_csv(test_pair[0], test_pair[1], filename='test.csv', data_dir=save_dir)

    return _write_split


# Pure functional part
# `program` composes the transformation stages into one callable via `pipe`.
# NOTE(review): `pipe` and the stage functions are defined elsewhere in the
# project; presumably `pipe([f, g, ...])(x)` applies the stages in order — confirm.
program = pipe([
    make_class,
    make_category,
    create_text_column,
    train_test_dataframe(1),  # presumably partially applied with seed/split id 1 — TODO confirm
    create_features_df,
    split_data,
])


# Dirty IO part
def main(data_dir, save_dir):
    """Run the full workflow: load raw data, apply `program`, write CSVs to *save_dir*."""
    workflow = pipe([load_df, program, save_csv(save_dir)])
    return workflow(data_dir)


if __name__ == '__main__':
    # Fix: `os` was used here but never imported anywhere in the file;
    # a guard-local import keeps the module importable without side effects.
    import os

    data_dir = os.environ.get('DATA_DIR')
    save_dir = os.environ.get('SAVE_DIR')
    # Fail fast with a clear message instead of passing None into main(),
    # which would only surface as an obscure error deep in the pipeline.
    if data_dir is None or save_dir is None:
        raise SystemExit('DATA_DIR and SAVE_DIR environment variables must be set')
    main(data_dir, save_dir)
# NOTE(review): this section references `helpers` and `transformed_df`, neither of
# which is defined above — it appears to be notebook code appended after the script
# and will also run on plain import since it sits below the __main__ guard. Confirm intent.
text_df = helpers.create_text_column(transformed_df)

# Check out the processed text for a single file, by row index
row_idx = 0
# Fix: use `row_idx` instead of a hard-coded 0 so changing the index above takes effect.
sample_text = text_df.iloc[row_idx]['Text']

# TESTING: Print out the sample processed text
# print('Sample processed text:\n', sample_text)


# STRATIFIED SAMPLING
# Fixed seed so the train/test assignment is reproducible across runs.
random_seed = 1

# Create new dataframe with Datatype (train, test, orig) column
# Pass `text_df` from above to create a complete dataframe, with all the information you need
# NOTE(review): `helpers.train_test_dataframe` is defined elsewhere — presumably it
# mirrors the `train_test_dataframe` stage used in `program` above; confirm.
complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)

# TESTING: Check results of the complete dataframe
# print(complete_df.head(10))

# CONTAINMENT CALCULATION
n = 3   # Select a value for n — presumably the n-gram size for containment; TODO confirm
test_indices = range(5)   # Indices for first few files

# Iterate through files and calculate containment
# Accumulators populated by the loop below (the loop body continues past this excerpt).
category_vals = []
containment_vals = []
for i in test_indices:
    # Get level of plagiarism for a given file index
    category_vals.append(complete_df.loc[i, 'Category'])