def test_read_write_simple_file():
    # set up the json data
    original_recs = 10
    json_filepath = 'data/test_read_write_simple_file.json'  # TODO: assumptions here
    parquet_filepath = 'output/test_read_write_simple_file.parquet'
    recs = generate_data.make_recs(original_recs, 1998, 1, 5, 20)
    generate_data.write_json(recs, json_filepath)

    # process to parquet
    df = json_to_parquet.read_json(json_filepath)
    json_to_parquet.write_parquet(df, parquet_filepath)

    # get counts, compare, and clean up
    parquet_recs = len(pd.read_parquet(parquet_filepath))
    assert parquet_recs == original_recs
    os.remove(json_filepath)
    os.remove(parquet_filepath)
def test_read_write_with_dups():
    # set up the json data
    original_recs = 100
    additional_recs = 20
    json_filepath = 'data/test_read_write_with_dups.json'  # TODO: assumptions here
    parquet_filepath = 'output/test_read_write_with_dups.parquet'
    recs = generate_data.make_recs(original_recs, 2019, 8, 1, 365)
    generate_data.write_json(recs, json_filepath)

    # process to parquet
    df = json_to_parquet.read_json(json_filepath)
    df_clean = json_to_parquet.remove_duplicates(df)
    json_to_parquet.write_parquet(df_clean, parquet_filepath)

    # get counts, compare, and clean up
    parquet_recs = len(pd.read_parquet(parquet_filepath))
    assert parquet_recs == original_recs
    os.remove(json_filepath)
    os.remove(parquet_filepath)
def test_cl_generate_big():
    yyyymmdd = '20190501'
    output_dir = 'data/test1'
    num_recs_per_file = '10000'
    num_files = '1000'
    subprocess.run([
        'python', 'src/generate_data.py',
        output_dir, num_recs_per_file, num_files, yyyymmdd
    ])

    files = os.listdir(output_dir)
    dfs = []
    for f in files:
        dfs.append(json_to_parquet.read_json(os.path.join(output_dir, f)))
        # clean as we go
        os.remove(os.path.join(output_dir, f))
    os.rmdir(output_dir)

    json_df = pd.concat(dfs)
    assert len(json_df) == int(num_recs_per_file) * int(num_files)