def test_write_json():
    """Check that write_json persists every record handed to it.

    Builds ``original_recs`` records with ``generate_data.make_recs``, passes
    them through ``generate_data.add_dups_to_recs`` with ``additional_recs``,
    writes the result to a JSON file, and asserts that the ``'records'`` list
    read back contains ``original_recs + additional_recs`` entries.
    """
    import json
    import os

    original_recs = 100
    additional_recs = 22
    # TODO: assumptions here
    # NOTE(review): presumably the 'data/' directory must already exist
    # relative to the working directory -- confirm.
    filepath = 'data/test_write_json.json'

    recs = generate_data.make_recs(original_recs, 2017, 12, 15, 120)
    all_recs = generate_data.add_dups_to_recs(recs, additional_recs)
    generate_data.write_json(all_recs, filepath)

    try:
        # Context manager closes the handle before the file is deleted.
        with open(filepath) as json_file:
            records_read = json.load(json_file)
    finally:
        # Remove the artifact even when reading/parsing fails.
        os.remove(filepath)

    assert len(records_read['records']) == original_recs + additional_recs
def test_read_write_simple_file():
    """Round-trip generated records from JSON to parquet and compare counts.

    Writes ``original_recs`` generated records to a JSON file, converts that
    file to parquet via ``json_to_parquet.read_json`` / ``write_parquet``, and
    asserts that the parquet file read back with pandas has ``original_recs``
    rows.
    """
    # set up the json data
    original_recs = 10
    # TODO: assumptions here
    # NOTE(review): presumably the 'data/' and 'output/' directories must
    # already exist relative to the working directory -- confirm.
    json_filepath = 'data/test_read_write_simple_file.json'
    parquet_filepath = 'output/test_read_write_simple_file.parquet'

    try:
        recs = generate_data.make_recs(original_recs, 1998, 1, 5, 20)
        generate_data.write_json(recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        json_to_parquet.write_parquet(df, parquet_filepath)

        # get counts
        parquet_recs = len(pd.read_parquet(parquet_filepath))
    finally:
        # Clean up whichever artifacts were created, even if a step above
        # raised, so a failure does not leave stale files behind.
        for path in (json_filepath, parquet_filepath):
            if os.path.exists(path):
                os.remove(path)

    assert parquet_recs == original_recs
def test_read_write_with_dups():
    """Check that the JSON-to-parquet pipeline drops injected duplicates.

    Generates ``original_recs`` records, passes them through
    ``generate_data.add_dups_to_recs`` with ``additional_recs``, writes the
    result to JSON, then reads it back, applies
    ``json_to_parquet.remove_duplicates`` and writes parquet. The parquet row
    count is asserted to equal ``original_recs``.
    """
    # set up the json data
    original_recs = 100
    additional_recs = 20
    # TODO: assumptions here
    # NOTE(review): presumably the 'data/' and 'output/' directories must
    # already exist relative to the working directory -- confirm.
    json_filepath = 'data/test_read_write_with_dups.json'
    parquet_filepath = 'output/test_read_write_with_dups.parquet'

    try:
        recs = generate_data.make_recs(original_recs, 2019, 8, 1, 365)
        # Inject the extra records so remove_duplicates has something to drop.
        # NOTE(review): assumes remove_duplicates treats the records added by
        # add_dups_to_recs as duplicates -- confirm against both helpers.
        all_recs = generate_data.add_dups_to_recs(recs, additional_recs)
        generate_data.write_json(all_recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        df_clean = json_to_parquet.remove_duplicates(df)
        json_to_parquet.write_parquet(df_clean, parquet_filepath)

        # get counts
        parquet_recs = len(pd.read_parquet(parquet_filepath))
    finally:
        # Clean up whichever artifacts were created, even if a step above
        # raised, so a failure does not leave stale files behind.
        for path in (json_filepath, parquet_filepath):
            if os.path.exists(path):
                os.remove(path)

    assert parquet_recs == original_recs
def test_add_dups_to_recs():
    """The list returned by add_dups_to_recs grows by the requested count."""
    base_count, dup_count = 200, 20
    expected_total = base_count + dup_count

    generated = generate_data.make_recs(base_count, 2017, 12, 1, 90)
    with_dups = generate_data.add_dups_to_recs(generated, dup_count)

    assert len(with_dups) == expected_total
def test_make_recs():
    """make_recs returns exactly as many records as were requested."""
    expected_count = 10

    generated = generate_data.make_recs(expected_count, 2019, 5, 1, 30)
    actual_count = len(generated)

    assert actual_count == expected_count