def test_write_json():
    """Write generated records (with duplicates) to JSON and check the count read back.

    Builds ``original_recs`` records, appends ``additional_recs`` duplicates,
    writes them with ``generate_data.write_json`` and asserts that the
    ``'records'`` list loaded from the file has ``original_recs +
    additional_recs`` entries. The temporary file is removed even if reading
    it back fails.
    """
    import json
    import os

    original_recs = 100
    additional_recs = 22
    # TODO: assumptions here
    # NOTE(review): relative path -- presumably needs a 'data/' directory under
    # the current working directory; confirm whether write_json creates it.
    filepath = 'data/test_write_json.json'
    recs = generate_data.make_recs(original_recs, 2017, 12, 15, 120)
    all_recs = generate_data.add_dups_to_recs(recs, additional_recs)
    generate_data.write_json(all_recs, filepath)
    try:
        # Close the handle deterministically before the file is deleted.
        with open(filepath) as json_file:
            records_read = json.load(json_file)
    finally:
        # Clean up regardless of whether the load succeeded.
        os.remove(filepath)
    assert len(records_read['records']) == original_recs + additional_recs
# Example #2
# 0
def test_read_write_simple_file():
    """Round-trip generated records through JSON and parquet and compare counts.

    Generates ``original_recs`` records, writes them to JSON, reads the JSON
    with ``json_to_parquet.read_json``, writes the result to parquet, and
    asserts the parquet file holds ``original_recs`` rows. Both temporary
    files are removed even when a step or the final comparison fails.
    """
    # Local import keeps the cleanup independent of module-level imports.
    import os

    # set up the json data
    original_recs = 10
    # TODO: assumptions here
    # NOTE(review): relative paths -- presumably 'data/' and 'output/' must
    # exist under the current working directory; confirm.
    json_filepath = 'data/test_read_write_simple_file.json'
    parquet_filepath = 'output/test_read_write_simple_file.parquet'
    recs = generate_data.make_recs(original_recs, 1998, 1, 5, 20)

    try:
        generate_data.write_json(recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        json_to_parquet.write_parquet(df, parquet_filepath)

        # get counts
        parquet_recs = len(pd.read_parquet(parquet_filepath))
    finally:
        # Clean up whatever was created, even if a step above raised.
        for path in (json_filepath, parquet_filepath):
            if os.path.exists(path):
                os.remove(path)

    assert parquet_recs == original_recs
# Example #3
# 0
def test_read_write_with_dups():
    """Check that de-duplication restores the original record count.

    Generates ``original_recs`` records, appends ``additional_recs``
    duplicates, writes them to JSON, then reads, de-duplicates and writes
    parquet. Asserts the parquet file holds ``original_recs`` rows. Both
    temporary files are removed even when a step or the comparison fails.

    NOTE(review): assumes ``make_recs`` yields records that are unique to
    ``remove_duplicates`` and that the rows added by ``add_dups_to_recs`` are
    exactly what it drops -- confirm against those helpers.
    """
    # Local import keeps the cleanup independent of module-level imports.
    import os

    # set up the json data
    original_recs = 100
    additional_recs = 20
    # TODO: assumptions here
    # NOTE(review): relative paths -- presumably 'data/' and 'output/' must
    # exist under the current working directory; confirm.
    json_filepath = 'data/test_read_write_with_dups.json'
    parquet_filepath = 'output/test_read_write_with_dups.parquet'
    recs = generate_data.make_recs(original_recs, 2019, 8, 1, 365)
    # Actually inject the duplicates; previously additional_recs was unused
    # and remove_duplicates was only ever run on duplicate-free input.
    all_recs = generate_data.add_dups_to_recs(recs, additional_recs)

    try:
        generate_data.write_json(all_recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        df_clean = json_to_parquet.remove_duplicates(df)
        json_to_parquet.write_parquet(df_clean, parquet_filepath)

        # get counts
        parquet_recs = len(pd.read_parquet(parquet_filepath))
    finally:
        # Clean up whatever was created, even if a step above raised.
        for path in (json_filepath, parquet_filepath):
            if os.path.exists(path):
                os.remove(path)

    assert parquet_recs == original_recs
def test_add_dups_to_recs():
    """Check that add_dups_to_recs returns the base records plus the requested extras.

    Generates 200 records, asks for 20 duplicates, and asserts the returned
    collection has 220 entries.
    """
    base_count, dup_count = 200, 20
    expected_total = base_count + dup_count

    seed_recs = generate_data.make_recs(base_count, 2017, 12, 1, 90)
    with_dups = generate_data.add_dups_to_recs(seed_recs, dup_count)

    assert len(with_dups) == expected_total
def test_make_recs():
    """Check that make_recs returns as many records as were requested."""
    requested = 10
    generated = generate_data.make_recs(requested, 2019, 5, 1, 30)
    produced = len(generated)
    assert produced == requested