Exemple #1
0
def test_read_write_simple_file():
    """Round-trip generated records through JSON and parquet, comparing counts.

    Writes ``original_recs`` generated records to a JSON file, converts that
    file to parquet with ``json_to_parquet``, and asserts that the parquet
    file contains the same number of rows. Both scratch files are removed
    afterwards whether or not the test passes.
    """
    # set up the json data
    original_recs = 10
    # TODO: assumptions here -- paths are relative to the working directory;
    # presumably 'data/' and 'output/' must already exist (confirm).
    json_filepath = 'data/test_read_write_simple_file.json'
    parquet_filepath = 'output/test_read_write_simple_file.parquet'

    try:
        # Trailing arguments presumably describe a start date and a day span;
        # confirm against generate_data.make_recs.
        recs = generate_data.make_recs(original_recs, 1998, 1, 5, 20)
        generate_data.write_json(recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        json_to_parquet.write_parquet(df, parquet_filepath)

        # get counts and compare
        parquet_recs = len(pd.read_parquet(parquet_filepath))

        assert parquet_recs == original_recs
    finally:
        # Clean up even when a step above fails, so a broken run does not
        # leave stale files that could affect later runs.
        for path in (json_filepath, parquet_filepath):
            try:
                os.remove(path)
            except FileNotFoundError:
                # The file was never created because an earlier step failed;
                # let that original error propagate instead of this one.
                pass
Exemple #2
0
def test_read_write_with_dups():
    """Run generated records through the dedup step and compare row counts.

    Writes ``original_recs`` generated records to a JSON file, reads them
    back, passes them through ``json_to_parquet.remove_duplicates``, writes
    the result to parquet, and asserts the parquet row count equals
    ``original_recs``. Both scratch files are removed afterwards whether or
    not the test passes.

    NOTE(review): no duplicate records are injected before the dedup step, so
    this only checks that ``remove_duplicates`` does not drop rows from the
    generated data. An earlier version declared an unused
    ``additional_recs = 20``, which suggests the intent was to append 20
    duplicated records before writing the JSON file.
    TODO: add duplicates once the return type of ``make_recs`` is confirmed.
    """
    # set up the json data
    original_recs = 100
    # TODO: assumptions here -- paths are relative to the working directory;
    # presumably 'data/' and 'output/' must already exist (confirm).
    json_filepath = 'data/test_read_write_with_dups.json'
    parquet_filepath = 'output/test_read_write_with_dups.parquet'

    try:
        # Trailing arguments presumably describe a start date and a day span;
        # confirm against generate_data.make_recs.
        recs = generate_data.make_recs(original_recs, 2019, 8, 1, 365)
        generate_data.write_json(recs, json_filepath)

        # process to parquet
        df = json_to_parquet.read_json(json_filepath)
        df_clean = json_to_parquet.remove_duplicates(df)
        json_to_parquet.write_parquet(df_clean, parquet_filepath)

        # get counts and compare
        parquet_recs = len(pd.read_parquet(parquet_filepath))

        assert parquet_recs == original_recs
    finally:
        # Clean up even when a step above fails, so a broken run does not
        # leave stale files that could affect later runs.
        for path in (json_filepath, parquet_filepath):
            try:
                os.remove(path)
            except FileNotFoundError:
                # The file was never created because an earlier step failed;
                # let that original error propagate instead of this one.
                pass
Exemple #3
0
def test_cl_generate_big():
    """Run the generate_data command-line script and verify the record count.

    Invokes ``src/generate_data.py`` as a subprocess with an output directory,
    records-per-file, file count and a ``yyyymmdd`` string, then reads every
    file in the output directory with ``json_to_parquet.read_json`` and
    asserts that the total number of rows equals
    ``num_recs_per_file * num_files``. The output directory is removed
    afterwards whether or not the test passes.

    Raises:
        subprocess.CalledProcessError: if the generator script exits with a
            non-zero status.
    """
    # Imported locally so this block does not depend on module-level imports
    # beyond those the original already used (os, subprocess).
    import shutil
    import sys

    yyyymmdd = '20190501'
    output_dir = 'data/test1'
    num_recs_per_file = 10000
    num_files = 1000

    try:
        # sys.executable runs the script under the same interpreter (and
        # virtualenv) as the test run; check=True surfaces a generator
        # failure here instead of as a confusing error further down.
        subprocess.run(
            [
                sys.executable, 'src/generate_data.py', output_dir,
                str(num_recs_per_file), str(num_files), yyyymmdd,
            ],
            check=True,
        )

        # Keep a running row count rather than holding every DataFrame in
        # memory and concatenating them just to take the length.
        total_recs = 0
        for name in os.listdir(output_dir):
            path = os.path.join(output_dir, name)
            total_recs += len(json_to_parquet.read_json(path))
            # clean as we go
            os.remove(path)

        assert total_recs == num_recs_per_file * num_files
    finally:
        # Remove whatever is left (normally just the empty directory), even
        # if the generator or a read failed part-way through.
        shutil.rmtree(output_dir, ignore_errors=True)