def test_clean_data():
    '''runs clean_data on a frame with name, date/time, age, race and gender
       columns and compares the result to a hand-written expected frame.
       also verifies that the input frame is left unmodified.
    '''
    def _as_dates(values):
        # Expected date columns hold datetime.date objects (NaT for missing).
        return pd.to_datetime(pd.Series(values)).dt.date

    def _as_times(values):
        # Expected time columns hold datetime.time objects (NaT for missing).
        return pd.to_datetime(pd.Series(values)).dt.time

    raw_columns = {
        'first_name': ['j edgar', 'Dylan ., JR', 'Mary sue E. M.', 'nAtAsha', ''],
        'last_name': ['Hoover., iii', 'Smith F', 'Jones V',
                      "O.'brien-jenkins IV", np.nan],
        'middle_initial': ['A', 'B', np.nan, 'C', 'D'],
        'incident_datetime': ['9999-99-99 12:12', '2016-01-21',
                              '2015-12-52 100:100', '2016-01-12 02:54',
                              '07/21/16 10:59'],
        'trr_date': ['200-12-12', '2000-12-12', '1921-01-01', '2016-12-01',
                     '07/21/21'],
        'trr_time': [1212, "00", 9876, "23:12", 109],
        'age': [120, -999, 0, 21, "hi"],
        'race': ['N', 'wbh', 'naTIVE AMericaN', 'black hispanic', 'I'],
        'gender': ['mALE', 'm', 'NONE', 'FEMALE', np.nan],
    }
    raw_df = pd.DataFrame(raw_columns)
    # Snapshot taken before the call so mutation of the input can be detected.
    raw_snapshot = copy.deepcopy(raw_df)

    expected_columns = {
        'first_name': ['J EDGAR', 'DYLAN', 'MARY SUE', 'NATASHA', np.nan],
        'last_name': ['HOOVER', 'SMITH', 'JONES', "O'BRIEN-JENKINS", np.nan],
        'first_name_NS': ['JEDGAR', 'DYLAN', 'MARYSUE', 'NATASHA', np.nan],
        'last_name_NS': ['HOOVER', 'SMITH', 'JONES', 'OBRIENJENKINS', np.nan],
        'middle_initial': ['A', 'B', 'E', 'C', 'D'],
        'middle_initial2': [np.nan, 'F', 'M', np.nan, np.nan],
        'suffix_name': ['III', 'JR', 'V', 'IV', np.nan],
        'incident_date': _as_dates(
            [np.nan, '2016-01-21', np.nan, '2016-01-12', '2016-07-21']),
        'incident_time': _as_times(
            ['12:12:00', '00:00:00', np.nan, '02:54:00', '10:59:00']),
        'trr_date': _as_dates(
            [np.nan, '2000-12-12', '1921-01-01', '2016-12-01', '1921-07-21']),
        'trr_time': _as_times(
            ['12:12:00', '00:00:00', np.nan, '23:12:00', '01:09:00']),
        'age': [np.nan, np.nan, np.nan, 21, np.nan],
        'race': ['BLACK', 'HISPANIC', 'NATIVE AMERICAN/ALASKAN NATIVE',
                 'BLACK', 'NATIVE AMERICAN/ALASKAN NATIVE'],
        'gender': ['MALE', 'MALE', '', 'FEMALE', ''],
    }
    expected_df = pd.DataFrame(expected_columns)

    cleaned_df = clean_data(raw_df, log)

    # Column sets must match first; only then is it safe to reorder the
    # expected frame to the column order clean_data produced.
    assert set(cleaned_df.columns) == set(expected_df.columns)
    expected_aligned = expected_df[cleaned_df.columns]
    assert cleaned_df.equals(expected_aligned)
    assert raw_snapshot.equals(raw_df)
def test_clean_data_skip():
    '''runs clean_data with skip_cols=['gender'] and expects the gender
       column to come back exactly as given while race is still cleaned.
       also verifies that the input frame is left unmodified.
    '''
    raw_columns = {
        'race': ['N', 'wbh', 'naTIVE AMericaN', 'black hispanic', 'I'],
        'gender': ['mALE', 'm', 'NONE', 'FEMALE', np.nan],
    }
    raw_df = pd.DataFrame(raw_columns)
    # Snapshot taken before the call so mutation of the input can be detected.
    raw_snapshot = copy.deepcopy(raw_df)

    expected_columns = {
        'race': ['BLACK', 'HISPANIC', 'NATIVE AMERICAN/ALASKAN NATIVE',
                 'BLACK', 'NATIVE AMERICAN/ALASKAN NATIVE'],
        # Skipped column: expected values are identical to the raw input.
        'gender': ['mALE', 'm', 'NONE', 'FEMALE', np.nan],
    }
    expected_df = pd.DataFrame(expected_columns)

    columns_to_skip = ['gender']
    cleaned_df = clean_data(raw_df, log, skip_cols=columns_to_skip)

    # Column sets must match first; only then is it safe to reorder the
    # expected frame to the column order clean_data produced.
    assert set(cleaned_df.columns) == set(expected_df.columns)
    expected_aligned = expected_df[cleaned_df.columns]
    assert cleaned_df.equals(expected_aligned)
    assert raw_snapshot.equals(raw_df)
# Example no. 3
# 0
def test_clean_data_dict():
    '''runs clean_data with a caller-supplied clean_dict for the gender
       column and compares the result to a hand-written expected frame.
       also verifies that the input frame is left unmodified.
    '''
    raw_columns = {
        'race': ['N', 'wbh', 'naTIVE AMericaN', 'black hispanic', 'I'],
        'gender': ['mALE', 'm', 'NONE', 'FEMALE', np.nan],
    }
    raw_df = pd.DataFrame(raw_columns)
    # Snapshot taken before the call so mutation of the input can be detected.
    raw_snapshot = copy.deepcopy(raw_df)

    expected_columns = {
        # NOTE(review): 'I' is expected to become 'WHITE' here, while the
        # other clean_data tests in this file expect
        # 'NATIVE AMERICAN/ALASKAN NATIVE' for the same input; confirm this
        # is the intended effect of passing clean_dict.
        'race': ['BLACK', 'HISPANIC', 'NATIVE AMERICAN/ALASKAN NATIVE',
                 'BLACK', 'WHITE'],
        'gender': ['', 'MALE', '', 'FEMALE', ''],
    }
    expected_df = pd.DataFrame(expected_columns)

    gender_overrides = {'mALE': '', 'm': 'MALE', 'FEMALE': 'FEMALE'}
    custom_clean_dict = {'gender': gender_overrides}
    cleaned_df = clean_data(raw_df, log, clean_dict=custom_clean_dict)

    # Column sets must match first; only then is it safe to reorder the
    # expected frame to the column order clean_data produced.
    assert set(cleaned_df.columns) == set(expected_df.columns)
    expected_aligned = expected_df[cleaned_df.columns]
    assert cleaned_df.equals(expected_aligned)
    assert raw_snapshot.equals(raw_df)
def test_clean_data_human_names():
    '''runs clean_data on a frame with a single human_name column (plus
       gender) and expects it to be split into first/last name, middle
       initial and suffix columns.
       also verifies that the input frame is left unmodified.
    '''
    raw_columns = {
        'human_name': ['J R JONES JR', 'MICHAEL SMOKED HAM', 'J EDGAR HOOVER',
                       'SALLY K E MAY JR'],
        'gender': ['mALE', 'm', 'NONE', 'FEMALE'],
    }
    raw_df = pd.DataFrame(raw_columns)
    # Snapshot taken before the call so mutation of the input can be detected.
    raw_snapshot = copy.deepcopy(raw_df)

    expected_columns = {
        'first_name': ['J R', 'MICHAEL', 'J EDGAR', 'SALLY'],
        'first_name_NS': ['JR', 'MICHAEL', 'JEDGAR', 'SALLY'],
        'middle_initial': [np.nan, np.nan, np.nan, 'K'],
        'middle_initial2': [np.nan, np.nan, np.nan, 'E'],
        'suffix_name': ['JR', np.nan, np.nan, 'JR'],
        'last_name': ['JONES', 'SMOKED HAM', 'HOOVER', 'MAY'],
        'last_name_NS': ['JONES', 'SMOKEDHAM', 'HOOVER', 'MAY'],
        'gender': ['MALE', 'MALE', '', 'FEMALE'],
    }
    expected_df = pd.DataFrame(expected_columns)

    cleaned_df = clean_data(raw_df, log)

    # Column sets must match first; only then is it safe to reorder the
    # expected frame to the column order clean_data produced.
    assert set(cleaned_df.columns) == set(expected_df.columns)
    expected_aligned = expected_df[cleaned_df.columns]
    assert cleaned_df.equals(expected_aligned)
    assert raw_snapshot.equals(raw_df)
# Example no. 5
# 0

def get_setup():
    ''' bundles this script's hard-coded file arguments.
        sanity-checks that the input path lives under input/ and the
        output path under output/, both ending in .csv.gz, then passes
        the script path and the args to setup.do_setup().
        setup.do_setup() returns constants and logger:
        constants contains args and a few often-useful bits in it
        including constants.write_yamlvar()
        logger is used to write logging messages
    '''
    this_script = __main__.__file__
    args = {
        'input_file':
            'input/TRR-weapon-discharges_2004-2016_2016-09.csv.gz',
        'output_file':
            'output/TRR-weapon-discharges_2004-2016_2016-09.csv.gz',
    }

    # Guard against a mistyped path before any work is done.
    source_path = args['input_file']
    source_ok = (source_path.startswith('input/')
                 and source_path.endswith('.csv.gz'))
    assert source_ok, "input_file is malformed: {}".format(source_path)

    target_path = args['output_file']
    target_ok = (target_path.startswith('output/')
                 and target_path.endswith('.csv.gz'))
    assert target_ok, "output_file is malformed: {}".format(target_path)

    return setup.do_setup(this_script, args)


# Build the run configuration; per get_setup's docstring this yields the
# constants object and the logger.
cons, log = get_setup()

# Load the CSV named by cons.input_file, run it through clean_data, and write
# the result to cons.output_file using the keyword options in cons.csv_opts.
df = pd.read_csv(cons.input_file)
df = clean_data(df, log)
df.to_csv(cons.output_file, **cons.csv_opts)