def test_final_profiles():
    '''final_profiles should yield the expected one-row-per-ID profile table'''
    # Seven reference rows spread over three IDs, plus three intrafile ID
    # columns that are sparsely populated.
    reference_rows = pd.DataFrame({
        'ID': [1, 1, 1, 2, 2, 3, 3],
        'name': ['MIKE', 'MICHAEL', 'MICHAEL', 'JANE', 'JAN', 'BOB', 'BOB'],
        'age': [25, 24, 24, 30, 31, 40, 42],
        'rank': ['SGT', 'PO', 'PO', 'DET', 'PO', 'PO', 'PA'],
        'fid1__2016-09_ID': [np.nan, np.nan, 10, np.nan, 34, 40, np.nan],
        'fid2__2017-01_ID': [51, np.nan, np.nan, 13, np.nan, np.nan, np.nan],
        'fid3__2015-01_ID': [np.nan, 2, np.nan, np.nan, np.nan, np.nan, 10111]
    })

    # Column order is part of the expectation, so it is pinned explicitly.
    expected_columns = ['ID', 'current_rank', 'age', 'name', 'profile_count']
    expected_profiles = pd.DataFrame(
        {
            'ID': [1, 2, 3],
            'current_rank': ['SGT', 'DET', 'PO'],
            'age': [25, 31, 42],
            'name': ['MICHAEL', 'JAN', 'BOB'],
            'profile_count': [3, 2, 2]
        },
        columns=expected_columns)

    reference = ReferenceData(reference_rows, uid='ID', log=log)
    reference = reference.final_profiles(
        aggregate_data_args={
            'current_cols': ['rank'],
            'time_col': 'foia_date',
            'mode_cols': ['name'],
            'max_cols': ['age']
        },
        column_order=['current_rank', 'age', 'name'])
    actual_profiles = reference.profiles

    assert actual_profiles.equals(expected_profiles)
Beispiel #2
0
        'loop_merge': {
            'verbose': True
        },
        'input_remerge_file':
        'input/complaints-investigators_2000-2016_2016-11.csv.gz',
        'output_remerge_file':
        'output/complaints-investigators_2000-2016_2016-11.csv.gz'
    }

    assert args['input_reference_file'] == 'input/officer-reference.csv.gz',\
        'Input reference file is not correct.'
    assert args['output_reference_file'] == 'output/officer-reference.csv.gz',\
        'Output reference file is not correct.'

    return setup.do_setup(script_path, args)


# Configuration namespace and logger for this script run.
cons, log = get_setup()

# The existing reference table and the supplementary profiles to merge in.
ref_df = pd.read_csv(cons.input_reference_file)
sup_df = pd.read_csv(cons.input_profiles_file)

# One pipeline, evaluated top to bottom: attach the supplementary data,
# merge it, append it to the reference, write the remerged file, and
# finally write the reference itself.
(
    ReferenceData(ref_df, uid=cons.universal_id, log=log)
    .add_sup_data(sup_df, add_cols=cons.add_cols)
    .loop_merge(**cons.loop_merge)
    .append_to_reference()
    .remerge_to_file(
        cons.input_remerge_file,
        cons.output_remerge_file,
        cons.csv_opts)
    .write_reference(cons.output_reference_file, cons.csv_opts)
)
    full_df = full_df.append(sub_df)

# Sanity check: the remerged frame must have exactly as many rows as the
# input frame. The message is a plain string so that the AssertionError
# actually carries it (a print(...) call here would evaluate to None).
assert full_df.shape[0] == df.shape[0],\
    'Remerged data does not match input dataset'

df = full_df

log.info("Beginning self-merge process")

# Build one ReferenceData year by year: the first year in the range seeds
# it, and each later year is added as supplementary data, merged, and
# appended to the reference.
for year in range(2002, 2018):
    dfy = df[df['year'] == year].copy()
    # Per-year ID column name: the literal text 'year' inside cons.year_id
    # is substituted with the actual year.
    yid = cons.year_id.replace('year', str(year))
    dfy.rename(columns={cons.year_id: yid},
               inplace=True)
    if year == 2002:
        sd = ReferenceData(dfy, uid=cons.sid, data_id=yid, log=log)
    else:
        sd = (sd.add_sup_data(dfy, add_cols=['F4FN', 'F4LN'], base_OD=[])
                .loop_merge(custom_merges=cons.custom_merges, verbose=False)
                .append_to_reference())
log.info('Number of unique IDs = %d', len(sd.ref_df[cons.sid].unique()))

sd.write_reference(cons.output_file, cons.csv_opts)

sal = sd.ref_df

# Largest 'year' value per cons.id, exposed under the name 'resignation_year'.
res_years = (sal[[cons.id, 'year']]
             .groupby(cons.id, as_index=False)
             .max()
             .rename(columns={'year': 'resignation_year'}))
# Restrict to the configured profile columns and collapse duplicate rows.
sal = sal[[col for col in sal.columns
           if col in cons.profile_cols]].drop_duplicates()
# No remaining row may be missing both start_date and org_hire_date.
assert sal[sal['start_date'].isnull() & sal['org_hire_date'].isnull()].empty

log.info('Creating so_max_date and so_min_date from max/min'
Beispiel #4
0
                ["F4FN", "L4LN", "appointed_date", "star"],
                ["F1FN", "L3FN", "last_name_NS", "appointed_date", "star"],
                ["F1FN", "L3FN","F1LN", "L3LN", "appointed_date", "star"],
                ["F4FN", "L4LN", "appointed_date"],
                ["F1FN", "L3FN", "L4LN", "appointed_date"]
            ],
        'one_to_one' : False,
        'keep_sup_um' : False
        }

    assert args['input_reference_file'] == 'input/officer-reference.csv.gz',\
    'Input reference file is not correct.'

    return setup.do_setup(script_path, args)

# Configuration namespace and logger for this script run.
cons, log = get_setup()

# Read the reference table and copy it straight to the output location
# before any merging happens.
ref_df = pd.read_csv(cons.input_reference_file)
ref_df.to_csv(cons.output_reference_file, **cons.csv_opts)
rd = ReferenceData(ref_df, uid=cons.universal_id, log=log)

sup_df = pd.read_csv(cons.input_profiles_file)

# Each stage rebinds rd to whatever the previous stage returned.
rd = rd.add_sup_data(sup_df, add_cols=cons.add_cols)
rd = rd.loop_merge(
    custom_merges=cons.custom_merges,
    one_to_one=cons.one_to_one,
    verbose=True)
rd = rd.append_to_reference(keep_sup_um=cons.keep_sup_um)
rd = rd.remerge_to_file(
    cons.input_remerge_file,
    cons.output_remerge_file,
    cons.csv_opts)
                'middle_initial2', 'suffix_name', 'race', 'gender',
                'birth_year', 'appointed_date', 'start_date', 'org_hire_date'
                ],
            'max_cols': ['resignation_date'],
            }
        }

    assert args['input_file'] == 'input/officer-reference.csv.gz',\
        'Input file is not correct.'
    assert args['output_file'] == 'output/final-profiles.csv.gz',\
        'Output file is not correct.'

    return setup.do_setup(script_path, args)


# Configuration namespace and logger for this script run.
cons, log = get_setup()

ref_df = pd.read_csv(cons.input_file)
# The literal rank "UNKNOWN" is turned into a missing value before the
# profiles are aggregated.
ref_df['current_rank'] = ref_df["current_rank"].replace("UNKNOWN", np.nan)
profiles = (
    ReferenceData(ref_df, uid=cons.universal_id, log=log)
    .final_profiles(aggregate_data_args=cons.aggregate_data_args,
                    column_order=cons.column_order,
                    include_IDs=False)
    .profiles
)

# safe_load is used deliberately: yaml.load() without an explicit Loader is
# rejected by current PyYAML releases and can construct arbitrary Python
# objects on older ones. The recode file is only used as a replacement map.
with open(cons.recode_file, "r") as f:
    rank_recode = yaml.safe_load(f)
profiles['cleaned_rank'] = profiles['current_rank'].replace(rank_recode)
profiles.to_csv(cons.output_file, **cons.csv_opts)
def test_fill_data():
    '''walk ReferenceData through a fill_cols merge, asserting at each stage

    Every expected frame is reordered to the column order of the frame it is
    compared against, so only values (not column order) are checked.
    '''
    reference_input = pd.DataFrame({
        'data_id': [1, 1, 2, 3, 2],
        'first_name_NS': ['BOB', 'BOB', 'AN', 'ANNIE', 'ANNA'],
        'middle_initial': ['M', np.nan, np.nan, 'L', 'L'],
        'suffix_name': ['JR', np.nan, np.nan, np.nan, np.nan],
        'last_name_NS': ['JONES', 'JONES', 'SMITH', 'SMITH', 'ORIELY'],
        'star': [10, 50, 20, 2, np.nan]
    })

    ref_data = ReferenceData(
        reference_input, uid='UID', data_id='data_id', log=log)

    # --- after construction: reference table ---
    expected = pd.DataFrame({
        'data_id': [1, 1, 2, 2, 3],
        'first_name_NS': ['BOB', 'BOB', 'AN', 'ANNA', 'ANNIE'],
        'middle_initial': ['M', np.nan, np.nan, 'L', 'L'],
        'suffix_name': ['JR', np.nan, np.nan, np.nan, np.nan],
        'last_name_NS': ['JONES', 'JONES', 'SMITH', 'ORIELY', 'SMITH'],
        'star': [10, 50, 20, np.nan, 2],
        'UID': [1, 1, 2, 2, 3]
    })
    actual = ref_data.ref_df
    assert actual.equals(expected[actual.columns])

    supplementary_input = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3],
        'first_name_NS': ['BOB', 'AN', 'AN'],
        'middle_initial': [np.nan, 'L', 'L'],
        'suffix_name': [np.nan, np.nan, np.nan],
        'last_name_NS': ['JONES', 'ORIELY', 'SMITH'],
        'current_star': [50, 20, np.nan]
    })

    ref_data = ref_data.add_sup_data(
        supplementary_input,
        add_cols=["F2FN"],
        fill_cols=[
            'first_name_NS', 'middle_initial', 'suffix_name',
            'last_name_NS', 'star'
        ])

    # --- after add_sup_data: supplementary data ---
    expected = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3],
        'first_name_NS': ['BOB', 'AN', 'AN'],
        'middle_initial': [np.nan, 'L', 'L'],
        'suffix_name': [np.nan, np.nan, np.nan],
        'last_name_NS': ['JONES', 'ORIELY', 'SMITH'],
        'current_star': [50, 20, np.nan],
        'star': [50, 20, np.nan]
    })
    actual = ref_data.sup_df
    assert actual.equals(expected[actual.columns])

    # --- after add_sup_data: unmerged supplementary data ---
    expected = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3],
        'first_name_NS': ['BOB', 'AN', 'AN'],
        'F2FN': ['BO', 'AN', 'AN'],
        'middle_initial': [np.nan, 'L', 'L'],
        'suffix_name': [np.nan, np.nan, np.nan],
        'last_name_NS': ['JONES', 'ORIELY', 'SMITH'],
        'current_star': [50, 20, np.nan],
        'star': [50, 20, np.nan]
    })
    actual = ref_data.sup_um
    assert actual.equals(expected[actual.columns])

    # --- after add_sup_data: unmerged reference data ---
    expected = pd.DataFrame({
        'UID': [1, 1, 2, 2, 2, 2, 3],
        'first_name_NS': [
            'BOB', 'BOB', 'AN', 'AN', 'ANNA', 'ANNA', 'ANNIE'
        ],
        'middle_initial': ['M', 'M', 'L', 'L', 'L', 'L', 'L'],
        'suffix_name': [
            'JR', 'JR', np.nan, np.nan, np.nan, np.nan, np.nan
        ],
        'last_name_NS': [
            'JONES', 'JONES', 'SMITH', 'ORIELY', 'SMITH', 'ORIELY', 'SMITH'
        ],
        'F2FN': ['BO', 'BO', 'AN', 'AN', 'AN', 'AN', 'AN'],
        'star': [10.0, 50.0, 20.0, 20.0, 20.0, 20.0, 2.0]
    })
    actual = ref_data.ref_um
    assert actual.equals(expected[actual.columns])

    ref_data = ref_data.loop_merge(
        custom_merges=[['F2FN', 'last_name_NS', 'middle_initial']])

    # --- after loop_merge: the merge column lists, custom merge last ---
    expected_on_lists = [
        ['star', 'first_name_NS', 'last_name_NS', 'middle_initial',
         'suffix_name'],
        ['star', 'first_name_NS', 'last_name_NS', 'middle_initial'],
        ['star', 'first_name_NS', 'last_name_NS', 'suffix_name'],
        ['star', 'first_name_NS', 'last_name_NS'],
        ['first_name_NS', 'last_name_NS', 'middle_initial', 'suffix_name'],
        ['first_name_NS', 'last_name_NS', 'middle_initial'],
        ['first_name_NS', 'last_name_NS', 'suffix_name'],
        ['first_name_NS', 'last_name_NS'],
        ['F2FN', 'last_name_NS', 'middle_initial'],
    ]
    assert ref_data.on_lists == expected_on_lists

    # --- after loop_merge: merged pairs (both sides compared as strings) ---
    expected = pd.DataFrame({
        'UID': [2, 1, 3],
        'sub__2016_ID': [2, 1, 3],
        'matched_on': [
            'star-first_name_NS-last_name_NS-middle_initial',
            'star-first_name_NS-last_name_NS',
            'F2FN-last_name_NS-middle_initial'
        ]
    }).astype(str)
    actual = ref_data.merged_df.astype(str)
    assert actual.equals(expected[actual.columns])

    # --- after loop_merge: nothing is left unmerged on either side ---
    assert ref_data.ref_um.empty
    assert ref_data.sup_um.empty

    ref_data = ref_data.append_to_reference()

    # --- after append_to_reference: reference table ---
    expected = pd.DataFrame({
        'data_id': [1, 1, 2, 2, 3, np.nan, np.nan, np.nan],
        'first_name_NS': [
            'BOB', 'BOB', 'AN', 'ANNA', 'ANNIE', 'BOB', 'AN', 'AN'
        ],
        'middle_initial': [
            'M', np.nan, np.nan, 'L', 'L', np.nan, 'L', 'L'
        ],
        'suffix_name': [
            'JR', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
        ],
        'last_name_NS': [
            'JONES', 'JONES', 'SMITH', 'ORIELY', 'SMITH', 'JONES',
            'ORIELY', 'SMITH'
        ],
        'star': [10, 50, 20, np.nan, 2, 50, 20, np.nan],
        'current_star': [
            np.nan, np.nan, np.nan, np.nan, np.nan, 50, 20, np.nan
        ],
        'UID': [1, 1, 2, 2, 3, 1, 2, 3],
        'sub__2016_ID': [np.nan, np.nan, np.nan, np.nan, np.nan, 1, 2, 3]
    })
    actual = ref_data.ref_df
    assert actual.equals(expected[actual.columns])
def test_one_to_one_False():
    '''walk ReferenceData through a one_to_one=False merge, stage by stage

    Every expected frame is reordered to the column order of the frame it is
    compared against, so only values (not column order) are checked.
    '''
    reference_input = pd.DataFrame({
        'data_id': [10, 30, 109],
        'first_name_NS': ['BOB', 'KATHLEEN', 'ELLEN'],
        'middle_initial': ['M', np.nan, 'L'],
        'suffix_name': ['JR', np.nan, np.nan],
        'last_name_NS': ['JONES', 'SMITH', 'ORIELY'],
        'star1': [10, 20, np.nan],
        'star2': [20, np.nan, np.nan],
        'star3': [30, np.nan, np.nan]
    })

    ref_data = ReferenceData(
        reference_input, uid='UID', data_id='data_id', log=log)

    # --- after construction: reference table ---
    expected = pd.DataFrame({
        'data_id': [10, 10, 10, 30, 109],
        'first_name_NS': ['BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN'],
        'middle_initial': ['M', 'M', 'M', np.nan, 'L'],
        'suffix_name': ['JR', 'JR', 'JR', np.nan, np.nan],
        'last_name_NS': ['JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY'],
        'star': [10, 20, 30, 20, np.nan],
        'UID': [1, 1, 1, 2, 3]
    })
    actual = ref_data.ref_df
    assert actual.equals(expected[actual.columns])

    supplementary_input = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3, 4, 5],
        'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
        'middle_initial': ['M', np.nan, np.nan, 'L', 'E'],
        'suffix_name': [np.nan, 'JR', np.nan, np.nan, np.nan],
        'last_name_NS': [
            'JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'
        ],
        'star': [10, 20, 20, 100, 192]
    })

    merge_column_groups = [
        ('star', ['star', '']),
        ('first_name', ['first_name_NS', 'F4FN']),
        ('last_name', ['last_name_NS', 'L4LN']),
        ('middle_initial', ['middle_initial', '']),
        ('suffix_name', ['suffix_name', '']),
    ]
    ref_data = ref_data.add_sup_data(
        supplementary_input,
        add_cols=["F4FN", "L4LN"],
        base_OD=merge_column_groups)

    # --- after add_sup_data: supplementary data ---
    # NOTE(review): 'current_star' is not a column of the supplementary
    # input above; it only takes part in the comparison if sup_df exposes
    # such a column -- confirm whether it is needed here.
    expected = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3, 4, 5],
        'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
        'middle_initial': ['M', np.nan, np.nan, 'L', 'E'],
        'suffix_name': [np.nan, 'JR', np.nan, np.nan, np.nan],
        'last_name_NS': [
            'JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'
        ],
        'current_star': [np.nan, 51, 20, 100, 192],
        'star': [10, 20, 20, 100, 192]
    })
    actual = ref_data.sup_df
    assert actual.equals(expected[actual.columns])

    # --- after add_sup_data: unmerged supplementary data ---
    expected = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3, 4, 5],
        'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
        'F4FN': ['BOB', 'BOB', 'KATH', 'ELLE', 'JENN'],
        'middle_initial': ['M', np.nan, np.nan, 'L', 'E'],
        'suffix_name': [np.nan, 'JR', np.nan, np.nan, np.nan],
        'last_name_NS': [
            'JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'
        ],
        'L4LN': ['ONES', 'ONES', 'RANT', 'IELY', 'ONES'],
        'current_star': [np.nan, 51, 20, 100, 192],
        'star': [10, 20, 20, 100, 192]
    })
    actual = ref_data.sup_um
    assert actual.equals(expected[actual.columns])

    # --- after add_sup_data: unmerged reference data ---
    expected = pd.DataFrame({
        'data_id': [10, 10, 10, 30, 109],
        'first_name_NS': ['BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN'],
        'middle_initial': ['M', 'M', 'M', np.nan, 'L'],
        'suffix_name': ['JR', 'JR', 'JR', np.nan, np.nan],
        'last_name_NS': ['JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY'],
        'star': [10, 20, 30, 20, np.nan],
        'F4FN': ['BOB', 'BOB', 'BOB', 'KATH', 'ELLE'],
        'L4LN': ['ONES', 'ONES', 'ONES', 'MITH', 'IELY'],
        'UID': [1, 1, 1, 2, 3]
    })
    actual = ref_data.ref_um
    assert actual.equals(expected[actual.columns])

    ref_data = ref_data.loop_merge(
        custom_merges=[['F4FN', 'star']], one_to_one=False)

    # --- after loop_merge: the merge column lists, custom merge last ---
    expected_on_lists = [
        ['star', 'first_name_NS', 'last_name_NS', 'middle_initial',
         'suffix_name'],
        ['star', 'first_name_NS', 'last_name_NS', 'middle_initial'],
        ['star', 'first_name_NS', 'last_name_NS', 'suffix_name'],
        ['star', 'first_name_NS', 'last_name_NS'],
        ['star', 'first_name_NS', 'L4LN', 'middle_initial', 'suffix_name'],
        ['star', 'first_name_NS', 'L4LN', 'middle_initial'],
        ['star', 'first_name_NS', 'L4LN', 'suffix_name'],
        ['star', 'first_name_NS', 'L4LN'],
        ['star', 'F4FN', 'last_name_NS', 'middle_initial', 'suffix_name'],
        ['star', 'F4FN', 'last_name_NS', 'middle_initial'],
        ['star', 'F4FN', 'last_name_NS', 'suffix_name'],
        ['star', 'F4FN', 'last_name_NS'],
        ['star', 'F4FN', 'L4LN', 'middle_initial', 'suffix_name'],
        ['star', 'F4FN', 'L4LN', 'middle_initial'],
        ['star', 'F4FN', 'L4LN', 'suffix_name'],
        ['star', 'F4FN', 'L4LN'],
        ['first_name_NS', 'last_name_NS', 'middle_initial', 'suffix_name'],
        ['first_name_NS', 'last_name_NS', 'middle_initial'],
        ['first_name_NS', 'last_name_NS', 'suffix_name'],
        ['first_name_NS', 'last_name_NS'],
        ['first_name_NS', 'L4LN', 'middle_initial', 'suffix_name'],
        ['first_name_NS', 'L4LN', 'middle_initial'],
        ['first_name_NS', 'L4LN', 'suffix_name'],
        ['first_name_NS', 'L4LN'],
        ['F4FN', 'last_name_NS', 'middle_initial', 'suffix_name'],
        ['F4FN', 'last_name_NS', 'middle_initial'],
        ['F4FN', 'last_name_NS', 'suffix_name'],
        ['F4FN', 'last_name_NS'],
        ['F4FN', 'L4LN', 'middle_initial', 'suffix_name'],
        ['F4FN', 'L4LN', 'middle_initial'],
        ['F4FN', 'L4LN', 'suffix_name'],
        ['F4FN', 'L4LN'],
        ['F4FN', 'star'],
    ]
    assert ref_data.on_lists == expected_on_lists

    # --- after loop_merge: merged pairs (both sides compared as strings) ---
    expected = pd.DataFrame({
        'UID': [1, 1, 3, 2],
        'sub__2016_ID': [1, 2, 4, 3],
        'matched_on': [
            'star-first_name_NS-last_name_NS-middle_initial',
            'star-first_name_NS-last_name_NS-suffix_name',
            'first_name_NS-L4LN-middle_initial', 'F4FN-star'
        ]
    }).astype(str)
    actual = ref_data.merged_df.astype(str)
    assert actual.equals(expected[actual.columns])

    # --- after loop_merge: unmerged reference data ---
    expected = pd.DataFrame({
        'data_id': [10, 10, 10, 30, 109],
        'first_name_NS': ['BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN'],
        'middle_initial': ['M', 'M', 'M', np.nan, 'L'],
        'suffix_name': ['JR', 'JR', 'JR', np.nan, np.nan],
        'last_name_NS': ['JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY'],
        'star': [10, 20, 30, 20, np.nan],
        'F4FN': ['BOB', 'BOB', 'BOB', 'KATH', 'ELLE'],
        'L4LN': ['ONES', 'ONES', 'ONES', 'MITH', 'IELY'],
        'UID': [1, 1, 1, 2, 3]
    })
    actual = ref_data.ref_um
    assert actual.equals(expected[actual.columns])

    # --- after loop_merge: the single unmerged supplementary row ---
    # The expected row keeps index label 4; both sides are compared as
    # strings.
    leftover_row = {
        'sub__2016_ID': 5,
        'first_name_NS': 'JENNA',
        'last_name_NS': 'JONES',
        'middle_initial': 'E',
        'star': 192,
        'suffix_name': np.nan,
        'F4FN': 'JENN',
        'L4LN': 'ONES'
    }
    expected = pd.DataFrame([leftover_row], index=[4])
    actual = ref_data.sup_um.astype(str)
    assert actual.equals(expected[actual.columns].astype(str))

    ref_data = ref_data.append_to_reference(keep_sup_um=False)

    # --- after append_to_reference(keep_sup_um=False): reference table ---
    expected = pd.DataFrame({
        'data_id': [10, 10, 10, 30, 109, np.nan, np.nan, np.nan, np.nan],
        'first_name_NS': [
            'BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN', 'BOB', 'BOB',
            'KATHY', 'ELLEN'
        ],
        'middle_initial': [
            'M', 'M', 'M', np.nan, 'L', 'M', np.nan, np.nan, 'L'
        ],
        'suffix_name': [
            'JR', 'JR', 'JR', np.nan, np.nan, np.nan, 'JR', np.nan, np.nan
        ],
        'last_name_NS': [
            'JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY', 'JONES', 'JONES',
            'GRANT', 'SKARNULISORIELY'
        ],
        'star': [10, 20, 30, 20, np.nan, 10, 20, 20, 100],
        'UID': [1, 1, 1, 2, 3, 1, 1, 2, 3],
        'sub__2016_ID': [
            np.nan, np.nan, np.nan, np.nan, np.nan, 1, 2, 3, 4
        ]
    })
    actual = ref_data.ref_df
    assert actual.equals(expected[actual.columns])
def test_one_to_one_True():
    '''tests merging for one_to_one merging'''
    input_df = pd.DataFrame({
        'data_id': [10, 20, 30, 1, 109],
        'first_name_NS': ['BOB', 'BOB', 'KATHLEEN', 'KEVIN', 'ELLEN'],
        'middle_initial': ['M', np.nan, np.nan, 'J', 'L'],
        'suffix_name': ['SR', 'JR', np.nan, 'II', np.nan],
        'last_name_NS': ['JONES', 'JONES', 'SMITH', 'PARK', 'ORIELY'],
        'star1': [10, 50, 20, 2, np.nan],
        'star2': [20, 51, np.nan, 4, np.nan],
        'star3': [30, np.nan, np.nan, 10, np.nan],
        'merge': [1, 1, 1, 0, 1]
    })
    input_args = {
        'uid': 'UID',
        'data_id': 'data_id',
        'log': log,
        'starting_uid': 5
    }

    RD = ReferenceData(input_df, **input_args)

    def test_initialize_ReferenceData():
        '''test initializing ReferenceData'''
        output_ref_df = pd.DataFrame({
            'data_id': [10, 10, 10, 20, 20, 30, 109],
            'first_name_NS':
            ['BOB', 'BOB', 'BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN'],
            'middle_initial': ['M', 'M', 'M', np.nan, np.nan, np.nan, 'L'],
            'suffix_name': ['SR', 'SR', 'SR', 'JR', 'JR', np.nan, np.nan],
            'last_name_NS':
            ['JONES', 'JONES', 'JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY'],
            'star': [10, 20, 30, 50, 51, 20, np.nan],
            'UID': [5, 5, 5, 6, 6, 7, 8]
        })

        results = RD.ref_df
        output_ref_df = output_ref_df[results.columns]
        assert results.equals(output_ref_df)

    test_initialize_ReferenceData()

    input_sup_df = pd.DataFrame({
        'sub__2016_ID': [1, 2, 3, 4, 5],
        'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
        'birth_year': [1970, 1990, 1985, 1965, 1986],
        'middle_initial': [np.nan, 'M', 'C', 'L', 'E'],
        'suffix_name': ['SR', np.nan, np.nan, np.nan, np.nan],
        'last_name_NS':
        ['JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'],
        'current_star': [np.nan, 51, 20, 100, 192]
    })

    RD = RD.add_sup_data(
        input_sup_df,
        add_cols=[
            "F4FN", {
                'id': '',
                'exec':
                "df['L4LN'] = df['last_name_NS'].map(lambda x: x[-4:None])"
            }
        ])

    def test_add_sup_data_sup_df():
        '''test added supplementary data'''
        output_sup_df = pd.DataFrame({
            'sub__2016_ID': [1, 2, 3, 4, 5],
            'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
            'birth_year': [1970, 1990, 1985, 1965, 1986],
            'middle_initial': [np.nan, 'M', 'C', 'L', 'E'],
            'suffix_name': ['SR', np.nan, np.nan, np.nan, np.nan],
            'last_name_NS':
            ['JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'],
            'current_star': [np.nan, 51, 20, 100, 192],
            'star': [np.nan, 51, 20, 100, 192]
        })

        results = RD.sup_df
        output_sup_df = output_sup_df[results.columns]
        assert results.equals(output_sup_df)

    test_add_sup_data_sup_df()

    def test_add_sup_data_sup_um():
        '''test added unmerged supplementary data'''
        output_sup_um = pd.DataFrame({
            'sub__2016_ID': [1, 2, 3, 4, 5],
            'first_name_NS': ['BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'],
            'F4FN': ['BOB', 'BOB', 'KATH', 'ELLE', 'JENN'],
            'birth_year': [1970, 1990, 1985, 1965, 1986],
            'middle_initial': [np.nan, 'M', 'C', 'L', 'E'],
            'suffix_name': ['SR', np.nan, np.nan, np.nan, np.nan],
            'last_name_NS':
            ['JONES', 'JONES', 'GRANT', 'SKARNULISORIELY', 'JONES'],
            'L4LN': ['ONES', 'ONES', 'RANT', 'IELY', 'ONES'],
            'current_star': [np.nan, 51, 20, 100, 192],
            'star': [np.nan, 51, 20, 100, 192]
        })

        results = RD.sup_um
        output_sup_um = output_sup_um[results.columns]
        assert results.equals(output_sup_um)

    test_add_sup_data_sup_um()

    def test_add_sup_data_ref_um():
        '''test unmerged reference data'''
        output_ref_um = pd.DataFrame({
            'data_id': [10, 10, 10, 20, 20, 30, 109],
            'first_name_NS':
            ['BOB', 'BOB', 'BOB', 'BOB', 'BOB', 'KATHLEEN', 'ELLEN'],
            'middle_initial': ['M', 'M', 'M', np.nan, np.nan, np.nan, 'L'],
            'suffix_name': ['SR', 'SR', 'SR', 'JR', 'JR', np.nan, np.nan],
            'last_name_NS':
            ['JONES', 'JONES', 'JONES', 'JONES', 'JONES', 'SMITH', 'ORIELY'],
            'F4FN': ['BOB', 'BOB', 'BOB', 'BOB', 'BOB', 'KATH', 'ELLE'],
            'L4LN': ['ONES', 'ONES', 'ONES', 'ONES', 'ONES', 'MITH', 'IELY'],
            'star': [10, 20, 30, 50, 51, 20, np.nan],
            'UID': [5, 5, 5, 6, 6, 7, 8]
        })

        results = RD.ref_um
        output_ref_um = output_ref_um[results.columns]
        assert results.equals(output_ref_um)

    test_add_sup_data_ref_um()

    RD = RD.loop_merge(
        custom_merges=[["first_name_NS", "L4LN", "middle_initial"], {
            'cols': ["F4FN", "star"],
            'query': 'F4FN=="KATH"'
        }])

    def test_loop_merge_on_lists():
        '''test on_lists for loop_merge'''
        output_on_lists = [
            [
                'star', 'first_name_NS', 'last_name_NS', 'middle_initial',
                'suffix_name'
            ], ['star', 'first_name_NS', 'last_name_NS', 'middle_initial'],
            ['star', 'first_name_NS', 'last_name_NS', 'suffix_name'],
            ['star', 'first_name_NS', 'last_name_NS'],
            ['star', 'F4FN', 'last_name_NS', 'middle_initial', 'suffix_name'],
            ['star', 'F4FN', 'last_name_NS', 'middle_initial'],
            ['star', 'F4FN', 'last_name_NS', 'suffix_name'],
            ['star', 'F4FN', 'last_name_NS'],
            ['first_name_NS', 'last_name_NS', 'middle_initial', 'suffix_name'],
            ['first_name_NS', 'last_name_NS', 'middle_initial'],
            ['first_name_NS', 'last_name_NS', 'suffix_name'],
            ['first_name_NS', 'last_name_NS'],
            ['F4FN', 'last_name_NS', 'middle_initial', 'suffix_name'],
            ['F4FN', 'last_name_NS', 'middle_initial'],
            ['F4FN', 'last_name_NS', 'suffix_name'], ['F4FN', 'last_name_NS'],
            ['first_name_NS', 'L4LN', 'middle_initial'], {
                'cols': ["F4FN", "star"],
                'query': 'F4FN=="KATH"'
            }
        ]

        results = RD.on_lists
        assert results == output_on_lists

    test_loop_merge_on_lists()

    def test_loop_merge_merged_df():
        '''test merged data from loop_merge'''
        output_merged_df = pd.DataFrame({
            'UID': [6, 5, 8, 7],
            'sub__2016_ID': [2, 1, 4, 3],
            'matched_on': [
                'star-first_name_NS-last_name_NS',
                'first_name_NS-last_name_NS-suffix_name',
                'first_name_NS-L4LN-middle_initial', 'F4FN-star'
            ]
        }).astype(str)
        results = RD.merged_df.astype(str)
        output_merged_df = output_merged_df[results.columns]
        assert results.equals(output_merged_df)

    test_loop_merge_merged_df()

    def test_loop_merge_ref_um():
        '''test unmerged reference data from loop_merge'''
        results = RD.ref_um
        assert results.empty

    test_loop_merge_ref_um()

    def test_loop_merge_sup_um():
        '''test unmerged supplementary data from loop_merge'''
        output_sup_um = pd.DataFrame([{
            'sub__2016_ID': 5,
            'first_name_NS': 'JENNA',
            'last_name_NS': 'JONES',
            'middle_initial': 'E',
            'star': 192.0,
            'suffix_name': np.nan,
            'F4FN': 'JENN',
            'L4LN': 'ONES'
        }],
                                     index=[4])
        results = RD.sup_um.astype(str)
        output_sup_um = output_sup_um[results.columns].astype(str)
        assert results.equals(output_sup_um)

    test_loop_merge_sup_um()

    RD = RD.append_to_reference(drop_cols=['current_star'])

    def test_append_to_reference_ref_df_drop_cols():
        '''check RD.ref_df after append_to_reference

        The expected frame has 12 rows; it is reordered to the column order
        of the actual result before the comparison.
        '''
        nan = np.nan
        expected = pd.DataFrame({
            'data_id': [10, 10, 10, 20, 20, 30, 109] + [nan] * 5,
            'first_name_NS': ['BOB'] * 5 + [
                'KATHLEEN', 'ELLEN', 'BOB', 'BOB', 'KATHY', 'ELLEN', 'JENNA'
            ],
            'middle_initial': ['M'] * 3 + [nan] * 3 + [
                'L', nan, 'M', 'C', 'L', 'E'
            ],
            'suffix_name': (['SR'] * 3 + ['JR'] * 2 + [nan] * 2
                            + ['SR'] + [nan] * 4),
            'last_name_NS': ['JONES'] * 5 + [
                'SMITH', 'ORIELY', 'JONES', 'JONES', 'GRANT',
                'SKARNULISORIELY', 'JONES'
            ],
            'star': [10, 20, 30, 50, 51, 20, nan, nan, 51, 20, 100, 192],
            'birth_year': [nan] * 7 + [1970, 1990, 1985, 1965, 1986],
            'UID': [5, 5, 5, 6, 6, 7, 8, 5, 6, 7, 8, 9],
            'sub__2016_ID': [nan] * 7 + [1, 2, 3, 4, 5]
        })
        actual = RD.ref_df
        assert actual.equals(expected[actual.columns])

    test_append_to_reference_ref_df_drop_cols()

    def test_remerge_to_file():
        '''test remerge_to_file by comparing its output file to an expected CSV

        Writes an input CSV, calls RD.remerge_to_file on it, writes the
        expected result to a third CSV and compares the two output files
        byte-for-byte. The three scratch files are created in the current
        working directory and removed afterwards, whether or not the
        comparison succeeds.
        '''
        import filecmp
        import os

        csv_opts = {'index': False}
        input_path = 'test_remerge_to_file_input.csv'
        output_path = 'test_remerge_to_file_output.csv'
        test_output_path = 'test_remerge_to_file_test.csv'

        input_df = pd.DataFrame(
            {
                'sub__2016_ID': [1, 2, 3, 4, 5, 6],
                'event': [1, 0, 0, 0, 0, 1]
            },
            columns=['sub__2016_ID', 'event'])
        # Expected output: the input rows plus a UID column; the last row
        # (sub__2016_ID 6) has no UID.
        output_df = pd.DataFrame(
            {
                'sub__2016_ID': [1, 2, 3, 4, 5, 6],
                'event': [1, 0, 0, 0, 0, 1],
                'UID': [5, 6, 7, 8, 9, np.nan]
            },
            columns=['sub__2016_ID', 'event', 'UID'])

        try:
            input_df.to_csv(input_path, **csv_opts)
            RD.remerge_to_file(input_path, output_path, csv_opts)
            output_df.to_csv(test_output_path, **csv_opts)
            # shallow=False forces a content comparison; the default only
            # compares os.stat signatures when those happen to match.
            assert filecmp.cmp(test_output_path, output_path, shallow=False)
        finally:
            # Portable cleanup (no shell 'rm') that also runs on failure.
            for path in (input_path, output_path, test_output_path):
                if os.path.exists(path):
                    os.remove(path)

    test_remerge_to_file()
Beispiel #9
0
    '''
    script_path = __main__.__file__
    args = {
        'input_profiles_file':
        'input/roster_1936-2017_2017-04_profiles.csv.gz',
        'input_remerge_file': 'input/roster_1936-2017_2017-04.csv.gz',
        'output_remerge_file': 'output/roster_1936-2017_2017-04.csv.gz',
        'intrafile_id': 'roster_1936-2017_2017-04_ID',
        'output_reference_file': 'output/officer-reference.csv.gz',
        'universal_id': 'UID',
        'starting_uid': 100001
    }

    assert args['output_reference_file'] == 'output/officer-reference.csv.gz',\
        'Output reference file is not correct.'

    return setup.do_setup(script_path, args)


# Load the script configuration and logger, then read the profiles file.
cons, log = get_setup()
data_df = pd.read_csv(cons.input_profiles_file)

# Build the reference data from the profiles, remerge it onto the roster
# file, and write the reference out.
(
    ReferenceData(
        data_df=data_df,
        uid=cons.universal_id,
        data_id=cons.intrafile_id,
        starting_uid=cons.starting_uid,
        log=log,
    )
    .remerge_to_file(
        cons.input_remerge_file,
        cons.output_remerge_file,
        cons.csv_opts,
    )
    .write_reference(cons.output_reference_file, cons.csv_opts)
)
Beispiel #10
0
    assert args['input_reference_file'] == 'input/officer-reference.csv.gz',\
        'Input reference file is not correct.'
    assert args['output_reference_file'] == 'output/officer-reference.csv.gz',\
        'Output reference file is not correct.'

    return setup.do_setup(script_path, args)


# Load the script configuration and logger.
cons, log = get_setup()

# Existing reference data and the supplementary profiles to merge into it.
ref_df = pd.read_csv(cons.input_reference_file)
sup_df = pd.read_csv(cons.input_profiles_file)

# First pass: add the supplementary data and run loop_merge with the
# configured options, then save the merged frame.
rd = (
    ReferenceData(ref_df, uid=cons.universal_id, log=log)
    .add_sup_data(sup_df, add_cols=cons.add_cols, base_OD=cons.base_OD)
    .loop_merge(**cons.loop_merge)
)
rd.merged_df.to_csv("output/merged_df.csv.gz", **cons.csv_opts)
rd.append_to_reference(keep_sup_um=False)\
.add_sup_data(rd.sup_um, add_cols=[],
                base_OD=cons.base_OD)\
  .loop_merge(verbose=True, base_OD_edits=[
      ('birth_year', ['birth_year', 'current_age',
                      'current_age_m1', 'current_age2_m1',
                      'current_age_p1', 'current_age2_p1',
                      'current_age_mp', 'current_age2_mp'
                      'current_age_pm','current_age2_pm', '']),
      ('appointed_date', ['so_min_date', 'so_max_date',
                          'so_min_year', 'so_max_year',
                          'so_min_year_m1', 'so_max_year_m1'])
                ])\
Beispiel #11
0
            ],
            'time_col':
            'foia_date',
            'mode_cols': [
                'first_name', 'last_name', 'middle_initial', 'middle_initial2',
                'suffix_name', 'race', 'gender', 'birth_year',
                'appointed_date', 'start_date', 'org_hire_date'
            ],
            'max_cols': ['resignation_date']
        }
    }

    assert args['input_file'] == 'input/officer-reference.csv.gz',\
        'Input file is not correct.'
    assert args['output_file'] == 'output/final-profiles.csv.gz',\
        'Output file is not correct.'

    return setup.do_setup(script_path, args)


# Load the script configuration and logger, then read the reference file.
cons, log = get_setup()

ref_df = pd.read_csv(cons.input_file)

# Build the final profiles from the reference data using the configured
# aggregation arguments and column order; the output path and CSV options
# are passed through to final_profiles.
ReferenceData(
    ref_df,
    uid=cons.universal_id,
    log=log,
).final_profiles(
    aggregate_data_args=cons.aggregate_data_args,
    column_order=cons.column_order,
    include_IDs=False,
    output_path=cons.output_file,
    csv_opts=cons.csv_opts,
)