def test_assign_unique_ids():
    """IDs are assigned per id_cols with B/C conflict resolution.

    Report generation is not exercised here.
    """
    df = pd.DataFrame({
        'A': [1, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5],
        'B': [2, np.nan, 2, 3, 4, 4, 2, np.nan, 3, 4, 4],
        'C': [2, np.nan, np.nan, 3, 1, 1, np.nan, 5,
              np.nan, np.nan, np.nan],
    })
    # Snapshot the input so we can check it is not mutated in place.
    df_before = copy.deepcopy(df)

    expected = pd.DataFrame({
        'ID': [3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 4.0, 4.0, 5.0, 6.0, 6.0],
        'A': [1, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5],
        'B': [2, np.nan, 2, 3, 4, 4, 2, np.nan, 3, 4, 4],
        'C': [2, np.nan, np.nan, 3, 1, 1, np.nan, 5,
              np.nan, np.nan, np.nan],
    })

    actual = assign_unique_ids_functions.assign_unique_ids(
        df, uid='ID', id_cols=['A'], conflict_cols=['B', 'C'], log=False)

    assert actual.equals(expected)
    assert df_before.equals(df)
def test_assign_unique_ids_with_empty_kd_df():
    """assign_unique_ids with an empty kd_df and conflict_cols given.

    Uses unresolved_policy='distinct'; report generation is not
    exercised here.
    """
    df = pd.DataFrame({
        'A': [1, 1, 2, 3],
        'B': [2, 2, np.nan, 3],
        'C': [1, 2, 3, 4],
    })
    # Snapshot the input so we can check it is not mutated in place.
    df_before = copy.deepcopy(df)

    expected = pd.DataFrame({
        'A': [1, 1, 2, 3],
        'B': [2, 2, np.nan, 3],
        'C': [1, 2, 3, 4],
        'ID': [1, 1, 2, 3],
    })

    actual = assign_unique_ids_functions.assign_unique_ids(
        df, uid='ID', id_cols=['A'], conflict_cols=['B'],
        log=False, unresolved_policy='distinct')

    assert actual.equals(expected)
    assert df_before.equals(df)
def test_assign_unique_ids_many_nans():
    """unresolved_policy='distinct' with many NaNs in conflict columns.

    Report generation is not exercised here.
    """
    df = pd.DataFrame({
        'A': [1, 1, 1, 1, 2, 2, 2, 2],
        'B': [1, 2, 2, 2, 4, 4, 4, 3],
        'C': [2, 2, 3, np.nan, 1, 2, np.nan, 1],
    })
    # Snapshot the input so we can check it is not mutated in place.
    df_before = copy.deepcopy(df)

    expected = pd.DataFrame({
        'A': [1, 1, 1, 1, 2, 2, 2, 2],
        'B': [1, 2, 2, 2, 4, 4, 4, 3],
        'C': [2, 2, 3, np.nan, 1, 2, np.nan, 1],
        'ID': [1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 2.0],
    })

    actual = assign_unique_ids_functions.assign_unique_ids(
        df, uid='ID', id_cols=['A'], conflict_cols=['B', 'C'],
        log=False, unresolved_policy='distinct')

    assert actual.equals(expected)
    assert df_before.equals(df)
def test_assign_unique_ids_unresolved_same():
    """assign_unique_ids with unresolved_policy='same'.

    Report generation is not exercised here.
    """
    df = pd.DataFrame({
        'A': [1, 1, 1, 1, 2, 2, 2, 2, 3],
        'B': [2, np.nan, 3, 2, 3, 4, 4, 4, 3],
        'C': [2, np.nan, np.nan, np.nan, 1, 1, np.nan, 2, 3],
    })
    # Snapshot the input so we can check it is not mutated in place.
    df_before = copy.deepcopy(df)

    expected = pd.DataFrame({
        'A': [1, 1, 1, 1, 2, 2, 2, 2, 3],
        'B': [2, np.nan, 3, 2, 3, 4, 4, 4, 3],
        'C': [2, np.nan, np.nan, np.nan, 1, 1, np.nan, 2, 3],
        'ID': [3.0, 3.0, 3.0, 3.0, 2.0, 4.0, 4.0, 4.0, 1.0],
    })

    actual = assign_unique_ids_functions.assign_unique_ids(
        df, uid='ID', id_cols=['A'], conflict_cols=['B', 'C'],
        log=False, unresolved_policy='same')

    assert actual.equals(expected)
    assert df_before.equals(df)
    # NOTE(review): this chunk begins mid-way through get_setup() -- the
    # function header and the start of the args dict are outside this view.
    }
    # Fail fast on malformed I/O paths before handing off to setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')), \
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')), \
        "output_file is malformed: {}".format(args['output_file'])
    return setup.do_setup(script_path, args)


cons, log = get_setup()
df = pd.read_csv(cons.input_file)

# Assign two independent unique IDs, one per configured column set
# (cons.cr_auid and cons.ind_auid hold the keyword args for each pass).
df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)

# Collapse the (cr_uid, ind_uid) pairs into one combined group id
# (cons.id), then merge it back and drop the now-redundant cr_uid.
udf = union_group(df[[cons.cr_uid, cons.ind_uid]].drop_duplicates(),
                  cons.id, [cons.cr_uid, cons.ind_uid])
df = df.merge(udf, on=[cons.cr_uid, cons.ind_uid]) \
       .drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

# Build one profile row per id; 'star' and the conflict columns are passed
# as max_cols (presumably aggregated by max -- confirm in aggregate_data).
profiles_df = aggregate_data(df, cons.id, cons.ind_auid['id_cols'],
                             max_cols=cons.ind_auid['conflict_cols'] + ['star'])
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
        # NOTE(review): this chunk begins inside the args dict of
        # get_setup() -- earlier entries are outside this view.
        'output_demo_file': 'output/settlements_1952-2016_2017-01_profiles.csv.gz',
        # Columns that jointly identify an individual in the settlements data.
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS",
            "star", "current_status", "officer_id", "service_years",
            "service_months", "suffix_name", "cost", "rank", "race", "gender"
        ],
        'id': 'settlements_1952-2016_2017-01_ID'
    }
    # Fail fast on malformed I/O paths before handing off to setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')), \
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')), \
        "output_file is malformed: {}".format(args['output_file'])
    return setup.do_setup(script_path, args)


cons, log = get_setup()
df = pd.read_csv(cons.input_file)

# No conflict_cols here: unique ids are keyed purely on cons.id_cols.
df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

# One demographic/profile row per assigned id.
profiles_df = aggregate_data(df, cons.id, cons.id_cols)
profiles_df.to_csv(cons.output_demo_file, **cons.csv_opts)
        # NOTE(review): this chunk begins mid-assert inside get_setup();
        # the assert condition is outside this view.
        "output_file is malformed: {}".format(args['output_file'])
    return setup.do_setup(script_path, args)


cons, log = get_setup()
df = pd.read_csv(cons.input_file)
full_df = pd.DataFrame()

# Assign sub-ids independently within each year, then offset each year's
# ids by year * 100000 so they stay unique after re-concatenation.
for year in df['year'].unique():
    log.info('Assigning unique sub-ids for year: %d', year)
    sub_df = df[df['year'] == year]
    sub_df = assign_unique_ids(sub_df, cons.sub_id, cons.sub_id_cols,
                               conflict_cols=cons.sub_conflict_cols,
                               log=log)
    sub_df[cons.sub_id] = sub_df[cons.sub_id] + year * 100000
    full_df = full_df.append(sub_df)

# NOTE(review): print(...) returns None, so on failure this AssertionError
# carries no message -- almost certainly meant to be a plain string.
assert full_df.shape[0] == df.shape[0], \
    print('Remerged data does not match input dataset')
df = full_df

log.info("Beginning self-merge process")
# Give each year's id column a year-specific name for the self-merge.
for year in range(2002, 2018):
    dfy = df[df['year'] == year].copy()
    yid = cons.year_id.replace('year', str(year))
    # NOTE(review): chunk is truncated here -- the rename() call continues
    # past the end of this view.
    dfy.rename(columns={cons.year_id: yid},
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Rows that cannot be merged: 'POLICE' placeholder officers and rows with
# no first name at all. Everything else gets merge = 1.
hidden = df['first_name'] == 'POLICE'
unnamed = df['first_name'].isnull()
log.info('{} hidden officers marked as merge = 0'.format(df[hidden].shape[0]))
log.info('{} officer with no name marked as merge = 0'.format(df[unnamed].shape[0]))

df.loc[df['first_name'].notnull() & (df['first_name'] != 'POLICE'), 'merge'] = 1
df['merge'] = df['merge'].fillna(0)

# 'merge' participates in id assignment so mergeable and non-mergeable
# rows can never share an id.
df = assign_unique_ids(df, cons.id, cons.id_cols + ['merge'],
                       conflict_cols=cons.conflict_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols + ['merge'],
                             max_cols=cons.max_cols + cons.conflict_cols,
                             merge_cols=cons.merge_cols,
                             merge_on_cols=cons.merge_on_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
        # NOTE(review): this chunk begins inside the args dict of
        # get_setup() -- earlier entries are outside this view.
        'id': 'subject_ID'
    }
    # Fail fast on malformed I/O paths before handing off to setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')), \
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')), \
        "output_file is malformed: {}".format(args['output_file'])
    return setup.do_setup(script_path, args)


cons, log = get_setup()
df = pd.read_csv(cons.input_file)

# First collapse cons.group_cols into a union-group id stored in the
# first id column, then assign unique subject ids on top of it.
df = union_group(df, cons.id_cols[0], cons.group_cols)
log.info('%d group_ids' % df[cons.id_cols[0]].nunique())
df = assign_unique_ids(df, cons.id, cons.id_cols, cons.conflict_cols,
                       log=log, unresolved_policy='distinct')

adf = aggregate_data(df, cons.id, id_cols=cons.id_cols,
                     max_cols=cons.max_cols + cons.conflict_cols)
df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Manually pin one known individual: flag his rows so id assignment
# treats them as a single person despite conflicting attributes.
smith_units = [5, 602]
is_smith = (
    (df["first_name"] == "ROBERT")
    & (df["last_name"] == "SMITH")
    & (df["middle_initial"] == "E")
    & (df["birth_year"] == 1947)
    & (df["appointed_date"] == "1971-02-22")
    & df["unit"].isin(smith_units)
)
df["Specify"] = 0
df.loc[is_smith, "Specify"] = 1
log.info(("Robert E Smith 1947 1971-02-22 in units {}"
          " specified as singular individual.").format(smith_units))

df = assign_unique_ids(df, cons.id, cons.id_cols + ["Specify"],
                       cons.conflict_cols, log=log)
# The helper column only exists for AUID; drop it before aggregation.
del df["Specify"]
log.info(("Specify column used to manually distinguish individuals"
          " created for AUID then dropped before aggregation"))

df.to_csv(cons.output_file, **cons.csv_opts)
profiles_df = aggregate_data(df, cons.id, cons.id_cols,
                             max_cols=cons.conflict_cols + cons.max_cols,
                             current_cols=cons.current_cols,
                             time_col=cons.time_col)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)