def final_profiles(self, aggregate_data_args, output_path='',
                   column_order=None, csv_opts=None, include_IDs=True):
    """Generates unique profiles from reference data and writes to csv

    Parameters
    ----------
    aggregate_data_args : dict
        Dictionary of arguments for aggregate_data
    output_path : str
        If non-empty, profiles are written to this path;
        otherwise they are stored on self.profiles
    column_order : list
        List of columns in specified order
        (None or empty keeps the aggregated column order)
    csv_opts : dict
        Keyword arguments passed through to DataFrame.to_csv
    include_IDs : bool
        If True, keep ID columns, if False drop them

    Returns
    ----------
    self
    """
    from assign_unique_ids_functions import aggregate_data
    # Normalize None sentinels: avoids the shared-mutable-default pitfall
    # of the original `column_order=[]` / `csv_opts={}` signature.
    column_order = [] if column_order is None else column_order
    csv_opts = {} if csv_opts is None else csv_opts
    self.generate_foia_dates()
    profiles = aggregate_data(self.ref_df, self.uid, **aggregate_data_args)
    # Count, per uid, how many distinct (uid, *_ID) combinations exist in
    # the reference data -> number of source records behind each profile.
    count_df = pd.DataFrame(self.ref_df[[
        col for col in self.ref_df.columns
        if col.endswith("_ID") or col == self.uid
    ]].drop_duplicates()[self.uid].value_counts())
    count_df.columns = ['profile_count']
    count_df[self.uid] = count_df.index
    profiles = profiles.merge(count_df, on=self.uid)
    # The original passed `print(...)` as the assert message, which always
    # evaluates to None; embed the counts in a real message instead.
    assert profiles.shape[0] == self.ref_df[self.uid].nunique(), \
        'Profile row count {} != unique uid count {}'.format(
            profiles.shape[0], self.ref_df[self.uid].nunique())
    if include_IDs:
        ID_cols = [col for col in profiles.columns if col.endswith('_ID')]
    else:
        ID_cols = []
    if column_order:
        # Keep only requested columns that actually exist, in the
        # requested order, bracketed by uid and profile_count.
        cols = [col for col in column_order if col in profiles.columns]
        profiles = profiles[[self.uid] + cols + ID_cols + ['profile_count']]
    self.log.info('Officer profile count: {}'.format(profiles.shape[0]))
    if output_path:
        profiles.to_csv(output_path, **csv_opts)
    else:
        self.profiles = profiles
    return self
def test_aggregate_data():
    '''test aggregate_data'''
    # Fixture: three uids with id, mode, max, current-age, and merge columns.
    raw_df = pd.DataFrame({
        'uid': [1, 1, 1, 1, 1, 1, 4, 4, 4, 99, 99],
        'ID': ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C'],
        'mode': [2, 2, 2, 3, 3, 3, 1, np.nan, 4, 5, 5],
        'date_of_age_obs': [
            '2014-01-01', '2014-03-01', '2015-01-01', '2015-09-01',
            '2016-10-01', '2015-11-01', '2015-01-01', '2016-01-01',
            np.nan, np.nan, np.nan
        ],
        'age': [20, 20, 21, 22, 23, 22, 56, 57, 90, 35, 30],
        'max': [1, np.nan, 10, 1, 3, 9, 2, 2, -2, np.nan, np.nan],
        'max_names': [
            'One', np.nan, 'Ten', 'One', 'Three', 'Nine',
            'Two', 'Two', '-Two', np.nan, np.nan
        ]
    })
    # Snapshot to prove aggregate_data does not mutate its input.
    snapshot_df = copy.deepcopy(raw_df)
    agg_kwargs = {
        'uid': 'uid',
        'id_cols': ['ID'],
        'mode_cols': ['mode'],
        'max_cols': ['max'],
        'current_cols': ['age'],
        'time_col': 'date_of_age_obs',
        'merge_cols': ['max_names'],
        'merge_on_cols': ['max']
    }
    expected_df = pd.DataFrame(
        {
            'uid': [1, 4, 99],
            'ID': ['A', 'B', 'C'],
            'mode': [2.0, 1.0, 5.0],
            'max': [10, 2, np.nan],
            'current_age': [23, 57, np.nan],
            'max_names': ['Ten', 'Two', np.nan]
        },
        columns=['uid', 'ID', 'mode', 'max', 'current_age', 'max_names'])
    produced_df = assign_unique_ids_functions.aggregate_data(
        raw_df, **agg_kwargs)
    assert produced_df.equals(expected_df)
    assert snapshot_df.equals(raw_df)
# NOTE(review): this chunk begins mid-way through get_setup()'s `args`
# dict -- the function header and the dict's opening entries are outside
# this view.
    }

    # Guard against mis-specified relative paths before running setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
# Assign ids twice: once keyed on the CR grouping, once on the individual
# grouping (cr_auid / ind_auid are argument dicts supplied by setup --
# exact contents not visible here; presumably id_cols/conflict_cols etc.).
df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)
# Collapse the two id columns into one unified id per linked group
# (presumably via connected components -- TODO confirm against
# union_group's definition), then drop the now-redundant CR id.
udf = union_group(df[[cons.cr_uid, cons.ind_uid]].drop_duplicates(),
                  cons.id,
                  [cons.cr_uid, cons.ind_uid])
df = df.merge(udf, on=[cons.cr_uid, cons.ind_uid])\
       .drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

# Build one profile row per unified id, keeping maxima of conflicting
# columns plus 'star'.
profiles_df = aggregate_data(df, cons.id,
                             cons.ind_auid['id_cols'],
                             max_cols=cons.ind_auid['conflict_cols'] + ['star'])
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
# NOTE(review): chunk begins inside get_setup()'s `args` dict; the dict's
# opening entries and the function header are outside this view.
        'output_demo_file': 'output/settlements_1952-2016_2017-01_profiles.csv.gz',
        # Columns whose combination identifies one individual in this file.
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS",
            "star", "current_status", "officer_id", "service_years",
            "service_months", "suffix_name", "cost", "rank", "race", "gender"
        ],
        'id': 'settlements_1952-2016_2017-01_ID'
    }

    # Guard against mis-specified relative paths before running setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
# Tag each row with a unique settlement id, write the tagged data, then
# emit one profile row per id.
df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols)
profiles_df.to_csv(cons.output_demo_file, **cons.csv_opts)
# NOTE(review): chunk begins inside get_setup()'s `args` dict; the dict's
# opening entries and the function header are outside this view.
        'output_profiles_file': 'output/TRR-officers_2004-2016_2016-09_profiles.csv.gz',
        # Columns whose combination identifies one officer in this file.
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS",
            "middle_initial", 'middle_initial2', "suffix_name",
            "appointed_date", "gender", "race", "current_star"
        ],
        'id': 'TRR-officers_2004-2016_2016-09_ID'
    }

    # Guard against mis-specified relative paths before running setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
# Tag rows with a unique officer id, write the tagged data, then emit one
# aggregated profile row per id.
df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

agg_df = aggregate_data(df, cons.id, cons.id_cols)
agg_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Rows whose first_name is literally 'POLICE' are hidden officers; they,
# and rows missing a first name entirely, must not be merged.
hidden_df = df[df['first_name'] == 'POLICE']
log.info('{} hidden officers marked as merge = 0'.format(hidden_df.shape[0]))
log.info(('{} officer with no name marked as merge = 0'
          '').format(df[df['first_name'].isnull()].shape[0]))

# Everyone with a real, non-'POLICE' name is merge-eligible; everyone
# else falls through to 0 via fillna.
mergeable = df['first_name'].notnull() & (df['first_name'] != 'POLICE')
df.loc[mergeable, 'merge'] = 1
df['merge'] = df['merge'].fillna(0)

df = assign_unique_ids(df, cons.id, cons.id_cols + ['merge'],
                       conflict_cols=cons.conflict_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols + ['merge'],
                             max_cols=cons.max_cols + cons.conflict_cols,
                             merge_cols=cons.merge_cols,
                             merge_on_cols=cons.merge_on_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
# NOTE(review): chunk begins inside get_setup()'s `args` dict; the dict's
# opening entries and the function header are outside this view.
        'output_profiles_file': 'output/complaints-investigators_2000-2016_2016-11_profiles.csv.gz',
        # Columns whose combination identifies one investigator.
        'id_cols': [
            'first_name', 'last_name', 'appointed_date',
            'first_name_NS', 'last_name_NS', 'middle_initial'
        ],
        # Columns aggregated by taking the maximum across an id's rows.
        'max_cols': ['current_unit', 'current_star', 'current_rank'],
        'id': 'complaints-investigators_2000-2016_2016-11_ID'
    }

    # Guard against mis-specified relative paths before running setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
# Tag rows with a unique investigator id, write the tagged data, then
# emit one profile row per id (max-aggregating the current_* columns).
df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols,
                             max_cols=cons.max_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
# NOTE(review): chunk begins inside get_setup()'s `args` dict; the dict's
# opening entries and the function header are outside this view.
        'id': 'subject_ID'
    }

    # Guard against mis-specified relative paths before running setup.
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
# Group rows first (writes the group id into the first id_cols column --
# presumably via connected components; TODO confirm against union_group),
# then assign unique subject ids, splitting unresolved conflicts into
# distinct ids rather than merging them.
df = union_group(df, cons.id_cols[0], cons.group_cols)
log.info('%d group_ids' % df[cons.id_cols[0]].nunique())
df = assign_unique_ids(df, cons.id, cons.id_cols, cons.conflict_cols,
                       log=log, unresolved_policy='distinct')
adf = aggregate_data(df, cons.id, id_cols=cons.id_cols,
                     max_cols=cons.max_cols + cons.conflict_cols)
df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Manually flag one known individual (Robert E Smith) so that AUID treats
# his rows, spread across two units, as a single person.
df["Specify"] = 0
smith_units = [5, 602]
is_smith = ((df["first_name"] == "ROBERT") &
            (df["last_name"] == "SMITH") &
            (df["middle_initial"] == "E") &
            (df["birth_year"] == 1947) &
            (df["appointed_date"] == "1971-02-22") &
            (df["unit"].isin(smith_units)))
df.loc[is_smith, "Specify"] = 1
log.info(("Robert E Smith 1947 1971-02-22 in units {}"
          " specified as singular individual.").format(smith_units))

df = assign_unique_ids(df, cons.id, cons.id_cols + ["Specify"],
                       cons.conflict_cols, log=log)
# The helper column has served its purpose; drop it before aggregation.
del df["Specify"]
log.info(("Specify column used to manually distinguish individuals"
          " created for AUID then dropped before aggregation"))

df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols,
                             max_cols=cons.conflict_cols + cons.max_cols,
                             current_cols=cons.current_cols,
                             time_col=cons.time_col)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
def combine_histories(uh_list, resignation_df, log,
                      uid='UID', unit='unit',
                      start='unit_start_date', end='unit_end_date',
                      resignation_col='resignation_date'):
    """Combines multiple unit history dataframes into one
    containing unique unit movements for individuals,
    removing non-sensical data and filling missing data

    Parameters
    ----------
    uh_list : list
        List of unit history pandas DataFrame
    resignation_df : pandas DataFrame
        Contains data on resignation dates
    log : logging object
    uid : str
        Name of unique ID column in unit history and resignation date
        DataFrames
    unit : str
        Name of unit column in unit history DataFrames in uh_list
    start : str
        Name of unit start date column in unit history DataFrames in uh_list
    end : str
        Name of unit end date column in unit history DataFrames in uh_list
    resignation_col : str
        Name of resignation date column in resignation_df DataFrames

    Returns
    -------
    uh_df : pandas DataFrame
    """
    from assign_unique_ids_functions import aggregate_data
    cleaned = []
    for df in uh_list:
        df = df.loc[:, [uid, unit, start, end]]
        df.dropna(subset=[unit, uid, start], how='any', inplace=True)
        log.info(('%d rows with non-NA end date and end date '
                  'before/equal to start date'
                  ''),
                 df[(df[end].notnull()) & (df[end] <= df[start])].shape[0])
        # An end date on or before the start date is non-sensical: clear
        # it so it can be filled from resignation data below.
        df.loc[(df[end].notnull()) & (df[end] <= df[start]), end] = np.nan
        cleaned.append(df)
    # Concatenate once instead of appending inside the loop:
    # DataFrame.append was deprecated (removed in pandas 2.0) and the
    # loop form is quadratic.
    uh_df = pd.concat(cleaned) if cleaned else pd.DataFrame()
    uh_df.drop_duplicates(inplace=True)
    uh_df = uh_df.merge(resignation_df, on=uid, how='left')
    # Fill missing end dates with the resignation date, but only when the
    # unit spell started before the resignation.
    indexes = ((uh_df[resignation_col].notnull()) &
               (uh_df[end].isnull()) &
               (uh_df[start] < uh_df[resignation_col]))
    uh_df.loc[indexes, end] = uh_df.loc[indexes, resignation_col]
    uh_df.drop(resignation_col, axis=1, inplace=True)
    # Split singleton (uid, start, unit) rows from duplicated ones, and
    # collapse the duplicates by taking the max end date per group.
    uh_rd = remove_duplicates(uh_df, [uid, start, unit])
    uh_kd = keep_duplicates(uh_df, [uid, start, unit])
    uh_kd = aggregate_data(uh_kd, uid=uid,
                           id_cols=[start, unit],
                           max_cols=[end])
    assert uh_rd.shape[0] + uh_kd.shape[0] ==\
        uh_df[[uid, unit, start]].drop_duplicates().shape[0],\
        'Data set lost information after split and aggregation.'
    uh_df = pd.concat([uh_rd, uh_kd])
    uh_df.sort_values([uid, start, unit], inplace=True)
    uh_df.reset_index(drop=True, inplace=True)
    return uh_df