def test_union_group():
    '''union_group leaves its input untouched and appends the expected gid.

    In the fixture, the first four rows are chained together by a value
    they share within the same column (A == 1, C == 1, B == 4), the next
    two rows share C == 10, and the last row shares nothing column-wise
    with any other row, so the expected labels are 1, 1, 1, 1, 2, 2, 3.
    '''
    row_labels = [100, 11, 12, 13, 15, 66, 7]
    value_cols = ['A', 'B', 'C']
    rows = [
        (1, 1, 1),
        (1, 2, 1),
        (np.nan, 4, 1),
        (4, 4, np.nan),
        (np.nan, np.nan, 10),
        (0, 0, 10),
        (9, 10, np.nan),
    ]
    expected_gids = [1, 1, 1, 1, 2, 2, 3]

    input_df = pd.DataFrame(rows, columns=value_cols, index=row_labels)
    # Snapshot taken before the call so in-place mutation can be detected.
    orig_input_df = copy.deepcopy(input_df)

    # Expected frame: the same rows with the group id appended as 'gid'.
    output_df = pd.DataFrame(
        [row + (gid,) for row, gid in zip(rows, expected_gids)],
        columns=value_cols + ['gid'],
        index=row_labels)

    results = general_utils.union_group(
        input_df, gid='gid', cols=list(value_cols))

    assert orig_input_df.equals(input_df)
    assert results.equals(output_df)
}  # NOTE(review): closes a literal opened above this excerpt; the enclosing def is not visible here.

    # Enforce the path convention before any work is done: the input path
    # must start with 'input/', the output path with 'output/', and both
    # must end in '.csv.gz'. (Plain asserts, so they vanish under `python -O`.)
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    # The call site below unpacks this result as (cons, log); presumably
    # this is the tail of get_setup() -- TODO confirm against the full file.
    return setup.do_setup(script_path, args)


# ---- script body: runs at import/execution time -------------------------
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Two passes of assign_unique_ids: the first writes the column named by
# cons.cr_uid, the second the column named by cons.ind_uid. Each pass takes
# its keyword arguments from the matching cons.*_auid mapping.
# NOTE(review): exact semantics of assign_unique_ids are defined elsewhere.
df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)

# Group the distinct (cr_uid, ind_uid) pairs; union_group is asked to put
# the resulting group label in the column named by cons.id.
udf = union_group(df[[cons.cr_uid, cons.ind_uid]].drop_duplicates(),
                  cons.id,
                  [cons.cr_uid, cons.ind_uid])

# Attach the group label to every row by joining on both uid columns, then
# discard the cr_uid column (ind_uid is kept).
df = df.merge(udf, on=[cons.cr_uid, cons.ind_uid])\
    .drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

# Build one profile table keyed by cons.id. The id columns come from the
# ind_auid settings; max_cols is its conflict columns plus 'star'.
# NOTE(review): presumably max_cols are reduced by taking a maximum -- confirm
# in aggregate_data.
profiles_df = aggregate_data(df, cons.id, cons.ind_auid['id_cols'],
                             max_cols=cons.ind_auid['conflict_cols'] + ['star'])
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
'id': 'subject_ID' }  # NOTE(review): last entry of a literal opened above this excerpt; the enclosing def is not visible here.

    # Enforce the path convention before any work is done: the input path
    # must start with 'input/', the output path with 'output/', and both
    # must end in '.csv.gz'. (Plain asserts, so they vanish under `python -O`.)
    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    # The call site below unpacks this result as (cons, log); presumably
    # this is the tail of get_setup() -- TODO confirm against the full file.
    return setup.do_setup(script_path, args)


# ---- script body: runs at import/execution time -------------------------
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Group rows using the columns listed in cons.group_cols; the group label is
# written to the column named by the first entry of cons.id_cols.
df = union_group(df, cons.id_cols[0], cons.group_cols)
# Log how many distinct group labels were produced.
log.info('%d group_ids' % df[cons.id_cols[0]].nunique())

# Assign the final id (column cons.id) from the id columns, passing the
# conflict columns and unresolved_policy='distinct'.
# NOTE(review): presumably 'distinct' gives unresolved conflicts separate
# ids -- confirm in assign_unique_ids.
df = assign_unique_ids(df, cons.id, cons.id_cols, cons.conflict_cols,
                       log=log, unresolved_policy='distinct')

# Build the profile table keyed by cons.id; max_cols is the configured
# max columns plus the conflict columns.
adf = aggregate_data(df, cons.id, id_cols=cons.id_cols,
                     max_cols=cons.max_cols + cons.conflict_cols)

# Write the row-level data and the aggregated profiles to their own files.
df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)