Esempio n. 1
0
def test_union_group():
    '''Check union_group's returned frame and that the input frame is unchanged.'''
    row_labels = [100, 11, 12, 13, 15, 66, 7]
    value_cols = ['A', 'B', 'C']
    value_rows = [
        (1, 1, 1),
        (1, 2, 1),
        (np.nan, 4, 1),
        (4, 4, np.nan),
        (np.nan, np.nan, 10),
        (0, 0, 10),
        (9, 10, np.nan),
    ]
    # Expected 'gid' value for each row of value_rows, in the same order.
    expected_gids = [1, 1, 1, 1, 2, 2, 3]

    input_df = pd.DataFrame(value_rows, columns=value_cols, index=row_labels)
    # Snapshot taken before the call so any in-place mutation is detected.
    orig_input_df = copy.deepcopy(input_df)
    input_args = {'gid': 'gid', 'cols': ['A', 'B', 'C']}

    # The expected output is the input rows with the group id appended
    # as a trailing 'gid' column, under the same index labels.
    expected_rows = [row + (gid,) for row, gid in zip(value_rows, expected_gids)]
    output_df = pd.DataFrame(
        expected_rows,
        columns=value_cols + ['gid'],
        index=row_labels)

    results = general_utils.union_group(input_df, **input_args)
    assert orig_input_df.equals(input_df)
    assert results.equals(output_df)
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

# Load the input table from the path held in the setup constants.
df = pd.read_csv(cons.input_file)

# Two assign_unique_ids passes, each with its own keyword settings:
# first for cons.cr_uid, then for cons.ind_uid.
df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)

# Run union_group on the distinct (cr_uid, ind_uid) pairs only, then merge
# its result back onto every row by those same two columns.
# NOTE(review): presumably union_group adds a cons.id column - confirm.
uid_cols = [cons.cr_uid, cons.ind_uid]
unique_pairs = df[uid_cols].drop_duplicates()
# A fresh list is passed so union_group never shares uid_cols itself.
udf = union_group(unique_pairs, cons.id, list(uid_cols))
df = df.merge(udf, on=uid_cols)
df = df.drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

# Build the profiles table from the merged rows and write it out separately.
max_cols = cons.ind_auid['conflict_cols'] + ['star']
profiles_df = aggregate_data(df,
                             cons.id,
                             cons.ind_auid['id_cols'],
                             max_cols=max_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
        'id': 'subject_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

# Load the input table from the path held in the setup constants.
df = pd.read_csv(cons.input_file)

# The first entry of cons.id_cols is the name handed to union_group;
# the number of distinct values in that column is then logged.
group_col = cons.id_cols[0]
df = union_group(df, group_col, cons.group_cols)
log.info('%d group_ids' % df[group_col].nunique())

df = assign_unique_ids(
    df,
    cons.id,
    cons.id_cols,
    cons.conflict_cols,
    log=log,
    unresolved_policy='distinct')

# aggregate_data runs before either file is written, matching the original
# statement order.
profile_max_cols = cons.max_cols + cons.conflict_cols
adf = aggregate_data(
    df,
    cons.id,
    id_cols=cons.id_cols,
    max_cols=profile_max_cols)

df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)