Ejemplo n.º 1
0
#output_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2\test')
#case_data_regex = re.compile(r'cases_\d\d\d.csv')

# ------------- don't edit below here -----------------------------

# Start logging first so that every later step is recorded.
gen_func.start_logging(output_dir)
logging.info('Starting scripts to analyze aadhar data...')

# Read the matching csv files in target_dir into a single dataframe.
case_df = gen_func.csv_files_to_df(
    target_dir, case_data_regex, case_date_cols, cols_to_use)

# Clean the case data, then attach age and location information.
output_dict = {}
case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict)
case_clean_df = case_func.add_age_info(case_clean_df)
location_column_names = ['doc_id', 'district_name']
case_clean_df = gen_func.add_locations(
    case_clean_df, 'owner_id', location_column_names)

# Keep only the rows whose state_name appears in real_state_list.
in_real_state = case_clean_df['state_name'].isin(real_state_list)
case_clean_df = case_clean_df.loc[in_real_state]

# Log the sex and age-bracket distributions of the remaining cases, followed
# by the number of case ids in each (age_bracket, sex) group.
logging.info(case_clean_df['sex'].value_counts())
logging.info(case_clean_df['age_bracket'].value_counts())
clean_case_age_dist = (
    case_clean_df.groupby(['age_bracket', 'sex'])['caseid'].count())
logging.info(clean_case_age_dist)
#case_clean_df = case_clean_df[(case_clean_df['district_name'] == 'WestGodavari')]
logging.info('------ FEMALE 15-49 -----------------')
target_clean_df = case_clean_df[(case_clean_df['sex'] == 'F') &
Ejemplo n.º 2
0
        phone_output_dict = {'location': location_name}

        # combine all csv into one dataframe
        case_df = gen_func.csv_files_to_df(os.path.join(target_dir, folder),
                                           case_data_regex, case_date_cols)

        if user_case:
            # Prefer the AWW phone number and fall back to the LS number when
            # the AWW value is the '---' placeholder or is missing.
            # The missing-value test has to use notnull(): a comparison such
            # as `value != np.nan` is always True because NaN never compares
            # equal to anything, so it can never trigger the fallback.
            aww_phone = case_df['aww_phone_number']
            has_aww_phone = aww_phone.notnull() & (aww_phone != '---')
            case_df['phone_number'] = aww_phone.where(
                has_aww_phone, case_df['ls_phone_number'])

        # Clean the combined case data; the call returns both the cleaned
        # dataframe and the output dict that was passed in.
        case_clean_df, phone_output_dict = case_func.clean_case_data(
            case_df, phone_output_dict, awc_test=False)

        # Optionally discard cases that were opened before the cutoff date.
        if use_opened_cutoff:
            logging.info(
                'Removing any cases opened before %s' % opened_cutoff_date)
            opened_on_or_after_cutoff = (
                case_clean_df['opened_date'] >= opened_cutoff_date)
            case_clean_df = case_clean_df[opened_on_or_after_cutoff]

        # if the case type is for a user, rather than belonging to a user
        if user_case:
            # Tag the cleaned frame, not the raw case_df: the result is
            # assigned back to case_clean_df, so passing case_df here would
            # throw away the cleaning (and any opened-date cutoff) applied
            # earlier, and the distribution logged below would no longer
            # describe "open, non-test" cases.
            # NOTE(review): assumes the cleaning step keeps the
            # 'commcare_location_id' column - confirm.
            case_clean_df = gen_func.add_usertype_from_id(
                case_clean_df, 'commcare_location_id')
            logging.info(
                'User distribution by type for open, non-test user cases:')
            logging.info(case_clean_df['location_type'].value_counts())