# ------------- don't edit below here -----------------------------

gen_func.start_logging(output_dir)

logging.info('Starting scripts to analyze aadhaar data...')

# combine all csv into one dataframe
case_df = gen_func.csv_files_to_df(target_dir, case_data_regex, case_date_cols,
                                   cols_to_use)

# clean case data and start to get age distribution information
output_dict = {}
case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict)
case_clean_df = case_func.add_age_info(case_clean_df)
location_column_names = ['doc_id', 'district_name']
case_clean_df = gen_func.add_locations(case_clean_df, 'owner_id',
                                       location_column_names)
case_clean_df = case_clean_df.loc[(
    case_clean_df['state_name'].isin(real_state_list))]
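
# gen_func.add_locations isn't shown in this snippet; a minimal sketch of
# the pattern it appears to follow, assuming a location fixture csv keyed
# by doc_id (the fixture path and merge details are assumptions):
import pandas as pd

def add_locations_sketch(df, id_column, location_column_names,
                         fixture_csv='location_fixture.csv'):
    """Left-merge location columns onto df using id_column -> doc_id."""
    locations = pd.read_csv(fixture_csv, usecols=location_column_names)
    return df.merge(locations, how='left',
                    left_on=id_column, right_on='doc_id')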

logging.info(case_clean_df['sex'].value_counts())
logging.info(case_clean_df['age_bracket'].value_counts())
clean_case_age_dist = case_clean_df.groupby(['age_bracket',
                                             'sex']).count()['caseid']
logging.info(clean_case_age_dist)
#case_clean_df = case_clean_df[(case_clean_df['district_name'] == 'WestGodavari')]
logging.info('------ FEMALE 15-49 -----------------')
target_clean_df = case_clean_df[(case_clean_df['sex'] == 'F') &
                                (case_clean_df['age_bracket'] == '15-49 yrs')]
logging.info(target_clean_df['sex'].value_counts())
logging.info(target_clean_df['age_bracket'].value_counts())
logging.info(target_clean_df['has_rch'].value_counts())
# start_date: idxmax over the boolean frame returns the first column that is
# True on each row, which gives the start date
# last_submission: reverse the column order and take idxmax again, which
# gives the last column with a submission (the end date)
# do both in one assign so adding the start_date column doesn't interfere
# with last_submission
forms_df = forms_df.assign(
    start_date=forms_df.astype(int).astype(bool).idxmax(axis=1),
    last_submission=forms_df.astype(int).astype(bool)[
        forms_df.columns[::-1]].idxmax(axis=1))
forms_df['active_date'] = forms_df['start_date'].apply(lambda x: str(
    (pd.to_datetime(x) + np.timedelta64(startup_period, 'D')).date()))
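
# A toy illustration of the idxmax trick above (data and column names are
# made up): idxmax over a boolean frame returns the first True column per
# row, and reversing the column order before idxmax returns the last.
import pandas as pd
toy = pd.DataFrame({'2017-01-01': [0, 2], '2017-01-02': [3, 0],
                    '2017-01-03': [1, 4]}, index=['awc_a', 'awc_b'])
first_col = toy.astype(bool).idxmax(axis=1)   # awc_a -> '2017-01-02'
last_col = toy.astype(bool)[toy.columns[::-1]].idxmax(axis=1)
# last_col: awc_a -> '2017-01-03', awc_b -> '2017-01-03'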

# add locations to data and set indices
forms_df.index = forms_df.index.rename('awc_id')
location_columns = ['doc_id', 'block_name', 'district_name', 'state_name']
locations = location_columns[1:]
forms_df = gf.add_locations(forms_df, None, location_columns, True)
# get rid of not real states
forms_df = forms_df.loc[(forms_df['state_name'].isin(real_state_list))]

#-----------------  daily form submissions ----------------------------
# days_inactive: last date in the data minus the last_submission date
last_date = pd.to_datetime(date_cols[-1])
forms_df = forms_df.assign(days_inactive=lambda x: (
    (last_date - pd.to_datetime(x.last_submission)) / np.timedelta64(1, 'D')))
forms_df = forms_df.assign(days_since_start=lambda x: (
    (last_date - pd.to_datetime(x.start_date)) / np.timedelta64(1, 'D')) + 1)
forms_df = forms_df.assign(days_since_active=lambda x: (
    (last_date - pd.to_datetime(x.active_date)) / np.timedelta64(1, 'D')) + 1)
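
# Sketch of the timedelta-to-float conversion used above, with made-up
# dates: dividing a Timedelta by np.timedelta64(1, 'D') yields float days.
import numpy as np
import pandas as pd
lag = pd.to_datetime('2017-03-10') - pd.to_datetime('2017-03-07')
lag_days = lag / np.timedelta64(1, 'D')   # 3.0
lag_hrs = lag / np.timedelta64(1, 'h')    # 72.0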

# reorder columns
column_order = (locations +
                ['start_date', 'active_date', 'last_submission',
                 'days_inactive', 'days_since_start', 'days_since_active'])
forms_df = forms_df[column_order]

#----------------------------------------------------------------------

tasks_in_df = gf.forms_to_df(os.path.join(data_dir, 'tasks'),
                             case_data_regex, task_date_cols, task_cols)
logging.info('Percentage of closed values out of %i cases' %
             tasks_in_df.shape[0])
logging.info(tasks_in_df['closed'].value_counts(dropna=False) * 100. /
             tasks_in_df.shape[0])
logging.info('Percentage of tasks by type out of %i cases' %
             tasks_in_df.shape[0])
logging.info(tasks_in_df['tasks_type'].value_counts(dropna=False) * 100. /
             tasks_in_df.shape[0])

# get latest location fixture and add location data
logging.info('Adding location data and removing test data ...')
if refresh_locations:
    gf.refresh_locations()
tasks_in_df = gf.add_locations(tasks_in_df, 'owner_id', location_columns)
tasks_in_df = tasks_in_df.loc[(
    tasks_in_df['state_name'].isin(real_state_list))]

#---------------------------------------------------------------------------------
#-------------------------- PREGNANCY IMMUNS -------------------------------------
#---------------------------------------------------------------------------------

preg_tasks_df = tasks_in_df[(tasks_in_df['tasks_type'] == 'pregnancy')]
num_preg = preg_tasks_df.shape[0]
logging.info('Analyzing %i pregnancy task cases' % num_preg)

logging.info(preg_tasks_df['closed'].value_counts(dropna=False) * 100. /
             num_preg)
logging.info(preg_tasks_df['schedule_flag'].value_counts(dropna=False) * 100. /
             num_preg)

#----------------------------------------------------------------------

date_cols = ['form_date', 'form_time', 'received_on']
col_names = date_cols + ['awc_id']
location_columns = ['doc_id', 'state_name', 'district_name', 'block_name']
file_list = ff.file_subset_by_date(start_date, end_date, target_dir, data_regex)
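
# ff.file_subset_by_date isn't shown here; a minimal sketch of what such a
# helper could look like, assuming filenames embed a parseable date that
# the regex captures in group 1 (all names here are assumptions):
import os
import re
import pandas as pd

def file_subset_by_date_sketch(start_date, end_date, target_dir, regex):
    """Return files in target_dir whose embedded date is in range."""
    subset = []
    for name in sorted(os.listdir(target_dir)):
        match = re.match(regex, name)
        if match:
            file_date = pd.to_datetime(match.group(1))
            if (pd.to_datetime(start_date) <= file_date
                    <= pd.to_datetime(end_date)):
                subset.append(name)
    return subset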

# iterate through files to build single dataframe
for data_file in file_list:
    file_date = data_file[5:-4]
    logging.info('Going through data for: %s' % data_file)
    # get data into input file
    input_df = pd.read_csv(os.path.join(target_dir, data_file), 
                           usecols=col_names, parse_dates=date_cols,
                           infer_datetime_format=True)
    
    # add locations to data and set indices
    input_df = gf.add_locations(input_df, 'awc_id', location_columns)
    input_df = input_df.loc[(input_df['state_name'].isin(real_state_list))]
        
    # add a few columns we care about - convert the time lag to decimal hours
    input_df['time_lag'] = input_df['received_on'] - input_df['form_time']
    input_df['time_lag_hrs'] = input_df['time_lag'] / np.timedelta64(1, 'h')
    
    # add buckets
    input_df['time_bucket'] = pd.cut(input_df['time_lag_hrs'], bins, labels=bin_names)
    
    # create output dataframes we care about
    # by state
    state_df = input_df.groupby(
        ['form_date', 'state_name'])[['time_lag_hrs']].mean()
    state_df['median'] = input_df.groupby(
        ['form_date', 'state_name'])['time_lag_hrs'].median()
    state_buckets = (input_df.groupby('state_name')['time_bucket']
                     .value_counts().unstack().fillna(0))
    state_buckets['total'] = state_buckets.sum(axis=1)
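
# A toy version of the per-state bucketing above (bins and bin_names are
# defined elsewhere; the values here are assumptions for illustration):
import numpy as np
import pandas as pd
bins_demo = [0, 1, 24, np.inf]
bin_names_demo = ['<1 hr', '1-24 hrs', '>24 hrs']
demo = pd.DataFrame({'state_name': ['A', 'A', 'B'],
                     'time_lag_hrs': [0.5, 30.0, 2.0]})
demo['time_bucket'] = pd.cut(demo['time_lag_hrs'], bins_demo,
                             labels=bin_names_demo)
demo_buckets = (demo.groupby('state_name')['time_bucket']
                .value_counts().unstack().fillna(0))
# demo_buckets: one row per state, one column per time bucket

#----------------------------------------------------------------------
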
for data_file in file_list:
    logging.info('Seeing if %s is already in output' % data_file)
    file_date = data_file[5:-4]
    # pandas parses date strings flexibly, so a d.M.Y date from the filename
    # can still match a Y-M-D entry in the index
    if file_date not in forms_df.index or recalc is True:
        logging.info('Going through data for: %s' % data_file)
        logging.debug(time.strftime('%X %x'))

        # get data
        input_df = pd.read_csv(os.path.join(target_dir, data_file),
                               usecols=col_names,
                               parse_dates=date_cols,
                               infer_datetime_format=True)

        # add locations to data and set indices
        input_df = gf.add_locations(input_df, 'awc_id', location_columns)
        input_df = input_df.loc[(input_df['state_name'].isin(real_state_list))]

        # apply the state filter if one is set
        if filter_by_state:
            input_df = input_df.loc[input_df['state_name'] == state_filter]

        input_df = input_df.set_index(['form_date', 'state_name'])

        # minimize data to just the counts
        min_in_df = input_df.groupby(['form_date', 'state_name']).sum()

        # put data into the categories we care about
        min_in_df['due_list'] = min_in_df['due_list_ccs'] + \
                                min_in_df['due_list_child']
        min_in_df['hh_mng'] = min_in_df['add_household'] + \
Example n. 6
def clean_case_data(df, output_dict, awc_test=True):
    '''
    Clean case data if columns to do so exist in the data files.

    Can remove from df: closed cases, cases not in a real state,
    cases belonging to a user with 'test' in the username

    Informational: # of person cases without an associated household parent id

    Parameters
    ----------
    df : pandas dataframe
      Dataframe of case data to clean

    output_dict : dictionary
      Dictionary to append key/value pairs to

    Returns
    -------
    df : pandas dataframe
      Cleaned dataframe of case data

    output_dict : dictionary
      Updated dictionary with key/value pairs describing the cleaning

    '''
    logging.info('Cleaning case data...')
    orig_rows = len(df.index)
    logging.info('%i rows found' % orig_rows)

    # get rid of any closed cases by looking for 'closed'==True
    if 'closed' in df.columns.tolist():
        num_closed = (df['closed'] == True).sum()
        logging.info('%i closed removed' % num_closed)
        df = df[df['closed'] != True]
    else:
        logging.info('Not removing closed cases, closed column not found.')
        num_closed = np.nan

    # due to the way data is collected, indices.household isn't a good detector
    # of orphan cases.  need to revisit after talking more with simon
    '''
    # cases wo associated household parent case ids, 'indices.household' isnull
    # DO NOT remove from dataframe - return number as informational only
    if 'indices.household' in df.columns.tolist():
        num_wo_hh_id = df[df['indices.household'].isnull()].shape[0]
        logging.info('%i with no household parent cases (not removed from df)'
                     % num_wo_hh_id)
        # df = df[df['indices.household'].notnull()]
    else:
        logging.info('Not analyzing blank parent household ids, '
                     'indices.household column not found.')
        num_wo_hh_id = np.nan
    '''

    # cases with a blank name
    if 'name' in df.columns.tolist():
        orig_df_len = len(df.index)
        df = value_is_blank(df, 'name')
        num_all_numeric, all_num_df, all_num_index = string_contains(
            df, 'name', r'^\d+$', 'only numeric chars in name', index_out=True)
        df = df[~all_num_index]
        num_numeric, some_num_df, some_num_index = string_contains(
            df, 'name', r'\d+', 'some numeric chars in name', index_out=True)
        # df = df[~some_num_index]
        logging.info('Not removing names with at least one numeric character')
        num_blank_name = orig_df_len - len(df.index)

        if num_all_numeric != 0 or num_numeric != 0:
            output_frames = []
            output_frames.append(all_num_df)
            output_frames.append(some_num_df)
            error_df = pd.concat(output_frames, ignore_index=True, copy=False)
            name_err_filename = 'numeric_case_names_' + str(
                datetime.date.today()) + '.csv'
            logging.info('Creating csv output of names with numbers at %s' %
                         name_err_filename)
            error_df.to_csv(name_err_filename, encoding='utf-8')
    else:
        logging.info('Not analyzing for blank names, name column not found.')
        num_blank_name = np.nan

    # test state locations or that aren't an AWC
    # add temp location columns, filter out, then remove location columns
    if 'owner_id' in df.columns.tolist():
        pre_test_loc_total = len(df.index)
        location_columns = ['doc_id', 'state_name', 'awc_name']
        df = gen_func.add_locations(df, 'owner_id', location_columns)
        real_state_list = [
            'Uttar Pradesh', 'Madhya Pradesh', 'Chhattisgarh',
            'Andhra Pradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'Maharashtra'
        ]
        logging.debug('Cases by state:')
        logging.debug(df['state_name'].value_counts(dropna=False))
        # capture the rows about to be removed so they can be inspected
        non_realstate_df = df.loc[~df['state_name'].isin(real_state_list)]
        df = df.loc[(df['state_name'].isin(real_state_list))]
        num_test_locations = pre_test_loc_total - len(df.index)
        df = df.drop(['state_name'], axis=1)
        logging.info(
            '%i cases in test locations or that are not AWCs removed' %
            num_test_locations)
        # if test locations were found, send them to a csv to inspect
        if num_test_locations != 0:
            filename = 'non_realstate_case_' + str(
                datetime.date.today()) + '.csv'
            logging.info(
                'Creating csv output of presumed test locations at %s' %
                filename)
            non_realstate_df.to_csv(filename)
    else:
        logging.info('Not removing test or non-AWC locations, owner_id column '
                     'not found.')
        num_test_locations = np.nan

    # look for the string 'test' in the username (or awc/owner name)
    if 'username' in df.columns.tolist():
        num_test_users = df[df['username'].str.contains(
            'test', na=False)].shape[0]
        logging.info('%i with _test_ in username removed' % num_test_users)
        df = df[~df['username'].str.contains('test', na=False)]
    elif 'awc_name' in df.columns.tolist():
        num_test_users = df[df['awc_name'].str.contains(
            'test', na=False)].shape[0]
        logging.info('%i with _test_ in awc_name removed' % num_test_users)
        df = df[~df['awc_name'].str.contains('test', na=False)]
    elif 'owner_name' in df.columns.tolist():
        num_test_users = df[df['owner_name'].str.contains(
            'test', na=False)].shape[0]
        logging.info('%i with _test_ in owner_name removed' % num_test_users)
        df = df[~df['owner_name'].str.contains('test', na=False)]
    else:
        logging.info('Not removing usernames for test users, neither username '
                     'nor awc_name nor owner_name column found.')
        num_test_users = np.nan

    # look for locations for the case that aren't associated with awcs
    if (('commcare_location_id' in df.columns.tolist()
         or 'owner_id' in df.columns.tolist()) and awc_test is True):
        non_awc_num, non_awc_df, non_awc_index = search_non_awc_owners(
            df, index_out=True)
        df = df[~non_awc_index]
    else:
        logging.info('Not removing non-awc locations, neither owner_id nor '
                     'commcare_location_id found.')
        non_awc_num = np.nan

    # prepare outputs
    num_clean_rows = len(df.index)
    logging.info('Returning dataframe with %i rows' % num_clean_rows)
    output_dict.update({
        'orig_rows': orig_rows,
        # 'num_wo_hh_id': num_wo_hh_id,
        'num_blank_name': num_blank_name,
        'non_awc_num': non_awc_num,
        'num_closed': num_closed,
        'num_test_locations': num_test_locations,
        'num_test_users': num_test_users,
        'num_clean_rows': num_clean_rows
    })
    return df, output_dict
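
# A minimal usage sketch for clean_case_data (the toy columns below are
# assumptions; cleaning steps whose columns are missing are skipped):
import pandas as pd
toy_cases = pd.DataFrame({'closed': [False, True, False],
                          'name': ['asha', '12345', 'meena']})
toy_clean, toy_out = clean_case_data(toy_cases, {}, awc_test=False)
# toy_out now holds counts like 'orig_rows', 'num_closed', 'num_blank_name'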
Example n. 7
                ['owner_id']).sum()

            # add current cycle to the output
            person_case_df = person_case_df.append(person_input_df)

            logging.info(person_case_df.shape)

    # add the person case data to the existing df
    case_df = case_df.join(person_case_df)
    case_types.append('person')
case_df = case_df.fillna(0)

# get latest location fixture and add location data
if refresh_locations:
    gf.refresh_locations()
case_df = gf.add_locations(case_df, None, location_columns)
case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))]

# get overall stats for case loads
overall_stats = case_df.describe()
logging.info(overall_stats)
overall_stats['state'] = 'all_data'
stats_df = stats_df.append(overall_stats)

# get distribution graphs for each type of case
# case_df.plot.hist(bins=100, alpha=0.2, xlim=(0,250)).set_title('Open Case Distribution by User')
axes = case_df.hist(figsize=[10, 8], bins=25)
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.suptitle('Case Load - All AWW Users')
y = axes[0, 0].set_ylabel(ylabel='Frequency', labelpad=None)
y = axes[1, 0].set_ylabel(ylabel='Frequency', labelpad=None)

output_df = pd.DataFrame()
output_df = output_df.fillna('')
file_list = gf.data_file_list(data_dir, data_regex)
gf.start_logging(data_dir)

# combine each csv, with only columns necessary, into single dataframe
for data_file in file_list:
    # get data
    logging.info('going through %s' % data_file)
    input_df = pd.read_csv(os.path.join(data_dir, data_file),
                           usecols=cols_to_use, parse_dates=date_cols,
                           infer_datetime_format=True)
    output_df = output_df.append(input_df)
    print(output_df.shape)
    
# add location in order to get rid of test locations
output_df = gf.add_locations(output_df, 'userID', location_columns)
output_df = output_df.loc[(output_df['state_name'].isin(real_state_list))]

# perform the calculations we're interested in
output_df['days_since_add'] = (
    output_df['form.case_load_ccs_record0.case.@date_modified'] -
    output_df['form.add']) / np.timedelta64(1, 'D')

output_df.to_csv(os.path.join(data_dir, output_name))
logging.info('all files combined, output saved to directory')
    
    
# start logging
gf.start_logging(output_dir)

# initialize dfs
input_df = pd.DataFrame()
input_df = input_df.fillna('')
input_df = gf.csv_files_to_df(data_dir, case_data_regex, date_cols, cols_to_use)
    
# only keep open cases
case_df = input_df[input_df['closed'] == False]

# get latest location fixture and add location data
if refresh_locations:
    gf.refresh_locations()
case_df = gf.add_locations(case_df, 'owner_id', location_columns)
case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))]

# get caste percentages to df for output
# copy so the appends below don't also mutate cols_to_use
drop_cols = list(cols_to_use)
drop_cols.append('state_name')
drop_cols.append('district_name')
blocks = case_df['block_name'].unique().tolist()
new_loc_cols = ['block_name', 'district_name', 'state_name']
denom = case_df['hh_caste'].count()
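
# A sketch of the share calculation this section builds toward (the exact
# output layout is an assumption): percent of cases per hh_caste value.
caste_pct = case_df['hh_caste'].value_counts() * 100. / denom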

for item in data_cols:
    logging.info('Going through %s' % item)
    loc_df = (case_df.drop(drop_cols, axis=1)
              .drop_duplicates('block_name').set_index('block_name'))
    category = case_df[item].value_counts().index.tolist()
    for cat in category:
Example n. 10
            phone_clean_df, phone_output_dict, deep_dive_dups)
        logging.info('Finished initial analysis...')

        # add geographic information to bad phone num list
        real_state_list = [
            'Madhya Pradesh', 'Chhattisgarh', 'Andhra Pradesh', 'Bihar',
            'Jharkhand', 'Rajasthan'
        ]
        # , 'Uttar Pradesh', 'Maharashtra']
        if user_case:
            location_columns = [
                'doc_id', 'awc_name', 'supervisor_name', 'block_name',
                'district_name', 'state_name'
            ]
            bad_num_list = gen_func.add_locations(bad_num_list,
                                                  'commcare_location_id',
                                                  location_columns)
            bad_num_list = bad_num_list.loc[(
                bad_num_list['state_name'].isin(real_state_list))]
        else:
            location_columns = ['doc_id', 'block_name', 'district_name']
            bad_num_list = gen_func.add_locations(bad_num_list, 'owner_id',
                                                  location_columns)
            bad_num_list = bad_num_list.loc[(
                bad_num_list['state_name'].isin(real_state_list))]
            bad_num_list = bad_num_list.drop(
                ['has_aadhar', 'aadhar_number', 'raw_aadhar_string', 'name',
                 'has_rch', 'rch_id'], axis=1)
        bad_num_list.to_csv(