# ------------- don't edit below here ----------------------------- gen_func.start_logging(output_dir) logging.info('Starting scripts to analyze aadhar data...') # combine all csv into one dataframe case_df = gen_func.csv_files_to_df(target_dir, case_data_regex, case_date_cols, cols_to_use) # clean case data and start to get age distribution information output_dict = {} case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict) case_clean_df = case_func.add_age_info(case_clean_df) location_column_names = ['doc_id', 'district_name'] case_clean_df = gen_func.add_locations(case_clean_df, 'owner_id', location_column_names) case_clean_df = case_clean_df.loc[( case_clean_df['state_name'].isin(real_state_list))] logging.info(case_clean_df['sex'].value_counts()) logging.info(case_clean_df['age_bracket'].value_counts()) clean_case_age_dist = case_clean_df.groupby(['age_bracket', 'sex']).count()['caseid'] logging.info(clean_case_age_dist) #case_clean_df = case_clean_df[(case_clean_df['district_name'] == 'WestGodavari')] logging.info('------ FEMALE 15-49 -----------------') target_clean_df = case_clean_df[(case_clean_df['sex'] == 'F') & (case_clean_df['age_bracket'] == '15-49 yrs')] logging.info(target_clean_df['sex'].value_counts()) logging.info(target_clean_df['age_bracket'].value_counts()) logging.info(target_clean_df['has_rch'].value_counts())
# start_date: column index of max on each row, which gives the start date # last_submission: reverse the order of columns and find the column index of # max on each row again, which gives the end date # do in one step so adding start date column doesn't interfere with last sub. forms_df = forms_df.assign( start_date=forms_df.astype(int).astype(bool).idxmax(1), last_submission=forms_df.astype(int).astype(bool)[ forms_df.columns[::-1]].idxmax(1)) forms_df['active_date'] = forms_df['start_date'].apply(lambda x: str( (pd.to_datetime(x) + np.timedelta64(startup_period, 'D')).date())) # add locations to data and set indices forms_df.index.rename('awc_id', True) location_columns = ['doc_id', 'block_name', 'district_name', 'state_name'] locations = location_columns[1:] forms_df = gf.add_locations(forms_df, None, location_columns, True) # get rid of not real states forms_df = forms_df.loc[(forms_df['state_name'].isin(real_state_list))] #----------------- daily form submissions ---------------------------- # days_inactive: today minus last_submission date last_date = pd.to_datetime(date_cols[-1]) forms_df = forms_df.assign(days_inactive=lambda x: ( (last_date - pd.to_datetime(x.last_submission)) / np.timedelta64(1, 'D'))) forms_df = forms_df.assign(days_since_start=lambda x: ( (last_date - pd.to_datetime(x.start_date)) / np.timedelta64(1, 'D')) + 1) forms_df = forms_df.assign(days_since_active=lambda x: ( (last_date - pd.to_datetime(x.active_date)) / np.timedelta64(1, 'D')) + 1) # reorder columns column_order = (locations + \
# Load task cases, log closed/type percentage breakdowns, then slice out
# pregnancy tasks for further analysis.
# NOTE(review): recovered from a whitespace-mangled source; line breaks
# reconstructed - verify against the original file.
tasks_in_df = gf.forms_to_df(os.path.join(data_dir, 'tasks'), case_data_regex,
                             task_date_cols, task_cols)
logging.info('Percentage of closed values out of %i cases'
             % tasks_in_df.shape[0])
logging.info(tasks_in_df['closed'].value_counts(dropna=False) * 100.
             / tasks_in_df.shape[0])
logging.info('Percentage of tasks by type out of %i cases'
             % tasks_in_df.shape[0])
logging.info(tasks_in_df['tasks_type'].value_counts(dropna=False) * 100.
             / tasks_in_df.shape[0])
# get latest location fixture and add location data
logging.info('Adding location data and removing test data ...')
if refresh_locations:
    gf.refresh_locations()
tasks_in_df = gf.add_locations(tasks_in_df, 'owner_id', location_columns)
tasks_in_df = tasks_in_df.loc[(
    tasks_in_df['state_name'].isin(real_state_list))]
#---------------------------------------------------------------------------------
#-------------------------- PREGNANCY IMMUNS -------------------------------------
#---------------------------------------------------------------------------------
preg_tasks_df = tasks_in_df[(tasks_in_df['tasks_type'] == 'pregnancy')]
num_preg = preg_tasks_df.shape[0]
logging.info('Analyzing %i pregnancy task cases' % num_preg)
logging.info(preg_tasks_df['closed'].value_counts(dropna=False) * 100.
             / num_preg)
logging.info(preg_tasks_df['schedule_flag'].value_counts(dropna=False) * 100.
             / num_preg)
# Per-file loop: compute form submission lag (received_on - form_time) in
# hours and bucket it by state.
# NOTE(review): recovered from a whitespace-mangled source; loop indentation
# reconstructed - verify against the original file.
date_cols = ['form_date', 'form_time', 'received_on']
col_names = date_cols + ['awc_id']
location_columns = ['doc_id', 'state_name', 'district_name', 'block_name']
file_list = ff.file_subset_by_date(start_date, end_date, target_dir,
                                   data_regex)
# iterate through files to build single dataframe
for data_file in file_list:
    # filename convention appears to embed the date at [5:-4] - confirm
    file_date = data_file[5:-4]
    logging.info('Going through data for: %s' % data_file)
    # get data into input file
    input_df = pd.read_csv(os.path.join(target_dir, data_file),
                           usecols=col_names, parse_dates=date_cols,
                           infer_datetime_format=True)
    # add locations to data and set indices
    input_df = gf.add_locations(input_df, 'awc_id', location_columns)
    input_df = input_df.loc[(input_df['state_name'].isin(real_state_list))]
    # add a few columns we care about - convert to days decimal point
    input_df['time_lag'] = input_df['received_on'] - input_df['form_time']
    input_df['time_lag_hrs'] = input_df['time_lag'] / np.timedelta64(1, 'h')
    # add buckets
    input_df['time_bucket'] = pd.cut(input_df['time_lag_hrs'], bins,
                                     labels=bin_names)
    # create output dataframes we care about
    # by states
    state_df = input_df.groupby(['form_date', 'state_name']).mean()
    # NOTE(review): assigns a groupby result (a DataFrame) into a single
    # 'median' column - confirm a column selection isn't missing here
    state_df['median'] = input_df.groupby(['form_date', 'state_name']).median()
    state_buckets = input_df.groupby(
        ['state_name'])['time_bucket'].value_counts().unstack().fillna(0)
    state_buckets['total'] = state_buckets.sum(axis=1)
# Incremental ingest: skip files whose date is already in forms_df unless a
# recalc is requested.
# NOTE(review): recovered from a whitespace-mangled source; indentation
# reconstructed - verify against the original file.
for data_file in file_list:
    logging.info('Seeing if %s is already in output' % data_file)
    # filename convention appears to embed the date at [5:-4] - confirm
    file_date = data_file[5:-4]
    # pandas magically knows that d.M.Y can match Y-D-M
    if file_date not in forms_df.index or recalc is True:
        logging.info('Going through data for: %s' % data_file)
        logging.debug(time.strftime('%X %x'))
        # get data
        input_df = pd.read_csv(os.path.join(target_dir, data_file),
                               usecols=col_names, parse_dates=date_cols,
                               infer_datetime_format=True)
        # add locations to data and set indices
        input_df = gf.add_locations(input_df, 'awc_id', location_columns)
        input_df = input_df.loc[(
            input_df['state_name'].isin(real_state_list))]
        # apply state filter if exists
        if filter_by_state:
            input_df = input_df.loc[input_df['state_name'] == state_filter]
        input_df = input_df.set_index(['form_date', 'state_name'])
        # minimize data to just the counts
        min_in_df = input_df.groupby(['form_date', 'state_name']).sum()
        # put data into the categories we care about
        min_in_df['due_list'] = min_in_df['due_list_ccs'] + \
            min_in_df['due_list_child']
        # (statement continues past the end of this chunk)
        min_in_df['hh_mng'] = min_in_df['add_household'] + \
def clean_case_data(df, output_dict, awc_test=True):
    '''
    Clean case data if columns to do so exist in the data files.

    Can remove from df: closed cases, cases not in a real state, cases
    belonging to a user with 'test' in the username.

    Informational: # of person cases without an associated household parent
    id (currently disabled - see note below).

    Parameters
    ----------
    df : pandas dataframe
        Dataframe of case data to clean
    output_dict : dictionary
        Output file to append key/value data/value pairs
    awc_test : bool, optional
        When True (default), also remove cases whose owner is not an AWC
        (requires an 'owner_id' or 'commcare_location_id' column).

    Returns
    -------
    df : pandas dataframe
        Cleaned dataframe of case data
    output_dict : dictionary
        Updated output file data/value pairs describing cleaning
    '''
    logging.info('Cleaning case data...')
    orig_rows = len(df.index)
    logging.info('%i rows found' % orig_rows)

    # get rid of any closed cases by looking for 'closed'==True.
    # count via a boolean sum rather than groupby('closed').size()[True],
    # which raises KeyError when there are no closed cases at all.
    if 'closed' in df.columns.tolist():
        num_closed = int((df['closed'] == True).sum())
        logging.info('%i closed removed' % num_closed)
        df = df[df['closed'] != True]
    else:
        logging.info('Not removing closed cases, closed column not found.')
        num_closed = np.nan

    # due to the way data is collected, indices.household isn't a good
    # detector of orphan cases. need to revisit after talking more with simon
    # (disabled block - informational only, DO NOT remove rows):
    #
    # if 'indices.household' in df.columns.tolist():
    #     num_wo_hh_id = df[df['indices.household'].isnull()].shape[0]
    #     logging.info('%i with no household parent cases (not removed from '
    #                  'df)' % num_wo_hh_id)
    # else:
    #     logging.info('Not analyzing blank parent household ids, '
    #                  'indices.household column not found.')
    #     num_wo_hh_id = np.nan

    # cases with a blank name; names that are entirely numeric are removed,
    # names merely containing a digit are only reported
    if 'name' in df.columns.tolist():
        orig_df_len = len(df.index)
        df = value_is_blank(df, 'name')
        num_all_numeric, all_num_df, all_num_index = string_contains(
            df, 'name', r'^\d+$', 'only numeric chars in name',
            index_out=True)
        df = df[~all_num_index]
        num_numeric, some_num_df, some_num_index = string_contains(
            df, 'name', r'\d+', 'some numeric chars in name', index_out=True)
        # df = df[~some_num_index]
        logging.info('Not removing names with at least one numeric character')
        num_blank_name = orig_df_len - len(df.index)
        if num_all_numeric != 0 or num_numeric != 0:
            error_df = pd.concat([all_num_df, some_num_df],
                                 ignore_index=True, copy=False)
            name_err_filename = 'numeric_case_names_' + str(
                datetime.date.today()) + '.csv'
            logging.info('Creating csv output of names with numbers at %s' %
                         name_err_filename)
            error_df.to_csv(name_err_filename, encoding='utf-8')
    else:
        logging.info('Not analyzing for blank names, name column not found.')
        num_blank_name = np.nan

    # test state locations or locations that aren't an AWC
    if 'owner_id' in df.columns.tolist():
        pre_test_loc_total = len(df.index)
        location_columns = ['doc_id', 'state_name', 'awc_name']
        df = gen_func.add_locations(df, 'owner_id', location_columns)
        real_state_list = [
            'Uttar Pradesh', 'Madhya Pradesh', 'Chhattisgarh',
            'Andhra Pradesh', 'Bihar', 'Jharkhand', 'Rajasthan', 'Maharashtra'
        ]
        logging.debug('Cases by state:')
        logging.debug(df['state_name'].value_counts(dropna=False))
        # capture presumed test locations BEFORE filtering them out -
        # computing this after the .isin() filter always yielded an empty df
        non_realstate_df = df.loc[~df['state_name'].isin(real_state_list)]
        df = df.loc[(df['state_name'].isin(real_state_list))]
        num_test_locations = pre_test_loc_total - len(df.index)
        # NOTE: state_name/awc_name are deliberately left on df. A previous
        # df.drop(['state_name'], 1) call here discarded its result (no-op),
        # and downstream scripts rely on state_name remaining present, so the
        # dead call was removed rather than made effective.
        logging.info(
            '%i cases in test locations or that are not AWCs removed' %
            num_test_locations)
        # if find non test locations, lets send that to an csv to inspect
        if num_test_locations != 0:
            filename = 'non_realstate_case_' + str(
                datetime.date.today()) + '.csv'
            logging.info(
                'Creating csv output of presumed test locations at %s' %
                filename)
            non_realstate_df.to_csv(filename)
    else:
        logging.info('Not removing test or non-AWC locations, owner_id column '
                     'not found.')
        num_test_locations = np.nan

    # looks for string 'test' in any username-like column. na=False keeps
    # rows with a missing value instead of raising on boolean indexing.
    if 'username' in df.columns.tolist():
        test_mask = df['username'].str.contains('test', na=False)
        num_test_users = df[test_mask].shape[0]
        logging.info('%i with _test_ in username removed' % num_test_users)
        df = df[~test_mask]
    elif 'awc_name' in df.columns.tolist():
        test_mask = df['awc_name'].str.contains('test', na=False)
        num_test_users = df[test_mask].shape[0]
        logging.info('%i with _test_ in awc_name removed' % num_test_users)
        df = df[~test_mask]
    elif 'owner_name' in df.columns.tolist():
        test_mask = df['owner_name'].str.contains('test', na=False)
        num_test_users = df[test_mask].shape[0]
        logging.info('%i with _test_ in owner_name removed' % num_test_users)
        df = df[~test_mask]
    else:
        logging.info('Not removing usernames for test users, neither username '
                     'nor awc_name nor owner_name column found.')
        num_test_users = np.nan

    # look for locations for the case that aren't associated with awcs
    if (('commcare_location_id' in df.columns.tolist()
         or 'owner_id' in df.columns.tolist()) and awc_test is True):
        non_awc_num, non_awc_df, non_awc_index = search_non_awc_owners(
            df, index_out=True)
        df = df[~non_awc_index]
    else:
        logging.info('Not removing non-awc locations, neither owner_id nor '
                     'commcare_location_id found.')
        non_awc_num = np.nan

    # prepare outputs
    num_clean_rows = len(df.index)
    logging.info('Returning dataframe with %i rows' % num_clean_rows)
    output_dict.update({
        'orig_rows': orig_rows,
        # 'num_wo_hh_id': num_wo_hh_id,
        'num_blank_name': num_blank_name,
        'non_awc_num': non_awc_num,
        'num_closed': num_closed,
        'num_test_locations': num_test_locations,
        'num_test_users': num_test_users,
        'num_clean_rows': num_clean_rows
    })
    return df, output_dict
# NOTE(review): recovered from a whitespace-mangled source; the first line
# below is the tail of a groupby statement begun in an earlier chunk
['owner_id']).sum()
# add current cycle to the output
person_case_df = person_case_df.append(person_input_df)
logging.info(person_case_df.shape)
# add the person case data to the existing df
case_df = case_df.join(person_case_df)
case_types.append('person')
case_df = case_df.fillna(0)
# get latest location fixture and add location data
if refresh_locations:
    gf.refresh_locations()
case_df = gf.add_locations(case_df, None, location_columns)
case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))]
# get overall stats for case loads
overall_stats = case_df.describe()
logging.info(overall_stats)
overall_stats['state'] = 'all_data'
stats_df = stats_df.append(overall_stats)
# get distribution graphs for each type of case
# case_df.plot.hist(bins=100, alpha=0.2, xlim=(0,250)).set_title('Open Case Distribution by User')
axes = case_df.hist(figsize=[10, 8], bins=25)
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.suptitle('Case Load - All AWW Users')
# label the y axes of the first-column subplots
y = axes[0, 0].set_ylabel(ylabel='Frequency', labelpad=None)
y = axes[1, 0].set_ylabel(ylabel='Frequency', labelpad=None)
# , 'Uttar Pradesh', 'Maharashtra'] output_df = pd.DataFrame() output_df = output_df.fillna('') file_list = gf.data_file_list(data_dir, data_regex) gf.start_logging(data_dir) # combine each csv, with only columns necessary, into single dataframe for data_file in file_list: # get data logging.info('going through %s' % data_file) input_df = pd.read_csv(os.path.join(data_dir, data_file), usecols=cols_to_use, parse_dates=date_cols, infer_datetime_format=True) output_df = output_df.append(input_df) print(output_df.shape) # add location in order to get rid of test locations output_df = gf.add_locations(output_df, 'userID', location_columns) output_df = output_df.loc[(output_df['state_name'].isin(real_state_list))] # perform calcs interested in output_df['days_since_add'] = output_df['form.case_load_ccs_record0.case.@date_modified'] - output_df['form.add'] output_df.to_csv(os.path.join(data_dir, output_name)) logging.info('all files combined, output saved to directory')
# start logging gf.start_logging(output_dir) # initialize dfs input_df = pd.DataFrame() input_df = input_df.fillna('') input_df = gf.csv_files_to_df(data_dir, case_data_regex, date_cols, cols_to_use) # only keep open cases case_df = input_df[input_df['closed'] == False] # get latest location fixture and add location data if refresh_locations: gf.refresh_locations() case_df = gf.add_locations(case_df, 'owner_id', location_columns) case_df = case_df.loc[(case_df['state_name'].isin(real_state_list))] # get caste percentages to df for output drop_cols = cols_to_use drop_cols.append('state_name') drop_cols.append('district_name') blocks = case_df['block_name'].unique().tolist() new_loc_cols = ['block_name', 'district_name', 'state_name'] denom = case_df['hh_caste'].count() for item in data_cols: logging.info('Going through %s' % item) loc_df = case_df.drop(drop_cols, axis=1).drop_duplicates('block_name').set_index('block_name') category = case_df[item].value_counts().index.tolist() for cat in category:
# NOTE(review): recovered from a whitespace-mangled source; the first line
# below is the tail of a call begun in an earlier chunk
phone_clean_df, phone_output_dict, deep_dive_dups)
logging.info('Finished initial analysis...')
# add geographic information to bad phone num list
real_state_list = [
    'Madhya Pradesh', 'Chhattisgarh', 'Andhra Pradesh', 'Bihar', 'Jharkhand',
    'Rajasthan'
]  # , 'Uttar Pradesh', 'Maharashtra']
if user_case:
    location_columns = [
        'doc_id', 'awc_name', 'supervisor_name', 'block_name',
        'district_name', 'state_name'
    ]
    bad_num_list = gen_func.add_locations(bad_num_list,
                                          'commcare_location_id',
                                          location_columns)
    bad_num_list = bad_num_list.loc[(
        bad_num_list['state_name'].isin(real_state_list))]
else:
    # NOTE(review): this branch's location_columns omits 'state_name', yet
    # the filter below uses it - confirm state_name already exists on the
    # frame in the non-user_case path
    location_columns = ['doc_id', 'block_name', 'district_name']
    bad_num_list = gen_func.add_locations(bad_num_list, 'owner_id',
                                          location_columns)
    bad_num_list = bad_num_list.loc[(
        bad_num_list['state_name'].isin(real_state_list))]
# drop aadhar / identity columns before writing out
bad_num_list = bad_num_list.drop([
    'has_aadhar', 'aadhar_number', 'raw_aadhar_string', 'name', 'has_rch',
    'rch_id'
], axis=1)
# (call continues past the end of this chunk)
bad_num_list.to_csv(