def file_subset_by_date(start_date, end_date, target_dir, data_regex):
    '''
    Looks within a folder of files and returns a subset of files by date range

    Parameters
    ----------
    start_date : pd.Timestamp
        Beginning date for range of dates care about
    end_date : pd.Timestamp
        End date for range of dates care about.
    target_dir : string
        Absolute path to directory with all the data
    data_regex : re object
        Regex on the type of file that contains data

    Returns
    -------
    output_list : list of strings
        List of files in the target directory whose embedded date falls
        within [start_date, end_date]
    '''
    # identify all the files to go through
    file_list = gen_func.data_file_list(target_dir, data_regex)

    # build the wanted dates once as a set for O(1) membership tests
    wanted_dates = set(pd.date_range(start_date, end_date, freq='D'))

    output_list = []
    for file_to_check in file_list:
        # filenames look like 'icds.MM.DD.YYYY.csv' -- slice out the date
        # part (assumes that fixed prefix/suffix; TODO confirm against
        # the files gen_func.data_file_list returns)
        date_str = file_to_check[5:-4]
        try:
            # parse explicitly instead of relying on DatetimeIndex
            # swallowing parse errors inside its __contains__
            file_date = pd.Timestamp(date_str)
        except (ValueError, TypeError):
            # unparseable filename date: skip it, matching the original
            # behavior where 'bad_string in date_range' was simply False
            continue
        if file_date in wanted_dates:
            output_list.append(file_to_check)
    return output_list
""" Created on Sat Jan 27 10:50:49 2018 @author: theism """ import os import pandas as pd import gen_func as gf import re import logging data_dir = r'C:\Users\theism\Downloads\[DA] Post Natal Care\[DA] Post Natal Care' data_regex = re.compile(r'Forms_\d\d\d.csv') output_df = pd.DataFrame() output_df = output_df.fillna('') output_name = 'combined_file.csv' file_list = gf.data_file_list(data_dir, data_regex) gf.start_logging(data_dir) for data_file in file_list: # get data logging.info('going through %' % data_file) input_df = pd.read_csv(os.path.join(data_dir, data_file), infer_datetime_format=True, low_memory=False) output_df = pd.concat([output_df, input_df], axis=1) output_df.to_csv(os.path.join(data_dir, output_name)) logging.info('all files combined, output saved to directory')
logging.info('Didnt find existing forms db file. Creating a new one.') # save file already exists. import and make sure index column in expected fmt else: forms_df = pd.read_csv(saved_file, header=0, parse_dates=date_cols, infer_datetime_format=True) forms_df = forms_df.set_index(['form_date', 'state_name']) # need to download new files if set to True if download_new is True: file_list = ff.download_form_ucr(download_start, download_stop, credential_path, target_dir, refresh_recent, refresh_days, data_regex) else: file_list = gf.data_file_list(target_dir, data_regex) # if recalc, lets refresh the location data too if recalc is True: gf.refresh_locations() if trim_dates is True: logging.info('Only look at data between %s and %s' % (trim_start, trim_stop)) file_dates = [pd.Timestamp(i[5:-4]) for i in file_list] file_dates_slice = [ i for i in file_dates if (i >= trim_start and i <= trim_stop) ] file_dates_str = pd.Series(file_dates_slice).dt.strftime( '%m.%d.%Y').tolist() file_list = ['icds.' + i + '.csv' for i in file_dates_str]
def download_form_ucr(start_date, end_date, cred_path, target_dir,
                      refresh_recent, refresh_days, data_regex):
    '''
    Downloads form submission UCR with option to refresh data directory

    Parameters
    ----------
    start_date : pd.Timestamp
        Beginning date for range of dates care about
    end_date : pd.Timestamp
        End date for range of dates care about.
    cred_path : string
        Absolute path to the credential file holding CommCare user/pwd
    target_dir : string
        Absolute path to directory with all the data
    refresh_recent : boolean
        True will delete recent files and redownload (to get dawdling
        form submissions)
    refresh_days : integer
        Number of days to refresh data
    data_regex : re object
        Regex on the type of file that contains data

    Returns
    -------
    file_list : list of strings
        List of files in the target directory after any downloads
    '''
    # identify all the files to go through
    file_list = gen_func.data_file_list(target_dir, data_regex)

    # download files if see that any are missing
    logging.info('Checking if all files have been downloaded')
    all_dates = pd.date_range(start_date, end_date, freq='D')
    download_link = 'https://www.icds-cas.gov.in/a/icds-cas/configurable_reports/data_sources/export/static-icds-cas-static-usage_forms/?format=csv&form_date='
    user, password = gen_func.get_credentials(cred_path, 'icds')

    # dates of files already downloaded; filenames look like
    # 'icds.MM.DD.YYYY.csv' so [5:-4] slices out the date portion
    # (comprehension instead of list(genexpr))
    file_dates = [pd.to_datetime(x[5:-4]) for x in file_list]

    # if need to refresh data, delete the most recent refresh_days files
    # so they get re-downloaded below
    if refresh_recent is True:
        remove_dates = sorted(file_dates)[-refresh_days:]
        for date_to_del in remove_dates:
            remove_name = ('icds.' + date_to_del.strftime('%m.%d.%Y')
                           + '.csv')
            os.remove(os.path.join(target_dir, remove_name))
            logging.info('Refreshing data file: %s' % remove_name)
            file_dates.remove(date_to_del)

    # set membership is O(1) per date; the original scanned the list
    # once per day in the range (O(n) per test)
    have_dates = set(file_dates)
    for date_to_check in all_dates:
        if date_to_check in have_dates:
            logging.debug('have data for %s already' % date_to_check)
        else:
            logging.info('Downloading form data for %s' % date_to_check)
            # server expects ISO dates; local filenames use MM.DD.YYYY
            date_to_get = date_to_check.strftime('%Y-%m-%d')
            full_dwnld_link = download_link + date_to_get
            new_file_name = ('icds.' + date_to_check.strftime('%m.%d.%Y')
                             + '.csv')
            gen_func.download_ucr(full_dwnld_link, user, password,
                                  new_file_name, target_dir)

    # update file list so it reflects anything just downloaded
    file_list = gen_func.data_file_list(target_dir, data_regex)
    return file_list