def file_subset_by_date(start_date, end_date, target_dir, data_regex):
    '''
    Looks within a folder of files and returns a subset of files by date range

    Parameters
    ----------
    start_date : pd.Timestamp
      Beginning date for range of dates care about
    end_date : pd.Timestamp
      End date for range of dates care about.
    target_dir : string
      Absolute path to directory with all the data
    data_regex : re object
      Regex on the type of file that contains data

    Returns
    -------
    output_list : list of strings
      List of files in the target directory whose embedded date falls in
      [start_date, end_date]
    '''
    output_list = []
    # identify all the files to go through
    file_list = gen_func.data_file_list(target_dir, data_regex)
    # materialize the daily range as a set for O(1) membership tests
    wanted_dates = set(pd.date_range(start_date, end_date, freq='D'))
    # go through full list and get subset
    for file_to_check in file_list:
        # filenames look like 'icds.MM.DD.YYYY.csv' - chars [5:-4] hold the
        # date.  Parse explicitly instead of relying on DatetimeIndex's
        # implicit string coercion (the original compared a raw string).
        try:
            file_date = pd.Timestamp(file_to_check[5:-4])
        except ValueError:
            # name didn't contain a parseable date - treat as out of range
            continue
        if file_date in wanted_dates:
            output_list.append(file_to_check)
    return output_list
"""
Created on Sat Jan 27 10:50:49 2018

@author: theism
"""

import os
import pandas as pd
import gen_func as gf
import re
import logging

# --- combine all daily form-export CSVs in data_dir into one file ---
data_dir = r'C:\Users\theism\Downloads\[DA] Post Natal Care\[DA] Post Natal Care'
data_regex = re.compile(r'Forms_\d\d\d.csv')
output_name = 'combined_file.csv'
file_list = gf.data_file_list(data_dir, data_regex)
gf.start_logging(data_dir)

# collect frames and concatenate once at the end - repeated pd.concat in a
# loop re-copies the accumulated frame every iteration (quadratic)
frames = []
for data_file in file_list:
    # get data
    # BUG FIX: original was "'going through %' % data_file" - a trailing bare
    # '%' is an incomplete format and raises ValueError at runtime
    logging.info('going through %s' % data_file)
    input_df = pd.read_csv(os.path.join(data_dir, data_file),
                           infer_datetime_format=True, low_memory=False)
    frames.append(input_df)

# BUG FIX: axis=1 pasted each file side-by-side as new columns; the files
# share one schema, so combining them means stacking rows (axis=0)
output_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
output_df.to_csv(os.path.join(data_dir, output_name))
logging.info('all files combined, output saved to directory')
    
    
    # NOTE(review): orphaned fragment - the 'if' header that pairs with the
    # 'else:' below is not visible in this file; this line appears to be the
    # tail of that missing branch.  A dangling 'else:' is a syntax error as
    # written - confirm against the complete original file.
    logging.info('Didnt find existing forms db file.  Creating a new one.')
# save file already exists.  import and make sure index column in expected fmt
else:
    # load the previously saved forms db; 'saved_file' and 'date_cols' are
    # presumably defined in the missing context above - verify
    forms_df = pd.read_csv(saved_file,
                           header=0,
                           parse_dates=date_cols,
                           infer_datetime_format=True)
    # index on (form_date, state_name) so later lookups/joins key on both
    forms_df = forms_df.set_index(['form_date', 'state_name'])

# pull a fresh set of files when requested, otherwise reuse what is on disk
if download_new is True:
    file_list = ff.download_form_ucr(
        download_start, download_stop, credential_path, target_dir,
        refresh_recent, refresh_days, data_regex)
else:
    file_list = gf.data_file_list(target_dir, data_regex)

# a recalc also means the location data should be refreshed
if recalc is True:
    gf.refresh_locations()

# optionally narrow the file list down to the configured date window
if trim_dates is True:
    logging.info('Only look at data between %s and %s' %
                 (trim_start, trim_stop))
    # filenames embed the date as 'icds.MM.DD.YYYY.csv' (chars [5:-4])
    in_window = [
        stamp for stamp in (pd.Timestamp(name[5:-4]) for name in file_list)
        if trim_start <= stamp <= trim_stop
    ]
    window_strs = pd.Series(in_window).dt.strftime('%m.%d.%Y').tolist()
    file_list = ['icds.' + text + '.csv' for text in window_strs]
def download_form_ucr(start_date, end_date, cred_path, target_dir,
                      refresh_recent, refresh_days, data_regex):
    '''
    Downloads form submission UCR with option to refresh data directory

    Parameters
    ----------
    start_date : pd.Timestamp
      Beginning date for range of dates care about
    end_date : pd.Timestamp
      End date for range of dates care about.
    cred_path : string
      Absolute path to the credential file holding CommCare user/pwd
    target_dir : string
      Absolute path to directory with all the data
    refresh_recent : boolean
      True will delete recent files and redownload (to get dawdling form submissions)
    refresh_days : integer
      Number of days to refresh data
    data_regex : re object
      Regex on the type of file that contains data

    Returns
    -------
    file_list : list of strings
      List of files in the target directory
    '''
    # identify all the files to go through
    file_list = gen_func.data_file_list(target_dir, data_regex)

    # download files if see that any are missing
    logging.info('Checking if all files have been downloaded')
    all_dates = pd.date_range(start_date, end_date, freq='D')
    download_link = 'https://www.icds-cas.gov.in/a/icds-cas/configurable_reports/data_sources/export/static-icds-cas-static-usage_forms/?format=csv&form_date='
    user, password = gen_func.get_credentials(cred_path, 'icds')

    # get dates of files that have been downloaded
    # (filenames are 'icds.MM.DD.YYYY.csv' - chars [5:-4] hold the date)
    file_dates = [pd.to_datetime(x[5:-4]) for x in file_list]

    # if need to refresh data is True, delete the most recent refresh_days
    # files so they get re-downloaded below
    if refresh_recent is True:
        remove_dates = sorted(file_dates)[-refresh_days:]
        for date_to_del in remove_dates:
            remove_name = 'icds.' + date_to_del.strftime('%m.%d.%Y') + '.csv'
            os.remove(os.path.join(target_dir, remove_name))
            logging.info('Refreshing data file: %s' % remove_name)
            file_dates.remove(date_to_del)

    # set for O(1) membership - the original scanned the list once per
    # calendar day (O(days * files))
    have_dates = set(file_dates)
    for date_to_check in all_dates:
        if date_to_check in have_dates:
            logging.debug('have data for %s already' % date_to_check)
        else:
            logging.info('Downloading form data for %s' % date_to_check)
            date_to_get = date_to_check.strftime('%Y-%m-%d')
            full_dwnld_link = download_link + date_to_get
            new_file_name = 'icds.' + date_to_check.strftime(
                '%m.%d.%Y') + '.csv'
            gen_func.download_ucr(full_dwnld_link, user, password,
                                  new_file_name, target_dir)

    # update file list now that any missing days have been fetched
    file_list = gen_func.data_file_list(target_dir, data_regex)
    return file_list