# Buffer on either side of an immunization's scheduled date that is still
# considered good (presumably in days - confirm).
immun_buffer = 7

# File-name pattern of the case files to pick up from the data folders.
# The dot is escaped so that only a literal ".csv" extension matches; an
# unescaped "." would also accept names such as "Cases_001xcsv".
case_data_regex = re.compile(r'Cases_\d\d\d\.csv')
location_columns = ['doc_id', 'block_name', 'district_name', 'state_name']
real_state_list = [
    'Madhya Pradesh', 'Chhattisgarh', 'Andhra Pradesh', 'Bihar', 'Jharkhand',
    'Rajasthan',
]
# Currently left out of real_state_list: 'Uttar Pradesh', 'Maharashtra'

# ------------- don't edit below here -----------------------------
# Start logging (handing the output folder to the helper), then make the
# output folder the current working directory.
gf.start_logging(output_dir)
os.chdir(output_dir)

# Immune schedule value for each task name: the first two ANC tasks map to 0
# and every remaining task maps to 42 (presumably day offsets - confirm).
preg_tasks = dict.fromkeys(('ANC 1 (immuns)', 'ANC 2 (immuns)'), 0)
preg_tasks.update(dict.fromkeys(
    (
        'ANC 3 (immuns)',
        'ANC 4 (immuns)',
        'TT 1 (immuns)',
        'TT 2 (immuns)',
        'TT Booster (immuns)',
    ),
    42,
))

# get tasks df
logging.info('Getting task case data')
    'has_rch', 'rch_id', 'closed', 'owner_id', 'opened_date', 'dob', 'sex',
    'caseid'
]
# State names used by this script (presumably the states with real,
# non-test data - confirm).
real_state_list = [
    'Madhya Pradesh',
    'Chhattisgarh',
    'Andhra Pradesh',
    'Bihar',
    'Jharkhand',
    'Rajasthan',
]

# Practice use case on a small dataset: uncomment the three lines below to use it.
#target_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2')
#output_dir = (r'C:\Users\theism\Documents\Dimagi\Data\person_phone_aadhar-ap-anantapur2\test')
#case_data_regex = re.compile(r'cases_\d\d\d.csv')

# ------------- don't edit below here -----------------------------
# Pipeline: start logging, load every matching case CSV into one dataframe,
# clean it, add age information, then attach location columns.

# start logging; the output folder is handed to the helper
gen_func.start_logging(output_dir)

logging.info('Starting scripts to analyze aadhar data...')

# combine all csv into one dataframe
# (target_dir, case_date_cols and cols_to_use are defined outside this section)
case_df = gen_func.csv_files_to_df(target_dir, case_data_regex, case_date_cols,
                                   cols_to_use)

# clean case data and start to get age distribution information
# output_dict starts empty and is passed through clean_case_data, which
# returns it alongside the cleaned dataframe
output_dict = {}
case_clean_df, output_dict = case_func.clean_case_data(case_df, output_dict)
case_clean_df = case_func.add_age_info(case_clean_df)
# location columns to attach; 'owner_id' is passed as the case column to look
# locations up by (presumably the join key - confirm against add_locations)
location_column_names = ['doc_id', 'district_name']
case_clean_df = gen_func.add_locations(case_clean_df, 'owner_id',
                                       location_column_names)
case_clean_df = case_clean_df.loc[(
"""
Created on Sat Jan 27 10:50:49 2018

@author: theism
"""

import os
import pandas as pd
import gen_func as gf
import re
import logging

# Combine every form CSV in data_dir whose name matches data_regex into a
# single CSV file written back into the same folder.
data_dir = r'C:\Users\theism\Downloads\[DA] Post Natal Care\[DA] Post Natal Care'
# The dot is escaped so that only a literal ".csv" extension matches.
data_regex = re.compile(r'Forms_\d\d\d\.csv')
output_name = 'combined_file.csv'
file_list = gf.data_file_list(data_dir, data_regex)
gf.start_logging(data_dir)

# Read each file once and collect the frames; concatenating a single time at
# the end avoids re-copying the growing result on every iteration.
frames = []
for data_file in file_list:
    # The previous format string ('going through %') had no conversion type
    # and raised ValueError on the first file; logging's lazy %s is used now.
    logging.info('going through %s', data_file)
    # infer_datetime_format was dropped: no parse_dates is requested, so it
    # had no effect, and it is deprecated in recent pandas releases.
    frames.append(
        pd.read_csv(os.path.join(data_dir, data_file), low_memory=False))

# Stack the files row-wise. The earlier axis=1 placed each file's columns side
# by side (duplicating the column names) instead of appending its rows.
# NOTE(review): confirm row-wise stacking is the intended way to combine.
# With no matching files, an empty frame is written, as before.
if frames:
    output_df = pd.concat(frames, ignore_index=True)
else:
    output_df = pd.DataFrame()

output_df.to_csv(os.path.join(data_dir, output_name))
logging.info('all files combined, output saved to directory')