import numpy as np
import pandas as pd
import epiweeks as epi
from datetime import timedelta


def data_read_and_prep(csv_path, epwk, yr, test_wks=4, wght=False, log_tr=False):
    # Read the historical ILI data from csv_path, starting from the epiweek
    # given by epwk and yr, and split it into train and test sets.
    cdcdf = pd.read_csv(csv_path, header=1)
    df = cdcdf.drop(["REGION", "REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64", "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"], axis=1)

    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))

    
    week = epi.Week(yr, epwk)

    df_train = df[df['DATE'] <= pd.to_datetime(week.startdate())]
    df_test = df[(df['DATE'] > pd.to_datetime(week.startdate())) &
                 (df['DATE'] <= pd.to_datetime(week.startdate()) + timedelta(weeks=test_wks))]
    if wght:
        train = df_train['% WEIGHTED ILI']
        test = df_test['% WEIGHTED ILI']
    else:
        train = df_train['%UNWEIGHTED ILI']
        test = df_test['%UNWEIGHTED ILI']
    if log_tr:
        train = np.log(train)
        test = np.log(test)
    train.index = df_train['DATE']
    test.index = df_test['DATE']
    return train, test, df, df_train, df_test
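A minimal usage sketch for the function above; the CSV path and epiweek here are hypothetical stand-ins, not part of the original example:

# Hypothetical: assumes an ILINet-style CSV at this path.
train, test, df, df_train, df_test = data_read_and_prep(
    'data/national/ILINet.csv', epwk=40, yr=2019, test_wks=4)
print(train.tail())
print(test.head())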
Example #2
def test_year_weeks(year_cdc, year_iso):
    cdc_weeks = []
    for w in range(1, 53):
        cdc_weeks.append(epiweeks.Week(2015, w))
    assert list(year_cdc.iterweeks()) == cdc_weeks
    iso_weeks = []
    for w in range(1, 54):
        iso_weeks.append(epiweeks.Week(2015, w, system="iso"))
    assert list(year_iso.iterweeks()) == iso_weeks
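The two week-numbering systems disagree on how many weeks 2015 has, which is why the two ranges above differ; a quick check, assuming the epiweeks Year API:

import epiweeks
print(epiweeks.Year(2015).totalweeks())                # 52 under CDC/MMWR
print(epiweeks.Year(2015, system="iso").totalweeks())  # 53 under ISO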
Example #3
def national():
    cdcdf = pd.read_csv('data/national/ILINet.csv', header=1)
    df = cdcdf.drop(["REGION", "REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64", "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"], axis=1)
    
    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))
    
    return df
Example #4
def test_week_ordering(week_cdc, week_iso):
    assert week_cdc > epiweeks.Week(2014, 53, system="cdc")
    assert week_cdc >= epiweeks.Week(2015, 1, system="cdc")
    assert week_cdc < epiweeks.Week(2015, 2, system="cdc")
    assert week_cdc <= epiweeks.Week(2015, 1, system="cdc")
    assert week_iso > epiweeks.Week(2014, 52, system="iso")
    assert week_iso >= epiweeks.Week(2015, 1, system="iso")
    assert week_iso < epiweeks.Week(2015, 2, system="iso")
    assert week_iso <= epiweeks.Week(2015, 1, system="iso")
Example #5
def prepdata_retro(csv_path, epwk):
    nat_csv_file = csv_path + '/national/ILINet_National_' + str(epwk) + '.csv'
    df = pd.read_csv(nat_csv_file, na_values='X')
    hhs_csv_file = csv_path + '/hhs/ILINet_HHS_' + str(epwk) + '.csv'
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, pd.read_csv(hhs_csv_file, na_values='X')])
    df['REGION'] = df['REGION'].fillna('National')
    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))
    return df
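A usage sketch, assuming a hypothetical data directory and the epiweek passed as a 'YYYYWW' string (as in Example #26 further below):

# Hypothetical: expects data/national/ILINet_National_201940.csv
# and data/hhs/ILINet_HHS_201940.csv to exist.
df = prepdata_retro('data', '201940')
print(df[['REGION', 'DATE']].head())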
Example #6
def prepdata_append(csv_path):
    national = pd.read_csv(csv_path + 'national/ILINet.csv', na_values='X', header=1)
    national['REGION'] = national['REGION'].fillna('National')
    regional = pd.concat([national, pd.read_csv(csv_path + 'regional/ILINet.csv', na_values='X', header=1)])
    df = pd.concat([regional, pd.read_csv(csv_path + 'state/ILINet.csv', na_values='X', header=1)])
    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))

    return df
Example #7
def region(number):
    cdcdf = pd.read_csv('data/regional/ILINet.csv', header=1)
    cdcdf.drop(["REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64", "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"], axis=1, inplace=True)
    dfs = {}
    for region in cdcdf["REGION"].unique():
        dfs[region] = pd.DataFrame(cdcdf.loc[cdcdf['REGION'] == region])
        
    for df in dfs.values():
        df.drop(["REGION"], axis=1, inplace=True)
        df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))
        #df.drop(["YEAR", "WEEK"], axis = 1, inplace = True)
    return dfs["Region " + number]
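A usage sketch; note the region number is passed as a string because it is concatenated into the "Region N" label (the data path is fixed inside the function):

# Hypothetical usage: requires data/regional/ILINet.csv relative to the working directory.
df_region1 = region("1")
print(df_region1.head())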
Example #8
def prepdata_flux(csv_path, epwk):
    nat_csv_file = csv_path + '/ILINet_national_' + str(epwk.year) + 'EW' + str(epwk.week) + '.csv'
    df = pd.read_csv(nat_csv_file, na_values='X')
    df['region'] = df['region'].fillna('National')
    hhs_csv_file = csv_path + '/ILINet_hhs_' + str(epwk.year) + 'EW' + str(epwk.week) + '.csv'
    df = pd.concat([df, pd.read_csv(hhs_csv_file, na_values='X')])
    state_csv_file = csv_path + '/ILINet_state_' + str(epwk.year) + 'EW' + str(epwk.week) + '.csv'
    df = pd.concat([df, pd.read_csv(state_csv_file, na_values='X')])

    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["year"]), int(row["week"])).startdate(), axis=1, result_type='reduce'))

    return df
Example #9
def main(args):
    #parser = argparse.ArgumentParser(description='Script that runs an autoregressive forecasting model upon available CDC flu data.')
    #parser.add_argument('REGION', help='Region selector. Valid regions are "national", regions "1" - "10", or any state, e.g. "Alabama", "Michigan"')
    #parser.add_argument('TARGET', help='Target to forecast upon. Valid targets are Weighted ILI ("wili"), Unweighted ILI ("ili"), ILI Total ("ilitotal"), or Total Patients ("totalpatients")')
    #parser.add_argument('STARTDATE', help='Year in which the model will start training, formatted as "2018EW05".')
    #parser.add_argument('ENDDATE', help='Date at which the model will stop training, formatted as "2018EW05".')
    #args = parser.parse_args()
    args = vars(args)
    if args["REGION"] not in regions:
        raise TypeError("REGION is not valid")

    if args["REGION"] == "national":
        args["REGION"] = "US National"

    if args["TARGET"] not in targets:
        raise TypeError("TARGET is not valid")

    if re.fullmatch(r'\d{4}EW\d{2}', args["STARTDATE"]) is None:
        raise TypeError("STARTDATE is formatted incorrectly")

    if re.fullmatch(r'\d{4}EW\d{2}', args["ENDDATE"]) is None:
        raise TypeError("ENDDATE is formatted incorrectly")

    # CDC forecast bin edges: 0.0 to 13.0 in 0.1 steps, plus a catch-all upper edge.
    bin_ed = [
        0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
        1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7,
        2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1,
        4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
        5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9,
        7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3,
        8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7,
        9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9,
        11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 100
    ]

    startyear = args["STARTDATE"][:4]
    startweek = args["STARTDATE"][6:8]
    trainweek = startweek
    ww = epi.Week(int(startyear), int(startweek))
    region = args["REGION"]
    target = targets[args["TARGET"]]
    df = prepdata()
    directory = 'output/' + str(ww.year) + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    for i in range(0, 40):
        predictions, bn_mat = ARLR_module(df, region, target, ww + i)
        #pdb.set_trace()
        outputdistribution(predictions.reshape(4), bn_mat.reshape([131, 4]),
                           bin_ed, region, target, directory, ww + i)
    #pdb.set_trace()
Example #10
def state(name):
    cdcdf = pd.read_csv('../data/state/ILINet.csv', header=1)
    cdcdf = cdcdf.drop([
        "REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64", "AGE 5-24",
        "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"
    ],
                       axis=1)
    dfs = {}

    for state in cdcdf["REGION"].unique():
        dfs[state] = pd.DataFrame(cdcdf.loc[cdcdf["REGION"] == state])

    for df in dfs.values():
        df.drop(["REGION"], axis=1, inplace=True)
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(),
                     axis=1,
                     result_type='reduce'))
        #df.drop(["YEAR", "WEEK"], axis = 1, inplace = True)

    return dfs[name]
Example #11
def state_data(csv_path, epwk, mode):
    if mode == "test":
        state_csv_file = csv_path+'state/ILINet.csv'
        df = pd.read_csv(state_csv_file,na_values='X', header=1)

    elif mode == "flux":
        state_csv_file = csv_path +'/'+ 'ILINet_state_' + str(epwk.year) +'EW'+ str(epwk.week) + '.csv'
        df = pd.read_csv(state_csv_file,na_values='X')

    elif mode == "retro": 
        state_csv_file = csv_path +'/'+'state/'+ 'ILINet_State_' + str(epwk.year) +  str(epwk.week) + '.csv'
        df = pd.read_csv(state_csv_file,na_values='X')
    df['DATE'] = pd.to_datetime(df.apply(lambda row : epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate() ,axis=1, result_type='reduce'))
    df = df.rename(columns={'REGION TYPE': 'region_type', 'REGION': 'region', '% WEIGHTED ILI': 'weighted_ili', '%UNWEIGHTED ILI': 'unweighted_ili', 'DATE':'date'})
    
    df = df.set_index('date')
    df_state = pd.DataFrame(columns=[],index=df.index.unique())
    st_dict = {'state':[]}
    for st in df.region.unique():
        df_state[st] = df[df.region==st]['unweighted_ili']
        st_dict['state'].append(st)
    return df_state, st_dict
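A usage sketch in "test" mode with a hypothetical csv_path; the returned frame holds one unweighted-ILI column per state:

# Hypothetical: expects data/state/ILINet.csv to exist.
df_state, st_dict = state_data('data/', epi.Week(2019, 40), "test")
print(df_state[st_dict['state'][0]].tail())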
Example #12
def prep_aw_data(st_id_path, **kwargs):
    '''Prepares weather data and returns the corresponding dataframe. kwargs is a
    dictionary with keys "National", "HHS", and/or "States" and the corresponding
    file paths as values. Prepare this dictionary before calling this function.'''
    df_wtr = pd.DataFrame()
    for key,value in kwargs.items():
        if key == "National":
            df_wtr_temp = pd.read_csv(value)
            df_wtr_temp['region'] = df_wtr_temp.apply(lambda x: "National", axis=1)
            df_wtr_temp['region_type'] = df_wtr_temp.apply(lambda x: "National", axis=1)
        elif key == "HHS":
            df_wtr_temp = pd.read_csv(value)
            df_wtr_temp['region'] = df_wtr_temp.apply(lambda x: "Region {}".format(x['area_id']),axis=1)
            df_wtr_temp['region_type'] = df_wtr_temp.apply(lambda x: "HHS Regions", axis=1)
        elif key == "States":
            df_wtr_temp = pd.read_csv(value)
            df_wtr_temp = df_wtr_temp[~df_wtr_temp.area_id.isin([72,78])]
            df_st_id = pd.read_csv(st_id_path)
            df_wtr_temp['region'] = df_wtr_temp.apply(lambda row: df_st_id[df_st_id['state']==row['area_id']]['state_name'].values[0], axis=1)
            df_wtr_temp['region_type'] = df_wtr_temp.apply(lambda x: "States", axis=1)
        df_wtr = pd.concat([df_wtr, df_wtr_temp])
    pp = pd.to_datetime([epi.Week(int(cdc_data.date2ew(d.date())[0]),int(cdc_data.date2ew(d.date())[1])).startdate() for d in pd.to_datetime(df_wtr.date)])
    df_wtr.index = pp  
    df_wtr.index = df_wtr.index.rename('DATE')
    return df_wtr
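A usage sketch with hypothetical file paths for the kwargs dictionary described in the docstring:

# All paths below are assumptions for illustration.
kwargs_wtr = {
    "National": 'data/aw_national.csv',
    "HHS": 'data/aw_hhs.csv',
    "States": 'data/aw_state.csv',
}
df_wtr = prep_aw_data('data/state_fips.csv', **kwargs_wtr)
print(df_wtr[['region', 'region_type']].head())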
Example #13
def test_week_equality(week_cdc, week_iso, week_barc):
    assert week_cdc == epiweeks.Week(2015, 1, system="cdc")
    assert week_cdc != epiweeks.Week(2014, 1, system="cdc")
    assert week_iso == epiweeks.Week(2015, 1, system="iso")
    assert week_iso != epiweeks.Week(2014, 1, system="iso")
    assert week_barc == epiweeks.Week(2019, 53, system="barc")
Example #14
def week_barc():
    return epiweeks.Week(2019, 53, system="barc")
Example #15
def week_iso():
    return epiweeks.Week(2015, 1, system="iso")
Example #16
def test_iso_week_to_startdate(test_input, expected):
    year, week = test_input
    startdate = epiweeks.Week(year, week, "ISO").startdate()
    assert startdate.timetuple()[:3] == expected
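One concrete input/expected pair for this parametrized test: ISO week 2015-W01 begins on Monday, 2014-12-29.

test_iso_week_to_startdate((2015, 1), (2014, 12, 29))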
Example #17
def main():
    config = configparser.ConfigParser()
    config_file = pkg_resources.resource_filename(__name__, 'config.ini')
    config.read(config_file)

    args = parse_args()

    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)

    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)

    log_formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:'
        '%(name)s.%(funcName)s:%(message)s',
        datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)

    log.info('{} v{}'.format(__processor__, __version__))

    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks
    #

    csv_path = args.ground_truth
    st_id_path = args.st_fips

    epiyear = args.forecast_from
    startyear = epiyear[:4]  #args.forecast_from[:4]
    startweek = epiyear[4:]  #args.forecast_from[6:8]
    #trainweek = startweek
    ews = epi.Week(int(startyear), int(startweek))
    targets = get_targets()
    header_region_type = targets[
        'flux_region_type']  #"REGION TYPE" for retro or old datasets
    header_region = targets['flux_region']  #"REGION" for retro or old datasets

    end_date = args.end_date
    if args.mode == "retro":
        fdf = prepdata_retro(csv_path, ews)
    if args.mode == "flux":
        fdf = prepdata_flux(csv_path, ews)
    if args.mode == "test":
        fdf = prepdata_append(csv_path)

    fdf = fdf.rename(
        columns={
            'REGION TYPE': 'region_type',
            'REGION': 'region',
            '% WEIGHTED ILI': 'weighted_ili',
            '%UNWEIGHTED ILI': 'unweighted_ili',
            'DATE': 'date'
        })
    if end_date is None:
        end_date = fdf['date'].max().date() + timedelta(days=3)
    else:
        dt = datetime.strptime(end_date, '%Y%m%d').date()
        end_date = dt + timedelta(days=(3 - dt.isoweekday() % 7))
    if args.end_date is not None:
        fdf = fdf[fdf['date'] <= pd.Timestamp(end_date)]
    fdf = fdf[~fdf.region.
              isin(['Puerto Rico', 'Virgin Islands', 'New York City'])]
    fdf.index = fdf['date']
    fdf.index = fdf.index.rename('date')

    # DataFrame preparation part, integrating accuweather, ght time series with ILI
    kwargs_wtr = {
        "National": args.accu_data_nat,
        "HHS": args.accu_data_hhs,
        "States": args.accu_data_state
    }
    accu_data_fl = None
    for _, value in kwargs_wtr.items():
        accu_data_fl = accu_data_fl or value
    kwargs_ght = {
        "National": args.ght_data_nat,
        "HHS": args.ght_data_hhs,
        "States": args.ght_data_state
    }
    ght_data_fl = None
    for _, value in kwargs_ght.items():
        ght_data_fl = ght_data_fl or value

    if ght_data_fl is None and accu_data_fl is None:
        df_ght = pd.DataFrame()
        df_wtr = pd.DataFrame()
        targ_dict = {
            "target": [targets['flux_ili'], targets['flux_wili']],
            "ght_target": [],
            "aw_target": []
        }

    elif ght_data_fl is None and accu_data_fl is not None:
        df_ght = pd.DataFrame()
        aw_target = [
            'temperature_max', 'temperature_min', 'temperature_mean', 'RH_max',
            'RH_min', 'RH_mean', 'wind_speed_mean', 'cloud_cover_mean',
            'water_total', 'pressure_max', 'pressure_min', 'pressure_mean',
            'AH_max', 'AH_min', 'AH_mean', 'SH_max', 'SH_min', 'SH_mean'
        ]

        targ_dict = {
            "target": [targets['ili'], targets['wili']],
            "ght_target": [],
            "aw_target": [
                'temperature_max', 'temperature_min', 'temperature_mean',
                'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
                'cloud_cover_mean', 'water_total', 'pressure_max',
                'pressure_min', 'pressure_mean', 'AH_max', 'AH_min', 'AH_mean',
                'SH_max', 'SH_min', 'SH_mean'
            ]
        }  #, 'wind_speed_mean']}
        #aw_csv_path = args.accu_data#'../data/data-aw-cumulative_20191018_1620-weekly-state.csv'

        df_wtr = prep_aw_data(st_id_path, **kwargs_wtr)

        #df_state = prepdata_state(csv_path, ews)
        #pdb.set_trace()
    elif accu_data_fl is None and ght_data_fl is not None:
        df_wtr = pd.DataFrame()
        targ_dict = {
            "target": [targets['ili'], targets['wili']],
            "ght_target": ['flu', 'cough', 'fever', 'influenza', 'cold'],
            "aw_target": []
        }
        #ght_csv_path = args.ght_data
        df_ght = prep_ght_data(**kwargs_ght)
        #df_ght.index = df_ght.date
        #df_ght.index = df_ght.index.rename('DATE')
        #df_ght = df_ght.rename(columns={'state':'REGION'})
        ght_target = ['flu', 'cough', 'fever', 'influenza', 'cold']

    else:
        aw_target = [
            'temperature_max', 'temperature_min', 'temperature_mean', 'RH_max',
            'RH_min', 'RH_mean', 'wind_speed_mean', 'cloud_cover_mean',
            'water_total', 'pressure_max', 'pressure_min', 'pressure_mean',
            'AH_max', 'AH_min', 'AH_mean', 'SH_max', 'SH_min', 'SH_mean'
        ]
        targ_dict = {
            "target": [targets['flux_ili'], targets['flux_wili']],
            "ght_target": ['flu', 'cough', 'fever', 'influenza', 'cold'],
            "aw_target": [
                'temperature_max', 'temperature_min', 'temperature_mean',
                'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
                'cloud_cover_mean', 'water_total', 'pressure_max',
                'pressure_min', 'pressure_mean', 'AH_max', 'AH_min', 'AH_mean',
                'SH_max', 'SH_min', 'SH_mean'
            ]
        }  #, 'wind_speed_mean']}
        # weather data
        #aw_csv_path = args.accu_data
        df_wtr = prep_aw_data(st_id_path, **kwargs_wtr)

        # GHT data
        #ght_csv_path = args.ght_data

        df_ght = prep_ght_data(**kwargs_ght)
        #df_ght.index = df_ght.date
        #df_ght.index = df_ght.index.rename('DATE')
        #df_ght = df_ght.rename(columns={'state':'REGION'})
        ght_target = ['flu', 'cough', 'fever', 'influenza', 'cold']
    if args.state_exog is None:
        df_state = pd.DataFrame()
    else:
        df_state, state_dict = state_data(csv_path, ews, args.mode)
        targ_dict.update(state_dict)
        df_state, targ_dict = state_shifter(df_state, targ_dict, 0)

    directory_bst = args.out_folder + 'ARLR_bst/'  # + str(args.forecast_from[:4])
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/'  # + str(args.forecast_from[:4])

    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()

    allw_lags_f = np.arange(
        1, 55
    )  # should have at least "ms_fct" lags, as we find "ms_fct" filters separately

    #targ_dict = {"target" : [targets['ili'], targets['wili']], "ght_target" : ['flu', 'cough', 'fever', 'influenza', 'cold'], "aw_target" : ['temperature_max', 'temperature_min','temperature_mean', 'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean','cloud_cover_mean', 'water_total', 'pressure_max', 'pressure_min','pressure_mean', 'AH_max', 'AH_min', 'AH_mean', 'SH_max', 'SH_min']}#, 'wind_speed_mean']}
    if args.sub_date is not None:
        sub_date = args.sub_date
    else:
        sub_date = ((ews + 1).enddate() + timedelta(days=2)).isoformat(
        )  #submission for epiweek N is (epiweek N+1).enddate() + timedelta(days=2)
    df_full_res = pd.DataFrame(columns=[
        'DATE', 'location', '1 week ahead', '2 week ahead', '3 week ahead',
        '4 week ahead', targ_dict['target'][0]
    ])
    df_full_res = df_full_res.set_index('DATE')
    df_full_seas = pd.DataFrame(columns=['season', 'location'])
    idx_fct = [(ews + i).startdate() for i in range(1, fct_weeks + 1)]
    df_full_seas['DATE'] = idx_fct

    for region in fdf[header_region].unique():
        df_res = pd.DataFrame(columns=[
            'DATE', 'location', '1 week ahead', '2 week ahead', '3 week ahead',
            '4 week ahead', targ_dict['target'][0]
        ])
        idx_fct = [(ews + i).startdate() for i in range(1, fct_weeks + 1)]
        df_res['DATE'] = idx_fct
        df_res = df_res.set_index('DATE')

        targ_dict['target'] = [targets['flux_ili'], targets['flux_wili']]
        #targ_dict['aw_target'] = aw_target
        if fdf[header_region_type][fdf[header_region] ==
                                   region].unique() == 'States':
            print(region)
            for v in targ_dict.values():
                if targets['flux_wili'] in v:
                    v.remove(targets['flux_wili'])
        else:
            for v in targ_dict.values():
                if targets['flux_ili'] in v:
                    v.remove(targets['flux_ili'])

        win = int(config['Forecasting']['win'])  # training window
        max_lag = np.max(allw_lags_f)  # maximum lag considered in the model
        # Check if datastream has no missing information for all lagged regressors of length equal to training length window
        nan_chk_mask = (fdf[header_region]
                        == region) & (fdf.index <= pd.to_datetime(
                            ews.startdate())) & (fdf.index >= pd.to_datetime(
                                (ews - int(win + max_lag)).startdate()))
        if fdf[nan_chk_mask][targ_dict['target']].isna().values.any():
            print('Missing values in ILI data, cannot produce forecasts')
            continue
        diff_val = 'no_diff'
        df_m, df_ex = ARLR_regressor(fdf, df_wtr, df_ght, df_state, region,
                                     targ_dict, ews, diff_val)
        predictions, bn_mat_bst, bn_mat_Gaussker, seas, lags_app_f, coeffs_f = ARLR_exog_module(
            df_m, targ_dict, ews, fct_weeks, allw_lags_f)
        predictions = int_op(predictions, df_ex, targ_dict['target'], ews,
                             diff_val)
        for i in range(1, len(predictions[0, :]) + 1):
            print('Week: {}, Fct: {}'.format(i, (predictions[0, i - 1])))
            df_res.loc[(ews + i).startdate(), 'location'] = region
            df_res.loc[(ews + i).startdate(),
                       '{} week ahead'.format(i)] = predictions[0, i - 1]
            if args.eval is not None:
                df_res.loc[(ews + i).startdate(), targ_dict['target']] = fdf[
                    (fdf.index == pd.to_datetime((ews + i).startdate()))
                    & (fdf[header_region] == region)][
                        targ_dict['target']].values[0]
        idx = [(ews + i).startdate()
               for i in range(1, len(range((ews.week) - 40, 35)))]
        df_seas = pd.DataFrame(columns=['season', 'location'])
        df_seas['DATE'] = idx
        df_seas['location'] = df_seas.apply(lambda x: region, axis=1)
        df_seas.loc[:, 'season'] = seas
        df_seas = df_seas.set_index('DATE')
        #df_res = df_res.merge(df_seas, how='outer', left_index=True, right_index=True)

        df_full_res = pd.concat([df_full_res, df_res])
        df_full_seas = pd.concat([df_full_seas, df_seas])
        if int(args.CDC) and fdf[header_region_type][
                fdf[header_region] == region].unique() != 'States':
            target = targets['flux_wili']
            #outputdistribution_bst(predictions[0,0:4], bn_mat_bst[0,:,0:4], bin_ed, region, target, directory_bst, ews)
            #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
            outputdistribution_fromtemplate_for_FSN(predictions[0, 0:4],
                                                    bn_mat_Gaussker[0, :, 0:4],
                                                    bin_ed, region, target,
                                                    directory_Gaussker, ews)
            #outputdistribution_fromtemplate_for_FluSight(predictions[0,0:4], bn_mat_Gaussker[0,:,0:4], bin_ed, region, target, directory_Gaussker, ews, sub_date)

        if fdf[header_region_type][fdf[header_region] ==
                                   region].unique() == 'States':
            target = targets['flux_ili']
            accu_output(predictions.reshape(fct_weeks), region, args.out_state,
                        ews, args.st_fips)
            outputdistribution_state_fromtemplate(predictions[0, 0:4],
                                                  bn_mat_Gaussker[0, :, 0:4],
                                                  bin_ed, region, target,
                                                  directory_Gaussker, ews,
                                                  sub_date)
    df_full_res.to_csv('result_' + str(ews.year) + 'EW' + str(ews.week))
    df_full_seas.to_csv('result_seas_' + str(ews.year) + 'EW' + str(ews.week))
Example #18
def _week_to_date(self, row):
    return epiweeks.Week(row.iso_year, row.iso_week).startdate()
Example #19
def week_to_date(year: int, week: int, output_fmt: str = DATE_FORMAT):
    week = epiweeks.Week(year, week)
    dt = week.enddate()
    return clean_date(dt, output_fmt=output_fmt)
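The underlying epiweeks calls, shown directly (a CDC epiweek runs Sunday through Saturday):

import epiweeks
wk = epiweeks.Week(2020, 14)
print(wk.startdate(), wk.enddate())  # 2020-03-29 2020-04-04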
Example #20
def prepdata(csv_path):
    df = pd.read_csv(csv_path, na_values='X', header=1)
    df['REGION'] = df['REGION'].fillna('National')
    df['DATE'] = pd.to_datetime(df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(), axis=1, result_type='reduce'))
    return df
Example #21
def test_week_subtracting(week_cdc, week_iso):
    assert (week_cdc - 1) == epiweeks.Week(2014, 53, system="cdc")
    assert (week_iso - 1) == epiweeks.Week(2014, 52, system="iso")
Example #22
def test_week_addition(week_cdc, week_iso):
    assert (week_cdc + 1) == epiweeks.Week(2015, 2, system="cdc")
    assert (week_iso + 1) == epiweeks.Week(2015, 2, system="iso")
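Week arithmetic rolls over year boundaries, and the resulting Week still exposes its dates; a small sketch assuming the epiweeks iterdates() API:

import epiweeks
wk = epiweeks.Week(2014, 53, system="cdc") + 1  # rolls over to 2015 week 1
for day in wk.iterdates():                      # the seven dates of that week
    print(day)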
Example #23
def ARLR_module(df, region, target, epi_week):
    config = configparser.ConfigParser()
    config_file = 'config.ini'
    config.read(config_file)
    ww_train = epi_week - 1
    ww_test = epi_week
    cdcdf = df
    starttraining_date = pd.to_datetime(ww_train.startdate())
    testing_date = pd.to_datetime(ww_test.startdate())
    #endtraining = pd.to_datetime(enddate.startdate())
    #startpredict = pd.to_datetime((enddate+1).startdate())
    #endpredict = pd.to_datetime((enddate+4).startdate())

    if region == 'US National':
        df = cdcdf[cdcdf['REGION TYPE'] == 'National']
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(),
                     axis=1,
                     result_type='reduce'))
        #df.set_index(['DATE'], inplace=True)

    elif region.isdigit():
        df = cdcdf[cdcdf['REGION'] == "Region " + str(region)]
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(),
                     axis=1,
                     result_type='reduce'))
        df.set_index(['DATE'], inplace=True, drop=False)

        # drop=False keeps DATE as a column, so df['DATE'] below still works.
    else:
        df = cdcdf[cdcdf['REGION'] == region]
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(),
                     axis=1,
                     result_type='reduce'))
        df.set_index(['DATE'], inplace=True, drop=False)

    df_train = df[(df['DATE'] < pd.to_datetime(ww_train.startdate()))]
    df_test = df[(df['DATE'] >= pd.to_datetime(ww_train.startdate()))]

    #targetdf = Series(df[target])
    #target_series = targetdf[:starttraining_date]
    #df_train = target_series[:-1]
    #df_test = target_series[-1:]
    train = np.log(np.asarray(df_train[target], dtype=float))
    test = np.log(np.asarray(df_test[target], dtype=float))
    train = pd.Series(train)
    train.index = df_train['DATE']
    test = pd.Series(test)
    test.index = df_test['DATE']
    # Multi-step forecast

    win = int(
        config['Forecasting']
        ['win'])  # Length of the historial training data to be considered

    fut_wks = int(
        config['Forecasting']
        ['fut_wks'])  # Number of weeks ahead to forecast from training data
    ms_fct = int(
        config['Forecasting']['ms_fct']
    )  # For every forecast week, give additional ms_fct weeks forecast

    test_win = fut_wks + ms_fct  # Number of true value to be fetched (testing accuracy)
    exp_max_lags = int(config['Forecasting']['exp_max_lags']
                       )  # expected maximum lags to be considered in the model
    llr_tol = 1e-2  # log-likelihood tolerance

    # Uncertainty analysis
    uncer_anl = int(config['CDC']['uncer_anl'])
    Nb = int(config['CDC']['Nb'])
    # create bins
    n_bins = int(config['CDC']['n_bins'])

    #bin_ed = np.arange(0,n_bins,.1)
    #bin_ed = np.append(bin_ed,20)
    # CDC forecast bin edges: 0.0 to 13.0 in 0.1 steps, plus a catch-all upper edge.
    bin_ed = [
        0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
        1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7,
        2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1,
        4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
        5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9,
        7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3,
        8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7,
        9.8, 9.9, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9,
        11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12.0, 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13.0, 100
    ]
    # Read csv file and create train and test data
    # dates = pd.DatetimeIndex(df_train["DATE"])
    #plt.figure(figsize=(12,7))
    #plt.subplot(2,1,1);plt.plot(train.index,(train));plt.title('Full training data from specified epiweek {}, {}'.format(epwk,yr))
    # plt.subplot(2,1,2);plt.plot((hist_win(train,win)).index,(hist_win(train,win)));plt.title('Training data: 4 year period')

    # Check data for stationarity in the training data with padding
    train_win = train[-1:(
        -win - exp_max_lags -
        1):-1]  # training samples in the window period + buffer

    result = adfuller(train_win)
    #print(result)
    #if result[1] < 0.05:
    #    print('p-val of ADF test %e' %result[1])
    #    print('Stationary signal')
    # plt.plot(train_win)
    # Check seasonality
    season_ind = get_season(train_win, fft_len=1024, figs=False)
    # train the model
    max_lags = 55
    coeffs = np.zeros([ms_fct, max_lags])
    train_pred_err = np.zeros([ms_fct, win])
    yp_train = np.zeros([ms_fct, win])
    lags_app = np.zeros([ms_fct, max_lags])

    # Train to obtain ARLR coeffs for all specified multi-step forecast:
    # Ex: For 1-step forecast, consider data from t-1 to t-p for training: ms_fct = 1
    # for 4-step forecast, consider data for t-4 to t-p for training: ms_fct = 4
    # similarly for 1 season, ms_fct = 52
    for wks in range(1, ms_fct + 1):
        allw_lags = (np.arange(wks, max_lags))
        coeffs_temp, yp_train[wks - 1, :], tr_tp1, llr1, train_pred_err[
            wks - 1, :], lags_temp = ARLR_model(train, allw_lags, win, llr_tol)
        lags_app[wks - 1, lags_temp] = lags_temp
        coeffs[wks - 1, :] = coeffs_temp

    yp_fct = np.zeros([fut_wks, ms_fct])
    yb_fct = np.zeros([fut_wks, ms_fct, Nb])
    log_scr = np.zeros([fut_wks, ms_fct])
    bn_mat = np.zeros([fut_wks, len(bin_ed) - 1, ms_fct])
    # Once trained, use the coeffs to forecast multi-steps given data frame

    # For obtaining uncertainty in forecast estimates (using Boot strapping), choose uncer_anl = True,
    data_frame = train
    data_test = []  #test
    for new_wks in np.arange(0, fut_wks):
        data_frame = pd.concat([data_frame, test[new_wks:(new_wks + 1)]])
        data_test = data_test[1:]
        yp_fct[new_wks, :], yb_fct[new_wks, :, :], log_scr[new_wks, :], bn_mat[
            new_wks, :, :], train_pred_err = multi_step_fct(
                data_frame, coeffs, lags_app, train_pred_err, ms_fct, win, Nb,
                bin_ed, uncer_anl)

    return np.exp(yp_fct), bn_mat
Example #24
def week_cdc():
    return epiweeks.Week(2015, 1, system="cdc")
Example #25
import datetime
import epiweeks

def say(st, l_outs, save=True):
    # Helper assumed from its usage below: print a status line and optionally record it.
    print(st)
    if save: l_outs.append(st)

l_outs = []

say('Beginning generation of power ratings from raw evaluation files.',l_outs)
datestamp = datetime.datetime.today().strftime('%Y-%m-%d')
outstring = 'Datestamp: '+datestamp
say(outstring,l_outs)
say('Following these policies:',l_outs)
for k, v in d_policy.items():
    outstring = '  '+str(k)+': '+str(v)
    say(outstring,l_outs)


# Create some handy dicts to move between YYYY-MM-DD and epiweek
d_epiweeks = {i: epiweeks.Week(2020,i) for i in range(first_week,last_week+1)}
d_wk_to_enddate = {k: w.enddate().strftime('%Y-%m-%d') for k, w in d_epiweeks.items()}
d_enddate_to_wk = {v: k for k, v in d_wk_to_enddate.items()}
d_wk_to_startdate = {k: (w.startdate()+datetime.timedelta(days=1)).strftime('%Y-%m-%d') for k, w in d_epiweeks.items()}
d_startdate_to_wk = {v: k for k, v in d_wk_to_startdate.items()}

# For each starting week, create list of week pairs that are (starting_week,ending_week). Store as dict.
d_week_pairs = {i: [(j,i) for j in range(first_week,last_week+1) if i >= j] for i in range(first_week,last_week+1)}

# Also do this with the full string YYYY-MM-DD version
d_week_pairs_str = {}
for k, v in d_week_pairs.items():
    d_week_pairs_str[d_wk_to_enddate[k]] = [d_wk_to_startdate[tup[0]]+'_'+d_wk_to_enddate[tup[1]] for tup in v]

# Read in all the files in the evaluations directory
# We could just glob but curating it slightly might make the manipulations easier
Example #26
def main():
    args = parse_args()

    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)

    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)

    log_formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:'
        '%(name)s.%(funcName)s:%(message)s',
        datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)

    log.info('{} v{}'.format(__processor__, __version__))

    regions = get_regions()
    targets = get_targets()
    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks
    #

    csv_path = args.ground_truth

    if int(args.test):
        directory = 'dump/'
        if not os.path.exists(directory):
            os.makedirs(directory)

    directory_bst = args.out_folder + 'ARLR_bst/' + str(args.forecast_from)
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/' + str(
        args.forecast_from)

    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()
    year_f = args.forecast_from
    year_t = str(int(year_f) + 1)
    EWs = []
    for y in range(int(year_f), int(year_t) + 1):
        for week in epi.Year(y).iterweeks():

            w = int(str(week))
            if (w < int(year_f + '40')) or (w > int(year_t + '20')):
                continue
            EWs.append(str(w))
    for wks in EWs:  #epi.Year(int(args.forecast_from)).iterweeks():
        startyear = wks[:4]  #args.forecast_from[:4]
        startweek = wks[4:]  #args.forecast_from[6:8]

        #trainweek = startweek
        fdf = prepdata_retro(csv_path, wks)
        fdf['REGION'] = fdf['REGION'].fillna('National')
        fdf.dropna(subset=['%UNWEIGHTED ILI'], inplace=True)
        fdf = fdf.drop(fdf[(fdf['REGION'] == 'Puerto Rico') |
                           (fdf['REGION'] == 'Virgin Islands') |
                           (fdf['REGION'] == 'New York City')].index)

        ews = epi.Week(int(startyear), int(startweek))
        for region in fdf['REGION'].unique():
            #for i in range(0, 1):
            #if region=='National' or 'HHS Regions':
            #    target = targets["wili"]
            #else:
            #    target = targets["ili"]
            target = targets['wili']
            df = fdf[fdf['REGION'] == region]
            predictions, bn_mat_bst, bn_mat_Gaussker = ARLR_module(
                df, region, target, ews, fct_weeks)
            if int(args.CDC):
                outputdistribution_bst(predictions[0, 0:4],
                                       bn_mat_bst[0, :, 0:4], bin_ed, region,
                                       target, directory_bst, ews)
                #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
                outputdistribution_fromtemplate(predictions[0, 0:4],
                                                bn_mat_Gaussker[:, 0:4],
                                                bin_ed, region, target,
                                                directory_Gaussker, ews)

            if df['REGION TYPE'].unique() == 'States':
                print(region)
                accu_output(predictions.reshape(fct_weeks), region,
                            args.out_state, ews, args.st_fips)
Example #27
def main():
    args = parse_args()
    
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)

    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)

    log_formatter = logging.Formatter('%(asctime)s:%(levelname)s:'
                                      '%(name)s.%(funcName)s:%(message)s',
                                      datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)

    log.info('{} v{}'.format(__processor__, __version__))

    regions = get_regions()
    targets = get_targets()
    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks

    csv_path = args.ground_truth

    epiyear = args.forecast_from
    startyear = epiyear[:4] #args.forecast_from[:4]
    startweek = epiyear[4:] #args.forecast_from[6:8]
    #trainweek = startweek
    ews = epi.Week(int(startyear), int(startweek))
    header_region_type = targets['flux_region_type'] #"REGION TYPE" for retro or old datasets
    header_region = targets['flux_region'] #"REGION" for retro or old datasets
    
    fdf = prepdata_flux(csv_path, ews)
    fdf[header_region] = fdf[header_region].fillna('National')
    fdf = fdf.drop(fdf[(fdf[header_region] == 'Puerto Rico')|(fdf[header_region] == 'Virgin Islands')|(fdf[header_region] == 'New York City')].index)
    
    if int(args.test):
        directory = 'dump/'
        if not os.path.exists(directory):
            os.makedirs(directory) 
        
    directory_bst = args.out_folder + 'ARLR_bst/' + str(args.forecast_from)
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/' + str(args.forecast_from)

    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()

    for region in fdf[header_region].unique():
        #for i in range(0, 1):
        #if region=='National' or 'HHS Regions':
        #    target = targets["wili"]
        #else:
        #    target = targets["ili"]
        target = targets['flux_wili'] # "wili" for retro or old datasets
        df = fdf[fdf[header_region] == region]
        predictions, bn_mat_bst, bn_mat_Gaussker = ARLR_module(df, region, target, ews, fct_weeks)
        if int(args.CDC):
            #outputdistribution_bst(predictions[0,0:4], bn_mat_bst[0,:,0:4], bin_ed, region, target, directory_bst, ews)
            #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
            outputdistribution_fromtemplate(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)


        if df[header_region_type].unique() == 'States':
            print(region)
            accu_output(predictions.reshape(fct_weeks), region, args.out_state, ews, args.st_fips)