Esempio n. 1
0
def shift(df):
    """Add lagged copies of selected reanalysis columns to ``df`` in place.

    Every new column is named ``<source>_<suffix>`` and holds whatever
    ``dp.shift(df[source], lag)`` returns for the lag listed below.

    Args:
        df: DataFrame that already contains the source columns; it is
            modified in place and nothing is returned.
    """
    # (source column, suffix of the new column, lag handed to dp.shift)
    lag_table = (
        ('reanalysis_relative_humidity_percent', 2, 8),
        ('reanalysis_relative_humidity_percent', 3, 3),
        ('reanalysis_precip_amt_kg_per_m2', 2, 8),
        ('reanalysis_precip_amt_kg_per_m2', 3, 6),
        ('reanalysis_specific_humidity_g_per_kg', 2, 2),
        ('reanalysis_specific_humidity_g_per_kg', 3, 6),  # author's note: 11
        ('reanalysis_dew_point_temp_k', 2, 11),  # author's note: 2, 9
        ('reanalysis_dew_point_temp_k', 3, 5),
        ('reanalysis_dew_point_temp_k', 4, 6),
    )
    for source, suffix, lag in lag_table:
        df[source + '_' + str(suffix)] = dp.shift(df[source], lag)
Esempio n. 2
0
# Split every frame into a (San Juan, Iquitos) pair, in that order.
features = train.loc['sj'], train.loc['iq']
labels = labels.loc['sj'], labels.loc['iq']
test_features = test.loc['sj'], test.loc['iq']

# Shift
# NOTE: these are plain aliases, not copies -- columns added through them
# are also visible through features[0] / test_features[0].
sj_featurs = features[0]
sj_test = test_features[0]


# Correlations
# Work on a copy of the San Juan features so the helper columns created
# below do not end up in `features`.
f, l, t = (features[0].copy(), labels[0], test_features[0])
f['total_cases'] = l

# Remember which columns are generated lags.  Recognising them by "the last
# character is a digit" would also pick up base columns whose own name ends
# in a digit (e.g. 'reanalysis_precip_amt_kg_per_m2') and would break for
# lags of 10 or more.
shifted_columns = {}
for i in range(1, 5):
    for feature in ef_lst:
        shifted_name = feature + '_' + str(i)
        f[shifted_name] = dp.shift(f.loc[:, feature], i)
        shifted_columns[shifted_name] = (feature, i)
cor = f.corr().total_cases.sort_values(ascending=False)

# Collect, per base feature, the (correlation, lag) pair of each lagged copy.
# Iterating `cor` keeps the entries in descending-correlation order.
corDict = {}
for f_name, v in cor.items():
    if f_name not in shifted_columns:
        continue
    f_name, shift = shifted_columns[f_name]
    corDict.setdefault(f_name, []).append((v, shift))

# Report the lag with the highest correlation to total_cases per feature.
for k, v in corDict.items():
    print(k, max(v, key=lambda x: x[0]))
def preprocess_data(data_path, labels_path=None):
    """Load the feature CSV, add lagged columns and split it per city.

    Args:
        data_path: Path of the feature CSV. Its first three columns are
            used as the index (city, year, weekofyear).
        labels_path: Optional path of the label CSV, indexed the same way.
            When given, its columns are joined onto both returned frames.

    Returns:
        A ``(sj, iq)`` tuple: the rows under the ``'sj'`` and ``'iq'`` index
        keys, restricted to the San Juan / Iquitos feature lists below and
        forward-filled.
    """
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])

    # Author's open ideas (not implemented here): look at
    # reanalysis_sat_precip_amt_mm / precipitation_amt_mm, try f-regression
    # or PCA, and fill missing values separately per feature.

    # ADD SHIFTED FEATURES HERE
    # (source column, suffix of the new column, lag handed to dp.shift)
    lagged_features = (
        ('reanalysis_relative_humidity_percent', 2, 8),
        ('reanalysis_relative_humidity_percent', 3, 3),
        ('reanalysis_precip_amt_kg_per_m2', 2, 8),
        ('reanalysis_precip_amt_kg_per_m2', 3, 6),
        ('reanalysis_specific_humidity_g_per_kg', 2, 11),
        ('reanalysis_specific_humidity_g_per_kg', 3, 6),
        ('reanalysis_dew_point_temp_k', 2, 11),
        ('reanalysis_dew_point_temp_k', 3, 5),
        ('reanalysis_dew_point_temp_k', 4, 6),
        ('reanalysis_air_temp_k', 2, 1),
        ('reanalysis_air_temp_k', 4, 5),
        ('reanalysis_air_temp_k', 5, 6),
        ('reanalysis_air_temp_k', 6, 7),
        ('reanalysis_air_temp_k', 7, 11),
        ('reanalysis_air_temp_k', 8, 12),
        ('station_max_temp_c', 2, 12),
        ('station_max_temp_c', 3, 3),
        ('station_max_temp_c', 4, 1),
        ('station_max_temp_c', 5, 10),  # author's note: 10, 4
        ('station_max_temp_c', 6, 4),
        ('reanalysis_sat_precip_amt_mm', 2, 11),
        ('precipitation_amt_mm', 2, 10),
        ('precipitation_amt_mm', 3, 1),
    )
    for source, suffix, lag in lagged_features:
        df['{}_{}'.format(source, suffix)] = dp.shift(df[source], lag)

    # CHANGE HERE ---- SJ FEATURES
    features_sj = [
        'reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
        'station_avg_temp_c', 'reanalysis_air_temp_k', 'station_max_temp_c',
        'reanalysis_relative_humidity_percent',
        'reanalysis_relative_humidity_percent_2',
        'reanalysis_relative_humidity_percent_3',
        'reanalysis_precip_amt_kg_per_m2_2',
        'reanalysis_precip_amt_kg_per_m2_3',
        'reanalysis_specific_humidity_g_per_kg_2',
        'reanalysis_specific_humidity_g_per_kg_3',
        'reanalysis_dew_point_temp_k_2', 'reanalysis_dew_point_temp_k_3',
        'reanalysis_dew_point_temp_k_4', 'reanalysis_air_temp_k_2',
        'reanalysis_air_temp_k_4', 'reanalysis_air_temp_k_5',
        'reanalysis_air_temp_k_6', 'reanalysis_air_temp_k_7',
        'reanalysis_air_temp_k_8', 'station_max_temp_c_2',
        'station_max_temp_c_3', 'station_max_temp_c_4', 'station_max_temp_c_5',
        'station_max_temp_c_6', 'reanalysis_sat_precip_amt_mm_2',
        'precipitation_amt_mm_2', 'precipitation_amt_mm_3'
    ]

    # CHANGE HERE ---- IQ FEATURES
    features_iq = [
        'reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
        'station_avg_temp_c', 'station_min_temp_c', 'reanalysis_min_air_temp_k'
    ]

    # fill missing values
    # ffill() returns a new frame, which avoids an in-place fill on a column
    # selection and the deprecated fillna(method='ffill') spelling.
    # NOTE(review): the fill (like the shifts above) runs on the combined
    # frame, before the per-city split, so values may carry over across the
    # city boundary -- confirm this is intended.
    df_sj = df[features_sj].ffill()
    df_iq = df[features_iq].ffill()

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df_sj = df_sj.join(labels)
        df_iq = df_iq.join(labels)

    # separate san juan and iquitos
    sj = df_sj.loc['sj']
    iq = df_iq.loc['iq']

    return sj, iq
#    l_t_test = l.tail(l.shape[0] - size)
#
#    #Try
#    rgr.fit(f_t,np.ravel(l_t))
#    prd = list(map(int,map(round,rgr.predict(f_t_test))))
#    er = mean_absolute_error(l_t_test,prd)
#    print(er,shifts)
#    results.append((er,shifts))
#
#
#print(min(results,key=lambda x:x[0]))

##Code to run and get result to submit
# Lag used for each entry of ef_lst, matched up position by position.
shifts = [2, 2, 4, 2, 2]
for i, (f, v) in enumerate(zip(ef_lst, shifts)):
    # The new column is named <feature><position in ef_lst>; the same lagged
    # column is added to the San Juan training and test frames.
    lagged_name = f + str(i)
    for frame in (sj_featurs, sj_test):
        frame.loc[:, lagged_name] = dp.shift(frame.loc[:, f], v)
#
predictions = []
for est in range(1, 20):
    for f, l, t in zip(features, labels, test_features):

        #    #Pick best matching
        #    f['total_cases'] = l
        #    for i in range(1,5):
        #        for feature in ef_lst:
        #            f[feature+'_'+str(i)] =  dp.shift(f.loc[:,(feature)],i)
        #            t[feature+'_'+str(i)] =  dp.shift(t.loc[:,(feature)],i)
        #    cor = f.corr().total_cases.drop('total_cases').sort_values(ascending=False)
        #    selected_features = []
        #    for i,v in cor.items():
Esempio n. 5
0
import pandas as pd
import numpy as np
import dengue_data as dd
import dengue_processing as dp
from matplotlib import pyplot as plt

ef_lst = dd.BM_FEATURE_NAMES
cityNames = dd.CITY_NAMES
train, labels, test, submission = dd.getData(ef_lst)

# Keep the labels next to the features so each split below carries both.
train = train.join(labels)

# Take explicit copies: a column is added to the San Juan frames right below,
# and assigning into a selection of `train` / `test` would trigger pandas'
# SettingWithCopyWarning.
sj_train, iq_train = train.loc['sj'].copy(), train.loc['iq']
sj_test, iq_test = test.loc['sj'].copy(), test.loc['iq']
sj_train['station_avg_temp_c_2'] = dp.shift(sj_train['station_avg_temp_c'], 2)
sj_test['station_avg_temp_c_2'] = dp.shift(sj_test['station_avg_temp_c'], 2)

# split
# Positional slices are complementary by construction; head(n)/tail(len - n)
# overlap when a frame has fewer than n rows (tail gets a negative argument).
sj_train_subtrain = sj_train.iloc[:800]
sj_train_subtest = sj_train.iloc[800:]

iq_train_subtrain = iq_train.iloc[:400]
iq_train_subtest = iq_train.iloc[400:]

# San Juan additionally gets the lagged average-temperature term appended to
# the default model formula; Iquitos uses the helper's default.
sj_best_model = dp.getBMNegBinomailModel(
    sj_train_subtrain, sj_train_subtest,
    dp.DEFAULT_MODEL + ' + station_avg_temp_c_2')
iq_best_model = dp.getBMNegBinomailModel(iq_train_subtrain, iq_train_subtest)

figs, axes = plt.subplots(nrows=2, ncols=1)