def shift(df):
    """Add lagged copies of selected reanalysis columns to ``df`` in place.

    For every ``(source, suffix, lag)`` entry below, a new column named
    ``source + suffix`` is assigned the result of ``dp.shift(df[source], lag)``.
    The frame is mutated; nothing is returned.

    NOTE(review): ``dp.shift`` is defined outside this file -- presumably it
    lags a series by ``lag`` rows; confirm against ``dengue_processing``.
    """
    # Kept in the original assignment order so the resulting column order
    # is unchanged.
    lagged_columns = (
        ('reanalysis_relative_humidity_percent', '_2', 8),
        ('reanalysis_relative_humidity_percent', '_3', 3),
        ('reanalysis_precip_amt_kg_per_m2', '_2', 8),
        ('reanalysis_precip_amt_kg_per_m2', '_3', 6),
        ('reanalysis_specific_humidity_g_per_kg', '_2', 2),
        ('reanalysis_specific_humidity_g_per_kg', '_3', 6),
        # Original note here: "11" -- presumably an alternative lag that was
        # tried; which entry it referred to is unclear.
        ('reanalysis_dew_point_temp_k', '_2', 11),
        # Original note here: "2,9" -- presumably alternative lags; unclear
        # which entry it referred to.
        ('reanalysis_dew_point_temp_k', '_3', 5),
        ('reanalysis_dew_point_temp_k', '_4', 6),
    )
    for source, suffix, lag in lagged_columns:
        df[source + suffix] = dp.shift(df[source], lag)
# Per-city views of the training features, labels and test features.
# Position 0 holds the rows under index key 'sj', position 1 those under 'iq'.
features = tuple(train.loc[city] for city in ('sj', 'iq'))
labels = tuple(labels.loc[city] for city in ('sj', 'iq'))
test_features = tuple(test.loc[city] for city in ('sj', 'iq'))

# Shift
sj_featurs = features[0]
sj_test = test_features[0]

# Correlations
# Work on a copy of the 'sj' features so the lagged helper columns and the
# label column do not leak into `features`.
f = features[0].copy()
l = labels[0]
t = test_features[0]
f['total_cases'] = l

# Add lag-1 .. lag-4 versions of every feature in ef_lst, named "<feature>_<lag>".
for i in range(1, 5):
    for feature in ef_lst:
        f[feature + '_' + str(i)] = dp.shift(f.loc[:, feature], i)

cor = f.corr().total_cases.sort_values(ascending=False)

# Group the correlations of the lagged columns by their base feature name:
# corDict[base_name] -> list of (correlation, lag).
corDict = {}
for f_name, v in cor.items():
    # Only columns ending in a digit carry a lag suffix; skip everything else.
    if f_name[-1] not in '123456789':
        continue
    shift = int(f_name[-1])
    f_name = f_name[:-2]  # drop the trailing "_<digit>"
    corDict.setdefault(f_name, []).append((v, shift))

# Report, per base feature, the (correlation, lag) pair with the largest correlation.
for k, v in corDict.items():
    print(k, max(v, key=lambda x: x[0]))
# (source column, suffix, lag) for every lagged feature; the new column is
# named ``source + suffix``.  Order matters: it fixes the column order of the
# 'sj' frame returned by preprocess_data.
# ADD SHIFTED FEATURES HERE
_LAGGED_FEATURES = (
    ('reanalysis_relative_humidity_percent', '_2', 8),
    ('reanalysis_relative_humidity_percent', '_3', 3),
    ('reanalysis_precip_amt_kg_per_m2', '_2', 8),
    ('reanalysis_precip_amt_kg_per_m2', '_3', 6),
    ('reanalysis_specific_humidity_g_per_kg', '_2', 11),
    ('reanalysis_specific_humidity_g_per_kg', '_3', 6),
    ('reanalysis_dew_point_temp_k', '_2', 11),
    ('reanalysis_dew_point_temp_k', '_3', 5),
    ('reanalysis_dew_point_temp_k', '_4', 6),
    ('reanalysis_air_temp_k', '_2', 1),
    ('reanalysis_air_temp_k', '_4', 5),
    ('reanalysis_air_temp_k', '_5', 6),
    ('reanalysis_air_temp_k', '_6', 7),
    ('reanalysis_air_temp_k', '_7', 11),
    ('reanalysis_air_temp_k', '_8', 12),
    ('station_max_temp_c', '_2', 12),
    ('station_max_temp_c', '_3', 3),
    ('station_max_temp_c', '_4', 1),
    ('station_max_temp_c', '_5', 10),
    ('station_max_temp_c', '_6', 4),
    ('reanalysis_sat_precip_amt_mm', '_2', 11),
    ('precipitation_amt_mm', '_2', 10),
    ('precipitation_amt_mm', '_3', 1),
)

# CHANGE HERE ---- SJ FEATURES (un-lagged part; every lagged column above is
# appended after these).
_SJ_BASE_FEATURES = (
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'reanalysis_air_temp_k',
    'station_max_temp_c',
    'reanalysis_relative_humidity_percent',
)

# CHANGE HERE ---- IQ FEATURES
_IQ_FEATURES = (
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'station_min_temp_c',
    'reanalysis_min_air_temp_k',
)


def preprocess_data(data_path, labels_path=None):
    """Load the feature CSV and return per-city model frames.

    Args:
        data_path: Path of the feature CSV.  Its first three columns become
            the index (per the original comment: city, year, weekofyear).
        labels_path: Optional path of the labels CSV, indexed the same way.
            When given, its columns are joined onto both returned frames.

    Returns:
        A ``(sj, iq)`` tuple of DataFrames: the rows under index key ``'sj'``
        restricted to the SJ feature set (base features plus every lagged
        column), and the rows under ``'iq'`` restricted to the IQ feature
        set.  Missing values are forward-filled.

    NOTE(review): ``dp.shift`` is defined outside this file -- presumably it
    lags a series by the given number of rows; confirm.  Both the lagging and
    the forward fill run on the combined frame *before* the city split, so
    values can carry across the boundary between cities.
    """
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])

    # Build the lagged columns and remember their names in creation order.
    lagged_names = []
    for source, suffix, lag in _LAGGED_FEATURES:
        name = source + suffix
        df[name] = dp.shift(df[source], lag)
        lagged_names.append(name)

    features_sj = list(_SJ_BASE_FEATURES) + lagged_names
    features_iq = list(_IQ_FEATURES)

    # fill missing values, separately per feature set.
    # .ffill() returns a new frame; this replaces an in-place
    # fillna(method='ffill') on a column subset, which raised
    # SettingWithCopyWarning and relies on the deprecated `method=` keyword.
    df_sj = df[features_sj].ffill()
    df_iq = df[features_iq].ffill()

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df_sj = df_sj.join(labels)
        df_iq = df_iq.join(labels)

    # separate san juan and iquitos
    sj = df_sj.loc['sj']
    iq = df_iq.loc['iq']
    return sj, iq
# l_t_test = l.tail(l.shape[0] - size) # # #Try # rgr.fit(f_t,np.ravel(l_t)) # prd = list(map(int,map(round,rgr.predict(f_t_test)))) # er = mean_absolute_error(l_t_test,prd) # print(er,shifts) # results.append((er,shifts)) # # #print(min(results,key=lambda x:x[0])) ##Code to run and get result to submit shifts = [2, 2, 4, 2, 2] for i, (f, v) in enumerate(zip(ef_lst, shifts)): sj_featurs.loc[:, (f + str(i))] = dp.shift(sj_featurs.loc[:, (f)], v) sj_test.loc[:, (f + str(i))] = dp.shift(sj_test.loc[:, (f)], v) # predictions = [] for est in range(1, 20): for f, l, t in zip(features, labels, test_features): # #Pick best matching # f['total_cases'] = l # for i in range(1,5): # for feature in ef_lst: # f[feature+'_'+str(i)] = dp.shift(f.loc[:,(feature)],i) # t[feature+'_'+str(i)] = dp.shift(t.loc[:,(feature)],i) # cor = f.corr().total_cases.drop('total_cases').sort_values(ascending=False) # selected_features = [] # for i,v in cor.items():
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import dengue_data as dd
import dengue_processing as dp

# Number of leading rows of each city's training frame used for fitting;
# the remaining rows are held out for evaluation.
SJ_SUBTRAIN_ROWS = 800
IQ_SUBTRAIN_ROWS = 400

ef_lst = dd.BM_FEATURE_NAMES
cityNames = dd.CITY_NAMES
train, labels, test, submission = dd.getData(ef_lst)
train = train.join(labels)

# Take explicit copies of the per-city slices: a column is added to the 'sj'
# frames below, and assigning onto a bare .loc slice triggers pandas'
# SettingWithCopyWarning.
sj_train, iq_train = train.loc['sj'].copy(), train.loc['iq'].copy()
sj_test, iq_test = test.loc['sj'].copy(), test.loc['iq'].copy()

# Extra 'sj' feature: station_avg_temp_c passed through dp.shift with 2.
# NOTE(review): dp.shift is defined elsewhere -- presumably a 2-row lag; confirm.
sj_train['station_avg_temp_c_2'] = dp.shift(sj_train['station_avg_temp_c'], 2)
sj_test['station_avg_temp_c_2'] = dp.shift(sj_test['station_avg_temp_c'], 2)

# split
# iloc[:k] / iloc[k:] always partitions the frame; the previous
# head(k) / tail(len - k) pair overlapped whenever len < k because
# tail() with a negative count drops rows from the front instead.
sj_train_subtrain = sj_train.iloc[:SJ_SUBTRAIN_ROWS]
sj_train_subtest = sj_train.iloc[SJ_SUBTRAIN_ROWS:]
iq_train_subtrain = iq_train.iloc[:IQ_SUBTRAIN_ROWS]
iq_train_subtest = iq_train.iloc[IQ_SUBTRAIN_ROWS:]

# The 'sj' model extends the default formula with the extra lagged column;
# the 'iq' model uses the helper's default formula.
sj_best_model = dp.getBMNegBinomailModel(
    sj_train_subtrain,
    sj_train_subtest,
    dp.DEFAULT_MODEL + ' + station_avg_temp_c_2',
)
iq_best_model = dp.getBMNegBinomailModel(iq_train_subtrain, iq_train_subtest)

figs, axes = plt.subplots(nrows=2, ncols=1)