print '\nFeature importance for different lead-lag relations:' print '----------------------------------------------------\n' if config.do_model_fit == True: start_T = time.time() # for time taking # initialisations for results feat_imp, feat_imp_sd, target_feat_corr =\ np.zeros((len(config.time_shifts),\ len(config.features))),np.zeros((len(config.time_shifts),len(config.features))),\ np.zeros((len(config.time_shifts),len(config.features))) # loop over horizon lengths (shift values) for t, hor in enumerate(config.time_shifts): df_train = data_func.data_framer(data.raw_data.copy(),config.target,config.features,config.time_var,\ config.start_time,config.end_time,shift=hor,\ trafos=data.trafos,name_trafo=False) m_test_2 = int(np.round(len(df_train) * config.test_fraction)) # training set size # model fit out_dict = ml_func.ML_train_tester(df_train,config.target,config.features,config.method,\ is_class=config.is_class,m_test=m_test_2,n_boot=config.n_boot,\ to_norm=config.to_norm,counter_fact=config.counter_fact,\ verbose=config.verbose) # print model specification and progress if t == 0: print '\tModel specs:\n\n\t', out_dict['models'][0], '\n' print '\tHorizon ({0}):'.format(config.unit), print hor, '..',
i_t = int(np.where(proj_index==end)[0]) i_t_hor = i_t+config.horizon # TRAINING # -------- # start time if config.fixed_start==False and t>0: i_s += config.time_step_size start = data.data_shifted.index[i_s] else: i_s = 0 start = data.data_shifted.index[i_s] # training data df_train = data_func.data_framer(data.data_shifted, config.target, config.features,\ index=config.time_var, start_i=start, end_i=end, name_trafo=False) m_test_2 = int(np.round(len(df_train)*config.test_fraction)) # test data set fraction of total # model fit out_dict = ml_func.ML_train_tester(df_train, config.target, config.features, config.method,\ is_class=config.is_class, m_test=m_test_2, n_boot=config.n_boot,\ to_norm=config.to_norm, counter_fact=config.counter_fact,\ verbose=config.verbose) # return model specification if t==0: print '\n\tModel specs:\n\n\t',out_dict['models'][0],'\n' # get variable importance p = ml_func.get_feat_importance(out_dict['feat_weights']) feat_imp[t,:], feat_imp_sd[t,:] = p[0], p[1]
i_cat_trafo = int(np.where(np.array(config.features) == cat)[0]) cat_trafo = trafos[i_cat_trafo] del trafos[i_cat_trafo] # get indicator frames cat_rawData = pat.dmatrix(cat, raw_data, return_type='dataframe').iloc[:, 1:] # append columns for col in cat_rawData.columns: raw_data[col] = cat_rawData[col] config.features.append(col) trafos.append(cat_trafo) config.features.remove(cat) #%% get transformed data data_shifted = data_func.data_framer(data=raw_data.copy(),target=config.target,features=config.features,\ index=config.time_var,start_i=config.start_time,end_i=config.end_time,\ shift=config.horizon,trafos=trafos,name_trafo=False,drop_missing=True) # number of observations trasining data length M = len(data_shifted) if M < 10: # thin dataset warning print 'Warning: Dataset has less than 20 observations.\n' if config.init_train_period == 0: # use full data set from the start config.init_train_period = M # test data, where features have not been shifted relative to target (needed for future projections). data_no_shift = data_func.data_framer(data=raw_data.copy(),target=config.target,features=config.features,\ index=config.time_var,start_i=config.start_time,end_i=config.end_time,\ shift=0,trafos=trafos,name_trafo=False,drop_missing=True)
# Fan chart for the future projection, followed by the start of the
# feature-importance section.
# NOTE(review): `ref_line`, `x_label` and `y_label` are defined before this
# chunk, and the fan-chart statements presumably sit inside an enclosing
# conditional that is not visible here. The nesting was rebuilt from a
# whitespace-collapsed source line, so re-indent to match the surrounding
# script.

# data & settings
ref_time = '2015Q4'
cond = True
models = pro.results_dict['models']
title = '(un)conditioned fan chart for future projection'
fig_name = config.fig_path + 'fan_chart_{0}_ref-{1}-{2}.{3}'.format(
    data.ID_short, ref_time, cond, config.fig_format)

# go one horizon length back and into the future: both windows span
# 2 * horizon + 1 dates, the feature window ending one horizon earlier
proj_dates = pro.projections.index.values
x_dates = proj_dates[-3 * config.horizon - 1:-config.horizon]
y_dates = proj_dates[-2 * config.horizon - 1:]

# unshifted features over the earlier window, re-labelled with the later
# dates (shift index to prediction period)
df_X = data.data_no_shift[config.features].loc[x_dates]
df_X.index = y_dates

# unshifted frame up to 2016Q4, restricted to the later window
df_Y = data_func.data_framer(
    data.raw_data, config.target, config.features, index=config.time_var,
    start_i=config.start_time, end_i='2016Q4',
    shift=0, trafos=data.trafos, name_trafo=False).loc[y_dates]

# plot
ml_plot.cond_fan_chart(
    df_X, df_Y, models, ref_time, h_ref_line=ref_line, cond=cond,
    x_label=x_label, y_label=y_label, title=title,
    save=config.save_plots, save_name=fig_name)

# feature importance (full sample, fixed frequency, no time series)
# -----------------------------------------------------------------
# runs when counter-factual analysis is switched on or the method name
# (text before the first '-') is 'Tree' or 'Forest'
if (config.counter_fact == True) or (config.method.split('-')[0] in ['Tree', 'Forest']):
    # data & settings
    importance = pro.results_dict['feat_imp']
    impo_sd = pro.results_dict['feat_imp_sd']