df_end_date = temp_raw_df.Date[lastRow-1]
feat_df = dSet.set_date_range(temp_raw_df, df_start_date, df_end_date)
# Resolve any NAs for now by forward-filling
feat_df = feat_df.ffill()

# Set beLong level and apply the target to the date-ranged frame
beLongThreshold = 0.000
feat_df = ct.setTarget(feat_df, "Long", beLongThreshold)

# Add features with the new day included
input_dict = sysUtil.get_dict(system_directory, 'input_dict')
feat_df = featureGen.generate_features(feat_df, input_dict)
feat_df = transf.normalizer(feat_df, 'Volume', 50)

# Drop non-feature columns plus anything tagged 'Drop' in feature_dict
col_vals = [k for k, v in feature_dict.items() if v == 'Drop']
to_drop = ['Open', 'High', 'Low', 'gainAhead', 'Close', 'Volume',
           'AdjClose', 'beLong']
for x in to_drop:
    col_vals.append(x)
model_data = dSet.drop_columns(feat_df, col_vals)

# Retrieve the fitted model saved during the in-sample run
best_model_name = "SVM"
best_model_segment = "segment-0"
#best_model_name = system_dict["best_model"]
file_title = "fit-model-" + best_model_name + "-IS-" + system_name + "-" + best_model_segment + ".sav"
file_name = os.path.join(system_directory, file_title)
with open(file_name, 'rb') as f:
    model = pickle.load(f)

# Get the last row of the feature data; this indexes model_data, not temp_raw_df
lastRow = model_data.shape[0]
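# A minimal sketch of how the reloaded model might be applied to the latest
# feature row to produce a next-day signal. The .values conversion assumes
# model_data holds only numeric feature columns at this point; 1 = beLong per
# the target above, the other class label follows ct.setTarget's convention.
dX_new = model_data.iloc[lastRow-1:lastRow].values  # most recent feature row
y_pred_new = model.predict(dX_new)
print("Predicted signal for %s: %d" % (df_end_date, y_pred_new[0]))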
# Module-level imports required by this method:
#   import numpy as np
#   from sklearn.metrics import (confusion_matrix, accuracy_score,
#                                precision_score, recall_score, f1_score)
def model_and_test(self, dX, dy, model, model_results, tscv, info_dict, evData):
    accuracy_scores_is = []
    accuracy_scores_oos = []
    precision_scores_is = []
    precision_scores_oos = []
    recall_scores_is = []
    recall_scores_oos = []
    f1_scores_is = []
    f1_scores_oos = []
    # Collected for future use; not currently populated
    hit_rate_is = []
    hit_rate_oos = []

    # Initialize the confusion matrices
    cm_sum_is = np.zeros((2, 2))
    cm_sum_oos = np.zeros((2, 2))

    # For each entry in the set of splits, fit and predict
    for train_index, test_index in tscv.split(dX, dy):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = dX[train_index], dX[test_index]
        y_train, y_test = dy[train_index], dy[test_index]

        # Fit the model to the in-sample data
        model.fit(X_train, y_train)

        # Test the in-sample fit
        y_pred_is = model.predict(X_train)
        cm_is = confusion_matrix(y_train, y_pred_is)
        cm_sum_is = cm_sum_is + cm_is
        accuracy_scores_is.append(accuracy_score(y_train, y_pred_is))
        precision_scores_is.append(precision_score(y_train, y_pred_is))
        recall_scores_is.append(recall_score(y_train, y_pred_is))
        f1_scores_is.append(f1_score(y_train, y_pred_is))

        # Test the out-of-sample data
        y_pred_oos = model.predict(X_test)
        cm_oos = confusion_matrix(y_test, y_pred_oos)
        cm_sum_oos = cm_sum_oos + cm_oos
        accuracy_scores_oos.append(accuracy_score(y_test, y_pred_oos))
        precision_scores_oos.append(precision_score(y_test, y_pred_oos))
        recall_scores_oos.append(recall_score(y_test, y_pred_oos))
        f1_scores_oos.append(f1_score(y_test, y_pred_oos))

    is_ev = self.print_expected_value(cm_sum_is, "In Sample", evData)
    oos_ev = self.print_expected_value(cm_sum_oos, "Out of Sample", evData)
    is_cm_results = self.conf_matrix_results(cm_sum_is, printResults=False)
    oos_cm_results = self.conf_matrix_results(cm_sum_oos, printResults=False)

    col_save = [k for k, v in feature_dict.items() if v == 'Keep']
    feature_count = len(col_save)

    model_results.append({
        'Issue': info_dict['issue'],
        'StartDate': info_dict['modelStartDate'].strftime("%Y-%m-%d"),
        'EndDate': info_dict['modelEndDate'].strftime("%Y-%m-%d"),
        'Model': info_dict['modelname'],
        'Rows': info_dict['nrows'],
        'beLongCount': str(np.count_nonzero(dy == 1)),
        'Features': col_save,
        'FeatureCount': feature_count,
        'Train-Accuracy': np.mean(accuracy_scores_is),
        'Train-Precision': np.mean(precision_scores_is),
        'Train-RMC': is_cm_results["rmc"],
        'Train-RF': is_cm_results["rf"],
        'Train-NPV': is_cm_results["npv"],
        'Train-MCC': is_cm_results["mcc"],
        'Train-Recall': np.mean(recall_scores_is),
        'Train-F1': np.mean(f1_scores_is),
        'Train-EV': is_ev * 100,
        'Test-Accuracy': np.mean(accuracy_scores_oos),
        'Test-Precision': np.mean(precision_scores_oos),
        'Test-RMC': oos_cm_results["rmc"],
        'Test-RF': oos_cm_results["rf"],
        'Test-NPV': oos_cm_results["npv"],
        'Test-MCC': oos_cm_results["mcc"],
        'Test-Recall': np.mean(recall_scores_oos),
        'Test-F1': np.mean(f1_scores_oos),
        'Test-EV': oos_ev * 100
    })
    return model_results, model
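# A minimal usage sketch for model_and_test. The 'modelUtil' instance name is
# hypothetical, and dX, dy, info_dict, and evData are assumed to be prepared
# upstream; n_splits=4 is illustrative only.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC

tscv = TimeSeriesSplit(n_splits=4)  # ordered splits; no shuffling of time series
svm_model = SVC()                   # classifier fit anew on each training fold
model_results, fit_model = modelUtil.model_and_test(
    dX, dy, svm_model, [], tscv, info_dict, evData)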
                                         oos_months, segments)
dataLoadStartDate = isOosDates[0]
is_start_date = isOosDates[1]
oos_start_date = isOosDates[2]
is_months = isOosDates[3]
is_end_date = isOosDates[4]
oos_end_date = isOosDates[5]

modelStartDate = is_start_date
modelEndDate = modelStartDate + relativedelta(months=is_months)
oosModelStartDate = oos_start_date
oosModelEndDate = oosModelStartDate + relativedelta(months=oos_months)

# Correlation study
corrData = dataSet[modelStartDate:oosModelEndDate].copy()
col_vals = [k for k, v in feature_dict.items() if v == 'Drop']
to_drop = ['Open', 'High', 'Low', 'gainAhead', 'Close', 'beLong', 'Volume']
for x in to_drop:
    col_vals.append(x)
corrData = dSet.drop_columns(corrData, col_vals)
plotIt.correlation_matrix(corrData)

# Create correlation matrix
corr_matrix = corrData.corr()
# Select the upper triangle of the correlation matrix
# (np.bool was removed in NumPy >= 1.24; use the builtin bool instead)
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find feature columns with a pairwise correlation greater than 0.85
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
print(to_drop)
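# A minimal follow-up sketch: retire the highly correlated columns found
# above. Flagging them 'Drop' in feature_dict mirrors how this codebase marks
# columns for removal, but doing so here is an assumption, not original code.
for column in to_drop:
    feature_dict[column] = 'Drop'
corrData = corrData.drop(columns=to_drop)
print("Remaining features:", corrData.columns.tolist())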
#     'f36':
#         {'fname': 'MFI',
#          'params': [4]
#         }
#     }

# Generate the feature columns (df2 is used by the plotting code below)
df2 = featureGen.generate_features(df, input_dict)
#print(df2.tail(10))

# Plot price and indicators
startDate = "2018-11-01"
endDate = "2019-02-01"
plotDataSet = df2[startDate:endDate]

plot_dict = {}
plot_dict['Issue'] = issue
plot_dict['Plot_Vars'] = [k for k, v in feature_dict.items() if v == 'Keep']
#plot_dict['Plot_Vars'] = list(feature_dict.keys())
plot_dict['Volume'] = 'Yes'
plotIt.price_Ind_Vol_Plot(plot_dict, plotDataSet)

# Drop columns where the feature_dict value is 'Drop'
# col_vals = ['Open', 'High', 'Close', 'Low']
# corrDataSet = dSet.drop_columns(plotDataSet, col_vals)
# col_vals = [k for k, v in feature_dict.items() if v == 'Drop']
# corrDataSet = dSet.drop_columns(corrDataSet, col_vals)
# plotIt.correlation_matrix(corrDataSet)
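# A minimal sketch of the input_dict shape implied by the commented 'f36'
# fragment above: each key is a feature slot whose value names an indicator
# function and its parameter list. The 'RSI' and 'ATR' entries and their
# parameter values are illustrative assumptions, not taken from this code.
input_dict = {
    'f1':  {'fname': 'RSI', 'params': [14]},
    'f2':  {'fname': 'ATR', 'params': [10]},
    'f36': {'fname': 'MFI', 'params': [4]},
}
df2 = featureGen.generate_features(df, input_dict)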