def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    """Train a binary XGBoost classifier and wrap its raw booster.

    Args:
        X: Feature matrix; converted with ``np.asarray``.
        y: Labels; flattened to 1-D. Must contain exactly the values 0 and 1.
        params: Parameters forwarded to ``XGBClassifier.set_params``. May also
            carry ``binary_threshold`` (a number, or ``'auto'``), which is
            extracted before the rest is handed to XGBoost, and ``n_jobs``.
        save_path: If given, the wrapped model is pickled to this path.
        save_path_booster: If given, the wrapper's booster is pickled here.

    Returns:
        An ``XGBClassifierSKLWrapper`` built around the re-loaded booster.

    Raises:
        NotImplementedError: If the unique values of ``y`` are not (0, 1).
    """
    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(
        params, 'binary_threshold', .5)

    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(
        params, name='n_jobs', default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False, )
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(
        X,
        y,
        # early_stopping_rounds=10,
        verbose=True,
    )

    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    try:
        model.get_booster().save_model(temp_file)
        booster = Booster(model_file=temp_file)
    finally:
        # Always clean up the scratch file, even when saving/loading fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    if binary_threshold == 'auto':
        # Pick the score at the position equal to the number of negatives in
        # the sorted training predictions.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]
        logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(
        booster, features=X.shape[1], threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)

    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)

    return model
def test_dmatrix_creator(self):
    """Check that partition data is converted into an equivalent DMatrix.

    DMatrix objects cannot be compared directly, so each converted matrix is
    validated by predicting on it and on a reference DMatrix with the same
    fitted classifier and requiring the outputs to agree.
    """

    # This function acts as a pseudo-itertools.chain()
    def row_tup_iter(data):
        pdf = pd.DataFrame(data)
        yield pdf

    def same_predictions(classifier, reference, candidate):
        # Predict on both matrices with one booster and compare loosely.
        reference_preds = classifier.get_booster().predict(reference)
        candidate_preds = classifier.get_booster().predict(candidate)
        return np.allclose(reference_preds, candidate_preds, atol=1e-3)

    # Standard testing DMatrix creation
    expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    expected_labels = np.array([1, 0] * 100)
    expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )

    model = XGBClassifier()
    model.fit(expected_features, expected_labels)
    self.assertTrue(same_predictions(model, expected_dmatrix, output_dmatrix))

    # DMatrix creation with weights
    expected_weight = np.array([0.2, 0.8] * 100)
    expected_dmatrix = DMatrix(
        data=expected_features, label=expected_labels, weight=expected_weight)

    data["weight"] = [0.2, 0.8] * 100
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )

    model.fit(expected_features, expected_labels, sample_weight=expected_weight)
    self.assertTrue(same_predictions(model, expected_dmatrix, output_dmatrix))
def test_predict_sklearn_pickle(self):
    """Pickle a GPU-trained classifier, reload it, and compare predictors.

    The margins predicted right after loading must match the margins
    predicted after the booster is switched to the CPU predictor.
    """
    pickle_path = "model.pkl"
    x, y = build_dataset()

    trained = XGBClassifier(
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        verbosity=2,
        objective='binary:logistic',
        n_estimators=10,
    )
    trained.fit(x, y)
    save_pickle(trained, pickle_path)
    del trained

    # load model
    model = load_pickle(pickle_path)
    os.remove(pickle_path)

    gpu_pred = model.predict(x, output_margin=True)

    # Switch to CPU predictor
    model.get_booster().set_param({'predictor': 'cpu_predictor'})
    cpu_pred = model.predict(x, output_margin=True)

    np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
def train_xgboost(args):
    """Train a multi-class XGBoost model and copy the saved booster to GCS.

    Args:
        args: structure with the following fields read by this function:
            train_feature_name, path of the training feature csv
            train_label_name, path of the training label csv
            n_estimators, number of estimators
            max_depth, maximum depth of trees
            booster, type of booster
            no_classes, number of prediction classes
            job_dir, destination directory for ``gsutil cp``

    Return:
        the fitted XGBClassifier
    """
    features = pd.read_csv(args.train_feature_name)
    targets = pd.read_csv(args.train_label_name)

    # Train model
    xgb_model = XGBClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        booster=args.booster,
        min_child_weight=1,
        learning_rate=0.1,
        gamma=0,
        subsample=1,
        colsample_bytree=1,
        reg_alpha=0,
        objective='multi:softprob',
        num_class=args.no_classes,
        use_label_encoder=False,
    )
    print(features.shape)
    print(targets.shape)
    xgb_model.fit(features, targets)

    # Save the model to local
    local_name = 'model.bst'
    xgb_model.get_booster().save_model(local_name)

    # Move local model to gcs
    destination = os.path.join(args.job_dir, 'model.bst')
    subprocess.check_call(
        ['gsutil', 'cp', local_name, destination], stderr=sys.stdout)

    return xgb_model
def get_feature_importance(data, labels, display=True):
    """Fit a one-vs-rest XGBoost classifier per cluster and collect gain scores.

    :param data: dataframe to be used for feature importance
    :param labels: cluster labels to be used for classification
    :param display: ``True`` prints every scored feature for each cluster, an
        integer N prints only the N highest-gain features, and a falsy value
        prints nothing.
    :return: dict mapping each cluster label to the booster's gain scores.
    """
    df = pd.DataFrame(MinMaxScaler().fit_transform(data),
                      index=data.index, columns=data.columns)
    imp_dict = {}
    for c in set(labels):
        print(f'cluster id = {c}')
        # Binary target: this cluster against all the others.
        y = [1 if x == c else 0 for x in labels]
        X_train, X_test, y_train, y_test = train_test_split(
            df, y, test_size=0.2, random_state=10)
        clf = XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.01,
                            objective='binary:logistic', eval_metric='auc')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('accuracy score is ', accuracy_score(y_test, y_pred))
        imp_dict[c] = clf.get_booster().get_score(importance_type='gain')
        if display:
            # Features without a score come out as NaN and are dropped.
            ranked = (pd.Series(imp_dict[c], index=data.columns)
                      .dropna()
                      .sort_values(ascending=False))
            # `True` is an int in Python; treat it as "show everything"
            # instead of slicing to a single row.
            print(ranked if display is True else ranked.head(int(display)))
    return imp_dict
def test_xgboost_classifier_i5450(self):
    """Convert a small iris XGBClassifier to ONNX and dump data and model.

    The classifier is trained with an evaluation set, converted, run once
    through an inference session on a hand-written sample, dumped as text,
    and finally passed to ``dump_data_and_model``.
    """
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=10)

    clr = XGBClassifier(objective="multi:softmax", max_depth=1, n_estimators=2)
    clr.fit(X_train, y_train, eval_set=[(X_test, y_test)],
            early_stopping_rounds=40)

    onx = convert_xgboost(
        clr, initial_types=[('float_input', FloatTensorType([None, 4]))])

    sess = InferenceSession(onx.SerializeToString())
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[1].name

    sample = np.array([1., 20., 466., 0.]).reshape((1, -1)).astype(np.float32)
    # Both predictions are executed but, as before, not compared.
    sess.run([label_name], {input_name: sample})[0]
    clr.predict_proba(sample)

    clr.get_booster().dump_model('dump.raw.txt')

    dump_data_and_model(
        X_test.astype(np.float32) + 1e-5,
        clr,
        onx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBClassifierIris")
def get_importances(model: xgboost.XGBClassifier):
    """Return the feature importances of a trained xgboost model.

    The result is a list of ``(importance, feature_name)`` tuples sorted in
    descending order.
    """
    pairs = zip(model.feature_importances_, model.get_booster().feature_names)
    return sorted(pairs, reverse=True)
def extract_xgboost_features(model: xgboost.XGBClassifier) -> pd.DataFrame:
    """Build a two-column frame of feature names and their importances.

    Names come from the model's booster; importances come from the model's
    ``feature_importances_`` attribute.
    """
    names = model.get_booster().feature_names
    importances = model.feature_importances_
    columns = {
        "feature_name": names,
        "feature_importance": importances,
    }
    return pd.DataFrame(columns)
def test_predict_sklearn_pickle(self):
    """Round-trip a GPU-trained classifier through pickle and compare margins.

    After reloading, margins are predicted once as loaded and once after the
    booster's predictor is set to the CPU predictor; both must agree.
    """
    x, y = build_dataset()
    settings = dict(
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        objective='binary:logistic',
        n_estimators=10,
    )

    fitted = XGBClassifier(**settings)
    fitted.fit(x, y)
    save_pickle(fitted, "model.pkl")
    del fitted

    # load model
    restored = load_pickle("model.pkl")
    os.remove("model.pkl")

    margins_gpu = restored.predict(x, output_margin=True)

    # Switch to CPU predictor
    booster = restored.get_booster()
    booster.set_param({'predictor': 'cpu_predictor'})
    margins_cpu = restored.predict(x, output_margin=True)

    np.testing.assert_allclose(margins_cpu, margins_gpu, rtol=1e-5)
def plot_feature_importance(plt, fig, X, Y, header, filename_out=None):
    """Fit an XGBClassifier and draw a pie chart of the top-5 feature scores.

    Args:
        plt: pyplot-like module, used only for ``savefig``.
        fig: figure whose current axes receive the pie chart.
        X: training features.
        Y: training targets.
        header: display labels, one per feature column of ``X``.
        filename_out: if given, the figure is saved to this path.
    """
    model = XGBClassifier()
    model.fit(X, Y)

    booster = model.get_booster()
    scores = booster.get_score()

    # get_score() only reports features that were used in a split, keyed by
    # feature name ("f<column>" when the booster has no explicit names), so
    # each key must be mapped back to its column before looking up its label.
    trained_names = booster.feature_names or [
        'f%d' % i for i in range(len(header))]
    column_of = {name: i for i, name in enumerate(trained_names)}

    keys = list(scores)
    values = numpy.array([scores[k] for k in keys], dtype=float)
    order = numpy.argsort(-values)
    values = values[order]
    labels = [header[column_of[keys[i]]] for i in order]

    N = 5
    ax = fig.gca()
    ax.pie(values[:N], labels=labels[:N], autopct='%1.1f%%',
           shadow=False, startangle=90)
    # plt.set_title('Feature importance')
    if filename_out is not None:
        plt.savefig(filename_out)
    return
def myref(seed=1, plt_type='gain'):
    """Train a reference XGBoost model on the voice data and plot importances.

    A positive ``seed`` selects the seed-specific train/test csv files;
    otherwise the full csv is used for both. ``plt_type`` is the importance
    type that is printed and plotted.
    """
    ### load module
    from xgboost import XGBClassifier
    from xgboost import plot_importance

    ### load datasets
    if seed > 0:
        train_path = 'voice/voice_train_%d.csv' % seed
        test_path = 'voice/voice_test_%d.csv' % seed
    else:
        train_path = test_path = 'voice/voice.csv'
    df_train = loadDataset(train_path)
    df_test = loadDataset(test_path)
    fixDeafults(df_train, discard=True)
    fixDeafults(df_test, discard=False)

    ### fit model for train data
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=20,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    )
    model.fit(df_train.iloc[:, :-1], df_train.loc[:, 'label'])

    ### make prediction for test data
    y_pred = model.predict(df_test.iloc[:, :-1])
    y_test = df_test.loc[:, 'label'].values

    ### model evaluate
    # Fraction of test rows where prediction and label coincide.
    residual = y_test - y_pred
    acc = float((residual == 0).sum()) / float(residual.size)
    print("accuarcy: %.4f%%" % (acc * 100.0))

    scores = model.get_booster().get_score(importance_type=plt_type)
    imp = pd.Series(scores).sort_values(ascending=False)
    print(imp)

    fig, ax = plt.subplots(figsize=(10, 15))
    plot_importance(model, height=0.5, max_num_features=64, ax=ax,
                    importance_type=plt_type)
    plt.show()
def xgb_inference(model, X, Y, X_test):
    """Average test-set probabilities over repeated stratified CV folds.

    A fresh ``XGBClassifier`` with ``model``'s parameters is fitted on every
    fold with early stopping; its class probabilities for ``X_test`` are
    accumulated and finally divided by the number of folds.

    Args:
        model: estimator whose ``get_params()`` configure each fold's model.
        X: training features (object exposing ``.values``).
        Y: training labels (object exposing ``.values``).
        X_test: test features (object exposing ``.values``).

    Returns:
        Tuple ``(y_tst, mean_accuracy)``: the averaged probabilities with one
        column per unique label in ``Y``, and the mean validation accuracy.
    """
    x = X.values
    y = Y.values
    x_tst = X_test.values
    x_tst = np.ascontiguousarray(x_tst)

    # y_oof = np.zeros(x.shape[0])
    y_tst = np.zeros((x_tst.shape[0], len(np.unique(y))))
    acc_scores = []

    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS,
                                   random_state=SEED)
    params = model.get_params()

    for i, (train_index, valid_index) in enumerate(rskf.split(x, y)):
        print(i)
        X_A, X_B = x[train_index, :], x[valid_index, :]
        y_A, y_B = y[train_index], y[valid_index]

        xgb_model = XGBClassifier(**params)
        X_A, X_B = np.ascontiguousarray(X_A), np.ascontiguousarray(X_B)
        y_A, y_B = np.ascontiguousarray(y_A), np.ascontiguousarray(y_B)

        xgb_model.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=0)
        best_iteration = xgb_model.get_booster().best_ntree_limit

        # new
        # y_oof[valid_index] = xgb_model.predict(X_B, ntree_limit=best_iteration)
        # new
        tmp = xgb_model.predict(X_B, iteration_range=[0, best_iteration])
        acc_score = accuracy_score(y_B, tmp)
        acc_scores.append(acc_score)

        # Predict with the model fitted on THIS fold; the `model` argument
        # only supplies hyper-parameters and must not be used for inference.
        y_tst += xgb_model.predict_proba(
            x_tst, iteration_range=[0, best_iteration])

    y_tst /= N_SPLITS * N_REPEATS
    return y_tst, np.mean(acc_scores)
def __init__(
    self,
    model: XGBClassifier,
    feature_names: List[str],
    classification_labels: Optional[List[str]] = None,
):
    """Initialise from a classifier and record its number of classes.

    The parent initialiser receives the model's booster, the feature names,
    the model's ``base_score`` and ``objective``, and the labels. The class
    count is taken from ``model.classes_`` when that is set; otherwise it is
    derived as the number of trees divided by ``n_estimators``.
    """
    super().__init__(
        model.get_booster(),
        feature_names,
        model.base_score,
        model.objective,
        classification_labels,
    )
    if model.classes_ is not None:
        self._num_classes = len(model.classes_)
        return

    # No classes_ available: infer the class count from the tree count.
    rounds = model.get_params()["n_estimators"]
    tree_count = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
    self._num_classes = tree_count // rounds
def opt_BDT(input, output, params, show, names):
    """Tune the number of boosting rounds by CV, refit, and report metrics.

    Args:
        input: feature matrix.
        output: target vector.
        params: keyword arguments for ``XGBClassifier``.
        show: when truthy, draw separation, ROC and importance plots.
        names: feature names assigned to the booster before plotting.
    """
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()

    X_train, X_test, y_train, y_test = train_test_split(
        input, output, test_size=0.2, random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)

    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    # One row per retained boosting round, so the row count is the round count.
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")

    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]

    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)
    # The previous format string ended in a bare "%", which raises
    # "ValueError: incomplete format" at runtime.
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))

    if show:
        # NOTE(review): `channel` and `selection` are not defined in this
        # function; presumably module-level names - confirm.
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)

        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
def feature_imporance_XGB(df, idx_target):
    """Return one booster score per feature column, zero where unscored.

    The frame is split by ``preprocess`` into features and target, a default
    XGBClassifier is fitted, and each score reported under a key of the form
    ``f<index>`` is written to position ``<index>`` of the result.
    """
    X, Y = preprocess(df, idx_target)

    classifier = XGBClassifier()
    classifier.fit(X, Y)
    scores = classifier.get_booster().get_score()

    # values = numpy.array([v[1] for v in feature_importances.items()])
    per_column = numpy.zeros(X.shape[1])
    for key, score in scores.items():
        # Drop the leading letter of the key to recover the column index.
        per_column[int(key[1:])] = score

    return numpy.array(per_column)
def __init__(
    self,
    model: XGBClassifier,
    feature_names: List[str],
    classification_labels: Optional[List[str]] = None,
):
    """Initialise the parent from the classifier's booster and settings.

    Passes on, in order: the model's booster, the feature names, the model's
    ``base_score`` and ``objective``, and the classification labels.
    """
    booster = model.get_booster()
    base_score = model.base_score
    objective = model.objective
    super().__init__(
        booster, feature_names, base_score, objective, classification_labels)
def objective(trial, x_train, y_train, params=params):
    """Optuna objective: mean CV accuracy of an XGBClassifier.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.
        x_train: feature array, indexed as ``x_train[rows, :]``.
        y_train: label array.
        params: base XGBClassifier parameters; the suggested values are
            layered on top without modifying the dict that was passed in.

    Returns:
        Mean accuracy over all repeated stratified folds.
    """
    start_time = timer()
    temp_map = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 5, 1000),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
    }
    # Build a new dict instead of update(): the default argument is a shared
    # module-level object and must not be mutated from trial to trial.
    params = {**params, **temp_map}

    # x_train = df.iloc[:train_rows, :].values
    # y_train = train_label.iloc[:train_rows].values
    y_oof = np.zeros((x_train.shape[0]))
    acc_scores = []

    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-auc")
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS,
                                   random_state=RANDOM_SEED)

    for train_index, valid_index in rskf.split(x_train, y_train):
        X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
        y_A, y_B = y_train[train_index], y_train[valid_index]

        xgb_classifier = XGBClassifier(**params)
        xgb_classifier.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                           verbose=0, callbacks=[pruning_callback])
        best_iteration = xgb_classifier.get_booster().best_ntree_limit

        # new
        y_oof[valid_index] = xgb_classifier.predict(
            X_B, ntree_limit=best_iteration
        )  # new iteration_range=[0,best_iteration]

        acc_score = accuracy_score(y_B, y_oof[valid_index])
        acc_scores.append(acc_score)
        # print(f"===== {i} fold : acc {acc_score} =====")

    # NOTE update the best model in the optuna's table.
    trial.set_user_attr(key="best_booster", value=xgb_classifier)

    res = np.mean(acc_scores)
    # print(f"===== {res} =====")
    timer(start_time)
    return res
def objective(trial, x_train, y_train, params=params):
    """Optuna objective: mean CV accuracy of an XGBClassifier.

    Args:
        trial: Optuna trial used to suggest hyper-parameters.
        x_train: ndarray of features.
        y_train: ndarray of labels.
        params: base XGBClassifier parameters; the suggested values are
            layered on top without modifying the dict that was passed in.

    Returns:
        Mean accuracy over all repeated stratified folds.
    """
    start_time = timer()
    temp_map = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 5e-3, 5e-2),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1, 300),  # 5, 100
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        # Registered under its own name: it used to reuse "lambda", which
        # tied gamma to the lambda suggestion instead of tuning it separately.
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
    }
    # Build a new dict instead of update(): the default argument is a shared
    # module-level object and must not be mutated from trial to trial.
    params = {**params, **temp_map}

    y_oof = np.zeros(x_train.shape[0])
    acc_scores = []

    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-auc"
    )  # depends on the choice of eval_metric; "validation_0-logloss"
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS,
                                   random_state=SEED)

    for train_index, valid_index in rskf.split(x_train, y_train):
        X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
        y_A, y_B = y_train[train_index], y_train[valid_index]

        xgb_classifier = XGBClassifier(**params)
        xgb_classifier.fit(X_A, y_A, eval_set=[(X_B, y_B)],
                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                           verbose=0, callbacks=[pruning_callback])
        best_iteration = xgb_classifier.get_booster().best_ntree_limit

        # new
        y_oof[valid_index] = xgb_classifier.predict(
            X_B, ntree_limit=best_iteration)  # new
        acc_scores.append(accuracy_score(y_B, y_oof[valid_index]))

    # NOTE update the best model in the optuna's table.
    trial.set_user_attr(key="best_booster", value=xgb_classifier)

    res = np.mean(acc_scores)
    timer(start_time)
    return res
def XGBClassifierMalwareImportantFeature(dataset):
    """Select the important features of a malware dataset with XGBoost.

    Rows with missing values are dropped, the best candidate tree size is
    chosen by the lowest score from ``get_mae_XGB_Classifier``, a classifier
    is fitted with early stopping, several importance plots are shown, and
    the names of features whose importance exceeds 0.001 are returned.

    Args:
        dataset: DataFrame containing the feature columns and a ``Class``
            target column.

    Returns:
        Index of the feature names with importance greater than 0.001.
    """
    malware_feature = dataset.columns
    dataset = dataset.dropna(axis=0)
    malware_feature = malware_feature.drop("Class")
    X = dataset[malware_feature]  # independent columns
    y = dataset.Class
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

    # Evaluate each candidate exactly once; a preceding loop used to run the
    # same evaluations and throw the results away.
    candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000]
    scores = {
        leaf_size: get_mae_XGB_Classifier(
            leaf_size, train_X, val_X, train_y, val_y)
        for leaf_size in candidate_max_leaf_nodes
    }
    best_tree_size = min(scores, key=scores.get)
    print('best tree size: ', best_tree_size)

    # NOTE(review): `max_leaf_nodes` looks like a scikit-learn tree parameter;
    # confirm XGBoost honours it (its own parameter is `max_leaves`).
    XGBClassifierMalware = XGBClassifier(
        learning_rate=0.1, max_leaf_nodes=best_tree_size, n_estimators=100)
    XGBClassifierMalware.fit(train_X, train_y,
                             early_stopping_rounds=5,
                             eval_set=[(val_X, val_y)],
                             verbose=False)

    features_W = pd.Series(
        XGBClassifierMalware.get_booster().get_score(importance_type='weight'),
        index=X.columns)
    features_W.sort_values(axis=0, ascending=False).nlargest(25).plot(
        kind='barh').set_title('XGBClassifierMalware_weight')
    plt.show()

    # The discarded sort_values() call that followed here was a no-op and has
    # been removed; nlargest() below does its own ordering.
    feat_importances = pd.Series(
        XGBClassifierMalware.feature_importances_, index=X.columns)
    print(feat_importances.values)
    print('nico', '\r\n')
    print(feat_importances[feat_importances.values > 0.001])
    best = feat_importances[feat_importances.values > 0.001]
    feat_importances.nlargest(20).plot(
        kind='barh').set_title('XGBClassifierMalware')
    plt.show()

    plot_importance(XGBClassifierMalware, max_num_features=22)
    pyplot.show()
    return best.index
'n_estimators': 100, 'max_depth': 3, }, { # performance test 'n_estimators': 5000, 'max_depth': 5, 'nthread': 4, }, ] X, y = make_classification(10000) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) df_test = pd.DataFrame(X_test) df_test['target'] = y_test prepare_test_env() for i, kwargs in enumerate(scenarios): classifier = XGBClassifier(**kwargs) classifier.fit(X_train, y_train) model_dir = os.path.join(os.path.dirname(__file__), f'build/model-{i}.txt') classifier.get_booster().dump_model( os.path.join(os.path.dirname(__file__), f'build/model-{i}.xgb')) probas = classifier.predict_proba(X_test) df_test[f'p_{i}_0'] = probas[:, 0] df_test[f'p_{i}_1'] = probas[:, 1] df_test.to_csv( os.path.join(os.path.dirname(__file__), 'build/comparison_data.csv'))
}, ignore_index=True) meta_results_with_avg.to_csv("meta_results_with_avg.csv", index=False) ####################################### FEATURES IMPORTANCE AND SHAP ####################################### # First fit the model on the DF - drop the 'dataset' column and nan values meta_dataset.fillna(0, inplace=True) class_col = meta_dataset.columns.get_loc('Best AUC') X, y = split_to_X_and_y(meta_dataset, class_col) X = np.delete(X, [0], axis=1) xgb = XGBClassifier(booster='gbtree') xgb.fit(X, y) weight_res = xgb.get_booster().get_score(importance_type='weight') gain_res = xgb.get_booster().get_score(importance_type='gain') cover_res = xgb.get_booster().get_score(importance_type='cover') # Plot the 10 most features per importance type weight_res = plot_xgb_importance(weight_res, 'Weight', meta_dataset.columns) gain_res = plot_xgb_importance(gain_res, 'Gain', meta_dataset.columns) cover_res = plot_xgb_importance(cover_res, 'Cover', meta_dataset.columns) # Save the results for all the meta-features in a csv file data = { 'Weight': list(weight_res.keys()), 'Gain': list(gain_res.keys()), 'Cover': list(cover_res.keys()) }
# if the feature hasn't been seen yet fmap[fid] = 1 gmap[fid] = g else: fmap[fid] += 1 gmap[fid] += g # calculate average value (gain/cover) for each feature for fid in gmap: gmap[fid] = gmap[fid] / fmap[fid] return gmap plot_importance(xgb1) dic = (xgb1.get_booster().get_score(importance_type='weight')) print(len(dic),dic) def get_dic(): data = pd.read_excel('./data/all_0.xlsx') columns = [column for column in data] columns.remove('target') dic = {} for i in range(len(columns)): dic['f'+str(i)] = columns[i] return dic conv = get_dic()
) # MLPClassifier(solver='lbfgs',alpha=1e-1,hidden_layer_sizes=(10,2), random_state=1) details["Decade"] = decade + "s" details["Model"] = model.fit(X_train, y_train) details["Feature Importance"] = list(model.feature_importances_) try: details["Co-Efficient"] = model.coef_ except: pass y_pred = model.predict(X_test) # predictions = [round(value) for value in y_pred] accuracy = round(100 * float(metrics.accuracy_score(y_test, y_pred)), 2) print(decade + "s Accuracy: ", accuracy) details["Accuracy"] = accuracy logger(details) #visualize(list(details["Feature Importance"])) header = ['danceability','energy','key',\ 'loudness','mode','speechiness','acousticness','instrumentalness',\ 'liveness','valence','tempo','duration_ms','time_signature','chorusHit','sections'] model.get_booster().feature_names = header plot_importance(model.get_booster()) #.set_yticklabels(header) plt.show() log.close()
def XGB(opts):
    """Train, save, reload and evaluate an XGBoost traffic classifier.

    Loads ``X_train.npy`` / ``y_train.npy`` from ``opts.source_data_folder``,
    normalises and splits the data, fits an ``XGBClassifier`` with early
    stopping, saves the booster, reloads it, and writes confusion-matrix
    figures plus F1 / precision / recall to stdout.

    Args:
        opts: object exposing ``source_data_folder``.
    """
    reDirect = False
    FOLDER = 'clean_vpn12_xgb'
    os.makedirs(FOLDER, exist_ok=True)

    MODEL_PATH = FOLDER + '/model.h5'
    FIG_PATH = FOLDER + '/Confusion_Matrix.png'
    FIG_PATH_N = FOLDER + '/Confusion_Matrix_Norm.png'

    import sys
    log_file = None
    if reDirect:
        old_stdout = sys.stdout
        log_file = open(FOLDER + '/log', 'w')
        sys.stdout = log_file

    X_train = np.load(opts.source_data_folder + '/X_train.npy')
    y_train = np.load(opts.source_data_folder + '/y_train.npy')

    X_train = X_train.astype('float32')
    print('X_train:', np.shape(X_train))
    print('y_train:', np.shape(y_train))

    # Per-class sample counts.
    print('-' * 20)
    for cat in np.unique(y_train):
        size = np.shape(np.where(y_train == cat))[1]
        print(str(cat) + ": " + str(size))
    print('-' * 20)

    X_train = normalize(X_train, norm='l2', axis=0, copy=True,
                        return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.33, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42)
    dim = np.shape(X_train)[1]
    print(dim)

    # Setting Classifier
    xgbc = XGBClassifier(max_depth=20, tree_method='exact', n_estimators=180,
                         n_jobs=-1)
    # training
    xgbc.fit(X_train, y_train,
             eval_set=[(X_train, y_train), (X_val, y_val)],
             early_stopping_rounds=30, verbose=True)

    results = xgbc.score(X_test, y_test)
    print('Test accuracy: ', results)
    if reDirect:
        sys.stdout = old_stdout
        # Close the log so its handle is not leaked.
        log_file.close()
        print('Test accuracy: ', results)

    xgbc.get_booster().save_model(MODEL_PATH)

    # load the best model
    import xgboost as xgb  # was misspelled "xbgoost" (ImportError)
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model(MODEL_PATH)  # load data
    # Booster.predict needs a DMatrix and returns raw scores, not labels.
    raw_pred = bst.predict(xgb.DMatrix(X_test))
    if raw_pred.ndim == 2:
        # One probability column per class.
        # NOTE(review): assumes labels are encoded 0..nclass-1 - confirm.
        y_pred = np.argmax(raw_pred, axis=1)
    else:
        # Single score per row (binary objective): threshold at 0.5.
        y_pred = (raw_pred > 0.5).astype(int)

    y_p = y_pred
    y_t = y_test

    # NOTE(review): DIG2LABEL and nclass are not defined in this function;
    # presumably module-level names - confirm.
    class_names = [DIG2LABEL[i] for i in range(nclass)]
    cnf_matrix = confusion_matrix(y_t, y_p)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')
    plt.savefig(FIG_PATH)

    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig(FIG_PATH_N)

    print('f1-scroe = {}'.format(f1_score(y_t, y_p, average=None)))
    print('prcision = {}'.format(precision_score(y_t, y_p, average=None)))
    print('recall = {}'.format(recall_score(y_t, y_p, average=None)))
    print('macro f1 = {}'.format(f1_score(y_t, y_p, average='macro')))
ax.plot([0,1], [0,1], color ='k', linestyle='--') ax.set_xlim([0.0, 1.0]) ax.set_ylim([0.0, 1.05]) ax.set_xlabel('False Positive Rate') ax.set_ylabel('True Positive Rate') ax.legend(loc="lower right") plt.savefig('boost_comparison_roc.jpeg') fig.show() # Feature Importances data_boost.plot_importance() plt.savefig('feature_importances.jpeg') plt.show() ''' feature_important = boost.get_booster().get_score(importance_type='gain') keys = list(feature_important.keys()) values = list(feature_important.values()) data = pd.DataFrame(data=values, index=keys, columns=["gain"]).sort_values(by="gain", ascending=True) data.plot(kind='barh', color='r') plt.title('XGBoost Feature Importance') plt.show() """XGBoost - Run later today""" # Grid Search XGBoost # Ran in EC2 instance ''' parameter_grid = { 'max_depth': [3, 9],
def opt(trial):
    """Optuna objective: train a classifier and score it by backtest result.

    Suggests trading thresholds and XGBoost hyper-parameters, fits an
    ``XGBClassifier`` on the module-level training arrays, logs the chosen
    parameters, runs ``run_backtest`` with the fitted booster, and returns
    ``1.0 - ((portfolio_rslt / 1000000.0) - 0.5)``.

    Args:
        trial: Optuna trial used to suggest parameters.

    Returns:
        The value computed from the backtest result as shown above.
    """
    global LONG_PROBA_THRESH
    global SHORT_PROBA_THRESH
    global VORARITY_THRESH

    param = {}
    if is_use_gpu:
        param['tree_method'] = 'gpu_hist'
        param['max_bin'] = 16
        param['gpu_id'] = 0

    long_prob_thresh = trial.suggest_discrete_uniform(
        'long_prob_thresh', 0.5, 0.9, 0.05)
    short_prob_thresh = trial.suggest_discrete_uniform(
        'short_prob_thresh', 0.1, 0.5, 0.05)
    vorarity_thresh = trial.suggest_discrete_uniform(
        'vorarity_thresh', 0.01, 0.3, 0.02)

    eta = trial.suggest_discrete_uniform('eta', 0.05, 0.5, 0.05)
    # NOTE(review): the lower bound 0 allows a trial with no boosting rounds;
    # confirm that is intended.
    n_estimators = trial.suggest_int('n_estimators', 0, 10000)
    # n_estimators = trial.suggest_int('n_estimators', 0, 100)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform(
        'colsample_bytree', 0.5, 0.9, 0.1)

    xgboost_tuna = XGBClassifier(
        max_depth=max_depth,
        random_state=42,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        subsample=subsample,  # 0.7,
        colsample_bytree=colsample_bytree,  # 0.6,
        eta=eta,
        objective='binary:logistic',
        verbosity=0,
        # `n_jobs` is the scikit-learn wrapper's thread-count argument; the
        # previous `n_thread` spelling is not a recognised parameter.
        n_jobs=WHEN_TUNE_PARAM_THREAD_NUM,
        **param)

    verbosity = True
    if is_use_gpu or is_colab_cpu:
        verbosity = False

    # optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    # optuna.logging.disable_default_handler()
    xgboost_tuna.fit(tr_input_arr, tr_angle_arr, verbose=verbosity)
    booster = xgboost_tuna.get_booster()

    cur_params = {
        'long_prob_thresh': str(long_prob_thresh),
        'short_prob_thresh': str(short_prob_thresh),
        'vorarity_thresh': str(vorarity_thresh),
        'eta': str(eta),
        'n_estimators': str(n_estimators),
        'max_depth': str(max_depth),
        'min_child_weight': str(min_child_weight),
        'subsample': str(subsample),
        'colsample_bytree': str(colsample_bytree)
    }
    logfile_writeln_opt(str(cur_params))

    portfolio_rslt = run_backtest(booster=booster,
                                  long_prob_thresh=long_prob_thresh,
                                  short_prob_thresh=short_prob_thresh,
                                  vorarity_thresh=vorarity_thresh)
    logfile_writeln_opt("portfolio_rslt =" + str(portfolio_rslt))

    # tuna_pred_test = xgboost_tuna.predict(val_input_arr)
    # return (1.0 - (accuracy_score(val_angle_arr, tuna_pred_test)))
    return (1.0 - ((portfolio_rslt / 1000000.0) - 0.5))
plt.xlim([-1, len(features)]) plt.savefig('vriable_importance_15032019_nTree260_endcap.png') variable_importance(model, input_vars) ################################################################################################################################## # convert xgboost to TMVA weights import tempfile feature_map = tempfile.NamedTemporaryFile(suffix=".txt") for index, varname in enumerate(input_vars): print >> feature_map, index, varname, "q" feature_map.flush() import re tmva_output_fname = re.sub("\\.pkl$", ".xml", model_fname) model_dump = model.get_booster().get_dump(fmap=feature_map.name) xgboost2tmva.convert_model(model_dump, input_variables=[(input_var, 'F') for input_var in input_vars], output_xml=tmva_output_fname, pretty=True) print "Wrote", tmva_output_fname ###############################################################################################################################
def _train_feature_row(rates, idx):
    """Return the 17-element feature row for position ``idx`` of ``rates``."""
    return [
        rates[idx],
        (rates[idx] - rates[idx - 1]) / rates[idx - 1],
        get_rsi(rates, idx),
        get_ma(rates, idx),
        get_ma_kairi(rates, idx),
        get_bb_1(rates, idx),
        get_bb_2(rates, idx),
        get_ema(rates, idx),
        get_ema_rsi(rates, idx),
        get_cci(rates, idx),
        get_mo(rates, idx),
        get_lw(rates, idx),
        get_ss(rates, idx),
        get_dmi(rates, idx),
        get_vorarity(rates, idx),
        get_macd(rates, idx),
        str(judge_chart_type(rates[idx - CHART_TYPE_JDG_LEN:idx])),
    ]


def _direction_label(rates, idx):
    """Return 1 if ``rates`` is not lower ``OUTPUT_LEN`` steps after ``idx``, else 0."""
    return 1 if rates[idx + OUTPUT_LEN] - rates[idx] >= 0 else 0


def train_and_generate_model():
    """Build the training set, then either tune with Optuna or train and save a model.

    Feature rows and labels are loaded from ``./tr_input_mat.pickle`` /
    ``./tr_angle_mat.pickle`` when the former exists; otherwise they are
    generated from ``exchange_rates`` and ``reverse_exchange_rates`` and
    pickled. The train/validation arrays are published through module-level
    globals (``opt`` reads them).

    * If ``is_param_tune_with_optuna`` is set, an Optuna study is run with
      ``opt`` as objective, the results are logged and the process exits.
    * Otherwise an ``XGBClassifier`` is trained, saved to ``./xgb.model``,
      dumped to ``./xgb_model.raw.txt``, and the per-round errors and the
      feature importances are written to the training log.
    """
    global log_fd_opt
    global tr_input_arr
    global tr_angle_arr
    global val_input_arr
    global val_angle_arr

    data_len = len(exchange_rates)

    log_fd_tr = open("./train_progress_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                     mode="w")

    # inner logger function for the training log
    def logfile_writeln_tr(log_str):
        log_fd_tr.write(log_str + "\n")
        log_fd_tr.flush()

    print("data size of rates: " + str(data_len))
    print("num of rate datas for tarin: " +
          str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))
    print("input features sets for tarin: " + str(COMPETITION_TRAIN_DATA_NUM))

    logfile_writeln_tr("data size of rates: " + str(data_len))
    logfile_writeln_tr("num of rate datas for tarin: " +
                       str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

    tr_input_mat = []
    tr_angle_mat = []
    is_loaded_input_mat = False

    if os.path.exists("./tr_input_mat.pickle"):
        # NOTE: these pickles are trusted local caches written by this function.
        with open('./tr_input_mat.pickle', 'rb') as f:
            tr_input_mat = pickle.load(f)
        with open('./tr_angle_mat.pickle', 'rb') as f:
            tr_angle_mat = pickle.load(f)
        is_loaded_input_mat = True
    else:
        for i in range(DATA_HEAD_ASOBI,
                       len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                       SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):
            # Each index contributes two samples: one from the original series
            # and one from the reversed series, in that order.
            tr_input_mat.append(_train_feature_row(exchange_rates, i))
            tr_input_mat.append(_train_feature_row(reverse_exchange_rates, i))
            tr_angle_mat.append(_direction_label(exchange_rates, i))
            tr_angle_mat.append(_direction_label(reverse_exchange_rates, i))

    if not is_loaded_input_mat:
        with open('tr_input_mat.pickle', 'wb') as f:
            pickle.dump(tr_input_mat, f)
        with open('tr_angle_mat.pickle', 'wb') as f:
            pickle.dump(tr_angle_mat, f)

    tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])
    tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])

    # The validation slice is the first VALIDATION_DATA_RATIO share of the
    # samples that follow the training slice.
    split_idx = COMPETITION_TRAIN_DATA_NUM + int(
        (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) *
        VALIDATION_DATA_RATIO)
    if VALIDATION_DATA_RATIO != 0.0:
        val_input_arr = np.array(
            tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        val_angle_arr = np.array(
            tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        watchlist = [(tr_input_arr, tr_angle_arr),
                     (val_input_arr, val_angle_arr)]
    else:
        watchlist = [(tr_input_arr, tr_angle_arr)]

    start = time.time()
    if is_param_tune_with_optuna:
        log_fd_opt = open("./tune_progress_log_" +
                          dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                          mode="w")
        if is_use_db_at_tune:
            study = optuna.Study(study_name='fxsystrade',
                                 storage='sqlite:///../fxsystrade.db')
        else:
            study = optuna.create_study()

        parallel_num = RAPTOP_THREAD_NUM * 2
        if is_colab_cpu or is_exec_at_mba:
            parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2
        if special_optuna_parallel_num != -1:
            parallel_num = special_optuna_parallel_num
        study.optimize(opt, n_trials=OPTUNA_TRIAL_NUM, n_jobs=parallel_num)

        process_time = time.time() - start
        logfile_writeln_opt("best_params: " + str(study.best_params))
        logfile_writeln_opt("best_value: " + str(study.best_value))
        logfile_writeln_opt("best_trial: " + str(study.best_trial))
        logfile_writeln_opt("excecution time of tune: " + str(process_time))
        log_fd_opt.flush()
        log_fd_opt.close()
        # Tuning runs end here; close the training log explicitly as well.
        log_fd_tr.close()
        exit()

    param = {}
    n_thread = RAPTOP_THREAD_NUM
    if is_use_gpu:
        param['tree_method'] = 'gpu_hist'
        param['max_bin'] = 16
        param['gpu_id'] = 0
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM
    if is_colab_cpu or is_exec_at_mba:
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

    logfile_writeln_tr("training parameters are below...")
    logfile_writeln_tr(str(param))
    logfile_writeln_tr("num_round: " + str(NUM_ROUND))

    clf = XGBClassifier(max_depth=MAX_DEPTH,
                        random_state=42,
                        n_estimators=NUM_ROUND,
                        min_child_weight=18,
                        subsample=0.9,
                        colsample_bytree=0.6,
                        eta=ETA,
                        objective='binary:logistic',
                        verbosity=0,
                        # ``n_jobs`` is the wrapper's thread-count parameter;
                        # the previous ``n_thread`` keyword is not a known
                        # parameter, so the thread setting had no effect.
                        n_jobs=n_thread,
                        **param)

    # Per-round training output is suppressed on GPU / Colab runs.
    verbosity = not (is_use_gpu or is_colab_cpu)
    clf.fit(tr_input_arr, tr_angle_arr, eval_set=watchlist, verbose=verbosity)

    process_time = time.time() - start
    logfile_writeln_tr("excecution time of training: " + str(process_time))

    clf.save_model('./xgb.model')
    booster = clf.get_booster()
    booster.dump_model('./xgb_model.raw.txt')

    # One CSV line per boosting round: round, train error[, validation error].
    eval_result_dic = clf.evals_result()
    train_errors = eval_result_dic['validation_0']['error']
    for ii, train_error in enumerate(train_errors):
        if VALIDATION_DATA_RATIO != 0.0:
            logfile_writeln_tr(
                str(ii) + "," + str(train_error) + "," +
                str(eval_result_dic['validation_1']['error'][ii]))
        else:
            logfile_writeln_tr(str(ii) + "," + str(train_error))

    # Feature Importance
    fti = clf.feature_importances_
    logfile_writeln_tr('Feature Importances:')
    for i, feat in enumerate(FEATURE_NAMES):
        logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

    log_fd_tr.flush()
    log_fd_tr.close()

    print("finished training and saved model.")
# train
clf.fit(X, y, sample_weight=w)

# save results
# best_params.json holds either the parameters found by the search
# (options.optimize) or the fixed parameters the classifier was built with.
if options.optimize:
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(clf.best_params_))
    pd.DataFrame(clf.cv_results_).to_hdf('%s/cv_results.hd5' % options.out_dir,
                                         key='cv_results')
    if options.refit:
        # From here on `clf` is the refitted best estimator, not the search.
        clf = clf.best_estimator_
else:
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(options.clf_params))

# A model is only available without a search, or after a search with refit.
# (The second operand previously read `optimize.optimize`, which is not the
# options object used everywhere else in this fragment.)
if not options.optimize or (options.optimize and options.refit):
    if options.save_pickle:
        with gopen('%s/model.pkl.gz' % options.out_dir, 'w+') as fout:
            pickle.dump(clf, fout)
    try:
        model = clf.get_booster()
    except AttributeError:
        # Fall back to the `booster()` accessor when `get_booster` is absent.
        model = clf.booster()
    model.save_model('%s/model.xgb' % options.out_dir)

## ##
## # train it
## clf.fit(X_train,y_train,w_train)
# Calculate the xgboost probability by hand from the dumped trees.
import numpy as np
from xgboost import XGBClassifier

# Simulate inputs for training the model: a 10x6 matrix whose elements are
# drawn i.i.d. from a normal distribution N(mean=1, std=1).
X = np.random.normal(1, 1, [10, 6])
# Randomly generate 10 labels, each either 0 or 1.
y = np.random.randint(2, size=10)

# Train a tiny model (two trees) with xgboost.
model = XGBClassifier(learning_rate=0.1, n_estimators=2)
model.fit(X, y)

# Simulate test data.
Xtest = np.random.normal(1, 1, [2, 6])
ytest = np.random.randint(2, size=2)

# Get prediction results.
model.predict_proba(Xtest)

# Dump the trees to a text file and print them.
model.get_booster().dump_model('output.txt')
with open('output.txt', 'r') as f:
    # Fixed: the content used to be stored under a misspelled name
    # (`lmodel_leaves`), so the print below raised NameError.
    model_leaves = f.read()
print(model_leaves)

# Replicate the proba results from the tree leaf values:
# for each row, find the leaf value on each tree (two trees in this example),
# then proba = 1 / (1 + exp(-(tree0_leaf + tree1_leaf)))
def run_backtest(booster=None,
                 long_prob_thresh=None,
                 short_prob_thresh=None,
                 vorarity_thresh=None):
    """Simulate trading on the rates that follow the training period.

    Walks over ``exchange_rates`` starting at
    ``COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR + OUTPUT_LEN``. At every step an
    open position is first checked against the stop-loss and the holding
    time; when no position is open and neither the chart-type filter nor the
    volatility filter rejects the step, the model probability decides whether
    a long or short position is opened. A CSV-like trace is written to
    ``./backtest_log_<timestamp>.txt``.

    Args:
        booster: booster used for prediction. When ``None`` the model saved
            at ``./xgb.model`` is loaded.
        long_prob_thresh: open a long position above this probability;
            ``None`` falls back to the global ``LONG_PROBA_THRESH``.
        short_prob_thresh: open a short position below this probability;
            ``None`` falls back to the global ``SHORT_PROBA_THRESH``.
        vorarity_thresh: skip the step when ``get_vorarity`` is at or above
            this value; ``None`` falls back to the global ``VORARITY_THRESH``.

    Returns:
        The portfolio value at the end of the run (it starts at 1000000).
        A position that is still open at the end is not included.
    """
    LONG_PROBA_THRESH_IN = LONG_PROBA_THRESH if long_prob_thresh is None else long_prob_thresh
    SHORT_PROBA_THRESH_IN = SHORT_PROBA_THRESH if short_prob_thresh is None else short_prob_thresh
    VORARITY_THRESH_IN = VORARITY_THRESH if vorarity_thresh is None else vorarity_thresh

    data_len = len(exchange_rates)

    log_path = ("./backtest_log_" + dt.now().strftime("%Y-%m-%d_%H-%M-%S") +
                ".txt")
    # `with` guarantees the log is closed even if the backtest raises.
    with open(log_path, mode="w") as log_fd_bt:

        # inner logger function for backtest
        def logfile_writeln_bt(log_str):
            log_fd_bt.write(log_str + "\n")
            log_fd_bt.flush()

        logfile_writeln_bt("start backtest...")

        t_num = RAPTOP_THREAD_NUM
        if is_colab_cpu or is_exec_at_mba:
            t_num = COLAB_CPU_AND_MBA_THREAD_NUM
        if is_param_tune_with_optuna:
            t_num = WHEN_TUNE_PARAM_THREAD_NUM

        if booster is None:
            clf = XGBClassifier()
            clf.load_model("./xgb.model")
            bst = clf.get_booster()

            if is_use_gpu:
                bst.set_param({
                    'predictor': 'gpu_predictor',
                    'tree_method': 'gpu_hist'
                })
            else:
                bst.set_param({'predictor': 'cpu_predictor', 'nthread': t_num})
        else:
            # use the booster passed in as argument
            bst = booster
            bst.set_param({'nthread': t_num})

        portfolio = 1000000
        LONG = "LONG"
        SHORT = "SHORT"
        NOT_HAVE = "NOT_HAVE"
        pos_kind = NOT_HAVE
        HALF_SPREAD = 0.0015
        # A position is closed once its unrealised loss exceeds this fraction
        # of the portfolio.
        SONKIRI_RATE = 0.05

        positions = 0
        trade_val = -1

        pos_cont_count = 0
        won_pips = 0
        start = time.time()

        # Log the thresholds that are actually in effect for this run
        # (previously the globals were printed, and the SHORT label showed
        # the LONG value).
        logfile_writeln_bt(
            "trade parameters LONG_PROBA_THRESH=" + str(LONG_PROBA_THRESH_IN) +
            " SHORT_PROBA_THRESH=" + str(SHORT_PROBA_THRESH_IN) +
            " VORARITY_THRESH=" + str(VORARITY_THRESH_IN) +
            " trade_trying_times=" +
            str(data_len - COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR - OUTPUT_LEN))

        # log format (header line; written by the first loop iteration)
        a_log_str_line = (
            "log marker, loop count, Did Action == Sonkiri, chart_type, "
            "Did Action == skip according to chart_type, "
            "Did Action == Rieki Kakutei, "
            "Did Action == Skip according to position cointain time, "
            "voratility, Did Action == skip accordint to voratility, "
            "predicted prob, "
            "Get long position => 1 Get Short position => 2 else => 0, "
            "Did Action == Skip by chart_type at last decision")

        for window_s in range(data_len - COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR -
                              OUTPUT_LEN):
            current_spot = COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR + window_s + OUTPUT_LEN
            # Emit the record built during the previous iteration.
            logfile_writeln_bt(a_log_str_line)
            skip_flag = False
            # Set instead of an immediate `continue`; once True, the
            # remaining stages of this step are skipped.
            delay_continue_flag = False
            vorarity = -1  # default value for log output
            a_log_str_line = "log," + str(window_s)

            # --- stop-loss check for an open position ---
            if pos_kind != NOT_HAVE:
                if pos_kind == LONG:
                    cur_portfo = positions * (exchange_rates[current_spot] -
                                              HALF_SPREAD)
                    diff = (exchange_rates[current_spot] -
                            HALF_SPREAD) - trade_val
                elif pos_kind == SHORT:
                    cur_portfo = portfolio + (
                        positions * trade_val - positions *
                        (exchange_rates[current_spot] + HALF_SPREAD))
                    diff = trade_val - (exchange_rates[current_spot] +
                                        HALF_SPREAD)
                if (cur_portfo - portfolio) / portfolio < -1 * SONKIRI_RATE:
                    # NOTE(review): pos_cont_count is not reset here, so the
                    # next position inherits the elapsed holding count --
                    # confirm whether that is intended.
                    portfolio = cur_portfo
                    pos_kind = NOT_HAVE
                    won_pips += diff
                    logfile_writeln_bt(
                        str(diff) + "pips " + str(won_pips) + "pips")
                    a_log_str_line += ",1,0,0,0,0,0,0,0,0,0"
                    delay_continue_flag = True

            # --- chart-type filter ---
            long_chart_ok = False
            short_chart_ok = False
            if not delay_continue_flag:
                chart_type = judge_chart_type(
                    exchange_rates[current_spot -
                                   CHART_TYPE_JDG_LEN:current_spot])
                long_chart_ok = chart_type in chart_filter_type_long
                short_chart_ok = chart_type in chart_filter_type_short
                if not (long_chart_ok or short_chart_ok):
                    skip_flag = True
                    if pos_kind != NOT_HAVE:
                        # chart type is filtered out: keep the open position
                        a_log_str_line += ",0," + str(
                            chart_type) + ",1,0,0,0,0,0,0,0"
                        delay_continue_flag = True

            # --- holding-time handling for an open position ---
            if pos_kind != NOT_HAVE and not delay_continue_flag:
                if pos_cont_count >= (OUTPUT_LEN - 1):
                    # Held long enough: close the position at the current rate.
                    if pos_kind == LONG:
                        portfolio = positions * (exchange_rates[current_spot] -
                                                 HALF_SPREAD)
                        diff = (exchange_rates[current_spot] -
                                HALF_SPREAD) - trade_val
                    else:  # SHORT
                        portfolio += positions * trade_val - positions * (
                            exchange_rates[current_spot] + HALF_SPREAD)
                        diff = trade_val - (exchange_rates[current_spot] +
                                            HALF_SPREAD)
                    pos_kind = NOT_HAVE
                    won_pips += diff
                    logfile_writeln_bt(
                        str(diff) + "pips " + str(won_pips) + "pips")
                    logfile_writeln_bt(exchange_dates[current_spot] + " " +
                                       str(portfolio))
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,1,0,0,0,0,0,0"
                    pos_cont_count = 0
                else:
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,0,1,0,0,0,0,0"
                    pos_cont_count += 1
                delay_continue_flag = True

            # --- volatility filter ---
            if not delay_continue_flag:
                vorarity = get_vorarity(exchange_rates, current_spot)
                if vorarity >= VORARITY_THRESH_IN:
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,0,0," + str(vorarity) + ",1,0,0,0"
                    delay_continue_flag = True

            # --- chart type rejected while no position is open ---
            if skip_flag and not delay_continue_flag:
                a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                    vorarity) + ",0,0,0,1"
                delay_continue_flag = True

            if delay_continue_flag:
                continue

            # --- prediction ---
            # Single feature row in the same column order as the training rows.
            ts_input_mat = [[
                exchange_rates[current_spot],
                (exchange_rates[current_spot] -
                 exchange_rates[current_spot - 1]) /
                exchange_rates[current_spot - 1],
                get_rsi(exchange_rates, current_spot),
                get_ma(exchange_rates, current_spot),
                get_ma_kairi(exchange_rates, current_spot),
                get_bb_1(exchange_rates, current_spot),
                get_bb_2(exchange_rates, current_spot),
                get_ema(exchange_rates, current_spot),
                get_ema_rsi(exchange_rates, current_spot),
                get_cci(exchange_rates, current_spot),
                get_mo(exchange_rates, current_spot),
                get_lw(exchange_rates, current_spot),
                get_ss(exchange_rates, current_spot),
                get_dmi(exchange_rates, current_spot),
                vorarity,
                get_macd(exchange_rates, current_spot),
                str(chart_type)
            ]]

            ts_input_arr = np.array(ts_input_mat)
            dtest = xgb.DMatrix(ts_input_arr)

            pred = bst.predict(dtest)
            predicted_prob = pred[0]

            if pos_kind == NOT_HAVE and not skip_flag:
                if predicted_prob > LONG_PROBA_THRESH_IN and long_chart_ok:
                    pos_kind = LONG
                    positions = portfolio / (exchange_rates[current_spot] +
                                             HALF_SPREAD)
                    trade_val = exchange_rates[current_spot] + HALF_SPREAD
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,0,0," + str(
                            vorarity) + ",1," + str(predicted_prob) + ",1,0"
                elif predicted_prob < SHORT_PROBA_THRESH_IN and short_chart_ok:
                    pos_kind = SHORT
                    positions = portfolio / (exchange_rates[current_spot] -
                                             HALF_SPREAD)
                    trade_val = exchange_rates[current_spot] - HALF_SPREAD
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,0,0," + str(
                            vorarity) + ",1," + str(predicted_prob) + ",2,0"
                else:
                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,0,0," + str(
                            vorarity) + ",1," + str(predicted_prob) + ",0,0"
            else:
                # Open positions and filtered steps `continue` above.
                raise Exception("this path should not be executed!!!!")

        # Records are emitted at the start of the following iteration, so the
        # last one has to be written here or it would be lost.
        logfile_writeln_bt(a_log_str_line)

        logfile_writeln_bt("finished backtest.")
        process_time = time.time() - start
        logfile_writeln_bt("excecution time of backtest: " + str(process_time))

    return portfolio