def main(config_path):
    """Standardize the loaded features and start the t-SNE parameter search.

    The configuration at ``config_path`` is loaded with ``sup.load_config``. Features and
    outcomes come from ``sup.load_features``; a random subset of 1000 samples of the scaled
    features is handed to ``find_tsne_parmeters`` together with the class labels.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    path_settings = conf['Paths']
    source_filename = path_settings.get("prepared_data_directory") + "/source.csv"
    source = sup.load_data_source(source_filename)
    image_save_directory = path_settings.get('results_directory') + "/data_preparation"

    # Standard scaling is used because a normal distribution is assumed. A min/max scaler is
    # avoided for PCA or unsupervised learning, as the axis shall be centered and not shifted.
    scaler = StandardScaler()
    scaler.fit(features)
    X_scaled = pd.DataFrame(scaler.transform(features),
                            index=features.index,
                            columns=features.columns)

    # Show the first two rows before and after scaling
    for caption, frame in (("Unscaled values", features), ("Scaled values", X_scaled)):
        print(caption)
        print(frame.iloc[0:2, :])

    # The same scaler object is refitted on the outcomes; y_scaled is not used afterwards
    scaler.fit(df_y)
    y_scaled = pd.DataFrame(scaler.transform(df_y),
                            index=df_y.index,
                            columns=df_y.columns)

    # Reduce the data to a randomly chosen subset of samples
    subset_index = sup.get_random_data_subset_index(1000, features)
    X_subset_scaled = X_scaled.iloc[subset_index, :]
    y_subset = np.array(y[subset_index]).flatten()

    find_tsne_parmeters(X_subset_scaled, y_subset, class_labels, conf, image_save_directory)
def execute_wide_run(config_path, execute_search=True, debug_parameters=False):
    '''
    Optionally execute the wide hyperparameter grid search, then visualize the results and
    extract the best categorical parameters.

    :args:
        config_path: Path to the configuration file, loaded with sup.load_config
        execute_search: True if the search shall be executed; any other value skips the search
            so that only the visualization and extraction step runs
        debug_parameters: Forwarded to execute_wide_search as use_debug_parameters. If set, a
            warning is printed that only a small subset of the search is used
    :return:
        Nothing
    '''
    conf = sup.load_config(config_path)

    # The explicit comparison with True is kept on purpose: truthy non-boolean values such as
    # the string "False" must not start a search.
    search_requested = execute_search == True
    if not search_requested:
        print("No grid search performed. Already existing model loaded.")
    else:
        if debug_parameters:
            print("WARNING: Debug parameters are used, which only use a small subset of the search.")
        print("Execute grid search")
        execute_wide_search(conf, use_debug_parameters=debug_parameters)

    # Visualize and get the best parameters
    extract_categorical_visualize_graphs_frame(conf)
def backtest(config_path, config_section):
    """
    Run ``backTestModel`` for every signal column of the backtest outcomes file.

    The OHLC graph named by ``source_in`` in ``config_section`` is reduced to the index of the
    evaluation data. Each of the columns ``val``, ``SMA200``, ``model`` and ``model_pp`` of
    ``<results_directory>/evaluation/outcomes_backtest.csv`` is joined to it as ``y_val`` and
    backtested under its own name.

    :args:
        config_path: Path to the configuration file
        config_section: Name of the configuration section holding the evaluation settings
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)
    print("Load paths")

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(conf, config_section)
    source_path = conf[config_section].get('source_in')

    save_folder = os.path.join(conf['Paths'].get('results_directory'), "evaluation")
    pred_outcomes_path = os.path.join(conf['Paths'].get('results_directory'), "evaluation",
                                      "outcomes_backtest.csv")
    y_values = pd.read_csv(pred_outcomes_path, sep=';').set_index('id')

    df_time_graph = stock.load_ohlc_graph(source_path)
    df_time_graph_cut = df_time_graph.loc[X_val.index]

    # (column in the outcomes file, name handed to backTestModel), processed in this order
    backtest_runs = (
        ('val', 'Validation'),                      # validation data
        ('SMA200', 'Reference_SMA200'),             # reference data MA200
        ('model', 'Predicted_Model'),               # prediction data
        ('model_pp', 'Predicted_Smoothed_Model'),   # prediction data with post processing
    )
    for signal_column, run_name in backtest_runs:
        joined = df_time_graph_cut.join(y_values[signal_column])
        renamed = joined.rename(columns={signal_column: "y_val"})
        data = renamed.set_index(['Date'])
        backTestModel(data, save_folder, run_name)

    print("Complete")
def main(config_path):
    """Load features and source data and pass them to ``analyze_timegraph``.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)
    features, y, _df_y, _class_labels = sup.load_features(conf)

    # Source data is read from the path configured as 'source_in' in section 'Preparation'
    source_setting = conf['Preparation'].get("source_in")
    source_filename = os.path.join(source_setting)
    source = sup.load_data_source(source_filename)

    results_root = conf['Paths'].get('results_directory')
    image_save_directory = results_root + "/data_preparation"

    analyze_timegraph(source, features, y, conf, image_save_directory)
def split_train_validation_data(config_path):
    """Split the loaded features and outcomes into training and validation files.

    The split uses ``train_test_split`` with ``random_state=0``. ``test_size`` and
    ``shuffle_data`` are read from section 'Preparation'; shuffling is only enabled for the
    exact string 'True'. The four resulting frames are written as ';'-separated csv files to
    the paths configured in the same section.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    :raises:
        Exception: If the training or the validation outcomes contain only one unique value
    """
    print("=== Split data into training and validation data ===")
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    test_size = float(conf['Preparation'].get('test_size'))
    shuffle = conf['Preparation'].get('shuffle_data') == 'True'
    X_train, X_val, y_train, y_val = train_test_split(features,
                                                      df_y,
                                                      random_state=0,
                                                      test_size=test_size,
                                                      shuffle=shuffle)

    summary = "Total number of samples: {}. X_train: {}, X_test: {}, y_train: {}, y_test: {}"
    print(summary.format(features.shape[0], X_train.shape, X_val.shape, y_train.shape,
                         y_val.shape))

    # Check if training and test data have more than one class
    for outcome_name, outcome_frame in (("y_train", y_train), ("y_test", y_val)):
        if len(np.unique(outcome_frame)) == 1:
            raise Exception(
                "{} only consists one class after train/test split. Please adjust the data."
                .format(outcome_name))

    # Save results, each frame to the path stored under its configuration key
    outputs = (
        (X_train, 'features_out_train'),
        (X_val, 'features_out_val'),
        (y_train, 'outcomes_out_train'),
        (y_val, 'outcomes_out_val'),
    )
    for frame, config_key in outputs:
        target_path = os.path.join(conf['Preparation'].get(config_key))
        frame.to_csv(target_path, sep=';', index=True, header=True)

    print("Saved training and validation files.")
def main(config_path):
    """Load the features and pass them to ``analyse_features``.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)
    features, y, _df_y, class_labels = sup.load_features(conf)

    results_root = conf['Paths'].get('results_directory')
    image_save_directory = results_root + "/data_preparation"

    warning_text = ("WARNING: If a singular matrix occurs in a calculation, "
                    "probably the outcome is only one value.")
    print(warning_text)

    analyse_features(features, y, class_labels, conf, image_save_directory)
def run_training_predictors(data_input_path):
    '''
    Run a baseline classifier and a training estimation for the configured model pipeline.

    The pipeline class is looked up as 'models.<pipeline_class>.ModelParam', where
    <pipeline_class> is read from option 'pipeline_class' in section 'Training'. The 'model'
    step of the created pipeline is passed to run_training_estimation.

    :args:
        data_input_path: Path to the configuration file, loaded with sup.load_config
    :return:
        Nothing
    :raises:
        Exception: If no pipeline class is configured or the class cannot be located
    '''
    config = sup.load_config(data_input_path)
    metrics = Metrics(config)

    # Resolve the pipeline class BEFORE instantiating it. The previous check tested the
    # created instance, so a failed lookup surfaced as an opaque TypeError instead of the
    # intended error message.
    pipeline_class_name = config.get('Training', 'pipeline_class', fallback=None)
    if pipeline_class_name is None:
        raise Exception("Model pipeline could not be found: option 'pipeline_class' is "
                        "missing in section 'Training'")
    pipeline_path = 'models.' + pipeline_class_name + '.ModelParam'
    PipelineClass = locate(pipeline_path)  # presumably pydoc.locate, which returns None on failure
    if PipelineClass is None:
        raise Exception("Model pipeline could not be found: {}".format(pipeline_path))
    model_param = PipelineClass()

    X_train, y_train, X_val, y_val, y_classes, selected_features, \
    feature_dict, paths, scorers, refit_scorer_name = exe.load_training_input_input(config)
    scorer = scorers[refit_scorer_name]

    results_directory = paths['results_directory']
    save_fig_prefix = results_directory + '/model_images'

    # Baseline test
    baseline_results = exe.execute_baseline_classifier(X_train, y_train, X_val, y_val,
                                                       y_classes, scorer)
    print("Baseline results=", baseline_results)

    # Set classifier and estimate performance
    model_clf = model_param.create_pipeline()['model']
    log.info("{} selected.".format(model_clf))

    run_training_estimation(X_train, y_train, X_val, y_val, scorer, model_clf, save_fig_prefix)
def main(config_path, on_inference_data, no_images, no_source_data):
    """Load the raw files, run the first cleanup pass and dump the results for the next step.

    The dump is written to ``<prepared_data_directory>/temp/step31out.pickle`` and contains
    the tuple ``(features_cleaned1, outcomes_cleaned1, class_labels, data_source_raw,
    data_directory, result_directory)``.

    :args:
        config_path: Path to the configuration file
        on_inference_data: Passed through to analyze_raw_data
        no_images: Passed through to analyze_raw_data
        no_source_data: Passed through to load_files
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)

    data_directory = conf['Paths'].get('prepared_data_directory')
    result_directory = os.path.join(conf['Paths'].get('results_directory'), "data_preparation")

    data_preparation_dump_file_path = os.path.join(conf['Paths'].get('prepared_data_directory'),
                                                   "temp", "step31out.pickle")
    os.makedirs(os.path.dirname(data_preparation_dump_file_path), exist_ok=True)

    features_path = os.path.join(conf['Preparation'].get('features_in'))
    if 'outcomes_in' in conf['Preparation']:
        outcomes_path = os.path.join(conf['Preparation'].get('outcomes_in'))
    else:
        # Without configured outcomes the data is treated as inference data
        outcomes_path = None
        print("No outcomes in, do inference")
    labels_path = conf['Paths'].get('labels_path')
    source_path = os.path.join(conf['Preparation'].get('source_in'))

    # Load files
    features_raw, outcomes_cleaned1, data_source_raw, class_labels = load_files(
        features_path, outcomes_path, source_path, labels_path, no_source_data)

    # Data cleanup of features and outcomes before features are modified
    features_cleaned1 = clean_features_first_pass(features_raw, class_labels)

    analyze_raw_data(features_cleaned1, outcomes_cleaned1, result_directory,
                     conf['Common'].get('dataset_name'), conf['Common'].get('class_name'),
                     no_images, on_inference_data)

    # Save structures for further processing. The file is opened in a with-block so that the
    # handle is flushed and closed even if dumping fails.
    dump_content = (features_cleaned1, outcomes_cleaned1, class_labels, data_source_raw,
                    data_directory, result_directory)
    with open(data_preparation_dump_file_path, 'wb') as dump_file:
        dump(dump_content, dump_file)

    print("Stored paths to: ", data_preparation_dump_file_path)
def main(config_path):
    """Run the feature selection algorithms and save the selected feature columns.

    The transposed result of ``perform_feature_selection_algorithms`` is printed and written
    as ';'-separated csv to the path configured as 'selected_feature_columns_out' in section
    'Preparation'.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)
    features, y, _df_y, _class_labels = sup.load_features(conf)

    results_root = conf['Paths'].get('results_directory')
    image_save_directory = results_root + "/data_preparation"

    output_setting = conf['Preparation'].get("selected_feature_columns_out")
    selected_feature_columns_filename = os.path.join(output_setting)

    selected_feature_list = perform_feature_selection_algorithms(features, y, conf,
                                                                 image_save_directory)

    print("List of selected features")
    print(selected_feature_list.transpose())

    transposed_selection = selected_feature_list.transpose()
    transposed_selection.to_csv(selected_feature_columns_filename,
                                sep=';',
                                index=False,
                                header=True)
    print("Saved selected feature columns to " + selected_feature_columns_filename)
def main(config_path):
    """Bring source, features and (optional) outcomes to a common index and save them.

    Uncut features and outcomes are read from ``<prepared_data_directory>/temp``. Features are
    cleaned with ``clean_nan``; if an outcomes file exists, all frames are cut to the index
    shared by outcomes and cleaned features. The results are written as ';'-separated csv
    files to the paths configured in section 'Generation'. Outcomes are only written if an
    outcomes file was found and 'outcomes_out' is configured.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)

    prepared_data_directory = conf['Paths'].get('prepared_data_directory')
    outcomes_filename_uncut = os.path.join(prepared_data_directory, "temp",
                                           "temp_outcomes_uncut" + ".csv")
    features_filename_uncut = os.path.join(prepared_data_directory, "temp",
                                           "temp_features_uncut" + ".csv")

    # Load the uncut data
    source_uncut = custom.load_source(conf['Paths'].get('source_path'))
    features_uncut = pd.read_csv(features_filename_uncut, sep=';').set_index('id')
    if os.path.isfile(outcomes_filename_uncut):
        outcomes_uncut = pd.read_csv(outcomes_filename_uncut, sep=';').set_index('id')
        print("Outcomes file found. Adapting dimensions for training data.")
        print("Outcomes shape: ", outcomes_uncut.shape)
    else:
        outcomes_uncut = None
        print("Outcomes file not found. Adapting dimensions for inference data.")

    print("Source shape: ", source_uncut.shape)
    print("Features shape: ", features_uncut.shape)

    # Clean features: cut NaNs
    features_reduced1 = clean_nan(features_uncut)

    if outcomes_uncut is not None:
        # Cut all dataframes to have the same index
        intersection_index = outcomes_uncut.index.intersection(features_reduced1.index)
        outcomes = outcomes_uncut.loc[intersection_index]
        print("Cut outcomes shape: ", outcomes.shape)
    else:
        outcomes = None
        intersection_index = features_reduced1.index
        print("Nothing will be cut. Size of features will be used.")

    features = features_reduced1.loc[intersection_index]
    source = source_uncut.loc[intersection_index]

    print("Cut source shape: ", source.shape)
    print("Cut features shape: ", features.shape)

    # Cut for subsets. Currently the complete range is used.
    subset_start = 0
    subset_stop = features.shape[0]
    features_subset = cut_dataframe_subset(features, subset_start, subset_stop)
    source_subset = cut_dataframe_subset(source, subset_start, subset_stop)

    print("Subset source shape: ", source_subset.shape)
    print("Subset features shape: ", features_subset.shape)

    if 'outcomes_out' in conf['Generation']:
        outcomes_out_filename = os.path.join(conf['Generation'].get('outcomes_out'))
    else:
        # Without an output path the outcomes are dropped, even if they were loaded
        outcomes_out_filename = None
        outcomes = None
        print("Only preparing features for inference. No outcomes file used.")

    features_out_filename = os.path.join(conf['Generation'].get('features_out'))
    source_out_filename = os.path.join(conf['Generation'].get('source_out'))

    # All three values are output paths; the first two were mislabelled as "in" before
    print("=== Paths ===")
    print("Features out: ", features_out_filename)
    print("Outcomes out: ", outcomes_out_filename)
    print("Source out: ", source_out_filename)

    # Save the features
    print("Feature shape {}".format(features_subset.shape))
    features_subset.to_csv(features_out_filename, sep=';', index=True, header=True)
    print("Saved features graph to " + features_out_filename)

    # Save the graph data for visualization of the results
    print("source shape {}".format(source_subset.shape))
    source_subset.to_csv(source_out_filename, sep=';', index=True, header=True)
    print("Saved source graph to " + source_out_filename)

    if outcomes is not None:
        outcomes_subset = cut_dataframe_subset(outcomes, subset_start, subset_stop)
        print("Subset outcomes shape: ", outcomes_subset.shape)

        # Save the outcomes
        print("Outcomes shape {}".format(outcomes_subset.shape))
        outcomes_subset.to_csv(outcomes_out_filename, sep=';', index=True, header=True)
        print("Saved outcomes graph to " + outcomes_out_filename)
def execute_narrow_search(config_path):
    '''
    Execute a narrow search on the subset of data

    The best pipe of the first selection is loaded from paths['pipe_first_selection'] and
    refined with perform_run2_svm (model type 'svm') or perform_run2_xgboost (any other model
    type). The resulting pipe is dumped to the path configured as 'pipeline_out' in section
    'Training'. If the run returned results, they are dumped as well and exported as csv.

    :args:
        config_path: Path to the configuration file, loaded with sup.load_config
    :return:
        Nothing
    :raises:
        Exception: If no pipeline class is configured or the class cannot be located
    '''
    # Load config
    config = sup.load_config(config_path)

    # Load complete training input
    X_train, y_train, X_val, y_val, y_classes, selected_features, \
    feature_dict, paths, scorers, refit_scorer_name = exe.load_training_input_input(config)

    # Resolve the pipeline class BEFORE instantiating it. The previous check tested the
    # created instance, so a failed lookup surfaced as an opaque TypeError instead of the
    # intended error message.
    pipeline_class_name = config.get('Training', 'pipeline_class', fallback=None)
    if pipeline_class_name is None:
        raise Exception("Model pipeline could not be found: option 'pipeline_class' is "
                        "missing in section 'Training'")
    pipeline_path = 'models.' + pipeline_class_name + '.ModelParam'
    PipelineClass = locate(pipeline_path)  # presumably pydoc.locate, which returns None on failure
    if PipelineClass is None:
        raise Exception("Model pipeline could not be found: {}".format(pipeline_path))
    model_param = PipelineClass()

    # Load narrow training parameters; the option values are JSON encoded
    iter_setup = {
        'samples': json.loads(config.get("Training", "narrow_samples")),
        'kfolds': json.loads(config['Training'].get('narrow_kfolds')),
        'iter': json.loads(config['Training'].get('narrow_iterations')),
        'selection': json.loads(config['Training'].get('narrow_selection')),
    }

    results_run2_file_path = paths['run2_result_filename']
    pipe_first_selection = paths['pipe_first_selection']
    # Use predefined export location for the pipe
    pipe_final_selection = config.get('Training', 'pipeline_out')
    result_directory = paths['results_directory']

    save_fig_prefix = result_directory + '/model_images'
    os.makedirs(save_fig_prefix, exist_ok=True)

    # Load saved results. NOTE: unpickling executes code from the file, so only pipes written
    # by this pipeline should be loaded here.
    with open(pipe_first_selection, "rb") as pipe_file:
        pipe_run_best_first_selection = pickle.load(pipe_file)

    if model_param.get_model_type() == 'svm':
        pipe_run_second_selection, results_run2 = perform_run2_svm(
            X_train, iter_setup, pipe_run_best_first_selection, refit_scorer_name,
            save_fig_prefix, scorers, y_train)
    else:
        warnings.warn("No 2nd search will be performed for {}".format(
            model_param.get_model_type()))
        pipe_run_second_selection, results_run2 = perform_run2_xgboost(
            X_train, iter_setup, pipe_run_best_first_selection, refit_scorer_name,
            save_fig_prefix, scorers, y_train)

    print("Model parameters defined", pipe_run_second_selection)

    print("Save model")
    # Save best pipe
    with open(pipe_final_selection, 'wb') as pipe_file:
        dump(pipe_run_second_selection, pipe_file)
    print("Stored pipe_run_best_first_selection at ", pipe_final_selection)

    # Save results, only if any second runs were made
    if results_run2 is not None:
        with open(results_run2_file_path, 'wb') as results_file:
            dump(results_run2, results_file)
        print("Stored results ", results_run2_file_path)

        results_run2.round(4).to_csv(results_run2_file_path + "_results.csv", sep=";")

    # NOTE(review): the textual pipe description does not depend on results_run2 and is
    # written in every case - confirm that it was not meant to be limited to the branch above.
    with open(results_run2_file_path + "_pipe.txt", 'w') as f:
        print(pipe_run_second_selection, file=f)

    print("Method end")
def evaluate_model(config_path, config_section="EvaluationTraining"):
    '''
    Predict the evaluation data with the loaded model and plot the evaluation graphs.

    For two classes, the loaded precision/recall threshold is applied to the probability of
    the second class and the precision/recall over threshold graph is plotted. For more
    classes the plain predictions are used. Figures are stored with the prefix
    <results_directory>/model_images/<title>.

    :args:
        config_path: Path to the configuration file, loaded with sup.load_config
        config_section: Configuration section with the evaluation settings, e.g. the title
    :return:
        Nothing
    '''
    # Get data
    config = sup.load_config(config_path)
    print("Load paths")
    paths = Paths(config).paths

    X_val, y_val, labels, model, external_params = evalutil.load_evaluation_data(
        config, config_section)
    y_classes = labels

    title = config.get(config_section, 'title')
    model_image_directory = paths['results_directory'] + '/model_images'
    figure_path_prefix = model_image_directory + '/' + title
    os.makedirs(model_image_directory, exist_ok=True)

    # Load model external parameters
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: ", pr_threshold)

    print("Predict validation data")
    validation_values = X_val.values
    y_test_pred = model.predict(validation_values)
    # If there is an error here, set model_pipe['svm'].probability = True
    y_test_pred_proba = model.predict_proba(validation_values)
    y_test_pred_scores = y_test_pred_proba[:, 1]

    # Reduce the number of classes only to classes that can be found in the data
    reduced_class_dict_test = model_util.reduce_classes(y_classes, y_val, y_test_pred)

    if len(y_classes) != 2:
        y_test_pred_adjust = y_test_pred
        print("This is a multi class problem. No precision/recall adjustment of scores are made.")
    else:
        y_test_pred_adjust = model_util.adjusted_classes(y_test_pred_scores, pr_threshold)
        print("This is a binarized problem. Apply optimal threshold to precision/recall. Threshold=",
              pr_threshold)

        # Plot the precision and the recall together with the selected value for the test set
        print("Plot precision recall graphs")
        precision, recall, thresholds = precision_recall_curve(y_val, y_test_pred_scores)
        vis.plot_precision_recall_vs_threshold(precision,
                                               recall,
                                               thresholds,
                                               pr_threshold,
                                               save_fig_prefix=figure_path_prefix,
                                               title_prefix="pr_adjusted")

    # NOTE(review): the adjusted evaluation is plotted for every class count, as the adjusted
    # predictions are defined in both branches above - confirm against the original layout.
    # First the adjusted predictions, then the unadjusted ones.
    for predictions, prefix in ((y_test_pred_adjust, "pr_adjusted"), (y_test_pred, "")):
        vis.plot_precision_recall_evaluation(y_val,
                                             predictions,
                                             y_test_pred_proba,
                                             reduced_class_dict_test,
                                             save_fig_prefix_dir=figure_path_prefix,
                                             title_prefix=prefix)

    # Plot decision boundary plot on at most the first 1000 samples
    decision_range = slice(0, 1000)
    X_decision = X_val.values[decision_range, :]
    y_decision = y_val[decision_range]
    vis.plot_decision_boundary(X_decision,
                               y_decision,
                               model,
                               title_prefix=title + "_",
                               save_fig_prefix=figure_path_prefix)

    print("Visualization complete")
def main(config_path, debug_param):
    """Generate all features from the source data and save them uncut.

    The feature frame starts with the index of the source and is extended by joining the
    output of each feature generator in turn. The result is written as ';'-separated csv to
    ``<prepared_data_directory>/temp/temp_features_uncut.csv``. Overview figures of the source,
    the stochastics and the MACD features are created on the way.

    :args:
        config_path: Path to the configuration file
        debug_param: Forwarded as ``debug_param`` to the generators that accept it
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)

    image_save_directory = os.path.join(conf['Paths'].get('results_directory'),
                                        "data_generation")
    features_filename_uncut = os.path.join(conf['Paths'].get('prepared_data_directory'),
                                           "temp", "temp_features_uncut" + ".csv")
    os.makedirs(os.path.dirname(features_filename_uncut), exist_ok=True)

    source = custom.load_source(conf['Paths'].get('source_path'))

    # Plot source
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(conf['Paths'].get('source_path'))
    plt.show(block=False)

    # The detail figures only show the first 100 entries
    first_entries = slice(0, 100)

    def open_overview_figure():
        # New three panel figure with the close price in the top panel
        plt.figure(num=None, figsize=(10, 7), dpi=80, facecolor='w', edgecolor='k')
        plt.subplot(311)
        plt.plot(source['Date'][first_entries], source['Close'][first_entries])
        plt.title("Close")

    def draw_panel(position, panel_title, first_line, second_line, legend=None):
        # One panel with two lines over the date axis and an optional legend
        plt.subplot(position)
        plt.title(panel_title)
        plt.plot(source['Date'][first_entries], first_line[first_entries])
        plt.plot(source['Date'][first_entries], second_line[first_entries])
        if legend is not None:
            plt.legend(legend)

    # Define features df
    features = pd.DataFrame(index=source.index)

    # Generate price based values
    normed_days_features = price_normalizer(source, debug_param=debug_param)
    features = features.join(normed_days_features)

    number_days_features = impulse_count(source, debug_param=debug_param)
    features = features.join(number_days_features)

    mean_features = calculate_moving_average(source, debug_param=debug_param)
    features = features.join(mean_features)

    madiff_features = calculate_moving_average_direction(source, mean_features)
    features = features.join(madiff_features)

    rsi_features = get_rsi(source, debug_param=debug_param)
    features = features.join(rsi_features)

    rsi_change_features = get_rsi_difference(source)
    features = features.join(rsi_change_features)

    stoch_features = get_stochastics(source)
    features = features.join(stoch_features)

    # Stochastics: second and first column, then last and second last column
    open_overview_figure()
    draw_panel(312, "Stochastic Variant " + str(stoch_features.columns[1]),
               stoch_features.iloc[:, 1], stoch_features.iloc[:, 0])
    draw_panel(313, "Stochastic Variant " + str(stoch_features.columns[-1]),
               stoch_features.iloc[:, -1], stoch_features.iloc[:, -2])
    plt.tight_layout()

    macd_features = get_macd(source)
    features = features.join(macd_features)

    # MACD: first two columns, then last two columns
    macd_legend = ("MACD", "MACD Signal")
    open_overview_figure()
    draw_panel(312, "MACD Variant 1", macd_features.iloc[:, 0], macd_features.iloc[:, 1],
               legend=macd_legend)
    draw_panel(313, "MACD Variant 1", macd_features.iloc[:, -2], macd_features.iloc[:, -1],
               legend=macd_legend)
    plt.tight_layout()

    macd_diff_features = get_macd_difference(macd_features)
    features = features.join(macd_diff_features)

    macd_direction_change_features = get_trigger_signals(macd_diff_features)
    features = features.join(macd_direction_change_features)

    periodic_values = get_periodical_indicators(source)
    features = features.join(periodic_values)

    # Features structure
    print("Features: ", features.head(10))
    print("Features shape: ", features.shape)

    # Save features to a csv file
    print("Features shape {}".format(features.shape))
    features.to_csv(features_filename_uncut, sep=';', index=True, header=True)
    print("Saved features to " + features_filename_uncut)

    dataset_name = conf['Common'].get('dataset_name')
    print("=== Data for {} prepared to be trained or inferred ===".format(dataset_name))
def visualize_temporal_data(config_path, config_section):
    """Plot the model predictions of the evaluation data over the source time graph.

    Two graphs are created with ``vis.plot_three_class_graph``: one for the plain model
    predictions and one for the predictions adjusted with the loaded precision/recall
    threshold. The figures are stored with the prefix ``<results_directory>/evaluation``.

    :args:
        config_path: Path to the configuration file
        config_section: Configuration section that holds 'title' and 'source_in'
    :return:
        Nothing
    """
    # Load configuration and file paths
    config = sup.load_config(config_path)
    print("Load paths")
    paths = Paths(config).paths

    title = config.get(config_section, 'title')

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(
        config, config_section)
    model_name = config['Common'].get('dataset_name')
    source_path = config[config_section].get('source_in')
    result_directory = paths['results_directory']

    figure_path_prefix = result_directory + '/evaluation'
    os.makedirs(figure_path_prefix, exist_ok=True)

    # Load model external parameters
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: {0:.2f}".format(pr_threshold))

    # Make predictions. The adjusted classes are derived from the probability of the second
    # class and the loaded threshold.
    y_test_pred_scores = model.predict_proba(X_val.values)[:, 1]
    y_test_pred = model.predict(X_val.values)
    y_test_pred_adjust = model_util.adjusted_classes(y_test_pred_scores, pr_threshold)

    # Load original data for visualization. The former call
    # df_time_graph['Date'].apply(mdates.date2num) was removed: Series.apply returns a new
    # object and the result was discarded, so the statement had no effect on the data.
    df_time_graph = pd.read_csv(source_path, delimiter=';').set_index('id')
    df_time_graph['Date'] = pd.to_datetime(df_time_graph['Date'])
    print("Loaded feature names for time graph={}".format(df_time_graph.columns))
    print("X. Shape={}".format(df_time_graph.shape))

    # Create a df from the y array for the visualization functions
    y_order_test_pred = pd.DataFrame(index=X_val.index,
                                     data=pd.Series(data=y_test_pred,
                                                    index=X_val.index,
                                                    name="y")).sort_index()
    y_order_test_pred_adjust = pd.DataFrame(index=X_val.index,
                                            data=pd.Series(data=y_test_pred_adjust,
                                                           index=X_val.index,
                                                           name="y")).sort_index()

    # Visualize the results. Both frames are built on the same index, so the sorted index of
    # the unadjusted frame selects the source rows for both graphs.
    print("Plot for inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(y_order_test_pred['y'].values,
                               df_time_graph['Close'][y_order_test_pred.index],
                               df_time_graph['Date'][y_order_test_pred.index],
                               0, 0, 0, ('close', 'neutral', 'positive', 'negative'),
                               title=title + "_Inference_" + model_name,
                               save_fig_prefix=figure_path_prefix)

    vis.plot_three_class_graph(y_order_test_pred_adjust['y'].values,
                               df_time_graph['Close'][y_order_test_pred.index],
                               df_time_graph['Date'][y_order_test_pred.index],
                               0, 0, 0, ('close', 'neutral', 'positive', 'negative'),
                               title=title + "_Inference_Adjusted" + model_name,
                               save_fig_prefix=figure_path_prefix)
def main(config_path):
    """Adapt the cleaned features for the model and save features, outcomes and labels.

    The input is the dump ``<prepared_data_directory>/temp/step31out.pickle``. Features are
    always written to 'features_out' of section 'Preparation'. Outcomes and class labels are
    only written if they are available and 'outcomes_out' respectively 'labels_out' is
    configured.

    :args:
        config_path: Path to the configuration file
    :return:
        Nothing
    """
    conf = sup.load_config(config_path)

    data_directory = conf['Paths'].get('prepared_data_directory')
    data_preparation_dump_file_path = os.path.join(data_directory, "temp", "step31out.pickle")

    # NOTE: unpickling executes code from the file, so only dumps written by this pipeline
    # should be loaded. The with-block closes the file handle, which was leaked before.
    with open(data_preparation_dump_file_path, "rb") as dump_file:
        (features_cleaned1, outcomes_cleaned1, class_labels, data_source_raw, data_directory,
         result_directory) = pickle.load(dump_file)

    class_name = conf['Common'].get('class_name')

    model_features_filename = os.path.join(conf['Preparation'].get('features_out'))
    if 'outcomes_out' in conf['Preparation']:
        model_outcomes_filename = os.path.join(conf['Preparation'].get('outcomes_out'))
    else:
        model_outcomes_filename = None
        print("No outcomes out defined. Use inference settings with no outcomes")

    if 'labels_out' in conf['Preparation']:
        model_labels_filename = os.path.join(conf['Preparation'].get('labels_out'))
    else:
        model_labels_filename = None
        print("No labels file available for inference.")

    features, y, class_labels = adapt_features_for_model(features_cleaned1, outcomes_cleaned1,
                                                         result_directory, class_labels, conf)

    # === Save features to a csv file ===#
    print("Features shape {}".format(features.shape))
    features.to_csv(model_features_filename, sep=';', index=True)
    print("Saved features to " + model_features_filename)

    # === Save the selected outcome to a csv file ===#
    if y is None:
        print("y values not saved as no outcome was provided.")
    elif model_outcomes_filename is None:
        # Without a target path, to_csv would only return a string and the following log
        # line would fail with a TypeError
        print("y values not saved as no outcomes out is defined.")
    else:
        print("outcome shape {}".format(y.shape))
        y_true = pd.DataFrame(y, columns=[class_name], index=outcomes_cleaned1.index)
        y_true.to_csv(model_outcomes_filename, sep=';', index=True, header=True)
        print("Saved outcomes to " + model_outcomes_filename)

    # === Save new y labels to a csv file ===#
    if class_labels is None:
        print("Class labels were not saved as no outcome was available.")
    elif model_labels_filename is None:
        print("Class labels were not saved as no labels out is defined.")
    else:
        print("Class labels length {}".format(len(class_labels)))
        with open(model_labels_filename, 'w') as f:
            # Classes are saved inverse to the labelling in the file, i.e. first value,
            # then key
            for key, value in class_labels.items():
                f.write("%s;%s\n" % (value, key))
        print("Saved class names and id to " + model_labels_filename)
def convert_time_series(config_path, feature_dir):
    """
    Convert the validation time series into rolling csv files and candle chart images.

    Features, outcomes and the OHLC source graph are joined and indexed by 'Date'. For every
    complete rolling window of 250 rows, the columns 'SMA200', 'RSI_14' and 'LongTrend' are
    saved to ``./prepared-data/val_rolling_csv/roll_<id>.csv``. A candle chart is saved to
    the ``pos`` or ``neg`` folder of ``./prepared-data/val_rolling_images``, depending on
    the 'LongTrend' value (1 or 0) of the last row of the window. <id> is the id of that row.

    :args:
        config_path: Path to the configuration file
        feature_dir: Directory with features_val.csv, outcomes_val.csv and source.csv
    :return:
        Nothing
    """
    window_size = 250
    csv_directory = "./prepared-data/val_rolling_csv"
    image_directories = {
        1: "./prepared-data/val_rolling_images/pos",
        0: "./prepared-data/val_rolling_images/neg",
    }

    # Load time series
    config = sup.load_config(config_path)
    X_train_path = os.path.join(feature_dir, "features_val.csv")
    y_train_path = os.path.join(feature_dir, "outcomes_val.csv")
    source_path = os.path.join(feature_dir, "source.csv")

    os.makedirs(csv_directory, exist_ok=True)
    os.makedirs(image_directories[1], exist_ok=True)
    os.makedirs(image_directories[0], exist_ok=True)

    # Load X and y
    X_train, y_train_df, y_train = exe.load_data(X_train_path, y_train_path)

    # Load source data
    df_time_graph = stock.load_ohlc_graph(source_path)
    # Not used further; kept because the lookup presumably fails early if feature ids are
    # missing in the source - TODO confirm that this check is wanted
    df_time_graph_red = df_time_graph.loc[X_train.index]

    # Merge all values
    all_df = X_train.join(df_time_graph).join(y_train_df)
    all_df.reset_index(inplace=True)
    all_df.set_index('Date', inplace=True)

    all_columns = ['id', 'SMA200', 'RSI_14', 'Open', 'High', 'Low', 'Close', 'LongTrend']
    csv_columns = ['SMA200', 'RSI_14', 'LongTrend']
    all_df_red = all_df[all_columns]

    # Create rolling values in a df for multivariates
    for df_subset in all_df_red.rolling(window_size):
        # The first windows of the iteration are shorter than the window size
        if df_subset.shape[0] < window_size:
            continue

        last_row = df_subset.iloc[-1]
        print("Processing id {}".format(last_row.id))
        file_stem = "roll_" + str(int(last_row.id))

        # Save subset as file
        csv_path = csv_directory + "/" + file_stem + ".csv"
        df_subset[csv_columns].to_csv(csv_path, sep=';', index=True, header=True)

        # Select the image folder from the class of the last row. Before, any other value
        # than 0 or 1 left image_path undefined or pointing to the image of the previous
        # window, which then was overwritten with the wrong chart.
        if last_row.LongTrend == 1:
            image_path = image_directories[1] + "/" + file_stem + ".png"
        elif last_row.LongTrend == 0:
            image_path = image_directories[0] + "/" + file_stem + ".png"
        else:
            print("No image saved for id {} as LongTrend is {}".format(last_row.id,
                                                                       last_row.LongTrend))
            continue

        # Create graph of subset: candles with moving averages and the RSI in a second panel
        apd = mpf.make_addplot(df_subset['RSI_14'],
                               panel=1,
                               color='black',
                               ylim=(10, 90),
                               secondary_y=True)
        mpf.plot(df_subset,
                 type='candle',
                 volume=False,
                 mav=(20, 100, 200),
                 figscale=1.5,
                 addplot=apd,
                 panel_ratios=(1, 0.3),
                 savefig=image_path)

    print("End")
def main(config_path):
    """Generate the ground-truth outcomes for the source data and plot them.

    Steps:

    1. Load the configuration and the source data given by
       ``Paths/source_path``.
    2. Plot the closing price and save it as ``Source_data``.
    3. Compute the outcomes with ``generate_features_outcomes``.
    4. Drop the last 50 rows of source and outcomes, plot every outcome
       column as a three-class graph and the trend columns additionally as a
       two-class graph (class ``Common/class_number`` versus the rest).
    5. Write the cut outcomes to
       ``<prepared_data_directory>/temp/temp_outcomes_uncut.csv``.

    Args:
        config_path: Path of the configuration file passed to
            ``sup.load_config``.
    """
    conf = sup.load_config(config_path)
    paths = conf['Paths']

    # Generating filenames for saving the files
    image_save_directory = os.path.join(paths.get('results_directory'),
                                        "data_generation")
    outcomes_filename_raw = os.path.join(paths.get('prepared_data_directory'),
                                         "temp",
                                         "temp_outcomes_uncut" + ".csv")
    os.makedirs(os.path.dirname(outcomes_filename_raw), exist_ok=True)

    source = custom.load_source(paths.get('source_path'))

    # Plot source
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(paths.get('source_path'))
    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename="Source_data")

    outcomes = generate_features_outcomes(image_save_directory, source)

    # Drop the 50 last values as they cannot be used for prediction as
    # +50 days ahead is predicted
    source_cut = source.drop(source.tail(50).index, inplace=False)
    outcomes_cut = outcomes.drop(outcomes.tail(50).index, inplace=False)

    dataset_name = conf['Common'].get('dataset_name')
    trend_legend = ('close', 'neutral', 'positive', 'negative')
    trend_columns = ('1dTrend', '5dTrend', '20dTrend', 'LongTrend')

    # Three-class ground-truth plots, one per outcome column
    three_class_plots = [(column, trend_legend) for column in trend_columns]
    three_class_plots.append(
        ('TopsBottoms', ('close', 'neutral', 'top', 'bottom')))
    for column, legend in three_class_plots:
        vis.plot_three_class_graph(outcomes_cut[column].values,
                                   source_cut['Close'],
                                   source_cut['Date'],
                                   0, 0, 0,
                                   legend,
                                   title=dataset_name + '_GT_' + column,
                                   save_fig_prefix=image_save_directory)

    # Two-class plots: 1 where the outcome equals the configured class, else 0.
    # NOTE(review): these use the same titles as the three-class trend plots
    # above; whether the saved figures collide depends on vis - confirm.
    class_number = conf['Common'].getint('class_number')
    for column in trend_columns:
        binary_outcome = (outcomes_cut[column] == class_number).astype(int)
        vis.plot_two_class_graph(binary_outcome,
                                 source_cut['Close'],
                                 source_cut['Date'],
                                 0,
                                 ('close', 'Positive Trend'),
                                 title=dataset_name + '_GT_' + column,
                                 save_fig_prefix=image_save_directory)

    # Save outcomes to a csv file
    print("Outcomes shape {}".format(outcomes_cut.shape))
    outcomes_cut.to_csv(outcomes_filename_raw, sep=';', index=True, header=True)
    print("Saved outcomes to " + outcomes_filename_raw)
def generate_values_for_backtesting(config_path, config_section,
                                    smoothing_window=3):
    """Generate values for backtesting to find out how well the system performs.

    First, an SMA200 based reference signal is created with
    ``generate_reference_results``. Second, the model predictions for the
    validation data are created and smoothed with a moving average. The raw
    predictions, the reference values, the validation outcomes and the
    smoothed predictions are joined and written to
    ``<results_directory>/evaluation/outcomes_backtest.csv``.

    Args:
        config_path: Path of the configuration file passed to
            ``sup.load_config``.
        config_section: Name of the config section that is handed to
            ``eval.load_evaluation_data`` and that holds ``source_in``.
        smoothing_window: Length of the moving average applied to the
            predictions (default 3).
    """
    conf = sup.load_config(config_path)
    print("Load paths")

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(
        conf, config_section)
    source_path = conf[config_section].get('source_in')

    # Load model external parameters.
    # The threshold is only reported; it is not applied to the predictions here.
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: {0:.2f}".format(pr_threshold))

    df_time_graph = stock.load_ohlc_graph(source_path)

    # Y_val
    y_val_data = pd.DataFrame(index=X_val.index, data=y_val, columns=['val'])

    # Reference system
    print("Create reference values")
    y_ref = generate_reference_results(X_val['SMA200'])
    y_red_data = pd.DataFrame(y_ref)

    # Visualize the results
    figure_path_prefix = os.path.join(conf['Paths'].get('results_directory'),
                                      "evaluation")
    print("Plot for inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(y_ref,
                               df_time_graph['Close'][y_red_data.index],
                               df_time_graph['Date'][y_red_data.index],
                               0, 0, 0,
                               ('close', 'neutral', 'positive', 'negative'),
                               title="Reference_System_SMA200" + "_Validation_",
                               save_fig_prefix=figure_path_prefix)

    # Prediction system
    print("Get prediction values")
    y_test_pred = model.predict(X_val.values)
    y_test_pred_data = pd.DataFrame(index=X_val.index,
                                    data=y_test_pred,
                                    columns=['model'])

    # Post processing for prediction: moving average followed by a threshold
    smoothed_data = _smooth_binary_predictions(y_test_pred_data['model'],
                                               smoothing_window)
    y_test_pred_data_smoothed = pd.DataFrame(index=X_val.index,
                                             data=smoothed_data,
                                             columns=['model_pp'])

    print("Plot for post processed inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(
        smoothed_data,
        df_time_graph['Close'][y_test_pred_data_smoothed.index],
        df_time_graph['Date'][y_test_pred_data_smoothed.index],
        0, 0, 0,
        ('close', 'neutral', 'positive', 'negative'),
        title="Smoothed_Prediction_Model" + "_Validation_",
        save_fig_prefix=figure_path_prefix)

    # Merge all values
    all_data = y_test_pred_data.join(y_red_data).join(y_val_data).join(
        y_test_pred_data_smoothed)

    pred_outcomes_filename = os.path.join(
        conf['Paths'].get('results_directory'), "evaluation",
        "outcomes_backtest.csv")
    # Make sure the target folder exists before writing the CSV
    os.makedirs(os.path.dirname(pred_outcomes_filename), exist_ok=True)
    all_data.to_csv(pred_outcomes_filename, sep=';', index=True, header=True)

    print("Completed")


def _smooth_binary_predictions(predictions, window):
    """Return 0/1 values from a moving average of ``predictions``.

    The average over ``window`` consecutive values is computed, the first
    ``window - 1`` positions (which have no complete window) are filled with
    zeros so the result has the same length as the input, and every position
    whose average exceeds 0.5 becomes 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1, got {}".format(window))

    values = np.asarray(predictions)
    if len(values) < window:
        # No complete window exists; np.convolve would return a result of
        # the wrong length (or fail on empty input), so every position gets
        # the same 0 that the padding positions get.
        return np.zeros(len(values), dtype=int)

    averaged = np.convolve(values, np.ones(window) / window, mode='valid')
    padded = np.concatenate([np.zeros(window - 1), averaged])
    return (padded > 0.5) * 1
def __init__(self, config_file_path):
    """Load the configuration from ``config_file_path`` and keep it as ``self.conf``."""
    loaded_conf = sup.load_config(config_file_path)
    self.conf = loaded_conf
def train_final_model(config_path, config_section="Evaluation"):
    """Fit the final pipeline on the training data and store it with joblib.

    The training data and the pipeline come from ``load_data``. The pipeline
    step named ``'model'`` gets ``probability = True`` before fitting, and the
    fitted pipeline is dumped to the ``model_out`` path of ``config_section``.

    Args:
        config_path: Path of the configuration file passed to
            ``sup.load_config``.
        config_section: Config section that provides ``model_out`` and is
            handed to ``load_data`` (default ``"Evaluation"``).

    Raises:
        ValueError: If ``model_out`` is not set in ``config_section``.
    """
    # Get data
    config = sup.load_config(config_path)
    X_train, y_train, pipe = load_data(config, config_section)

    # Resolve and prepare the output location before the fit, so that a
    # missing setting or folder does not throw away a finished training run.
    svm_final_model_filepath = config[config_section].get('model_out')
    if not svm_final_model_filepath:
        raise ValueError(
            "No 'model_out' path configured in section '{}'".format(
                config_section))
    model_directory = os.path.dirname(svm_final_model_filepath)
    if model_directory:
        os.makedirs(model_directory, exist_ok=True)

    print("")
    print("Set probability measurements in the model to True")
    # Presumably the 'model' step is an SVM classifier (see the log messages)
    # - TODO confirm; the attribute is set regardless of the estimator type.
    pipe['model'].probability = True
    print("Original final pipe: ", pipe)

    print("=== Start training the SVM at {} ===".format(time.ctime()))
    # perf_counter is monotonic, so the duration is not affected by
    # adjustments of the system clock.
    start = time.perf_counter()
    clf = pipe.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print("Training took {0:.2f}s".format(elapsed))

    print("Store model")
    print("Model to save: ", clf)
    joblib.dump(clf, svm_final_model_filepath)
    print("Saved model at location ", svm_final_model_filepath)