Example #1
def main(config_path):
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    source_filename = conf['Paths'].get(
        "prepared_data_directory") + "/source.csv"
    source = sup.load_data_source(source_filename)

    image_save_directory = conf['Paths'].get(
        'results_directory') + "/data_preparation"

    # StandardScaler because the features are approximately normally distributed.
    # Don't use a min-max scaler for PCA or unsupervised learning, as the axes
    # shall be centered, not shifted.
    scaler = StandardScaler()
    scaler.fit(features)
    # Reuse this fitted scaler for the test data at the end
    X_scaled = pd.DataFrame(data=scaler.transform(features),
                            index=features.index,
                            columns=features.columns)
    print("Unscaled values")
    print(features.iloc[0:2, :])
    print("Scaled values")
    print(X_scaled.iloc[0:2, :])
    # Use a separate scaler for the outcomes so the feature scaler stays intact
    y_scaler = StandardScaler()
    y_scaler.fit(df_y)
    y_scaled = pd.DataFrame(data=y_scaler.transform(df_y),
                            index=df_y.index,
                            columns=df_y.columns)

    # Reduce the training set to a randomly chosen subset of samples
    X_train_index_subset = sup.get_random_data_subset_index(1000, features)
    X_train_scaled_subset = X_scaled.iloc[X_train_index_subset, :]
    y_train_subset = np.array(y[X_train_index_subset]).flatten()

    find_tsne_parmeters(X_train_scaled_subset, y_train_subset, class_labels,
                        conf, image_save_directory)
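The scaling helpers above come from the project's own modules. As a standalone sketch with toy data, this is why the feature scaler and the outcome scaler should be kept as separate instances:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Toy stand-ins for the loaded features and outcomes (values are made up)
X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
df_y = pd.DataFrame({"target": [0.5, 1.5, 2.5]})

x_scaler = StandardScaler().fit(X)    # must stay fitted on the features
y_scaler = StandardScaler().fit(df_y)

X_scaled = pd.DataFrame(x_scaler.transform(X), index=X.index, columns=X.columns)
# Refitting x_scaler on df_y would overwrite the feature statistics, so the
# same instance could no longer transform the test features consistently.
y_pred_scaled = np.array([[0.0], [1.0]])
y_pred = y_scaler.inverse_transform(y_pred_scaled)  # back to original units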
def execute_wide_run(config_path, execute_search=True, debug_parameters=False):
    '''
    Execute a wide hyperparameter grid search for the SVM, visualize the results,
    and extract the best categorical parameters.

    :args:
        config_path: Path to the config file
        execute_search: True if the search shall be executed; False to skip the search and only
            visualize and extract the best parameters
        debug_parameters: True to use only a small debug subset of the search space
    :return:
        Nothing
    '''

    conf = sup.load_config(config_path)
    # metrics = Metrics(conf)

    # Execute algorithm
    if execute_search:
        if debug_parameters:
            print("WARNING: Debug parameters are used, which only use a small subset of the search.")
        print("Execute grid search")
        execute_wide_search(conf, use_debug_parameters=debug_parameters)
    else:
        print("No grid search performed. Already existing model loaded.")

    # Visualize and get the best parameters

    extract_categorical_visualize_graphs_frame(conf)
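execute_wide_search is defined elsewhere in the project. As an illustrative sketch only (the parameter names and grids are assumptions, not the project's actual search space), a wide categorical grid search for an SVM with scikit-learn could look like this:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Hypothetical wide grid: categorical kernel choice plus coarse C/gamma steps
pipe = Pipeline([("scaler", StandardScaler()), ("model", SVC())])
param_grid = {
    "model__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "model__C": [0.1, 1, 10, 100],
    "model__gamma": ["scale", "auto"],
}
search = GridSearchCV(pipe, param_grid, scoring="f1", cv=3, n_jobs=-1)
# search.fit(X_train_subset, y_train_subset)  # a subset keeps the wide run cheap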
def backtest(config_path, config_section):
    """
    Backtesting

    """

    conf = sup.load_config(config_path)
    print("Load paths")
    #paths = Paths(conf).paths
    #title = conf.get(config_section, 'title')

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(conf, config_section)

    #model_name = conf['Common'].get('dataset_name')
    source_path = conf[config_section].get('source_in')
    #result_directory = paths['results_directory']
    save_folder = os.path.join(conf['Paths'].get('results_directory'), "evaluation")

    pred_outcomes_path = os.path.join(conf['Paths'].get('results_directory'), "evaluation", "outcomes_backtest.csv")
    y_values = pd.read_csv(pred_outcomes_path, sep=';').set_index('id')

    df_time_graph = stock.load_ohlc_graph(source_path)
    df_time_graph_cut = df_time_graph.loc[X_val.index]


    # Backtest the validation data, the SMA200 reference, and the raw and
    # post-processed (smoothed) model predictions
    runs = [('val', 'Validation'),
            ('SMA200', 'Reference_SMA200'),
            ('model', 'Predicted_Model'),
            ('model_pp', 'Predicted_Smoothed_Model')]
    for column, run_name in runs:
        data = df_time_graph_cut.join(y_values[column])
        data = data.rename(columns={column: "y_val"})
        data.set_index(['Date'], inplace=True)
        backTestModel(data, save_folder, run_name)

    print("Complete")
def main(config_path):
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    source_filename = os.path.join(conf['Preparation'].get("source_in"))
    source = sup.load_data_source(source_filename)

    image_save_directory = conf['Paths'].get(
        'results_directory') + "/data_preparation"

    analyze_timegraph(source, features, y, conf, image_save_directory)
def split_train_validation_data(config_path):
    #Load all paths
    #paths = Paths(config_path).path
    print("=== Split data into training and validation data ===")
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    #Load training data
    #df_X, y, y_classes, df_feature_columns = load_files(paths, do_inference)

    X_train, X_val, y_train, y_val = \
        train_test_split(features, df_y, random_state=0,
                         test_size=conf['Preparation'].getfloat('test_size'),
                         shuffle=conf['Preparation'].getboolean('shuffle_data'))

    print(
        "Total number of samples: {}. X_train: {}, X_val: {}, y_train: {}, y_val: {}"
        .format(features.shape[0], X_train.shape, X_val.shape, y_train.shape,
                y_val.shape))

    # Check that the training and validation data contain more than one class
    if len(np.unique(y_train)) == 1:
        raise Exception(
            "y_train consists of only one class after the train/test split. Please adjust the data."
        )
    if len(np.unique(y_val)) == 1:
        raise Exception(
            "y_val consists of only one class after the train/test split. Please adjust the data."
        )

    # Save results
    X_train.to_csv(os.path.join(conf['Preparation'].get('features_out_train')),
                   sep=';',
                   index=True,
                   header=True)
    X_val.to_csv(os.path.join(conf['Preparation'].get('features_out_val')),
                 sep=';',
                 index=True,
                 header=True)
    y_train.to_csv(os.path.join(conf['Preparation'].get('outcomes_out_train')),
                   sep=';',
                   index=True,
                   header=True)
    y_val.to_csv(os.path.join(conf['Preparation'].get('outcomes_out_val')),
                 sep=';',
                 index=True,
                 header=True)

    print("Saved training and validation files.")
def main(config_path):
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    #source_filename = os.path.join(conf['Preparation'].get("source_in"))
    #source = sup.load_data_source(source_filename)

    image_save_directory = conf['Paths'].get(
        'results_directory') + "/data_preparation"

    #analyze_timegraph(source, features, y, conf, image_save_directory)
    print(
        "WARNING: If a singular matrix occurs in a calculation, the outcome probably "
        "contains only a single value.")
    analyse_features(features, y, class_labels, conf, image_save_directory)
def run_training_predictors(data_input_path):
    '''
    Load the training input, execute a baseline classifier, and estimate the
    performance of the configured model pipeline.

    :args:
        data_input_path: Path to the config file
    '''

    config = sup.load_config(data_input_path)
    metrics = Metrics(config)
    #algorithm = config['Common'].get('model_type')

    pipeline_class_name = config.get('Training',
                                     'pipeline_class',
                                     fallback=None)
    PipelineClass = locate('models.' + pipeline_class_name + '.ModelParam')
    if PipelineClass is None:
        raise Exception("Model pipeline could not be found: {}".format(
            'models.' + pipeline_class_name + '.ModelParam'))
    model_param = PipelineClass()


    X_train, y_train, X_val, y_val, y_classes, selected_features, \
    feature_dict, paths, scorers, refit_scorer_name = exe.load_training_input_input(config)
    scorer = scorers[refit_scorer_name]

    results_directory = paths['results_directory']
    save_fig_prefix = results_directory + '/model_images'

    #Baseline test
    baseline_results = exe.execute_baseline_classifier(X_train, y_train, X_val,
                                                       y_val, y_classes,
                                                       scorer)
    print("Baseline results=", baseline_results)

    #Set classifier and estimate performance

    model_clf = model_param.create_pipeline()['model']
    log.info("{} selected.".format(model_clf))

    #algorithm=""
    #if algorithm=='xgboost':
    #    model_clf = XGBClassifier(objective="binary:logistic", random_state=42)
    #    log.info("XBoost Classifier selected.")
    #else:
    #    model_clf = SVC()
    #    log.info("SVM (default) classifier selected.")

    run_training_estimation(X_train, y_train, X_val, y_val, scorer, model_clf,
                            save_fig_prefix)
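The pydoc.locate pattern used above returns None instead of raising when the dotted path cannot be resolved, which is why the class must be checked before instantiation. A minimal standalone sketch (the module path models.svm_pipeline is hypothetical):
from pydoc import locate

class_path = "models.svm_pipeline.ModelParam"
PipelineClass = locate(class_path)  # None if the path does not resolve
if PipelineClass is None:
    raise Exception("Model pipeline could not be found: {}".format(class_path))
model_param = PipelineClass()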
def main(config_path, on_inference_data, no_images, no_source_data):
    conf = sup.load_config(config_path)

    #if not on_inference_data:
    data_directory = conf['Paths'].get('prepared_data_directory')
    result_directory = os.path.join(conf['Paths'].get('results_directory'), "data_preparation")
    #annotations_filename = conf["Paths"].get("annotations_file")

    #if not os.path.isdir(result_directory):
    #    os.makedirs(result_directory)
    #    print("Created directory: ", result_directory)

    data_preparation_dump_file_path = os.path.join(conf['Paths'].get('prepared_data_directory'), "temp", "step31out.pickle")
    os.makedirs(os.path.dirname(data_preparation_dump_file_path), exist_ok=True)
    #if not os.path.isdir("tmp"):
    #    os.makedirs("tmp")
    #    print("Created directory: ", "tmp")

    features_path = os.path.join(conf['Preparation'].get('features_in'))
    if 'outcomes_in' in conf['Preparation']:
        outcomes_path = os.path.join(conf['Preparation'].get('outcomes_in'))
    else:
        outcomes_path = None
        print("No outcomes in, do inference")
    labels_path = conf['Paths'].get('labels_path')
    source_path = os.path.join(conf['Preparation'].get('source_in'))

    # Load files
    features_raw, outcomes_cleaned1, data_source_raw, class_labels = load_files(features_path, outcomes_path, source_path, labels_path, no_source_data)

    ## Data Cleanup of Features and Outcomes before Features are Modified
    features_cleaned1 = clean_features_first_pass(features_raw, class_labels)

    analyze_raw_data(features_cleaned1, outcomes_cleaned1, result_directory, conf['Common'].get('dataset_name'), conf['Common'].get('class_name'), no_images, on_inference_data)

    # Save structures for further processing
    # Dump path data
    dump((features_cleaned1, outcomes_cleaned1, class_labels, data_source_raw, data_directory, result_directory),
         open(data_preparation_dump_file_path, 'wb'))
    print("Stored paths to: ", data_preparation_dump_file_path)
Example #9
def main(config_path):
    conf = sup.load_config(config_path)
    features, y, df_y, class_labels = sup.load_features(conf)

    image_save_directory = conf['Paths'].get(
        'results_directory') + "/data_preparation"

    selected_feature_columns_filename = os.path.join(
        conf['Preparation'].get("selected_feature_columns_out"))

    selected_feature_list = perform_feature_selection_algorithms(
        features, y, conf, image_save_directory)

    print("List of selected features")
    print(selected_feature_list.transpose())

    selected_feature_list.transpose().to_csv(selected_feature_columns_filename,
                                             sep=';',
                                             index=False,
                                             header=True)
    print("Saved selected feature columns to " +
          selected_feature_columns_filename)
Example #10
def main(config_path):
    conf = sup.load_config(config_path)

    #image_save_directory = conf['result_directory'] + "/data_preparation_images"
    prepared_data_directory = conf['Paths'].get('prepared_data_directory')
    outcomes_filename_uncut = os.path.join(prepared_data_directory, "temp", "temp_outcomes_uncut" + ".csv")
    features_filename_uncut = os.path.join(prepared_data_directory, "temp", "temp_features_uncut" + ".csv")

    # Load only a subset of the whole raw data to create a debug dataset
    source_uncut = custom.load_source(conf['Paths'].get('source_path'))
    features_uncut = pd.read_csv(features_filename_uncut, sep=';').set_index('id')
    if os.path.isfile(outcomes_filename_uncut):
        outcomes_uncut = pd.read_csv(outcomes_filename_uncut, sep=';').set_index('id')
        print("Outcomes file found. Adapting dimensions for training data.")
        print("Outcomes shape: ", outcomes_uncut.shape)
    else:
        outcomes_uncut = None
        print("Outcomes file not found. Adapting dimensions for inference data.")

    print("Source shape: ", source_uncut.shape)
    print("Features shape: ", features_uncut.shape)


    # Cut the last 50 outcome rows, as smoothing was used
    #outcomes_reduced1 = cut_unusable_parts_of_dataframe(outcomes_uncut, tail_index=50)

    # Clean features: cut NaN rows
    features_reduced1 = clean_nan(features_uncut)

    if outcomes_uncut is not None:
        intersection_index = outcomes_uncut.index.intersection(features_reduced1.index)

        # Cut all dataframes to have the same index
        outcomes = outcomes_uncut.loc[intersection_index]
        print("Cut outcomes shape: ", outcomes.shape)
    else:
        outcomes = None
        intersection_index = features_reduced1.index
        print("No outcomes available; nothing will be cut. The feature index is used as-is.")

    features = features_reduced1.loc[intersection_index]
    source = source_uncut.loc[intersection_index]

    print("Cut source shape: ", source.shape)
    print("Cut features shape: ", features.shape)


    # Cut for subsets
    subset_start = 0
    subset_stop = features.shape[0]
    #subset_stop = 1000

    features_subset = cut_dataframe_subset(features, subset_start, subset_stop)
    source_subset = cut_dataframe_subset(source, subset_start, subset_stop)

    print("Subset source shape: ", source_subset.shape)
    print("Subset features shape: ", features_subset.shape)

    if 'outcomes_out' in conf['Generation']:
        outcomes_out_filename = os.path.join(conf['Generation'].get('outcomes_out')) #conf['Common'].get('dataset_name') + "_outcomes" + ".csv")
    else:
        outcomes_out_filename = None
        outcomes = None
        print("Only preparing features for inference. No outcomes file used.")

    features_out_filename = os.path.join(conf['Generation'].get('features_out'))

    source_out_filename = os.path.join(conf['Generation'].get('source_out'))

    print("=== Paths ===")
    print("Features in: ", features_out_filename)
    print("Outcomes in: ", outcomes_out_filename)
    print("Source out: ", source_out_filename)

    # Save the graph data for visualization of the results
    print("Feature shape {}".format(features_subset.shape))
    features_subset.to_csv(features_out_filename, sep=';', index=True, header=True)
    print("Saved features graph to " + features_out_filename)

    # Save the graph data for visualization of the results
    print("source shape {}".format(source_subset.shape))
    source_subset.to_csv(source_out_filename, sep=';', index=True, header=True)
    print("Saved source graph to " + source_out_filename)

    if outcomes is not None:
        outcomes_subset = cut_dataframe_subset(outcomes, subset_start, subset_stop)
        print("Subset outcomes shape: ", outcomes_subset.shape)

        # Save the outcome data for visualization of the results
        print("Outcomes shape {}".format(outcomes_subset.shape))
        outcomes_subset.to_csv(outcomes_out_filename, sep=';', index=True, header=True)
        print("Saved outcomes to " + outcomes_out_filename)
def execute_narrow_search(config_path):
    '''
    Execute a narrow hyperparameter search on a subset of the data, starting
    from the best parameters of the first (wide) run.

    :args:
        config_path: Path to the config file
    '''

    # Load config
    config = sup.load_config(config_path)
    # Load complete training input
    X_train, y_train, X_val, y_val, y_classes, selected_features, \
    feature_dict, paths, scorers, refit_scorer_name = exe.load_training_input_input(config)
    #model_type = config.get('Common', 'model_type')

    pipeline_class_name = config.get('Training',
                                     'pipeline_class',
                                     fallback=None)
    PipelineClass = locate('models.' + pipeline_class_name + '.ModelParam')
    if PipelineClass is None:
        raise Exception("Model pipeline could not be found: {}".format(
            'models.' + pipeline_class_name + '.ModelParam'))
    model_param = PipelineClass()

    # Load narrow training parameters
    samples = json.loads(config.get("Training", "narrow_samples"))
    kfolds = json.loads(config['Training'].get('narrow_kfolds'))
    iterations = json.loads(config['Training'].get('narrow_iterations'))
    selection = json.loads(config['Training'].get('narrow_selection'))

    iter_setup = {
        'samples': samples,
        'kfolds': kfolds,
        'iter': iterations,
        'selection': selection,
    }

    results_run2_file_path = paths['run2_result_filename']
    pipe_first_selection = paths['pipe_first_selection']
    # Use the predefined export location for the pipe
    pipe_final_selection = config.get('Training', 'pipeline_out')
    result_directory = paths['results_directory']
    save_fig_prefix = result_directory + '/model_images'
    os.makedirs(save_fig_prefix, exist_ok=True)

    # Load the saved results of the first run
    with open(pipe_first_selection, "rb") as r:
        pipe_run_best_first_selection = pickle.load(r)

    if model_param.get_model_type() == 'svm':
        # SVM Code Start
        pipe_run_second_selection, results_run2 = perform_run2_svm(
            X_train, iter_setup, pipe_run_best_first_selection,
            refit_scorer_name, save_fig_prefix, scorers, y_train)
        # SVM Code End
    else:
        # XGBoost Code start
        warnings.warn("No 2nd search will be performed for {}".format(
            model_param.get_model_type()))
        pipe_run_second_selection, results_run2 = perform_run2_xgboost(
            X_train, iter_setup, pipe_run_best_first_selection,
            refit_scorer_name, save_fig_prefix, scorers, y_train)
        # XGBoost Code end
    #else:
    #    raise Exception("No valid model for model type {}".format(model_param.get_model_type()))

    print("Model parameters defined", pipe_run_second_selection)

    print("Save model")
    # Save best pipe
    dump(pipe_run_second_selection, open(pipe_final_selection, 'wb'))
    print("Stored pipe_run_second_selection at ", pipe_final_selection)

    # Save results
    # only if any second runs were made
    if results_run2 is not None:
        dump(results_run2, open(results_run2_file_path, 'wb'))
        print("Stored results ", results_run2_file_path)

        # result_save = results_run2.copy()
        results_run2.round(4).to_csv(results_run2_file_path + "_results.csv",
                                     sep=";")

    with open(results_run2_file_path + "_pipe.txt", 'w') as f:
        print(pipe_run_second_selection, file=f)

    print("Method end")
def evaluate_model(config_path, config_section="EvaluationTraining"):
    '''
    Evaluate a trained model on the validation data and plot precision/recall
    and decision-boundary graphs.

    :args:
        config_path: Path to the config file
        config_section: Config section with the evaluation settings
    '''
    # Get data
    config = sup.load_config(config_path)
    print("Load paths")
    paths = Paths(config).paths

    X_val, y_val, labels, model, external_params = evalutil.load_evaluation_data(
        config, config_section)
    y_classes = labels  #train['label_map']

    result_directory = paths['results_directory']
    #model_name = config['Common'].get('dataset_name')

    title = config.get(config_section, 'title')

    figure_path_prefix = result_directory + '/model_images/' + title
    os.makedirs(result_directory + '/model_images', exist_ok=True)

    # Load model external parameters
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: ", pr_threshold)

    # Predict the validation data with the loaded model
    print("Predict validation data")
    y_test_pred = model.predict(X_val.values)
    # If this raises an error, set model_pipe['svm'].probability = True
    y_test_pred_proba = model.predict_proba(X_val.values)
    # Probability of the positive class; alternative: model.decision_function(X_val.values)
    y_test_pred_scores = y_test_pred_proba[:, 1]

    #Reduce the number of classes only to classes that can be found in the data
    #reduced_class_dict_train = model_util.reduce_classes(y_classes, y_train, y_train_pred)
    reduced_class_dict_test = model_util.reduce_classes(
        y_classes, y_val, y_test_pred)

    if len(y_classes) == 2:
        #y_train_pred_adjust = model_util.adjusted_classes(y_train_pred_scores, pr_threshold)
        y_test_pred_adjust = model_util.adjusted_classes(
            y_test_pred_scores,
            pr_threshold)  # (y_test_pred_scores >= pr_threshold).astype('int')
        print(
            "This is a binarized problem. Applying the optimal precision/recall threshold=",
            pr_threshold)
    else:
        #y_train_pred_adjust = y_train_pred
        y_test_pred_adjust = y_test_pred
        print(
            "This is a multi-class problem. No precision/recall adjustment of the scores is made."
        )

    #Plot graphs
    #If binary class plot precision/recall
    # Plot the precision and the recall together with the selected value for the test set
    if len(y_classes) == 2:
        print("Plot precision recall graphs")
        precision, recall, thresholds = precision_recall_curve(
            y_val, y_test_pred_scores)
        vis.plot_precision_recall_vs_threshold(
            precision,
            recall,
            thresholds,
            pr_threshold,
            save_fig_prefix=figure_path_prefix,
            title_prefix="pr_adjusted")

        vis.plot_precision_recall_evaluation(
            y_val,
            y_test_pred_adjust,
            y_test_pred_proba,
            reduced_class_dict_test,
            save_fig_prefix_dir=figure_path_prefix,
            title_prefix="pr_adjusted")

    #Plot evaluation for unadjusted values
    vis.plot_precision_recall_evaluation(
        y_val,
        y_test_pred,
        y_test_pred_proba,
        reduced_class_dict_test,
        save_fig_prefix_dir=figure_path_prefix,
        title_prefix="")
    # Plot the decision boundary on the first 1000 validation samples
    X_decision = X_val.values[0:1000, :]
    y_decision = y_val[0:1000]
    vis.plot_decision_boundary(X_decision,
                               y_decision,
                               model,
                               title_prefix=title + "_",
                               save_fig_prefix=figure_path_prefix)

    print("Visualization complete")
def main(config_path, debug_param):
    conf = sup.load_config(config_path)

    image_save_directory = os.path.join(conf['Paths'].get('results_directory'),
                                        "data_generation")
    features_filename_uncut = os.path.join(
        conf['Paths'].get('prepared_data_directory'), "temp",
        "temp_features_uncut" + ".csv")
    os.makedirs(os.path.dirname(features_filename_uncut), exist_ok=True)

    #Load only a subset of the whole raw data to create a debug dataset
    source = custom.load_source(conf['Paths'].get('source_path'))

    #Plot source
    plt.figure(num=None,
               figsize=(12.5, 7),
               dpi=80,
               facecolor='w',
               edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(conf['Paths'].get('source_path'))
    plt.show(block=False)

    # Define features df
    features = pd.DataFrame(index=source.index)

    # Generate Price Based Values

    normed_days_features = price_normalizer(source, debug_param=debug_param)
    features = features.join(normed_days_features)

    number_days_features = impulse_count(source, debug_param=debug_param)
    features = features.join(number_days_features)

    mean_features = calculate_moving_average(source, debug_param=debug_param)
    features = features.join(mean_features)

    madiff_features = calculate_moving_average_direction(source, mean_features)
    features = features.join(madiff_features)

    rsi_features = get_rsi(source, debug_param=debug_param)
    features = features.join(rsi_features)

    rsi_change_features = get_rsi_difference(source)
    features = features.join(rsi_change_features)

    #rsi_signal_features = get_rsi_signal(source)
    #features = features.join(rsi_signal_features)

    stoch_features = get_stochastics(source)
    features = features.join(stoch_features)

    plt.figure(num=None, figsize=(10, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(311)
    plt.plot(source['Date'][0:100], source['Close'][0:100])
    plt.title("Close")
    plt.subplot(312)
    plt.title("Stochastic Variant " + str(stoch_features.columns[1]))
    plt.plot(source['Date'][0:100], stoch_features.iloc[:, 1][0:100])
    plt.plot(source['Date'][0:100], stoch_features.iloc[:, 0][0:100])
    plt.subplot(313)
    plt.title("Stochastic Variant " + str(stoch_features.columns[-1]))
    plt.plot(source['Date'][0:100], stoch_features.iloc[:, -1][0:100])
    plt.plot(source['Date'][0:100], stoch_features.iloc[:, -2][0:100])
    plt.tight_layout()

    macd_features = get_macd(source)
    features = features.join(macd_features)

    plt.figure(num=None, figsize=(10, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(311)
    plt.plot(source['Date'][0:100], source['Close'][0:100])
    plt.title("Close")
    plt.subplot(312)
    plt.title("MACD Variant 1")
    plt.plot(source['Date'][0:100], macd_features.iloc[:, 0][0:100])
    plt.plot(source['Date'][0:100], macd_features.iloc[:, 1][0:100])
    plt.legend(("MACD", "MACD Signal"))
    plt.subplot(313)
    plt.title("MACD Variant 1")
    plt.plot(source['Date'][0:100], macd_features.iloc[:, -2][0:100])
    plt.plot(source['Date'][0:100], macd_features.iloc[:, -1][0:100])
    plt.legend(("MACD", "MACD Signal"))
    plt.tight_layout()

    macd_diff_features = get_macd_difference(macd_features)
    features = features.join(macd_diff_features)

    macd_direction_change_features = get_trigger_signals(macd_diff_features)
    features = features.join(macd_direction_change_features)

    periodic_values = get_periodical_indicators(source)
    features = features.join(periodic_values)

    # Features structure
    print("Features: ", features.head(10))
    print("Features shape: ", features.shape)

    # Save features to a csv file
    print("Features shape {}".format(features.shape))
    features.to_csv(features_filename_uncut, sep=';', index=True, header=True)
    print("Saved features to " + features_filename_uncut)

    print("=== Data for {} prepared to be trained or inferred ===".format(
        conf['Common'].get('dataset_name')))
def visualize_temporal_data(config_path, config_section):
    # Load intermediate model, which has only been trained on training data
    # Get data
    # Load file paths
    config = sup.load_config(config_path)
    print("Load paths")
    paths = Paths(config).paths
    title = config.get(config_section, 'title')

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(
        config, config_section)

    y_classes = labels

    model_name = config['Common'].get('dataset_name')
    source_path = config[config_section].get('source_in')
    result_directory = paths['results_directory']

    figure_path_prefix = result_directory + '/evaluation'
    os.makedirs(result_directory + '/evaluation', exist_ok=True)

    # Load model external parameters
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: {0:.2f}".format(pr_threshold))

    # Make predictions
    y_test_pred_scores = model.predict_proba(X_val.values)[:, 1]
    y_test_pred = model.predict(X_val.values)
    #y_test_pred_proba = evalclf.predict_proba(X_test.values)
    y_test_pred_adjust = model_util.adjusted_classes(y_test_pred_scores,
                                                     pr_threshold)

    # Load original data for visualization
    df_time_graph = pd.read_csv(source_path, delimiter=';').set_index('id')
    df_time_graph['Date'] = pd.to_datetime(df_time_graph['Date'])
    print("Loaded feature names for time graph={}".format(
        df_time_graph.columns))
    print("X. Shape={}".format(df_time_graph.shape))

    # Create a df from the y array for the visualization functions
    y_order_test_pred = pd.DataFrame(index=X_val.index,
                                     data=pd.Series(data=y_test_pred,
                                                    index=X_val.index,
                                                    name="y")).sort_index()

    y_order_test_pred_adjust = pd.DataFrame(index=X_val.index,
                                            data=pd.Series(
                                                data=y_test_pred_adjust,
                                                index=X_val.index,
                                                name="y")).sort_index()

    #Visualize the results
    print("Plot for inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(y_order_test_pred['y'].values,
                               df_time_graph['Close'][y_order_test_pred.index],
                               df_time_graph['Date'][y_order_test_pred.index],
                               0,
                               0,
                               0, ('close', 'neutral', 'positive', 'negative'),
                               title=title + "_Inference_" + model_name,
                               save_fig_prefix=figure_path_prefix)

    vis.plot_three_class_graph(y_order_test_pred_adjust['y'].values,
                               df_time_graph['Close'][y_order_test_pred.index],
                               df_time_graph['Date'][y_order_test_pred.index],
                               0,
                               0,
                               0, ('close', 'neutral', 'positive', 'negative'),
                               title=title + "_Inference_Adjusted" +
                               model_name,
                               save_fig_prefix=figure_path_prefix)
def main(config_path):
    conf = sup.load_config(config_path)

    data_directory = conf['Paths'].get('prepared_data_directory')

    data_preparation_dump_file_path = os.path.join(data_directory, "temp",
                                                   "step31out.pickle")
    (features_cleaned1, outcomes_cleaned1, class_labels, data_source_raw,
     data_directory, result_directory) = pickle.load(
         open(data_preparation_dump_file_path, "rb"))

    class_name = conf['Common'].get('class_name')

    model_features_filename = os.path.join(
        conf['Preparation'].get('features_out'))
    if 'outcomes_out' in conf['Preparation']:
        model_outcomes_filename = os.path.join(
            conf['Preparation'].get('outcomes_out'))
    else:
        model_outcomes_filename = None
        print(
            "No outcomes output defined. Using inference settings without outcomes.")

    if 'labels_out' in conf['Preparation']:
        model_labels_filename = os.path.join(
            conf['Preparation'].get('labels_out'))
    else:
        model_labels_filename = None
        print("No labels file available for inference.")

    features, y, class_labels = adapt_features_for_model(
        features_cleaned1, outcomes_cleaned1, result_directory, class_labels,
        conf)

    # === Save features to a csv file ===#
    print("Features shape {}".format(features.shape))
    features.to_csv(model_features_filename, sep=';', index=True)
    # np.savetxt(filenameprefix + "_X.csv", X, delimiter=";", fmt='%s')
    print("Saved features to " + model_features_filename)

    # === Save the selected outcome to a csv file ===#
    if y is not None:
        print("outcome shape {}".format(y.shape))
        y_true = pd.DataFrame(y,
                              columns=[class_name],
                              index=outcomes_cleaned1.index)
        y_true.to_csv(model_outcomes_filename,
                      sep=';',
                      index=True,
                      header=True)
        print("Saved features to " + model_outcomes_filename)
    else:
        print("y values not saved as no ourcome was provided.")

    # === Save new y labels to a csv file ===#
    if class_labels is not None:
        print("Class labels length {}".format(len(class_labels)))
        with open(model_labels_filename, 'w') as f:
            for key in class_labels.keys():
                # Classes are saved inverse to the label map, i.e. first the value, then the key
                f.write("%s;%s\n" % (class_labels[key], key))
        print("Saved class names and id to " + model_labels_filename)
    else:
        print("Class labels were not saved as no outcome was available.")
Example #16
def convert_time_series(config_path, feature_dir):
    """


    """

    # Load time series
    config = sup.load_config(config_path)

    X_train_path = os.path.join(feature_dir, "features_val.csv")
    y_train_path = os.path.join(feature_dir, "outcomes_val.csv")
    source_path = os.path.join(feature_dir, "source.csv")

    os.makedirs("./prepared-data/val_rolling_csv", exist_ok=True)
    os.makedirs("./prepared-data/val_rolling_images/pos", exist_ok=True)
    os.makedirs("./prepared-data/val_rolling_images/neg", exist_ok=True)

    # Load X and y
    X_train, y_train_df, y_train = exe.load_data(X_train_path, y_train_path)
    # Load source data
    df_time_graph = stock.load_ohlc_graph(source_path)
    df_time_graph_red = df_time_graph.loc[X_train.index]
    # Merge all values
    # np.log(df_time_graph['Close'] - df_time_graph['Close'].iloc[0] + 1)

    all_df = X_train.join(df_time_graph).join(y_train_df)
    all_df.reset_index(inplace=True)
    all_df.set_index('Date', inplace=True)

    all_columns = [
        'id', 'SMA200', 'RSI_14', 'Open', 'High', 'Low', 'Close', 'LongTrend'
    ]
    all_df_red = all_df[all_columns]

    # Create rolling values in a df for multivariates
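    # Iterating over a Rolling object yields one sub-frame per step; the first
    # windows hold fewer than 250 rows, hence the size guard below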
    for df_subset in all_df_red.rolling(250):
        #print(type(df_subset), '\n', df_subset)
        if df_subset.shape[0] >= 250:
            print("Processing id {}".format(df_subset.iloc[-1].id))
            # Save subset as file
            csv_columns = ['SMA200', 'RSI_14', 'LongTrend']

            csv_path = "./prepared-data/val_rolling_csv/roll_" + str(
                int(df_subset.iloc[-1].id)) + ".csv"
            df_subset[csv_columns].to_csv(csv_path,
                                          sep=';',
                                          index=True,
                                          header=True)

            #Create graph of subset

            if df_subset.iloc[-1].LongTrend == 1:
                image_path = "./prepared-data/val_rolling_images/pos/roll_" + str(
                    int(df_subset.iloc[-1].id)) + ".png"
            elif df_subset.iloc[-1].LongTrend == 0:
                image_path = "./prepared-data/val_rolling_images/neg/roll_" + str(
                    int(df_subset.iloc[-1].id)) + ".png"
            else:
                continue  # Skip windows without a defined LongTrend label

            apd = mpf.make_addplot(df_subset['RSI_14'],
                                   panel=1,
                                   color='black',
                                   ylim=(10, 90),
                                   secondary_y=True)
            mpf.plot(df_subset,
                     type='candle',
                     volume=False,
                     mav=(20, 100, 200),
                     figscale=1.5,
                     addplot=apd,
                     panel_ratios=(1, 0.3),
                     savefig=image_path)

    # Save images of ohlc graph

    # Save rolling charts in files

    print("End")
def main(config_path):
    conf = sup.load_config(config_path)
    # Load annotations file
    y_labels = pd.read_csv(conf['Paths'].get('source_path'), sep=';', header=None).set_index(0).to_dict()[1]

    # Generating filenames for saving the files
    image_save_directory = os.path.join(conf['Paths'].get('results_directory'), "data_generation")
    outcomes_filename_raw = os.path.join(conf['Paths'].get('prepared_data_directory'), "temp", "temp_outcomes_uncut" + ".csv")

    os.makedirs(os.path.dirname(outcomes_filename_raw), exist_ok=True)

    #Load only a subset of the whole raw data to create a debug dataset
    source = custom.load_source(conf['Paths'].get('source_path')) #.iloc[0:1000, :]

    #Plot source
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(conf['Paths'].get('source_path'))
    #plt.show(block = False)

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Source_data")

    #y_labels = annotations #generate_custom_class_labels()
    outcomes = generate_features_outcomes(image_save_directory, source)

    # Drop the last 50 rows: they cannot be labeled because the target looks 50 days ahead
    source_cut = source.drop(source.tail(50).index, inplace=False)
    outcomes_cut = outcomes.drop(outcomes.tail(50).index, inplace=False)

    vis.plot_three_class_graph(outcomes_cut['1dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['5dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['20dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['LongTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['TopsBottoms'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'top', 'bottom'),
                               title=conf['Common'].get('dataset_name') + '_GT_TopsBottoms',
                               save_fig_prefix=image_save_directory)

    def binarize(outcomes, class_number):
        return (outcomes == class_number).astype(int)

    vis.plot_two_class_graph(binarize(outcomes_cut['1dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['5dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['20dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['LongTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                             save_fig_prefix=image_save_directory)

    # Save file
    # Save outcomes to a csv file
    print("Outcomes shape {}".format(outcomes_cut.shape))
    outcomes_cut.to_csv(outcomes_filename_raw, sep=';', index=True, header=True)
    print("Saved outcomes to " + outcomes_filename_raw)
Example #18
def generate_values_for_backtesting(config_path, config_section):
    """
    Generate values for backtesting to find out how well the system performs. First, an MA200 reference value is used.
    Second, the predictions are created and then smoothed

    """

    conf = sup.load_config(config_path)
    print("Load paths")
    #paths = Paths(conf).paths
    #title = conf.get(config_section, 'title')

    X_val, y_val, labels, model, external_params = eval.load_evaluation_data(
        conf, config_section)

    #model_name = conf['Common'].get('dataset_name')
    source_path = conf[config_section].get('source_in')
    #result_directory = paths['results_directory']

    # Load model external parameters
    pr_threshold = external_params['pr_threshold']
    print("Loaded precision/recall threshold: {0:.2f}".format(pr_threshold))

    df_time_graph = stock.load_ohlc_graph(source_path)

    # Y_val
    y_val_data = pd.DataFrame(index=X_val.index, data=y_val, columns=['val'])

    # Reference system
    print("Create reference values")
    y_ref = generate_reference_results(X_val['SMA200'])
    y_ref_data = pd.DataFrame(y_ref)

    #Visualize the results
    figure_path_prefix = os.path.join(conf['Paths'].get('results_directory'),
                                      "evaluation")
    print("Plot for inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(y_ref,
                               df_time_graph['Close'][y_ref_data.index],
                               df_time_graph['Date'][y_ref_data.index],
                               0,
                               0,
                               0, ('close', 'neutral', 'positive', 'negative'),
                               title="Reference_System_SMA200" +
                               "_Validation_",
                               save_fig_prefix=figure_path_prefix)

    # Prediction system
    print("Get prediction values")
    y_test_pred = model.predict(X_val.values)
    y_test_pred_data = pd.DataFrame(index=X_val.index,
                                    data=y_test_pred,
                                    columns=['model'])

    # Post-processing of the predictions: smooth with a length-N moving average.
    # 'valid' mode shortens the series by N-1 samples, so N-1 leading zeros
    # restore the original length before re-binarizing at 0.5
    N = 3
    smoothed_values_unfixed = np.convolve(y_test_pred_data['model'],
                                          np.ones(N) / N,
                                          mode='valid')
    smoothed_data_raw = np.concatenate([np.zeros(N - 1), smoothed_values_unfixed])
    smoothed_data = (smoothed_data_raw > 0.5).astype(int)

    y_test_pred_data_smoothed = pd.DataFrame(index=X_val.index,
                                             data=smoothed_data,
                                             columns=['model_pp'])

    print("Plot for post processed inference data to ", figure_path_prefix)
    vis.plot_three_class_graph(
        smoothed_data,
        df_time_graph['Close'][y_test_pred_data_smoothed.index],
        df_time_graph['Date'][y_test_pred_data_smoothed.index],
        0,
        0,
        0, ('close', 'neutral', 'positive', 'negative'),
        title="Smoothed_Prediction_Model" + "_Validation_",
        save_fig_prefix=figure_path_prefix)

    # Merge all values
    all_data = y_test_pred_data.join(y_ref_data).join(y_val_data).join(
        y_test_pred_data_smoothed)
    pred_outcomes_filename = os.path.join(
        conf['Paths'].get('results_directory'), "evaluation",
        "outcomes_backtest.csv")
    all_data.to_csv(pred_outcomes_filename, sep=';', index=True, header=True)
    print("Completed")
 def __init__(self, config_file_path):
     self.conf = sup.load_config(config_file_path)
Example #20
def train_final_model(config_path, config_section="Evaluation"):
    # Get data
    config = sup.load_config(config_path)
    X_train, y_train, pipe = load_data(config, config_section)

    svm_final_model_filepath = config[config_section].get('model_out')

    # Enable probability estimates so predict_proba is available after training
    print("Set probability measurements in the model to True")
    pipe['model'].probability = True
    print("Original final pipe: ", pipe)

    t = time.time()
    local_time = time.ctime(t)
    print("=== Start training the SVM at {} ===".format(local_time))
    clf = pipe.fit(X_train, y_train)
    t_end = time.time() - t
    print("Training took {0:.2f}s".format(t_end))

    print("Store model")
    print("Model to save: ", clf)

    joblib.dump(clf, svm_final_model_filepath)
    print("Saved model at location ", svm_final_model_filepath)