Code example #1
    def parallelised_function(file):
        select_file_path = os.path.join(jointFeatureLocation,
                                        file)  # formulate the path
        print('Symbol:----->', file.split("_")[0])
        symbol = file.split("_")[0]

        select_hmm_date = select_file_path.split("_")[
            3]  # pull out the hmm_date - strip it out

        select_feature_label_date = select_file_path.split("_")[
            6]  # pull out the label_feature_date

        select_label_idx = select_file_path.split("_")[
            9]  # pull out the label _idx

        unpickled_select_file = open_pickle_filepath(
            select_file_path)  # unpickle the select file

        hmm_keys = sorted(list(
            unpickled_select_file.keys()))  # hmm keys for the select file.

        for hmm_date_key in hmm_keys:  # pick an hmm date
            feature_label_keys = sorted(
                unpickled_select_file[hmm_date_key].keys(
                ))  # each key here unlocks a feature and label set

            for feature_label_date in feature_label_keys:  # make a list of all the feature dates
                features_file_path = unpickled_select_file[hmm_date_key][
                    feature_label_date][0]  # this is the feature path
                labels_file_path = unpickled_select_file[hmm_date_key][
                    feature_label_date][1]  # this is the labels path

                if os.path.isfile(features_file_path
                                  ):  # if the feature file exists we can train
                    print(
                        'ok----->', feature_label_date
                    )  # if you got to this point we have data so we can move on
                    labels = pd.read_csv(labels_file_path)  # open labels file
                    label_name = str(
                        labels.columns[labels.columns.str.contains(
                            pat='label')].values[0])
                    features = open_pickle_filepath(
                        features_file_path)  # opens features file
                    hmm_features = nfu.hmm_features_df(
                        features
                    )  # get the hmm features out, so unpack the tuples!
                    print('loaded features and labels ')
                    if hmm_features.isnull().values.all(
                    ):  # checking that the HMM features are actually not null
                        continue
                    else:  # if features not null then start moving on!
                        market_features_df = CreateMarketFeatures(
                            CreateMarketFeatures(
                                CreateMarketFeatures(df=CreateMarketFeatures(
                                    df=labels).ma_spread_duration()).ma_spread(
                                    )).chaikin_mf()).obv_calc(
                                    )  # market features dataframe

                        df_concat = pd.DataFrame(
                            pd.concat([hmm_features, market_features_df],
                                      axis=1,
                                      sort=False).dropna())

                        df = df_concat[df_concat[label_name].notna()]
                        df_final = df.drop(columns=[
                            'TradedPrice', 'Duration', 'TradedTime',
                            'ReturnTradedPrice', 'Volume', label_name
                        ])
                        y_train = df[df.columns[df.columns.str.contains(
                            pat='label')]].iloc[:, 0]  # training labels
                        if df_final.shape[0] < 10:  # make sure we have enough rows to fit
                            print(
                                ' too few samples after filtering. try another label permutation'
                            )
                            continue
                        else:

                            print("starting model fit")
                            # put the features in a tensor format
                            X = np.asarray(
                                df_final.values)  # need this for torch
                            Xtr = normalization(rescale_01(torch.Tensor(
                                X)))  # features in a tensor format

                            Ytr = torch.Tensor(
                                y_train.values
                            )  # put the labels in a tensor format
                            print(
                                '-----------------first bit done------------------'
                            )
                            KLrbf = generators.RBF_generator(
                                Xtr, gamma=[.01, .1, .25, .5]
                            )  # get a few RBF Kernels ready - maybe need more here
                            print('done with kernel')
                            best_results = {}

                            C_range = [0.1, 1]
                            lam_range = [0.2]
                            try:

                                for C_choice in C_range:
                                    base_learner = SVC(
                                        C=C_choice)  # soft-margin svm base learner
                                    # clf = EasyMKL(lam=0.2, multiclass_strategy='ova', learner=base_learner).fit(KLrbf,
                                    #                                                                             Ytr)
                                    # print('done')
                                    # print('the combination weights are:')
                                    #
                                    # for sol in clf.solution:
                                    #     print('(%d vs all): ' % sol,
                                    #           clf.solution[
                                    #               sol].weights)  # need to store these results somewhere

                                    for lam in lam_range:  # possible lambda values for the EasyMKL algorithm
                                        # MKLpy.model_selection.cross_val_score performs the cross-validation
                                        # automatically; it may return accuracy, AUC, or F1 scores
                                        scores = cross_val_score(
                                            KLrbf,
                                            Ytr,
                                            EasyMKL(learner=base_learner,
                                                    lam=lam),
                                            n_folds=5,
                                            scoring='accuracy'
                                        )  # get the cross-validation scores
                                        acc = np.mean(scores)
                                        if not best_results or best_results[
                                                'score'] < acc:
                                            best_results = {
                                                'C': C_choice,
                                                'lam': lam,
                                                'score': acc,
                                                'scores': scores
                                            }  # these should get dumped somewhere
                                print('done')
                                best_learner = SVC(C=best_results['C'])
                                clf = EasyMKL(learner=best_learner,
                                              lam=best_results['lam']).fit(
                                                  KLrbf, Ytr)
                                y_pred = clf.predict(KLrbf)
                                accuracy = accuracy_score(Ytr, y_pred)
                                print(
                                    'accuracy on the training set: %.3f, with lambda=%.2f'
                                    % (accuracy, best_results['lam']))
                                print(best_results['scores'])

                                pickle_out_filename = os.path.join(
                                    mainPath,
                                    "ExperimentCommonLocs/CrossValidationResults",
                                    "_".join((symbol, 'feature_label_date',
                                              str(select_feature_label_date),
                                              str(select_label_idx),
                                              'hmm_date:', hmm_date_key, 'RBF',
                                              'MultiKernelSVC.pkl')))
                                # pickle_out = open(pickle_out_filename, 'wb')
                                # pickle.dump(best_results, pickle_out)
                                # pickle_out.close()

                            except ValueError:
                                continue

                else:
                    print('PROBLEM----->in one of your locations')
                    continue
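
The function above takes a single file name, so it is presumably meant to be mapped over a directory listing in parallel. A minimal dispatch sketch, assuming `jointFeatureLocation` is the directory the function reads from and that a pool of four workers is an arbitrary choice:

    import os
    from multiprocessing import Pool

    if __name__ == '__main__':
        # one task per pickled feature/label file in the directory
        files = sorted(os.listdir(jointFeatureLocation))
        with Pool(processes=4) as pool:
            pool.map(parallelised_function, files)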
Code example #2
            symbol_feature_paths = {feature_file.split("_")[5]: os.path.join(hmm_features_date_path, feature_file)
                                    for f, feature_file in enumerate(sorted(os.listdir(hmm_features_date_path)))}
            keys = sorted(list(symbol_feature_paths.keys()))
            print('For hmm date: ', hmm_date, '###########')
            print(keys == fit_select.forward_Dates(hmm_dates, hmm_date))
            for key in keys: #<- parallelisation here!

                labels_file_path = os.path.join(symbolData.symbol_specific_label_path(label_idx), key + ".csv")
                print(os.path.isfile(labels_file_path))

                if os.path.isfile(labels_file_path):  # check that this is a real path

                    print(" reading labels")  # this is the labels path!
                    labels = pd.read_csv(labels_file_path)
                    label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
                    fit_select.logmemoryusage("Before garbage collect")
                    hmm_features = nfu.hmm_features_df(open_pickle_filepath(symbol_feature_paths[key]))

                    if hmm_features.isnull().values.all():  # checking that the HMM features are actually not null

                        pass

                    else:

                        print('can fit and predict!')
                        Xtr, Ytr = features_and_labels(labels)
                        if Xtr.shape[0] < 10:
                            print('too few samples')
                            continue
                        else:
                            forward_dates_list = fit_select.forward_Dates(keys, key)
                            print('the number of forward dates is:', len(forward_dates_list))
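
Both this fragment and example #3 rely on a forward-date helper (`fit_select.forward_Dates` here, `forwardDates` there) whose body is not shown. A minimal sketch, assuming the date keys are zero-padded strings such as '20170301' so that lexicographic order matches chronological order:

    def forwardDates(keys, current_date):
        """Return the keys that fall strictly after current_date."""
        return sorted(key for key in keys if key > current_date)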
Code example #3
    def oos_prediction_function(symbol, label_idx):

        for pickled_model in pickled_models:
            print(' new model coming')

            print('Your new model is here:', pickled_model)
            model_date = pickled_model.split("_")[4]  # extract the HMM model date from the filename
            model_path = os.path.join(fittedModelsPath, pickled_model)
            print('model date- you are here ! ', model_date)
            best_svc = open_pickle_filepath(model_path)
            for hmm_date in symbolData.hmm_dates_list:
                print(
                    ' ------------------------------------new hmm date coming----------'
                )
                # iterate over all the combinations of HMM dates and feature models:
                # all the label dates that come after the date this model was
                # fitted, and all the corresponding paths

                # get all the dates for which we have an hmm model - these
                # supply the features!

                print(" now get your feature paths:")
                features_paths = symbolData.hmm_model_date_feature_list_filepaths(
                    hmm_date)[1]

                # print(features_paths)
                print('now go get your forward dates')
                # now go get it for each forward date

                labels_paths = symbolData.hmm_model_feature_corrsp_labels_files(
                    hmm_date, alternate_labels_nos[label_idx])

                forwardDatesList = forwardDates(list(labels_paths.keys()),
                                                model_date)
                print(
                    "-------------------############------------------- predictions start next-------###"
                )

                for forwardDateKey in forwardDatesList:
                    if model_date < forwardDateKey:  # simple check that your model date is not after your forward date!
                        oos_svc_predictions = defaultdict(dict)

                        # get your labels
                        labels = pd.read_csv(labels_paths[forwardDateKey])
                        label_name = str(
                            labels.columns[labels.columns.str.contains(
                                pat='label')].values[0])

                        print('you are on this date:', forwardDateKey,
                              "and doing this label", label_name)

                        # create features - first HMM and second some Market Features!
                        try:

                            hmm_features = nfu.hmm_features_df(
                                open_pickle_filepath(
                                    features_paths[forwardDateKey]))
                            if hmm_features.isnull().values.all():
                                print(
                                    'Problem: your HMM features did not compute properly'
                                )
                            else:

                                market_features_df = CreateMarketFeatures(
                                    CreateMarketFeatures(
                                        CreateMarketFeatures(
                                            df=CreateMarketFeatures(
                                                df=labels).ma_spread_duration(
                                                )).ma_spread()).chaikin_mf()
                                ).obv_calc()  # market features dataframe

                                df_concat = pd.DataFrame(
                                    pd.concat(
                                        [hmm_features, market_features_df],
                                        axis=1,
                                        sort=False).dropna())

                                df = df_concat[df_concat[label_name].notna()]
                                df_final = df.drop(columns=[
                                    'TradedPrice', 'Duration', 'TradedTime',
                                    'ReturnTradedPrice', 'Volume', label_name
                                ])

                                y_test = df[df.columns[df.columns.str.contains(
                                    pat='label')]].iloc[:, 0]
                                print(
                                    'Success---------->*******READY TO FIT MODELS **********'
                                )

                                try:

                                    X_test = MinMaxScaler().fit_transform(
                                        df_final)

                                    y_pred = best_svc[str(symbol)][model_date][
                                        'SVC'].predict(X_test)
                                    print(evaluate_predictions(y_test, y_pred))
                                    # store the results
                                    results_predict_alias = "_".join(
                                        (symbol, forwardDateKey,
                                         str(alternate_labels_nos[label_idx])))
                                    oos_svc_predictions[results_predict_alias][
                                        forwardDateKey] = evaluate_predictions(
                                            y_test, y_pred)

                                except ValueError:
                                    print(
                                        'value error here:****************************************'
                                    )

                                    continue

                            # store the results
                            print('******* Finished and now saving -*-*-*-')

                            pickle_out_filename = os.path.join(
                                oosPredictionsPath, "_".join(
                                    (symbol, str("Label_") +
                                     str(alternate_labels_nos[label_idx]),
                                     forwardDateKey, 'OOS_results_dict.pkl')))
                            pickle_out = open(pickle_out_filename, 'wb')
                            pickle.dump(oos_svc_predictions, pickle_out)
                            pickle_out.close()
                            print('saved', pickle_out_filename)

                        except Exception:
                            error_dates.append(forwardDateKey)
                            print('issue here:', forwardDateKey)

                    print(' you are about to switch')
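
`evaluate_predictions` is called above but defined elsewhere. A plausible minimal sketch, assuming it bundles standard scikit-learn classification metrics into a dict (the exact metric set is an assumption):

    from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                                 recall_score)

    def evaluate_predictions(y_true, y_pred):
        """Hypothetical stand-in: collect standard classification metrics."""
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted'),
        }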
Code example #4
    def parallelised_function(symbol, label_idx):

        symbolData = DataLoader(mainPath, symbol)
        # hmm_dates_list = (symbolData.hmm_dates_list) hmm_dates_models_list

        for hmm_date_idx, hmm_date in enumerate(
                sorted(symbolData.hmm_dates_list)):
            hmm_features_date_path = os.path.join(
                symbolData.symbol_features_path, hmm_date)
            symbol_feature_paths = {
                feature_file.split("_")[5]:
                os.path.join(hmm_features_date_path, feature_file)
                for feature_file in sorted(
                    os.listdir(hmm_features_date_path))
            }
            keys = sorted(list(symbol_feature_paths.keys()))
            for key in keys:  # <--- this is the label key in older versions of the code!
                labels_file_path = os.path.join(
                    symbolData.symbol_specific_label_path(label_idx),
                    key + ".csv")
                # this is the label path in older versions of the code
                best_svc_dict = defaultdict(dict)
                if os.path.isfile(
                        labels_file_path):  # check that this is a real path
                    print("can train")
                    print(" reading labels")  # this is the labels path!
                    labels = pd.read_csv(labels_file_path)
                    label_name = str(
                        labels.columns[labels.columns.str.contains(
                            pat='label')].values[0])
                    print(
                        symbol_feature_paths[key]
                    )  # this is the feature path, in the correct order
                    logmemoryusage("Before garbage collect")
                    hmm_features = nfu.hmm_features_df(
                        open_pickle_filepath(symbol_feature_paths[key]))
                    if hmm_features.isnull().values.all(
                    ):  # checking that the HMM features are actually not null
                        continue
                    else:  # if features not null then start moving on!
                        market_features_df = CreateMarketFeatures(
                            CreateMarketFeatures(
                                CreateMarketFeatures(df=CreateMarketFeatures(
                                    df=labels).ma_spread_duration()).ma_spread(
                                    )).chaikin_mf()).obv_calc(
                                    )  # market features dataframe

                        df_concat = pd.DataFrame(
                            pd.concat([hmm_features, market_features_df],
                                      axis=1,
                                      sort=False).dropna())

                        df = df_concat[df_concat[label_name].notna()]
                        df_final = df.drop(columns=[
                            'TradedPrice', 'Duration', 'TradedTime',
                            'ReturnTradedPrice', 'Volume', label_name
                        ])
                        y_train = df[df.columns[df.columns.str.contains(
                            pat='label')]].iloc[:, 0]  # training labels
                        if df_final.shape[0] < 10:
                            print(
                                ' too few samples after filtering. try another label permutation'
                            )
                            continue
                        else:
                            try:
                                print("starting model fit")
                                X_train = MinMaxScaler().fit_transform(
                                    df_final)
                                models_cls = FitModels(X_train, y_train)
                                best_clf = models_cls.best_svc_clf(
                                )  # fit the grid search once and reuse it
                                print(best_clf)
                                best_svc_dict[symbol][key] = {
                                    'SVC': best_clf,
                                    'best_params': best_clf.best_params_,
                                    'means': best_clf.cv_results_['mean_test_score'],
                                    'stds': best_clf.cv_results_['std_test_score'],
                                    'params': best_clf.cv_results_['params'],
                                    'best_score': best_clf.best_score_
                                }

                            except ValueError:
                                logmemoryusage("at the end")
                                continue

                        pickle_out_filename = os.path.join(
                            mainPath, "ExperimentCommonLocs/FittedModels",
                            "_".join((symbol, 'model_fit_date', str(key),
                                      str(alternate_labels_nos[label_idx]),
                                      'SingleKernelSVC.pkl')))
                        pickle_out = open(pickle_out_filename, 'wb')
                        pickle.dump(best_svc_dict, pickle_out)
                        pickle_out.close()

                else:
                    print(
                        "#################### Your Labels File does not exist ----- ####"
                    )
                    continue
                logmemoryusage("Before garbage collect")
        print(best_svc_dict[symbol][key])  # note: key leaks from the loop above and may be missing if the last iteration was skipped
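
`open_pickle_filepath` appears in every example but is defined elsewhere. A minimal sketch, assuming it simply unpickles a file from disk:

    import pickle

    def open_pickle_filepath(pickle_file_path):
        """Hypothetical stand-in: load and return the pickled object."""
        with open(pickle_file_path, 'rb') as handle:
            return pickle.load(handle)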
Code example #5
def fitting_function_mkl(key):
    print('For key: ', key, '############')
    labels_file_path = os.path.join(
        symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    print(os.path.isfile(labels_file_path))
    output_dict = defaultdict(dict)

    if os.path.isfile(labels_file_path):  # check that this is a real path
        print(" reading labels")  # this is the labels path!
        labels = pd.read_csv(labels_file_path)
        label_name = str(
            labels.columns[labels.columns.str.contains(pat='label')].values[0])
        logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(
            open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all(
        ):  # checking that the HMM features are actually not null
            print('lots of NaNs on features')
        else:  # if features not null then start moving on!
            print("can train")
            market_features_df = CreateMarketFeatures(
                CreateMarketFeatures(
                    CreateMarketFeatures(df=CreateMarketFeatures(
                        df=labels).ma_spread_duration()).ma_spread()).
                chaikin_mf()).obv_calc()  # market features dataframe

            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df],
                          axis=1,
                          sort=False).dropna())

            df = df_concat[df_concat[label_name].notna()]
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice',
                'Volume', label_name
            ])

            y_train = df.reindex(columns=df.columns[df.columns.str.contains(
                pat='label')])  # training labels
            print('go to the labels')

            if df_final.shape[0] < 10:
                print(
                    ' too few samples after filtering. try another label permutation'
                )
                # problem_dict[hmm_date][key] = str(key)
            else:
                print("starting model fit")

                Xtr, Xte, Ytr, Yte = train_test_split(df_final,
                                                      y_train,
                                                      test_size=.2,
                                                      random_state=42)
                # training
                arrXtr = np.array(Xtr)
                X_tr = normalization(rescale_01(arrXtr))
                Y_tr = torch.Tensor(Ytr.values.ravel())

                # testing

                arrXte = np.array(Xte)
                X_te = normalization(rescale_01(arrXte))
                Y_te = torch.Tensor(Yte.values.ravel())

                KLtr = [
                    pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
                    for d in range(1, 11)
                ] + [identity_kernel(len(Y_tr))]
                KLte = [
                    pairwise.homogeneous_polynomial_kernel(X_te,
                                                           X_tr,
                                                           degree=d)
                    for d in range(1, 11)
                ]
                KLte.append(torch.zeros(KLte[0].size()))
                print('done with kernel')
                try:
                    lam_values = [0.1, 0.2, 1]
                    best_results = {}
                    C_range = [0.1, 1]
                    for C_ch in C_range:
                        base_learner = SVC(C=C_ch)  # soft-margin svm base learner
                        print(' built the base learner')
                        # possible lambda values for the EasyMKL algorithm
                        for lam in lam_values:
                            print('now here', lam)
                            print(' and tuning lambda for EasyMKL...', end='')
                            # MKLpy.model_selection.cross_val_score performs the cross-validation
                            # automatically; it may return accuracy, AUC, or F1 scores
                            scores = cross_val_score(KLtr,
                                                     Y_tr,
                                                     EasyMKL(
                                                         learner=base_learner,
                                                         lam=lam),
                                                     n_folds=5,
                                                     scoring='accuracy')
                            acc = np.mean(scores)
                            if not best_results or best_results['score'] < acc:
                                best_results = {'lam': lam, 'score': acc}
                            # held-out evaluation with KLte: see the sketch after this function

                            print('done', best_results)
                            cv_dict_list[(symbol, hmm_date,
                                          label_idx)][(lam, C_ch)] = [
                                              scores, best_results
                                          ]
                            print(cv_dict_list)

                            pickle_out_filename = os.path.join(
                                mainPath,
                                "ExperimentCommonLocs/MKLFittedModels",
                                "_".join((symbol, 'model_fit_date', str(key),
                                          str(alternate_labels_nos[label_idx]),
                                          'MultiKernelSVC.pkl')))
                            print(pickle_out_filename)

                            pickle_out = open(pickle_out_filename, 'wb')

                            pickle.dump(cv_dict_list, pickle_out)
                            pickle_out.close()

                except (ValueError, TypeError, EOFError):
                    pass
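
The function builds the held-out kernels `KLte` but never evaluates on them. A sketch of the test-set step that would naturally follow, reusing `KLtr`, `KLte`, `Y_tr`, `Y_te` and `best_results` from above (note that `best_results` records only the selected lambda, so `C=1` below is a placeholder for the unrecorded C choice):

    from MKLpy.algorithms import EasyMKL
    from sklearn.metrics import accuracy_score
    from sklearn.svm import SVC

    # refit with the selected lambda on the training kernels,
    # then score on the held-out kernels built above
    clf = EasyMKL(learner=SVC(C=1), lam=best_results['lam']).fit(KLtr, Y_tr)
    y_pred = clf.predict(KLte)
    print('held-out accuracy: %.3f' % accuracy_score(Y_te, y_pred))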