import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from MKLpy import generators
from MKLpy.algorithms import EasyMKL
from MKLpy.metrics import pairwise
from MKLpy.model_selection import cross_val_score
from MKLpy.preprocessing import normalization, rescale_01

# Project-local helpers (open_pickle_filepath, nfu, CreateMarketFeatures, DataLoader, FitModels,
# logmemoryusage, forwardDates, evaluate_predictions, identity_kernel, ...) and path globals such as
# jointFeatureLocation, mainPath, fittedModelsPath, oosPredictionsPath, alternate_labels_nos and
# pickled_models are assumed to be defined or imported elsewhere in the project.


def parallelised_function(file):
    select_file_path = os.path.join(jointFeatureLocation, file)  # formulate the path
    print('Symbol:----->', file.split("_")[0])
    symbol = file.split("_")[0]

    select_hmm_date = select_file_path.split("_")[3]  # pull out the hmm_date
    select_feature_label_date = select_file_path.split("_")[6]  # pull out the feature_label_date
    select_label_idx = select_file_path.split("_")[9]  # pull out the label_idx

    unpickled_select_file = open_pickle_filepath(select_file_path)  # unpickle the select file
    hmm_keys = sorted(list(unpickled_select_file.keys()))  # hmm keys for the select file

    for hmm_date_key in hmm_keys:  # pick an hmm date
        # each key here unlocks a feature and label set
        feature_label_keys = sorted(unpickled_select_file[hmm_date_key].keys())

        for feature_label_date in feature_label_keys:  # iterate over all the feature dates
            features_file_path = unpickled_select_file[hmm_date_key][feature_label_date][0]  # feature path
            labels_file_path = unpickled_select_file[hmm_date_key][feature_label_date][1]  # labels path

            if os.path.isfile(features_file_path):  # if the features file exists we can train
                print('ok----->', feature_label_date)  # we have data, so we can move on

                labels = pd.read_csv(labels_file_path)  # open the labels file
                label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
                features = open_pickle_filepath(features_file_path)  # open the features file
                hmm_features = nfu.hmm_features_df(features)  # unpack the HMM feature tuples
                print('loaded features and labels')

                if hmm_features.isnull().values.all():  # check that the HMM features are not all null
                    continue
                else:  # features are not null, so keep going
                    # market features dataframe
                    market_features_df = CreateMarketFeatures(
                        CreateMarketFeatures(
                            CreateMarketFeatures(
                                df=CreateMarketFeatures(df=labels).ma_spread_duration()
                            ).ma_spread()
                        ).chaikin_mf()
                    ).obv_calc()

                    df_concat = pd.DataFrame(
                        pd.concat([hmm_features, market_features_df], axis=1, sort=False).dropna())
                    df = df_concat[df_concat[label_name].notna()]
                    df_final = df.drop(columns=[
                        'TradedPrice', 'Duration', 'TradedTime',
                        'ReturnTradedPrice', 'Volume', label_name
                    ])
                    # training labels
                    y_train = df[df.columns[df.columns.str.contains(pat='label')]].iloc[:, 0]

                    if df_final.shape[0] < 10:  # make sure the amount of data looks reasonable
                        print('the ratio of classes is too low. try another label permutation')
                        continue
                    else:
                        print("starting model fit")
                        # put the features and labels into tensor format
                        X = np.asarray(df_final.values)  # needed for torch
                        Xtr = normalization(rescale_01(torch.Tensor(X)))  # features as a tensor
                        Ytr = torch.Tensor(y_train.values)  # labels as a tensor
                        print('-----------------first bit done------------------')

                        # get a few RBF kernels ready - may need more here
                        KLrbf = generators.RBF_generator(Xtr, gamma=[.01, .1, .25, .5])
                        print('done with kernel')

                        best_results = {}
                        C_range = [0.1, 1]
                        lam_range = [0.2]

                        try:
                            for C_choice in C_range:
                                base_learner = SVC(C=C_choice)  # base soft-margin SVM
                                # clf = EasyMKL(lam=0.2, multiclass_strategy='ova',
                                #               learner=base_learner).fit(KLrbf, Ytr)
                                # print('the combination weights are:')
                                # for sol in clf.solution:
                                #     print('(%d vs all): ' % sol, clf.solution[sol].weights)
                                # need to store these results somewhere

                                for lam in lam_range:  # possible lambda values for the EasyMKL algorithm
                                    # MKLpy.model_selection.cross_val_score performs the cross-validation
                                    # automatically; it may return accuracy, AUC, or F1 scores
                                    scores = cross_val_score(
                                        KLrbf, Ytr,
                                        EasyMKL(learner=base_learner, lam=lam),
                                        n_folds=5, scoring='accuracy')  # cross-validation scores
                                    acc = np.mean(scores)
                                    if not best_results or best_results['score'] < acc:
                                        best_results = {
                                            'C': C_choice,
                                            'lam': lam,
                                            'score': acc,
                                            'scores': scores
                                        }  # these should get dumped somewhere
                            print('done')

                            best_learner = SVC(C=best_results['C'])
                            clf = EasyMKL(learner=best_learner, lam=best_results['lam']).fit(KLrbf, Ytr)
                            y_pred = clf.predict(KLrbf)
                            accuracy = accuracy_score(Ytr, y_pred)
                            print('accuracy on the training set: %.3f, with lambda=%.2f'
                                  % (accuracy, best_results['lam']))
                            print(best_results['scores'])

                            pickle_out_filename = os.path.join(
                                mainPath, "ExperimentCommonLocs/CrossValidationResults",
                                "_".join((symbol, 'feature_label_date',
                                          str(select_feature_label_date), str(select_label_idx),
                                          'hmm_date:', hmm_date_key, 'RBF', 'MultiKernelSVC.pkl')))
                            # pickle_out = open(pickle_out_filename, 'wb')
                            # pickle.dump(best_results, pickle_out)
                            # pickle_out.close()
                        except ValueError:
                            continue
            else:
                print('PROBLEM-----> one of your file locations does not exist')
                continue
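# A minimal dispatch sketch (an assumption, not part of the original pipeline): parallelised_function
# takes a single joint-feature file name, so one way to run it across every file in
# jointFeatureLocation is a multiprocessing pool. The helper name and pool size are illustrative only.
def run_parallelised_function_over_files(n_workers=4):
    from multiprocessing import Pool

    candidate_files = sorted(os.listdir(jointFeatureLocation))  # assumed source of the file names
    with Pool(processes=n_workers) as pool:
        pool.map(parallelised_function, candidate_files)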
symbol_feature_paths = {
    feature_file.split("_")[5]: os.path.join(hmm_features_date_path, feature_file)
    for f, feature_file in enumerate(sorted(os.listdir(hmm_features_date_path)))
}
keys = sorted(list(symbol_feature_paths.keys()))
print('For hmm date: ', hmm_date, '###########')
print(keys == fit_select.forward_Dates(hmm_dates, hmm_date))

for key in keys:  # <- parallelisation here!
    labels_file_path = os.path.join(symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    print(os.path.isfile(labels_file_path))

    if os.path.isfile(labels_file_path):  # check that this is a real path
        print(" reading labels")  # this is the labels path!
        labels = pd.read_csv(labels_file_path)
        label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
        fit_select.logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all():  # check that the HMM features are not all null
            pass
        else:
            print('can fit and predict!')
            Xtr, Ytr = features_and_labels(labels)
            if Xtr.shape[0] < 10:
                print('ratio is too low')
                continue
            else:
                forward_dates_list = fit_select.forward_Dates(keys, key)
                print('the number of forward dates is:', len(forward_dates_list))
def oos_prediction_function(symbol, label_idx):
    for pickled_model in pickled_models:
        print(' new model coming')
        print('Your new model is here:', pickled_model)
        model_date = pickled_model.split("_")[4]  # the HMM date the model was fitted on
        model_path = os.path.join(fittedModelsPath, pickled_model)
        print('model date - you are here!', model_date)
        best_svc = open_pickle_filepath(model_path)

        for hmm_date_idx, _ in enumerate(symbolData.hmm_dates_list):
            print(' ------------------------------------new hmm date coming----------')
            # all the various combinations of HMM dates, features and models,
            # all the label dates that are after the key date this model was fitted on,
            # and all the various paths
            hmm_date = symbolData.hmm_dates_list[hmm_date_idx]

            # get all the dates for which we essentially have an HMM model - used for the features
            print(" now get your feature paths:")
            features_paths = symbolData.hmm_model_date_feature_list_filepaths(hmm_date)[1]
            # print(features_paths)
            print('now go get your forward dates')

            # now fetch the labels for each forward date
            labels_paths = symbolData.hmm_model_feature_corrsp_labels_files(
                hmm_date, alternate_labels_nos[label_idx])
            forwardDatesList = forwardDates(list(labels_paths.keys()), model_date)
            print("-------------------############------------------- predictions start next-------###")

            for forwardDateKey in forwardDatesList:
                if model_date < forwardDateKey:  # simple check that the model date is not after the forward date
                    oos_svc_predictions = defaultdict(dict)

                    # get the labels
                    labels = pd.read_csv(labels_paths[forwardDateKey])
                    label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
                    print('you are on this date:', forwardDateKey, "and doing this label", label_name)

                    # create features - first HMM and second some market features
                    try:
                        hmm_features = nfu.hmm_features_df(
                            open_pickle_filepath(features_paths[forwardDateKey]))
                        if hmm_features.isnull().values.all():
                            print('Problem: your HMM features did not compute properly')
                        else:
                            # market features dataframe
                            market_features_df = CreateMarketFeatures(
                                CreateMarketFeatures(
                                    CreateMarketFeatures(
                                        df=CreateMarketFeatures(df=labels).ma_spread_duration()
                                    ).ma_spread()
                                ).chaikin_mf()
                            ).obv_calc()

                            df_concat = pd.DataFrame(
                                pd.concat([hmm_features, market_features_df], axis=1, sort=False).dropna())
                            df = df_concat[df_concat[label_name].notna()]
                            df_final = df.drop(columns=[
                                'TradedPrice', 'Duration', 'TradedTime',
                                'ReturnTradedPrice', 'Volume', label_name
                            ])
                            y_test = df[df.columns[df.columns.str.contains(pat='label')]].iloc[:, 0]
                            print('Success---------->*******READY TO FIT MODELS **********')

                            try:
                                # scale the out-of-sample features (note: the scaler is re-fit here
                                # rather than reused from the training step)
                                X_test = MinMaxScaler().fit_transform(df_final)
                                y_pred = best_svc[str(symbol)][model_date]['SVC'].predict(X_test)
                                print(evaluate_predictions(y_test, y_pred))

                                # store the results
                                results_predict_alias = "_".join(
                                    (symbol, forwardDateKey, str(alternate_labels_nos[label_idx])))
                                oos_svc_predictions[results_predict_alias][forwardDateKey] = \
                                    evaluate_predictions(y_test, y_pred)
                            except ValueError:
                                print('value error here:****************************************')
                                continue

                            # store the results
                            print('******* Finished and now saving -*-*-*-')
                            pickle_out_filename = os.path.join(
                                oosPredictionsPath,
                                "_".join((symbol,
                                          str("Label_") + str(alternate_labels_nos[label_idx]),
                                          forwardDateKey, 'OOS_results_dict.pkl')))
                            pickle_out = open(pickle_out_filename, 'wb')
                            pickle.dump(oos_svc_predictions, pickle_out)
                            pickle_out.close()
                            print('saved', pickle_out_filename)
                    except Exception:
                        error_dates.append(forwardDateKey)
                        print('issue here:', forwardDateKey)

        print(' you are about to switch')
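# evaluate_predictions is called above but not defined in this file; a minimal sketch of what such a
# helper could compute, assuming standard sklearn.metrics (the metrics used in the original may differ).
from sklearn.metrics import f1_score, precision_score, recall_score


def evaluate_predictions_sketch(y_true, y_pred):
    # return a plain dict of the usual classification scores so it pickles cleanly
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }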
def parallised_function(symbol, label_idx):
    symbolData = DataLoader(mainPath, symbol)
    # hmm_dates_list = (symbolData.hmm_dates_list)  # hmm_dates_models_list

    for hmm_date_idx, hmm_date in enumerate(sorted(symbolData.hmm_dates_list)):
        hmm_features_date_path = os.path.join(symbolData.symbol_features_path, hmm_date)
        symbol_feature_paths = {
            feature_file.split("_")[5]: os.path.join(hmm_features_date_path, feature_file)
            for f, feature_file in enumerate(sorted(os.listdir(hmm_features_date_path)))
        }
        keys = sorted(list(symbol_feature_paths.keys()))

        for key in keys:  # <--- this is the label key in older versions of the code!
            # this is the label path in older versions of the code
            labels_file_path = os.path.join(symbolData.symbol_specific_label_path(label_idx), key + ".csv")
            best_svc_dict = defaultdict(dict)

            if os.path.isfile(labels_file_path):  # check that this is a real path
                print("can train")
                print(" reading labels")  # this is the labels path!
                labels = pd.read_csv(labels_file_path)
                label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
                print(symbol_feature_paths[key])  # this is the features path, in the correct order!
                logmemoryusage("Before garbage collect")
                hmm_features = nfu.hmm_features_df(open_pickle_filepath(symbol_feature_paths[key]))

                if hmm_features.isnull().values.all():  # check that the HMM features are not all null
                    continue
                else:  # features are not null, so keep going
                    # market features dataframe
                    market_features_df = CreateMarketFeatures(
                        CreateMarketFeatures(
                            CreateMarketFeatures(
                                df=CreateMarketFeatures(df=labels).ma_spread_duration()
                            ).ma_spread()
                        ).chaikin_mf()
                    ).obv_calc()

                    df_concat = pd.DataFrame(
                        pd.concat([hmm_features, market_features_df], axis=1, sort=False).dropna())
                    df = df_concat[df_concat[label_name].notna()]
                    df_final = df.drop(columns=[
                        'TradedPrice', 'Duration', 'TradedTime',
                        'ReturnTradedPrice', 'Volume', label_name
                    ])
                    # training labels
                    y_train = df[df.columns[df.columns.str.contains(pat='label')]].iloc[:, 0]

                    if df_final.shape[0] < 10:
                        print('the ratio of classes is too low. try another label permutation')
                        continue
                    else:
                        try:
                            print("starting model fit")
                            X_train = MinMaxScaler().fit_transform(df_final)
                            models_cls = FitModels(X_train, y_train)
                            best_clf = models_cls.best_svc_clf()  # fit once and reuse the fitted search object
                            print(best_clf)
                            best_svc_dict[symbol][key] = {
                                'SVC': best_clf,
                                'best_params': best_clf.best_params_,
                                'means': best_clf.cv_results_['mean_test_score'],
                                'stds': best_clf.cv_results_['std_test_score'],
                                'params': best_clf.cv_results_['params'],
                                'best_score': best_clf.best_score_
                            }
                        except ValueError:
                            continue

                        logmemoryusage("at the end")
                        pickle_out_filename = os.path.join(
                            mainPath, "ExperimentCommonLocs/FittedModels",
                            "_".join((symbol, 'model_fit_date', str(key),
                                      str(alternate_labels_nos[label_idx]), 'SingleKernelSVC.pkl')))
                        pickle_out = open(pickle_out_filename, 'wb')
                        pickle.dump(best_svc_dict, pickle_out)
                        pickle_out.close()
            else:
                print("#################### Your Labels File does not exist ----- ####")
                continue

            logmemoryusage("Before garbage collect")
            print(best_svc_dict[symbol][key])
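# FitModels is not shown in this file; the attributes read above (best_params_, cv_results_,
# best_score_) are those of a fitted sklearn search object, so a minimal stand-in could wrap a
# GridSearchCV over an SVC. The class name, parameter grid and CV settings below are illustrative
# assumptions, not the original implementation.
from sklearn.model_selection import GridSearchCV


class FitModelsSketch:
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def best_svc_clf(self):
        # exhaustive search over a small, assumed SVC grid; returns the fitted search object
        param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.1], 'kernel': ['rbf']}
        search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
        return search.fit(self.X_train, self.y_train)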
def fitting_function_mkl(key):
    print('For key: ', key, '############')
    labels_file_path = os.path.join(symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    print(os.path.isfile(labels_file_path))
    output_dict = defaultdict(dict)

    if os.path.isfile(labels_file_path):  # check that this is a real path
        print(" reading labels")  # this is the labels path!
        labels = pd.read_csv(labels_file_path)
        label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
        logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all():  # check that the HMM features are not all null
            print('lots of NaNs on features')
            pass
        else:  # features are not null, so keep going
            print("can train")
            # market features dataframe
            market_features_df = CreateMarketFeatures(
                CreateMarketFeatures(
                    CreateMarketFeatures(
                        df=CreateMarketFeatures(df=labels).ma_spread_duration()
                    ).ma_spread()
                ).chaikin_mf()
            ).obv_calc()

            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df], axis=1, sort=False).dropna())
            df = df_concat[df_concat[label_name].notna()]
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime',
                'ReturnTradedPrice', 'Volume', label_name
            ])
            # training labels
            y_train = df.reindex(columns=df.columns[df.columns.str.contains(pat='label')])
            print('go to the labels')

            if df_final.shape[0] < 10:
                print('the ratio of classes is too low. try another label permutation')
                # problem_dict[hmm_date][key] = str(key)
                pass
            else:
                print("starting model fit")
                Xtr, Xte, Ytr, Yte = train_test_split(df_final, y_train,
                                                      test_size=.2, random_state=42)
                # training data
                arrXtr = np.array(Xtr)
                X_tr = normalization(rescale_01(arrXtr))
                Y_tr = torch.Tensor(Ytr.values.ravel())
                # testing data
                arrXte = np.array(Xte)
                X_te = normalization(rescale_01(arrXte))
                Y_te = torch.Tensor(Yte.values.ravel())

                # homogeneous polynomial kernels of degree 1..10 plus an identity kernel
                KLtr = [pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
                        for d in range(1, 11)] + [identity_kernel(len(Y_tr))]
                KLte = [pairwise.homogeneous_polynomial_kernel(X_te, X_tr, degree=d)
                        for d in range(1, 11)]
                KLte.append(torch.zeros(KLte[0].size()))
                print('done with kernel')

                try:
                    lam_values = [0.1, 0.2, 1]
                    best_results = {}
                    C_range = [0.1, 1]
                    for C_ch in C_range:
                        base_learner = SVC(C=C_ch)  # soft-margin SVM
                        print(' fitted the base learner')
                        # possible lambda values for the EasyMKL algorithm
                        for lam in lam_values:
                            print('now here', lam)
                            print(' and tuning lambda for EasyMKL...', end='')
                            # MKLpy.model_selection.cross_val_score performs the cross-validation
                            # automatically; it may return accuracy, AUC, or F1 scores
                            scores = cross_val_score(KLtr, Y_tr,
                                                     EasyMKL(learner=base_learner, lam=lam),
                                                     n_folds=5, scoring='accuracy')
                            acc = np.mean(scores)
                            if not best_results or best_results['score'] < acc:
                                best_results = {'lam': lam, 'score': acc}
                        # evaluation on the test set
                        print('done', best_results)
                        cv_dict_list[(symbol, hmm_date, label_idx)][(lam, C_ch)] = [scores, best_results]
                        print(cv_dict_list)

                    pickle_out_filename = os.path.join(
                        mainPath, "ExperimentCommonLocs/MKLFittedModels",
                        "_".join((symbol, 'model_fit_date', str(key),
                                  str(alternate_labels_nos[label_idx]), 'MultiKernelSVC.pkl')))
                    print(pickle_out_filename)
                    pickle_out = open(pickle_out_filename, 'wb')
                    pickle.dump(cv_dict_list, pickle_out)
                    pickle_out.close()
                except (ValueError, TypeError, EOFError):
                    pass
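# The held-out kernels KLte and labels Y_te are built inside fitting_function_mkl but never scored;
# a minimal follow-up sketch, written as a standalone helper so it can be called with the training
# and test kernel lists from that function. The default C is an assumption, since the loop above
# retains only the best lambda.
def evaluate_easymkl_on_test(KLtr, Y_tr, KLte, Y_te, lam, C=1.0):
    # refit EasyMKL on the full training kernels with the chosen lambda, then score the held-out split
    clf = EasyMKL(learner=SVC(C=C), lam=lam).fit(KLtr, Y_tr)
    y_pred = clf.predict(KLte)
    return accuracy_score(Y_te, y_pred)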