from itertools import product

import numpy as np
from sklearn.svm import SVC
from MKLpy.algorithms import EasyMKL
from MKLpy.model_selection import cross_val_score
from MKLpy.preprocessing import kernel_normalization


def MKL():
    # experiment_setting() and get_s_metric() are project-local helpers
    fname, pv, tv, org_metrics = experiment_setting()
    print(fname, pv, tv)

    list_pair_metrics = [["l1", "l2"]]
    for metrics in list_pair_metrics:
        X, y, sim_matrices = get_s_metric(fname=fname, tv=tv, pv=pv,
                                          metrics=metrics)
        # from similarity to kernel matrix; note that np.exp(s / 0.01) may be
        # what was intended -- np.exp(s) / 0.01 only rescales exp(s) by a
        # constant, which kernel normalization cancels out anyway
        KL = [np.exp(s) / 0.01 for s in sim_matrices]
        KL_norm = [kernel_normalization(K) for K in KL]
        print(KL_norm, sim_matrices)
        # KLtr, KLte, Ytr, Yte = train_test_split(KL, y, random_state=42,
        #                                         shuffle=True, test_size=.3)
        print(y)

        # polynomial kernel
        # KL_norm = [hpk(X, degree=d) for d in range(1, 11)]

        gamma_values = [0.001, 0.01, 0.1, 1, 10]
        lam_values = [0, 0.1, 0.2, 1]
        C_values = [0.01, 1, 100]
        # for lam in lam_values:
        #     for gamma, C in product(gamma_values, C_values):
        #         svm = SVR(kernel="rbf", C=C, gamma=gamma)
        #         mkl = EasyMKL(lam=lam, learner=svm)
        #         scores = cross_val_score(KL_norm, y, mkl, n_folds=3, scoring='mae')
        #         print(lam, C, scores)
        for lam, C in product(lam_values, C_values):
            svm = SVC(C=C)
            mkl = EasyMKL(lam=lam, learner=svm)
            # add into MKL sources
            # SVC is a classifier, so score with 'accuracy'; 'mae' only makes
            # sense for the commented-out SVR variant above
            scores = cross_val_score(KL_norm, y, mkl, n_folds=3,
                                     scoring='accuracy')
            print(lam, C, scores)
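# --- Hedged sketch: why the placement of 0.01 matters -----------------------
# A minimal, self-contained check (toy data; the local knorm() helper stands
# in for kernel_normalization) that np.exp(S) / 0.01 and np.exp(S) become the
# same kernel after normalization, whereas np.exp(S / 0.01) does not:
import numpy as np

def knorm(K):
    # standard kernel normalization: K_ij / sqrt(K_ii * K_jj)
    d = np.sqrt(np.diag(K))
    return K / np.outer(d, d)

rng = np.random.default_rng(0)
S = rng.random((5, 5))
S = (S + S.T) / 2  # symmetrise a toy similarity matrix

print(np.allclose(knorm(np.exp(S) / 0.01), knorm(np.exp(S))))  # True: constant cancels
print(np.allclose(knorm(np.exp(S / 0.01)), knorm(np.exp(S))))  # False: bandwidth matters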
# print(base_learner)
###########################################################################
best_results = {}
for lam in [0, 0.0001, 0.0009, 0.001, 0.009, 0.01, 0.09, 0.1, 0.2, 0.9, 1]:
    base_learner = GridSearchCV(svm.SVC(probability=True),
                                param_grid=param_grid, cv=cv, refit='AUC',
                                error_score=0, pre_dispatch='1*n_jobs',
                                n_jobs=1)
    # MKLpy's cross_val_score takes n_folds rather than sklearn's cv argument
    scores = cross_val_score(k1, y_tr_A,
                             EasyMKL(learner=base_learner, lam=lam),
                             n_folds=5, scoring='accuracy')
    # print(lam, scores)
    acc = np.mean(scores)
    if not best_results or best_results['score'] < acc:
        best_results = {'lam': lam, 'score': acc}

# EasyMKL-BASED
###########################################################################
clf = EasyMKL(learner=base_learner, lam=best_results['lam']).fit(
    k1 + k2 + k3 + k4 + k5 + k6, y_tr_A)  # '+' concatenates the kernel lists
print(clf)

###########################################################################
# evaluate the solution
from sklearn.metrics import accuracy_score, roc_auc_score
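# --- Hedged sketch: one way the evaluation could continue -------------------
# The snippet imports accuracy_score and roc_auc_score but stops before using
# them. Assuming matching test-side kernel lists k1_te..k6_te (test-vs-train
# kernels, in the same order as k1..k6) and test labels y_te_A exist, the
# evaluation might look like this; those names are illustrative, not from the
# original:
KLte = k1_te + k2_te + k3_te + k4_te + k5_te + k6_te
y_pred = clf.predict(KLte)
y_score = clf.decision_function(KLte)
print('accuracy:', accuracy_score(y_te_A, y_pred))
print('AUC:', roc_auc_score(y_te_A, y_score))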
def parallelised_function(file):
    select_file_path = os.path.join(jointFeatureLocation, file)  # formulate the path
    print('Symbol:----->', file.split("_")[0])
    symbol = file.split("_")[0]

    select_hmm_date = select_file_path.split("_")[3]  # pull out the hmm_date
    select_feature_label_date = select_file_path.split("_")[6]  # pull out the feature_label_date
    select_label_idx = select_file_path.split("_")[9]  # pull out the label_idx

    unpickled_select_file = open_pickle_filepath(select_file_path)  # unpickle the select file
    hmm_keys = sorted(list(unpickled_select_file.keys()))  # hmm keys for the select file

    for hmm_date_key in hmm_keys:  # pick an hmm date
        # each key here unlocks a feature and label set
        feature_label_keys = sorted(unpickled_select_file[hmm_date_key].keys())

        for feature_label_date in feature_label_keys:  # go through all the feature dates
            features_file_path = unpickled_select_file[hmm_date_key][feature_label_date][0]  # the feature path
            labels_file_path = unpickled_select_file[hmm_date_key][feature_label_date][1]  # the labels path

            if os.path.isfile(features_file_path):  # if the features file exists we can train
                print('ok----->', feature_label_date)  # if you got to this point we have data, so move on

                labels = pd.read_csv(labels_file_path)  # open labels file
                label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
                features = open_pickle_filepath(features_file_path)  # opens features file
                hmm_features = nfu.hmm_features_df(features)  # get the hmm features out, so unpack the tuples!
                print('loaded features and labels')

                if hmm_features.isnull().values.all():  # checking that the HMM features are not all null
                    continue
                else:  # if features are not null then start moving on!
                    # market features dataframe
                    market_features_df = CreateMarketFeatures(
                        CreateMarketFeatures(
                            CreateMarketFeatures(
                                df=CreateMarketFeatures(df=labels).ma_spread_duration()
                            ).ma_spread()
                        ).chaikin_mf()
                    ).obv_calc()

                    df_concat = pd.DataFrame(
                        pd.concat([hmm_features, market_features_df],
                                  axis=1, sort=False).dropna())
                    df = df_concat[df_concat[label_name].notna()]
                    df_final = df.drop(columns=[
                        'TradedPrice', 'Duration', 'TradedTime',
                        'ReturnTradedPrice', 'Volume', label_name
                    ])
                    # training labels
                    y_train = df[df.columns[df.columns.str.contains(pat='label')]].iloc[:, 0]

                    if df_final.shape[0] < 10:  # make sure it all looks reasonable
                        print('the ratio of classes is too low. try another label permutation')
                        continue
                    else:
                        print("starting model fit")
                        # put the features in a tensor format
                        X = np.asarray(df_final.values)  # need this for torch
                        Xtr = normalization(rescale_01(torch.Tensor(X)))  # features as a tensor
                        Ytr = torch.Tensor(y_train.values)  # labels as a tensor
                        print('-----------------first bit done------------------')

                        # get a few RBF kernels ready - maybe need more here
                        KLrbf = generators.RBF_generator(Xtr, gamma=[.01, .1, .25, .5])
                        print('done with kernel')

                        best_results = {}
                        C_range = [0.1, 1]
                        lam_range = [0.2]
                        try:
                            for C_choice in C_range:
                                base_learner = SVC(C=C_choice)  # "hard"-margin svm
                                # clf = EasyMKL(lam=0.2, multiclass_strategy='ova',
                                #               learner=base_learner).fit(KLrbf, Ytr)
                                # print('done')
                                # print('the combination weights are:')
                                # for sol in clf.solution:
                                #     # need to store these results somewhere
                                #     print('(%d vs all): ' % sol, clf.solution[sol].weights)
                                for lam in lam_range:  # possible lambda values for the EasyMKL algorithm
                                    # MKLpy.model_selection.cross_val_score performs the cross-validation
                                    # automatically; it may return accuracy, auc, or F1 scores
                                    scores = cross_val_score(
                                        KLrbf, Ytr,
                                        EasyMKL(learner=base_learner, lam=lam),
                                        n_folds=5, scoring='accuracy')
                                    acc = np.mean(scores)
                                    if not best_results or best_results['score'] < acc:
                                        # these should get dumped somewhere
                                        best_results = {'C': C_choice, 'lam': lam,
                                                        'score': acc, 'scores': scores}
                            print('done')

                            best_learner = SVC(C=best_results['C'])
                            clf = EasyMKL(learner=best_learner,
                                          lam=best_results['lam']).fit(KLrbf, Ytr)
                            y_pred = clf.predict(KLrbf)
                            # note: this scores the model on the data it was fitted on,
                            # since no train/test split is made here
                            accuracy = accuracy_score(Ytr, y_pred)
                            print('accuracy on the training set: %.3f, with lambda=%.2f'
                                  % (accuracy, best_results['lam']))
                            print(scores)

                            pickle_out_filename = os.path.join(
                                mainPath, "ExperimentCommonLocs/CrossValidationResults",
                                "_".join((symbol, 'feature_label_date',
                                          str(select_feature_label_date),
                                          str(select_label_idx), 'hmm_date:',
                                          hmm_date_key, 'RBF', 'MultiKernelSVC.pkl')))
                            # pickle_out = open(pickle_out_filename, 'wb')
                            # pickle.dump(best_results, pickle_out)
                            # pickle_out.close()
                        except ValueError:
                            continue
            else:
                print('PROBLEM----->in one of your locations')
                continue
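# --- Hedged sketch: a driver for parallelised_function ----------------------
# The function above is written per-file, but no parallel driver is shown.
# Assuming jointFeatureLocation holds the pickled files, a multiprocessing
# pool is one plausible way to fan the work out (the '.pkl' filter and the
# pool size are illustrative assumptions, not from the original):
import multiprocessing as mp
import os

if __name__ == '__main__':
    files = [f for f in os.listdir(jointFeatureLocation) if f.endswith('.pkl')]
    with mp.Pool(processes=4) as pool:
        pool.map(parallelised_function, files)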
KLtr, KLte, Ytr, Yte = train_test_split(KL, Y, test_size=.3, random_state=42)

# MKL algorithms
from MKLpy.algorithms import EasyMKL, KOMD  # KOMD is not an MKL algorithm but a simple kernel machine like the SVM
from MKLpy.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np

print('tuning lambda for EasyMKL...', end='')
base_learner = SVC(C=10000)  # "hard"-margin svm
best_results = {}
for lam in [0, 0.01, 0.1, 0.2, 0.9, 1]:  # possible lambda values for the EasyMKL algorithm
    # MKLpy.model_selection.cross_val_score performs the cross-validation automatically;
    # it may return accuracy, auc, or F1 scores
    scores = cross_val_score(KLtr, Ytr, EasyMKL(learner=base_learner, lam=lam),
                             n_folds=5, scoring='accuracy')
    acc = np.mean(scores)
    if not best_results or best_results['score'] < acc:
        best_results = {'lam': lam, 'score': acc}

# evaluation on the test set
from sklearn.metrics import accuracy_score

print('done')
clf = EasyMKL(learner=base_learner, lam=best_results['lam']).fit(KLtr, Ytr)
y_pred = clf.predict(KLte)
accuracy = accuracy_score(Yte, y_pred)
print('accuracy on the test set: %.3f, with lambda=%.2f'
      % (accuracy, best_results['lam']))
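# --- Hedged sketch: one way KL and Y could be built -------------------------
# The snippet above assumes a list of kernels KL and labels Y already exist.
# A minimal construction in the style of the MKLpy examples (the dataset and
# the degree range are illustrative choices, not from the original):
import torch
from sklearn.datasets import load_breast_cancer
from MKLpy.preprocessing import normalization, rescale_01
from MKLpy.metrics import pairwise
from MKLpy.model_selection import train_test_split

ds = load_breast_cancer()
X, Y = torch.Tensor(ds.data), torch.Tensor(ds.target)
X = normalization(rescale_01(X))  # rescale features to [0,1], then normalise each example
KL = [pairwise.homogeneous_polynomial_kernel(X, degree=d) for d in range(1, 11)]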
Xtr = normalization(rescale_01(torch.Tensor(pkl_file[date][0].values)))
Ytr = torch.Tensor(pkl_file[date][1].values)
print('first bit done')
nalsvm.gc.collect()

KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1])
print('done with kernel')
nalsvm.gc.collect()

try:
    lam_values = [0, 0.1, 0.2, 1]
    C_values = [0.01, 1, 10, 100]
    print(C_values)
    for lam, C in product(lam_values, C_values):
        print('now here', C, lam)
        svm = SVC(C=C)
        mkl = EasyMKL(lam=lam, learner=svm)
        scores = cross_val_score(KLrbf, Ytr, mkl, n_folds=3, scoring='accuracy')
        print(str(scores))
        print(lam, C, scores)
        print(type(scores))
        cv_dict_list[(symbol, date, alternate_label)][(lam, C)] = scores
        nalsvm.logmemoryusage("Before garbage collect")
        print('---------------> moving on')
except (ValueError, TypeError, EOFError):
    # only way that seems to work for this; the snippet sits inside an
    # enclosing loop over dates, hence the continue
    continue

pickle_out_filename = os.path.join(
    cross_validation_data_location,
    "_".join((symbol, date, 'RBF_CrossValidationResults.pkl')))
test_df = pd.DataFrame.from_dict(cv_dict_list)
test_df.to_pickle(pickle_out_filename)
print('Now saved: ', pickle_out_filename)
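# --- Hedged sketch: the shape cv_dict_list needs ----------------------------
# The nested assignment above requires the outer (symbol, date, label) key to
# already map to a dict; a defaultdict(dict) gives that for free. The keys and
# scores below are illustrative only:
from collections import defaultdict
import pandas as pd

cv_dict_list = defaultdict(dict)
cv_dict_list[('AAPL', '20170110', 'label_1')][(0.1, 1)] = [0.61, 0.63, 0.60]
cv_dict_list[('AAPL', '20170110', 'label_1')][(0.2, 10)] = [0.64, 0.62, 0.65]
print(pd.DataFrame.from_dict(cv_dict_list))  # columns keyed by (symbol, date, label)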
def fitting_function_mkl(key):
    print('For key: ', key, '############')
    labels_file_path = os.path.join(
        symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    print(os.path.isfile(labels_file_path))
    output_dict = defaultdict(dict)

    if os.path.isfile(labels_file_path):  # check that this is a real path
        print(" reading labels")  # this is the labels path!
        labels = pd.read_csv(labels_file_path)
        label_name = str(labels.columns[labels.columns.str.contains(pat='label')].values[0])
        logmemoryusage("Before garbage collect")
        hmm_features = nfu.hmm_features_df(
            open_pickle_filepath(symbol_feature_paths[key]))

        if hmm_features.isnull().values.all():  # checking that the HMM features are not all null
            print('lots of NaNs on features')
        else:  # if features are not null then start moving on!
            print("can train")
            # market features dataframe
            market_features_df = CreateMarketFeatures(
                CreateMarketFeatures(
                    CreateMarketFeatures(
                        df=CreateMarketFeatures(df=labels).ma_spread_duration()
                    ).ma_spread()
                ).chaikin_mf()
            ).obv_calc()

            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df],
                          axis=1, sort=False).dropna())
            df = df_concat[df_concat[label_name].notna()]
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime',
                'ReturnTradedPrice', 'Volume', label_name
            ])
            # training labels
            y_train = df.reindex(columns=df.columns[df.columns.str.contains(pat='label')])
            print('go to the labels')

            if df_final.shape[0] < 10:
                print('the ratio of classes is too low. try another label permutation')
                # problem_dict[hmm_date][key] = str(key)
            else:
                print("starting model fit")
                Xtr, Xte, Ytr, Yte = train_test_split(df_final, y_train,
                                                      test_size=.2,
                                                      random_state=42)
                # training
                arrXtr = np.array(Xtr)
                X_tr = normalization(rescale_01(arrXtr))
                Y_tr = torch.Tensor(Ytr.values.ravel())
                # testing
                arrXte = np.array(Xte)
                X_te = normalization(rescale_01(arrXte))
                Y_te = torch.Tensor(Yte.values.ravel())

                # polynomial kernels on the train side, plus an identity kernel;
                # the test side gets a zero matrix as the identity's counterpart
                KLtr = [pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
                        for d in range(1, 11)] + [identity_kernel(len(Y_tr))]
                KLte = [pairwise.homogeneous_polynomial_kernel(X_te, X_tr, degree=d)
                        for d in range(1, 11)]
                KLte.append(torch.zeros(KLte[0].size()))
                print('done with kernel')

                try:
                    lam_values = [0.1, 0.2, 1]
                    best_results = {}
                    C_range = [0.1, 1]
                    for C_ch in C_range:
                        base_learner = SVC(C=C_ch)  # "soft"-margin svm
                        print('fitted the base learner')
                        for lam in lam_values:  # possible lambda values for the EasyMKL algorithm
                            print('now here', lam)
                            print('and tuning lambda for EasyMKL...', end='')
                            # MKLpy.model_selection.cross_val_score performs the cross-validation
                            # automatically; it may return accuracy, auc, or F1 scores
                            scores = cross_val_score(KLtr, Y_tr,
                                                     EasyMKL(learner=base_learner, lam=lam),
                                                     n_folds=5, scoring='accuracy')
                            acc = np.mean(scores)
                            if not best_results or best_results['score'] < acc:
                                best_results = {'lam': lam, 'score': acc}
                            # evaluation on the test set
                            print('done', best_results)
                            cv_dict_list[(symbol, hmm_date, label_idx)][(lam, C_ch)] = [
                                scores, best_results
                            ]
                            print(cv_dict_list)

                    pickle_out_filename = os.path.join(
                        mainPath, "ExperimentCommonLocs/MKLFittedModels",
                        "_".join((symbol, 'model_fit_date', str(key),
                                  str(alternate_labels_nos[label_idx]),
                                  'MultiKernelSVC.pkl')))
                    print(pickle_out_filename)
                    pickle_out = open(pickle_out_filename, 'wb')
                    pickle.dump(cv_dict_list, pickle_out)
                    pickle_out.close()
                except (ValueError, TypeError, EOFError):
                    pass
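# --- Hedged note: the identity-kernel padding above -------------------------
# Appending identity_kernel(len(Y_tr)) to KLtr adds a kernel whose learned
# weight acts like extra regularisation; its test-side counterpart is a zero
# matrix because a test point never coincides with a training point. A
# self-contained toy illustration of the matching shapes (sizes arbitrary):
import torch
from MKLpy.metrics.pairwise import homogeneous_polynomial_kernel as hpk

Xtr_toy, Xte_toy = torch.rand(8, 3), torch.rand(4, 3)
KLtr_toy = [hpk(Xtr_toy, degree=d) for d in (1, 2)] + [torch.eye(8)]
KLte_toy = [hpk(Xte_toy, Xtr_toy, degree=d) for d in (1, 2)] + [torch.zeros(4, 8)]
print([tuple(K.shape) for K in KLtr_toy])  # [(8, 8), (8, 8), (8, 8)]
print([tuple(K.shape) for K in KLte_toy])  # [(4, 8), (4, 8), (4, 8)]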