def test_rescale_01(self):
    """Check ``preprocessing.rescale_01`` on the torch and numpy fixtures.

    The rescaled matrix must span [0, 1], have shape (5, 4), match two known
    entries, and come back as a ``torch.Tensor`` for either input kind.
    """
    scaled = preprocessing.rescale_01(self.X)

    # Global extrema of the rescaled matrix.
    self.assertAlmostEqual(scaled.min().item(), 0)
    self.assertAlmostEqual(scaled.max().item(), 1)
    self.assertEqual(scaled.shape, (5, 4))

    # Spot-check two individual entries.
    self.assertEqual(scaled[2, 0], 0.75)
    self.assertEqual(scaled[2, 2], 0)

    # The torch and numpy fixtures must produce matching results ...
    from_torch = preprocessing.rescale_01(self.X)
    from_numpy = preprocessing.rescale_01(self.Xnumpy)
    self.assertTrue(matNear(from_torch, from_numpy))

    # ... and both results must be exactly torch tensors.
    for result in (from_torch, from_numpy):
        self.assertEqual(type(result), torch.Tensor)
def featureCreation(idxKey, locDict):
    """Return clean features and labels for one entry of ``locDict``.

    Args:
        idxKey: Position, in ``locDict`` key order, of the entry to load.
        locDict: Mapping whose values are indexable; element 0 is the path of
            a pickled features file and element 1 the path of a labels CSV.

    Returns:
        Tuple ``(X, y, dataDate)``. ``X`` is ``normalization(rescale_01(...))``
        applied to the feature values of the rows that survive ``dropna``,
        ``y`` is the first column whose name contains ``'label'`` for those
        same rows, and ``dataDate`` is the selected key of ``locDict``.
    """
    keys = list(locDict.keys())
    dataDate = keys[idxKey]
    featuresIdxDirFileLoc = locDict[dataDate][0]
    labelsIdxDirFileLoc = locDict[dataDate][1]

    # Read the features file; the context manager closes the handle even if
    # unpickling fails.
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # from a trusted location.
    with open(featuresIdxDirFileLoc, "rb") as featuresFile:
        featuresTupleFile = pkl.load(featuresFile, encoding='latin1')
    dfFeatures = pd.concat(
        [featuresTupleFile[0], featuresTupleFile[1],
         featuresTupleFile[2], featuresTupleFile[3]],
        axis=1, sort=False).fillna(0)

    # Read the labels file and pull the label column out.
    labelsDf = pd.read_csv(labelsIdxDirFileLoc)
    labels = labelsDf['label_PrMov__window_5__thres_arbitrary__0.1']

    # Features and labels side by side (X and Y); rows with any NaN are
    # dropped. ``sort`` must be the boolean False: the string 'False' used
    # previously is truthy and is not a valid boolean flag.
    dfXY = pd.concat([dfFeatures, labels], axis=1, sort=False).dropna()

    # Locate the label column(s) once and reuse the mask below.
    isLabelColumn = dfXY.columns.str.contains(pat='label')
    labelName = str(dfXY.columns[isLabelColumn].values[0])

    # Drop the labels from the features.
    dfX = dfXY.drop(columns=[labelName])
    arrX = np.array(dfX)

    # Feature normalisation: rescale_01 first, then normalization.
    X = normalization(rescale_01(arrX))
    y = dfXY[dfXY.columns[isLabelColumn]].iloc[:, 0]

    return X, y, dataDate
y1 = feature1.iloc[:, 0].values X2 = feature2.iloc[:, 1:].values y2 = feature2.iloc[:, 0].values X3 = feature3.iloc[:, 1:].values y3 = feature3.iloc[:, 0].values X4 = feature4.iloc[:, 1:].values y4 = feature4.iloc[:, 0].values X5 = feature5.iloc[:, 1:].values y5 = feature5.iloc[:, 0].values X6 = feature6.iloc[:, 1:].values y6 = feature6.iloc[:, 0].values # Preprocess data from MKLpy.preprocessing import normalization, rescale_01 X1 = rescale_01(X1) # feature scaling in [0,1] X1 = normalization(X1) # ||X_i||_2^2 = 1 X2 = rescale_01(X2) X2 = normalization(X2) X3 = rescale_01(X3) X3 = normalization(X3) X4 = rescale_01(X4) X4 = normalization(X4) X5 = rescale_01(X5) X5 = normalization(X5) X6 = rescale_01(X6) X6 = normalization(X6) # # train/test X_tr_A, X_te_A, y_tr_A, y_te_A = train_test_split(X1, y1,
print('loading \'breast cancer\' dataset...', end='') from sklearn.datasets import load_breast_cancer ds = load_breast_cancer() X, Y = ds.data, ds.target print('done') ''' WARNING: be sure that your matrix is not sparse! EXAMPLE: from sklearn.datasets import load_svmlight_file X,Y = load_svmlight_file(...) X = X.toarray() ''' #preprocess data print('preprocessing data...', end='') from MKLpy.preprocessing import normalization, rescale_01 X = rescale_01(X) #feature scaling in [0,1] X = normalization(X) #||X_i||_2^2 = 1 #train/test split from sklearn.model_selection import train_test_split Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=.25, random_state=42) print('done') #compute homogeneous polynomial kernels with degrees 0,1,2,...,10. print('computing Homogeneous Polynomial Kernels...', end='') from MKLpy.metrics import pairwise KLtr = [ pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(11) ] KLte = [ pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=d)
def parallelised_function(file):
    """Cross-validate RBF EasyMKL models for every data set listed in ``file``.

    ``file`` is a file name inside ``jointFeatureLocation``. The pickled
    content is indexed as ``[hmm_date][feature_label_date]`` and each entry
    holds the features path at position 0 and the labels path at position 1.
    For each entry whose features file exists, HMM and market features are
    assembled, RBF kernels are generated, and a small (C, lambda) grid is
    scored with 5-fold cross validation.

    Args:
        file: File name; the text before the first underscore is the symbol.

    Returns:
        None. Progress and scores are printed; nothing is persisted (the
        pickle dump of the results is still disabled).
    """
    select_file_path = os.path.join(jointFeatureLocation, file)
    symbol = file.split("_")[0]
    print('Symbol:----->', symbol)

    # The label/feature date and the label index are read from fixed
    # underscore-separated positions of the full path.
    path_fields = select_file_path.split("_")
    select_feature_label_date = path_fields[6]
    select_label_idx = path_fields[9]

    unpickled_select_file = open_pickle_filepath(select_file_path)

    for hmm_date_key in sorted(unpickled_select_file.keys()):
        date_entries = unpickled_select_file[hmm_date_key]
        for feature_label_date in sorted(date_entries.keys()):
            features_file_path = date_entries[feature_label_date][0]
            labels_file_path = date_entries[feature_label_date][1]

            if not os.path.isfile(features_file_path):
                print('PROBLEM----->in one of of your locations')
                continue

            print('ok----->', feature_label_date)
            labels = pd.read_csv(labels_file_path)
            label_name = str(
                labels.columns[labels.columns.str.contains(
                    pat='label')].values[0])
            features = open_pickle_filepath(features_file_path)
            hmm_features = nfu.hmm_features_df(features)
            print('loaded features and labels ')

            # Nothing to learn from if every HMM feature is null.
            if hmm_features.isnull().values.all():
                continue

            # Market features are built by chaining four CreateMarketFeatures
            # steps, each one consuming the previous step's output.
            step = CreateMarketFeatures(df=labels).ma_spread_duration()
            step = CreateMarketFeatures(df=step).ma_spread()
            step = CreateMarketFeatures(step).chaikin_mf()
            market_features_df = CreateMarketFeatures(step).obv_calc()

            # ``sort`` must be the boolean False: the string 'False' used
            # previously is truthy and is not a valid boolean flag.
            df_concat = pd.DataFrame(
                pd.concat([hmm_features, market_features_df],
                          axis=1, sort=False).dropna())
            df = df_concat[df_concat[label_name].notna()]
            df_final = df.drop(columns=[
                'TradedPrice', 'Duration', 'TradedTime',
                'ReturnTradedPrice', 'Volume', label_name
            ])
            y_train = df[df.columns[df.columns.str.contains(
                pat='label')]].iloc[:, 0]  # training labels

            # Fewer than 10 usable rows is not worth fitting.
            if df_final.shape[0] < 10:
                print(
                    ' the ratio of classes is too low. try another label permutation'
                )
                continue

            print("starting model fit")
            # Features and labels as torch tensors.
            X = np.asarray(df_final.values)
            Xtr = normalization(rescale_01(torch.Tensor(X)))
            Ytr = torch.Tensor(y_train.values)
            print('-----------------first bit done------------------')

            KLrbf = generators.RBF_generator(Xtr, gamma=[.01, .1, .25, .5])
            print('done with kernel')

            best_results = {}
            C_range = [0.1, 1]
            lam_range = [0.2]
            try:
                for C_choice in C_range:
                    base_learner = SVC(C=C_choice)
                    for lam in lam_range:
                        # cross_val_score runs the cross validation itself.
                        scores = cross_val_score(
                            KLrbf, Ytr,
                            EasyMKL(learner=base_learner, lam=lam),
                            n_folds=5, scoring='accuracy')
                        acc = np.mean(scores)
                        if not best_results or best_results['score'] < acc:
                            best_results = {
                                'C': C_choice,
                                'lam': lam,
                                'score': acc,
                                'scores': scores
                            }
                print('done')

                # Refit with the best (C, lambda) pair.
                best_learner = SVC(C=best_results['C'])
                clf = EasyMKL(learner=best_learner,
                              lam=best_results['lam']).fit(KLrbf, Ytr)
                # NOTE(review): predictions are made on the training kernels,
                # so despite the message this is training accuracy.
                y_pred = clf.predict(KLrbf)
                accuracy = accuracy_score(Ytr, y_pred)
                print('accuracy on the test set: %.3f, with lambda=%.2f' %
                      (accuracy, best_results['lam']))
                print(scores)

                # TODO: best_results is not persisted yet; this is the
                # intended target path for the pickle dump.
                pickle_out_filename = os.path.join(
                    mainPath, "ExperimentCommonLocs/CrossValidationResults",
                    "_".join((symbol, 'feature_label_date',
                              str(select_feature_label_date),
                              str(select_label_idx), 'hmm_date:',
                              hmm_date_key, 'RBF', 'MultiKernelSVC.pkl')))
            except ValueError as err:
                # Best effort: report the failure instead of hiding it, then
                # move on to the next feature/label date.
                print('skipping', feature_label_date, 'after ValueError:', err)
                continue
def Learning_curve_using_weather_data(
        data_path=r"D:\CVProject\CBAM-keras-master\handcraft\features_with_pca.mat",
        results_path=r"D:\CVProject\CBAM-keras-master\handcraft\results\learning_curve_results_0202_01.txt",
        fractions=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)):
    """Compute an EasyMKL learning curve on the weather feature file.

    For every fraction, the leading share of the samples is handed to
    ``bulid_kernel_transform``; an EasyMKL classifier (one-vs-all) is fitted on
    the returned training kernels and its train and test accuracy are printed
    and appended to the results file.

    Args:
        data_path: ``.mat`` file read with ``io.loadmat``; its ``'array'``
            entry holds one sample per row with the label in the last column.
        results_path: Text file the accuracy lines are written to; it is
            overwritten on every call.
        fractions: Shares of the data set to evaluate, in order.

    Returns:
        None.
    """
    from MKLpy.algorithms import EasyMKL
    from MKLpy.preprocessing import normalization, rescale_01
    # Imported locally so the function does not rely on a module-level import
    # that is not visible from here.
    from sklearn.metrics import accuracy_score
    from sklearn.svm import SVC

    print('loading dataset...', end='')
    samples = io.loadmat(data_path)['array']
    length = len(samples[0])
    # Every column but the last is a feature; the last column is the label.
    X, Y = samples[:, 0:length - 1], samples[:, -1]
    print('done')

    print('preprocessing data...', end='')
    X = rescale_01(X)  # feature scaling in [0,1]
    X = normalization(X)  # ||X_i||_2^2 = 1
    print('done')

    print("Build a base learner")
    base_learner = SVC(C=20)
    print("Build EasyMKL classifier")

    # The context manager guarantees the results file is flushed and closed,
    # even if one of the fits raises.
    with open(results_path, "w") as results_data:
        for elem in fractions:
            learn_count = int(elem * X.shape[0])
            KLtr, KYtr, KLte, KYte = bulid_kernel_transform(
                X[:learn_count], Y[:learn_count])
            train_count, test_count = len(KYtr), len(KYte)

            clf = EasyMKL(lam=0.1,
                          multiclass_strategy='ova',
                          learner=base_learner).fit(KLtr, KYtr)
            train_set_accuracy = accuracy_score(KYtr, clf.predict(KLtr))
            tests_et_accuracy = accuracy_score(KYte, clf.predict(KLte))

            sizes_text = "Test on {0} train samples and {1} test samples,".format(
                train_count, test_count)
            accuracy_text = (
                'accuracy on the train set: %.3f and accuracy on the test set : %.3f'
                % (train_set_accuracy, tests_et_accuracy))

            # Display the results ...
            print(sizes_text, end="")
            print(accuracy_text)
            # ... and save the same two pieces of text to the results file.
            print(sizes_text, end="", file=results_data)
            print(accuracy_text, file=results_data)

    print('done')
def _report_multiclass_predictions(clf, KLte, Yte):
    """Print accuracy, macro recall and confusion matrix of ``clf`` on ``KLte``."""
    from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

    y_pred = clf.predict(KLte)  # predictions
    accuracy = accuracy_score(Yte, y_pred)
    print('Accuracy score: %.4f' % (accuracy))
    recall = recall_score(Yte, y_pred, average='macro')
    print('Recall score: %.4f' % (recall))
    cm = confusion_matrix(Yte, y_pred)
    print('Confusion matrix', cm)


def MultiView_learning():
    """MultiView learning.

    Loads the feature file, rescales and normalises the features, splits them
    90/10 into train and test, builds one RBF kernel per feature view and then
    trains and evaluates EasyMKL twice: with the one-vs-all and with the
    one-vs-one multiclass strategy. All results are printed.

    Returns:
        None.
    """
    from MKLpy.algorithms import EasyMKL
    from MKLpy.generators import Multiview_generator
    from MKLpy.metrics import pairwise
    from MKLpy.preprocessing import normalization, rescale_01
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    print('loading dataset...', end='')
    training_data = io.loadmat(
        r"D:\CVProject\CBAM-keras-master\handcraft\features_with_pca_file_0202.mat"
    )
    samples = training_data['array']
    length = len(samples[0])
    # The last two columns are left out of the features; the very last column
    # is the label.
    X, Y = samples[:, 0:length - 2], samples[:, -1]
    print('done')

    print('preprocessing data...', end='')
    X = rescale_01(X)  # feature scaling in [0,1]
    X = normalization(X)  # ||X_i||_2^2 = 1

    Xtr, Xte, Ytr, Yte = train_test_split(X,
                                          Y,
                                          test_size=.1,
                                          random_state=42,
                                          shuffle=True)
    print(numpy.array(Xtr).shape)
    print(numpy.array(Ytr).shape)
    print('done')
    print('Training on {0} samples, Testing on {1} samples'.format(
        len(Xtr), len(Xte)))

    print('computing RBF Kernels...', end='')
    # Column ranges of the feature views. One table drives both the train and
    # the test split so the two can never drift apart.
    view_slices = [
        slice(None, 2),    # time
        slice(2, 92),      # color
        slice(92, 124),    # Gabor
        slice(124, 156),   # lbp
        slice(156, 348),   # cloud
        slice(348, 432),   # haze
        slice(432, 603),   # contrast
        slice(603, 606),   # shadow
        slice(606, 608),   # snow
        slice(608, None),  # pca
    ]
    views_tr = [numpy.array(Xtr[:, columns]) for columns in view_slices]
    views_te = [numpy.array(Xte[:, columns]) for columns in view_slices]

    KLtr = Multiview_generator(views_tr, kernel=pairwise.rbf_kernel)
    KLte = Multiview_generator(views_te, views_tr, kernel=pairwise.rbf_kernel)
    print('done')

    base_learner = SVC(C=8)

    print('training EasyMKL with one-vs-all multiclass strategy...', end='')
    clf = EasyMKL(lam=0.1, multiclass_strategy='ova',
                  learner=base_learner).fit(KLtr, Ytr)
    print('done')  # terminates the progress line, as the one-vs-one run does
    print('the combination weights are:')
    for sol in clf.solution:
        print('(%d vs all): ' % sol, clf.solution[sol].weights)
    _report_multiclass_predictions(clf, KLte, Yte)

    print('training EasyMKL with one-vs-one multiclass strategy...', end='')
    clf = EasyMKL(lam=0.1, multiclass_strategy='ovo',
                  learner=base_learner).fit(KLtr, Ytr)
    print('done')
    print('the combination weights are:')
    for sol in clf.solution:
        print('(%d vs %d): ' % (sol[0], sol[1]), clf.solution[sol].weights)
    _report_multiclass_predictions(clf, KLte, Yte)
def test_rescale_01(self):
    """``preprocessing.rescale_01`` output spans [0, 1] and has shape (5, 4)."""
    rescaled = preprocessing.rescale_01(self.X)

    # Minimum first, then maximum, each compared with its expected bound.
    for reducer, expected in ((rescaled.min, 0), (rescaled.max, 1)):
        self.assertAlmostEqual(reducer(), expected)

    self.assertEqual(rescaled.shape, (5, 4))
# Load the breast-cancer data set that ships with scikit-learn and split it
# into the sample matrix X and the target vector Y.
from sklearn.datasets import load_breast_cancer
ds = load_breast_cancer()
X,Y = ds.data, ds.target
print ('done')

''' WARNING: be sure that your matrix is not sparse! EXAMPLE: from sklearn.datasets import load_svmlight_file X,Y = load_svmlight_file(...) X = X.toarray() '''

# Preprocess data: rescale first, then normalise.
print ('preprocessing data...', end='')
from MKLpy.preprocessing import normalization, rescale_01
X = rescale_01(X)	# feature scaling in [0,1]
X = normalization(X) # ||X_i||_2^2 = 1

# Train/test split: 25% of the samples are held out; the seed is fixed so the
# split is reproducible.
from sklearn.model_selection import train_test_split
Xtr,Xte,Ytr,Yte = train_test_split(X,Y, test_size=.25, random_state=42)
print ('done')

# Compute homogeneous polynomial kernels with degrees 0,1,2,...,10.
# KLtr holds the train-vs-train kernels, KLte the test-vs-train kernels; both
# lists are ordered by degree.
print ('computing Homogeneous Polynomial Kernels...', end='')
from MKLpy.metrics import pairwise
KLtr = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(11)]
KLte = [pairwise.homogeneous_polynomial_kernel(Xte,Xtr, degree=d) for d in range(11)]
print ('done')
for symbol in ['ECM.L']: print(symbol) # which symbol - unnecessary at this point cross_validation_data_location = cross_validation_results_location(symbol) clean_data_location = storage_location(symbol) for alternate_label_idx in range(0, 4): alternate_label = nalsvm.labels_pickle_files[alternate_label_idx] print(alternate_label) file_to_load = os.path.join(clean_data_location, os.listdir(clean_data_location)[alternate_label_idx]) pkl_file = load_pickled_in_filename(file_to_load) date_keys = list(pkl_file.keys()) print('--------------->') for date in date_keys: # date is model fit-date i.e the date we pick up to fit the training model in CV print(date) start = time.time() nalsvm.logmemoryusage("Before garbage collect") Xtr = normalization(rescale_01(torch.Tensor(pkl_file[date][0].values))) Ytr = torch.Tensor(pkl_file[date][1].values) print('first bit done') nalsvm.gc.collect() KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1]) print('done with kernel') nalsvm.gc.collect() try: lam_values = [0, 0.1, 0.2, 1] C_values = [0.01, 1, 10, 100] print(C_values) for lam, C in product(lam_values, C_values): print('now here', C, lam) svm = SVC(C=C) mkl = EasyMKL(lam=lam, learner=svm) scores = cross_val_score(KLrbf, Ytr, mkl, n_folds=3, scoring='accuracy')
model_dates = model_dates_list(return_cross_val_symbol_path(symbol)) # location of data -->dataDrive, Clean Data Storage, and label. pkl_file = load_pickled_in_filename( os.path.join(clean_data_location, os.listdir(clean_data_location)[alternate_label_idx])) date_keys = list(pkl_file.keys()) # list of out of sample dates model_date = '20170705' # <-need to replace this with file reading model dates above for model_date in model_dates: forward_dates = nalsvm.forwardDates(date_keys, model_date) print('---------------> Doing Model Date:', model_date) try: # put the features in a tensor format Xtr = normalization( rescale_01(torch.Tensor( pkl_file[model_date][0].values))) # fitting model # put the labels in a tensor format Ytr = torch.Tensor(pkl_file[model_date][1].values) print('first bit done') # force garbage collect nalsvm.gc.collect() # kernels KLrbf = generators.RBF_generator(Xtr, gamma=[.001, .01, .1]) # dont need the next bit print('done with kernel') print(forward_dates) # base learner- use c =1 or 10 # the c and lambda values need to be picked up by the cross-val results ! base_learner = SVC(C=10) clf = EasyMKL(lam=0.2,
import numpy as np import pandas as pd from matplotlib import pyplot as plt from sklearn.model_selection import train_test_split # load the dataset feature1 = pd.read_csv('..............................') X1 = feature1.iloc[:, 1:].values y1 = feature1.iloc[:, 0].values # Preprocess data from MKLpy.preprocessing import normalization, rescale_01 X1 = rescale_01(X1) X1 = normalization(X1) # # train/test X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X1, y1, test_size=0.3, random_state=42) # Applying Polynomial kernel from MKLpy.metrics import pairwise k1 = [ pairwise.homogeneous_polynomial_kernel(X_train_A, degree=d) for d in range(5) ] k11 = [
model_dates = model_dates_list(return_cross_val_symbol_path(symbol)) # location of data -->dataDrive, Clean Data Storage, and label. pkl_file = load_pickled_in_filename( os.path.join(clean_data_location, os.listdir(clean_data_location)[alternate_label_idx])) date_keys = list(pkl_file.keys()) # model_date = '20170704' # list of out of sample dates for model_date in model_dates: start = time.time() # forward_dates = nalsvm.forwardDates(date_keys, model_date) print('---------------> Doing Model Date:', model_date) # put the features in a tensor format Xtr = rescale_01(torch.Tensor(pkl_file[model_date][0].values)) Xtr = normalization(Xtr) # fitting model # put the labels in a tensor format Ytr = torch.Tensor(pkl_file[model_date][1].values) # try: KLtr_poly = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(6)] deg = 5 K = KLtr_poly[deg] # the HPK with degree 5 # K is always a squared kernel matrix, i.e. it is not the kernel computed between test and training # examples. kernel_evaluation_dict = kernel_evaluation(K) print('done') print('results of the %d-degree HP kernel:' % deg) print('margin: %.4f, radius: %.4f, radiu-margin ratio: %.4f,' % (kernel_evaluation_dict['score_margin'], kernel_evaluation_dict['score_radius'], kernel_evaluation_dict['score_ratio']))
def fitting_function_mkl(key):
    """Cross-validate EasyMKL on polynomial kernels for one label file.

    The labels CSV for ``key`` is combined with the HMM features stored under
    ``symbol_feature_paths[key]`` and with derived market features. A
    (C, lambda) grid is scored with 5-fold cross validation on the training
    split, the scores are recorded in ``cv_dict_list`` and that mapping is
    pickled to the fitted-models folder.

    Args:
        key: Name of the labels CSV without the ``.csv`` extension; also the
            lookup key into ``symbol_feature_paths``.

    Returns:
        None.
    """
    print('For key: ', key, '############')
    labels_file_path = os.path.join(
        symbolData.symbol_specific_label_path(label_idx), key + ".csv")
    labels_file_exists = os.path.isfile(labels_file_path)
    print(labels_file_exists)
    if not labels_file_exists:
        return

    print(" reading labels")
    labels = pd.read_csv(labels_file_path)
    label_name = str(
        labels.columns[labels.columns.str.contains(pat='label')].values[0])
    logmemoryusage("Before garbage collect")
    hmm_features = nfu.hmm_features_df(
        open_pickle_filepath(symbol_feature_paths[key]))

    # Nothing to learn from if every HMM feature is null.
    if hmm_features.isnull().values.all():
        print('lots of NaNs on features')
        return

    print("can train")
    # Market features are built by chaining four CreateMarketFeatures steps,
    # each one consuming the previous step's output.
    step = CreateMarketFeatures(df=labels).ma_spread_duration()
    step = CreateMarketFeatures(df=step).ma_spread()
    step = CreateMarketFeatures(step).chaikin_mf()
    market_features_df = CreateMarketFeatures(step).obv_calc()

    # ``sort`` must be the boolean False: the string 'False' used previously
    # is truthy and is not a valid boolean flag.
    df_concat = pd.DataFrame(
        pd.concat([hmm_features, market_features_df], axis=1,
                  sort=False).dropna())
    df = df_concat[df_concat[label_name].notna()]
    df_final = df.drop(columns=[
        'TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice',
        'Volume', label_name
    ])
    y_train = df.reindex(columns=df.columns[df.columns.str.contains(
        pat='label')])  # training labels
    print('go to the labels')

    # Fewer than 10 usable rows is not worth fitting.
    if df_final.shape[0] < 10:
        print(
            ' the ratio of classes is too low. try another label permutation'
        )
        return

    print("starting model fit")
    Xtr, Xte, Ytr, Yte = train_test_split(df_final,
                                          y_train,
                                          test_size=.2,
                                          random_state=42)
    # training
    arrXtr = np.array(Xtr)
    X_tr = normalization(rescale_01(arrXtr))
    Y_tr = torch.Tensor(Ytr.values.ravel())
    # testing
    # NOTE(review): the held-out tensors and kernels below are built but not
    # used by the cross validation that follows.
    arrXte = np.array(Xte)
    X_te = normalization(rescale_01(arrXte))
    Y_te = torch.Tensor(Yte.values.ravel())

    # Degrees 1..10 plus an identity kernel on the training side; the test
    # side gets an all-zero kernel in the matching position.
    KLtr = [
        pairwise.homogeneous_polynomial_kernel(X_tr, degree=d)
        for d in range(1, 11)
    ] + [identity_kernel(len(Y_tr))]
    KLte = [
        pairwise.homogeneous_polynomial_kernel(X_te, X_tr, degree=d)
        for d in range(1, 11)
    ]
    KLte.append(torch.zeros(KLte[0].size()))
    print('done with kernel')

    try:
        lam_values = [0.1, 0.2, 1]
        best_results = {}
        C_range = [0.1, 1]
        for C_ch in C_range:
            print(' fitted the base learner')
            # possible lambda values for the EasyMKL algorithm
            for lam in lam_values:
                print('now here', lam)
                print(' and tuning lambda for EasyMKL...', end='')
                base_learner = SVC(C=C_ch)
                # cross_val_score runs the cross validation itself.
                scores = cross_val_score(KLtr,
                                         Y_tr,
                                         EasyMKL(learner=base_learner,
                                                 lam=lam),
                                         n_folds=5,
                                         scoring='accuracy')
                acc = np.mean(scores)
                if not best_results or best_results['score'] < acc:
                    # Record C as well, so the winning pair can be rebuilt.
                    best_results = {'lam': lam, 'C': C_ch, 'score': acc}
                print('done', best_results)
                cv_dict_list[(symbol, hmm_date, label_idx)][(lam, C_ch)] = [
                    scores, best_results
                ]
                print(cv_dict_list)

        pickle_out_filename = os.path.join(
            mainPath, "ExperimentCommonLocs/MKLFittedModels", "_".join(
                (symbol, 'model_fit_date', str(key),
                 str(alternate_labels_nos[label_idx]),
                 'MultiKernelSVC.pkl')))
        print(pickle_out_filename)
        # The context manager closes the file even if pickling fails.
        with open(pickle_out_filename, 'wb') as pickle_out:
            pickle.dump(cv_dict_list, pickle_out)
    except (ValueError, TypeError, EOFError) as err:
        # Best effort: report the failure instead of hiding it.
        print('model fit failed for key', key, ':', repr(err))