def __init__(self, x, n_components, nkernel_components=100, kernel='rbf',
             preprocess=True, gamma=None, coef0=1, degree=3):
    self.x = x
    self.kernel = kernel
    self.n_components = n_components
    self.preprocess = preprocess
    self.gamma = gamma
    self.coef0 = coef0
    self.degree = degree
    self.nkernel_components = nkernel_components
    self.kX = ks.Nystroem(kernel=self.kernel, gamma=self.gamma,
                          coef0=self.coef0, degree=self.degree,
                          n_components=self.nkernel_components)
    self.Xkernel = self.kX.fit_transform(x)
    if self.preprocess:
        self.Xscaler = preprocessing.StandardScaler().fit(self.Xkernel)
        self.Xkernel = self.Xscaler.transform(self.Xkernel)
def convert_to_kernel(self, X0):
    # Fit a fresh Nystroem map (and, optionally, a fresh scaler) on X0.
    kX = ks.Nystroem(kernel=self.kernel, gamma=self.gamma, coef0=self.coef0,
                     degree=self.degree, n_components=self.nkernel_components)
    Xkernel = kX.fit_transform(X0)
    if self.preprocess:
        Xscaler = preprocessing.StandardScaler().fit(Xkernel)
        Xkernel = Xscaler.transform(Xkernel)
        self.Xscaler = Xscaler
    self.Xkernel = Xkernel
    self.X0 = X0
    self.kX = kX
    return Xkernel
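# Note: convert_to_kernel() above fits a *new* Nystroem map (and scaler) on
# X0, so the resulting feature space differs from the one fitted in
# __init__. If new data should instead land in the training feature space,
# a transform-only variant could look like the sketch below (hypothetical
# method name, not part of the original class):
def transform_with_training_kernel(self, X0):
    Xkernel = self.kX.transform(X0)  # reuse the landmarks fitted in __init__
    if self.preprocess:
        Xkernel = self.Xscaler.transform(Xkernel)  # reuse the training scaler
    return Xkernel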
def __init__(self, x, y, copy=True, max_iter=500, n_components=1,
             nkernel_components=100, scale=True, tol=1e-06, kernel='linear',
             preprocess=True, gamma=None, coef0=1, degree=3):
    super(MspmKernelPartialLeastSquares, self).__init__(
        copy=copy, max_iter=max_iter, n_components=n_components,
        scale=scale, tol=tol)
    self.x = x
    self.y = y
    self.kernel = kernel
    self.preprocess = preprocess
    self.gamma = gamma
    self.coef0 = coef0
    self.degree = degree
    self.nkernel_components = nkernel_components
    self.kX = ks.Nystroem(kernel=self.kernel, gamma=self.gamma,
                          coef0=self.coef0, degree=self.degree,
                          n_components=self.nkernel_components)
    self.Xkernel = self.kX.fit_transform(x)
    if self.preprocess:
        self.Xscaler = preprocessing.StandardScaler().fit(self.Xkernel)
        self.Xkernel = self.Xscaler.transform(self.Xkernel)
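# A minimal usage sketch for the class above (a hedged illustration, not from
# the original code): `ks` is assumed to be sklearn.kernel_approximation, the
# parent class is assumed to be a sklearn PLS estimator as in the original
# module, and the toy data below is random.
import numpy as np

rng = np.random.RandomState(0)
x_demo = rng.randn(200, 10)
y_demo = rng.randn(200, 1)
kpls = MspmKernelPartialLeastSquares(x_demo, y_demo, n_components=2,
                                     nkernel_components=50, kernel='rbf')
print(kpls.Xkernel.shape)  # (200, 50): Nystroem features, standardized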
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` in the search list asks for estimator evaluation.
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(
                    categorical_features=[], random_state=0, n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj,))

            search_params[param_name] = newlist

    return search_params
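# Illustrative input for _eval_search_params (an assumed shape, matching the
# keys the function reads above): each entry supplies a parameter name and a
# search list, and a leading ':' switches to estimator-list evaluation. The
# parameter names and values here are hypothetical.
params_builder_demo = {
    'param_set': [
        {'sp_name': 'estimator__alpha', 'sp_list': '[0.0001, 0.001, 0.01]'},
        {'sp_name': 'preprocessing', 'sp_list': ": ['sk_prep_all']"},
    ]
}
search_params_demo = _eval_search_params(params_builder_demo)
# -> {'estimator__alpha': [...], 'preprocessing': [StandardScaler(), ...]}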
tab_score = np.array([
    apply_svm_cross_validation(
        X_train, y_train,
        {'loss': 'hinge', 'penalty': penalty, 'max_iter': 1000,
         'alpha': 1e-9, 'tol': 1e-3, 'random_state': 123456,
         'class_weight': None})
    for penalty in tab_penalty])
print(tab_score)

plt.plot(tab_penalty, tab_score[:, 0])
plt.plot(tab_penalty, tab_score[:, 1])
plt.plot(tab_penalty, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Regularization Penalty")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_svm_penalty.pdf", format="pdf")
plt.show()

# Approximate a degree-2 polynomial kernel with Nystroem, then train a linear
# hinge-loss SGD classifier on the mapped features.
feature_map_nystroem = kernel_approximation.Nystroem(
    kernel='poly', gamma=None, degree=2, n_components=200,
    random_state=123456)
feature_map_nystroem.fit(X_train)
X_train_new = feature_map_nystroem.transform(X_train)
X_test_new = feature_map_nystroem.transform(X_test)

clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', max_iter=1000,
                                 alpha=1e-9, tol=1e-3, random_state=123456,
                                 class_weight=None, verbose=1)
clf.fit(X_train_new, y_train)

print("Train")
y_pred_train = clf.predict(X_train_new)
print("balanced accuracy score = ",
      metrics.balanced_accuracy_score(y_train, y_pred_train))
precision, recall, fbeta_score, support = \
    metrics.precision_recall_fscore_support(y_train, y_pred_train)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])
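# `apply_svm_cross_validation` is called above but not defined in this
# excerpt. A minimal sketch of one plausible implementation, assuming it
# wraps sklearn.model_selection.cross_validate around an SGD hinge-loss SVM
# and returns mean test precision, recall, and F1 (matching the three columns
# of tab_score plotted above):
from sklearn import linear_model, model_selection

def apply_svm_cross_validation(X, y, params, cv=5):
    clf = linear_model.SGDClassifier(**params)
    scores = model_selection.cross_validate(
        clf, X, y, cv=cv, scoring=('precision', 'recall', 'f1'))
    return (scores['test_precision'].mean(),
            scores['test_recall'].mean(),
            scores['test_f1'].mean())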
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert len(lst) == 2, (
            "Error: make sure there is one and only one colon in the "
            "search parameter input.")
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name] = ev
            else:
                # Only for estimator evaluation; a trailing `-` is appended
                # to the parameter name.
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(
                    random_state=0, n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(
                    categorical_features=[], random_state=0, n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0),
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj,))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be "
                     "skipped!")

    return search_params
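# Downstream use (a hedged sketch, not from the original code): the returned
# dict is keyed by pipeline step names such as 'preprocessing_1' and
# 'estimator', so it can feed a parameter search over a matching Pipeline.
# The pipeline and step names below are illustrative assumptions only.
from sklearn import linear_model, preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([('preprocessing_1', preprocessing.StandardScaler()),
                 ('estimator', linear_model.SGDClassifier(random_state=0))])
searcher = GridSearchCV(pipe, param_grid=get_search_params(params_builder),
                        cv=5, n_jobs=N_JOBS)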
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(
    X, y, test_size=.5, random_state=42, stratify=y)
log(f"Elapsed: {(arrow.now() - lap_time).seconds // 60}:"
    f"{(arrow.now() - lap_time).seconds % 60} minutes.")

# sgd = svm.SVC()
logreg = linear_model.LogisticRegression()
logreg.fit(X_tr, y_tr)
y_pred = logreg.predict(X_te)
log(metrics.classification_report(y_te, y_pred))

# for _ in range(1):
#     log(f"Elapsed: {(arrow.now() - lap_time).seconds // 60}:"
#         f"{(arrow.now() - lap_time).seconds % 60} minutes.")
#     GBC.fit(X_tr, y_tr)
#     print('train done')
#     log(f"Elapsed: {(arrow.now() - lap_time).seconds // 60}:"
#         f"{(arrow.now() - lap_time).seconds % 60} minutes.")
#     print(metrics.classification_report(GBC.predict(X_te), y_te))
#     log(f"Elapsed: {(arrow.now() - lap_time).seconds // 60}:"
#         f"{(arrow.now() - lap_time).seconds % 60} minutes.")

# Approximate an RBF-kernel SVM: Nystroem feature map + linear SGD hinge loss.
svm_sgd = linear_model.SGDClassifier(max_iter=500, tol=1e-3)
svm_sgd = pipeline.make_pipeline(
    kernel_approximation.Nystroem(kernel='rbf'), svm_sgd)
svm_sgd.fit(X_tr, y_tr)
y_pred = svm_sgd.predict(X_te)
log(metrics.classification_report(y_te, y_pred))
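# A quick sanity check on the Nystroem map used in the pipeline above (a
# hedged addition, not in the original code): compare the exact RBF Gram
# matrix with its low-rank Nystroem approximation on a subsample. The helper
# name is hypothetical.
import numpy as np
from sklearn import kernel_approximation
from sklearn.metrics.pairwise import rbf_kernel

def nystroem_rel_error(X, n_components=100, gamma=None):
    nys = kernel_approximation.Nystroem(kernel='rbf', gamma=gamma,
                                        n_components=n_components)
    Z = nys.fit_transform(X)              # (n_samples, n_components)
    K_exact = rbf_kernel(X, gamma=gamma)  # (n_samples, n_samples)
    K_approx = Z @ Z.T
    return np.linalg.norm(K_exact - K_approx) / np.linalg.norm(K_exact)

# e.g. nystroem_rel_error(X_tr[:500]) -- the error shrinks as n_components
# grows toward the subsample size.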
start = time.time()
for i in range(10):
    sklearn.base.clone(reducer).fit_transform(x_pipe, y_pipe)
end = time.time()
duration = end - start
print("reducer", duration)  # measured: ~2 s

reducer.fit(x_pipe, y_pipe)
x_pipe = reducer.transform(x_pipe)

# Approximator: NYS_kernel, NYS_gamma, NYS_n_components
# NYS, rbf, default, 100
approximator = kernel_approximation.Nystroem()
approximator.set_params(kernel='rbf', gamma=None, n_components=100)
start = time.time()
for i in range(10):
    sklearn.base.clone(approximator).fit_transform(x_pipe, y_pipe)
end = time.time()
duration = end - start
print("approximator", duration)  # measured: ~0.24 s

approximator.fit(x_pipe, y_pipe)
x_pipe = approximator.transform(x_pipe)
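# The two timing blocks above repeat the same pattern; a small helper (a
# hedged refactor sketch, not in the original code) using time.perf_counter
# for better timer resolution. The function name is hypothetical.
import time
import sklearn.base

def time_fit_transform(transformer, X, y=None, repeats=10):
    start = time.perf_counter()
    for _ in range(repeats):
        sklearn.base.clone(transformer).fit_transform(X, y)
    return time.perf_counter() - start

# e.g. print("approximator", time_fit_transform(approximator, x_pipe, y_pipe))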