class SVMClassifier(ClassifierI):
    """Wrapper for the scikit-learn SVM classifier."""

    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 shrinking=True, probability=False, tol=1e-3, cache_size=200,
                 class_weight=None, verbose=False, max_iter=-1,
                 decision_function_shape=None, random_state=None):
        """Init. See scikit-learn."""
        # Pass C through instead of hard-coding C=1, so the constructor
        # argument actually takes effect.
        self._clf = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                        coef0=coef0, shrinking=shrinking, probability=probability,
                        tol=tol, cache_size=cache_size, class_weight=class_weight,
                        verbose=verbose, max_iter=max_iter,
                        decision_function_shape=decision_function_shape,
                        random_state=random_state)
        self.classes_ = None

    def __repr__(self):
        return "<SVMClassifier(%r)>" % self._clf

    def classify_many(self, vectors):
        """Classify a batch of samples.

        :param vectors: A doc-term array of vectors
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        classes = self.classes_
        return [classes[i] for i in self._clf.predict(vectors)]

    def prob_classify_many(self, vectors):
        """Compute per-class probabilities for a batch of samples.

        :param vectors: A doc-term array of vectors
        :rtype: list of dicts mapping each label to its probability
        """
        y_proba_list = self._clf.predict_proba(vectors)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels learned by this classifier.

        :rtype: list
        """
        return list(self.classes_)

    def train(self, vectors, labels):
        """Train (fit) the scikit-learn SVM classifier.

        :param vectors: a doc-term array of vectors to learn from
        :param labels: a list of labels corresponding to the rows of the
            doc-term array.
        """
        self.classes_, labels = np.unique(labels, return_inverse=True)
        self._clf.fit(vectors, labels)
        return self

    def _make_probdist(self, y_proba):
        classes = self.classes_
        return dict((classes[i], p) for i, p in enumerate(y_proba))
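# --- Usage sketch (added for illustration) ---
# A minimal, hypothetical example of driving the SVMClassifier wrapper above.
# The toy doc-term matrix, labels, and parameter choices are assumptions, not
# part of the original project; it only assumes the class and its imports
# (numpy as np, sklearn's SVC, and the ClassifierI base) are in scope.
import numpy as np

train_vectors = np.array([[1, 0, 2], [0, 1, 0], [3, 0, 1], [0, 2, 1]])
train_labels = ["spam", "ham", "spam", "ham"]

clf = SVMClassifier(C=10.0, kernel="linear", probability=True)
clf.train(train_vectors, train_labels)

print(clf.labels())                         # e.g. ['ham', 'spam']
print(clf.classify_many([[2, 0, 1]]))       # e.g. ['spam']
print(clf.prob_classify_many([[2, 0, 1]]))  # [{'ham': ..., 'spam': ...}] (needs probability=True)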
class Clf(object):
    # Named classifier presets. class_weight='auto' and loss='l1' are the
    # legacy spellings used by older scikit-learn releases; newer versions
    # call these 'balanced' and 'hinge'.
    SVC_RBF = SVC(kernel='rbf', class_weight=None, random_state=0)
    SVC_RBF_CW = SVC(kernel='rbf', class_weight='auto', random_state=0)
    LINEAR_L1 = LinearSVC(loss='l1', random_state=0, class_weight=None)
    LINEAR_L1_CW = LinearSVC(loss='l1', random_state=0, class_weight='auto')
    LINEAR_SVC = SVC(kernel='linear', random_state=0, class_weight='auto')
    TREE = DecisionTreeClassifier(random_state=0)
    RF = RandomForestClassifier(random_state=0)
    MAJORITY = DummyClassifier(strategy='most_frequent')
    RANDOM = DummyClassifier(strategy='stratified')
    ADABOOST = AdaBoostClassifier(random_state=0)
    LR = LogisticRegression()
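# --- Usage sketch (added for illustration) ---
# One plausible way these presets are consumed is a side-by-side benchmark.
# The dataset, metric, and the subset of presets below are assumptions made
# only for illustration (the presets using the legacy 'auto'/'l1' spellings
# are skipped because they no longer run on current scikit-learn).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score

X, y = load_breast_cancer(return_X_y=True)

for name in ("SVC_RBF", "TREE", "RF", "MAJORITY"):
    estimator = getattr(Clf, name)
    # cross_val_score clones the estimator, so sharing the class-level
    # instances across runs is safe.
    scores = cross_val_score(estimator, X, y, cv=5, scoring="f1")
    print("%-10s mean F1 = %.3f" % (name, scores.mean()))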
def train_all(self, g):
    X = np.concatenate([self.train_X, self.val_X], axis=0)
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    for i in range(3):
        # Binarize the labels: class i + 1 vs. the rest (one-vs-rest).
        y = np.concatenate([self.train_y, self.val_y], axis=0)
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model_a.append(clf.fit(X, y))
def SVCClassify(self, x_train, y_train):
    '''
    Basic Support Vector Machine Classifier
    '''
    # The kernel can be changed here.
    kernel = 'rbf'

    # Initialize the classifier and train it.
    # probability=True is required if predict_proba results are needed later.
    clf = SVC(kernel=kernel, probability=True)
    clf.fit(x_train, y_train)

    return clf
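# --- Usage sketch (added for illustration) ---
# Hypothetical call to the helper above on a toy dataset; `self` is unused by
# the method, so the sketch passes a placeholder. The data and outputs shown
# are assumptions, not from the original project.
import numpy as np

x_train = np.array([[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]])
y_train = np.array([0, 0, 0, 1, 1, 1])

clf = SVCClassify(None, x_train, y_train)
print(clf.predict([[0.5, 0.5], [5.5, 5.5]]))  # e.g. [0 1]
print(clf.predict_proba([[0.5, 0.5]]))        # available because probability=True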
def train(self, g):
    self.model = []
    X = self.train_X.copy()
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    for i in range(3):
        # Binarize the labels: class i + 1 vs. the rest (one-vs-rest).
        y = self.train_y.copy()
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model.append(clf.fit(X, y))
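# --- Prediction sketch (added for illustration) ---
# The loop above trains three binary SVCs, one per class label 1-3
# (one-vs-rest). The original prediction code is not shown, so the helper
# below is only a hedged sketch of how the three models could be combined;
# it uses decision_function so it does not require probability=True.
import numpy as np

def predict_ovr(models, X):
    # Shape (n_samples, 3): confidence that each sample belongs to class i + 1.
    scores = np.column_stack([m.decision_function(X) for m in models])
    # Pick the most confident binary model and map back to the 1-based labels.
    return scores.argmax(axis=1) + 1

# e.g. after obj.train(g):
#     y_pred = predict_ovr(obj.model, obj.scale.transform(obj.test_X))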
def cross_validate(samples, labels, outputDir):
    '''
    Function to perform K-fold cross validation
    '''
    # K(=10) FOLD CROSS VALIDATION
    K = 10
    fold_samples, fold_labels = cv_split(samples, np.array(labels), K)
    log_loss = [['Log Loss'], []]
    total_ll = 0.0
    for fold in range(K):
        samples_chunk = fold_samples[:fold] + fold_samples[fold + 1:]
        labels_chunk = fold_labels[:fold] + fold_labels[fold + 1:]

        # Training L1 logistic regression
        logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1')
        logRegrL1.fit(np.concatenate(samples_chunk, axis=0),
                      np.concatenate(labels_chunk, axis=0))

        # Training SVM with linear kernel
        svmLin = SVC(kernel='linear', probability=True)
        svmLin.fit(np.concatenate(samples_chunk, axis=0),
                   np.concatenate(labels_chunk, axis=0))

        # Training Random Forest Classifier
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(np.concatenate(samples_chunk, axis=0),
                np.concatenate(labels_chunk, axis=0))

        # TEST ON CROSS-VALIDATION HOLD-OUT SET
        val = [i for i in range(len(fold_labels[fold]))]
        id = 0
        for item in fold_samples[fold]:
            # predict_proba expects a 2D array; the first component is the
            # probability of class 0, the second of class 1.
            predictionL1 = logRegrL1.predict_proba([item])
            predictionSvmLin = svmLin.predict_proba([item])
            predictionRfc = rfc.predict_proba([item])
            # Take the average of the three model predictions as the final
            # health-status prediction.
            val[id] = (predictionL1[0][1] + predictionSvmLin[0][1] + predictionRfc[0][1]) / 3.0
            id = id + 1
        for i in range(len(fold_labels[fold])):
            total_ll += logloss(fold_labels[fold][i], val[i])

    # Average log loss over all samples; keep it as a row so np.savetxt writes
    # a header line followed by the value.
    log_loss[1] = [total_ll / len(samples)]
    # Save csv file in the output directory with name Dota2Val.csv
    np.savetxt(outputDir + "\\Dota2Val.csv", log_loss, delimiter=',', fmt='%s')
def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight='balanced', verbose=False, max_iter=(-1), decision_function_shape='ovr', random_state=None): self._hyperparams = { 'C': C, 'kernel': kernel, 'degree': degree, 'gamma': gamma, 'coef0': coef0, 'shrinking': shrinking, 'probability': probability, 'tol': tol, 'cache_size': cache_size, 'class_weight': class_weight, 'verbose': verbose, 'max_iter': max_iter, 'decision_function_shape': decision_function_shape, 'random_state': random_state } self._wrapped_model = Op(**self._hyperparams)
def fit(self, X, y=None): self._sklearn_model = SKLModel(**self._hyperparams) if (y is not None): self._sklearn_model.fit(X, y) else: self._sklearn_model.fit(X) return self
def train_ensemble_classifier():
    # classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    # classifier1 = SklearnClassifier(SVC(), sparse=False)
    # classifier3 = SklearnClassifier(RandomForestClassifier(), sparse=False)
    # classifier4 = SklearnClassifier(DecisionTreeClassifier(), sparse=False)
    classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    # Note: `degree` only affects the polynomial kernel; with SVC's default RBF
    # kernel it is ignored.
    classifier1 = SklearnClassifier(SVC(degree=18, C=12), sparse=False)
    classifier3 = SklearnClassifier(RandomForestClassifier(max_depth=100, n_estimators=10),
                                    sparse=False)
    classifier4 = SklearnClassifier(DecisionTreeClassifier(min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=30,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)

    test_classifiers = []
    test_classifiers.append(classifier1)
    test_classifiers.append(classifier2)
    test_classifiers.append(classifier3)
    test_classifiers.append(classifier4)

    trained_classifiers = []
    for classifier in test_classifiers:
        classifier = classifier.train(train_features)
        trained_classifiers.append(classifier)

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir): #Training L1 logistic regression logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1') logRegrL1.fit(samples, labels) #Training SVM with linear kernel svmLin = SVC(kernel='linear', probability=True) svmLin.fit(samples, labels) #Training Random Forest Classifier rfc = RandomForestClassifier(n_estimators=100) rfc.fit(samples, labels) #test set testDir = inputDir + "/set_test" testFiles = sorted([ join(testDir, f) for f in listdir(testDir) if isfile(join(testDir, f)) ], key=numericalSort) #Read feature vectors of test images testSamples = cubeVoxelsVar(testFiles) testSamples = feature_selector.transform(testSamples) print(len(testSamples)) #2D array to report final prediction in format (ID,Prediction) final = [[0 for j in range(2)] for i in range(139)] final[0][0] = 'ID' final[0][1] = 'Prediction' id = 1 #Predict health status of test image using each of the 3 models trained above for item in testSamples: predictionL1 = logRegrL1.predict_proba( item ) #first component is probability of 0 class, second is of class 1 predictionSvmLin = svmLin.predict_proba(item) predictionRfc = rfc.predict_proba(item) final[id][0] = id #Taking the average of each of the model predictions as final health status prediction final[id][1] = (predictionL1[0][1] + predictionSvmLin[0][1] + predictionRfc[0][1]) / 3.0 id = id + 1 #Save csv file in the output directory with name final_sub.csv np.savetxt(outputDir + "/final_sub.csv", final, delimiter=',', fmt='%s')
def test_kernel_sigmoid(self):
    clf = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0)
    self.set_classifier(clf)
    java_preds, py_preds = [], []
    min_vals = np.amin(self.X, axis=0)
    max_vals = np.amax(self.X, axis=0)
    for n in range(self.N_RANDOM_TESTS):
        x = [random.uniform(min_vals[f], max_vals[f])
             for f in range(self.n_features)]
        java_preds.append(self.make_pred_in_js(x))
        py_preds.append(self.make_pred_in_py(x))
    self.assertListEqual(py_preds, java_preds)
def test_pipeline_estimator(self):
    self.X, self.y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)
    anova_filter = SelectKBest(f_regression, k=5)
    self.mdl = Pipeline([('anova', anova_filter), ('svc', SVC(kernel='linear'))])
    self.mdl.set_params(anova__k=10, svc__C=.1)
    try:
        self._port_model()
    except Exception as e:
        # str(e) works on both Python 2 and 3 (e.message is Python 2 only).
        self.fail('Unexpected exception raised: {}'.format(e))
    finally:
        self._clear_model()
def test_kernel_sigmoid(self): clf = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0) self._port_model(clf) Y, Y_py = [], [] min_vals = np.amin(self.X, axis=0) max_vals = np.amax(self.X, axis=0) for n in range(self.n_random_tests): x = [random.uniform(min_vals[f], max_vals[f]) for f in range(self.n_features)] Y.append(self.make_pred_in_custom(x)) Y_py.append(self.make_pred_in_py(x)) self.assertListEqual(Y, Y_py)
def learnModel(train):
    data = []
    for duplicate in train["is_duplicate"]:
        data.append(int(duplicate))
    znacajkePitanja = get_avg(train)

    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True, max_iter=10000)

    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data)
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)

    joblib.dump(svmKlasifikator, 'Word2VecSVMNauceni.pkl')
    print("Learning progress has been saved")
def classifier_panchenko2016(X_train, y_train, X_test, y_test, separateClassifier=False): train_or_test_labels = ["train" for i in y_train] + ["test" for i in y_test] y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels, X_train + X_test, y_train + y_test) y_train, X_train = features_extraction( y_train, X_train, separateClassifier=separateClassifier, featuresCount=100) y_test, X_test = features_extraction(y_test, X_test, separateClassifier=separateClassifier, featuresCount=100) scaler = MinMaxScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) classifier = SVC(kernel="rbf", C=2e11, gamma=2e-1, max_iter=5000, class_weight="balanced", verbose=1) print("fitting") classifier.fit(X_train, y_train) print("testing") y_predictions = classifier.predict(X_test) #, y_test) return y_test, y_predictions
class SVCImpl():

    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
                 coef0=0.0, shrinking=True, probability=False, tol=0.001,
                 cache_size=200, class_weight='balanced', verbose=False,
                 max_iter=(-1), decision_function_shape='ovr', random_state=None):
        self._hyperparams = {
            'C': C,
            'kernel': kernel,
            'degree': degree,
            'gamma': gamma,
            'coef0': coef0,
            'shrinking': shrinking,
            'probability': probability,
            'tol': tol,
            'cache_size': cache_size,
            'class_weight': class_weight,
            'verbose': verbose,
            'max_iter': max_iter,
            'decision_function_shape': decision_function_shape,
            'random_state': random_state}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
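# --- Usage sketch (added for illustration) ---
# Minimal, hypothetical use of the SVCImpl wrapper above. It assumes SKLModel
# is bound to sklearn.svm.SVC in the module where SVCImpl is defined (which
# the hyperparameter names suggest); the dataset and parameters are assumed.
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

model = SVCImpl(kernel='rbf', gamma='scale', probability=True)
model.fit(X, y)
print(model.predict(X[:3]))        # predicted class labels
print(model.predict_proba(X[:3]))  # per-class probabilities (needs probability=True)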
def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
             shrinking=True, probability=False, tol=1e-3, cache_size=200,
             class_weight=None, verbose=False, max_iter=-1,
             decision_function_shape=None, random_state=None):
    """Init. See scikit-learn."""
    # Pass C through rather than hard-coding C=1, so the argument takes effect.
    self._clf = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                    shrinking=shrinking, probability=probability, tol=tol,
                    cache_size=cache_size, class_weight=class_weight,
                    verbose=verbose, max_iter=max_iter,
                    decision_function_shape=decision_function_shape,
                    random_state=random_state)
    self.classes_ = None
def test_kernel_poly(self): clf = SVC(C=1., kernel='poly', gamma=0.001, random_state=0) self._port_model(clf) java_preds, py_preds = [], [] min_vals = np.amin(self.X, axis=0) max_vals = np.amax(self.X, axis=0) for n in range(self.n_random_tests): x = [ random.uniform(min_vals[f], max_vals[f]) for f in range(self.n_features) ] java_preds.append(self.make_pred_in_js(x)) py_preds.append(self.make_pred_in_py(x)) self.assertListEqual(py_preds, java_preds)
def test_sigmoid_kernel(self): self.mdl = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0) self.load_iris_data() self._port_model() amin = np.amin(self.X, axis=0) amax = np.amax(self.X, axis=0) preds, ground_truth = [], [] for _ in range(self.N_RANDOM_FEATURE_SETS): x = np.random.uniform(amin, amax, self.n_features) preds.append(self.pred_in_custom(x)) ground_truth.append(self.pred_in_py(x)) self._clear_model() # noinspection PyUnresolvedReferences self.assertListEqual(preds, ground_truth)
def test_auto_gamma(self): self.estimator = SVC(C=1., gamma='auto', random_state=0) self.load_iris_data() self._port_estimator() amin = np.amin(self.X, axis=0) amax = np.amax(self.X, axis=0) preds, ground_truth = [], [] for _ in range(self.N_RANDOM_FEATURE_SETS): x = np.random.uniform(amin, amax, self.n_features) preds.append(self.pred_in_custom(x)) ground_truth.append(self.pred_in_py(x)) self._clear_estimator() # noinspection PyUnresolvedReferences self.assertListEqual(preds, ground_truth)
def train_svm(params, suffix, train_X, train_Y, test_X, test_Y):
    C = params['C']
    kernel = params['kernel']
    model = SVC(gamma='scale', probability=True, C=C, kernel=kernel)
    print("Params C:", C, "kernel:", kernel)
    model.fit(train_X, train_Y)
    print("Train score", model.score(train_X, train_Y))
    test_score = model.score(test_X, test_Y)
    print("Test score", test_score)
    return test_score, None
def train_cv_clf(topics_train, classes_train, features, n_folds=10,
                 param_grid=_PARAM_GRID, tuned_clf=SVC(C=1, kernel='linear'),
                 scoring=util.weighted_f1, random_state=0):
    """Trains the topic type classifier, given the various parameters."""
    kf = cross_validation.KFold(len(topics_train), n_folds=n_folds,
                                random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf, param_grid=param_grid, cv=kf,
                          scoring=scoring)
    topic_vectors_train = to_features(features, topics_train)
    cv_clf.fit(topic_vectors_train, classes_train)
    return cv_clf
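# --- Equivalent pattern on a toy dataset (added for illustration) ---
# train_cv_clf depends on project-specific helpers (_PARAM_GRID, to_features,
# util.weighted_f1) and the long-deprecated cross_validation module, so the
# sketch below reproduces the same GridSearchCV-over-KFold pattern with the
# modern model_selection API on public data; the grid values are assumptions.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}

cv_clf = GridSearchCV(estimator=SVC(C=1, kernel='linear'),
                      param_grid=param_grid,
                      cv=KFold(n_splits=10, shuffle=True, random_state=0),
                      scoring='f1_weighted')
cv_clf.fit(X, y)
print(cv_clf.best_params_, cv_clf.best_score_)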
def featureSelector(data,trainHeaderList,target,selectorType): dataFrame = pd.DataFrame(data) if(selectorType == 'VT'): cols = dataFrame.columns pi = 0.6 selector = VarianceThreshold(threshold=(pi*(1-pi))) values = selector.fit_transform(dataFrame) labels = list() i = 0 for x in selector.get_support(indices=False): if x: labels.append(trainHeaderList.__getitem__(i)) i += 1 return pd.DataFrame(values , columns=labels) elif(selectorType == 'KB'): selector = SelectKBest(chi2, k=6) values = selector.fit_transform(dataFrame, target) labels = list() i = 0 for x in selector.get_support(indices=False): if x: labels.append(trainHeaderList.__getitem__(i)) i += 1 return pd.DataFrame(values, columns=labels) elif(selectorType == 'SVC'): svc = SVC(kernel="linear", C=1) selector = RFE(estimator=svc, n_features_to_select=20, step=0.5, verbose=5) values =selector.fit_transform(dataFrame, target) labels = list() i = 0 for x in selector.get_support(indices=False): if x: labels.append(trainHeaderList.__getitem__(i)) i += 1 return pd.DataFrame(values, columns=labels)
from sklearn.svm.classes import SVC
import cPickle
import numpy
import os
import sys
import time

# Trains an SVM on k-means (MFCC bag-of-words) features and saves the model
# to a local file.
if __name__ == '__main__':
    t1 = time.time()
    event_name = "P003"
    feat_dir = "kmeans/"
    feat_dim = 50
    output_file = "mfcc_pred/svm.%s.model" % event_name

    fread = open("list/train", "r")
    clf = SVC(probability=True)
    X, Y = [], []
    for i in fread.readlines():
        i = i.split(" ")
        line = i[0]
        label = i[1].replace('\n', '')
        kmeans_path = "kmeans/" + line + ".kmeans.txt"
        if os.path.exists(kmeans_path):
            kmeans_feat = numpy.genfromtxt(kmeans_path, delimiter=";")
        else:
            kmeans_feat = numpy.zeros(feat_dim)
            label = "NULL"
        if label != event_name:
            label = "NULL"
        X.append(kmeans_feat)
        Y.append(label)
def setUp(self):
    super(SVCJSTest, self).setUp()
    self.estimator = SVC(C=1., kernel='rbf', gamma=0.001, random_state=0)
label_string += '0 ' fopen.close() # generate the label_vec = numpy.fromstring(label_string.strip(), dtype=int, sep=' ') print 'Totally we get %s labels' % (label_vec.shape[0]) # for debugging # create the feature matrix, in which each row represents a video video_num = len(video_list) feat_mat = numpy.zeros([video_num, feat_dim]) for i in xrange(video_num): # BOW features of this video feat_vec = numpy.genfromtxt(feat_dir + video_list[i], dtype=numpy.float32, delimiter=";") assert (feat_vec.shape[0] == feat_dim) # fill the feature vector to the matrix feat_mat[i, :] = feat_vec # initialize svm svm = SVC(kernel=chi2_kernel) # svm = SVC(probability=True) # train the svm models svm.fit(feat_mat, label_vec) # finally save the k-means model cPickle.dump(svm, open(output_file, "wb"), cPickle.HIGHEST_PROTOCOL) print 'SVM trained successfully for event %s!' % (event_name)
def parameter_tuning(Xn, yn, scale=1): # FEATURE SELECTION print Xn.shape print yn.shape # FEATURE SCALING if scale == 1: Xn = preprocessing.scale(Xn, with_mean=True) print 'NORMALIZING' elif scale == 2: Xn = preprocessing.scale(Xn, with_mean=False) print 'NORMALIZING' tuned_parameters = [{ 'kernel': ['rbf'], 'C': np.logspace(-2, 7, 10), 'gamma': np.logspace(-4, 2, 7) }] tuned_parameters2 = { 'kernel': ['rbf'], 'C': np.logspace(-2, 7, 10), 'gamma': np.logspace(-4, 2, 7) } linear_parameters = [{'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}] linear_parameters2 = {'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)} cv = cross_validation.StratifiedKFold(yn, shuffle=True, n_folds=3, random_state=42) if RBF: clf = RandomizedSearchCV(estimator=SVC(C=1, cache_size=1000), param_distributions=tuned_parameters2, cv=cv, scoring='accuracy', n_iter=30, verbose=1, n_jobs=2).fit(Xn, yn) print("Best parameters set found on development set:") print print(clf.best_estimator_) print(clf.best_score_) print() print confusion_matrix(yn, clf.predict(Xn)) if LINEAR: clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000), param_grid=linear_parameters, cv=cv, scoring='accuracy', verbose=1, n_jobs=2).fit(Xn, yn) print("Best parameters set found on development set:") print print(clf.best_estimator_) print(clf.best_score_) print() print confusion_matrix(yn, clf.predict(Xn))
y, test_size=0.2, random_state=42) #:# preprocessing transform_pipeline = Pipeline([('scaler', StandardScaler())]) X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns) #:# model params = {'gamma': 5, 'kernel': 'sigmoid', 'probability': True} classifier = SVC(**params) classifier.fit(X_train, y_train) #:# hash #:# aad366f6d5961bc98783c2ad9fb3918d md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest() print(f'md5: {md5}') #:# audit y_pred = classifier.predict(transform_pipeline.transform(X_test)) y_pred_proba = classifier.predict_proba( transform_pipeline.transform(X_test))[:, 1] tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() print(f'acc: {accuracy_score(y_test, y_pred)}')
('extender', AttributesExtension()), ('imputer', SimpleImputer(strategy="mean")), ]) learning_data = pipeline.fit_transform(features_data) # ### Select a model # In[ ]: from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score from sklearn.svm.classes import SVC from sklearn.metrics import accuracy_score svc = SVC() log_reg = LogisticRegression() #log_reg.fit(learning_data, labels) rand_for = RandomForestClassifier() #rand_for.fit(learning_data, labels) models = { "Logistic Regression": log_reg, "Random Forest": rand_for, "SVM": svc, } for model in models.keys(): scores = cross_val_score(models[model], learning_data, labels,
def generate_plot(self, generate_testpoints=True, generate_background=True, tune_background_model=False, background_resolution=100): """Generates and returns arrays for visualizing the dataset and the identified decision boundary in 2D. Parameters ---------- generate_testpoints : boolean, optional (default=True) Whether to generate demo points around the estimated decision boundary as a sanity check generate_background : boolean, optional (default=True) Whether to generate faint background plot (using prediction probabilities of a fitted suppor vector machine, trained on generated demo points) to aid visualization tune_background_model : boolean, optional (default=False) Whether to tune the parameters of the support vector machine generating the background background_resolution : int, optional (default=100) Desired resolution (height and width) of background to be generated Returns ------- decision_boundary_points_2d : array Array containing points in the dimensionality-reduced 2D space which are very close to the true decision boundary X_testpoints_2d : array Array containing generated demo points in the dimensionality-reduced 2D space which surround the decision boundary and can be used for visual feedback to estimate which area would be assigned which class y_testpoints : array Classifier predictions for each of the generated demo points background: array Generated background image showing prediction probabilities of the classifier in each region (only returned if generate_background is set to True!) """ if len(self.decision_boundary_points) == 0: raise Exception("Please call the fit method first!") if not generate_testpoints and generate_background: print("Warning: cannot generate a background without testpoints") if len(self.X_testpoints) == 0 and generate_testpoints: if self.verbose: print("Generating demo points around decision boundary...") self._generate_testpoints() if generate_background and generate_testpoints: if tune_background_model: params = {'C': np.power(10, np.linspace(0, 2, 2)), 'gamma': np.power(10, np.linspace(-2, 0, 2))} grid = GridSearchCV(SVC(), params, n_jobs=-1 if os.name != 'nt' else 1) grid.fit(np.vstack((self.X2d[self.train_idx], self.X_testpoints_2d)), np.hstack( (self.y[self.train_idx], self.y_testpoints))) bestparams = grid.best_params_ else: bestparams = {'C': 1, 'gamma': 1} self.background_model = SVC(probability=True, C=bestparams['C'], gamma=bestparams['gamma']).fit(np.vstack( (self.X2d[self.train_idx], self.X_testpoints_2d)), np.hstack((self.y[self.train_idx], self.y_testpoints))) xx, yy = np.meshgrid(np.linspace(self.X2d_xmin, self.X2d_xmax, background_resolution), np.linspace( self.X2d_ymin, self.X2d_ymax, background_resolution)) Z = self.background_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0] Z = Z.reshape((background_resolution, background_resolution)) self.background = Z if generate_background and generate_testpoints: return self.decision_boundary_points_2d, self.X_testpoints_2d, self.y_testpoints, Z elif generate_testpoints: return self.decision_boundary_points_2d, self.X_testpoints_2d, self.y_testpoints else: return self.decision_boundary_points_2d
def setUp(self):
    super(SVCCTest, self).setUp()
    self.mdl = SVC(C=1., kernel='rbf', gamma=0.001, random_state=0)
class DBPlot(BaseEstimator): """ Heuristic approach to estimate and visualize high-dimensional decision boundaries for trained binary classifiers by using black-box optimization to find regions in which the classifier is maximally uncertain (0.5 prediction probability). The total number of keypoints representing the decision boundary will depend on n_connecting_keypoints and n_interpolated_keypoints. Reduce either or both to reduce runtime. Parameters ---------- estimator : BaseEstimator instance, optional (default=`KNeighborsClassifier(n_neighbors=10)`). Classifier for which the decision boundary should be plotted. Can be trained or untrained (in which case the fit method will train it). Must have probability estimates enabled (i.e. `estimator.predict_proba` must work). Make sure it is possible for probability estimates to get close to 0.5 (more specifically, as close as specified by acceptance_threshold) - this usally requires setting an even number of neighbors, estimators etc. dimensionality_reduction : BaseEstimator instance, optional (default=`PCA(n_components=2)`). Dimensionality reduction method to help plot the decision boundary in 2D. Can be trained or untrained (in which case the fit method will train it). Must have n_components=2. Must be able to project new points into the 2D space after fitting (i.e. `dimensionality_reduction.transform` must work). acceptance_threshold : float, optional (default=0.03) Maximum allowed deviation from decision boundary (defined as the region with 0.5 prediction probability) when accepting decision boundary keypoints n_decision_boundary_keypoints : int, optional (default=60) Total number of decision boundary keypoints added, including both connecting and interpolated keypoints. n_connecting_keypoints : int, optional (default=None) Number of decision boundary keypoints estimated along lines connecting instances from two different classes (each such line must cross the decision boundary at least once). If None (default), it is set to 1/3 of n_decision_boundary_keypoints n_interpolated_keypoints : int, optional (default=None) Number of decision boundary keypoints interpolated between connecting keypoints to increase keypoint density. 
If None (default), it is set to 2/3 of n_decision_boundary_keypoints n_generated_testpoints_per_keypoint : int, optional (default=15) Number of demo points generated around decision boundary keypoints, and labeled according to the specified classifier, in order to enrich and validate the decision boundary plot linear_iteration_budget : int, optional (default=100) Maximum number of iterations the optimizer is allowed to run for each keypoint estimation while looking along linear trajectories hypersphere_iteration_budget : int, optional (default=300) Maximum number of iterations the optimizer is allowed to run for each keypoint estimation while looking along hypersphere surfaces verbose: bool, optional (default=True) Verbose output """ def __init__(self, estimator=KNeighborsClassifier(n_neighbors=10), dimensionality_reduction=PCA(n_components=2), acceptance_threshold=0.03, n_decision_boundary_keypoints=60, n_connecting_keypoints=None, n_interpolated_keypoints=None, n_generated_testpoints_per_keypoint=15, linear_iteration_budget=100, hypersphere_iteration_budget=300, verbose=True): if acceptance_threshold == 0: raise Warning( "A nonzero acceptance threshold is strongly recommended so the optimizer can finish in finite time") if linear_iteration_budget < 2 or hypersphere_iteration_budget < 2: raise Exception("Invalid iteration budget") self.classifier = estimator self.dimensionality_reduction = dimensionality_reduction self.acceptance_threshold = acceptance_threshold if n_decision_boundary_keypoints and n_connecting_keypoints and n_interpolated_keypoints and n_connecting_keypoints + n_interpolated_keypoints != n_decision_boundary_keypoints: raise Exception( "n_connecting_keypoints and n_interpolated_keypoints must sum to n_decision_boundary_keypoints (set them to None to use calculated suggestions)") self.n_connecting_keypoints = n_connecting_keypoints if n_connecting_keypoints != None else n_decision_boundary_keypoints / 3 self.n_interpolated_keypoints = n_interpolated_keypoints if n_interpolated_keypoints != None else n_decision_boundary_keypoints * 2 / 3 self.linear_iteration_budget = linear_iteration_budget self.n_generated_testpoints_per_keypoint = n_generated_testpoints_per_keypoint self.hypersphere_iteration_budget = hypersphere_iteration_budget self.verbose = verbose self.decision_boundary_points = [] self.decision_boundary_points_2d = [] self.X_testpoints = [] self.y_testpoints = [] self.background = [] self.steps = 3 self.hypersphere_max_retry_budget = 20 self.penalties_enabled = True self.random_gap_selection = False def setclassifier(self, estimator=KNeighborsClassifier(n_neighbors=10)): """Assign classifier for which decision boundary should be plotted. Parameters ---------- estimator : BaseEstimator instance, optional (default=KNeighborsClassifier(n_neighbors=10)). Classifier for which the decision boundary should be plotted. Must have probability estimates enabled (i.e. estimator.predict_proba must work). Make sure it is possible for probability estimates to get close to 0.5 (more specifically, as close as specified by acceptance_threshold). """ self.classifier = estimator def fit(self, X, y, training_indices=None): """Specify data to be plotted, and fit classifier only if required (the specified clasifier is only trained if it has not been trained yet). 
All the input data is provided in the matrix X, and corresponding binary labels (values taking 0 or 1) in the vector y Parameters ---------- X : array-like, shape = [n_samples, n_features] A {n_samples by n_samples} size matrix containing data y : array-like, shape = [n_samples] Labels training_indices : array-like or float, optional (default=None) Indices on which the classifier has been trained / should be trained. If float, it is converted to a random sample with the specified proportion of the full dataset. Returns ------- self : returns an instance of self. """ if set(np.array(y, dtype=int).tolist()) != set([0, 1]): raise Exception( "Currently only implemented for binary classification. Make sure you pass in two classes (0 and 1)") if training_indices == None: train_idx = range(len(y)) elif type(training_indices) == float: train_idx, test_idx = train_test_split(range(len(y)), test_size=0.5) else: train_idx = training_indices self.X = X self.y = y self.train_idx = train_idx #self.test_idx = np.setdiff1d(np.arange(len(y)), self.train_idx, assume_unique=False) self.test_idx = list(set(range(len(y))).difference(set(self.train_idx))) # fit classifier if necessary try: self.classifier.predict([X[0]]) except: self.classifier.fit(X[train_idx, :], y[train_idx]) self.y_pred = self.classifier.predict(self.X) # fit DR method if necessary try: self.dimensionality_reduction.transform([X[0]]) except: self.dimensionality_reduction.fit(X, y) try: self.dimensionality_reduction.transform([X[0]]) except: raise Exception( "Please make sure your dimensionality reduction method has an exposed transform() method! If in doubt, use PCA or Isomap") # transform data self.X2d = self.dimensionality_reduction.transform(self.X) self.mean_2d_dist = np.mean(pdist(self.X2d)) self.X2d_xmin, self.X2d_xmax = np.min(self.X2d[:, 0]), np.max(self.X2d[:, 0]) self.X2d_ymin, self.X2d_ymax = np.min(self.X2d[:, 1]), np.max(self.X2d[:, 1]) self.majorityclass = 0 if list(y).count(0) > list(y).count(1) else 1 self.minorityclass = 1 - self.majorityclass minority_idx, majority_idx = np.where(y == self.minorityclass)[ 0], np.where(y == self.majorityclass)[0] self.Xminor, self.Xmajor = X[minority_idx], X[majority_idx] self.Xminor2d, self.Xmajor2d = self.X2d[minority_idx], self.X2d[majority_idx] # set up efficient nearest neighbor models for later use self.nn_model_2d_majorityclass = NearestNeighbors(n_neighbors=2) self.nn_model_2d_majorityclass.fit(self.X2d[majority_idx, :]) self.nn_model_2d_minorityclass = NearestNeighbors(n_neighbors=2) self.nn_model_2d_minorityclass.fit(self.X2d[minority_idx, :]) # step 1. look for decision boundary points between corners of majority & # minority class distribution minority_corner_idx, majority_corner_idx = [], [] for extremum1 in [np.min, np.max]: for extremum2 in [np.min, np.max]: _, idx = self.nn_model_2d_minorityclass.kneighbors( [[extremum1(self.Xminor2d[:, 0]), extremum2(self.Xminor2d[:, 1])]]) minority_corner_idx.append(idx[0][0]) _, idx = self.nn_model_2d_majorityclass.kneighbors( [[extremum1(self.Xmajor2d[:, 0]), extremum2(self.Xmajor2d[:, 1])]]) majority_corner_idx.append(idx[0][0]) # optimize to find new db keypoints between corners self._linear_decision_boundary_optimization( minority_corner_idx, majority_corner_idx, all_combinations=True, step=1) # step 2. 
look for decision boundary points on lines connecting randomly # sampled points of majority & minority class n_samples = int(self.n_connecting_keypoints) from_idx = list(random.sample(list(np.arange(len(self.Xminor))), n_samples)) to_idx = list(random.sample(list(np.arange(len(self.Xmajor))), n_samples)) # optimize to find new db keypoints between minority and majority class self._linear_decision_boundary_optimization( from_idx, to_idx, all_combinations=False, step=2) if len(self.decision_boundary_points_2d) < 2: print("Failed to find initial decision boundary. Retrying... If this keeps happening, increasing the acceptance threshold might help. Also, make sure the classifier is able to find a point with 0.5 prediction probability (usually requires an even number of estimators/neighbors/etc).") return self.fit(X, y, training_indices) # step 3. look for decision boundary points between already known db # points that are too distant (search on connecting line first, then on # surrounding hypersphere surfaces) edges, gap_distances, gap_probability_scores = self._get_sorted_db_keypoint_distances() # find gaps self.nn_model_decision_boundary_points = NearestNeighbors(n_neighbors=2) self.nn_model_decision_boundary_points.fit(self.decision_boundary_points) i = 0 retries = 0 while i < self.n_interpolated_keypoints: if self.verbose: print("Step 3/{}:{}/".format(self.steps, i, self.n_interpolated_keypoints)) if self.random_gap_selection: # randomly sample from sorted DB keypoint gaps? gap_idx = np.random.choice(len(gap_probability_scores), 1, p=gap_probability_scores)[0] else: # get largest gap gap_idx = 0 from_point = self.decision_boundary_points[edges[gap_idx][0]] to_point = self.decision_boundary_points[edges[gap_idx][1]] # optimize to find new db keypoint along line connecting two db keypoints # with large gap db_point = self._find_decision_boundary_along_line( from_point, to_point, penalize_tangent_distance=self.penalties_enabled) if self.decision_boundary_distance(db_point) > self.acceptance_threshold: if self.verbose: print("No good solution along straight line - trying to find decision boundary on hypersphere surface around known decision boundary point") # hypersphere radius half the distance between from and to db keypoints R = euclidean(from_point, to_point) / 2.0 # search around either source or target keypoint, with 0.5 probability, # hoping to find decision boundary in between if random.random() > 0.5: from_point = to_point # optimize to find new db keypoint on hypersphere surphase around known keypoint db_point = self._find_decision_boundary_on_hypersphere(from_point, R) if self.decision_boundary_distance(db_point) <= self.acceptance_threshold: db_point2d = self.dimensionality_reduction.transform([db_point])[0] self.decision_boundary_points.append(db_point) self.decision_boundary_points_2d.append(db_point2d) i += 1 retries = 0 else: retries += 1 if retries > self.hypersphere_max_retry_budget: i += 1 dist = self.decision_boundary_distance(db_point) msg = "Found point is too distant from decision boundary ({}), but retry budget exceeded ({})" print(msg.format(dist, self.hypersphere_max_retry_budget)) elif self.verbose: dist = self.decision_boundary_distance(db_point) print("Found point is too distant from decision boundary ({}) retrying...".format(dist)) else: db_point2d = self.dimensionality_reduction.transform([db_point])[0] self.decision_boundary_points.append(db_point) self.decision_boundary_points_2d.append(db_point2d) i += 1 retries = 0 edges, gap_distances, gap_probability_scores 
= self._get_sorted_db_keypoint_distances() # reload gaps self.decision_boundary_points = np.array(self.decision_boundary_points) self.decision_boundary_points_2d = np.array(self.decision_boundary_points_2d) if self.verbose: print("Done fitting! Found {} decision boundary keypoints.".format( len(self.decision_boundary_points))) return self def plot(self, plt=None, generate_testpoints=True, generate_background=True, tune_background_model=False, background_resolution=100, scatter_size_scale=1.0, legend=True): """Plots the dataset and the identified decision boundary in 2D. (If you wish to create custom plots, get the data using generate_plot() and plot it manually) Parameters ---------- plt : matplotlib.pyplot or axis object (default=matplotlib.pyplot) Object to be plotted on generate_testpoints : boolean, optional (default=True) Whether to generate demo points around the estimated decision boundary as a sanity check generate_background : boolean, optional (default=True) Whether to generate faint background plot (using prediction probabilities of a fitted suppor vector machine, trained on generated demo points) to aid visualization tune_background_model : boolean, optional (default=False) Whether to tune the parameters of the support vector machine generating the background background_resolution : int, optional (default=100) Desired resolution (height and width) of background to be generated scatter_size_scale : float, optional (default=1.0) Scaling factor for scatter plot marker size legend : boolean, optional (default=False) Whether to display a legend Returns ------- plt : The matplotlib.pyplot or axis object which has been passed in, after plotting the data and decision boundary on it. (plt.show() is NOT called and will be required) """ if plt == None: plt = mplt if len(self.X_testpoints) == 0: self.generate_plot(generate_testpoints=generate_testpoints, generate_background=generate_background, tune_background_model=tune_background_model, background_resolution=background_resolution) if generate_background and generate_testpoints: try: plt.imshow(np.flipud(self.background), extent=[ self.X2d_xmin, self.X2d_xmax, self.X2d_ymin, self.X2d_ymax], cmap="GnBu", alpha=0.33) except (Exception, ex): print("Failed to render image background") # decision boundary plt.scatter(self.decision_boundary_points_2d[:, 0], self.decision_boundary_points_2d[ :, 1], 600 * scatter_size_scale, c='c', marker='p') # generated demo points if generate_testpoints: plt.scatter(self.X_testpoints_2d[:, 0], self.X_testpoints_2d[ :, 1], 20 * scatter_size_scale, c=['g' if i else 'b' for i in self.y_testpoints], alpha=0.6) # training data plt.scatter(self.X2d[self.train_idx, 0], self.X2d[self.train_idx, 1], 150 * scatter_size_scale, facecolor=['g' if i else 'b' for i in self.y[self.train_idx]], edgecolor=['g' if self.y_pred[self.train_idx[i]] == self.y[self.train_idx[i]] == 1 else ('b' if self.y_pred[self.train_idx[i]] == self.y[self.train_idx[i]] == 0 else 'r') for i in range(len(self.train_idx))], linewidths=5 * scatter_size_scale) # testing data plt.scatter(self.X2d[self.test_idx, 0], self.X2d[self.test_idx, 1], 150 * scatter_size_scale, facecolor=['g' if i else 'b' for i in self.y[self.test_idx]], edgecolor=['g' if self.y_pred[self.test_idx[i]] == self.y[self.test_idx[i]] == 1 else ('b' if self.y_pred[self.test_idx[i]] == self.y[self.test_idx[i]] == 0 else 'r') for i in range(len(self.test_idx))], linewidths=5 * scatter_size_scale, marker='s') # label data points with their indices for i in range(len(self.X2d)): 
plt.text(self.X2d[i, 0] + (self.X2d_xmax - self.X2d_xmin) * 0.5e-2, self.X2d[i, 1] + (self.X2d_ymax - self.X2d_ymin) * 0.5e-2, str(i), size=8) if legend: plt.legend(["Estimated decision boundary keypoints", "Generated demo data around decision boundary", "Actual data (training set)", "Actual data (demo set)"], loc="lower right", prop={'size': 9}) # decision boundary keypoints, in case not visible in background plt.scatter(self.decision_boundary_points_2d[:, 0], self.decision_boundary_points_2d[:, 1], 600 * scatter_size_scale, c='c', marker='p', alpha=0.1) plt.scatter(self.decision_boundary_points_2d[:, 0], self.decision_boundary_points_2d[:, 1], 30 * scatter_size_scale, c='c', marker='p', edgecolor='c', alpha=0.8) # minimum spanning tree through decision boundary keypoints D = pdist(self.decision_boundary_points_2d) edges = minimum_spanning_tree(squareform(D)) for e in edges: plt.plot([self.decision_boundary_points_2d[e[0], 0], self.decision_boundary_points_2d[e[1], 0]], [self.decision_boundary_points_2d[e[0], 1], self.decision_boundary_points_2d[e[1], 1]], '--c', linewidth=4 * scatter_size_scale) plt.plot([self.decision_boundary_points_2d[e[0], 0], self.decision_boundary_points_2d[e[1], 0]], [self.decision_boundary_points_2d[e[0], 1], self.decision_boundary_points_2d[e[1], 1]], '--k', linewidth=1) if len(self.test_idx) == 0: print("No demo performance calculated, as no testing data was specified") else: freq = itemfreq(self.y[self.test_idx]).astype(float) imbalance = np.round(np.max((freq[0, 1], freq[1, 1])) / len(self.test_idx), 3) acc_score = np.round(accuracy_score( self.y[self.test_idx], self.y_pred[self.test_idx]), 3) f1 = np.round(f1_score(self.y[self.test_idx], self.y_pred[self.test_idx]), 3) plt.title("Test accuracy: " + str(acc_score) + ", F1 score: " + str(f1) + ". Imbalance (max chance accuracy): " + str(imbalance)) if self.verbose: print("Plot successfully generated! Don't forget to call the show() method to display it") return plt def generate_plot(self, generate_testpoints=True, generate_background=True, tune_background_model=False, background_resolution=100): """Generates and returns arrays for visualizing the dataset and the identified decision boundary in 2D. Parameters ---------- generate_testpoints : boolean, optional (default=True) Whether to generate demo points around the estimated decision boundary as a sanity check generate_background : boolean, optional (default=True) Whether to generate faint background plot (using prediction probabilities of a fitted suppor vector machine, trained on generated demo points) to aid visualization tune_background_model : boolean, optional (default=False) Whether to tune the parameters of the support vector machine generating the background background_resolution : int, optional (default=100) Desired resolution (height and width) of background to be generated Returns ------- decision_boundary_points_2d : array Array containing points in the dimensionality-reduced 2D space which are very close to the true decision boundary X_testpoints_2d : array Array containing generated demo points in the dimensionality-reduced 2D space which surround the decision boundary and can be used for visual feedback to estimate which area would be assigned which class y_testpoints : array Classifier predictions for each of the generated demo points background: array Generated background image showing prediction probabilities of the classifier in each region (only returned if generate_background is set to True!) 
""" if len(self.decision_boundary_points) == 0: raise Exception("Please call the fit method first!") if not generate_testpoints and generate_background: print("Warning: cannot generate a background without testpoints") if len(self.X_testpoints) == 0 and generate_testpoints: if self.verbose: print("Generating demo points around decision boundary...") self._generate_testpoints() if generate_background and generate_testpoints: if tune_background_model: params = {'C': np.power(10, np.linspace(0, 2, 2)), 'gamma': np.power(10, np.linspace(-2, 0, 2))} grid = GridSearchCV(SVC(), params, n_jobs=-1 if os.name != 'nt' else 1) grid.fit(np.vstack((self.X2d[self.train_idx], self.X_testpoints_2d)), np.hstack( (self.y[self.train_idx], self.y_testpoints))) bestparams = grid.best_params_ else: bestparams = {'C': 1, 'gamma': 1} self.background_model = SVC(probability=True, C=bestparams['C'], gamma=bestparams['gamma']).fit(np.vstack( (self.X2d[self.train_idx], self.X_testpoints_2d)), np.hstack((self.y[self.train_idx], self.y_testpoints))) xx, yy = np.meshgrid(np.linspace(self.X2d_xmin, self.X2d_xmax, background_resolution), np.linspace( self.X2d_ymin, self.X2d_ymax, background_resolution)) Z = self.background_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0] Z = Z.reshape((background_resolution, background_resolution)) self.background = Z if generate_background and generate_testpoints: return self.decision_boundary_points_2d, self.X_testpoints_2d, self.y_testpoints, Z elif generate_testpoints: return self.decision_boundary_points_2d, self.X_testpoints_2d, self.y_testpoints else: return self.decision_boundary_points_2d def _generate_testpoints(self, tries=100): """Generate random demo points around decision boundary keypoints """ nn_model = NearestNeighbors(n_neighbors=3) nn_model.fit(self.decision_boundary_points) nn_model_2d = NearestNeighbors(n_neighbors=2) nn_model_2d.fit(self.decision_boundary_points_2d) #max_radius = 2*np.max([nn_model_2d.kneighbors([self.decision_boundary_points_2d[i]])[0][0][1] for i in range(len(self.decision_boundary_points_2d))]) self.X_testpoints = np.zeros((0, self.X.shape[1])) self.y_testpoints = [] for i in range(len(self.decision_boundary_points)): if self.verbose: msg = "Generating testpoint for plotting {}/{}" print(msg.format(i, len(self.decision_boundary_points))) testpoints = np.zeros((0, self.X.shape[1])) # generate Np points in Gaussian around decision_boundary_points[i] with # radius depending on the distance to the next point d, idx = nn_model.kneighbors([self.decision_boundary_points[i]]) radius = d[0][1] if d[0][1] != 0 else d[0][2] if radius == 0: radius = np.mean(pdist(self.decision_boundary_points_2d)) max_radius = radius * 2 radius /= 5.0 # add demo points, keeping some balance max_imbalance = 5.0 y_testpoints = [] for j in range(self.n_generated_testpoints_per_keypoint - 2): c_radius = radius freq = itemfreq(y_testpoints).astype(float) imbalanced = freq.shape[0] != 0 if freq.shape[0] == 2 and (freq[0, 1] / freq[1, 1] < 1.0 / max_imbalance or freq[0, 1] / freq[1, 1] > max_imbalance): imbalanced = True for try_i in range(tries): testpoint = np.random.normal(self.decision_boundary_points[ i], radius, (1, self.X.shape[1])) try: testpoint2d = self.dimensionality_reduction.transform(testpoint)[0] except: # DR can fail e.g. 
if NMF gets negative values testpoint = [] continue # demo point needs to be close to current key point if euclidean(testpoint2d, self.decision_boundary_points_2d[i]) <= max_radius: if not imbalanced: # needs to be not imbalanced break y_pred = self.classifier.predict(testpoint)[0] # imbalanced but this would actually improve things if freq.shape[0] == 2 and freq[y_pred, 1] < freq[1 - y_pred, 1]: break c_radius /= 2.0 if len(testpoint) != 0: testpoints = np.vstack((testpoints, testpoint)) y_testpoints.append(self.classifier.predict(testpoint)[0]) self.X_testpoints = np.vstack((self.X_testpoints, testpoints)) self.y_testpoints = np.hstack((self.y_testpoints, y_testpoints)) self.X_testpoints_2d = self.dimensionality_reduction.transform(self.X_testpoints) idx_within_bounds = np.where((self.X_testpoints_2d[:, 0] >= self.X2d_xmin) & (self.X_testpoints_2d[:, 0] <= self.X2d_xmax) & (self.X_testpoints_2d[:, 1] >= self.X2d_ymin) & (self.X_testpoints_2d[:, 1] <= self.X2d_ymax))[0] self.X_testpoints = self.X_testpoints[idx_within_bounds] self.y_testpoints = self.y_testpoints[idx_within_bounds] self.X_testpoints_2d = self.X_testpoints_2d[idx_within_bounds] def decision_boundary_distance(self, x, grad=0): """Returns the distance of the given point from the decision boundary, i.e. the distance from the region with maximal uncertainty (0.5 prediction probability)""" return np.abs(0.5 - self.classifier.predict_proba([x])[0][1]) def get_decision_boundary_keypoints(self): """Returns the arrays of located decision boundary keypoints (both in the original feature space, and in the dimensionality-reduced 2D space) Returns ------- decision_boundary_points : array Array containing points in the original feature space which are very close to the true decision boundary (closer than acceptance_threshold) decision_boundary_points_2d : array Array containing points in the dimensionality-reduced 2D space which are very close to the true decision boundary """ if len(self.decision_boundary_points) == 0: raise Exception("Please call the fit method first!") return self.decision_boundary_points, self.decision_boundary_points_2d def _get_sorted_db_keypoint_distances(self, N=None): """Use a minimum spanning tree heuristic to find the N largest gaps in the line constituted by the current decision boundary keypoints. """ if N == None: N = self.n_interpolated_keypoints edges = minimum_spanning_tree(squareform(pdist(self.decision_boundary_points_2d))) edged = np.array([euclidean(self.decision_boundary_points_2d[u], self.decision_boundary_points_2d[v]) for u, v in edges]) gap_edge_idx = np.argsort(edged)[::-1][:N] edges = edges[gap_edge_idx] gap_distances = np.square(edged[gap_edge_idx]) gap_probability_scores = gap_distances / np.sum(gap_distances) return edges, gap_distances, gap_probability_scores def _linear_decision_boundary_optimization(self, from_idx, to_idx, all_combinations=True, retry_neighbor_if_failed=True, step=None, suppress_output=False): """Use global optimization to locate the decision boundary along lines defined by instances from_idx and to_idx in the dataset (from_idx and to_idx have to contain indices from distinct classes to guarantee the existence of a decision boundary between them!) 
""" step_str = ("Step " + str(step) + "/" + str(self.steps) + ":") if step != None else "" retries = 4 if retry_neighbor_if_failed else 1 for i in range(len(from_idx)): n = len(to_idx) if all_combinations else 1 for j in range(n): from_i = from_idx[i] to_i = to_idx[j] if all_combinations else to_idx[i] for k in range(retries): if k == 0: from_point = self.Xminor[from_i] to_point = self.Xmajor[to_i] else: # first attempt failed, try nearest neighbors of source and destination # point instead _, idx = self.nn_model_2d_minorityclass.kneighbors([self.Xminor2d[from_i]]) from_point = self.Xminor[idx[0][k / 2]] _, idx = self.nn_model_2d_minorityclass.kneighbors([self.Xmajor2d[to_i]]) to_point = self.Xmajor[idx[0][k % 2]] if euclidean(from_point, to_point) == 0: break # no decision boundary between equivalent points db_point = self._find_decision_boundary_along_line( from_point, to_point, penalize_tangent_distance=self.penalties_enabled, penalize_extremes=self.penalties_enabled) if self.decision_boundary_distance(db_point) <= self.acceptance_threshold: db_point2d = self.dimensionality_reduction.transform([db_point])[0] if db_point2d[0] >= self.X2d_xmin and db_point2d[0] <= self.X2d_xmax and db_point2d[1] >= self.X2d_ymin and db_point2d[1] <= self.X2d_ymax: self.decision_boundary_points.append(db_point) self.decision_boundary_points_2d.append(db_point2d) if self.verbose and not suppress_output: # , ": New decision boundary keypoint found using linear optimization!" print("{} {}/{}".format(step_str, i * n + j, len(from_idx) * n)) break else: if self.verbose and not suppress_output: msg = "{} {}/{}: Rejected decision boundary keypoint (outside of plot area)" print(msg.format(step_str, i * n + j, len(from_idx) * n)) def _find_decision_boundary_along_line(self, from_point, to_point, penalize_extremes=False, penalize_tangent_distance=False): def objective(l, grad=0): # interpolate between source and destionation; calculate distance from # decision boundary X = from_point + l[0] * (to_point - from_point) error = self.decision_boundary_distance(X) if penalize_tangent_distance: # distance from tangent between class1 and class0 point in 2d space x0, y0 = self.dimensionality_reduction.transform([X])[0] x1, y1 = self.dimensionality_reduction.transform([from_point])[0] x2, y2 = self.dimensionality_reduction.transform([to_point])[0] error += 1e-12 * np.abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt((y2 - y1)**2 + (x2 - x1)**2) if penalize_extremes: error += 1e-8 * np.abs(0.5 - l[0]) return error optimizer = self._get_optimizer() optimizer.set_min_objective(objective) cl = optimizer.optimize([random.random()]) db_point = from_point + cl[0] * (to_point - from_point) return db_point def _find_decision_boundary_on_hypersphere(self, centroid, R, penalize_known=False): def objective(phi, grad=0): # search on hypersphere surface in polar coordinates - map back to cartesian cx = centroid + polar_to_cartesian(phi, R) try: cx2d = self.dimensionality_reduction.transform([cx])[0] error = self.decision_boundary_distance(cx) if penalize_known: # slight penalty for being too close to already known decision boundary # keypoints db_distances = [euclidean(cx2d, self.decision_boundary_points_2d[k]) for k in range(len(self.decision_boundary_points_2d))] error += 1e-8 * ((self.mean_2d_dist - np.min(db_distances)) / self.mean_2d_dist)**2 return error except (Exception, ex): print("Error in objective function:", ex) return np.infty optimizer = self._get_optimizer( D=self.X.shape[1] - 1, upper_bound=2 * np.pi, 
iteration_budget=self.hypersphere_iteration_budget) optimizer.set_min_objective(objective) db_phi = optimizer.optimize([rnd.random() * 2 * np.pi for k in range(self.X.shape[1] - 1)]) db_point = centroid + polar_to_cartesian(db_phi, R) return db_point def _get_optimizer(self, D=1, upper_bound=1, iteration_budget=None): """Utility function creating an NLOPT optimizer with default parameters depending on this objects parameters """ if iteration_budget == None: iteration_budget = self.linear_iteration_budget opt = nlopt.opt(nlopt.GN_DIRECT_L_RAND, D) # opt.set_stopval(self.acceptance_threshold/10.0) opt.set_ftol_rel(1e-5) opt.set_maxeval(iteration_budget) opt.set_lower_bounds(0) opt.set_upper_bounds(upper_bound) return opt
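# --- Usage sketch (added for illustration) ---
# A minimal, hypothetical driver for the DBPlot class above on a binary toy
# problem. The classifier, keypoint count, and 50% train split are arbitrary
# choices; running it requires the module's own dependencies (nlopt, scipy,
# matplotlib, scikit-learn).
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

X, y = load_breast_cancer(return_X_y=True)   # binary 0/1 labels, as fit() requires

db = DBPlot(estimator=RandomForestClassifier(n_estimators=10),  # even n_estimators so 0.5 probabilities are reachable
            dimensionality_reduction=PCA(n_components=2),
            n_decision_boundary_keypoints=30)
db.fit(X, y, training_indices=0.5)   # float => random split, as documented above
db.plot(plt)
plt.show()

keypoints, keypoints_2d = db.get_decision_boundary_keypoints()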
audio_name=line.split(" ")[0] #print count count=count+1 # if (count%100==0): # print count label=line.split(" ")[1].split("\n")[0] if "imtraj" in feat_dir: feat_vec=import_imtraj_txt(feat_dir+audio_name+".spbof") else: feat_vec=np.genfromtxt(feat_dir+audio_name,delimiter=";") if (label==event_name): label=1 pos_count+=1 else: label=0 neg_count+=1 if len(X)==0: X=[feat_vec] else: X=np.append(X,[feat_vec],axis=0) Y=Y+[label] print "Data loading finished positive "+str(pos_count)+" negative "+str(neg_count) #pipe_lrSVC=SVC(C=10,gamma=0.0001,probability=True) pipe_lrSVC=SVC(probability=True) #svm=LinearSVC(C=10) #pipe_lrSVC=CalibratedClassifierCV(svm) pipe_lrSVC.fit(preprocessing.scale(X),Y) pickle.dump(pipe_lrSVC,open(output_file+'.pickle','wb')) print 'SVM trained successfully for event %s!' % (event_name)+" round num %s" % (round_num)
# read in features features = [] for video_id in video_ids: feat_path = feat_dir + video_id + "." + feat_suffix feature = [0]*feat_dim if os.path.exists(feat_path) is True: if feat_type == 'dense': feature = numpy.genfromtxt(feat_path, delimiter=';') else: line = numpy.genfromtxt(feat_path, delimiter=' ', dtype=str) if len(line.shape) == 0: line = numpy.array([line]) for item in line: if len(item) == 0: continue tokens = item.split(':') key = int(tokens[0])-1 value = float(tokens[1]) if key < feat_dim: feature[key] = value features.append(feature) # train svm clf = SVC(probability=True) clf.fit(features, labels) # Dump model with open(output_file, 'wb') as f: cPickle.dump(clf, f) print 'SVM trained successfully for event %s!' % (event_name)
if Y_label != 'NULL' or random.random() > 0: if Y_label == event_name: Y = 1 else: Y = 0 if i == 0: X_all = X Y_all = Y i = 1 else: X_all = np.vstack((X_all, X)) Y_all = np.append(Y_all, Y) i += 1 # print (i) # print (np.sum(X_all, axis = 1)) # print(X_all, Y_all) clf = SVC(kernel=chi2_kernel) # clf = SVC() clf.fit(X_all, Y_all) print(clf.score(X_all, Y_all)) print(clf.predict(X_all)) fread.close() cPickle.dump(clf, open(output_file, "wb")) print 'SVM trained successfully for event %s!' % (event_name)
train_arrays = [] train_labels = [] test_arrays = [] test_labels = [] for email in emails: email_id = email.id prefix_train_pos = 'email_' + str(email_id) if email_id % 5 != 0: train_arrays.append(model.docvecs[prefix_train_pos]) train_labels.append(int(email.label)) else: test_arrays.append(model.docvecs[prefix_train_pos]) test_labels.append(int(email.label)) classifier = SVC() classifier.fit(numpy.array(train_arrays), numpy.array(train_labels)) print("Overall score is %f." % classifier.score(numpy.array(test_arrays), numpy.array(test_labels))) corrects = [] wrongs = [] for email in emails: email_id = email.id prefix_train_pos = 'email_' + str(email_id) if email_id % 5 == 0: prediction = classifier.predict([model.docvecs[prefix_train_pos]])[0] actual = int(email.label) if prediction != actual: wrongs.append((email.id, prediction, actual)) else: