def phoneAccelerometerISVM():
    print("Loading data...")
    data = pd.read_csv("./Train_Phone-Acc-nexus4_1-a.csv")
    print("Done!")

    # Parse the data and build a sit vs. not-sit classification using an SVM
    # (the original comment said "bike", but the label tested below is 'sit').
    # Note: a fixed window width of 500 samples is assumed.
    print("Finding time series window indexes for each class kind...")
    previousClassLabel = str(data.at[data.index[0], 'gt'])  # .get_value() was removed in recent pandas
    pos = 0
    y = []
    X = []
    window = 500
    # Stop before a short trailing window would make X ragged.
    while pos + window <= data.shape[0]:
        # Make the y label: +1 for 'sit', -1 for everything else.
        if str(data.iloc[pos]['gt']) == 'sit':
            y.append(1)
        else:
            y.append(-1)
        # Make the X row: one window of the accelerometer y-axis signal.
        X.append(data.iloc[pos:pos + window]['y'])
        # Move to the next window.
        pos += window
    print("Done!")

    # Build and fit the SVM on all of the data.
    print("Training SVM on all accelerometer data...")
    X = np.array(X)
    y = np.array(y)
    # clfs = LinearSVC()
    clfs = SVC()
    clfs.fit(X, y)
    print("Done!")

    # print("Predicting accelerometer classes on all data using SVM...")
    # ypred = predict(X, clfs.coef_.reshape(len(clfs.coef_.ravel()), 1))
    # print("Done!")
    # error = calculateTotalAbsoluteError(y, ypred) / y.shape[0]
    # print("Accelerometer training error (means little on its own): %f" % error)

    # Cross validation on a held-out split.
    print("Training SVM on accelerometer training data only...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1)  # , random_state=0
    clfs = SVC()
    clfs.fit(X_train, y_train)
    yhat = clfs.predict(X_test)
    print("Abs Error = %f" % (calculateTotalAbsoluteError(yhat, y_test) / len(yhat)))
    print("Test data mean accuracy SVM score: %f" % clfs.score(X_test, y_test))
    f1_c0 = f1_score(y_test, yhat, pos_label=1, average='binary')
    # print("Test data f1 score for class -1: %f" % f1_c0)
    print("Test data f1 score for class +1: %f" % f1_c0)
    print("Done!")
class Clf(object):
    # class_weight='auto' was renamed to 'balanced' in scikit-learn 0.17,
    # and LinearSVC's loss='l1' is now spelled loss='hinge'.
    SVC_RBF = SVC(kernel='rbf', class_weight=None, random_state=0)
    SVC_RBF_CW = SVC(kernel='rbf', class_weight='balanced', random_state=0)
    LINEAR_L1 = LinearSVC(loss='hinge', random_state=0, class_weight=None)
    LINEAR_L1_CW = LinearSVC(loss='hinge', random_state=0, class_weight='balanced')
    LINEAR_SVC = SVC(kernel='linear', random_state=0, class_weight='balanced')
    TREE = DecisionTreeClassifier(random_state=0)
    RF = RandomForestClassifier(random_state=0)
    MAJORITY = DummyClassifier(strategy='most_frequent')
    RANDOM = DummyClassifier(strategy='stratified')
    ADABOOST = AdaBoostClassifier(random_state=0)
    LR = LogisticRegression()
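# Usage sketch for the Clf registry above (an assumption, not from the source):
# the attributes are shared estimator instances, so clone() each one before
# fitting to avoid leaking fitted state between runs.
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X_demo, y_demo = load_iris(return_X_y=True)
for name in ('SVC_RBF', 'TREE', 'RF', 'MAJORITY'):
    estimator = clone(getattr(Clf, name))  # fresh, unfitted copy of the registry entry
    scores = cross_val_score(estimator, X_demo, y_demo, cv=5)
    print(name, scores.mean())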
def fit(self, data, args):
    self.model = SVC(probability=True)
    with Timer() as t:
        self.model.fit(data.X_train, data.y_train)
    return t.interval
def train_ensemble_classifier():
    # Earlier (commented-out) variants used default hyperparameters for all four classifiers.
    classifier1 = SklearnClassifier(SVC(degree=18, C=12), sparse=False)  # degree is ignored by the default RBF kernel
    classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    classifier3 = SklearnClassifier(RandomForestClassifier(max_depth=100, n_estimators=10), sparse=False)
    classifier4 = SklearnClassifier(DecisionTreeClassifier(min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=30,
                                                           splitter='best',
                                                           random_state=0), sparse=False)

    test_classifiers = [classifier1, classifier2, classifier3, classifier4]

    trained_classifiers = []
    for classifier in test_classifiers:
        trained_classifiers.append(classifier.train(train_features))

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier.pickle')
    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
def learnPhase():
    if os.path.isfile("Doc2VecSVMNauceni.pkl"):
        return None
    tablecolrow = loadData("train.csv")
    tablecolrow[3] = FilterQuestions(tablecolrow[3])
    tablecolrow[4] = FilterQuestions(tablecolrow[4])
    model = prepareDoc2Vec(tablecolrow[3], tablecolrow[4])
    for i in range(len(tablecolrow[3])):
        tablecolrow[3][i] = model.infer_vector(tablecolrow[3][i].split(" "))
        tablecolrow[4][i] = model.infer_vector(tablecolrow[4][i].split(" "))
    traindataX = [None] * len(tablecolrow[3])
    traindataY = [None] * len(tablecolrow[3])
    for i in range(len(traindataX)):
        # Note: infer_vector() returns numpy arrays, so '+' sums the two
        # question vectors element-wise rather than concatenating them.
        traindataX[i] = tablecolrow[3][i] + tablecolrow[4][i]
        traindataY[i] = int(tablecolrow[5][i])
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True, max_iter=1000000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(traindataX, traindataY)
    tmEnd = timer()
    print("Learning lasted", tmEnd - tmStart)  # this times fit(), not predict()
    joblib.dump(svmKlasifikator, 'Doc2VecSVMNauceni.pkl')
    print("Learning progress saved")
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
             docker_containers: Dict[str, str] = None, _verbose: int = 0) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)
    self._clf = SVC(
        C=self.hyperparams['C'],
        kernel=self.hyperparams['kernel'],
        degree=self.hyperparams['degree'],
        gamma=self.hyperparams['gamma'],
        coef0=self.hyperparams['coef0'],
        probability=self.hyperparams['probability'],
        shrinking=self.hyperparams['shrinking'],
        tol=self.hyperparams['tol'],
        class_weight=self.hyperparams['class_weight'],
        max_iter=self.hyperparams['max_iter'],
        decision_function_shape=self.hyperparams['decision_function_shape'],
        verbose=_verbose,
        random_state=self.random_seed,
    )
    self._training_inputs = None
    self._training_outputs = None
    self._fitted = False
def learnModel(data):
    if os.path.isfile("BagOfWordsSVMNauceni.pkl"):
        return None
    data[0] = FilterQuestions(data[0])
    data[1] = FilterQuestions(data[1])
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag-of-words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=20000)
    allQuestions = data[0] + data[1]
    vectorizer.fit(allQuestions)
    joblib.dump(vectorizer, 'BagOfWordsVectorizerNauceni.pkl')
    znacajkePitanja = [vectorizer.transform(data[0]), vectorizer.transform(data[1])]
    for i, r in enumerate(data[2]):
        data[2][i] = int(r)
    znacajkePitanja = hstack(znacajkePitanja).tocsr()
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True, max_iter=1000000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data[2])
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'BagOfWordsSVMNauceni.pkl')
    print("Learning progress saved")
def svm_train(X, y, model_path):
    model = SVC()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
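# Minimal usage sketch for svm_train (hedged: the iris data and the model path
# are illustrative, not from the source). Note that the metrics it prints are
# computed on the training set itself.
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
svm_train(X_iris, y_iris, "svm_iris.joblib")  # prints train-set metrics, then dumps the model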
def train_svm(params, suffix, train_X, train_Y, test_X, test_Y):
    C = params['C']
    kernel = params['kernel']
    model = SVC(gamma='scale', probability=True, C=C, kernel=kernel)
    print("Params C:", C, "kernel:", kernel)
    model.fit(train_X, train_Y)
    print("Train score", model.score(train_X, train_Y))
    test_score = model.score(test_X, test_Y)
    print("Test score", test_score)
    return test_score, None
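# A possible grid-search driver for train_svm (hedged: the grid values and the
# train_X/train_Y/test_X/test_Y names are placeholders for the caller's data).
best_score, best_params = -1.0, None
for C in (0.1, 1.0, 10.0):
    for kernel in ('linear', 'rbf'):
        score, _ = train_svm({'C': C, 'kernel': kernel}, 'demo',
                             train_X, train_Y, test_X, test_Y)
        if score > best_score:
            best_score, best_params = score, {'C': C, 'kernel': kernel}
print("Best:", best_params, "score:", best_score)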
def train_all(self, g):
    X = np.concatenate([self.train_X, self.val_X], axis=0)
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    for i in range(3):
        # Rebuild y each pass (the masked assignments below mutate it): the
        # 3-class labels become a one-vs-rest target for class i + 1.
        y = np.concatenate([self.train_y, self.val_y], axis=0)
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model_a.append(clf.fit(X, y))
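# What the two masked assignments in train_all() do, on a toy array
# (illustrative only): they binarize 3-class labels into a one-vs-rest
# target for class i + 1.
import numpy as np

y_demo = np.array([1, 2, 3, 2, 1])
i = 0                          # build the binary target for class 1
y_bin = y_demo.copy()
y_bin[y_bin != i + 1] = 0      # everything that is not class 1 -> 0
y_bin[y_bin != 0] = 1          # whatever is left (class 1) -> 1
print(y_bin)                   # [1 0 0 0 1]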
def test_kernel_sigmoid(self):
    clf = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0)
    self.set_classifier(clf)
    js_preds, py_preds = [], []  # renamed from java_preds: the predictions come from make_pred_in_js
    min_vals = np.amin(self.X, axis=0)
    max_vals = np.amax(self.X, axis=0)
    for n in range(self.N_RANDOM_TESTS):
        x = [random.uniform(min_vals[f], max_vals[f])
             for f in range(self.n_features)]
        js_preds.append(self.make_pred_in_js(x))
        py_preds.append(self.make_pred_in_py(x))
    self.assertListEqual(py_preds, js_preds)
def test_kernel_sigmoid(self):
    clf = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0)
    self._port_model(clf)
    Y, Y_py = [], []
    min_vals = np.amin(self.X, axis=0)
    max_vals = np.amax(self.X, axis=0)
    for n in range(self.n_random_tests):
        x = [random.uniform(min_vals[f], max_vals[f])
             for f in range(self.n_features)]
        Y.append(self.make_pred_in_custom(x))
        Y_py.append(self.make_pred_in_py(x))
    self.assertListEqual(Y, Y_py)
def test_pipeline_estimator(self):
    self.X, self.y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)
    anova_filter = SelectKBest(f_regression, k=5)
    self.mdl = Pipeline([('anova', anova_filter), ('svc', SVC(kernel='linear'))])
    self.mdl.set_params(anova__k=10, svc__C=.1)
    try:
        self._port_model()
    except Exception as e:
        # str(e) instead of e.message, which does not exist in Python 3.
        self.fail('Unexpected exception raised: {}'.format(str(e)))
    finally:
        self._clear_model()
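# Side note on the set_params() call above: the double-underscore names route
# parameters to the named pipeline steps, so anova__k=10 and svc__C=.1 are
# equivalent to constructing the steps directly (illustrative):
# Pipeline([('anova', SelectKBest(f_regression, k=10)),
#           ('svc', SVC(kernel='linear', C=.1))])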
def train(self, g):
    self.model = []
    X = self.train_X.copy()
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    for i in range(3):
        # Same one-vs-rest relabelling as in train_all().
        y = self.train_y.copy()
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model.append(clf.fit(X, y))
def SVCClassify(self, x_train, y_train):
    '''
    Basic Support Vector Machine classifier.
    '''
    # The kernel is the main tunable parameter here.
    kernel = 'rbf'
    # Init the classifier and train it; probability=True is required if
    # probabilistic predictions are needed later.
    clf = SVC(kernel=kernel, probability=True)
    clf.fit(x_train, y_train)
    return clf
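# Usage sketch (hedged: `model` stands for a hypothetical instance of the
# enclosing class, and x_train/y_train/x_test for the caller's data).
clf = model.SVCClassify(x_train, y_train)
probabilities = clf.predict_proba(x_test)  # available because probability=True
predictions = clf.predict(x_test)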
def test_sigmoid_kernel(self):
    self.mdl = SVC(C=1., kernel='sigmoid', gamma=0.001, random_state=0)
    self.load_iris_data()
    self._port_model()
    amin = np.amin(self.X, axis=0)
    amax = np.amax(self.X, axis=0)
    preds, ground_truth = [], []
    for _ in range(self.N_RANDOM_FEATURE_SETS):
        x = np.random.uniform(amin, amax, self.n_features)
        preds.append(self.pred_in_custom(x))
        ground_truth.append(self.pred_in_py(x))
    self._clear_model()
    # noinspection PyUnresolvedReferences
    self.assertListEqual(preds, ground_truth)
def test_kernel_poly(self):
    clf = SVC(C=1., kernel='poly', gamma=0.001, random_state=0)
    self._port_model(clf)
    js_preds, py_preds = [], []  # renamed from java_preds: the predictions come from make_pred_in_js
    min_vals = np.amin(self.X, axis=0)
    max_vals = np.amax(self.X, axis=0)
    for n in range(self.n_random_tests):
        x = [random.uniform(min_vals[f], max_vals[f])
             for f in range(self.n_features)]
        js_preds.append(self.make_pred_in_js(x))
        py_preds.append(self.make_pred_in_py(x))
    self.assertListEqual(py_preds, js_preds)
def test_auto_gamma(self):
    self.estimator = SVC(C=1., gamma='auto', random_state=0)
    self.load_iris_data()
    self._port_estimator()
    amin = np.amin(self.X, axis=0)
    amax = np.amax(self.X, axis=0)
    preds, ground_truth = [], []
    for _ in range(self.N_RANDOM_FEATURE_SETS):
        x = np.random.uniform(amin, amax, self.n_features)
        preds.append(self.pred_in_custom(x))
        ground_truth.append(self.pred_in_py(x))
    self._clear_estimator()
    # noinspection PyUnresolvedReferences
    self.assertListEqual(preds, ground_truth)
def cross_validate(samples, labels, outputDir):
    '''
    Perform K-fold cross validation (K = 10).
    '''
    K = 10
    fold_samples, fold_labels = cv_split(samples, np.array(labels), K)
    log_loss = [['Log Loss'], []]
    total_ll = 0.0
    for fold in range(K):
        # All folds except the held-out one.
        samples_chunk = fold_samples[:fold] + fold_samples[fold + 1:]
        labels_chunk = fold_labels[:fold] + fold_labels[fold + 1:]
        train_X = np.concatenate(samples_chunk, axis=0)
        train_y = np.concatenate(labels_chunk, axis=0)

        # Training L1-regularized logistic regression.
        logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1')
        logRegrL1.fit(train_X, train_y)
        # Training an SVM with a linear kernel.
        svmLin = SVC(kernel='linear', probability=True)
        svmLin.fit(train_X, train_y)
        # Training a random forest classifier.
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(train_X, train_y)

        # Test on the cross-validation hold-out set.
        val = list(range(len(fold_labels[fold])))
        id = 0
        for item in fold_samples[fold]:
            # predict_proba expects a 2-D array, so reshape the single sample.
            # The first component is the probability of class 0, the second of class 1.
            item = np.asarray(item).reshape(1, -1)
            predictionL1 = logRegrL1.predict_proba(item)
            predictionSvmLin = svmLin.predict_proba(item)
            predictionRfc = rfc.predict_proba(item)
            # Average of the model predictions as the final health-status prediction.
            val[id] = (predictionL1[0][1] + predictionSvmLin[0][1] + predictionRfc[0][1]) / 3.0
            id = id + 1
        for i in range(len(fold_labels[fold])):
            total_ll += logloss(fold_labels[fold][i], val[i])
    log_loss[1] = total_ll / len(samples)
    # Save a csv file named Dota2Val.csv in the output directory.
    np.savetxt(outputDir + "/Dota2Val.csv", log_loss, delimiter=',', fmt='%s')
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir):
    # Training L1-regularized logistic regression.
    logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1')
    logRegrL1.fit(samples, labels)
    # Training an SVM with a linear kernel.
    svmLin = SVC(kernel='linear', probability=True)
    svmLin.fit(samples, labels)
    # Training a random forest classifier.
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(samples, labels)

    # Test set.
    testDir = inputDir + "/set_test"
    testFiles = sorted([join(testDir, f) for f in listdir(testDir)
                        if isfile(join(testDir, f))], key=numericalSort)
    # Read feature vectors of the test images.
    testSamples = cubeVoxelsVar(testFiles)
    testSamples = feature_selector.transform(testSamples)
    print(len(testSamples))

    # 2-D array reporting the final prediction in the format (ID, Prediction).
    final = [[0 for j in range(2)] for i in range(139)]
    final[0][0] = 'ID'
    final[0][1] = 'Prediction'
    id = 1
    # Predict the health status of each test image with the 3 models trained above.
    for item in testSamples:
        # predict_proba expects a 2-D array, so reshape the single sample.
        # The first component is the probability of class 0, the second of class 1.
        item = np.asarray(item).reshape(1, -1)
        predictionL1 = logRegrL1.predict_proba(item)
        predictionSvmLin = svmLin.predict_proba(item)
        predictionRfc = rfc.predict_proba(item)
        final[id][0] = id
        # Average of the model predictions as the final health-status prediction.
        final[id][1] = (predictionL1[0][1] + predictionSvmLin[0][1] + predictionRfc[0][1]) / 3.0
        id = id + 1
    # Save a csv file named final_sub.csv in the output directory.
    np.savetxt(outputDir + "/final_sub.csv", final, delimiter=',', fmt='%s')
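# The three-model average used above, shown on dummy predict_proba outputs
# (illustrative values only).
import numpy as np

p_lr = np.array([[0.3, 0.7]])   # [P(class 0), P(class 1)]
p_svm = np.array([[0.4, 0.6]])
p_rfc = np.array([[0.2, 0.8]])
avg_pos = (p_lr[0][1] + p_svm[0][1] + p_rfc[0][1]) / 3.0
print(avg_pos)  # 0.7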
def train_cv_clf(topics_train, classes_train, features, n_folds=10,
                 param_grid=_PARAM_GRID, tuned_clf=SVC(C=1, kernel='linear'),
                 scoring=util.weighted_f1, random_state=0):
    """Trains the topic type classifier, given the various parameters."""
    kf = cross_validation.KFold(len(topics_train), n_folds=n_folds,
                                random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf, param_grid=param_grid,
                          cv=kf, scoring=scoring)
    topic_vectors_train = to_features(features, topics_train)
    cv_clf.fit(topic_vectors_train, classes_train)
    return cv_clf
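# Illustrative call (hedged: _PARAM_GRID is not shown in the source, so a
# minimal grid over C is assumed here; topics_train/classes_train/features
# are placeholders for the caller's data).
example_grid = {'C': [0.1, 1, 10]}
cv_clf = train_cv_clf(topics_train, classes_train, features,
                      n_folds=5, param_grid=example_grid)
print(cv_clf.best_params_, cv_clf.best_score_)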
def learnModel(train):
    data = []
    for duplicate in train["is_duplicate"]:
        data.append(int(duplicate))
    znacajkePitanja = get_avg(train)
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True, max_iter=10000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data)
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'Word2VecSVMNauceni.pkl')
    print("Learning progress saved")
def classifier_panchenko2016(X_train, y_train, X_test, y_test,
                             separateClassifier=False):
    train_or_test_labels = (["train" for i in y_train] +
                            ["test" for i in y_test])
    y_train, X_train, y_test, X_test = outlier_removal(
        train_or_test_labels, X_train + X_test, y_train + y_test)
    y_train, X_train = features_extraction(
        y_train, X_train, separateClassifier=separateClassifier, featuresCount=100)
    y_test, X_test = features_extraction(
        y_test, X_test, separateClassifier=separateClassifier, featuresCount=100)

    # Scale to [0, 1] on the training set only, then apply the same scaling to the test set.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf", C=2e11, gamma=2e-1, max_iter=5000,
                     class_weight="balanced", verbose=1)
    print("fitting")
    classifier.fit(X_train, y_train)
    print("testing")
    y_predictions = classifier.predict(X_test)
    return y_test, y_predictions
def featureSelector(data, trainHeaderList, target, selectorType):
    dataFrame = pd.DataFrame(data)
    if selectorType == 'VT':
        pi = 0.6
        selector = VarianceThreshold(threshold=(pi * (1 - pi)))
        values = selector.fit_transform(dataFrame)
    elif selectorType == 'KB':
        selector = SelectKBest(chi2, k=6)
        values = selector.fit_transform(dataFrame, target)
    elif selectorType == 'SVC':
        svc = SVC(kernel="linear", C=1)
        selector = RFE(estimator=svc, n_features_to_select=20, step=0.5, verbose=5)
        values = selector.fit_transform(dataFrame, target)
    # Keep the header labels of the columns the selector retained
    # (the per-branch label loops were identical, so they are shared here).
    labels = [trainHeaderList[i]
              for i, keep in enumerate(selector.get_support(indices=False)) if keep]
    return pd.DataFrame(values, columns=labels)
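# Usage sketch (hedged: data, trainHeaderList, and target stand for the
# caller's feature matrix, column names, and label vector).
selected = featureSelector(data, trainHeaderList, target, 'KB')
print(list(selected.columns))  # the six column labels kept by SelectKBest(chi2, k=6)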
    # Note: random.random() > 0 is effectively always true, so this
    # condition accepts every sample.
    if Y_label != 'NULL' or random.random() > 0:
        if Y_label == event_name:
            Y = 1
        else:
            Y = 0
        if i == 0:
            X_all = X
            Y_all = Y
            i = 1
        else:
            X_all = np.vstack((X_all, X))
            Y_all = np.append(Y_all, Y)
            i += 1
        # print(i)
        # print(np.sum(X_all, axis=1))

# print(X_all, Y_all)
clf = SVC(kernel=chi2_kernel)
# clf = SVC()
clf.fit(X_all, Y_all)
print(clf.score(X_all, Y_all))
print(clf.predict(X_all))
fread.close()
cPickle.dump(clf, open(output_file, "wb"))
print('SVM trained successfully for event %s!' % event_name)
# (The head of this call was truncated in the source; X is assumed to be the
# feature matrix produced earlier in the script.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

#:# preprocessing
transform_pipeline = Pipeline([('scaler', StandardScaler())])
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model
params = {'gamma': 5, 'kernel': 'sigmoid', 'probability': True}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# aad366f6d5961bc98783c2ad9fb3918d
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:, 1]
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'acc: {accuracy_score(y_test, y_pred)}')
for line in fread.readlines():  # assumed loop header: the fragment begins mid-loop,
    # with fread, count, pos_count, neg_count, X, and Y defined earlier.
    audio_name = line.split(" ")[0]
    count = count + 1
    # if count % 100 == 0:
    #     print(count)
    label = line.split(" ")[1].split("\n")[0]
    if "imtraj" in feat_dir:
        feat_vec = import_imtraj_txt(feat_dir + audio_name + ".spbof")
    else:
        feat_vec = np.genfromtxt(feat_dir + audio_name, delimiter=";")
    if label == event_name:
        label = 1
        pos_count += 1
    else:
        label = 0
        neg_count += 1
    if len(X) == 0:
        X = [feat_vec]
    else:
        X = np.append(X, [feat_vec], axis=0)
    Y = Y + [label]

print("Data loading finished, positive " + str(pos_count) + " negative " + str(neg_count))
# pipe_lrSVC = SVC(C=10, gamma=0.0001, probability=True)
pipe_lrSVC = SVC(probability=True)
# svm = LinearSVC(C=10)
# pipe_lrSVC = CalibratedClassifierCV(svm)
pipe_lrSVC.fit(preprocessing.scale(X), Y)
pickle.dump(pipe_lrSVC, open(output_file + '.pickle', 'wb'))
print('SVM trained successfully for event %s!' % event_name + " round num %s" % round_num)
def setUp(self):
    super(SVCCTest, self).setUp()
    self.mdl = SVC(C=1., kernel='rbf', gamma=0.001, random_state=0)
    ('extender', AttributesExtension()),
    ('imputer', SimpleImputer(strategy="mean")),
])
learning_data = pipeline.fit_transform(features_data)

# ### Select a model

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC  # sklearn.svm.classes is a private path, removed in scikit-learn 0.24
from sklearn.metrics import accuracy_score

svc = SVC()
log_reg = LogisticRegression()
# log_reg.fit(learning_data, labels)
rand_for = RandomForestClassifier()
# rand_for.fit(learning_data, labels)

models = {
    "Logistic Regression": log_reg,
    "Random Forest": rand_for,
    "SVM": svc,
}

for model in models.keys():
    scores = cross_val_score(models[model], learning_data, labels)  # call truncated in the source
import os
import sys
import time

import numpy
from sklearn.svm import SVC  # sklearn.svm.classes is a private path, removed in scikit-learn 0.24
import cPickle

# Trains an SVM on precomputed k-means features and saves the model to a local
# file. (The original comment said "Performs K-means clustering", but the code
# below loads existing k-means feature files and fits an SVC.)
if __name__ == '__main__':
    t1 = time.time()
    event_name = "P003"
    feat_dir = "kmeans/"
    feat_dim = 50
    output_file = "mfcc_pred/svm.%s.model" % event_name
    fread = open("list/train", "r")
    clf = SVC(probability=True)
    X, Y = [], []
    for i in fread.readlines():
        i = i.split(" ")
        line = i[0]
        label = i[1].replace('\n', '')
        kmeans_path = "kmeans/" + line + ".kmeans.txt"
        if os.path.exists(kmeans_path):
            kmeans_feat = numpy.genfromtxt(kmeans_path, delimiter=";")
        else:
            kmeans_feat = numpy.zeros(feat_dim)
            label = "NULL"
        if label != event_name:
            label = "NULL"
        X.append(kmeans_feat)
        Y.append(label)