import os

import joblib
from sklearn.svm import SVC
from timeit import default_timer as timer

# loadData, FilterQuestions and prepareDoc2Vec are project helpers defined elsewhere.
def learnPhase():
    # Skip training if a saved model already exists.
    if os.path.isfile("Doc2VecSVMNauceni.pkl"):
        return None
    tablecolrow = loadData("train.csv")
    tablecolrow[3] = FilterQuestions(tablecolrow[3])
    tablecolrow[4] = FilterQuestions(tablecolrow[4])
    model = prepareDoc2Vec(tablecolrow[3], tablecolrow[4])
    # Replace each question text with its inferred Doc2Vec vector.
    for i in range(len(tablecolrow[3])):
        tablecolrow[3][i] = model.infer_vector(tablecolrow[3][i].split(" "))
        tablecolrow[4][i] = model.infer_vector(tablecolrow[4][i].split(" "))
    traindataX = [None] * len(tablecolrow[3])
    traindataY = [None] * len(tablecolrow[3])
    for i in range(len(traindataX)):
        # Combine the two question vectors (elementwise sum of numpy arrays).
        traindataX[i] = tablecolrow[3][i] + tablecolrow[4][i]
        traindataY[i] = int(tablecolrow[5][i])
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True,
                          max_iter=1000000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(traindataX, traindataY)
    tmEnd = timer()
    print("Learning lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'Doc2VecSVMNauceni.pkl')
    print("Learning progress saved")
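# A minimal sketch of the matching inference step; predictPair is a
# hypothetical helper, assuming the file name used above. Note that
# learnPhase() persists only the SVM, so the fitted Doc2Vec model
# ("model" below) would have to be saved and reloaded as well.
def predictPair(q1, q2, model):
    svmKlasifikator = joblib.load('Doc2VecSVMNauceni.pkl')
    v1 = model.infer_vector(q1.split(" "))
    v2 = model.infer_vector(q2.split(" "))
    # Same feature construction as in training: elementwise sum of the
    # two question vectors.
    return svmKlasifikator.predict_proba([v1 + v2])[0][1]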
class SVMLearner(RLRLearner):
    def __init__(self, data_model):
        super().__init__(data_model)
        self.svm = SVC(kernel='linear', probability=True, tol=0.0001)

    def fit_transform(self, pairs, y):
        y = numpy.array(y)
        if not y.any() and self.y.any():
            # Every new label is negative: inject a synthetic exact-match
            # pair as a positive so both classes stay represented.
            random_pair = random.choice(self.candidates)
            exact_match = (random_pair[0], random_pair[0])
            pairs = pairs + [exact_match]
            y = numpy.concatenate([y, [1]])
        elif (numpy.count_nonzero(y) == len(y) and
              numpy.count_nonzero(self.y) == len(self.y)):
            # Every label is positive: inject a random pair as a negative.
            random_pair = random.choice(self.candidates)
            pairs = pairs + [random_pair]
            y = numpy.concatenate([y, [0]])
        super().fit_transform(pairs, y)

    def fit(self, X, y):
        self.y = y
        self.X = X
        self.svm.fit(X, y)

    def predict_proba(self, examples):
        return self.svm.predict_proba(examples)[:, 1].reshape(-1, 1)
import os

import joblib
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from timeit import default_timer as timer

# FilterQuestions is a project helper defined elsewhere.
def learnModel(data):
    if os.path.isfile("BagOfWordsSVMNauceni.pkl"):
        return None
    data[0] = FilterQuestions(data[0])
    data[1] = FilterQuestions(data[1])
    # Initialize the CountVectorizer object, which is scikit-learn's
    # bag-of-words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=20000)
    # Fit the vocabulary on both question columns together.
    allQuestions = data[0] + data[1]
    vectorizer.fit(allQuestions)
    joblib.dump(vectorizer, 'BagOfWordsVectorizerNauceni.pkl')
    znacajkePitanja = [vectorizer.transform(data[0]),
                       vectorizer.transform(data[1])]
    for i, r in enumerate(data[2]):
        data[2][i] = int(r)
    # Stack the two sparse question matrices side by side.
    znacajkePitanja = hstack(znacajkePitanja).tocsr()
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True,
                          max_iter=1000000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data[2])
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'BagOfWordsSVMNauceni.pkl')
    print("Learning progress saved")
class SVCImpl():
    # Thin wrapper that stores hyperparameters and defers model creation to
    # fit(); SKLModel is presumably sklearn.svm.SVC imported under an alias.
    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
                 coef0=0.0, shrinking=True, probability=False, tol=0.001,
                 cache_size=200, class_weight='balanced', verbose=False,
                 max_iter=-1, decision_function_shape='ovr', random_state=None):
        self._hyperparams = {
            'C': C,
            'kernel': kernel,
            'degree': degree,
            'gamma': gamma,
            'coef0': coef0,
            'shrinking': shrinking,
            'probability': probability,
            'tol': tol,
            'cache_size': cache_size,
            'class_weight': class_weight,
            'verbose': verbose,
            'max_iter': max_iter,
            'decision_function_shape': decision_function_shape,
            'random_state': random_state}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
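# Hypothetical usage of the SVCImpl wrapper on synthetic binary data
# (make_classification and train_test_split are standard scikit-learn
# utilities); gamma is passed explicitly because the wrapper's
# 'auto_deprecated' default targets an older scikit-learn release.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
impl = SVCImpl(kernel='linear', gamma='scale', probability=True).fit(X_tr, y_tr)
print(impl.predict(X_te)[:5])
print(impl.predict_proba(X_te)[:5])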
class SVMClassifier(ClassifierI):
    """Wrapper for the scikit-learn SVM classifier."""

    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto',
                 coef0=0.0, shrinking=True, probability=False, tol=1e-3,
                 cache_size=200, class_weight=None, verbose=False,
                 max_iter=-1, decision_function_shape=None, random_state=None):
        """Init. See scikit-learn."""
        self._clf = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                        coef0=coef0, shrinking=shrinking,
                        probability=probability, tol=tol,
                        cache_size=cache_size, class_weight=class_weight,
                        verbose=verbose, max_iter=max_iter,
                        decision_function_shape=decision_function_shape,
                        random_state=random_state)
        self.classes_ = None

    def __repr__(self):
        return "<SVMClassifier(%r)>" % self._clf

    def classify_many(self, vectors):
        """Classify a batch of samples.

        :param vectors: a doc-term array of vectors
        :return: the predicted class label for each input sample
        :rtype: list
        """
        classes = self.classes_
        return [classes[i] for i in self._clf.predict(vectors)]

    def prob_classify_many(self, vectors):
        """Compute per-class probabilities for a batch of samples.

        :param vectors: a doc-term array of vectors
        :rtype: list of dicts mapping each label to its probability
        """
        y_proba_list = self._clf.predict_proba(vectors)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels learned by this classifier.

        :rtype: list
        """
        return list(self.classes_)

    def train(self, vectors, labels):
        """Train (fit) the scikit-learn SVM classifier.

        :param vectors: a doc-term array of vectors to learn from
        :param labels: a list of labels corresponding to the rows of the
            doc-term array
        """
        # Map arbitrary labels to integer indices for the underlying SVC.
        self.classes_, labels = np.unique(labels, return_inverse=True)
        self._clf.fit(vectors, labels)
        return self

    def _make_probdist(self, y_proba):
        classes = self.classes_
        return dict((classes[i], p) for i, p in enumerate(y_proba))
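# Hypothetical usage of SVMClassifier with toy vectors and string labels:
# train() maps the labels to integer indices for the underlying SVC, and
# classify_many() maps predictions back. decision_function_shape is set
# explicitly because newer scikit-learn releases reject the None default.
demo_vectors = [[0.0, 0.1], [0.2, 0.0], [0.9, 1.0], [1.0, 0.8]]
demo_labels = ["ham", "ham", "spam", "spam"]
demo_clf = SVMClassifier(kernel='linear', probability=True,
                         decision_function_shape='ovr').train(demo_vectors, demo_labels)
print(demo_clf.labels())
print(demo_clf.classify_many([[0.1, 0.1], [0.95, 0.9]]))
print(demo_clf.prob_classify_many([[0.5, 0.5]]))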
class CreateSVC(CreateLinearSVC):
    def fit(self, data, args):
        self.model = SVC(probability=True)

        # Time only the call to fit().
        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
import joblib
from sklearn import metrics
from sklearn.svm import SVC

def svm_train(X, y, model_path):
    model = SVC()
    model.fit(X, y)
    # Evaluate on the training data itself; this measures fit,
    # not generalization.
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
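# A quick smoke test for svm_train on synthetic data; keep in mind that the
# report it prints is computed on the training set itself, so the scores
# are optimistic.
from sklearn.datasets import make_classification

X_smoke, y_smoke = make_classification(n_samples=100, n_features=5, random_state=0)
svm_train(X_smoke, y_smoke, "svm_demo.pkl")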
from sklearn.svm import SVC

def train_svm(params, suffix, train_X, train_Y, test_X, test_Y):
    C = params['C']
    kernel = params['kernel']
    model = SVC(gamma='scale', probability=True, C=C, kernel=kernel)
    print("Params C:", C, "kernel:", kernel)
    model.fit(train_X, train_Y)
    print("Train score", model.score(train_X, train_Y))
    test_score = model.score(test_X, test_Y)
    print("Test score", test_score)
    return test_score, None
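# A minimal grid-search driver for train_svm above; the parameter grid and
# the synthetic data are illustrative, not from the original code.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_grid, y_grid = make_classification(n_samples=300, n_features=10, random_state=0)
tr_X, te_X, tr_Y, te_Y = train_test_split(X_grid, y_grid, random_state=0)
best = max(
    ({'C': C, 'kernel': k} for C in (0.1, 1.0, 10.0) for k in ('linear', 'rbf')),
    key=lambda p: train_svm(p, "grid", tr_X, tr_Y, te_X, te_Y)[0],
)
print("Best params:", best)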
def phoneAccelerometerISVM():
    print("Loading data...")
    data = pd.read_csv("./Train_Phone-Acc-nexus4_1-a.csv")
    print("Done!")
    # Parse the data and build a sit vs. not-sit classification using an SVM.
    # Note: a window width of 500 samples is assumed, and the data length is
    # assumed to be a multiple of the window size.
    print("Finding time series windows indexes for each class kind...")
    previousClassLabel = str(data.at[data.index[0], 'gt'])
    pos = 0
    y = []
    X = []
    window = 500
    while pos < data.shape[0]:
        # Make the y label: +1 for 'sit', -1 for everything else.
        if str(data.iloc[pos]['gt']) == 'sit':
            y.append(1)
        else:
            y.append(-1)
        # Make the X row from the raw y-axis readings in this window.
        X.append(data.iloc[pos:pos + window]['y'])
        # Move to the next window.
        pos += window
    print("Done!")
    # Build and fit the SVM on all of the data.
    print("Training SVM on all accelerometer data...")
    X = np.array(X)
    y = np.array(y)
    # clfs = LinearSVC()
    clfs = SVC()
    clfs.fit(X, y)
    print("Done!")
    # print("Predicting accelerometer classes on all data using SVM...")
    # ypred = predict(X, clfs.coef_.reshape(len(clfs.coef_.ravel()), 1))
    # print("Done!")
    # error = calculateTotalAbsoluteError(y, ypred) / y.shape[0]
    # print("Accelerometer training error (means kind of nothing): %f" % error)
    # Cross validation on a held-out split.
    print("Training SVM on accelerometer training data only...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.1)  # , random_state=0
    clfs = SVC()
    clfs.fit(X_train, y_train)
    yhat = clfs.predict(X_test)
    print("Abs Error = %f" % (calculateTotalAbsoluteError(yhat, y_test) / len(yhat)))
    print("Test data mean accuracy SVM score: %f" % clfs.score(X_test, y_test))
    f1_c0 = f1_score(y_test, clfs.predict(X_test), pos_label=1, average='binary')
    print("Test data f1 score for class +1: %f" % f1_c0)
    print("Done!")
def SVCClassify(self, x_train, y_train):
    '''
    Basic Support Vector Machine classifier.
    '''
    # The kernel can be changed here.
    kernel = 'rbf'
    # Initialize the classifier and train it.
    # probability=True is required if probability estimates are needed later.
    clf = SVC(kernel=kernel, probability=True)
    clf.fit(x_train, y_train)
    return clf
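# Hypothetical usage of SVCClassify: since the classifier is fit with
# probability=True, predict_proba is available alongside predict. _Demo is
# an invented stand-in for whatever class defines the method.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

class _Demo:
    SVCClassify = SVCClassify  # reuse the method defined above

X_d, y_d = make_classification(n_samples=120, n_features=6, random_state=1)
x_tr, x_te, y_tr2, y_te2 = train_test_split(X_d, y_d, random_state=1)
clf = _Demo().SVCClassify(x_tr, y_tr2)
print(clf.predict(x_te)[:5])
print(clf.predict_proba(x_te)[:5])  # columns: P(class 0), P(class 1)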
def cross_validate(samples, labels, outputDir):
    '''
    Perform K(=10)-fold cross validation.
    '''
    K = 10
    fold_samples, fold_labels = cv_split(samples, np.array(labels), K)
    log_loss = [['Log Loss'], []]
    total_ll = 0.0
    for fold in range(K):
        # Train on every fold except the current one.
        samples_chunk = fold_samples[:fold] + fold_samples[fold + 1:]
        labels_chunk = fold_labels[:fold] + fold_labels[fold + 1:]
        # Train an L1-regularized logistic regression.
        logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1')
        logRegrL1.fit(np.concatenate(samples_chunk, axis=0),
                      np.concatenate(labels_chunk, axis=0))
        # Train an SVM with a linear kernel.
        svmLin = SVC(kernel='linear', probability=True)
        svmLin.fit(np.concatenate(samples_chunk, axis=0),
                   np.concatenate(labels_chunk, axis=0))
        # Train a random forest classifier.
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(np.concatenate(samples_chunk, axis=0),
                np.concatenate(labels_chunk, axis=0))
        # Test on the held-out fold.
        val = [i for i in range(len(fold_labels[fold]))]
        id = 0
        for item in fold_samples[fold]:
            # predict_proba: first component is P(class 0), second is P(class 1).
            predictionL1 = logRegrL1.predict_proba(item)
            predictionSvmLin = svmLin.predict_proba(item)
            predictionRfc = rfc.predict_proba(item)
            # Average the three model predictions as the final health-status
            # prediction.
            val[id] = (predictionL1[0][1] + predictionSvmLin[0][1] +
                       predictionRfc[0][1]) / 3.0
            id = id + 1
        for i in range(len(fold_labels[fold])):
            total_ll += logloss(fold_labels[fold][i], val[i])
    log_loss[1] = total_ll / len(samples)
    # Save a csv file named Dota2Val.csv in the output directory.
    np.savetxt(outputDir + "\\Dota2Val.csv", log_loss, delimiter=',', fmt='%s')
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir):
    # Train an L1-regularized logistic regression.
    logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1')
    logRegrL1.fit(samples, labels)
    # Train an SVM with a linear kernel.
    svmLin = SVC(kernel='linear', probability=True)
    svmLin.fit(samples, labels)
    # Train a random forest classifier.
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(samples, labels)
    # Test set
    testDir = inputDir + "/set_test"
    testFiles = sorted([join(testDir, f) for f in listdir(testDir)
                        if isfile(join(testDir, f))], key=numericalSort)
    # Read feature vectors of the test images.
    testSamples = cubeVoxelsVar(testFiles)
    testSamples = feature_selector.transform(testSamples)
    print(len(testSamples))
    # 2D array for the final prediction in the format (ID, Prediction).
    final = [[0 for j in range(2)] for i in range(139)]
    final[0][0] = 'ID'
    final[0][1] = 'Prediction'
    id = 1
    # Predict the health status of each test image with the three models
    # trained above.
    for item in testSamples:
        # predict_proba: first component is P(class 0), second is P(class 1).
        predictionL1 = logRegrL1.predict_proba(item)
        predictionSvmLin = svmLin.predict_proba(item)
        predictionRfc = rfc.predict_proba(item)
        final[id][0] = id
        # Average the three model predictions as the final health-status
        # prediction.
        final[id][1] = (predictionL1[0][1] + predictionSvmLin[0][1] +
                        predictionRfc[0][1]) / 3.0
        id = id + 1
    # Save a csv file named final_sub.csv in the output directory.
    np.savetxt(outputDir + "/final_sub.csv", final, delimiter=',', fmt='%s')
import joblib
from sklearn.svm import SVC
from timeit import default_timer as timer

# get_avg is a project helper defined elsewhere (presumably averaged
# Word2Vec features, given the model file name).
def learnModel(train):
    data = []
    for duplicate in train["is_duplicate"]:
        data.append(int(duplicate))
    znacajkePitanja = get_avg(train)
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True,
                          max_iter=10000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data)
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'Word2VecSVMNauceni.pkl')
    print("Learning progress saved")
def train_all(self, g):
    # Train on the union of the training and validation sets.
    X = np.concatenate([self.train_X, self.val_X], axis=0)
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    # One-vs-rest: train one binary SVM per class (labels 1..3).
    for i in range(3):
        y = np.concatenate([self.train_y, self.val_y], axis=0)
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model_a.append(clf.fit(X, y))
def classifier_panchenko2016(X_train, y_train, X_test, y_test,
                             separateClassifier=False):
    train_or_test_labels = (["train" for i in y_train] +
                            ["test" for i in y_test])
    y_train, X_train, y_test, X_test = outlier_removal(
        train_or_test_labels, X_train + X_test, y_train + y_test)
    y_train, X_train = features_extraction(
        y_train, X_train,
        separateClassifier=separateClassifier,
        featuresCount=100)
    y_test, X_test = features_extraction(
        y_test, X_test,
        separateClassifier=separateClassifier,
        featuresCount=100)
    # Scale features to [0, 1]; the scaler is fit on the training set only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    classifier = SVC(kernel="rbf", C=2e11, gamma=2e-1, max_iter=5000,
                     class_weight="balanced", verbose=1)
    print("fitting")
    classifier.fit(X_train, y_train)
    print("testing")
    y_predictions = classifier.predict(X_test)
    return y_test, y_predictions
def train(self, g):
    self.model = []
    X = self.train_X.copy()
    if self.use_scale:
        self.scale.fit(X)
        X = self.scale.transform(X)
    # One-vs-rest: train one binary SVM per class (labels 1..3).
    for i in range(3):
        y = self.train_y.copy()
        y[y != i + 1] = 0
        y[y != 0] = 1
        clf = SVC()
        clf.set_params(**g)
        self.model.append(clf.fit(X, y))
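# train_all() and train() above implement one-vs-rest by hand, relabelling
# class i+1 as 1 and everything else as 0. A sketch of the equivalent setup
# with scikit-learn's OneVsRestClassifier, on synthetic 3-class data:
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

X_ovr, y_ovr = make_classification(n_samples=150, n_features=8, n_classes=3,
                                   n_informative=4, random_state=0)
ovr = OneVsRestClassifier(SVC()).fit(X_ovr, y_ovr + 1)  # labels 1..3 as above
print(ovr.predict(X_ovr[:5]))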
# Read age labels of the training images.
labels = []
for t in targets:
    labels.append(t[0])
# Train a LASSO regressor; the alpha value was tuned to produce the best
# result when used alone on the test set.
regrL = linear_model.Lasso(alpha=15.0)
regrL.fit(samples, labels)
# Train a Ridge regressor; alpha tuned the same way.
regrR = linear_model.Ridge(alpha=1e-13, normalize=True)
regrR.fit(samples, labels)
# Train an SVM with a linear kernel.
regrS = SVC(kernel='linear')
regrS.fit(samples, labels)
# Train a random forest classifier.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(samples, labels)
# Test set
testDir = inputDir + "\\set_test"
testFiles = sorted([join(testDir, f) for f in listdir(testDir)
                    if isfile(join(testDir, f))], key=numericalSort)
# Read feature vectors of the test images.
testSamples = readVoxels(testFiles)
print(len(testSamples))
X_train, Y_train = train_data_set.convert_2_binary_format()

test_data_set = DataSet()
test_data_set.load(config.get_value('test'), class_index, has_header=False)
Xtest, Ytest = test_data_set.convert_2_binary_format_with(
    X_train.item_dict, Y_train.item_dict)
Ytest = Ytest.flatten()
class_count = train_data_set.number_of_classes()

unexpected_rules = IOHelper.load_json_object(config.get_value('rules'))
refined_unexpected_rules = filter_association_rules(unexpected_rules)

print('svm testing...')
svc_model = SVC(kernel='poly', degree=3, coef0=0.1, random_state=1)
svc_model.fit(X_train.relation_matrix, Y_train.values.flatten())
svc_y_pred = svc_model.predict(Xtest)
print(f1_score(Ytest, svc_y_pred, average=None))
if class_count <= 2:
    # ROC/AUC is reported only for the binary case.
    fpr, tpr, _ = roc_curve(Ytest, svc_y_pred.flatten())
    print(auc(fpr, tpr))
refine_with_unexpectedness(test_data_set, Y_train.item_dict, svc_y_pred,
                           Ytest, refined_unexpected_rules)

print('Random forest testing...')
rf_model = RandomForestClassifier(n_estimators=20, random_state=1)
rf_model.fit(X_train.relation_matrix, Y_train.values.flatten())
rf_y_pred = rf_model.predict(Xtest)
train_arrays = []
train_labels = []
test_arrays = []
test_labels = []
for email in emails:
    email_id = email.id
    prefix_train_pos = 'email_' + str(email_id)
    # Every fifth email goes to the test split, the rest to training.
    if email_id % 5 != 0:
        train_arrays.append(model.docvecs[prefix_train_pos])
        train_labels.append(int(email.label))
    else:
        test_arrays.append(model.docvecs[prefix_train_pos])
        test_labels.append(int(email.label))

classifier = SVC()
classifier.fit(numpy.array(train_arrays), numpy.array(train_labels))
print("Overall score is %f." % classifier.score(numpy.array(test_arrays),
                                                numpy.array(test_labels)))

corrects = []
wrongs = []
for email in emails:
    email_id = email.id
    prefix_train_pos = 'email_' + str(email_id)
    if email_id % 5 == 0:
        prediction = classifier.predict([model.docvecs[prefix_train_pos]])[0]
        actual = int(email.label)
        if prediction != actual:
            wrongs.append((email.id, prediction, actual))
        else:
            # print(max(classifier.predict_proba([model.docvecs[prefix_train_pos]])[0]), actual)
            pass
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir):
    # Test set
    testDir = inputDir + "\\set_test"
    testFiles = sorted([join(testDir, f) for f in listdir(testDir)
                        if isfile(join(testDir, f))], key=numericalSort)
    # Different features for gender...
    testSamples_gender = cubeVoxelsVar_gender(testFiles)
    # ...and the same features for age and health.
    testSamples_age = cubeVoxelsVar_age(testFiles)
    testSamples_health = testSamples_age
    testSamples = [testSamples_gender, testSamples_age, testSamples_health]
    # 2D array for the final prediction in the format
    # (ID, Sample, Label, Predicted).
    final = [[0 for j in range(4)] for i in range(1 + 138 * 3)]
    final[0][0] = 'ID'
    final[0][1] = 'Sample'
    final[0][2] = 'Label'
    final[0][3] = 'Predicted'
    total_labels = ['gender', 'age', 'health']
    for label in range(3):
        print("Prediction for label %d started!" % label)
        id_count = label
        # Train logistic regression.
        logRegrL1 = linear_model.LogisticRegression()
        logRegrL1.fit(samples[label], labels[label])
        # Train an SVM with a linear kernel.
        svmLin = SVC(kernel='linear')
        svmLin.fit(samples[label], labels[label])
        # Train a random forest classifier.
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(samples[label], labels[label])
        print("Training complete!")
        # Feature selection is applied only for age and health.
        if label == 0:
            testSamples_curr = testSamples[label]
        else:
            testSamples_curr = feature_selector[label].transform(
                testSamples[label])
        print(len(testSamples_curr))
        id = label + 1
        # Predict gender, age and health status of each test image with the
        # three models trained above, then take a majority vote.
        for sampleNum, sample in enumerate(testSamples_curr):
            predictionL1 = logRegrL1.predict(sample)
            predictionSvmLin = svmLin.predict(sample)
            predictionRfc = rfc.predict(sample)
            final[id][0] = id_count
            final[id][1] = sampleNum
            final[id][2] = total_labels[label]
            votes = predictionL1[0] + predictionSvmLin[0] + predictionRfc[0]
            final[id][3] = 'TRUE' if votes >= 2.0 else 'FALSE'
            id = id + 3
            id_count = id_count + 3
        print('Prediction done!')
    # Save a csv file named final_sub.csv in the output directory.
    np.savetxt(outputDir + "\\final_sub.csv", final, delimiter=',', fmt='%s')
"Sidorovbigramsdeprel" ] print("train: ", len(tweets_training)) print("test: ", len(tweets_test)) X, X_test, feature_name, feature_index = feature_manager.create_feature_space( tweets_training, feature_type, tweets_test) print(feature_name) print("feature space dimension X:", X.shape) print("feature space dimension X_test:", X_test.shape) clf = SVC(kernel="linear") clf.fit(X, labels_training) test_predict = clf.predict(X_test) """prec, recall, f, support = precision_recall_fscore_support( labels_test, test_predict, beta=1) accuracy = accuracy_score( test_predict, labels_test ) print(prec, recall, f, support ) print(accuracy)""" for i in range(0, len(tweets_test)):
label_string += '0 '
fopen.close()

# Generate the label vector.
label_vec = numpy.fromstring(label_string.strip(), dtype=int, sep=' ')
print('Totally we get %s labels' % label_vec.shape[0])  # for debugging

# Create the feature matrix, in which each row represents a video.
video_num = len(video_list)
feat_mat = numpy.zeros([video_num, feat_dim])
for i in range(video_num):
    # BOW features of this video.
    feat_vec = numpy.genfromtxt(feat_dir + video_list[i],
                                dtype=numpy.float32, delimiter=";")
    assert feat_vec.shape[0] == feat_dim
    # Fill the feature vector into the matrix.
    feat_mat[i, :] = feat_vec

# Initialize the SVM with a chi-squared kernel.
svm = SVC(kernel=chi2_kernel)
# svm = SVC(probability=True)

# Train the SVM model.
svm.fit(feat_mat, label_vec)

# Finally, save the trained SVM.
pickle.dump(svm, open(output_file, "wb"), pickle.HIGHEST_PROTOCOL)
print('SVM trained successfully for event %s!' % event_name)
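# The chi2_kernel above comes from sklearn.metrics.pairwise and is only
# defined for non-negative features, which bag-of-words counts satisfy.
# A tiny self-contained sketch of using it as a custom SVC kernel:
import numpy
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.svm import SVC

bow_demo = numpy.random.RandomState(0).poisson(1.0, size=(20, 5)).astype(float)
labels_demo = numpy.array([0, 1] * 10)
svm_demo = SVC(kernel=chi2_kernel).fit(bow_demo, labels_demo)
print(svm_demo.predict(bow_demo[:3]))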
# Per-line parsing (the enclosing loop over the training list is not shown).
audio_name = line.split(" ")[0]
count = count + 1
# (debug) optionally print count every 100 lines
label = line.split(" ")[1].split("\n")[0]
if "imtraj" in feat_dir:
    feat_vec = import_imtraj_txt(feat_dir + audio_name + ".spbof")
else:
    feat_vec = np.genfromtxt(feat_dir + audio_name, delimiter=";")
# Binary label: 1 for the target event, 0 for everything else.
if label == event_name:
    label = 1
    pos_count += 1
else:
    label = 0
    neg_count += 1
if len(X) == 0:
    X = [feat_vec]
else:
    X = np.append(X, [feat_vec], axis=0)
Y = Y + [label]

print("Data loading finished, positive " + str(pos_count) +
      " negative " + str(neg_count))
# pipe_lrSVC = SVC(C=10, gamma=0.0001, probability=True)
pipe_lrSVC = SVC(probability=True)
# svm = LinearSVC(C=10)
# pipe_lrSVC = CalibratedClassifierCV(svm)
pipe_lrSVC.fit(preprocessing.scale(X), Y)
pickle.dump(pipe_lrSVC, open(output_file + '.pickle', 'wb'))
print('SVM trained successfully for event %s!' % event_name +
      " round num %s" % round_num)
X_val, Y_val = get_data("val_esea_real.lst", ranks)

trn_embedding = embed(X_trn, triplet_model)
val_embedding = embed(X_val, triplet_model)
# print(X_trn, X_val)
# print(trn_embedding, val_embedding)
# print(triplet_model.get_weights())

clf = SVC(
    # class_weight='balanced',
    probability=True,
    # tol=1e-4,
)
clf.fit(trn_embedding, Y_trn)
print(clf.score(val_embedding, Y_val))
print(clf.predict_proba(val_embedding))
print(roc_auc_score(Y_val, clf.predict(val_embedding)))
print(classification_report(Y_val, clf.predict(val_embedding), digits=4))

all_files = [x[:-8] for x in os.listdir(ALL_FILES)]
X = [
    pickle.load(open(os.path.join(FEATURE_PATH, x + ".fkmeans"), "rb"),
                encoding='latin1')
    for x in all_files
]
# Y = [ranks[x.split()[0].strip()] for x in all_files]
proba = clf.predict_proba(embed(np.array(X), triplet_model))
# Read in the features.
features = []
for video_id in video_ids:
    feat_path = feat_dir + video_id + "." + feat_suffix
    feature = [0] * feat_dim
    if os.path.exists(feat_path) is True:
        if feat_type == 'dense':
            feature = numpy.genfromtxt(feat_path, delimiter=';')
        else:
            # Sparse "key:value" format with 1-indexed keys.
            line = numpy.genfromtxt(feat_path, delimiter=' ', dtype=str)
            if len(line.shape) == 0:
                line = numpy.array([line])
            for item in line:
                if len(item) == 0:
                    continue
                tokens = item.split(':')
                key = int(tokens[0]) - 1
                value = float(tokens[1])
                if key < feat_dim:
                    feature[key] = value
    features.append(feature)

# Train the SVM.
clf = SVC(probability=True)
clf.fit(features, labels)

# Dump the model.
with open(output_file, 'wb') as f:
    pickle.dump(clf, f)
print('SVM trained successfully for event %s!' % event_name)
    [list(feature_names).index(f) for f in feature_filtered])
feature_index_filtered = numpy.concatenate(
    feature_index_global[list(feature_index_filtered)])
# print(feature_name_global[feature_index_filtered])
X_filter = X[:, feature_index_filtered]
# print(feature_filtered, X.shape, X_filter.shape)

predict = []
golden = []
for index_train, index_test in kf:
    X_train = X_filter[index_train]
    X_test = X_filter[index_test]
    clf = SVC(kernel='linear')
    clf.fit(X_train, stance[index_train])
    test_predict = clf.predict(X_test)
    predict = numpy.concatenate((predict, test_predict))
    golden = numpy.concatenate((golden, stance[index_test]))

prec, recall, f, support = precision_recall_fscore_support(golden, predict,
                                                           beta=1)
accuracy = accuracy_score(golden, predict)
print('"' + (' '.join(feature_filtered)) + '"' + '\t' +
      str((f[0] + f[1] + f[2]) / 3) + '\t' +
      str((f[1] + f[2]) / 2) + '\t' +
      str(prec) + '\t' + str(recall) + '\t' + str(f) + '\n')
feat_dir = "kmeans/" feat_dim = 50 output_file = "mfcc_pred/svm.%s.model" % event_name fread = open("list/train", "r") clf = SVC(probability=True) X, Y = [], [] for i in fread.readlines(): i = i.split(" ") line = i[0] label = i[1].replace('\n', '') kmeans_path = "kmeans/" + line + ".kmeans.txt" if os.path.exists(kmeans_path): kmeans_feat = numpy.genfromtxt(kmeans_path, delimiter=";") else: kmeans_feat = numpy.zeros(feat_dim) label = "NULL" if label != event_name: label = "NULL" X.append(kmeans_feat) Y.append(label) X = numpy.array(X) Y = numpy.array(Y) clf.fit(X, Y) cPickle.dump(clf, open(output_file, "wb")) print " " t2 = time.time() - t1 print "Time taken for training %s SVM : %f seconds" % (event_name, t2) print 'SVM trained successfully for event %s!' % event_name
    Y_all = Y
    i = 1
else:
    X_all = np.vstack((X_all, X))
    Y_all = np.append(Y_all, Y)
    i += 1

clf = SVC(kernel=laplacian_kernel)
features = ['.cnn.', '.mfcc.', '.asr.']
X_score = np.zeros((len(X_all), 3))
j = 0
for j, feature in enumerate(features):
    # Slice out this feature's block of columns.
    X = X_all[:, dimension_i[j]:dimension_i[j] + dimension_i[j + 1]]
    clf.fit(X, Y_all)
    print('saving scores...')
    X_score[:, j] = clf.decision_function(X)
    # print(X_score[:, j])
    np.save(event_name + feature + 'score', X_score[:, j])
    pickle.dump(clf, open(output_file + feature + 'score', "wb"))

# Late fusion: a linear SVM stacked on the per-feature decision scores.
clf = SVC(kernel='linear')
clf.fit(X_score, Y_all)
fread.close()
pickle.dump(clf, open(output_file, "wb"))
print('SVM trained successfully for event %s!' % event_name)
    test_size=0.2, random_state=42)

#:# preprocessing
transform_pipeline = Pipeline([('scaler', StandardScaler())])
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train),
                       columns=X_train.columns)

#:# model
params = {'gamma': 5, 'kernel': 'sigmoid', 'probability': True}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# aad366f6d5961bc98783c2ad9fb3918d
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
file_read.close()

# The training data is highly imbalanced.
X_df = np.array(X_df)
y_df = np.array(y_df)
# print(Counter(y_df))
# print(X_df.shape)

# Apply SMOTE resampling to deal with the imbalance.
smote = SMOTE()
X_df_res, y_df_res = smote.fit_sample(X_df, y_df)
# print(Counter(y_df_res))

# Train the SVM with a chi-squared kernel.
clf = SVC(kernel=chi2_kernel, class_weight='balanced',
          C=2.0, gamma='scale', probability=True, random_state=0)
# clf = SVC(kernel='rbf', class_weight='balanced',
#           C=1.0, gamma='scale', probability=True, random_state=0)
# clf.fit(X_df, y_df)
clf.fit(X_df_res, y_df_res)

# Save the SVC model.
with open(output_file, 'wb') as f:
    pickle.dump(clf, f)
print('SVM trained successfully for event %s!' % event_name)
# The "or random.random() > 0" clause makes this condition effectively
# always true, so no samples are skipped.
if Y_label != 'NULL' or random.random() > 0:
    # Binary label: 1 for the target event, 0 otherwise.
    if Y_label == event_name:
        Y = 1
    else:
        Y = 0
    if i == 0:
        X_all = X
        Y_all = Y
        i = 1
    else:
        X_all = np.vstack((X_all, X))
        Y_all = np.append(Y_all, Y)
        i += 1
# print(i)
# print(np.sum(X_all, axis=1))
# print(X_all, Y_all)

clf = SVC(kernel=chi2_kernel)
# clf = SVC()
clf.fit(X_all, Y_all)
print(clf.score(X_all, Y_all))
print(clf.predict(X_all))
fread.close()
pickle.dump(clf, open(output_file, "wb"))
print('SVM trained successfully for event %s!' % event_name)
    i += 1

clf = SVC(kernel=laplacian_kernel)
features = ['.cnn.', '.mfcc.', '.asr.', '.cnn+mfcc.', '.cnn+mfcc+asr.']
X_score = np.zeros((len(X_all), len(features)))
j = 0
for j, feature in enumerate(features):
    if j <= 2:
        # A single feature's block of columns.
        X = X_all[:, dimension_i[j]:dimension_i[j] + dimension_i[j + 1]]
    elif j == 3:
        # cnn+mfcc: the first 150 columns.
        X = X_all[:, 0:150]
    elif j == 4:
        # cnn+mfcc+asr: all columns.
        X = X_all
    clf.fit(X, Y_all)
    print('saving scores...')
    X_score[:, j] = clf.decision_function(X)
    # print(X_score[:, j])
    np.save(event_name + feature + 'score', X_score[:, j])
    pickle.dump(clf, open(output_file + feature + 'score', "wb"))

# Late fusion: a linear SVM over the per-feature decision scores
# (the cnn+mfcc column is skipped).
clf = SVC(kernel='linear')
clf.fit(X_score[:, [0, 1, 2, 4]], Y_all)
fread.close()
pickle.dump(clf, open(output_file, "wb"))
print('SVM trained successfully for event %s!' % event_name)