def train_data_SVC(X, y):
    """ Create and train the Support Vector Machine. """
    classif = OneVsRestClassifier(LinearSVC())
    classif.fit(X, y)
    return classif
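# A minimal usage sketch for train_data_SVC above. The synthetic data and the
# train/test split are illustrative assumptions, not part of the original
# source.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=200, n_features=10,
                                     n_informative=5, n_classes=3,
                                     random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
clf_demo = train_data_SVC(X_tr, y_tr)
print(clf_demo.score(X_te, y_te))  # mean accuracy on the held-out split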
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"],
         ["ham", "eggs"], ["ham"]]
    # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]]
    Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])
    classes = set("ham eggs spam".split())
    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(), ElasticNet(),
                     Lasso(alpha=0.5)):
        # test input as lists of tuples
        clf = assert_warns(DeprecationWarning,
                           OneVsRestClassifier(base_clf).fit, X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_equal(set(y_pred), set(["spam", "eggs"]))
        assert_true(clf.multilabel_)
        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
def test_decision_function_shape_two_class():
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for estimator in [svm.SVC, svm.NuSVC]:
            clf = OneVsRestClassifier(
                estimator(decision_function_shape="ovr")).fit(X, y)
            assert_equal(len(clf.predict(X)), len(y))
def roc(features_trunc, labels, categories, classifier):
    """ compute and plot the roc curve for the given classifier

    features_trunc - features matrix truncated to the k best features
    labels - the classes of the data
    categories - different possible categories (66 for subcategories or
                 14 for categories)
    classifier - MultinomialNB or lda
    """
    # divide the data into training and test set
    features_train, features_test, categoryids_train, categoryids_test = \
        train_test_split(features_trunc, labels, test_size=.1, random_state=0)
    # define the OneVsRestClassifier with the given classifier (LDA or Naive Bayes)
    clf = OneVsRestClassifier(classifier)
    # train the classifier and compute the probabilities for the test data labels
    clf_fit = clf.fit(features_train, categoryids_train)
    labels_score = clf_fit.predict_proba(features_test)
    # binarize the labels (necessary for the roc curve)
    categoryids_test = label_binarize(categoryids_test, classes=categories)
    # compute the false positive rate, true positive rate and the thresholds
    fpr, tpr, thresholds = metrics.roc_curve(categoryids_test.ravel(),
                                             labels_score.ravel())
    # compute the area under the curve
    roc_auc = metrics.auc(fpr, tpr)
    # plot the roc curve
    pl.clf()
    pl.plot(fpr, tpr, 'r',
            label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc),
            linewidth=2)
    pl.plot([0, 1], [0, 1], 'k--', linewidth=2)
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.05])
    pl.xlabel('false positive rate')
    pl.ylabel('true positive rate')
    pl.title('Receiver operating characteristic for micro-averaged '
             'classification scores')
    pl.legend(loc="lower right")
    pl.show()
def train(self, trainfile_name):
    print >>sys.stderr, "Reading data.."
    train_data = [tuple(x.strip().split("\t"))
                  for x in codecs.open(trainfile_name, "r", "utf-8")]
    shuffle(train_data)
    filter_feature = get_filter()
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    print >>sys.stderr, "Indexing features.."
    self.fp.index_data(train_clauses, filter_feature)
    X = numpy.asarray([self.fp.featurize(clause, filter_feature)
                       for clause in train_clauses])
    tagset = list(set(train_labels))
    tag_index = {l: i for (i, l) in enumerate(tagset)}
    Y = numpy.asarray([[tag_index[label]] for label in train_labels])
    classifier = OneVsRestClassifier(SVC(kernel='linear'))
    if self.cv:
        print >>sys.stderr, "Starting cross-validation for %d folds.." % (self.folds)
        y = [l[0] for l in Y]
        scores = cross_validation.cross_val_score(classifier, X, y,
                                                  cv=self.folds,
                                                  scoring='f1_weighted')
        print >>sys.stderr, "Scores:", scores
        print >>sys.stderr, "Average: %0.4f (+/- %0.4f)" % (scores.mean(),
                                                            scores.std() * 2)
    print >>sys.stderr, "Starting training.."
    classifier.fit(X, Y)
    pickle.dump(classifier, open(self.trained_model_name, "wb"))
    pickle.dump(self.fp.feat_index, open(self.feat_index_name, "wb"))
    pickle.dump(tagset, open(self.stored_tagset, "wb"))
    print >>sys.stderr, "Done"
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        # Decision-function-only estimator; predict_proba fails in the
        # current implementation.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)
        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)
        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = [tuple(l.nonzero()[0]) for l in (Y_proba > 0.5)]
        assert_equal(pred, Y_pred)
def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0]])
    classes = set("ham eggs spam".split())
    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(), ElasticNet()):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_equal(set(y_pred), set("eggs"))
        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
def svm():
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    x_train.todense()
    x_test, y_test = load_svmlight_file("12testdata")
    x_test.todense()
    sk = SelectKBest(f_classif, 9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    # classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)
    # write result
    with open("result.txt", "w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st) + '\n')
    print(np.array(y_pred).shape)
    target_names = ['0', '1', '2', '3']
    # result
    # sum_y = np.sum((np.array(y_pred) - np.array(y_test)) ** 2)
    # print(classification_report(y_test, y_pred, target_names=target_names))
    # print("sougouVal: ", float(sum_y) / y_pred.shape[0])
    print(time.time() - start_time)
def main():
    dataTuples = getDataInFormat()
    print "Length of dataTuples is: ", len(dataTuples)
    shuffle(dataTuples)
    trainTuples = dataTuples
    del dataTuples
    ids, labels, vectors = getLabelsAndVectors(trainTuples)
    del trainTuples
    followerCountsList = loadFollowerCountsFromFile()
    space = getSpace(vectors)
    reducedSpace = getReducedSpace(vectors, space)
    spaceWithMetaFeatures = augmentSpace(reducedSpace, emotionFeatures)
    print "Total # of features in your space is: ", len(space)
    print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors = getOneHotVectors(ids, labels, vectors,
                                     spaceWithMetaFeatures, followerCountsList)
    trainVectors, trainLabels = getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    clf = OneVsRestClassifier(SVC(C=1, kernel='linear', gamma=0.1,
                                  verbose=False, probability=False))
    clf.fit(trainVectors, trainLabels)
    print "\nDone fitting classifier on training data...\n"
    print "=" * 50, "\n"
    print "Results with 10-fold cross validation:\n"
    print "=" * 50, "\n"
    predicted = cross_validation.cross_val_predict(clf, trainVectors,
                                                   trainLabels, cv=10)
    print "*" * 20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*" * 20
    print "precision_score\t", metrics.precision_score(trainLabels, predicted)
    print "recall_score\t", metrics.recall_score(trainLabels, predicted)
    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)
def train_svm(X, y):
    """ Create and train the Support Vector Machine. """
    svm = OneVsRestClassifier(SVC(C=1000000.0, gamma='auto', kernel='rbf'))
    svm.fit(X, y)
    return svm
def AgeClassifier(data_feature_stack, data_age_stack, test_size=0.5):
    Age_range = np.unique(data_age_stack)  # 923, 1529, 856, 1617, 13836, 6260, 1198
    AgeX_train, AgeX_test, AgeY_train, AgeY_test = preprocess(
        data_feature_stack, data_age_stack, test_size)
    print "fitting Age Classifier..."
    # parameters = (C=1.0, class_weight=None, dual=True, fit_intercept=True,
    #               intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
    #               random_state=0, tol=0.0001, verbose=0)
    clf = OneVsRestClassifier(LinearSVC(C=0.001)).fit(AgeX_train, AgeY_train)
    print "predicting Age..."
    Age_test_result = clf.predict(AgeX_test)
    Age_train_result = clf.predict(AgeX_train)
    # Age_acc_test = clf.score(AgeX_test, AgeY_test)
    # Age_acc_train = clf.score(AgeX_train, AgeY_train)
    # use the mean rather than the sum, so these are accuracies, not raw counts
    Age_acc_test = np.mean(Age_test_result == AgeY_test)
    Age_acc_train = np.mean(Age_train_result == AgeY_train)
    temp = Age_test_result - AgeY_test
    error = np.sqrt(temp ** 2)
    rmse = np.mean(error)
    error2 = np.sqrt(temp[temp != 0] ** 2)
    rmse2 = np.mean(error2)
    # pdb.set_trace()  # leftover debugging breakpoint, disabled
    return clf, Age_acc_test, Age_acc_train
def trainAndPredictLR(trainX, trainY, testX):
    """
    Logistic regression is used to predict the target labels of the test data.
    The probability of belonging to each label is predicted for every test
    sample, and the labels with the top 10 probability values are extracted.

    Input:
        1. trainX: nTrainingSamples x 2000 numpy matrix of training features
        2. trainY: nTrainingSamples x 185 numpy matrix of training labels
        3. testX: nTestSamples x 2000 numpy matrix of test features
    Output:
        testY: nTestSamples x 10 numpy matrix of predicted labels for the test data
    """
    clf = OneVsRestClassifier(LogisticRegression(C=1.0))
    clf.fit(trainX, trainY)
    actY = clf.predict_proba(testX)
    testY = []
    # fetch the labels with the highest probabilities
    for prob in actY:
        y = []
        for i in range(10):
            index = np.argmax(prob, axis=0)
            classVal = classOrder[index]
            y.append(classVal)
            prob[index] = -1
        testY.append(y)
    return np.array(testY)
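# Note on the top-10 extraction above: the repeated argmax-and-invalidate loop
# is equivalent to a single vectorized argsort. A minimal self-contained
# sketch (the probability matrix and class list here are illustrative
# assumptions, not data from the original source):
import numpy as np

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.2, 0.3]])
class_order = ["ham", "spam", "eggs"]
top_k = 2
# indices of the top_k probabilities per row, highest first
top_idx = np.argsort(probs, axis=1)[:, ::-1][:, :top_k]
top_labels = [[class_order[i] for i in row] for row in top_idx]
print(top_labels)  # [['spam', 'eggs'], ['ham', 'eggs']]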
def run(data_path):
    print "Reading the dataset:", data_path
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
    # Truncate the data
    n_train = 600
    n_test = 400
    # Define training and testing sets
    indices = arange(len(mnist.data))
    random.seed(0)
    train_idx = random.sample(indices, n_train)
    test_idx = random.sample(indices, n_test)
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]
    # Apply a learning algorithm
    print "Applying a learning algorithm..."
    clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)
    # Make a prediction
    print "Making predictions..."
    y_pred = clf.predict(X_test)
    print y_pred
    # Evaluate the prediction
    print "Evaluating results..."
    print "Precision: \t", metrics.precision_score(y_test, y_pred)
    print "Recall: \t", metrics.recall_score(y_test, y_pred)
    print "F1 score: \t", metrics.f1_score(y_test, y_pred)
    print "Mean accuracy: \t", clf.score(X_test, y_test)
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    import numpy as np
    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)

    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)

        from lxml import etree
        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        for i in range(len(test_sentences)):
            curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1])
                         if predictions[i][x] == 1]
            etree.SubElement(document, "Sent",
                             classes=", ".join(curr_pred)).text = test_sentences[i]
        doc_tree.write(output_file_path)
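# Side note on the MultiLabelBinarizer used above: the per-row list
# comprehension that recovers label names from the indicator matrix can also
# be written with mlb.inverse_transform. A self-contained sketch (the toy
# labels are illustrative assumptions):
from sklearn.preprocessing import MultiLabelBinarizer

mlb_demo = MultiLabelBinarizer()
indicator = mlb_demo.fit_transform([["a", "b"], ["b"], ["a", "c"]])
print(mlb_demo.inverse_transform(indicator))  # [('a', 'b'), ('b',), ('a', 'c')]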
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models.
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)
    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10, 12,
                                            1, 3, 5, 7, 9, 11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)
    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
def fit(self, df_X, df_y):
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    le = LabelEncoder()
    y = le.fit_transform(df_y.iloc[:, 0].values)

    clf = RandomForestClassifier(n_estimators=100)
    if len(le.classes_) > 2:
        # Multiclass
        orc = OneVsRestClassifier(clf)
        orc.fit(df_X.values, y)
        importances = np.array([c.feature_importances_
                                for c in orc.estimators_]).T
    else:
        # Only two classes
        clf.fit(df_X.values, y)
        importances = np.array([clf.feature_importances_,
                                clf.feature_importances_]).T

    for i, c in enumerate(le.classes_):
        # y holds the encoded labels, so compare against the encoded value i
        # rather than the original class label c
        diff = df_X.loc[y == i].quantile(q=0.75) - df_X.loc[y != i].quantile(q=0.75)
        sign = (diff >= 0) * 2 - 1
        importances[:, i] *= sign

    # create output DataFrame
    self.act_ = pd.DataFrame(importances,
                             columns=le.inverse_transform(range(len(le.classes_))),
                             index=df_X.columns)
def benchmark(clf_current):
    print('_' * 80)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)
    t0 = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    if hasattr(clf_current, "decision_function"):
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k=5))
    else:
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(),
                                 probs_to_preds(probsmatrix, k=5))
    test_time = time() - t0
    print("f1-score: %0.7f" % score)
    print("test time: %0.3fs" % test_time)
    print('_' * 80)
    return clf_descr, score, train_time, test_time
def one_vs_all(X, y, test_size=0.2, run_num=100, svm_type='linear'):
    """Trains 15 one-vs-all SVM classifiers of the specified type."""
    # scikit-learn has a wonderful wrapper class that creates 1-vs-all classifiers!
    # NOTE: the original compared the builtin `type` instead of `svm_type`,
    # which was always False, so the linear branch was never taken.
    if svm_type == 'linear':
        estimator = LinearSVC()
    else:
        # This will automatically use RBF functions
        estimator = SVC()
    ovr = OneVsRestClassifier(estimator=estimator)
    acc_tr = []
    acc_tst = []
    for i in range(run_num):
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=test_size)
        # Train the classifier
        ovr.fit(X_train, y_train.ravel())
        # Work out the score on the training data. There is nothing to
        # optimise for here - we are just getting an idea of the accuracy on
        # training vs test data. Box plot opportunity!
        tr_acc = ovr.score(X_train, y_train.ravel())
        tst_acc = ovr.score(X_test, y_test.ravel())
        acc_tr.append(tr_acc)
        acc_tst.append(tst_acc)
    # Not all the data is used here, as that tends to overtrain the classifier.
    return ovr, acc_tr, acc_tst
class ClassDistanceMapper(TransformerMixin):
    """
    Fit a OneVsRestClassifier for each sentiment class (against all others
    combined) and return the distances from the decision boundary for each
    class. Hence, this transformation can be seen as a dimensionality
    reduction from #words to #sentiment_classes (=5).
    """

    def __init__(self):
        """
        Initialize a one-vs-rest multiclass classifier with a
        LogisticRegression estimator. The choice of estimator is arbitrary;
        any other classifier with a decision function might work as well.
        """
        self.clf = OneVsRestClassifier(LogisticRegression())

    def fit(self, X, y):
        """ Fit the multiclass classifier. """
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """
        Return the distance of each sample from the decision boundary for
        each class.
        """
        return self.clf.decision_function(X)
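# A minimal usage sketch for ClassDistanceMapper inside a scikit-learn
# Pipeline. The vectorizer choice and the toy data are illustrative
# assumptions, not from the original source.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

texts = ["terrible service", "great food", "okay experience",
         "awful taste", "wonderful staff"]
ratings = [1, 5, 3, 1, 5]
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("distances", ClassDistanceMapper()),
])
# fit_transform comes from TransformerMixin; the output has one
# decision-boundary distance per class for every document
distances = pipe.fit_transform(texts, ratings)
print(distances.shape)  # (5, n_classes)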
def fit_multiclass_svm1(documents, idfs):
    model = gensim.models.doc2vec.Doc2Vec.load("train_doc2vec.model")
    X = np.zeros([4000, 300])
    X_test = np.zeros([490, 300])
    y = np.zeros(4000)
    y_test = np.zeros(490)
    i = 0
    for doc in documents[:4000]:
        x = np.zeros(300)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X[i, :] = x / count
        y[i] = doc["topic_id"]
        i = i + 1
    svm_model = OneVsRestClassifier(svm.SVC(kernel='poly', gamma=2)).fit(X, y)
    i = 0
    for doc in documents[4000:4490]:
        x = np.zeros(300)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X_test[i, :] = x / count
        y_test[i] = doc["topic_id"]
        i = i + 1
    print svm_model.score(X_test, y_test)
def make_classifier():
    test_size = 0
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='auto',
                                  probability=True))
    clf.fit(X_train, y_train)
    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0
        n = 0
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            y_s[np.argmax(y_s)] = 0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]
            print 'Suggest: ' + s1 + ' or ' + s2 + ' Real: ' + t
            n = n + 1
            if s1 == t:
                nn = nn + 1
        if n > 0:
            print 'Accuracy is ' + str(float(nn) / n)
    except ValueError:
        pass
    # print classification_report(clf.predict(X_test), y_test)
    pickle.dump(clf, open("classifier.bin", "wb"))
def train_linear(X, Y, splits, model_config, results_dir, best_k=10,
                 validation_score='f1', threshold_score='f1',
                 threshold_criterion='zack', fn_prefix='', label_idx=None):
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None
    for C in np.logspace(-3, 3, num=20):
        # the model trained here is logistic regression, not ridge regression
        sys.stdout.write('Training Logistic Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            sys.stdout.write('training interrupted...')
            break
        except:
            raise
        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:, label_idx],
                                         Yp[:, label_idx], k=best_k,
                                         threshold_score=threshold_score,
                                         criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score,
                                               perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')
    model_config['C'] = best_C
    cPickle.dump(best_model,
                 open(os.path.join(results_dir, fn_prefix + '-model.pkl'), 'wb'))
    return best_model, model_config
def ml_train(datasetFilePath, falsePredictionsFilePath,
             unknownPredictionsFilePath, confusionMatricesDir,
             classifierFilePath):
    logger.info("start of training and testing phase")
    classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True),
                                     n_jobs=NUMBER_OF_CPUS_TO_USE)
    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)
    # limited_dataset = limit_dataset(dataset)
    limited_dataset = dataset
    ml_dataset = split_dataset(limited_dataset, len(features_names))
    logger.info("fitting training set X_train - %s, y_train - %s" %
                (ml_dataset.X_train.shape, ml_dataset.y_train.shape))
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)
    logger.info("predicting test set X_test - %s, y_test - %s" %
                (ml_dataset.X_test.shape, ml_dataset.y_test.shape))
    y_pred = classifier.predict(ml_dataset.X_test)
    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)
    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = \
        process_prediction_vector(ml_dataset.y_test, y_pred,
                                  y_pred_probabilities)
    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls,
               y_pred_fictive, list(classifier.classes_) + ["unknown"])
    plot_confusion_matrices(ml_dataset.y_test, y_pred,
                            list(classifier.classes_) + ["unknown"],
                            confusionMatricesDir, "1")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_with_unknown_cls,
                            list(classifier.classes_) + ["unknown"],
                            confusionMatricesDir, "2")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_fictive,
                            list(classifier.classes_) + ["unknown"],
                            confusionMatricesDir, "3")
    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs,
                   ml_dataset.test_terms_name, falsePredictionsFilePath,
                   unknownPredictionsFilePath)
    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)
    logger.info("end of training and testing phase")
def main():
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')
    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])
    mapping = {'favor': 1, 'none': 0, 'against': 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)):len(X)]
    y_test = Y[int(0.7 * len(Y)):len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
def compute_ranking(learnFullModel=False):
    path = '/home/arya/PubMed/GEO/Datasets/'
    modelpath = path + 'libsvm/model/'
    if not os.path.exists(modelpath):
        os.makedirs(modelpath)
    outpath = '{}libsvm/out/'.format(path)
    sys.stdout = open('{}SVM.log'.format('/home/arya/PubMed/GEO/Log/'), 'w')
    sys.stderr = open('{}SVM.err'.format('/home/arya/PubMed/GEO/Log/'), 'w')
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    X, Y = load_svmlight_file(path + 'Corpus.libsvm', multilabel=True)
    Y = np.array(Y)
    if learnFullModel:
        model = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
        joblib.dump(model, modelpath + 'Model.libsvm')
        print 'The Full Model is Saved!'
    Folds = pd.read_pickle(path + 'Folds.df')
    for fold in range(Folds.shape[1]):
        start = time()
        Xtr, Ytr = X[Folds[fold].values, :], Y[Folds[fold].values]
        print 'learning on fold...', Xtr.shape, fold,
        sys.stdout.flush()
        model = OneVsRestClassifier(LinearSVC(random_state=0)).fit(Xtr, Ytr)
        Xte = X[~Folds[fold].values, :]
        labels = model.classes_
        # Yte = remove_unknown_classes(Yte, labels)
        # idx = np.array(map(lambda x: len(x) > 0, Yte))
        # Yte = np.array(Yte)[idx]
        # Xte = Xte[idx]
        print 'predicting...', Xte.shape,
        sys.stdout.flush()
        pd.DataFrame(columns=labels,
                     data=model.decision_function(Xte)).to_pickle(
                         '{}deci.{}.df'.format(outpath, fold))
        # (pd.DataFrame(columns=labels, data=MultiLabelBinarizer().fit_transform(list(Yte) + [labels]))).iloc[:-1].to_pickle('{}labels.{}.df'.format(outpath, fold))
        # ranking.to_pickle('{}ranking.{}.df'.format(outpath, fold))
        print 'Done in {:.0f} minutes'.format((time() - start) / 60.0)
def runDigitsDensity(n, _i, j):
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j] + '_sci').compute
    # skclf = KernelDensity(metric=ma)
    myclf = hw7u.MyKNN(metric=metric[j], density=True)
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    # skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    # print 'scikit predict'
    # sk_pred = skclf.predict(X_test)
    # print sk_pred
    print y_test
    print y_pred
    # print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
    print 'My Accuracy: {}'.format(myacc)
def runDigits(n, skclf, myclf):
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    print 'scikit predict'
    sk_pred = skclf.predict(X_test)
    print sk_pred
    print y_test
    print y_pred
    print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
def multiclass_AUC(clf, X, Y):
    # Binarize the output
    X, Y = np.array(X), np.array(Y)
    Y = label_binarize(Y, classes=list(set(Y)))
    n_classes = Y.shape[1]

    # shuffle and split training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                        random_state=0)

    # Learn to predict each class against the others
    classifier = OneVsRestClassifier(clf)
    Y_score = classifier.fit(X_train, Y_train).predict(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    print "AUC for multiclass {}: {}".format(clf.__class__.__name__, roc_auc["micro"])
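# A minimal call sketch for multiclass_AUC, using sklearn's iris data as an
# illustrative stand-in for the caller's own X and Y (note the function scores
# hard 0/1 predictions, so the resulting ROC is coarse):
from sklearn.datasets import load_iris

iris = load_iris()
multiclass_AUC(LinearSVC(random_state=0), iris.data, iris.target)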
def prepare_multiclass_clf(X, y):
    # liblinear is specified here because the lbfgs solver that newer
    # scikit-learn versions default to does not support the l1 penalty
    clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
                       {'C': np.logspace(-4, 2, 10)},
                       scoring='accuracy', cv=5)
    multi_clf = OneVsRestClassifier(clf)
    multi_clf.fit(X, y)
    return multi_clf
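# A minimal call sketch for prepare_multiclass_clf on synthetic data (the
# make_classification setup is an illustrative assumption). One GridSearchCV
# is fitted per class, so each one-vs-rest problem gets its own C.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=20,
                                     n_informative=6, n_classes=3,
                                     random_state=0)
multi_clf = prepare_multiclass_clf(X_demo, y_demo)
print(multi_clf.predict(X_demo[:5]))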
def fit_multiclass_svm(documents, idfs):
    model = gensim.models.Word2Vec.load("train_word2vec.model")
    dim = 50
    X = np.zeros([4000, dim])
    X_test = np.zeros([490, dim])
    y = np.zeros(4000)
    y_test = np.zeros(490)
    i = 0
    for doc in documents[:4000]:
        x = np.zeros(dim)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X[i, :] = x / count
        y[i] = doc["topic_id"]
        i = i + 1
    svm_model = OneVsRestClassifier(LinearSVC(random_state=0, C=1)).fit(X, y)
    i = 0
    for doc in documents[4000:4490]:
        x = np.zeros(dim)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X_test[i, :] = x / count
        y_test[i] = doc["topic_id"]
        i = i + 1
    print svm_model.score(X_test, y_test)
def model(model, OvsR=False):
    if OvsR:
        model = OneVsRestClassifier(model, n_jobs=-1)
    return model
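# Minimal usage sketch for the model() helper above (the estimator choice is
# an illustrative assumption):
from sklearn.linear_model import LogisticRegression

clf = model(LogisticRegression(max_iter=1000), OvsR=True)
# clf is now a OneVsRestClassifier wrapping the logistic regression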
# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

# clf = OneVsRestClassifier(LogisticRegression())
# clf1 = OneVsRestClassifier(LogisticRegression())
# model_fit_all_features = clf1.fit(X_train, Y_train)
# predictions_all_features = clf1.predict(X_test)
# score_all_features = accuracy_score(Y_test, predictions_all_features)
# print(score_all_features)
# model_fit_top_features = clf.fit(scaled_features_train_df, Y_train)
# predictions_top_features = clf.predict(X_test)
# score_top_features = accuracy_score(Y_test, predictions_top_features)

clf = OneVsRestClassifier(LogisticRegression())
clf1 = OneVsRestClassifier(LogisticRegression())

model_fit_all_features = clf1.fit(X_train, Y_train)
predictions_all_features = model_fit_all_features.predict(X_test)
score_all_features = accuracy_score(Y_test, predictions_all_features)
print(score_all_features)

model_fit_top_features = clf.fit(scaled_features_train_df[top_k_predictors], Y_train)
predictions_top_features = model_fit_top_features.predict(
    scaled_features_train_df[top_k_predictors])
y = data['grp']
X = data.iloc[:, 1:31]

# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=0))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
ova = classifier.fit(X_train, y_train)
y_pred = ova.predict(X_test)

# Compute ROC curve and ROC area for each class, using the continuous
# decision scores (y_score) rather than the thresholded predictions
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
# TODO: look up what these measures mean
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    k = 10
    scores = cross_val_score(modelo, treino_dados, treino_marcacoes, cv=k)
    taxa_de_acerto = np.mean(scores)

    msg = "Hit rate of {0}: {1}".format(nome, taxa_de_acerto)
    print(msg)
    return taxa_de_acerto

resultados = {}

from sklearn.multiclass import OneVsRestClassifier
# Algorithm that runs under the hood of OneVsRestClassifier
from sklearn.svm import LinearSVC

# random_state makes the run deterministic rather than random
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict("OneVsRestClassifier", modeloOneVsRest,
                                     treino_dados, treino_marcacoes)
# Add to the results dictionary
resultados[resultadoOneVsRest] = modeloOneVsRest

from sklearn.multiclass import OneVsOneClassifier

modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict("OneVsOne", modeloOneVsOne,
                                    treino_dados, treino_marcacoes)
resultados[resultadoOneVsOne] = modeloOneVsOne

from sklearn.naive_bayes import MultinomialNB

modeloMultinomial = MultinomialNB()
    elif m == 'MCB':
        pool_classifiers = RandomForestClassifier(n_estimators=10)
        pool_classifiers.fit(Feature_train, Label_train.ravel())
        mcb = MCB(pool_classifiers)
        mcb.fit(Feature_train, Label_train.ravel())
        Label_predict = mcb.predict(Feature_test)
    elif m == 'DES-MI':
        pool_classifiers = RandomForestClassifier(n_estimators=10)
        pool_classifiers.fit(Feature_train, Label_train.ravel())
        dmi = DESMI(pool_classifiers)
        dmi.fit(Feature_train, Label_train.ravel())
        Label_predict = dmi.predict(Feature_test)
    elif m == 'One_vs_Rest-SMOTE-XGBoost':
        sm = SMOTE()
        Feature_train_o, Label_train_o = sm.fit_sample(Feature_train,
                                                       Label_train.ravel())
        clf = OneVsRestClassifier(xgboost.XGBClassifier(**BayesOp_Parameters))
        clf.fit(Feature_train_o, Label_train_o)
        Label_predict = clf.predict(Feature_test)
    elif m == 'One_vs_Rest-XGBoost':
        clf = OneVsRestClassifier(xgboost.XGBClassifier(**BayesOp_Parameters))
        clf.fit(Feature_train, Label_train.ravel())
        Label_predict = clf.predict(Feature_test)
    ml_record.measure(i, Label_test, Label_predict, 'weighted')
    i += 1

file_wirte = "Result_One_vs_All_BayOp_XGBoost_G_mean_GA_99_pop.txt"
ml_record.output(file_wirte, m, Dir)
    def run_data(self, flag, model_name):
        if flag == 'orig':
            print('\n(¯`·._.·(¯`·._.· Evaluation on original data ·._.·´¯)·._.·´¯)\n')
            y = self.y
            y_pca = y
        elif flag == 'full_anno':
            print('\n(¯`·._.·(¯`·._.· Evaluation on full annotations ·._.·´¯)·._.·´¯)\n')
            print(self.filt_txt)
            y = self.y_all_anno_max
            y_pca = np.ravel(self.oe.inverse_transform(y))
        elif flag == 'filt':
            print('\n(¯`·._.·(¯`·._.· Evaluation on filtered data ·._.·´¯)·._.·´¯)\n')
            print(self.filt_txt)
            y = self.y_filt_anno_max
            y_pca = np.ravel(self.oe.inverse_transform(y))

        if self.pca_plot:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(self.X[:, 0], self.X[:, 1], self.X[:, 2], c=y_pca)
            plt.show()

        if model_name == 'svm':
            # support vector classifier
            if self.multi_label is False:
                if flag != 'orig':
                    y = np.ravel(self.oe.inverse_transform(y))
                kfold = model_selection.StratifiedKFold(n_splits=5,
                                                        shuffle=True,
                                                        random_state=self.seed)
                clf_cv = svm.SVC(C=self.C, gamma=self.gamma,
                                 random_state=self.seed)
            else:
                # if multilabel
                if flag != 'orig':
                    kfold = IterativeStratification(n_splits=5, order=1,
                                                    random_state=self.seed)
                else:
                    # kfold = model_selection.KFold(n_splits=5, random_state=self.seed)
                    kfold = model_selection.StratifiedKFold(
                        n_splits=5, random_state=self.seed)
                clf_cv = OneVsRestClassifier(
                    svm.SVC(C=self.C, gamma=self.gamma,
                            random_state=self.seed))
            # clf_cv = svm.SVC(C=self.C, gamma=self.gamma, random_state=1987)
            # best_params = self.svc_param_selection(clf_cv, self.X, self.y, kfold)
            # print('Best params:', best_params)
            pre = model_selection.cross_val_score(clf_cv, self.X, y, cv=kfold,
                                                  scoring='precision_macro')
            rec = model_selection.cross_val_score(clf_cv, self.X, y, cv=kfold,
                                                  scoring='recall_macro')
            fsc = model_selection.cross_val_score(clf_cv, self.X, y, cv=kfold,
                                                  scoring='f1_macro')
            # score, perm_sc, pvalue = model_selection.permutation_test_score(
            #     clf_cv, self.X, self.y, cv=kfold, scoring='f1_macro',
            #     n_permutations=100, n_jobs=-1)
            print('5-Fold CV Precision: {} , STD: {}'.format(pre.mean(), pre.std()))
            print('5-Fold CV Recall: {} , STD: {}'.format(rec.mean(), rec.std()))
            print('5-Fold CV F1-Score: {} , STD: {}'.format(fsc.mean(), fsc.std()))
            # print('Classification score {} (pvalue : {})'.format(score, pvalue))
            # X_train, X_test, y_train, y_test = model_selection.train_test_split(
            #     self.X, y, test_size=0.5, random_state=1987)
            # clf_cv.fit(X_train, y_train)
            # y_pred = clf_cv.predict(X_test)
            # report = classification_report(y_test, y_pred)
            # print(report)
            return pre, rec, fsc
        elif model_name == 'gmm':
            # gaussian mixture model
            clf_cv = mixture.GaussianMixture(n_components=len(np.unique(y)),
                                             covariance_type='full',
                                             n_init=10, random_state=1987)
            # clf_cv = mixture.BayesianGaussianMixture(
            #     n_components=len(np.unique(self.y)), covariance_type='full',
            #     n_init=10, random_state=1987)
            clf_cv.fit(self.X)
            y_pred = clf_cv.predict(self.X)
            # pdb.set_trace()
            return y, y_pred
class SKModel(object):
    '''
    This class facilitates training, testing, storage, and deployment of
    scikit-learn models.
    '''

    def __init__(self, estimator=None, encoder=None):
        self.estimator = estimator
        self.id = None
        self.deployed = False
        self.call_count = 0
        self.last_call = None
        self.recommendation_threshold = 0.0
        self.train_results = None
        self.test_results = None
        self.dependent = None
        self.independent = None
        self.model = None
        self.encoder = encoder
        self.tpr = None
        self.fpr = None
        self.roc_auc = None
        self.model_path = None
        self.encoder_path = None
        self.encoder_type = None
        self.train_timestamp = None
        self.train_time = None
        self.train_data_balance = None
        self.test_timestamp = None
        self.test_time = None

    def train(self, data):
        if self.estimator is None:
            logging.warning('Model estimator not yet specified. Please define or load an estimator.', UserWarning)
        self.model = OneVsRestClassifier(self.estimator).fit(data.X_train, data.y_train)
        self.dependent = data.dependent
        independent_vars = []
        for i in data.independent:
            independent_vars.append({"name": i})
        self.independent = independent_vars
        train_results, timestamp, train_time, train_data_balance = Models()._train(
            self.model, data.X_train, data.y_train,
            balance=data.balance, encoder=self.encoder)
        self.train_results = train_results
        self.train_timestamp = timestamp
        self.train_time = train_time
        self.train_data_balance = train_data_balance

    def test(self, data):
        if self.model is None:
            logging.warning('Model not yet specified. Please train or load a model.', UserWarning)
        test_results, timestamp, test_time = Models()._test(
            self.model, data.X_test, data.y_test)
        self.test_results = test_results
        self.test_timestamp = timestamp
        self.test_time = test_time

    def predict(self, X):
        if self.model is None:
            logging.warning('Model not yet specified. Please train or load a model.', UserWarning)
        y_pred = self.model.predict(X)
        return y_pred

    def predict_proba(self, X):
        if self.model is None:
            logging.warning('Model not yet specified. Please train or load a model.', UserWarning)
        Y_pred_proba = self.model.predict_proba(X)
        return Y_pred_proba

    def store(self, model_path, server_config, encoder_path=None,
              encoder=None, override=False):
        Models(server_config=server_config)._store(
            model=self, model_path=model_path, encoder_path=encoder_path,
            encoder=encoder, override=override)
        logging.info('Model stored successfully.')

    def load_model(self, model_id, server_config):
        models_connection = Models(server_config=server_config)
        model_info = models_connection._get_info(model_id)
        self.id = model_info['models']['id']
        self.deployed = model_info['models']['deployed']
        self.call_count = model_info['models']['callCount']
        self.last_call = model_info['models']['lastCall']
        self.recommendation_threshold = model_info['models']['recommendationThreshold']
        self.train_results = {
            'accuracy': model_info['models']['trainAccuracy'],
            'recall': model_info['models']['trainPrecision'],
            'precision': model_info['models']['trainPrecision'],
            'f1': model_info['models']['trainF1']
        }
        self.test_results = {
            'accuracy': model_info['models']['testAccuracy'],
            'recall': model_info['models']['testPrecision'],
            'precision': model_info['models']['testPrecision'],
            'f1': model_info['models']['testF1']
        }
        self.dependent = model_info['models']['dependent']
        self.independent = model_info['models']['independent']
        self.model_path = model_info['models']['modelPath']
        self.encoder_path = model_info['models']['encoderPath']
        self.encoder_type = model_info['models']['encoderType']
        self.train_timestamp = model_info['models']['lastTrainedDate']
        self.train_time = model_info['models']['trainTime']
        self.train_data_balance = model_info['models']['trainDataBalance']
        self.test_timestamp = model_info['models']['lastTestedDate']
        self.test_time = model_info['models']['testTime']
        model = models_connection._load_from_bucket(model_info['models']['modelPath'])
        self.model = model
        self.estimator = self.model.estimator
        if self.encoder_path is not None:
            self.encoder = models_connection._load_from_bucket(self.encoder_path)
        return self

    @staticmethod
    def load_generators(model_id, server_config):
        models_connection = Models(server_config=server_config)
        model_info = models_connection._get_info(model_id)
        generators = []
        for i in model_info['models']['independent']:
            if isinstance(i, dict):
                if 'generator_path' in i.keys():
                    func = models_connection._load_from_bucket(i['generator_path'])
                    generators.append(func)
        return generators

    def delete_model(self, model_id, server_config):
        models_connection = Models(server_config=server_config)
        model_info = models_connection._get_info(model_id)
        # delete the modelPath and encoderPath objects in S3
        models_connection._delete_from_bucket(model_info['models']['modelPath'])
        models_connection._delete_from_bucket(model_info['models']['encoderPath'])
        # delete generators
        self.delete_generators(model_id, server_config)
        # then delete from Elasticsearch
        models_connection._delete_from_index(model_id)

    @staticmethod
    def delete_generators(model_id, server_config):
        models_connection = Models(server_config=server_config)
        model_info = models_connection._get_info(model_id)
        all_models = models_connection.index.get()[0]
        generator_paths = []
        to_delete = []
        for i in model_info['models']['independent']:
            if isinstance(i, dict):
                if 'generator_path' in i.keys():
                    generator_paths.append(i['generator_path'])
                    to_delete.append(i['generator_path'])
        for m in all_models['models']:
            if m['_id'] != model_info['models']['id']:
                for i in m['_source']['independent']:
                    if isinstance(i, dict):
                        if 'generator_path' in i.keys():
                            if i['generator_path'] in generator_paths and \
                                    i['generator_path'] in to_delete:
                                to_delete.remove(i['generator_path'])
                                logging.info(i['generator_path'] +
                                             ' shared with another model, skipping delete.')
        for i in to_delete:
            models_connection._delete_from_bucket(i)

    def deploy(self, server_config, deploy=False, recommendation_threshold=0.0):
        if not self.model_path or not self.encoder_path:
            logging.warning('Must store model and encoder prior to deployment.', UserWarning)
        else:
            self.deployed = deploy
            self.recommendation_threshold = recommendation_threshold
            Models(server_config=server_config)._deploy(model=self)
            logging.info('Model deployed successfully and will be available '
                         'after the next server restart.')

    def tag_generator(self, func, output_var, input_vars, generator_path=None):
        if generator_path:
            generator_path = generator_path
        else:
            generator_path = func.__name__ + '.pickle'
        if not isinstance(self.independent, list):
            logging.warning('Independent variables not defined as a list.', UserWarning)
            sys.exit()
        for i in self.independent:
            if i['name'] == output_var:
                i['inputs'] = input_vars
                i['generator_path'] = generator_path
        # create tmp directory if not present
        if not os.path.exists('tmp/'):
            os.makedirs('tmp/')
            logging.info('Created directory tmp to tag generator.')
        with open('tmp/' + generator_path, 'wb') as g:
            dill.dump(func, g, protocol=dill.HIGHEST_PROTOCOL)
        logging.info('Generator tagged successfully.')
accuracy = accuracy_score(y_val, predicted)
f1_score_macro = f1_score(y_val, predicted, average='macro')
f1_score_micro = f1_score(y_val, predicted, average='micro')
f1_score_weighted = f1_score(y_val, predicted, average='weighted')
print("accuracy:", accuracy)
print("f1_score_macro:", f1_score_macro)
print("f1_score_micro:", f1_score_micro)
print("f1_score_weighted:", f1_score_weighted)

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",
                        category=sklearn.exceptions.UndefinedMetricWarning)

# TF-IDF + Naive Bayes model ------------------------------------------------
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                              token_pattern=r'(\S+)')),
    ('clf', OneVsRestClassifier(MultinomialNB())),
])
NB_pipeline.fit(X_train, y_train)
prob = NB_pipeline.predict_proba(X_val)
predicted = NB_pipeline.predict(X_val)
print_evaluation_scores(y_val, predicted)

# TF-IDF + logistic regression ----------------------------------------------
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                              token_pattern=r'(\S+)')),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs',
                                                   max_iter=10000),
                                n_jobs=1)),
])
LogReg_pipeline.fit(X_train, y_train)
validation_result = algorithm.predict(X_validation)
matches = validation_result == Y_validation
total_matches = sum(matches)
total_elements_test = len(Y_validation)

print "Total number of validation elements:", total_elements_test
print "Hit rate of the best algorithm on the validation elements: {0} %".format(
    round(100.0 * total_matches / total_elements_test, 2))

multinomialModel = MultinomialNB()
adaBoostModel = AdaBoostClassifier()
oneVsRestModel = OneVsRestClassifier(LinearSVC(random_state=0))
oneVsOneModel = OneVsOneClassifier(LinearSVC(random_state=0))

total_matches_multinomial = fit_and_predict(multinomialModel, "MultinomialNB",
                                            X_trainning, Y_trainning)
total_matches_adaboost = fit_and_predict(adaBoostModel, "AdaBoost",
                                         X_trainning, Y_trainning)
total_matches_one_vs_rest = fit_and_predict(oneVsRestModel, "OneVsRest",
                                            X_trainning, Y_trainning)
total_matches_one_vs_one = fit_and_predict(oneVsOneModel, "OneVsOne",
                                           X_trainning, Y_trainning)

print "Total number of elements analysed in the test: {0}".format(len(Y_trainning))
# accuracy with tfidf vectorizer
acc_tfidf_nb = accuracy_score(nb_2.predict(X_test_tfidf), Y_test)

# display accuracies
print(acc_count_nb)
print(acc_tfidf_nb)

# Code ends here


# --------------
import warnings
warnings.filterwarnings('ignore')

# initialize logistic regression
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

# fit on count vectorizer training data
logreg_1.fit(X_train_count, Y_train)
# fit on tfidf vectorizer training data
logreg_2.fit(X_train_tfidf, Y_train)

# accuracy with count vectorizer
acc_count_logreg = accuracy_score(logreg_1.predict(X_test_count), Y_test)
# accuracy with tfidf vectorizer
acc_tfidf_logreg = accuracy_score(logreg_2.predict(X_test_tfidf), Y_test)

# display accuracies
print(acc_count_logreg)
def score(emb, startfrom0=False, topk=False):
    # 0. Files
    # embeddings_file = "blogcatalog.embeddings"
    list_of_files = glob.glob('../emb/kaggle/*.emb')
    matfile = mat_file
    embeddings_file = emb_file

    # 2. Load labels
    mat = loadmat(matfile)
    A = mat['network']
    graph = sparse2graph(A)
    labels_matrix = mat['group']
    if startfrom0:
        index_align = 0
    else:
        index_align = 1

    features_matrix_array = []
    dw_features_matrix_array = {}
    cf_features_matrix_array = {}
    cfi_features_matrix_array = {}
    if all_file:
        for f in list_of_files:
            embed = numpy.loadtxt(f, skiprows=1)
            features_matrix = numpy.asarray([
                embed[numpy.where(embed[:, 0] == node + index_align), 1:][0, 0]
                for node in range(len(graph))
            ])
            features_matrix = numpy.reshape(
                features_matrix,
                [features_matrix.shape[0], features_matrix.shape[-1]])
            if os.path.basename(os.path.splitext(f)[0]).split('_')[-1] == 'cfi':
                cfi_features_matrix_array['cfi'] = features_matrix
            elif os.path.basename(os.path.splitext(f)[0]).split('_')[-1] == 'cf':
                cf_features_matrix_array['cf'] = features_matrix
            else:
                nw = int(os.path.basename(os.path.splitext(f)[0]).split('_')[-1])
                dw_features_matrix_array[nw] = features_matrix
        features_matrix_array.append(dw_features_matrix_array)
        features_matrix_array.append(cf_features_matrix_array)
        features_matrix_array.append(cfi_features_matrix_array)
    else:
        if emb is None:
            # 1. Load Embeddings
            embed = numpy.loadtxt(embeddings_file, skiprows=1)
            features_matrix = numpy.asarray([
                embed[numpy.where(embed[:, 0] == node + index_align), 1:][0, 0]
                for node in range(len(graph))
            ])
            features_matrix = numpy.reshape(
                features_matrix,
                [features_matrix.shape[0], features_matrix.shape[-1]])
        else:
            features_matrix = emb
        features_matrix_array.append(features_matrix)

    res = []
    training_percents = [0.3, 0.5, 0.9]
    # uncomment for all training percents
    # training_percents = numpy.asarray(range(1, 10)) * .1
    for emb in features_matrix_array:
        score_array = {}
        for key in emb.keys():
            emb_buf = emb[key]
            # 3. to score each train/test group
            all_results = defaultdict(list)
            # 2. Shuffle, to create train/test groups
            shuffles = []
            number_shuffles = 2
            for x in range(number_shuffles):
                shuffles.append(skshuffle(emb_buf, labels_matrix))
            for train_percent in training_percents:
                for shuf in shuffles:
                    X, y = shuf
                    training_size = int(train_percent * X.shape[0])
                    X_train = X[:training_size, :]
                    y_train_ = y[:training_size]
                    y_train = [[] for x in xrange(y_train_.shape[0])]
                    cy = y_train_.tocoo()
                    for i, j in izip(cy.row, cy.col):
                        y_train[i].append(j)
                    # mlb = MultiLabelBinarizer()
                    # y_train_onehot = mlb.fit_transform(y_train)
                    y_train_onehot = label2onehot(
                        y_train, labels_matrix.toarray().shape[1])
                    # assert sum(len(l) for l in y_train) == y_train_.nnz
                    X_test = X[training_size:, :]
                    y_test_ = y[training_size:]
                    y_test = [[] for x in xrange(y_test_.shape[0])]
                    cy = y_test_.tocoo()
                    for i, j in izip(cy.row, cy.col):
                        y_test[i].append(j)
                    # y_test_onehot = mlb.fit_transform(y_test)
                    y_test_onehot = label2onehot(
                        y_test, labels_matrix.toarray().shape[1])
                    if topk:
                        clf = TopKRanker(LogisticRegression(max_iter=500))
                    else:
                        clf = OneVsRestClassifier(
                            LogisticRegression(max_iter=500))
                    clf.fit(X_train, y_train_onehot)
                    if topk:
                        # find out how many labels should be predicted
                        top_k_list = [len(l) for l in y_test]
                        preds = clf.predict(X_test, top_k_list)
                        preds = label2onehot(preds,
                                             labels_matrix.toarray().shape[1])
                    else:
                        preds = clf.predict(X_test)
                    results = {}
                    averages = ["micro", "macro", "samples", "weighted"]
                    for average in averages:
                        results[average] = f1_score(y_test_onehot, preds,
                                                    average=average)
                    all_results[train_percent].append(results)
            print 'Results, using embeddings of dimensionality', X.shape[1]
            print '-------------------'
            for train_percent in sorted(all_results.keys()):
                print 'Train percent:', train_percent
                for x in all_results[train_percent]:
                    print x
                print '-------------------'
            score_array[key] = all_results
        res.append(score_array)

    dw_res, cf_res, cfi_res = res[0], res[1], res[2]
    averages = ["micro", "macro", "samples", "weighted"]
    percent = [0.3, 0.5, 0.9]
    for average in averages:
        for p in percent:
            plt.figure()
            y_value_dw = [dw_res[k][p][0][average] for k in sorted(dw_res.keys())]
            y_value_cf = [cf_res['cf'][p][0][average] for k in sorted(dw_res.keys())]
            y_value_cfi = [cfi_res['cfi'][p][0][average] for k in sorted(dw_res.keys())]
            plt.plot(y_value_dw, 'bo-')
            plt.plot(y_value_cf, 'ro-')
            plt.plot(y_value_cfi, 'go-')
            plt.grid(True)
            plt.xlabel('number of walks at 10, 20, 50, 100')
            plt.ylabel('score')
            plt.title('percentage: %f, metric: %s' % (p, average))
            plt.savefig("p%.1f_%s.png" % (p, average))
def train_classifiers(dataset, dataset_fp, subset_size, n_grams, seeds, test_size): overall_start_time = time.time() if dataset == 'bnc_rb': # Read raw data raw_data = pd.read_csv('data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv') # prepocess data data = preprocess_df(df=raw_data, data='bnc_rb') # change column names so everything works later data.rename(columns={"clean_text": "clean_data", "age_cat": "labels"}, inplace=True) elif dataset == 'bnc': raw_data = pd.read_csv('data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv') # prepocess data data = preprocess_df(df=raw_data, data='bnc') # change column names so everything works later data.rename(columns={"clean_text": "clean_data", "age_cat": "labels"}, inplace=True) elif dataset == 'blog': raw_data = pd.read_csv('data/blogs_kaggle/blogtext.csv') # prepocess data data = preprocess_df(df=raw_data, data='blog') # change column names so everything works later data.rename(columns={"clean_text": "clean_data", "age_cat": "labels"}, inplace=True) # preproc_file = Path("./data/blogs_kaggle/blogger_preprocessed_data_FAKE.csv") # # Pre-process raw data if pre-processed data doesn't exist # try: # preproc_abs_path = preproc_file.resolve(strict=True) # except FileNotFoundError: # # doesn't exist # # # Read and load dataset # print("Reading raw data...") # data = pd.read_csv("./data/blogs_kaggle/blogtext.csv") # print("Done reading raw data.") # # # # Subsetting data # # perc_df = 0.00020 # fraction of dataset to take # # sub_sample = math.ceil(perc_df * data.shape[0]) # # if subset_size != -1: # # Chosen to train and test model(s) on subset of size subset_size # # #shuffle data set before subsampling # data = data.sample(frac=1).reset_index(drop=True) # data = data[:subset_size] # # print(f"Dataset size before preprocessing: {data.shape[0]}") # # print("Preprocessing data...") # # Removing all unwanted text/characters from data['text'] column # # Remove all non-alphabetical characters # data['clean_data'] = data['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ', x)) # # # Make all letters lower case # data['clean_data'] = data['clean_data'].apply(lambda x: x.lower()) # # # Remove white space from beginning and end of string # data['clean_data'] = data['clean_data'].apply(lambda x: x.strip()) # # # Remove instances empty strings # before_rm_empty = len(data) # data.drop(data[data.clean_data == ''].index, inplace = True) # # print(f'{before_rm_empty - len(data)} empty string instances removed.') # # # Remove texts that are probably not English by filtering blogs that dont contain at least one of the top 50 most used English words # # create dict with most common English words # top_en_words = {} # with open('./data/wordlists/top1000english.txt') as f: # count = 1 # for line in f: # key = line.split()[0].lower() # top_en_words[key] = count # count += 1 # # # Stop at top 50 words. Idea taken from DialoGPT paper. 
# if count > 50: # break # # # data['top_50_en'] = data['clean_data'].apply(lambda x : True if not set(x.split()).isdisjoint(top_en_words) else False) # # def top_lang_detect(text): # # detected_langs = detect_langs(text) # # return detected_langs[0].lang # # # def top_prob_detect(text): # # detected_langs = detect_langs(text) # # return detected_langs[0].prob # # start_time = time.time() # data['top_lang'] = data['clean_data'].apply(top_lang_detect) # print(f"Top lang detection took {time.time() - start_time} seconds") # start_time = time.time() # data['top_prob'] = data['clean_data'].apply(top_prob_detect) # print(f"Top lang prob lang detection took {time.time() - start_time} seconds") # # # Remove rows without one of top50 most common english words # before_top50_removal = len(data) # data.drop(data[data['top_50_en'] == False].index, inplace = True) # print(f"{before_top50_removal - len(data)} instances dropped") # # before_top_lang = len(data) # data.drop(data[data['top_lang'] != 'en'].index, inplace = True) # print(f'{before_top_lang - len(data)} instances dropped.') # # before_top_prob = len(data) # data.drop(data[data['top_prob'] < 0.9].index, inplace = True) # print(f'{before_top_prob - len(data)} instances dropped.') # # # Remove stop words # stopwords = set(nltk.corpus.stopwords.words('english')) # use set (hash table) data structure for faster lookup # # # also add urllink and nbsp to set of words to remove # stopwords.update(['urllink', 'nbsp']) # # data['clean_data'] = data['clean_data'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords])) # # print("Done preprocessing data.") # # print("Saving preprocessed dataframe to csv...") # # save pre-processed dataframe to csv # data.to_csv("./data/blogs_kaggle/blogger_preprocessed_data.csv") # # else: # # exists # # Read and load dataset # print("Reading preprocessed data...") # data = pd.read_csv("./data/blogs_kaggle/blogger_preprocessed_data.csv") # print("Done reading preprocessed data.") # # data = data[['clean_data', 'labels']] # # print(f"Dataset size after preprocessing: {data.shape[0]}") # # # Drop columns that are uninformative for writing style (i.e., ID and date) # data.drop(['id', 'date'], axis = 1, inplace = True) # # # Add labels for age categories # def age_to_cat(age): # '''Returns age category label for given age number.''' # # if 13 <= int(age) <= 17: # return '13-17' # elif 23 <= int(age) <= 27: # return '23-27' # elif 33 <= int(age): # return '33-47' # else: # print(int(age)) # raise ValueError("Given age not in one of pre-defined age groups.") # # # data['age_cat'] = data['age'].apply(age_to_cat) # # # Merge all possibly interesting labels into one column # data['labels'] = data.apply(lambda col: [col['gender'], str(col['age']), col['topic'], col['sign']], axis = 1) # # # Only keep age as label # # data['labels'] = data.apply(lambda col: [str(col['age'])], axis = 1) # TODO: Why keep age as string? 
# # data['labels'] = data.apply(lambda col: [col['age']], axis = 1)
# data['labels'] = data.apply(lambda col: [col['age_cat']], axis = 1)
#
# # Reduce dataframe to only contain cleaned blogs and list of labels
# data = data[['clean_data', 'labels']]

# results dict
accs_all = {}

if dataset == 'blog':
    class_labels_list = ['13-17', '23-27', '33-47']
elif dataset == 'bnc' or dataset == 'bnc_rb':
    class_labels_list = ['19_29', '50_plus']

# Evaluate performance
def print_evaluation_scores(labels, preds):
    print(f"Accuracy: {accuracy_score(labels, preds)}")
    print(f"F1 score: {f1_score(labels, preds, average = None)}")  # outputs F1 per class
    print(f"Average precision: {average_precision_score(labels, preds, average = 'micro')}")
    print(f"Average recall: {recall_score(labels, preds, average = 'micro')}")
    print(classification_report(labels, preds, digits=5, zero_division=0))
    # print(f"Confusion Matrix: {confusion_matrix(labels.argmax(axis=1), preds.argmax(axis=1))}")

# def print_top_n(vectorizer, clf, class_labels, n_feat = 10):
#     """Prints features with the highest coefficient values, per class"""
#     feature_names = vectorizer.get_feature_names()
#     for i, class_label in enumerate(class_labels):
#         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
#         print("%s: %s" % (class_label,
#               " ".join(feature_names[j] for j in topn)))

# spacy english tokenizer
# spacy_eng = spacy.load("en_core_web_sm")
# def tokenizer_eng(text):
#     text = str(text)
#     return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
#
# token_counter = Counter()
# for sentence in data.clean_data:
#     for word in tokenizer_eng(sentence):
#         token_counter.update([word])
#
# min_thresh = 3000
# trunc_counter = {x: count for x, count in token_counter.items() if count >= min_thresh}

# def print_top_n_thresh(vectorizer, clf, class_labels, n_feat = 100,
#                        counter = trunc_counter):
#     """Prints features with the highest coefficient values, per class."""
#     feature_names = vectorizer.get_feature_names()
#     for i, class_label in enumerate(class_labels):
#         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
#         topn = topn[::-1]  # reverse order so features with high coefficients appear first
#         print("%s: %s" % (class_label,
#               " ".join(feature_names[j] for j in topn if feature_names[j] in counter)))

# def most_informative_feature_for_class(vectorizer, classifier, class_labels, n=10):
#     feature_names = vectorizer.get_feature_names()
#     for i, class_label in enumerate(class_labels):
#         topn = sorted(zip(classifier.estimators_[i].coef_[0], feature_names))[-n:]
#         for coef, feat in topn:
#             print(class_label, feat, coef)

test_accs = {}
test_f1s = {}

for n_gram in n_grams:
    test_accs[n_gram] = {}
    test_f1s[n_gram] = {}
    # for class_label in class_labels_list:
    #     test_f1s[n_gram][class_label] = {}

print("Starting training and testing loops...")
for seed in tqdm(seeds, desc="Seed loop."):

    # set seed for reproducibility
    np.random.seed(seed)

    # shuffle dataframe
    data = data.sample(frac=1).reset_index(drop=True)

    for n in tqdm(n_grams, desc="n-gram loop."):

        # Split data into features / X and labels / Y
        X = data['clean_data']
        Y = data['labels']

        # n-gram model: binary presence/absence of uni- up to n-grams
        vectorizer = CountVectorizer(binary=True, ngram_range=(1, n))

        # fit model
        X = vectorizer.fit_transform(X)

        # # check out a sample of the uni- and bigrams
        # print(vectorizer.get_feature_names()[:10])

        # Get label counts
        label_counts = {}
        # for labels in data.labels.values:   # multi-label variant
        #     for label in labels:
        #         label_counts[label] = label_counts.get(label, 0) + 1
        for label in data.labels.values:
            if label in label_counts:
                label_counts[label] += 1
            else:
                label_counts[label] = 1

        # Binarize the labels for prediction
        # binarizer = MultiLabelBinarizer(classes = sorted(label_counts.keys()))  # for multi-label targets
        binarizer = LabelBinarizer()
        Y = binarizer.fit_transform(data.labels)

        # Split data into train and test sets
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

        # if n == 1:
        #     # save splits and vectorizer
        #     save_file_splits_vzer = f"splits_vzer_{n}_gram_seed_{seed}"
        #     pickle.dump((vectorizer, X_train, X_test, Y_train, Y_test),
        #                 open(save_file_splits_vzer, 'wb'))

        # Fit one-vs-rest logistic regression model
        start_time = time.time()
        model = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000000)
        model = OneVsRestClassifier(model)
        # model = MultiOutputClassifier(model)
        model.fit(X_train, Y_train)
        print(f"Fitting model took {time.time() - start_time} seconds.")

        # save the classifier
        # save_file_name = f"logit_{n}_gram_seed_{seed}"
        # pickle.dump(model, open(save_file_name, 'wb'))

        # make predictions on test set
        Y_pred = model.predict(X_test)
        Y_pred_inversed = binarizer.inverse_transform(Y_pred)
        Y_test_inversed = binarizer.inverse_transform(Y_test)

        print("=" * 81)
        print(f"n = {n}")
        print(f"seed = {seed}")
        print_evaluation_scores(Y_test, Y_pred)
        test_accs[n][seed] = accuracy_score(Y_test, Y_pred)
        test_f1s[n][seed] = f1_score(Y_test, Y_pred, average=None)
        # for label_idx in range(len(class_labels_list)):
        #     test_f1s[n][class_labels_list[label_idx]][seed] = f1_score(Y_test, Y_pred, average=None)[label_idx]

        if n in accs_all:
            accs_all[n].append(accuracy_score(Y_test, Y_pred))
        else:
            accs_all[n] = [accuracy_score(Y_test, Y_pred)]

        # Print most informative features
        # if n == 1:
        #     print("Most informative features per age-group.")
        #     print_top_n_thresh(vectorizer=vectorizer, clf=model,
        #                        class_labels=class_labels_list, n_feat=20)

        print("-" * 81)

        # print("Some failure cases.")
        # for i, (x, pred, label) in enumerate(zip(X_test, Y_pred, Y_test)):
        #     if (pred != label).any():
        #         print(f"pred: {pred}")
        #         print(f"label: {label}")
        #         pred_cat = binarizer.classes_[np.where(pred == 1)[0][0]]
        #         label_cat = binarizer.classes_[np.where(label == 1)[0][0]]
        #         print(data['clean_data'][i], 'has been classified as ', pred_cat, 'and should be ', label_cat)

        print("=" * 81)

        # UNCOMMENT FOLLOWING LINES FOR CM PLOTS
        # int_labels = [label for label in range(len(class_labels_list))]
        # cm = confusion_matrix(Y_test, Y_pred, labels=int_labels)
        # make_confusion_matrix(cf=cm, categories=class_labels_list, title=f'Confusion Matrix for {dataset} on Test set',
        #                       num_labels=int_labels, y_true=Y_test, y_pred=Y_pred, figsize=FIGSIZE)
        # cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
        # plt.savefig(f"{FIGDIR}{dataset}/cm_{n}_gram_{dataset}_dt_{cur_datetime}.png",
        #             bbox_inches='tight')

# most_informative_feature_for_class(vectorizer = vectorizer, classifier = model, class_labels = class_labels_list, n=10)

# def plot_accuracies(accs, show = False):
#     means = [np.mean(accs[n]) for n in range(1, len(accs) + 1)]
#     print(np.mean(means))
#     stds = [np.std(accs[n]) for n in range(1, len(accs) + 1)]
#
#     x_pos = np.arange(len(accs))
#     x_labels = list(accs.keys())
#
#     # Build the plot
#     fig, ax = plt.subplots()
#     ax.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)
#     ax.set_ylabel('Mean classification accuracy.')
#     ax.set_xlabel("$n$")
#     ax.set_xticks(x_pos)
#     ax.set_xticklabels(x_labels)
#     ax.set_title('Age group prediction accuracy for various n-gram models.')
#     ax.yaxis.grid(True)
#
#     # Save the figure and show
#     plt.tight_layout()
#     plt.savefig('figures/bar_plot_with_error_bars_10000.png')
#
#     if show:
#         plt.show()

# plot_accuracies(accs_all)

# print average metrics
print(89 * '-')
print("PRINTING AVERAGE METRICS")
for n_gram in n_grams:
    n_gram_accs = []
    n_gram_f1s = []
    for seed in seeds:
        n_gram_accs.append(test_accs[n_gram][seed])
        n_gram_f1s.append(test_f1s[n_gram][seed])
    print(f"| n = {n_gram} | Average accuracy = {np.mean(n_gram_accs)} | Acc std = {np.std(n_gram_accs)} "
          f"| Average f1s = {np.mean(n_gram_f1s, axis=0)} | F1s std = {np.std(n_gram_f1s, axis=0)} |")

overall_end_time = time.time()
print(f"Done with everything. Took {overall_end_time - overall_start_time} seconds.")
class ClassifierLinearSVM: def __init__(self, task, cv=3): self.cv = cv self.model = None self.calibrated_model = None # name of the property self.task = task self.config = helpers.load_yaml("src/config.yml") def train(self, X_train, y_train): if self.task.label_task == "single-label": self.model = LinearSVC(dual=True, max_iter=3000) elif self.task.label_task == "multi-label": self.model = OneVsRestClassifier( LinearSVC(dual=True, max_iter=3000)) self.model.fit(X_train, y_train) self.calibrated_model = CalibratedClassifierCV( base_estimator=self.model, cv="prefit") self.calibrated_model.fit(X_train, y_train) return self.calibrated_model @staticmethod def _linear_scale_confidence(confidences): """ return the ratio of prob according to the sum of top n probabilities for the predicted intents. if probs = [p1, p2, p3] then the return probabilities will be scaled as [p1/sum(p1,p2,p3), p2/sum(p1,p2,p3), p3/sum(p1,p2,p3)] Args: confidences: probabilities of intents Returns: numpy array: the scaled confidences """ s = np.sum(confidences) return confidences / s def predict_utt_top_n(self, featurized_utt, n=3): """ predict the topn predictions along with the confidence probability for each one. Note that model.classes_ contains the trained labels in alphabetical order. Here, we sort the confidences together with the labels, and return the top3 from this sorted order Args: featurized_utt (str): featurized and tokenized single utterance Returns: One list of strings and one list of floats """ raw_confidences = self.calibrated_model.predict_proba( featurized_utt)[0] # indices of sorted confidences from high to low confidence sorted_conf_idx = np.argsort(raw_confidences)[::-1][:n] labels = np.take(self.calibrated_model.classes_, sorted_conf_idx) confidences = np.take(raw_confidences, sorted_conf_idx) # scaled_confidences = self._linear_scale_confidence(confidences) scaled_confidences = confidences return labels, scaled_confidences def predict_batch_top_n(self, X_test, topn=5): """ predict the topn predictions for the whole batch. Returns a list of tuples, where each tuple is a list """ return [ self.predict_utt_top_n(test.reshape(1, -1), n=topn) for test in X_test ] def get_pred_and_accuracy(self, X_test, y_test, topn=5): """ Returns predictions and accuracy for the test set """ predictions = [ self.predict_utt_top_n(test.reshape(1, -1), n=topn) for test in X_test ] num_correct = 0 for test, pred in zip(y_test, predictions): topn_list = pred[0] if test in topn_list: num_correct += 1 return predictions, num_correct / len(y_test) def load(self): self.calibrated_model = helpers.load_model_from_dir( self.config["models_dir"], self.task.classifier_name) def export(self): helpers.save_model_to_dir(self.config["models_dir"], self.task.classifier_name, self.calibrated_model)
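# What _linear_scale_confidence does, on concrete numbers: the top-n raw
# probabilities are renormalised to sum to 1 over just those n intents, so the
# ranking is preserved while the probability mass outside the top n is discarded.
import numpy as np

confidences = np.array([0.5, 0.3, 0.1])   # top-3 raw probabilities
print(confidences / np.sum(confidences))  # [0.556 0.333 0.111]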
for filename in elsa_filenames:
    # read the image
    image = imread(filename)
    # resize to a fixed size so every HOG vector has the same length
    image = resize(image, (200, 200))
    hog_features = hog(image, orientations=12, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1))
    data.append(hog_features)
    labels.append(1)

print('Finished adding Elsa samples to dataset')

for filename in eric_filenames:
    # read the image
    image = imread(filename)
    # resize to a fixed size so every HOG vector has the same length
    image = resize(image, (200, 200))
    hog_features = hog(image, orientations=12, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1))
    data.append(hog_features)
    labels.append(2)

print('Finished adding Eric samples to dataset')

print('Training the SVM')

# create the SVC
clf = OneVsRestClassifier(SVC(kernel="linear", probability=True))

# train the SVM
clf.fit(data, labels)

# pickle it - save it to a file (pickle.dump returns None, so don't rebind clf)
with open("signature.detector", "wb") as f:
    pickle.dump(clf, f)
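# A sketch of the matching inference step for the detector pickled above.
# The model filename comes from the training script; the query image path and
# the grayscale assumption are illustrative.
import pickle
from skimage.feature import hog
from skimage.io import imread
from skimage.transform import resize

with open("signature.detector", "rb") as f:
    clf = pickle.load(f)

image = resize(imread("query_signature.png", as_gray=True), (200, 200))
features = hog(image, orientations=12, pixels_per_cell=(16, 16),
               cells_per_block=(1, 1))
print(clf.predict([features]), clf.predict_proba([features]))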
max_features=200000, smooth_idf=True, norm="l2", tokenizer=lambda x: x.split(), sublinear_tf=False, ngram_range=(1, 3)) x_train_multilabel = vectorizer.fit_transform(x_train['question']) x_test_multilabel = vectorizer.transform(x_test['question']) print("Dimensions of train data X: ", x_train_multilabel.shape, "Y: ", y_train.shape) print("Dimensions of test data X: ", x_test_multilabel.shape, "Y: ", y_test.shape) classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1) classifier.fit(x_train_multilabel, y_train) predictions = classifier.predict(x_test_multilabel) print("Accuracy:", metrics.accuracy_score(y_test, predictions)) print("Macro f1 score:", metrics.f1_score(y_test, predictions, average='macro')) print("Micro f1 score:", metrics.f1_score(y_test, predictions, average='micro')) print("Hamming loss:", metrics.hamming_loss(y_test, predictions)) print("Precision recall report: \n", metrics.classification_report(y_test, predictions)) # Dumping model joblib.dump(classifier, './model/equal_weight_model.pkl')
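# Hamming loss, one of the metrics printed above, worked by hand: it is the
# fraction of individual label slots that are wrong, which is why it can stay
# small even when exact-match accuracy is poor.
import numpy as np
from sklearn.metrics import accuracy_score, hamming_loss

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 0]])
print(hamming_loss(y_true, y_pred))    # 1 wrong slot out of 6 -> 0.1666...
print(np.mean(y_true != y_pred))       # same quantity computed directly
print(accuracy_score(y_true, y_pred))  # exact-match (subset) accuracy -> 0.5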
def oneVrest(x, y, test):
    """Fit a one-vs-rest linear SVC on (x, y) and return predictions for test."""
    predict = OneVsRestClassifier(SVC(kernel='linear')).fit(x, y).predict(test)
    return predict
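# Example call for the helper above on iris, assuming the snippet's own
# imports (OneVsRestClassifier, SVC) are in scope; the split is illustrative.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(oneVrest(X_train, y_train, X_test)[:10])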
def test_pairwise_n_features_in():
    """Check the n_features_in_ attributes of the meta and base estimators

    When the training data is a regular design matrix, everything is intuitive.
    However, when the training data is a precomputed kernel matrix, the
    multiclass strategy can resample the kernel matrix of the underlying base
    estimator both row-wise and column-wise and this has a non-trivial impact
    on the expected value for the n_features_in_ of both the meta and the base
    estimators.
    """
    X, y = iris.data, iris.target

    # Remove the last sample to make the classes not exactly balanced and make
    # the test more interesting.
    assert y[-1] == 0
    X = X[:-1]
    y = y[:-1]

    # Fitting directly on the design matrix:
    assert X.shape == (149, 4)

    clf_notprecomputed = svm.SVC(kernel="linear").fit(X, y)
    assert clf_notprecomputed.n_features_in_ == 4

    ovr_notprecomputed = OneVsRestClassifier(clf_notprecomputed).fit(X, y)
    assert ovr_notprecomputed.n_features_in_ == 4
    for est in ovr_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    ovo_notprecomputed = OneVsOneClassifier(clf_notprecomputed).fit(X, y)
    assert ovo_notprecomputed.n_features_in_ == 4
    assert ovo_notprecomputed.n_classes_ == 3
    assert len(ovo_notprecomputed.estimators_) == 3
    for est in ovo_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    # When working with precomputed kernels we have one "feature" per training
    # sample:
    K = X @ X.T
    assert K.shape == (149, 149)

    clf_precomputed = svm.SVC(kernel="precomputed").fit(K, y)
    assert clf_precomputed.n_features_in_ == 149

    ovr_precomputed = OneVsRestClassifier(clf_precomputed).fit(K, y)
    assert ovr_precomputed.n_features_in_ == 149
    assert ovr_precomputed.n_classes_ == 3
    assert len(ovr_precomputed.estimators_) == 3
    for est in ovr_precomputed.estimators_:
        assert est.n_features_in_ == 149

    # This becomes really interesting with OvO and precomputed kernel together:
    # internally, OvO will drop the samples of the classes not part of the pair
    # of classes under consideration for a given binary classifier. Since we
    # use a precomputed kernel, it will also drop the matching columns of the
    # kernel matrix, and therefore we have fewer "features" as result.
    #
    # Since class 0 has 49 samples, and class 1 and 2 have 50 samples each, a
    # single OvO binary classifier works with a sub-kernel matrix of shape
    # either (99, 99) or (100, 100).
    ovo_precomputed = OneVsOneClassifier(clf_precomputed).fit(K, y)
    assert ovo_precomputed.n_features_in_ == 149
    assert ovo_precomputed.n_classes_ == 3
    assert len(ovo_precomputed.estimators_) == 3
    assert ovo_precomputed.estimators_[0].n_features_in_ == 99   # class 0 vs class 1
    assert ovo_precomputed.estimators_[1].n_features_in_ == 99   # class 0 vs class 2
    assert ovo_precomputed.estimators_[2].n_features_in_ == 100  # class 1 vs class 2
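# Companion sketch to the shapes checked above: at predict time with a
# precomputed kernel, the "features" axis must again index the training
# samples, so the test kernel has shape (n_test, n_train).
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

K_train = X_train @ X_train.T   # (n_train, n_train)
K_test = X_test @ X_train.T     # (n_test, n_train)

ovr = OneVsRestClassifier(svm.SVC(kernel="precomputed")).fit(K_train, y_train)
print(ovr.n_features_in_, K_test.shape, ovr.score(K_test, y_test))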
def generate_tags(article_id):
    try:
        import pandas as pd
        import numpy as np
        import itertools
        from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
        from sklearn.preprocessing import MultiLabelBinarizer
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.svm import SVC
        from website.models import Article

        a = Article.objects.get(id=article_id)

        # import article's comments into dataframe
        df = pd.DataFrame(list(a.comment_set.all().values(
            'id', 'article', 'disqus_id', 'text', 'summary', 'tags',
            'suggested_tags')))

        # merge all text (comments+summaries) into a new column
        df['train_text'] = df[['text', 'summary']].apply(lambda x: ' '.join(x), axis=1)
        print(df['train_text'])

        # define classifier
        clf = OneVsRestClassifier(SVC(kernel='linear'))

        # train data: use only comments with tags
        tagged = df.loc[df['tags'].notnull()]

        # train data: preprocess and vectorize (TfIdf) text data
        count_vect = CountVectorizer(
            stop_words='english',
            min_df=3,
            max_df=0.30,
            #lowercase=True,
            ngram_range=(1, 2),
        )
        X_train_counts = count_vect.fit_transform(list(tagged.train_text))
        tfidf_transformer = TfidfTransformer().fit(X_train_counts)
        X_train_tfidf = tfidf_transformer.transform(X_train_counts)

        # train classifier
        clf = clf.fit(X_train_tfidf, tagged.tags)

        # suggest tags for ALL instances in df
        test_df = df.drop_duplicates(subset=['disqus_id'])
        X_test_counts = count_vect.transform(list(test_df.train_text))
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        suggested = clf.predict(X_test_tfidf)

        # save suggested tags to the dataframe
        test_df.suggested_tags = suggested

        # add suggested tags to the database
        sorted_df = test_df.sort_values('disqus_id')
        comments = a.comment_set.all().order_by('disqus_id')
        for comment in comments:
            comment.suggested_tags.clear()
        for row_item, comment in zip(sorted_df.iterrows(), comments):
            index, row = row_item
            if row['suggested_tags']:
                if not comment.tags.filter(id=row['suggested_tags']).exists():
                    comment.suggested_tags.add(row['suggested_tags'])
    except Exception as e:
        print(e)
def test_ovr_fit_predict_svc(): ovr = OneVsRestClassifier(svm.SVC()) ovr.fit(iris.data, iris.target) assert len(ovr.estimators_) == 3 assert ovr.score(iris.data, iris.target) > 0.9
def test_ovr_partial_fit(): # Test if partial_fit is working as intended X, y = shuffle(iris.data, iris.target, random_state=0) ovr = OneVsRestClassifier(MultinomialNB()) ovr.partial_fit(X[:100], y[:100], np.unique(y)) ovr.partial_fit(X[100:], y[100:]) pred = ovr.predict(X) ovr2 = OneVsRestClassifier(MultinomialNB()) pred2 = ovr2.fit(X, y).predict(X) assert_almost_equal(pred, pred2) assert len(ovr.estimators_) == len(np.unique(y)) assert np.mean(y == pred) > 0.65 # Test when mini batches doesn't have all classes # with SGDClassifier X = np.abs(np.random.randn(14, 2)) y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3] ovr = OneVsRestClassifier( SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)) ovr.partial_fit(X[:7], y[:7], np.unique(y)) ovr.partial_fit(X[7:], y[7:]) pred = ovr.predict(X) ovr1 = OneVsRestClassifier( SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)) pred1 = ovr1.fit(X, y).predict(X) assert np.mean(pred == y) == np.mean(pred1 == y) # test partial_fit only exists if estimator has it: ovr = OneVsRestClassifier(SVC()) assert not hasattr(ovr, "partial_fit")
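# The partial_fit contract exercised above, in plain form: the full class set
# must be passed on the first call, because later mini-batches may miss some
# classes entirely. Batch size and data are illustrative.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle

X, y = shuffle(*load_iris(return_X_y=True), random_state=0)
ovr = OneVsRestClassifier(MultinomialNB())
for start in range(0, len(y), 30):  # stream the data in 30-sample batches
    stop = start + 30
    ovr.partial_fit(X[start:stop], y[start:stop],
                    classes=np.unique(y) if start == 0 else None)
print(ovr.score(X, y))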
def test_ovr_fit_predict_sparse(): for sparse in [ sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix, ]: base_clf = MultinomialNB(alpha=1) X, Y = datasets.make_multilabel_classification( n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0, ) X_train, Y_train = X[:80], Y[:80] X_test = X[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train)) Y_pred_sprs = clf_sprs.predict(X_test) assert clf.multilabel_ assert sp.issparse(Y_pred_sprs) assert_array_equal(Y_pred_sprs.toarray(), Y_pred) # Test predict_proba Y_proba = clf_sprs.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. pred = Y_proba > 0.5 assert_array_equal(pred, Y_pred_sprs.toarray()) # Test decision_function clf = svm.SVC() clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train)) dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int) assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
def test_ovr_multilabel_predict_proba(): base_clf = MultinomialNB(alpha=1) for au in (False, True): X, Y = datasets.make_multilabel_classification( n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=au, random_state=0, ) X_train, Y_train = X[:80], Y[:80] X_test = X[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # Decision function only estimator. decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) assert not hasattr(decision_only, "predict_proba") # Estimator with predict_proba disabled, depending on parameters. decision_only = OneVsRestClassifier(svm.SVC(probability=False)) assert not hasattr(decision_only, "predict_proba") decision_only.fit(X_train, Y_train) assert not hasattr(decision_only, "predict_proba") assert hasattr(decision_only, "decision_function") # Estimator which can get predict_proba enabled after fitting gs = GridSearchCV(svm.SVC(probability=False), param_grid={"probability": [True]}) proba_after_fit = OneVsRestClassifier(gs) assert not hasattr(proba_after_fit, "predict_proba") proba_after_fit.fit(X_train, Y_train) assert hasattr(proba_after_fit, "predict_proba") Y_pred = clf.predict(X_test) Y_proba = clf.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. pred = Y_proba > 0.5 assert_array_equal(pred, Y_pred)
'verb_count', 'adjective_count', 'tot_pos_words_count', 'tot_neg_words_count', 'tot_neu_words_count', 'user_avg_stars', 'user_yelping_since', 'user_review_count'] ) df = df[df.review_stars != 3] X = df.values[:, 1:] y = df.values[:, 0] # Binarize the output y = label_binarize(y, classes=[1, 2, 4, 5]) n_classes = y.shape[1] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) classifier = OneVsRestClassifier(GradientBoostingClassifier(loss= 'deviance', max_features= 'auto', n_estimators= 110, random_state= 3)) y_score = classifier.fit(X_train, y_train).predict_proba(X_test) #.decision_function(X_test) fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) colors = ['blue', 'red', 'green', 'yellow'] for i, color in zip(range(n_classes), colors): plt.plot(fpr[i], tpr[i], color=color, label='ROC curve of class {0} (area = {1:0.2f})' ''.format(i, roc_auc[i])) plt.plot([0, 1], [0, 1], 'k--')
def test_ovr_always_present(): # Test that ovr works with classes that are always present or absent. # Note: tests is the case where _ConstantPredictor is utilised X = np.ones((10, 2)) X[:5, :] = 0 # Build an indicator matrix where two features are always on. # As list of lists, it would be: [[int(i >= 5), 2, 3] for i in range(10)] y = np.zeros((10, 3)) y[5:, 0] = 1 y[:, 1] = 1 y[:, 2] = 1 ovr = OneVsRestClassifier(LogisticRegression()) msg = r"Label .+ is present in all training examples" with pytest.warns(UserWarning, match=msg): ovr.fit(X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) assert np.unique(y_pred[:, -2:]) == 1 y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.ones(X.shape[0])) # y has a constantly absent label y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) msg = r"Label not 1 is present in all training examples" with pytest.warns(UserWarning, match=msg): ovr.fit(X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
X[count, :] = vec_one
count += 1

labels_get = list(labels.label)
labels_get = [[x] for x in labels_get]
y = MultiLabelBinarizer().fit_transform(labels_get)

list_mico = []
list_maco = []
items = [p / 10.0 for p in range(1, 10)]
for test_frac in items:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_frac,
                                                        random_state=51)
    clf = LogisticRegression()  # C=1, penalty="l2", tol=0.01
    y_score = OneVsRestClassifier(clf).fit(X_train, y_train).predict(X_test)

    # rows where at least one label was predicted
    item_predict = [row for row in y_score if row.any()]
    all_zeros = not np.any(y_score)

    micro_f1 = f1_score(y_test, y_score, average='micro')
    macro_f1 = f1_score(y_test, y_score, average='macro')
    print micro_f1
    print macro_f1
    list_mico.append(micro_f1)
    list_maco.append(macro_f1)
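# micro vs macro F1 (the two averages tracked above), on a tiny hand-checkable
# indicator matrix: micro pools all label slots before computing F1, while
# macro averages the per-label F1 scores, so rare labels weigh more under macro.
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0], [1, 0], [0, 1]])
y_pred = np.array([[1, 0], [0, 0], [0, 1]])
print(f1_score(y_true, y_pred, average='micro'))  # pooled: tp=2, fp=0, fn=1 -> 0.8
print(f1_score(y_true, y_pred, average='macro'))  # (2/3 + 1) / 2 -> 0.8333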
print "time to train cosine k nearest neighbors: %.2f seconds\n" % (end - start) ''' ----------------------------------------- ''' print "\n-----------------------------------------\n\n" # Load the digits dataset digits = load_digits() X = digits.data y = digits.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) #naive bayes on digits start = time.clock() clf = OneVsRestClassifier(MultinomialNB()) clf.fit(X_train,y_train) accuracy = clf.score(X_test, y_test) * 100.0 end = time.clock() print "naive bayes accuracy on digits (small dataset): %.2f%%" % accuracy print "time to train naive bayes: %.2f seconds\n" % (end - start) #logistic regression on digits start = time.clock() clf = LogisticRegression('l1', C=0.1) clf.fit(X_train,y_train) accuracy = clf.score(X_test, y_test) * 100.0 end = time.clock() print "logistic regression accuracy on digits (small dataset): %.2f%%" % accuracy print "time to train logistic regression: %.2f seconds\n" % (end - start)
class NodeTransformerLogit(Transformer):
    """
    we will get a list of blocks belonging to N classes.
    we train a logit classifier for those classes, as well as a multilabel
    classifier for the neighbors of those classes

    the built feature vector is 3*N long (N class scores + 2*N neighbor scores)
    """
    dGridSearch_LR_conf = dGridSearch_CONF

    def __init__(self, nbClass=None, n_feat_node=1000, t_ngrams_node=(2, 4),
                 b_node_lc=False, n_jobs=1):
        """
        input:
        - number of classes
        - number of ngrams
        - ngram min/max size
        - lowercase or not
        - n_jobs when fitting the logit using grid search

        if n_feat_node is negative, or 0, or None, we use all possible ngrams
        """
        Transformer.__init__(self)

        self.nbClass = nbClass
        self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
        self.n_jobs = n_jobs

        self.text_pipeline = None  # feature extractor
        self.mdl_main = None       # the main model predicting among the nbClass classes
        self.mdl_neighbor = None   # the neighborhood model predicting zero to many of the classes

    def fit(self, X, y=None):
        """
        This transformer needs the graphs to be fitted properly - see fitByGraph
        """
        return self

    def fitByGraph(self, lGraph, lAllNode=None):
        """
        we need to train 2 logits: one to predict the node class, another to
        predict the class of the neighborhood
        """
        self.text_pipeline = Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            ('tf', TfidfVectorizer(lowercase=self.b_node_lc
                                   #, max_features=10000
                                   , analyzer='char',
                                   ngram_range=self.t_ngrams_node))  # (2,6)
            # we can use it separately from the pipeline once fitted
            # , ('word_selector', SelectKBest(chi2, k=self.n_feat_node))
        ])

        # the y
        if lAllNode is None:
            lAllNode = [nd for g in lGraph for nd in g.lNode]
        y = np.array([nd.cls for nd in lAllNode], dtype=int)
        if self.nbClass != len(np.unique(y)):
            traceln("Classes seen are: %s" % np.unique(y).tolist())
            traceln(self.nbClass)
            raise ValueError("ERROR: some class is not represented in the training set")

        # fitting the textual feature extractor
        self.text_pipeline.fit(lAllNode, y)

        # extracting textual features
        x = self.text_pipeline.transform(lAllNode)

        # creating and training the main logit model
        lr = LogisticRegression(class_weight='balanced')
        self.mdl_main = GridSearchCV(lr, self.dGridSearch_LR_conf, refit=True,
                                     n_jobs=self.n_jobs)
        self.mdl_main.fit(x, y)
        del y
        if DEBUG: print(self.mdl_main)

        # now fit a multiclass multilabel logit to predict if a node is neighbor
        # with at least one node of a certain class, for each class
        # Shape = (nb_tot_nodes x nb_tot_labels)
        y = np.vstack([g.getNeighborClassMask() for g in lGraph])  # we get this from the graph object
        assert y.shape[0] == len(lAllNode)

        lr = LogisticRegression(class_weight='balanced')
        gslr = GridSearchCV(lr, self.dGridSearch_LR_conf, refit=True,
                            n_jobs=self.n_jobs)
        self.mdl_neighbor = OneVsRestClassifier(gslr, n_jobs=self.n_jobs)
        self.mdl_neighbor.fit(x, y)

        del x, y
        if DEBUG: print(self.mdl_neighbor)

        return self

    def transform(self, lNode):
        """
        return the 2 logit scores
        """
        # for each class: is_of_class? is_neighbor_of_class on same page or across pages?
        a = np.zeros((len(lNode), 3 * self.nbClass), dtype=np.float64)

        x = self.text_pipeline.transform(lNode)

        a[..., 0:self.nbClass] = self.mdl_main.predict_proba(x)
        a[..., self.nbClass:3 * self.nbClass] = self.mdl_neighbor.predict_proba(x)

        # for i, nd in enumerate(lNode):
        #     print(i, nd, a[i])
        if DEBUG: print(a)
        return a
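# Shape logic of NodeTransformerLogit.transform in isolation: one probability
# block from the main multiclass model plus one from the one-vs-rest neighbour
# model, concatenated per node. Toy arrays only; N = nbClass = 2 here, and the
# 2*N neighbour columns stand for the same-page / cross-page label masks.
import numpy as np

n_nodes, N = 4, 2
p_main = np.random.dirichlet(np.ones(N), size=n_nodes)  # (n_nodes, N), rows sum to 1
p_neighbor = np.random.rand(n_nodes, 2 * N)             # (n_nodes, 2N), one independent proba per label

a = np.zeros((n_nodes, 3 * N), dtype=np.float64)
a[:, :N] = p_main
a[:, N:] = p_neighbor
print(a.shape)  # (4, 6): the per-node feature vector is 3*N long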
"kernel; c/gamma; tol; jakosc zbioru trenujacego (1vs1); jakosc zbiory testowego (1vs1); jakosc zbioru trenujacego (1vsR); jakosc zbiory testowego (1vsR)\n" ) X_train, y_train = getData(sciezka + train[i]) X_test, y_test = getData(sciezka + test[i]) for j in range(0, 4): gram = get_kernel(j, X_train, X_train, 10) gram_test = rbf_kernel(X_test, X_train, 10) gram_test = rbf_kernel(X_test, X_train, 10) print('Test gram calculated') #print('Jakosc klasyfikacji zbioru testowego: ', clf.score(gram_test,y_test)) #cpredicted = clf.predict(gram_test) ovo = OneVsOneClassifier( SVC(C=params[j], kernel='precomputed', tol=params[j])).fit(gram, y_train) ovr = OneVsRestClassifier( SVC(C=params[j], kernel='precomputed', tol=params[j])).fit(gram, y_train) print('Jakosc klasyfikacji zbioru trenujacego: ', ovo.score(X_train, y_train)) print('Jakosc klasyfikacji zbioru testowego: ', ovo.score(X_test, y_test)) f.write(kernels[j]) f.write(';') f.write(str(params[j])) f.write(';') f.write(str(tols[j])) gram_test = rbf_kernel(X_test, X_train, 10) f.write(';') f.write(str(ovo.score(gram, y_train))) f.write(';') f.write(str(ovo.score(gram_test, y_test)))
from sklearn.multiclass import OneVsRestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.externals import joblib import numpy as np if __name__ == '__main__': rechtspraak_train_text = np.load("train_data.npy") rechtspraak_train_labels = np.load("train_label.npy") rechtspraak_test_text = np.load("test_data.npy") rechtspraak_test_labels = np.load("test_label.npy") print("Training classifier") classif = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=11), n_jobs=-1) classif.fit(rechtspraak_train_text, rechtspraak_train_labels) score_acc = classif.score(rechtspraak_test_text, rechtspraak_test_labels) joblib.dump(classif, 'model_knn.pkl') print("Score: " + str(score_acc))
print("Accuracy of XGB =", accuracy_score(y_test,xgb_pred),"\n") print("Classification of XGB\n\n",classification_report(y_test,xgb_pred),"\n") print("Confusion matrix of XGB\n\n\n",confusion_matrix(y_test,xgb_pred)) # In[58]: from sklearn.multiclass import OneVsRestClassifier from sklearn.ensemble import AdaBoostClassifier # In[59]: ada_model = OneVsRestClassifier(AdaBoostClassifier()) # In[60]: ada_model.fit(x_train,y_train) # In[61]: ada_pred = ada_model.predict(x_test) # In[64]: