def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=0.001, n_iter=5, random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    # The transformer is already fitted above, so transform() is enough here.
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return sgd_pipeline
def runSVCPipeline(entries, langs):
    t0 = time()
    svc_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', LinearSVC(dual=False, loss='squared_hinge',
                                               max_iter=100, random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    # dec = clf.decision_function([[1]])
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return svc_pipeline
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)
def bagOfWords(dataset):
    sp = SetProcessing()
    datalist = sp.convertDataToList(dataset)
    japanese, korean, mandarin = sp.organizeEasternLanguages(datalist)
    datalist = datalist[870:970]
    pairs = sp.buildSpeakingLearningPairs(datalist)
    print(pairs)

    entries = []
    langs = []
    korean = korean[:10]
    japanese = japanese[:10]
    for s in korean:
        datalist.append(s)
    for fr in japanese:
        datalist.append(fr)
    for data in datalist:
        entries.append(data[sp.ENTRY])
        langs.append(data[sp.SPEAKING])
    print(langs)

    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)
    X_train_tfidf = X_train_tfidf.toarray()

    tree = SGDClassifier()  # despite the name, this is an SGD linear classifier
    tree.fit(X_train_tfidf, langs)
    result = tree.predict(X_train_tfidf)
    print(np.mean(result == langs))
    print(metrics.classification_report(langs, result, target_names=langs))
def vectorize(training_data, test_data, approach):
    """Function that creates vectors from the training and test data for the SVM,
    trains the SVM and makes predictions on the test data. Uses the svm module
    from scikit.

    Parameters
    ----------
    training_data : A list of lists of the format [named_pair, relationship, list of features].
    test_data : A list of lists of the format [named_pair, list of features].
    """
    global corpus, classes
    test_corpus = []
    for data in training_data:
        named_pair = data[0]
        rel_class = data[1]
        tokens = data[2]
        classes.append(rel_class)
        corpus.append(' '.join(tokens).decode("UTF-8", errors="ignore").encode("UTF-8"))
    u_classes = set(classes)
    class_list = list(u_classes)

    vectorizer = TfidfVectorizer(min_df=3, sublinear_tf=True, use_idf=True)
    X = vectorizer.fit_transform(corpus)
    svm = SVC(C=10, gamma=0.0, kernel='linear')  # gamma is ignored by the linear kernel
    svm.fit(X, classes)

    ind = 1 if approach == 1 else 2
    for data in test_data:
        named_pair = data[0]
        tokens = data[ind]
        test_corpus.append(' '.join(tokens).decode("UTF-8", errors="ignore").encode("UTF-8"))
    Xtest = vectorizer.transform(test_corpus)
    prediction = svm.predict(Xtest)

    corr1 = 0
    corr2 = 0
    total = 0
    actual = []
    predict = []
    for i in range(len(prediction)):
        key = test_data[i][0][0] + "|" + test_data[i][0][1]
        v = relation_dict[key] if key in relation_dict else "NA"
        if v == "NA":
            continue
        if prediction[i] == v and prediction[i] in class_list:
            corr1 += 1
        elif prediction[i] == v:
            corr2 += 1
        total += 1
        actual.append(v)
        predict.append(prediction[i])
        print str(i) + ":" + test_data[i][0][0] + ":" + test_data[i][0][1] + ":" + prediction[i] + ":" + v

    res = open("Result_approach" + str(approach) + ".txt", "w")
    res.write("Technique Approach " + str(approach) + "\n")
    res.write(metrics.classification_report(actual, predict))
    res.write("Accuracy:" + str(float(corr1 + corr2) / total) + "\n\n")
def print_classification_report(y_true, y_pred, title=''):
    """ Print a classification report """
    print(classification_report(y_true, y_pred))
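# A minimal usage sketch for the helper above; the toy label vectors below are
# illustrative assumptions, not data from the original project.
y_true_demo = [0, 1, 1, 0, 1]
y_pred_demo = [0, 1, 0, 0, 1]
print_classification_report(y_true_demo, y_pred_demo, title='demo')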
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer()), ('clf', LogisticRegression())])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(y_test, predictions)
def main():
    # read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes_musiconly.csv")
    target = np.array([x[0] for x in data])
    train = np.array([x[1:] for x in data])
    train_scaled = preprocessing.scale(train)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_scaled, target, test_size=0.5, random_state=0)

    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

    scores = [
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
    ]

    for score_name, score_func in scores:
        print "Tuning hyper-parameters for %s" % score_name
        print
        # cv is a GridSearchCV constructor argument, not a fit() argument
        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, score_func=score_func)
        clf.fit(X_train, y_train)
        print "Best Parameters set found on development set:"
        print
        print clf.best_estimator_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, cv_scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, cv_scores.std() / 2, params)
        print
        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print metrics.classification_report(y_true, y_pred)
        print
def run_model(self, train_path, test_path):
    trainx, trainy = self.load_data(train_path)
    self.train_model(trainx, trainy)
    testx, testy = self.load_data(test_path)
    predy = self.predict_res(testx)
    accuracy = accuracy_score(testy, predy)
    label = [1, 0]
    classifier = ['interested', 'nointerested']
    result = classification_report(testy, predy, labels=label,
                                   target_names=classifier) + '\naccuracy\t' + str(accuracy)
    print result
def Run(self, trainFileDir, testFileDir):
    XTrain, yTrain = self.loadData(trainFileDir)
    self.trainModel(XTrain, yTrain)
    XTest, yTest = self.loadData(testFileDir)
    yPred = self.predict(XTest)
    accuracy = accuracy_score(yTest, yPred)
    # precision, recall, fScore, _ = precision_recall_fscore_support(y, yPred)
    labels = [1, 0]
    classNames = ['interested', 'notInterested']
    report = classification_report(yTest, yPred, labels=labels,
                                   target_names=classNames) + '\naccuracy\t' + str(accuracy)
    print report
def show_report(self, y_predicted):
    # Map the gold labels ('P'/'N'/'NEU') and the predicted labels
    # ('positivo'/'negativo'/'neutral') onto one integer encoding. Mapping the
    # two sides independently keeps y_true and y_predicted_new aligned even
    # when a prediction disagrees with the gold label (the original only
    # appended a prediction when it happened to match, which skewed the lists).
    true_map = {'P': 1, 'N': -1, 'NEU': 0}
    pred_map = {'positivo': 1, 'negativo': -1, 'neutral': 0}
    y_true = [true_map[label] for label in self.__labels]
    y_predicted_new = [pred_map[label] for label in y_predicted]
    print classification_report(y_true, y_predicted_new)
    print confusion_matrix(y_true, y_predicted_new)
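# Design note (a hedged alternative, not the project's method): scikit-learn's
# metrics accept string labels directly, so instead of the integer re-encoding
# above one can translate the predicted Spanish labels into the gold vocabulary
# and pass strings straight through. pred_map and the toy vectors below are
# illustrative assumptions.
from sklearn.metrics import classification_report, confusion_matrix

pred_map = {'positivo': 'P', 'negativo': 'N', 'neutral': 'NEU'}
y_true_demo = ['P', 'N', 'NEU', 'P']
y_pred_demo = [pred_map[p] for p in ['positivo', 'neutral', 'neutral', 'negativo']]
print classification_report(y_true_demo, y_pred_demo)
print confusion_matrix(y_true_demo, y_pred_demo, labels=['P', 'N', 'NEU'])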
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "d:c:")
    except getopt.GetoptError:
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-c':
            label_col = int(arg)

    y_true = np.genfromtxt(data_file, usecols=label_col, delimiter="\t", skip_header=1)
    for lab in range(2, 9):
        print "lab", lab
        y_pred = np.genfromtxt(data_file, usecols=lab, delimiter="\t", skip_header=1)
        print "The classification report for Algorithm", lab, "is \n"
        # Make classification report
        print metrics.classification_report(y_true, y_pred)
        print "Accuracy: %.6f" % metrics.accuracy_score(y_true, y_pred)
        # Compute specificity from confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        print "Confusion matrix as \n", cm
        tn = int(cm[0, 0])
        fp = int(cm[0, 1])
        print "tn", tn
        print "fp", fp
        # float() avoids Python 2 integer division truncating the ratio to 0
        s = float(tn) / (tn + fp)
        print "Specificity is", s, "\n"
        print "Matthews correlation coefficient: %.6f" % matthews_corrcoef(y_true, y_pred)
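# Alternative sketch for the specificity computation above: for binary labels,
# newer scikit-learn versions let you unpack all four confusion-matrix cells
# with ravel(); float() again guards against Python 2 integer division. The
# toy vectors are illustrative assumptions, not data from the original script.
from sklearn.metrics import confusion_matrix

y_true_demo = [0, 0, 1, 1, 0]
y_pred_demo = [0, 1, 1, 1, 0]
tn, fp, fn, tp = confusion_matrix(y_true_demo, y_pred_demo).ravel()
print "Specificity:", float(tn) / (tn + fp)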
def perceptronClassification():
    from sklearn.datasets import fetch_20newsgroups
    # classification_report lives in sklearn.metrics (sklearn.metrics.metrics is a long-removed alias)
    from sklearn.metrics import f1_score, classification_report
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import Perceptron

    categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                          remove=('headers', 'footers', 'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                         remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)
    classifier = Perceptron(n_iter=100, eta0=0.1)
    classifier.fit(X_train, newsgroups_train.target)
    predictions = classifier.predict(X_test)
    print classification_report(newsgroups_test.target, predictions)
def main():
    # read in data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes.csv")
    o_target = np.array([x[0] for x in data])
    o_train = np.array([x[1:] for x in data])

    # Split the data randomly into 80% training and 20% test
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        o_train, o_target, test_size=0.20)
    print str(len(o_target))
    print str(len(y_test))

    # Compute the most frequent class in the training set
    H = histogram(y_train)
    mc = max(H.iteritems(), key=operator.itemgetter(1))[0]
    print str(H)
    print str(mc)

    # Majority-class baseline: predict the most frequent class everywhere
    y_predict = np.empty(len(y_test))
    y_predict[:] = mc
    # print str(y_predict)
    print metrics.classification_report(y_test, y_predict)
    # zero_one_score is the old name for accuracy_score in early scikit-learn
    print str(metrics.zero_one_score(y_test, y_predict))
def movieReviewsMultiClassClassification():
    import pandas as pd
    df = pd.read_csv('./data/trainMovieSentiment.tsv', header=0, delimiter='\t')
    print df.count()
    print df['Phrase'].head(10)
    print df['Sentiment'].describe()
    print df['Sentiment'].value_counts()
    print df['Sentiment'].value_counts() / df['Sentiment'].count()

    # training with a scikit-learn classifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.logistic import LogisticRegression
    from sklearn.cross_validation import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
    from sklearn.grid_search import GridSearchCV

    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
                         ('clf', LogisticRegression())])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.01, 1, 10)
    }
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score %0.3f' % grid_search.best_score_
    print 'Best params set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'accuracy:', accuracy_score(y_test, predictions)
    print 'confusion matrix:', confusion_matrix(y_test, predictions)
    print 'classification report', classification_report(y_test, predictions)
def Classify(txtList, txtLabels, fileName, labelList):
    x_train = np.array(txtList[0:300])
    y_train = np.array(txtLabels[0:300])
    # start the test slice at 300 so no document is skipped between train and test
    x_test = np.array(txtList[300:])
    y_test = np.array(txtLabels[300:])

    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    f = open(fileName, 'w')
    f.writelines(metrics.classification_report(y_test, predicted, target_names=labelList))
    f.write('\nNumber of Labels:' + str(len(labelList)))
    f.write('\nhamming loss : ' + str(metrics.hamming_loss(y_test, predicted)))
    f.write('\nf-beta(beta=0.5 - biased towards Precision) : ' + str(metrics.fbeta_score(y_test, predicted, 0.5)))
    f.write('\nzero-loss:' + str(zero_one_loss(y_test, predicted)))
    f.write('\nAccuracy score:' + str(metrics.accuracy_score(y_test, predicted)))
    f.close()
def evaluate(df):
    X = df.ix[:, 0:7]
    y = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    print len(X_train)
    y_test = np.array(y_test)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    print "------------", clf.predict_proba(X_test)
    print clf.get_params()

    pipeline = Pipeline([('clf', LogisticRegression())])
    parameters = {}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)
    grid_search.fit(X_train, y_train)
    print "Best score:", grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print(param_name, best_parameters[param_name])

    prediction = grid_search.predict(X_test)
    for i, pred in enumerate(prediction):
        print "original:", y_test[i], "predicted", pred
    print grid_search.score(X_test, y_test)
    print accuracy_score(y_test, prediction)
    print "classification_report", classification_report(y_test, prediction)

    clf_pred = clf.predict(X_test)
    for i, pred in enumerate(clf_pred):
        print "original:", y_test[i], "predicted", pred
    print accuracy_score(y_test, clf_pred)
    print clf.score(X_test, y_test)
def baseline(test_data, approach):
    # baseline method implementation
    correct = 0
    total = 0
    actual = []
    pred = []
    v = ""
    ind = 1 if approach == 1 else 2
    for pt in test_data:
        syn_list = []
        if pt[0][0] + "|" + pt[0][1] in relation_dict:
            v = relation_dict[pt[0][0] + "|" + pt[0][1]]
            if v in relation_synonyms:
                syn_list = relation_synonyms[v]
            else:
                syn_list = [v]
        else:
            v = "NA"
        made = False
        if v == "NA":
            continue
        for x in syn_list:
            if x in pt[ind]:
                pred.append(v)
                correct += 1
                made = True
                break  # stop at the first matching synonym so pred stays aligned with actual
        if made == False:
            pred.append("NA")
        actual.append(v)
        total += 1

    res = open("Result_approach" + str(approach) + ".txt", "a")
    res.write("Technique:Baseline\n")
    res.write(metrics.classification_report(actual, pred))
    res.write("Accuracy: " + str(float(correct) / total) + "\n")
def runTreePipeline(entries, langs):
    t0 = time()
    tree_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                              ('tfidf', TfidfTransformer(use_idf=True)),
                              ('clf', DecisionTreeClassifier(max_features=n_features))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = DecisionTreeClassifier(max_features=n_features)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return tree_pipeline
# it stalled with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
# print X_train.shape

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score
print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\nclassification report:\n', classification_report(y_test, prediction)
print '\nconfusion matrix:\n', confusion_matrix(y_test, prediction)

# plots:
import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)  # draw the matrix before titling and labelling it
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()
# as an array
# import numpy as np
 [    5   409  9024  6693   440]
 [    1    88  1112  2529   853]]
Classification Report:
             precision    recall  f1-score   support

          0       0.59      0.13      0.21      3483
          1       0.51      0.28      0.36     13711
          2       0.64      0.90      0.75     39682
          3       0.56      0.40      0.47     16571
          4       0.62      0.19      0.29      4583

avg / total       0.59      0.61      0.57     78030
"""
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
print 'Classification Report:', classification_report(y_test, predictions)

################# Sample 12 #################
# Applying Multi-label Classification
"""
"""

################# Sample 13 #################
# Multi-Label Classification Performance Metrics
"""
>>> import numpy as np
>>> from sklearn.metrics import hamming_loss
>>> print hamming_loss(np.array([[0.0, 1.0], [1.0, 1.0]]), np.array([[0.0, 1.0], [1.0, 1.0]]))
0.0
"""
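# Follow-up sketch for the hamming_loss doctest above: flip one of the four
# indicator entries and the loss becomes the fraction of wrong labels, 1/4.
# The arrays are illustrative, mirroring the doctest rather than any dataset.
import numpy as np
from sklearn.metrics import hamming_loss

truth = np.array([[0.0, 1.0], [1.0, 1.0]])
flipped = np.array([[1.0, 1.0], [1.0, 1.0]])  # first entry flipped from 0 to 1
print hamming_loss(truth, flipped)  # 0.25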
# Split the data set into two subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
scores = ['precision', 'recall']

for score in scores:
    print '\nTuning hyper parameters for %s\n' % score
    # Define a classifier
    clf = GridSearchCV(svm.SVC(), param_grid, cv=5, n_jobs=-1, scoring=score)
    clf.fit(X_train, y_train)

    print 'Best parameters set found on development set:\n'
    print clf.best_estimator_
    print 'Grid scores on development set:\n'
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, cv_scores.std() / 2, params))

    print '\nDetailed classification report\n'
    print 'The model is trained on the full development set.'
    print 'The scores are computed on the full evaluation set\n'
    y_true, y_pred = y_test, clf.predict(X_test)
    print classification_report(y_true, y_pred)
plt.title('Kittens and Adult Cats')
plt.show()

# Perceptron
categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                     remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)
classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit(X_train, newsgroups_train.target)  # fit() is enough; the transformed output was unused
predictions = classifier.predict(X_test)
print classification_report(newsgroups_test.target, predictions)
"""
Output seen
             precision    recall  f1-score   support

          0       0.89      0.87      0.88       396
          1       0.87      0.78      0.82       397
          2       0.79      0.88      0.83       399

avg / total       0.85      0.85      0.85      1192
"""

# plot the output
import matplotlib
matplotlib.use('Qt4Agg')
def main(argv):
    # get options passed at command line
    try:
        opts, args = getopt.getopt(argv, "d:o:c:C:t:m:")
    except getopt.GetoptError:
        # print helpString
        sys.exit(2)
    # print opts
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-o':
            out_folder = arg
        elif opt == '-c':
            label_col = int(arg)
        elif opt == '-C':
            data_cols = arg
        elif opt == '-t':
            test_file = arg  # whole genome prediction file
        elif opt == '-m':
            model_file = arg

    model_filename = os.path.abspath(model_file)
    data_file = os.path.abspath(data_file)
    test_file = os.path.abspath(test_file)
    print model_file, "\n"

    data_cols = [int(x) for x in data_cols.split(",")]
    x_data = np.loadtxt(data_file, usecols=data_cols, delimiter="\t", skiprows=1)
    y_data = np.genfromtxt(data_file, usecols=label_col, delimiter="\t", skip_header=1)
    test_x_data = np.loadtxt(test_file, usecols=data_cols, delimiter="\t", skiprows=1)
    test_y_data = np.genfromtxt(test_file, usecols=label_col, delimiter="\t", skip_header=1)

    # Load the model file
    estimator = joblib.load(model_filename)

    # perform same scaling on training and testing data
    x_data, test_x_data = scaling_training_testing_data(x_data, test_x_data)

    np.random.seed(0)
    indices = np.random.permutation(len(test_x_data))
    test_x_data = test_x_data[indices]
    test_y_data = test_y_data[indices]

    cols = 0
    with open(test_file, "r") as temp:
        a = '\n'.join(line.strip("\n") for line in temp)
        b = np.genfromtxt(StringIO(a), usecols=cols, delimiter="\t", dtype=None, skip_header=1)
        enhancer_names_test = b[indices]

    y_FAN_pred = estimator.predict(test_x_data)
    y_score_test = estimator.predict_proba(test_x_data)
    print metrics.classification_report(test_y_data, y_FAN_pred)

    combined_test = zip(enhancer_names_test, test_y_data, y_FAN_pred,
                        y_score_test[:, 0], y_score_test[:, 1])
    # f = open(out_folder + "/subroutine_RF_FANTOM_FeatureSelected_pred.txt", 'w')
    f = open(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.txt", 'w')
    f.write("Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n")
    for i in combined_test:
        line = '\t'.join(str(x) for x in i)
        f.write(line + '\n')
    f.close()

    print "Random Forests: On FANTOM, Final Generalization Accuracy: %.6f" % metrics.accuracy_score(test_y_data, y_FAN_pred)
    print "Number of mislabeled points : %d" % (test_y_data != y_FAN_pred).sum()
    print "Random Forests: Final Generalization Accuracy: %.6f" % metrics.accuracy_score(test_y_data, y_FAN_pred)

    # Before we move on, let's look at a key parameter that RF returns, namely feature_importances.
    # This tells us which features in our dataset seemed to matter the most
    # (although won't matter in the present scenario with only 2 features)
    print estimator.feature_importances_

    # Plot ROC
    roc_plt = plot_roc(estimator, test_x_data, test_y_data, y_FAN_pred)
    # pl.savefig(out_folder + "/subroutine_RF_FeatureSelected_split_test_train_Kfold.svg",
    #            transparent=True, bbox_inches='tight', pad_inches=0.2)
    pl.savefig(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.svg",
               transparent=True, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
y_.extend(y_test)
prediction_.extend(prediction)
verbose('----------\n')
verbose("Evaluation")
if opts.mode in ['age', 'gender']:
    from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
    # Computing performance metrics
    print('Accuracy :', accuracy_score(y_, prediction_))
    print('Precision :', precision_score(y_, prediction_))
    print('Recall :', recall_score(y_, prediction_))
    print('F-score :', f1_score(y_, prediction_))
    print('\nClassification report:\n', classification_report(y_, prediction_))
    print('\nConfusion matrix :\n', confusion_matrix(y_, prediction_))
else:
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    print('Mean Abs Error :', mean_absolute_error(y_, prediction_))
    print('Mean Sqr Error :', mean_squared_error(y_, prediction_))
    print('R2 Error :', r2_score(y_, prediction_))

# plots:
# import matplotlib.pyplot as plt
# confusion_matrix_plot = confusion_matrix(y_test, prediction)
# plt.title('confusion matrix')
# plt.colorbar()
# plt.xlabel('true label')
# plt.ylabel('predicted label')
def print_classification_report(y_true, y_pred, title=''):
    cr = classification_report(y_true, y_pred)
    print cr
for m in [1, 2]:
    print "STARTING CLASSIFICATION"
    clf = runClassificationTest(X_train, y_train, m, featureV, datatype)
    predicted = clf.predict(X_test)
    print "Accuracy: %0.3f " % (accuracy_score(y_test, predicted))
    '''
    print "precision ", (precision_score(y_test, clf.predict(X_test), average=None))
    print "recall ", (recall_score(y_test, clf.predict(X_test), average=None))
    print "F1 Score ", (f1_score(y_test, clf.predict(X_test), average=None))
    '''
    if datatype == 3:
        print classification_report(y_test, predicted, target_names=['0', '1', '2'], digits=3)
        print draw_confusion_matrix(y_test, predicted, [0, 1, 2])
    else:
        print classification_report(y_test, predicted, target_names=['0', '1'], digits=3)
        print draw_confusion_matrix(y_test, predicted, [0, 1])
processed_comment_list = []
for art in commentList.items():
    for comm in art[1]:
        processed_comment_list.append(comm.body.decode('ascii', 'ignore'))
features = vectorizer.transform(processed_comment_list)

y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy')
y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_test.npy')
print features.shape
print y_train.shape
print y_test.shape
valueVector = np.concatenate([y_train, y_test])
print
print valueVector.shape

# train_list = [' '.join(sent) for sent in train_list]
# test_list = [' '.join(sent) for sent in test_list]

predicted = [float(v) for v in clf.predict(features)]
print "Accuracy: %0.3f " % (accuracy_score(valueVector, predicted))
print classification_report(valueVector, predicted, target_names=['0', '1'])
print draw_confusion_matrix(valueVector, predicted, ['ham', 'spam'])
data_to_predict = []
for img in X_to_predict:
    features = image_to_feature_vector(img)
    data_to_predict.append(features)
data_to_predict = np.array(data_to_predict) / 255.0

pred = model_CNN.predict(data_to_predict, batch_size=BS, verbose=1)
# argmax turns each row of per-class probabilities into an integer class id
# (the original's np.where returned an awkward tuple here)
pred_cat = np.argmax(pred, axis=1).reshape(-1, 1)

y_pred = pred_cat
y_true = y_to_predict

from sklearn import metrics
print(metrics.classification_report(y_true, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
plt.matshow(cm)

# ======= Predict on Test Set based on above Model
testX_reshape = np.reshape(testX, [len(testX), 64, 64, 3])
testX_data = []
for img in testX_reshape:
    features = image_to_feature_vector(img)
    testX_data.append(features)
testX_data = np.array(testX_data) / 255.0

pred = model.predict(testX_data, batch_size=BS, verbose=1)
pred_cat = np.zeros((len(pred), 1))
    lc.pop(0)
    lc = [float(i) for i in lc]
    x.append(lc)
f.close()

pipeline = Pipeline([
    ('clf', LogisticRegression())
])
parameters = {
    'clf__C': (0.1, 1, 10),
}
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.5)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Confusion Matrix:'
print confusion_matrix(y_test, predictions)
print 'Classification Report:'
print classification_report(y_test, predictions)
                           random_state=0)

# Great, the dataset has 4 classes that we'll try to predict. It's got fairly
# interesting separation as we can see below.
# Let's visualize the data with a scatter plot
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.BuGn)
plt.show()

# In[3]:

# Great, let's now fit this dataset to the Decision Tree Classifier and see how well it does.
dtree = DecisionTreeClassifier(max_depth=10).fit(X, y)  # this parameter defines the maximum depth of the tree
y_pred = dtree.predict(X)
print metrics.classification_report(y, y_pred)

# The report tells us that the overall accuracy of the predicted labels is about 94%. Looking at
# the data, we can be almost certain that this is definitely overfitting. To predict 94% of this
# dataset correctly, the tree would need to be extremely well tuned to the dataset we trained on
# (for now, the entire X dataset). This will mean that when you expose new data to the model, it
# will not be able to predict so well.

# We can confirm our understanding by doing a train/cv split on the data. Let's define a couple
# of functions next that will help us run this multiple times. We'll begin by doing a 80/20 split
# on the data below.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# In[4]:

# All right let's do this the right way. We'll use a cross-validation generator to select train
# and CV datasets to finetune parameters such as C (Regularization parameter we saw earlier).
# These hyperparameters are extremely critical to the model. Now, if we tune parameters against
# the Test dataset, we will end up biasing towards the test set and will once again
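# A minimal sketch of the cross-validation idea discussed in the comments
# above, assuming the X, y and DecisionTreeClassifier names already in scope;
# in older scikit-learn cross_val_score lives in sklearn.cross_validation,
# in newer releases in sklearn.model_selection.
from sklearn.cross_validation import cross_val_score

cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=10), X, y, cv=5)
print "CV accuracy: %0.3f (+/- %0.3f)" % (cv_scores.mean(), cv_scores.std())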
ids_ = np.load(opts.IDS)
le = preprocessing.LabelEncoder()
le.fit(ids_)
verbose("Total classes", le.classes_.shape[0])
ids = le.transform(ids_)

X_train, X_test, y_train, y_test = \
    train_test_split(feats, ids, test_size=0.20, random_state=42)

verbose("Training")
classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                    n_jobs=opts.nprocessors,
                                    max_depth=20, verbose=True)

# Fitting the model
classifier.fit(X_train, y_train)

# Predicting
verbose("Prediction")
prediction = classifier.predict(X_test)

print('Accuracy :', accuracy_score(y_test, prediction))
print('Precision :', precision_score(y_test, prediction))
print('Recall :', recall_score(y_test, prediction))
print('F-score :', f1_score(y_test, prediction))
print('\nClassification report:\n', classification_report(y_test, prediction))
print('\nConfusion matrix :\n', confusion_matrix(y_test, prediction))
np.set_printoptions(threshold=np.nan)

# configure the UTF-8 output environment
reload(sys)
sys.setdefaultencoding('utf-8')

# load the stop-word list from file
stpwrdlst = process_tool.read_stopword("extra_dict/stop_words.txt")

# load the training set
train_set = joblib.load("wordbag/word_bag1124.data")
print train_set.target_name

# print "tokenizing"
# process_tool.chinesefenci("test_corpus", "test_token")
# print "train_bag"
# process_tool.train_bags("test_token", "test_set.data", "test_wordbag")
# print "test tfidf"
# test_data = process_tool.testset_tfidf("test_wordbag/test_set.data", "extra_dict/stop_words.txt", train_set.vocabulary)
test_data = joblib.load("test_wordbag/test_word_bag.data")

# print "MultinomialNB train"
# clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)
# joblib.dump(clf, "model/MultinomialNB.model", compress=3)
clf = joblib.load("model/MultinomialNB.model")
print (test_data.tdm).shape
print len(test_data.label)
# print clf.predict(test_data.tdm)
print test_data.target_name
print classification_report(np.array(test_data.label), clf.predict(test_data.tdm),
                            target_names=train_set.target_name)
cm = confusion_matrix(np.array(test_data.label), clf.predict(test_data.tdm))
print cm
categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                     remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)
classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit(X_train, newsgroups_train.target)  # fit() is enough; the transformed output was unused
predictions = classifier.predict(X_test)
print classification_report(newsgroups_test.target, predictions)

################# Example #################
"""
"""

"""
sudo apt-get remove libopenblas-base
openblas (required for video contextualization) is incompatible with scipy.
"""
import numpy as np
import matplotlib
matplotlib.use('Qt4Agg')
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
    temp = line.strip().split(',')
    train_feature.append([int(v) for v in temp[0:-1]])
    train_target.append(int(temp[-1]))  # the label is one field, so append a single int
train_data.close()

## test data
test_feature = []
test_target = []
for line in test_data:
    temp = line.strip().split(',')
    test_feature.append([int(v) for v in temp[0:-1]])
    test_target.append(int(temp[-1]))
test_data.close()

train_feature = np.array(train_feature)
test_feature = np.array(test_feature)

## OneHotEncoder used
enc = OneHotEncoder(categorical_features=np.array([1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 21]),
                    n_values=[13, 13, 9, 5, 5, 13, 5, 2, 13, 13, 9, 31, 10, 5, 2, 9])
enc.fit(train_feature)
train_feature = enc.transform(train_feature).toarray()
test_feature = enc.transform(test_feature).toarray()

clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(train_feature, train_target)

## result
print(clf.predict(test_feature))
target_names = ['losing', 'active']
print(classification_report(test_target, clf.predict(test_feature), target_names=target_names))
def print_classification_report(y_test_report, y_predicted_report, target_names):
    # target_names = ['class 0', 'class 1']
    print("overall accuracy score of the classifier is")
    print accuracy_score(y_test_report, y_predicted_report)
    print(classification_report(np.array(y_test_report), np.array(y_predicted_report),
                                target_names=target_names))
    return None
def main(): print("Getting features for deleted papers from the database") if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] #code for including keywords match feature print "adding addtional features..." import additional_features as af all_features = af.get_keywords_feature() kw_deleted, kw_confirmed, _ = all_features kw_features = kw_deleted + kw_confirmed for i in range(len(features)): _, _, ckw = kw_features[i] features[i] += (ckw, ) featuresnp = np.array(features, dtype='float32') targetnp = np.array(target, dtype='int32') featuresnp -= np.mean(featuresnp, axis=0) featuresnp /= np.std(featuresnp, axis=0) # Set the parameters by cross-validation # Split the dataset in two equal parts X_train, X_test, y_train, y_test = train_test_split(featuresnp, targetnp, test_size=0.3, random_state=0) tuned_parameters = [{ 'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] }, { 'kernel': ['linear'], 'C': [1, 10, 100, 1000] }] scores = ['precision', 'recall'] for score in scores: print("# Tuning hyper-parameters for %s" % score) print() clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=4, score_func=score, n_jobs=4, verbose=2) clf.fit(X_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_estimator_) print() print("Grid scores on development set:") print() for params, mean_score, scores in clf.cv_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) print() print("Detailed classification report:") print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) print()
d = {d[0]: d[1:] for d in [l.strip()[9:].split(' ')
                           for l in open('reuters/cats.txt', 'rb')
                           if l.startswith('training')]}
for f in glob.glob('/home/gavin/PycharmProjects/mastering-machine-learning/ch4-logistic_regression/reuters/training/*'):
    training_id = f[f.rfind('/') + 1:]
    articles.append(' '.join([label.strip() for label in open(f, 'rb')]))
    labels.append(d[training_id])

vectorizer = TfidfVectorizer()
train_len = int(len(articles) * .7)
X_train = vectorizer.fit_transform(articles[:train_len])
X_test = vectorizer.transform(articles[train_len:])

for label in set([label for instance in labels for label in instance][:3]):
    # train one binary classifier per label (one-vs-rest by hand)
    y = [1 if label in instance else 0 for instance in labels]
    print y
    y_train = y[:train_len]
    y_test = y[train_len:]
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)  # fit() is enough; the transformed output was unused
    predictions = classifier.predict(X_test)
    print y_test
    print predictions
    print classification_report(y_test, predictions)
y_train[unlabeled_set] = -1

###############################################################################
# Learn with LabelSpreading
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

print "Label Spreading model: %d labeled & %d unlabeled points (%d total)" % \
    (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)
print metrics.classification_report(true_labels, predicted_labels)
print "Confusion matrix"
print cm

# calculate uncertainty values for each transduced distribution
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# pick the top 10 most uncertain labels
uncertainty_index = np.argsort(pred_entropies)[-10:]

###############################################################################
# plot
f = pl.figure(figsize=(7, 5))
for index, image_index in enumerate(uncertainty_index):
    image = images[image_index]
for tweet in reader[0:2 * (numironicos / 3)]:
    tweets_train.append(tweet["text"])
    labels_train.append("noironia")
for tweet in reader[2 * (numironicos / 3):]:
    tweets_test.append(tweet["text"])
    labels_test.append("noironia")

stop_words = []
f = open("spanish.txt")
for line in f:
    stop_words.append(line.strip())
f.close()

y_train = np.array(labels_train, dtype=object)
y_test = np.array(labels_test, dtype=object)

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words=stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))

classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test, prediction)
print '\nConfusion matrix :\n', confusion_matrix(y_test, prediction)
# Split the data into training and test sets
Features_train, Features_test, Labels_train, Labels_test = train_test_split(Features, Labels)

# Create a pipeline and an instance of DecisionTreeClassifier for grid search.
# Set 'criterion' to 'entropy' to build the tree using the information gain heuristic.
# pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))])

# Replace Decision Tree with Random Forest
pipeline = Pipeline([('clf', RandomForestClassifier(criterion='entropy'))])

# Specify the hyperparameter space for grid search
parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

# Set GridSearchCV() to maximize the model's F1 score
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(Features_train, Labels_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(Features_test)
print classification_report(Labels_test, predictions)
def load_data(dataset):
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set
    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval


if __name__ == "__main__":
    datasets = load_data('mnist.pkl.gz')
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print train_set_x.shape
    print train_set_y.shape

    logreg = linear_model.LogisticRegression()
    logreg.fit(train_set_x, train_set_y)
    predictions = logreg.predict(test_set_x)
    print confusion_matrix(test_set_y, predictions)
    print classification_report(test_set_y, predictions)
def whole_dataset_train_test(X, y):
    rfpred = RandomForestClassifier().fit(X, y)
    pred = rfpred.predict(X)
    print "When fitted on the whole dataset with selected features, then the classification report is found to be:\n"
    print "Random Forests: Accuracy: %.6f" % metrics.accuracy_score(y, pred)
    print metrics.classification_report(y, pred)