def main(): pipeline = Pipeline([ ('vect', TfidfVectorizer()), ('clf', LogisticRegression()) ]) parameters = { 'vect__max_df': (0.25, 0.5, 0.75), 'vect__stop_words': ('english', None), 'vect__max_features': (5000, 10000, None), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'vect__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2'), 'clf__C': (0.1, 1, 10), } df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t') X, y = df['Phrase'], df['Sentiment'].as_matrix() X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5) grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy') grid_search.fit(X_train, y_train) print 'Best score: %0.3f' % grid_search.best_score_ print 'Best parameters set:' best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print '\t%s: %r' % (param_name, best_parameters[param_name]) predictions = grid_search.predict(X_test) print 'Accuracy:', accuracy_score(y_test, predictions) print 'Precision:', precision_score(y_test, predictions) print 'Recall:', recall_score(y_test, predictions) print 'F1 score:', f1_score(y_test, predictions)
def main(): pipeline = Pipeline([ ('vect', TfidfVectorizer(stop_words='english')), ('clf', LogisticRegression()) ]) parameters = { 'vect__max_df': (0.25, 0.5), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'clf__C': (0.1, 1, 10), } os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets') df = pd.read_csv('data/train.tsv', header=0, delimiter='\t') X, y = df['Phrase'], df['Sentiment'].as_matrix() X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5) grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy') lb = LabelBinarizer() y_train = np.array([number[0] for number in lb.fit_transform(y_train)]) grid_search.fit(X_train, y_train) print 'Best score: %0.3f' % grid_search.best_score_ print 'Best parameters set:' best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print '\t%s: %r' % (param_name, best_parameters[param_name]) predictions = grid_search.predict(X_test) lb = LabelBinarizer() y_train = np.array([number[0] for number in lb.fit_transform(y_train)]) print 'Accuracy:', accuracy_score(y_test, predictions) print 'Precision:', precision_score(y_test, predictions) print 'Recall:', recall_score(y_test, predictions)
def evaluate(self, params, rnnDataTest): predictLabels = [] trueLabels = [] allSNum = rnnDataTest.allSNum allSTree = rnnDataTest.allSTree allSStr = rnnDataTest.allSStr verbIndices = rnnDataTest.verbIndices sentenceLabels = rnnDataTest.sentenceLabels ndoc = rnnDataTest.ndoc() print "Total number of trees/sentences to be evaluated: ", ndoc for s in range(ndoc): if(s % 100 == 0) : print "Processing sentences ", s , ' - ', s+100 thissentVerbIndices = verbIndices[s] sStr = allSStr[s]; sNum = allSNum[s]; sTree = allSTree[s] labels = sentenceLabels[s] if((len(sNum) == 1) or (len(thissentVerbIndices)==0) or (labels.shape[1] != len(sStr))): continue #only one word in a sent, no verbs for this sent, tokens and labels mismatch for nverb, vid in enumerate(thissentVerbIndices): scoresMat = np.zeros((len(sStr), self.Wcat.shape[0])) for wid in range(len(sStr)): indices = np.array([vid, wid]) setVerbnWordDistanceFeat(self.Wv, sNum, vid, wid, params) tree = forwardPropTree(self.W, self.WO, self.Wcat, self.Wv, self.Wo, sNum, sTree, sStr, sNN=None, indicies=None, params=params) calPredictions(tree, self.Wcat, self.Wv, indices, sStr, params) #updates score, nodepath etc for this verb, word pair scoresMat[wid,:] = tree.score pred_answer = viterbi(scoresMat, self.Tran) true_answer = labels[nverb,:] for i in range(len(pred_answer)): predictLabels.append(pred_answer[i]) trueLabels.append(true_answer[i]) #TODO : calculate predicted labels f1 = f1_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels) p = precision_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels) r = recall_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#), labels=all_labels) print "XXXXXXX F1 = ", f1 print "XXXXXXX P = ", p print "XXXXXXX R = ", r print return predictLabels
def evaluate(self, params, rnnDataTest): # predictLabels = np.zeros(len(rnnDataTest.allSNum), dtype='int32') # probabilities = np.zeros(len(rnnDataTest.allSNum)) predictLabels = [] trueLabels = [] allSNum = rnnDataTest.allSNum allSTree = rnnDataTest.allSTree allSStr = rnnDataTest.allSStr verbIndices = rnnDataTest.verbIndices # allSNN = rnnDataTest.allSNN # allIndicies = rnnDataTest.allIndicies sentenceLabels = rnnDataTest.sentenceLabels ndoc = rnnDataTest.ndoc() print "Total number of trees/sentences to be evaluated: ", ndoc for s in range(ndoc): if(s % 100 == 0) : print "Processing sentences ", s , ' - ', s+100 thissentVerbIndices = verbIndices[s] sStr = allSStr[s]; sNum = allSNum[s]; sTree = allSTree[s] labels = sentenceLabels[s] if((len(sNum) == 1) or (len(thissentVerbIndices)==0) or (labels.shape[1] != len(sStr))): continue #only one word in a sent, no verbs for this sent, tokens and labels mismatch for nverb, vid in enumerate(thissentVerbIndices): for wid in range(len(sStr)): indices = np.array([vid, wid]) truelabel = labels[nverb, wid] setVerbnWordDistanceFeat(self.Wv, sNum, vid, wid, params) tree = forwardPropTree(self.W, self.WO, self.Wcat, self.Wv, self.Wo, sNum, sTree, sStr, sNN=None, indicies=None, params=params) trueLabels.append(truelabel) calPredictions(tree, self.Wcat, self.Wv, indices, sStr, params) #updates score, nodepath etc for this verb, word pair predictedLabel = np.argmax(tree.y) predictLabels.append(predictedLabel) f1 = f1_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels) p = precision_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels) r = recall_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#), labels=all_labels) print "XXXXXXX F1 = ", f1 print "XXXXXXX P = ", p print "XXXXXXX R = ", r print return predictLabels
# have a different feature list when you do the final project. features_list = ["poi", "salary"] data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) # it's all yours from here forward! from time import time from sklearn.cross_validation import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import metrics features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size=.3, random_state=42) clf = DecisionTreeClassifier() t0 = time() pred = clf.fit(features_train, labels_train) print "training time is ", round((time() - t0), 3), "s" t1 = time() pred = clf.predict(features_test) print "prediction time is ", round(time() - t1, 3), "s" from sklearn.metrics import accuracy_score accuracy = accuracy_score(labels_test, pred) print accuracy precision = metrics.precision_score(labels_test, pred) print precision recall = metrics.recall_score(labels_test, pred) print recall
# Predicting
# NOTE(review): the statements up to the two extend() calls reference the
# fold counter `i`, so they presumably sit inside a per-fold loop whose
# header lies outside this fragment — confirm the indentation when merging.
verbose("   Predicting fold (%i)" % (i + 1))
prediction = regressor.predict(X_test)
# Accumulate targets and predictions across folds for a single evaluation.
y_.extend(y_test)
prediction_.extend(prediction)

verbose('----------\n')
verbose("Evaluation")

if opts.mode in ['age', 'gender']:
    # Import from the public package: the nested sklearn.metrics.metrics
    # module no longer exists in current scikit-learn releases.
    from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
    # Computing performance
    print('Accuracy              :', accuracy_score(y_, prediction_))
    print('Precision             :', precision_score(y_, prediction_))
    print('Recall                :', recall_score(y_, prediction_))
    print('F-score               :', f1_score(y_, prediction_))
    print('\nClasification report:\n', classification_report(y_, prediction_))
    print('\nConfussion matrix   :\n', confusion_matrix(y_, prediction_))
else:
    # Any other mode is scored with regression metrics.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    print('Mean Abs Error        :', mean_absolute_error(y_, prediction_))
    print('Mean Sqr Error        :', mean_squared_error(y_, prediction_))
    print('R2 Error              :', r2_score(y_, prediction_))

#plots:
#import matplotlib.pyplot as plt
#confusion_matrix_plot = confusion_matrix(y_test, prediction)
#plt.title('matriz de confusion')
#plt.colorbar()
if __name__ == '__main__':
    # Score the pickled model 'Training_model.pkl' on a directory tree of
    # test documents and print macro-averaged F1/precision/recall plus
    # accuracy.
    #
    # Exactly one command-line argument is consumed: the root directory of
    # the test samples. The usage text now says so (it used to advertise a
    # training location that the script never reads).
    if len(sys.argv) != 2:
        print ("Illegal use of Arguments: Best_configuration.py <Testing_Samples_Location>")
        sys.exit(1)
    test = sys.argv[1]

    header_test = []
    test_labels = []
    i = 0
    for root, dirs, files in os.walk(test):
        for name in files:
            # The context manager closes every file, even if reading fails.
            with open(os.path.join(root, name), "r") as fo:
                content = fo.read().replace('\n', ' ')
            # Drop everything up to and including the "Lines: <n> " marker.
            body = re.sub(r'^(.*) Lines: (\d)+ ', "", content)
            header_test.append(unicode(body, errors='ignore'))
            test_labels.append(i)
        # NOTE(review): the label counter is assumed to advance once per
        # directory visited by os.walk (one class per folder) — confirm
        # against the script that produced Training_model.pkl.
        i = i + 1

    text_clf01 = joblib.load('Training_model.pkl')
    predicted01 = text_clf01.predict(header_test)
    print("Removed Stop Words + L2 penalization")
    print ("F1:", metrics.f1_score(test_labels, predicted01, average='macro'))
    print ("accuracy:", metrics.accuracy_score(test_labels, predicted01))
    print ("precision:", metrics.precision_score(test_labels, predicted01, average='macro'))
    print ("recall:", metrics.recall_score(test_labels, predicted01, average='macro'))
def fit_predict(config, X_train, y_train, X_test=None, y_test=None, ref_thd=None):
    """
    Uses the configuration dictionary settings to train a model using the
    specified training algorithm. If set, also evaluates the trained model
    in a test set. Additionally, performs feature selection and model
    parameters optimization.

    When both X_test and y_test are given, the configured scorers and a set
    of best-effort extra scores are logged, and the (reference, predicted)
    pairs are written tab-separated to "predicted.csv".

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the
    feature values for each instance in the training set.
    @param y_train: the np.array object for the response values of each
    instance in the training set.
    @param X_test: the np.array object for the matrix containing the
    feature values for each instance in the test set. Default is None.
    @param y_test: the np.array object for the response values of each
    instance in the test set. Default is None.
    @param ref_thd: threshold, converted with float(), handed to
    classify_report_regression() when both binary reports return "N/A".
    Default is None.
    """
    # sets the selection method
    transformer = set_selection_method(config)

    # if the system is configured to run feature selection
    # runs it and modifies the datasets to the new dimensions
    if transformer is not None:
        log.info("Running feature selection %s" % str(transformer))

        log.debug("X_train dimensions before fit_transform(): %s,%s" % X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s" % y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s,%s" % X_train.shape)

        if X_test is not None:
            X_test = transformer.transform(X_test)

    # sets learning algorithm and runs it over the training data
    estimator, scorers = set_learning_method(config, X_train, y_train)
    log.info("Running learning algorithm %s" % str(estimator))
    estimator.fit(X_train, y_train)

    if (X_test is not None) and (y_test is not None):
        log.info("Predicting unseen data using the trained model...")
        y_hat = estimator.predict(X_test)
        log.info("Evaluating prediction on the test set...")
        for scorer_name, scorer_func in scorers:
            v = scorer_func(y_test, y_hat)
            log.info("%s = %s" % (scorer_name, v))

        log.info("Customized scores: ")
        # Each extra score is best-effort: a metric that fails on this kind
        # of target is skipped. The lambdas defer name lookup and evaluation
        # into the try block; only Exception is caught, so KeyboardInterrupt
        # and SystemExit still propagate.
        optional_scores = (
            ("pearson_corrcoef = %s", lambda: pearson_corrcoef(y_test, y_hat)),
            ("Precision score: = %s", lambda: precision_score(y_test, y_hat)),
            ("Recall score: = %s", lambda: recall_score(y_test, y_hat)),
            ("F1 score: = %s", lambda: f1_score(y_test, y_hat)),
            ("MAE: = %s", lambda: mean_absolute_error(y_test, y_hat)),
            ("RMSE: = %s", lambda: root_mean_squared_error(y_test, y_hat)),
        )
        for template, compute in optional_scores:
            try:
                log.info(template % compute())
            except Exception as e:
                log.debug("Skipping '%s': %s" % (template, e))

        # Fall through the three report flavours until one is applicable.
        try:
            res = classify_report_bin(y_test, y_hat)
            if "N/A" != res:
                log.info("Classify report bin: = %s" % res)
            else:
                res = classify_report_bin_regression(y_test, y_hat)
                if "N/A" != res:
                    log.info("Classify report bin regression: = %s" % res)
                elif ref_thd is None:
                    log.error("No ref thd defined")
                else:
                    refthd = float(ref_thd)
                    res = classify_report_regression(y_test, y_hat, refthd)
                    log.info("Classify report regression: = %s" % res)
        except Exception as e:
            print(e)

        # One "<reference>\t<prediction>" line per test instance.
        with open("predicted.csv", "w") as _fout:
            for _x, _y in zip(y_test, y_hat):
                _fout.write("%f\t%f\n" % (_x, _y))
def fit_predict(config, X_train, y_train, X_test=None, y_test=None, ref_thd=None):
    """
    Uses the configuration dictionary settings to train a model using the
    specified training algorithm. If set, also evaluates the trained model
    in a test set. Additionally, performs feature selection and model
    parameters optimization.

    When both X_test and y_test are given, the configured scorers and a set
    of best-effort extra scores are logged, and the (reference, predicted)
    pairs are written tab-separated to "predicted.csv".

    @param config: the configuration dictionary obtained parsing the
    configuration file.
    @param X_train: the np.array object for the matrix containing the
    feature values for each instance in the training set.
    @param y_train: the np.array object for the response values of each
    instance in the training set.
    @param X_test: the np.array object for the matrix containing the
    feature values for each instance in the test set. Default is None.
    @param y_test: the np.array object for the response values of each
    instance in the test set. Default is None.
    @param ref_thd: threshold, converted with float(), handed to
    classify_report_regression() when both binary reports return "N/A".
    Default is None.
    """
    # sets the selection method
    transformer = set_selection_method(config)

    # if the system is configured to run feature selection
    # runs it and modifies the datasets to the new dimensions
    if transformer is not None:
        log.info("Running feature selection %s" % str(transformer))

        log.debug("X_train dimensions before fit_transform(): %s,%s" % X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s" % y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s,%s" % X_train.shape)

        if X_test is not None:
            X_test = transformer.transform(X_test)

    # sets learning algorithm and runs it over the training data
    estimator, scorers = set_learning_method(config, X_train, y_train)
    log.info("Running learning algorithm %s" % str(estimator))
    estimator.fit(X_train, y_train)

    if (X_test is not None) and (y_test is not None):
        log.info("Predicting unseen data using the trained model...")
        y_hat = estimator.predict(X_test)
        log.info("Evaluating prediction on the test set...")
        for scorer_name, scorer_func in scorers:
            v = scorer_func(y_test, y_hat)
            log.info("%s = %s" % (scorer_name, v))

        log.info("Customized scores: ")
        # Each extra score is best-effort: a metric that fails on this kind
        # of target is skipped. The lambdas defer name lookup and evaluation
        # into the try block; only Exception is caught, so KeyboardInterrupt
        # and SystemExit still propagate.
        optional_scores = (
            ("pearson_corrcoef = %s", lambda: pearson_corrcoef(y_test, y_hat)),
            ("Precision score: = %s", lambda: precision_score(y_test, y_hat)),
            ("Recall score: = %s", lambda: recall_score(y_test, y_hat)),
            ("F1 score: = %s", lambda: f1_score(y_test, y_hat)),
            ("MAE: = %s", lambda: mean_absolute_error(y_test, y_hat)),
            ("RMSE: = %s", lambda: root_mean_squared_error(y_test, y_hat)),
        )
        for template, compute in optional_scores:
            try:
                log.info(template % compute())
            except Exception as e:
                log.debug("Skipping '%s': %s" % (template, e))

        # Fall through the three report flavours until one is applicable.
        try:
            res = classify_report_bin(y_test, y_hat)
            if "N/A" != res:
                log.info("Classify report bin: = %s" % res)
            else:
                res = classify_report_bin_regression(y_test, y_hat)
                if "N/A" != res:
                    log.info("Classify report bin regression: = %s" % res)
                elif ref_thd is None:
                    log.error("No ref thd defined")
                else:
                    refthd = float(ref_thd)
                    res = classify_report_regression(y_test, y_hat, refthd)
                    log.info("Classify report regression: = %s" % res)
        except Exception as e:
            print(e)

        # One "<reference>\t<prediction>" line per test instance.
        with open("predicted.csv", 'w') as _fout:
            for _x, _y in zip(y_test, y_hat):
                _fout.write("%f\t%f\n" % (_x, _y))
vect__norm: 'l2' vect__use_idf: True """ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix __author__ = 'gavin' import pandas as pd df = pd.read_csv('sms/sms.csv') X_train_r, X_test_r, y_train, y_test = train_test_split( df['message'], df['label']) vectorizer = TfidfVectorizer(max_df=0.5, max_features=None, ngram_range=(1, 1), norm='l2', use_idf=True) X_train = vectorizer.fit_transform(X_train_r) X_test = vectorizer.transform(X_test_r) classifier = LogisticRegression(penalty='l2', C=7) classifier.fit(X_train, y_train) predictions = classifier.predict(X_test) print 'score', classifier.score(X_test, y_test) print 'precision', precision_score(y_test, predictions) print 'recall', recall_score(y_test, predictions) print confusion_matrix(y_test, predictions)
X_train = vectorizer.fit_transform(X_train) X_test = vectorizer.transform(X_test) from sklearn.linear_model import SGDClassifier, LogisticRegression from sklearn.metrics import classification_report y_true_all = [] predictions_all = [] for label in good_categories[:3]: print 'label', label y_train = [1 if label in instance else 0 for instance in y_train_all] y_test = [1 if label in instance else 0 for instance in y_test_all] y_true_all.append(y_test) classifier = LogisticRegression() classifier.fit_transform(X_train, y_train) predictions = classifier.predict(X_test) predictions_all.append(predictions) print classification_report(y_test, predictions) print confusion_matrix(y_test, predictions) print 'precision', precision_score(y_test, predictions) print 'recall', recall_score(y_test, predictions) print 'accuracy', accuracy_score(y_test, predictions) print '\n' y_true_all = np.array(y_true_all) predictions_all = np.array(predictions_all) print hamming_loss(y_true_all, predictions_all)
from sklearn.pipeline import Pipeline

# Naive Bayes: bag of words -> TF-IDF -> multinomial NB, timed end to end.
start_time = time.time()
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
text_clf = text_clf.fit(header_list, labels)
predicted = text_clf.predict(header_test)

print("Naive bayes")
macro_f1 = metrics.f1_score(test_labels, predicted, average='macro')
print("F1:", macro_f1)
accuracy = metrics.accuracy_score(test_labels, predicted)
print("accuracy:", accuracy)
macro_precision = metrics.precision_score(test_labels, predicted, average='macro')
print("precision:", macro_precision)
macro_recall = metrics.recall_score(test_labels, predicted, average='macro')
print("recall:", macro_recall)
elapsed = time.time() - start_time
print("Tine in seconds %s" % elapsed)

#SVM###
# Same front end, with a hinge-loss, L2-penalized SGD classifier on top.
from sklearn.linear_model import SGDClassifier

start_time = time.time()
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2')),
]
text_clf = Pipeline(svm_steps)
text_clf = text_clf.fit(header_list, labels)
predicted = text_clf.predict(header_test)
print("SVM")
from sklearn.ensemble import RandomForestClassifier #se pasmo con 1000000 #probar con mas parametros classifier = RandomForestClassifier(n_estimators=100) classifier.fit(X_train, y_train) prediction = classifier.predict(X_test) #print X_train.shape from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score print '\nAccuracy:', accuracy_score(y_test, prediction) print '\nscore:', classifier.score(X_train, y_train) print '\nrecall:', recall_score(y_test, prediction) print '\nprecision:', precision_score(y_test, prediction) print '\n clasification report:\n', classification_report(y_test, prediction) print '\n confussion matrix:\n', confusion_matrix(y_test, prediction) #plots: import matplotlib.pyplot as plt confusion_matrix_plot = confusion_matrix(y_test, prediction) plt.title('matriz de confusion') plt.colorbar() plt.xlabel() plt.xlabel('categoria de verdad') plt.ylabel('categoria predecida') plt.show()
# Encode the raw identifiers as consecutive integer class ids.
le = preprocessing.LabelEncoder().fit(ids_)
verbose("Total classes", le.classes_.shape[0])
ids = le.transform(ids_)

# 80/20 split with a fixed seed.
(X_train, X_test,
 y_train, y_test) = train_test_split(feats, ids,
                                     test_size=0.20, random_state=42)

verbose("Training")
classifier = RandomForestClassifier(
    n_estimators=opts.estimators,
    n_jobs=opts.nprocessors,
    max_depth=20,
    verbose=True)

# Learning
classifier.fit(X_train, y_train)

# Predicting
verbose("Prediction")
prediction = classifier.predict(X_test)

# Every report takes (y_test, prediction) and is printed after its caption.
for _caption, _metric in (
        ('Accuracy              :', accuracy_score),
        ('Precision             :', precision_score),
        ('Recall                :', recall_score),
        ('F-score               :', f1_score),
        ('\nClasification report:\n', classification_report),
        ('\nConfussion matrix   :\n', confusion_matrix)):
    print(_caption, _metric(y_test, prediction))
# ch2 = SelectFromModel(clf2, prefit=True) # # X_train = ch2.transform(X_train) # X_test = ch2.transform(X_test) clf.fit(X_train, y_train) print len(y_train) print len(y_test) pred = clf.predict(X_test) #pred = [0]* len(y_test) score = metrics.accuracy_score(y_test, pred) prec = metrics.precision_score(y_test, pred) recall = metrics.recall_score(y_test, pred) f1 = metrics.f1_score(y_test, pred) print("accuracy: %0.3f prec: %0.3f recall: %0.3f f1: %0.3f" % (score, prec, recall, f1)) total.append(score) total2.append(f1) file2 = open('results/%s-1' % source, 'w') file3 = open('results/%s-0' % source, 'w') for s, (y, x) in zip(pred_sents, zip(pred, X_test)): if y == 1: file2.write(s + '\n') file2.write(str(x) + '\n') else: file3.write(s + '\n') file3.write(str(x) + '\n')
def evaluate(self, params, rnnDataTest): # predictLabels = np.zeros(len(rnnDataTest.allSNum), dtype='int32') # probabilities = np.zeros(len(rnnDataTest.allSNum)) predictLabels = [] trueLabels = [] allSNum = rnnDataTest.allSNum allSTree = rnnDataTest.allSTree allSStr = rnnDataTest.allSStr verbIndices = rnnDataTest.verbIndices # allSNN = rnnDataTest.allSNN # allIndicies = rnnDataTest.allIndicies sentenceLabels = rnnDataTest.sentenceLabels ndoc = rnnDataTest.ndoc() print "Total number of trees/sentences to be evaluated: ", ndoc for s in range(ndoc): if (s % 100 == 0): print "Processing sentences ", s, ' - ', s + 100 thissentVerbIndices = verbIndices[s] sStr = allSStr[s] sNum = allSNum[s] sTree = allSTree[s] labels = sentenceLabels[s] if ((len(sNum) == 1) or (len(thissentVerbIndices) == 0) or (labels.shape[1] != len(sStr))): continue #only one word in a sent, no verbs for this sent, tokens and labels mismatch for nverb, vid in enumerate(thissentVerbIndices): for wid in range(len(sStr)): indices = np.array([vid, wid]) truelabel = labels[nverb, wid] setVerbnWordDistanceFeat(self.Wv, sNum, vid, wid, params) tree = forwardPropTree(self.W, self.WO, self.Wcat, self.Wv, self.Wo, sNum, sTree, sStr, sNN=None, indicies=None, params=params) trueLabels.append(truelabel) calPredictions( tree, self.Wcat, self.Wv, indices, sStr, params ) #updates score, nodepath etc for this verb, word pair predictedLabel = np.argmax(tree.y) predictLabels.append(predictedLabel) f1 = f1_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None) #, labels=all_labels) p = precision_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None) #, labels=all_labels) r = recall_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None) #), labels=all_labels) print "XXXXXXX F1 = ", f1 print "XXXXXXX P = ", p print "XXXXXXX R = ", r print return predictLabels
# Load the raw identifiers and encode them as consecutive integer class ids.
ids_ = np.load(opts.IDS)
le = preprocessing.LabelEncoder().fit(ids_)
verbose("Total classes", le.classes_.shape[0])
ids = le.transform(ids_)

# 80/20 split with a fixed seed.
(X_train, X_test,
 y_train, y_test) = train_test_split(feats, ids,
                                     test_size=0.20, random_state=42)

verbose("Training")
classifier = RandomForestClassifier(
    n_estimators=opts.estimators,
    n_jobs=opts.nprocessors,
    max_depth=20,
    verbose=True)

# Learning
classifier.fit(X_train, y_train)

# Predicting
verbose("Prediction")
prediction = classifier.predict(X_test)

# Every report takes (y_test, prediction) and is printed after its caption.
for _caption, _metric in (
        ('Accuracy              :', accuracy_score),
        ('Precision             :', precision_score),
        ('Recall                :', recall_score),
        ('F-score               :', f1_score),
        ('\nClasification report:\n', classification_report),
        ('\nConfussion matrix   :\n', confusion_matrix)):
    print(_caption, _metric(y_test, prediction))
for tweet in reader[0:2*(numironicos/3)]: tweets_train.append(tweet["text"]) labels_train.append("noironia") for tweet in reader[2*(numironicos/3):]: tweets_test.append(tweet["text"]) labels_test.append("noironia") stop_words = [] f = open("spanish.txt") for line in f: stop_words.append(line.strip()) f.close() y_train = np.array(labels_train, dtype=object) y_test = np.array(labels_test, dtype=object) vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words = stop_words) X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object)) X_test = vectorizer.transform(np.array(tweets_test, dtype=object)) classifier = RandomForestClassifier(n_estimators = 10) classifier.fit(X_train.toarray(), y_train) prediction = classifier.predict(X_test.toarray()) print '\nAccuracy :', accuracy_score(y_test, prediction) print '\nPrecision :', precision_score(y_test, prediction) print '\nRecall :', recall_score(y_test, prediction) print '\nF-score :', f1_score(y_test, prediction) print '\nClasification report:\n', classification_report(y_test,prediction) print '\nConfussion matrix :\n',confusion_matrix(y_test, prediction)
# NOTE(review): the first three statements use the per-fold X_test/y_test,
# so they presumably sit inside a per-fold loop whose header lies outside
# this fragment — confirm the indentation when merging.
prediction = regressor.predict(X_test)
# Accumulate targets and predictions across folds for a single evaluation.
y_.extend(y_test)
prediction_.extend(prediction)

verbose('----------\n')
verbose("Evaluation")

if opts.mode in ['age','gender']:
    # Import from the public package: the nested sklearn.metrics.metrics
    # module no longer exists in current scikit-learn releases.
    from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
    # Computing performance
    print( 'Accuracy              :', accuracy_score(y_, prediction_))
    print( 'Precision             :', precision_score(y_, prediction_))
    print( 'Recall                :', recall_score(y_, prediction_))
    print( 'F-score               :', f1_score(y_, prediction_))
    print( '\nClasification report:\n', classification_report(y_, prediction_))
    print( '\nConfussion matrix   :\n',confusion_matrix(y_, prediction_))
else:
    # Any other mode is scored with regression metrics.
    from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
    print( 'Mean Abs Error        :', mean_absolute_error(y_, prediction_))
    print( 'Mean Sqr Error        :', mean_squared_error(y_, prediction_))
    print( 'R2 Error              :', r2_score(y_, prediction_))

#plots:
#import matplotlib.pyplot as plt
#confusion_matrix_plot = confusion_matrix(y_test, prediction)
#plt.title('matriz de confusion')