def run_classifier(X_train, y_train, X_test, y_test, clf_name, num_trees):
    """Fit the classifier selected by ``clf_name`` and score it on the test split.

    Prints the accuracy and a ``classification_report`` as a side effect.

    Args:
        X_train: Training features, passed to the estimator's ``fit``.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for ``X_test``.
        clf_name: One of ``"BernoulliNB"``, ``"GaussianNB"``, ``"RF"``,
            ``"SVM"`` or ``"KNN"``.
        num_trees: ``n_estimators`` for the random forest; ignored by every
            other classifier.

    Returns:
        Tuple ``(accuracy, roc_auc, precision, recall, f1)`` as computed by
        ``accuracy_score``, ``roc_auc_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the test split.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name == "BernoulliNB":
        clf = BernoulliNB()
    elif clf_name == "GaussianNB":
        clf = GaussianNB()
    elif clf_name == "RF":
        clf = RandomForestClassifier(n_estimators=num_trees)
    elif clf_name == "SVM":
        clf = SVC(cache_size=2000, probability=False)
    elif clf_name == "KNN":
        # n_neighbors is left at the library default.
        clf = neighbors.KNeighborsClassifier(n_jobs=-1)
    else:
        # Fail up front instead of calling .predict on a placeholder value.
        raise ValueError("Unrecognized classifier: {!r}".format(clf_name))

    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    if clf_name == "SVM":
        # The SVC is built with probability=False, so rank by decision scores.
        ranking_scores = clf.decision_function(X_test)
    else:
        # Column 1 of predict_proba is used as the score for ROC AUC.
        ranking_scores = clf.predict_proba(X_test)[:, 1]

    accu = accuracy_score(y_test, predicted)
    roc = roc_auc_score(y_test, ranking_scores)
    pos_precision = precision_score(y_test, predicted)
    pos_recall = recall_score(y_test, predicted)
    pos_f1 = f1_score(y_test, predicted)

    print("Correctly Classified: {}".format(accu))
    print(classification_report(y_test, predicted, digits=4))
    return accu, roc, pos_precision, pos_recall, pos_f1
def main():
    """Train a text classifier from a labelled CSV, validate it and test it.

    Command-line driven: reads the training CSV (label in column 1, text in
    column 2), fits the classifier chosen with ``-c``, prints training and
    10-fold cross-validation accuracy, optionally prints the most informative
    features, and then either classifies one built-in sample tweet or
    evaluates on the ``-test`` CSV (business id in column 0).

    For non-SVM classifiers in test-file mode it also plots a ROC curve and
    writes ``positive_bias.csv`` / ``negative_bias.csv``: the name and address
    of up to 100 positive-labelled rows with the highest column-0 probability
    and up to 100 negative-labelled rows with the lowest column-0 probability.

    Raises:
        Exception: If ``-c`` is not ``nb``, ``log`` or ``svm``.
    """
    ##### DO NOT MODIFY THESE OPTIONS ##########################
    parser = argparse.ArgumentParser()
    parser.add_argument('-training', required=True, help='Path to training data')
    parser.add_argument('-business_file', required=True, help='Path to business data')
    parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
    parser.add_argument('-top', type=int, help='Number of top features to show')
    parser.add_argument('-test', help='Path to test data')
    opts = parser.parse_args()
    ############################################################

    ##### BUILD TRAINING SET ###################################
    # Initialize CountVectorizer
    # You will need to implement functions in tokenizer.py
    tokenizer = Tokenizer()
    vectorizer = CountVectorizer(binary=True, lowercase=True,
                                 decode_error='replace', tokenizer=tokenizer)

    # Load training text and integer labels; the file is closed on exit.
    tweets = []
    labels = []
    with open(opts.training) as csv_file:
        for row in csv.reader(csv_file):
            tweets.append(row[2])
            labels.append(int(row[1]))

    train_features = vectorizer.fit_transform(tweets)
    labels = np.array(labels)
    ############################################################

    ##### TRAIN THE MODEL ######################################
    if opts.classifier == 'nb':
        classifier = BernoulliNB(binarize=None)
    elif opts.classifier == 'log':
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                        C=1.0, fit_intercept=True,
                                        intercept_scaling=1, class_weight=None,
                                        random_state=None)
    elif opts.classifier == 'svm':
        classifier = LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001,
                               C=1.0, multi_class='ovr', fit_intercept=True,
                               intercept_scaling=1, class_weight=None,
                               verbose=0, random_state=None)
    else:
        raise Exception('Unrecognized classifier!')
    classifier.fit(train_features, labels)
    ############################################################

    ###### VALIDATE THE MODEL ##################################
    print("Training accuracy: %f" % classifier.score(train_features, labels))
    scores = cross_validation.cross_val_score(classifier, train_features,
                                              labels, scoring='accuracy', cv=10)
    print("Cross-Validation Accuracy: %f (+/- %f)" % (scores.mean(), scores.std()))
    ############################################################

    ##### EXAMINE THE MODEL ####################################
    if opts.top is not None:
        # print top n most informative features for positive and negative classes
        print('Most informative features')
        util.print_most_informative_features(opts.classifier, vectorizer,
                                             classifier, opts.top)
    ############################################################

    ##### TEST THE MODEL #######################################
    if opts.test is None:
        # Test the classifier on one sample test tweet
        # Tim Kraska 10:43 AM - 5 Feb 13
        test_tweet = 'Water dripping from 3rd to 1st floor while the firealarm makes it hard to hear anything. BTW this is the 2nd leakage. Love our new house'
        terms = vectorizer.transform([test_tweet])
        print(classifier.predict(terms))
        # LinearSVC is scored with decision_function; the others with predict_proba.
        if opts.classifier != 'svm':
            print(classifier.predict_proba(terms))
        else:
            print(classifier.decision_function(terms))
    else:
        # Extract business ids, text and labels from the test set.
        business = []
        test_tweets = []
        true_labels = []
        with open(opts.test) as csv_file:
            for row in csv.reader(csv_file):
                business.append(row[0])
                test_tweets.append(row[2])
                true_labels.append(int(row[1]))

        terms = vectorizer.transform(test_tweets)
        true_labels = np.array(true_labels)
        predicted_labels = classifier.predict(terms)

        # Fraction of matching labels as a float. Dividing the integer counts
        # with "/" floors the ratio under Python 2.
        accuracy = np.mean(true_labels == predicted_labels)
        print("Test accuracy: %f" % accuracy)

        # NOTE(review): unused in the visible code; kept because the tail of
        # this function is cut off in the source.
        target_names = ['Negative', 'Positive']

        if opts.classifier != 'svm':
            test_predicted_proba = classifier.predict_proba(terms)
            util.plot_roc_curve(true_labels, test_predicted_proba)

            # Rows are [test index, probability column 0, probability column 1].
            positive_prob = []
            negative_prob = []
            for i, label in enumerate(true_labels):
                entry = [i, test_predicted_proba[i][0], test_predicted_proba[i][1]]
                if label == 1:
                    positive_prob.append(entry)
                else:
                    negative_prob.append(entry)

            positive_bias = sorted(positive_prob, key=itemgetter(1),
                                   reverse=True)[:100]
            negative_bias = sorted(negative_prob, key=itemgetter(1))[:100]

            # One JSON object per line: business_id -> [name, full_address].
            bdic = {}
            with open(opts.business_file, 'r') as bfile:
                for line in bfile:
                    record = json.loads(line)
                    bdic[record['business_id']] = [record['name'],
                                                   record['full_address']]

            with open('positive_bias.csv', 'w') as positive, \
                    open('negative_bias.csv', 'w') as negative:
                writer_positive = csv.writer(positive)
                writer_negative = csv.writer(negative)
                for item in positive_bias:
                    name, address = bdic[business[item[0]]]
                    writer_positive.writerow((name, address))
                for item in negative_bias:
                    name, address = bdic[business[item[0]]]
                    writer_negative.writerow((name, address))
clf = GradientBoostingClassifier(n_estimators=5, random_state=0) fs_train = fs_train.toarray() fs_test = fs_test.toarray() if config.SELF_TRAINING: fl = fs_train.shape[0] ll = labels_train.shape[0] fsarr = fs_train.toarray() cur_fs = fsarr[:fl / 10] cur_labels = labels_train[:ll / 10] clf.fit(cur_fs, cur_labels) print clf.classes_ for i in range(1, 10): new_fs = fsarr[(i * fl) / 10:((i + 1) * fl) / 10] confidence_scores = clf.decision_function(new_fs) most_confident_samples = confidence_scores.max( axis=1).argsort()[-1 * (confidence_scores.shape[0] / 10):] most_confident_labels = confidence_scores[ most_confident_samples].argmax(axis=1) cur_fs = np.append(cur_fs, new_fs[most_confident_samples], axis=0) cur_labels = np.append(cur_labels, clf.classes_[most_confident_labels]) clf.fit(cur_fs, cur_labels) pred = clf.predict(fs_test) else: clf.fit(fs_train, labels_train) pred = clf.predict(fs_test) if grid_search:
# NOTE(review): this is a fragment; fs_train, fs_test, labels_train, clf,
# config and grid_search are defined outside the visible source.
# Presumably fs_train / fs_test arrive as SciPy sparse matrices -- confirm.
fs_train = fs_train.toarray()
fs_test = fs_test.toarray()
if config.SELF_TRAINING:
    fl = fs_train.shape[0]
    ll = labels_train.shape[0]
    # fs_train was converted just above; calling .toarray() on it again would
    # fail on a dense array, so reuse it directly.
    fsarr = fs_train
    # Seed the model with the first tenth of the data ("//" keeps the slice
    # bounds integral).
    cur_fs = fsarr[:fl // 10]
    cur_labels = labels_train[:ll // 10]
    clf.fit(cur_fs, cur_labels)
    print(clf.classes_)
    for i in range(1, 10):
        new_fs = fsarr[(i * fl) // 10:((i + 1) * fl) // 10]
        confidence_scores = clf.decision_function(new_fs)
        # Keep the tenth of the chunk with the largest per-row maximum score.
        # Slicing from (n - keep) rather than (-keep) yields an empty selection
        # when keep == 0; "[-0:]" would select every row instead.
        n_new = confidence_scores.shape[0]
        n_keep = n_new // 10
        most_confident_samples = confidence_scores.max(
            axis=1).argsort()[n_new - n_keep:]
        most_confident_labels = confidence_scores[
            most_confident_samples].argmax(axis=1)
        # Add the selected rows with the model's own labels, then refit.
        cur_fs = np.append(cur_fs, new_fs[most_confident_samples], axis=0)
        cur_labels = np.append(cur_labels, clf.classes_[most_confident_labels])
        clf.fit(cur_fs, cur_labels)
    pred = clf.predict(fs_test)
else:
    clf.fit(fs_train, labels_train)
    pred = clf.predict(fs_test)
if grid_search:
    print(clf.best_estimator_)
def main():
    """Train a bag-of-words classifier from a labelled CSV and evaluate it.

    Command-line driven: reads the training CSV (label in column 0, text in
    column 1), fits the classifier chosen with ``-c`` (``nb``, ``log``,
    ``svm``, ``rf`` or ``knn``), prints the training score and 10-fold
    cross-validation accuracy, and optionally prints the most informative
    features. Without ``-test`` it classifies one built-in sample text; with
    ``-test`` it prints the test score, classification report and confusion
    matrix, then plots a ROC curve (non-SVM) or the decision values (SVM).

    Raises:
        Exception: If ``-c`` names an unsupported classifier.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-training', required=True, help='Path to training data')
    parser.add_argument('-test', help='Path to test data')
    parser.add_argument('-c', '--classifier', default='nb',
                        help='nb | log | svm | rf | knn')
    parser.add_argument('-top', type=int, help='Number of top features to show')
    parser.add_argument('-trees', type=int,
                        help="Number of trees (if random forest for classifier)")
    opts = parser.parse_args()

    ##### BUILD TRAINING SET ###################################
    vectorizer = CountVectorizer(binary=True, lowercase=True,
                                 decode_error='replace')

    # Labels are converted to integers so the label data is numeric.
    with open(opts.training, 'rU') as f:
        train_data = list(csv.reader(f))
    train_text = [row[1] for row in train_data]
    train_labels = numpy.array([int(row[0]) for row in train_data], dtype=int)

    print("ready to vectorize training data")
    train_features = vectorizer.fit_transform(train_text)
    print("done vectorizing")
    ############################################################

    ##### TRAIN THE MODEL ######################################
    # The random forest path works on dense arrays throughout.
    needs_dense = opts.classifier == 'rf'

    if opts.classifier == 'nb':
        classifier = BernoulliNB(binarize=None)
        print("Naive Bayes")
    elif opts.classifier == 'log':
        classifier = LogisticRegression(C=.088)
        print("Log")
    elif opts.classifier == 'svm':
        classifier = LinearSVC()
        print("Support Vector Machine")
    elif opts.classifier == 'rf':
        # A missing (or zero) -trees falls back to 10 trees.
        trees = opts.trees or 10
        classifier = RandomForestClassifier(n_estimators=trees)
        train_features = train_features.toarray()
    elif opts.classifier == 'knn':
        classifier = KNeighborsClassifier(n_neighbors=10)
    else:
        raise Exception('Unrecognized classifier!')
    classifier.fit(train_features, train_labels)
    ############################################################

    ###### VALIDATE THE MODEL ##################################
    print(classifier.score(train_features, train_labels))
    scores = cross_validation.cross_val_score(classifier, train_features,
                                              train_labels, cv=10,
                                              scoring='accuracy')
    print("Cross Validation Scores Calculated")
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    ############################################################

    ##### EXAMINE THE MODEL ####################################
    if opts.top is not None:
        print("Got "+str(opts.top)+" tops")
        # print top n most informative features for positive and negative classes
        util.print_most_informative_features(opts.classifier, vectorizer,
                                             classifier, opts.top)
    ############################################################

    ##### TEST THE MODEL #######################################
    if opts.test is None:
        test_blog = "uses yahoo boss support search experience general web search perform query application set term candidates using key terms term its within result set its global measure similar 1ST_PERSON former colleagues 1ST_PERSON enterprise try yourself URL rough edges produces considering example 1ST_PERSON application explore learn 1ST_PERSON started 1ST_PERSON term 1ST_PERSON suggestions looked name caught 1ST_PERSON following 1ST_PERSON 1ST_PERSON again results 1ST_PERSON immediately had document further made clear someone 1ST_PERSON get home can_t you_ll experience 1ST_PERSON did 1ST_PERSON encourage"
        features = vectorizer.transform([test_blog])
        if needs_dense:
            features = features.toarray()
        print("Prediction (1 == correct): ")
        print(classifier.predict(features))
        # LinearSVC is scored with decision_function; the others with predict_proba.
        if opts.classifier != 'svm':
            print("User predict prob ")
            print(classifier.predict_proba(features))
        else:
            print("use decision ")
            print(classifier.decision_function(features))
    else:
        # NOTE(review): the test file is opened in 'rb' and its text is taken
        # from the LAST column, while training uses 'rU' and column 1 --
        # confirm both files share one layout.
        with open(opts.test, 'rb') as f:
            test_data = list(csv.reader(f))
        test_text = [row[-1] for row in test_data]
        test_labels = numpy.array([int(row[0]) for row in test_data], dtype=int)

        print("ready to vectorize testing data")
        test_features = vectorizer.transform(test_text)
        if needs_dense:
            # Match the dense representation the forest was trained on.
            test_features = test_features.toarray()

        print("Score")
        print(classifier.score(test_features, test_labels))

        predictions = classifier.predict(test_features)
        print("Classification report")
        print(classification_report(test_labels, predictions))
        print("Classifier uses: Confusion!")
        print(confusion_matrix(test_labels, predictions))
        print("It's super effective!")

        if opts.classifier != 'svm':
            print("Predicted Probability")
            test_predicted_proba = classifier.predict_proba(test_features)
            util.plot_roc_curve(test_labels, test_predicted_proba)
        else:
            print("Decision Function")
            decisions = classifier.decision_function(test_features)
            # Plot each decision value against its row index.
            x = numpy.arange(0, len(decisions), 1)
            plt.plot(x, decisions)
            plt.show()