def evaluate_classifier(numTrainR, numTrainN, numTestR, numTestN, model, verbose):
    """Train one classifier on racist/normal tweets and print its test metrics.

    Based on code from http://www.nltk.org/book/ch06.html.

    The function loads the tweets, splits them into train/test sets,
    preprocesses them, extracts unigram + bigram features, trains the
    requested model and prints accuracy, precision, recall, F1 score, a
    LaTeX-style table row and the confusion matrix.

    The confusion matrix is built over labels [0, 1], and 1 is the class
    passed to ``precision_recall_fscore`` (presumably the "racist" label --
    confirm against the tweet loaders).

    Args:
        numTrainR: number of racist tweets used for training.
        numTrainN: number of normal tweets used for training.
        numTestR: number of racist tweets used for testing.
        numTestN: number of normal tweets used for testing.
        model: which classifier to train: 'SVM' (LinearSVC), 'RF' (random
            forest) or 'NB' (NLTK naive Bayes).
        verbose: if true, also print every false-positive and
            false-negative test tweet.

    Raises:
        ValueError: if ``model`` is not one of 'SVM', 'RF' or 'NB'.
    """
    # Fail fast: previously an unknown model left `classifier` unbound and
    # surfaced as a NameError only after all loading/feature extraction.
    if model not in ('SVM', 'RF', 'NB'):
        raise ValueError(
            "unknown model {!r}; expected 'SVM', 'RF' or 'NB'".format(model))

    # load raw tweets
    rawRacistTweets = loadRacistTweets(numTweets=numTrainR + numTestR,
                                       excludeJokes=True)
    rawNormalTweets = loadNonRacistTweets(numTweets=numTrainN + numTestN)
    print("Number of racist tweets: {}.".format(len(rawRacistTweets)))
    print("Number of normal tweets: {}.".format(len(rawNormalTweets)))

    # split into train/test sets (the sizes are echoed so that a loader
    # returning fewer tweets than requested is visible in the output)
    trainR = rawRacistTweets[0:numTrainR]
    print(len(trainR))
    testR = rawRacistTweets[numTrainR:numTrainR + numTestR]
    print(len(testR))
    trainN = rawNormalTweets[0:numTrainN]
    print(len(trainN))
    testN = rawNormalTweets[numTrainN:numTrainN + numTestN]
    print(len(testN))

    # combine racist/non-racist tweets into single train/test datasets
    trainTweets = trainR + trainN
    testTweets = testR + testN

    # pre-process tweets (i.e. remove certain words)
    preprocessedTrainTweets = [(preprocess(d), c) for (d, c) in trainTweets]
    preprocessedTestTweets = [(preprocess(d), c) for (d, c) in testTweets]

    featureExtractor = FeatureExtractor(
        [FeatureExtractor.UNIGRAM, FeatureExtractor.BIGRAM])

    # compute training & testing features
    trainFeats = [(featureExtractor.get_feature_vector(d), c)
                  for (d, c) in preprocessedTrainTweets]
    testFeats = [(featureExtractor.get_feature_vector(d), c)
                 for (d, c) in preprocessedTestTweets]

    if model == 'SVM':
        classifier = nltk.classify.SklearnClassifier(LinearSVC())
        classifier.train(trainFeats)
    elif model == 'RF':
        # 'balanced' replaces the class_weight='auto' setting, which
        # scikit-learn deprecated and later removed.
        rf = RF(n_estimators=75, max_features='sqrt',
                class_weight='balanced', criterion="entropy",
                min_samples_split=9, random_state=0)
        classifier = nltk.classify.SklearnClassifier(rf)
        classifier.train(trainFeats)
    else:
        # Naive Bayes; note that TF-IDF cannot be set when model=NB
        classifier = nltk.NaiveBayesClassifier.train(trainFeats)

    print("----------------------")
    print("{} Classifier".format(model))

    print("accuracy: %.3f" % nltk.classify.accuracy(classifier, testFeats))

    Y_test = [label for (_, label) in testFeats]
    Y_pred = classifier.classify_many([feats for (feats, _) in testFeats])

    # `labels` is passed by keyword: newer scikit-learn releases reject it
    # as a positional argument.
    conf = metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])
    precision, recall, fscore = precision_recall_fscore(conf, 1)

    print("precision: %.3f" % precision)
    print("recall: %.3f" % recall)
    print("f1 score: %.3f" % fscore)
    # LaTeX table row; the raw string keeps the literal backslash before
    # each percent sign without relying on an invalid "\%" escape.
    print(r"%.1f\%% & %.1f\%% & %.1f\%%"
          % (100 * precision, 100 * recall, 100 * fscore))
    print("confusion matrix:")
    print(conf)

    if verbose:
        # predicted - actual: +1 is a false positive, -1 a false negative
        # (relies on the labels being the integers 0 and 1).
        errors = np.subtract(Y_pred, Y_test)
        # NOTE(review): testTweets holds the raw (un-preprocessed) tweets;
        # ' '.join assumes each one is a sequence of tokens -- confirm.
        for fp_index in np.where(errors == 1)[0]:
            print("False positive: {}".format(
                ' '.join(testTweets[fp_index][0])))
        for fn_index in np.where(errors == -1)[0]:
            print("False negative: {}".format(
                ' '.join(testTweets[fn_index][0])))
for item in tupleList : for word in item[0] : res.append(word); return res; ''' I used code from http://www.nltk.org/book/ch06.html for this ''' if __name__ == "__main__" : print("NB start"); racistTweets = [(preprocess(d), c) for (d, c) in loadRacistTweets(excludeJokes=True)]; normalTweets = [(preprocess(d), c) for (d, c) in loadNonRacistTweets(numTweets=len(racistTweets))]; print("Number of racist tweets: {}.".format(len(racistTweets))); print("Number of normal tweets: {}.".format(len(normalTweets))); numTrain = 1500; numTest = 500; trainR = racistTweets[0:numTrain]; testR = racistTweets[numTrain:numTrain + numTest]; trainN = normalTweets[0:numTrain]; testN = normalTweets[numTrain:numTrain + numTest];