Example #1
File: nb.py Project: fieteb/nlp15FP
        # (fragment) collect every word from each (tokens, label) pair
        for word in item[0]:
            res.append(word)
    return res
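
The head of this function is cut off by the listing. Judging from the loop body, it flattens a list of (tokens, label) pairs into one word list; a hypothetical reconstruction (the name and signature are guesses, not from the project):

def get_all_words(labeledTweets):
    # hypothetical name and signature; only the inner loop appears in the listing
    res = []
    for item in labeledTweets:
        for word in item[0]:
            res.append(word)
    return res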



'''
I used code from http://www.nltk.org/book/ch06.html for this.
'''
if __name__ == "__main__":
    print("NB start")

    # load and preprocess both classes, balanced by construction
    racistTweets = [(preprocess(d), c) for (d, c) in loadRacistTweets(excludeJokes=True)]
    normalTweets = [(preprocess(d), c) for (d, c) in loadNonRacistTweets(numTweets=len(racistTweets))]

    print("Number of racist tweets: {}.".format(len(racistTweets)))
    print("Number of normal tweets: {}.".format(len(normalTweets)))

    numTrain = 1500
    numTest = 500

    # split each class into train/test slices
    trainR = racistTweets[0:numTrain]
    testR = racistTweets[numTrain:numTrain + numTest]

    trainN = normalTweets[0:numTrain]
    testN = normalTweets[numTrain:numTrain + numTest]

    # combine both classes into a single training set
    trainTweets = trainR + trainN
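
The listing breaks off here. Judging from evaluate_classifier in Example #2 below, the script presumably continues by building the test set, extracting features, and training an NLTK Naive Bayes classifier; a minimal sketch of that continuation, assuming Example #2's imports and FeatureExtractor API:

    # assumed continuation, mirroring Example #2 (not part of the original listing)
    testTweets = testR + testN

    featureExtractor = FeatureExtractor([FeatureExtractor.UNIGRAM, FeatureExtractor.BIGRAM])
    trainFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in trainTweets]
    testFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in testTweets]

    classifier = nltk.NaiveBayesClassifier.train(trainFeats)
    print("accuracy: %.3f" % nltk.classify.accuracy(classifier, testFeats))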
Example #2
# imports used by this snippet; loadRacistTweets, loadNonRacistTweets, preprocess,
# FeatureExtractor, and precision_recall_fscore are defined elsewhere in the project
import nltk
import numpy as np
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier as RF


def evaluate_classifier(numTrainR, numTrainN, numTestR, numTestN, model, verbose):
    '''
    I used code from http://www.nltk.org/book/ch06.html for this.
    '''

    # load raw tweets
    rawRacistTweets = loadRacistTweets(numTweets=numTrainR + numTestR, excludeJokes=True)
    rawNormalTweets = loadNonRacistTweets(numTweets=numTrainN + numTestN)

    print("Number of racist tweets: {}.".format(len(rawRacistTweets)))
    print("Number of normal tweets: {}.".format(len(rawNormalTweets)))

    # split into train/test sets
    trainR = rawRacistTweets[0:numTrainR]
    print(len(trainR))
    testR = rawRacistTweets[numTrainR:numTrainR + numTestR]
    print(len(testR))

    trainN = rawNormalTweets[0:numTrainN]
    print(len(trainN))
    testN = rawNormalTweets[numTrainN:numTrainN + numTestN]
    print(len(testN))

    # combine racist/non-racist tweets into single train/test datasets
    trainTweets = trainR + trainN
    testTweets = testR + testN

    # pre-process tweets (i.e. remove certain words)
    preprocessedTrainTweets = [(preprocess(d), c) for (d, c) in trainTweets]
    preprocessedTestTweets = [(preprocess(d), c) for (d, c) in testTweets]

    featureExtractor = FeatureExtractor([FeatureExtractor.UNIGRAM, FeatureExtractor.BIGRAM])
    # featureExtractor.train_TF_IDF(trainTweets)

    # compute training & testing features
    trainFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in preprocessedTrainTweets]
    testFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in preprocessedTestTweets]

    if model == 'SVM':
        classifier = nltk.classify.SklearnClassifier(LinearSVC())
        classifier.train(trainFeats)

        # evaluate SVM classifier
        print("----------------------")
        print("SVM Classifier")
    elif model == 'RF':
        # the original used class_weight='auto', which scikit-learn has since
        # replaced with 'balanced'
        rf = RF(n_estimators=75, max_features='sqrt', class_weight='balanced',
                criterion="entropy", min_samples_split=9, random_state=0)
        classifier = nltk.classify.SklearnClassifier(rf)
        classifier.train(trainFeats)

        # evaluate RF classifier
        print("----------------------")
        print("RF Classifier")
    # note that TF-IDF cannot be set when model == 'NB'
    elif model == 'NB':
        # Naive Bayes
        classifier = nltk.NaiveBayesClassifier.train(trainFeats)
        print("----------------------")
        print("NB Classifier")

    print("accuracy: %.3f" %nltk.classify.accuracy(classifier, testFeats));
    Y_test = [testFeat[1] for testFeat in testFeats]
    Y_pred = classifier.classify_many([testFeat[0] for testFeat in testFeats])
    conf=metrics.confusion_matrix(Y_test, Y_pred, [0,1])
    precision, recall, fscore = precision_recall_fscore(conf, 1)

    print("precision: %.3f" %precision)
    print("recall: %.3f" %recall)
    print("f1 score: %.3f" %fscore)
    print("%.1f\%% & %.1f\%% & %.1f\%%" %(100*precision,100*recall,100*fscore))

    print("confusion matrix:")
    print(conf)

    if verbose:
        # pred - truth == 1 marks a false positive, -1 a false negative
        FP_indices = np.where(np.subtract(Y_pred, Y_test) == 1)[0]
        FN_indices = np.where(np.subtract(Y_pred, Y_test) == -1)[0]
        for FP_index in FP_indices:
            print("False positive: {}".format(' '.join(testTweets[FP_index][0])))
        for FN_index in FN_indices:
            print("False negative: {}".format(' '.join(testTweets[FN_index][0])))
Example #3
        if FeatureExtractor.TF_IDF in self.features:
            features.update(self.get_TF_IDF_feature_vector(tokens))
        if FeatureExtractor.UNIGRAM in self.features:
            features.update(self.get_unigram_features(tokens))
        if FeatureExtractor.BIGRAM in self.features:
            features.update(self.get_bigram_features(tokens))
        return features
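
    # The helper methods dispatched above are not shown in this listing. A
    # minimal sketch in the style of the NLTK book, ch. 6; the method names
    # match the calls above, but the exact feature keys are an assumption:

    def get_unigram_features(self, tokens):
        # one boolean feature per token
        return {"contains({})".format(word): True for word in tokens}

    def get_bigram_features(self, tokens):
        # one boolean feature per adjacent token pair
        return {"bigram({} {})".format(a, b): True
                for (a, b) in zip(tokens, tokens[1:])}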


if __name__ == "__main__":
    '''
    An example of how to use this class.
    '''

    # load a tiny set of tweets
    tweets = loadNonRacistTweets(numTweets=10)

    print("First tweet: {}".format(' '.join(tweets[0][0])))

    # preprocess the tweets to filter punctuation and common words
    preprocessedTweets = [(preprocess(tweet), label) for (tweet, label) in tweets]

    # extract bigram features only
    bigramFeatureExtractor = FeatureExtractor([FeatureExtractor.BIGRAM])
    # get_feature_vector() takes the tweet's token list without the label
    bigramFeatures = bigramFeatureExtractor.get_feature_vector(preprocessedTweets[0][0])
    print("\nBigram Feature Vector:\n {}".format(bigramFeatures))

    # extract unigram and TF-IDF features; TF-IDF needs trained corpus statistics,
    # which is what makes a stateful extractor class useful
    unigramTFIDFFeatureExtractor = FeatureExtractor([FeatureExtractor.UNIGRAM, FeatureExtractor.TF_IDF])
    unigramTFIDFFeatureExtractor.train_TF_IDF(preprocessedTweets)
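
The listing ends right after training the TF-IDF statistics; the natural next step, mirroring the bigram example above (assumed, not part of the original):

    # assumed continuation: extract the combined feature vector for the first tweet
    unigramTFIDFFeatures = unigramTFIDFFeatureExtractor.get_feature_vector(preprocessedTweets[0][0])
    print("\nUnigram + TF-IDF Feature Vector:\n {}".format(unigramTFIDFFeatures))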