import nltk
import readerAndWriter  # project helper module for reading/writing the data files


def extract_features(tweet):
    # For every candidate word feature, record whether it occurs in the tweet.
    # The tweet is expected to already be tokenised into a list of words.
    word_features = get_word_features(tweet)
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
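# Quick illustration of the feature shape this produces (a standalone sketch
# with a made-up vocabulary, since get_word_features is not shown in this
# snippet): for a tokenised tweet ['the', 'phone', 'is', 'good'] and candidate
# words ['good', 'bad', 'phone'] the features come out as
# {'contains(good)': True, 'contains(bad)': False, 'contains(phone)': True}.
sample_vocab = ['good', 'bad', 'phone']
sample_tweet = set(['the', 'phone', 'is', 'good'])
sample_feats = dict(('contains(%s)' % w, w in sample_tweet) for w in sample_vocab)
print sample_feats['contains(good)'], sample_feats['contains(bad)']  # -> True False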


''' Train on file "trainGold.tsv"
    Test on file "devGold.tsv"
    (An attempt at printing a confusion matrix is left commented out below;
    it is still buggy.)
'''

# TRAIN
read = readerAndWriter.readFile("data/cleaned/trainGold.tsv")
dataForTweets = readTweets(read)
training_set = nltk.classify.apply_features(extract_features, dataForTweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# TEST
testFile = readerAndWriter.readFile("data/cleaned/devGold.tsv")
dataForTest = readTweets(testFile)
test_set = nltk.classify.apply_features(extract_features, dataForTest)
print "CLASSIFYING: ", nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(10)
'''
# Buggy attempt: nltk.ConfusionMatrix expects a list of gold labels and a list
# of predicted labels, not two copies of the feature sets.
cm = nltk.ConfusionMatrix(training_set, training_set)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
'''
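# A minimal sketch of how the confusion matrix could be built instead, assuming
# dataForTest is a list of (tweet, label) pairs as apply_features expects:
gold = [label for (tweet, label) in dataForTest]
predicted = [classifier.classify(extract_features(tweet))
             for (tweet, label) in dataForTest]
cm = nltk.ConfusionMatrix(gold, predicted)
print cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9)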
Example #2
import numpy as np
import readerAndWriter  # project helper module for reading the data files

'''
        #result = []
        #for tweet in tagResults: #w = word, t = tag, c = confidence level
        #    cleanTweet = ""
        #    for triple in tweet:
        #        print triple
        #        (w, t, c) = triple
        #        #removing urls, user mentions, numbers, and hashtags from the tweet
        #        if t != 'U' and t != '@' and t!= '$' and t != '#':
        #            cleanTweet+=str(w) + " "
        #            #print "cleantweet is:",cleanTweet
        #            #print w
        #    print cleanTweet
        #    result.append(cleanTweet)
        #print result
        #readResults = readerAndWriter.readFile("cleaned/PFTweets.txt")
        readTrain = readerAndWriter.readFile("cleaned/PFTweetsTrain.txt")
        readTest = readerAndWriter.readFile("cleaned/PFTweetsTest.txt")
        #print "read results\n",readResults
        print "\n DONE READING \n"
        #print "\nclean tweet\n",cleanTweet(tweets)
        #replacedResults = replaceTaggedTweet(readResults)
        taggedTrain = replaceTaggedTweet(readTrain)
        taggedTest = replaceTaggedTweet(readTest)
        print "\n DONE REPLACING \n"
        #print "\nreplace tagged tweet\t",replacedResults
        #np.savetxt("cleaned/testCleaned.txt",replacedResults)
        #toFile("cleaned/PFTweetsCleaned.txt",replacedResults)
        toFile("cleaned/PFTrainTag.txt", taggedTrain)
        toFile("cleaned/PFTestTag.txt", taggedTest)
        print "\n DONE WRITING \n"
'''
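# The commented-out block above boils down to filtering the ARK tagger output
# by POS tag. As a standalone helper it would look roughly like this (the name
# stripByTag is hypothetical; tagResults is assumed to be a list of tweets,
# each a list of (word, tag, confidence) triples):
def stripByTag(tagResults):
    result = []
    for tweet in tagResults:
        # drop URLs ('U'), user mentions ('@'), numerals ('$') and hashtags ('#')
        kept = [str(w) for (w, t, c) in tweet if t not in ('U', '@', '$', '#')]
        result.append(" ".join(kept))
    return result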
def collapseScales(originalData):
    # Collapse the 5-point label scale {-2, -1, 0, 1, 2} to 3 points by merging
    # -2 into -1 and 2 into 1; the label sits in the third tab-separated column.
    # newFile is a list of lines, each split into a list of fields.
    newFile = []
    for e in originalData:
        line = e.split("\t")
        if (int(line[2]) == -2):
            line[2] = str(-1)
        elif (int(line[2]) == 2):
            line[2] = str(1)
        newFile.append(line)
    return newFile 
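# Quick check of collapseScales on two hand-made lines (the id/topic/text
# fields here are made up for illustration; only the label column matters):
sample = ["1\tapple\t-2\tworst phone ever",
          "2\tapple\t2\tlove it"]
collapsed = collapseScales(sample)
print collapsed[0][2], collapsed[1][2]  # -> -1 1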

''' ******************* COLLAPSE SCALES ***************************'''
originalFile = readerAndWriter.readFile("cleaned/allTopics.tsv")

# find the distribution of the labels before collapsing the scales
dist_count = [0]*5
for element in originalFile:
    line = element.split("\t")
    dist_count[int(line[2])+2] += 1

print "Distribution count of numbers [-2, -1, 0, 1, 2]:",dist_count #[139, 1086, 2667, 4608, 549]
print "Total number of tweets",np.sum(dist_count) #9049
distribution = [0]*5
for num in range(len(dist_count)):
    print "num is", num
    print dist_count[num]
    distribution[num] = dist_count[num]/np.sum(dist_count)
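# With the counts noted above ([139, 1086, 2667, 4608, 549] out of 9049 tweets),
# the proportions come out to roughly [0.015, 0.120, 0.295, 0.509, 0.061].
print "Label distribution [-2, -1, 0, 1, 2]:", distribution

# The collapsed labels themselves would then be obtained with (not in the
# original excerpt, shown for completeness):
# collapsedFile = collapseScales(originalFile)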
    