def train(self):
        print 'Classifier Training in progress....'
        poscutoff = len(self.positiveFeatures)
        negcutoff = len(self.negativeFeatures)
        print "Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff)
        trainfeats = self.positiveFeatures[:poscutoff] + self.negativeFeatures[:negcutoff]
        
        testfeats = self.test()        
        print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
        self.classifier = NaiveBayesClassifier.train(trainfeats)        
        print 'accuracy:', accuracy(self.classifier, testfeats)
        
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set) 
        
        for i, (feats, label) in enumerate(testfeats):    
            refsets[label].add(i)    
            observed = self.classifier.classify(feats)  
            #print label, observed  
            testsets[observed].add(i)

        print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
        print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
        print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
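This example assumes a few imports that the snippet omits; a minimal header sketch that makes the accuracy and metrics calls resolve:

import collections

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy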
Example #2
 def build_classifier(self):
     #print "Informal"
     self.labeled_features = self.build_informal_set()
     #print "Formal"
     self.labeled_features.extend(self.build_formal_set())
     classifier = learner.train(self.labeled_features)
     #classifier.show_most_informative_features()
     return classifier
Example #3
 def __init__(self, classList, featureMatrix):
     super(NaiveBayes, self).__init__()
     print "\n-------------------------\nNaive Bayes:\n-------------------------\n"
     
     self.classes = classList
     self.featureMatrix = featureMatrix
     self.nb = NB.train(zip(featureMatrix, classList))
     self.showMostInformativeFeatures()
def evaluate_features(feature_select, best_words):
    posFeatures = []
    negFeatures = []

    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        elif tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords,best_words), 'pos']
        posFeatures.append(posWords)

    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords,best_words), 'neg']
        negFeatures.append(negWords)

    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)    

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)    

    # puts the gold labels in referenceSets and the predicted labels in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)    

    # prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    classifier.show_most_informative_features(10)
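A hedged usage sketch for evaluate_features: the selector below is a hypothetical bag-of-words filter (not part of the original), and best_words stands in for a pre-computed set of high-information words:

def best_word_features(words, best_words):
    # keep only the pre-selected words, encoded as binary features
    return dict((word, True) for word in words if word in best_words)

best_words = set(['good', 'great', 'bad', 'awful'])  # placeholder set
evaluate_features(best_word_features, best_words)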
def create_model(pos_tweets, neg_tweets, neu_tweets, classifier_param='LinearSVC'):

    # filter out words shorter than 3 letters to form the training data
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets + neu_tweets:
        words = util.clean_text(words, True)
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        #words_filtered = [' '.join(w) for w in [ x for x in nltk.bigrams(words.split())]]
        tweets.append((words_filtered, sentiment))

    # make sure tweets are shuffled randomly
    shuffle(tweets)

    # get the training set and train the Classifier
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    max_specificity = -1
    best_classifier = None
    average_accuracy = 0.0

    # perform 10-fold cross validation
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:

        # select this fold's instances by index: the train indices are not one
        # contiguous block, so slicing from the first to the last index would
        # leak the held-out fold into the training data
        train_fold = [training_set[i] for i in traincv]

        if classifier_param == "LinearSVC":
            classifier = SklearnClassifier(LinearSVC()).train(train_fold)
        elif classifier_param == "Tfid":
            # does TF-IDF weighting,
            # chooses the 1000 best features based on a chi2 statistic,
            # and then passes that into a multinomial naive Bayes classifier.
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(train_fold)
        elif classifier_param == "Bernoulli":
            classifier = SklearnClassifier(BernoulliNB()).train(train_fold)
        elif classifier_param == "NaiveBayes":
            classifier = NaiveBayesClassifier.train(train_fold)
        else:
            print "Classifier option not available: ", classifier_param
            sys.exit(1)

        accuracy_of_classifier, specificity = \
            util.accuracy(classifier, [tweets[i] for i in testcv])

        average_accuracy += accuracy_of_classifier
        if specificity > max_specificity:
            max_specificity = specificity
            best_classifier = classifier

    print "\naverage accuracy: ", average_accuracy/cv.n_folds

    # save the classifier
    joblib.dump(best_classifier, "model/%s_classifier.pkl" % classifier_param)

    print "saved classifier"
Example #6
def main():
    rdr = CategorizedPlaintextCorpusReader('/home/mel/workspace/datascience/assignment5_kaggle/data/', r'.*\.txt', cat_pattern=r'(.*)\.txt')
    clf = NaiveBayesClassifier.train(list(make_training_data(rdr)))
    clf.show_most_informative_features(10)
    
    review_file = open("/home/mel/workspace/datascience/assignment5_kaggle/data/yelp_test_set/yelp_test_set_review.json")
    lines = review_file.readlines()
    output_file = open('/home/mel/workspace/datascience/assignment5_kaggle/output.csv', 'w+')
    
    for word in ('good', 'service'):
        print('probability {w!r} is useful: {p:.2%}'.format(
                                                              w = word, p = clf.prob_classify({word : True}).prob('useful')))
    def evaluateclassifier(self, featureselection):
        positivecount=0
        negativecount=0
        negativetweets = []
        positivetweets = []
        #print 'Evaluating Classifier'
        print featureselection
        with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
            #print 'Opening corpus file'
            reader = csv.reader(f)
            for row in reader:
                #Positive sentiment tweets
                if(row[0] == '4' and positivecount < self.corpuslength):
                    positivetweets.append(row[5])        
                    positivecount+=1        
                #Negative sentiment tweets
                if(row[0] == '0' and negativecount < self.corpuslength):
                    negativetweets.append(row[5])
                    negativecount+=1
        
        #print 'Generating Features' 
        self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positivetweets]
        self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativetweets]
        
        poscutoff = len(self.positivefeatures)
        negcutoff = len(self.negativefeatures)
        print "Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff)
        trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
        
        testfeats = self.test(featureselection) 
        #testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]       
        print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
        classifier = NaiveBayesClassifier.train(trainfeats)        
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        
        #classifier.show_most_informative_features(20)
        
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set) 
        
        for i, (feats, label) in enumerate(testfeats):    
            refsets[label].add(i)    
            observed = classifier.classify(feats)  
            #print label, observed  
            testsets[observed].add(i)

        print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
        print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
        print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
Example #8
    def test_simple(self):
        training_features = [
            ({'nice': True, 'good': True}, 'positive'),
            ({'bad': True, 'mean': True}, 'negative'),
        ]

        classifier = NaiveBayesClassifier.train(training_features)

        result = classifier.prob_classify({'nice': True})
        self.assertTrue(result.prob('positive') > result.prob('negative'))
        self.assertEqual(result.max(), 'positive')

        result = classifier.prob_classify({'bad': True})
        self.assertTrue(result.prob('positive') < result.prob('negative'))
        self.assertEqual(result.max(), 'negative')
    def build_classifier(self):

        #print "Creating a list of labels. If this is done, the previous init doesn't have to be"
        labels = ['arts','business','computers','home','recreation','science','shopping','knowledge']

        self.labeled_features = []
        for label in labels:
            print label.upper()
            self.labeled_features.extend(self.build_data_set(label))
            print self.labeled_features

        print self.labeled_features

        print "Labeled Features: ",self.labeled_features
        classifier = learner.train(self.labeled_features)
        classifier.show_most_informative_features()
        return classifier
Example #10
def main():
    mainDir="/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/corpus2"
    input="/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/tweets/cache"
    logger.info("Start app")
    documents = [(list(w.lower() for w in my_corpus.words(fileid)), categoryMapper(category))
                 for category in my_corpus.categories()
                 for fileid in my_corpus.fileids(category)]
    random.shuffle(documents)

    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[250:], featuresets[:50]
    clf = NaiveBayesClassifier.train(train_set)
    logger.info("Accuracy: " + str(nltk.classify.accuracy(clf, test_set)))
    ref = [cat for features, cat in test_set]
    test = [clf.classify(features) for features, cat in test_set]
    clf.show_most_informative_features(20)  # prints to stdout and returns None, so don't log the result
    logger.info("\n" + nltk.ConfusionMatrix(ref, test).pp())
    logger.info("Exit app")
    def finalclassification(self):
        negative_words=[]
        positive_words=[]
        with open('positive.txt', 'r') as posSentences:
            for i in posSentences:
                posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posWords = [negativevalues.makeadict(posWords), 'pos']
                positive_words.append(posWords)
        with open('negative.txt', 'r') as negSentences:
            for i in negSentences:
                negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negWords = [negativevalues.makeadict(negWords), 'neg']
                negative_words.append(negWords)

        trainFeatures = positive_words[:] + negative_words[:]


        classifier = NaiveBayesClassifier.train(trainFeatures)    
        return classifier
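A hypothetical usage sketch, reusing the same tokenisation and the author's negativevalues.makeadict helper from above (analyzer stands in for an instance of the enclosing class):

classifier = analyzer.finalclassification()
tokens = re.findall(r"[\w']+|[.,!?;]", "The service was great!")
print classifier.classify(negativevalues.makeadict(tokens))  # 'pos' or 'neg'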
def train(records):
    global CUR_CL
    train_data = []
    for record in records:
        text = record[1]
        class_label = record[0]
        feats = features_from_text(text, class_label, stopwords=sw)
        train_data.append(feats)
    if CUR_CL is None:
        if CLASSIFIER == "NaiveBayesClassifier":
            classifier = NaiveBayesClassifier.train(train_data)
        elif CLASSIFIER == "sklearnLinSVC":
            pipeline = Pipeline(
                [
                    ("tfidf", TfidfTransformer()),
                    ("chi2", SelectKBest(chi2, k=1000)),
                    ("nb", LinearSVC(multi_class="ovr")),
                ]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == "BernoulliNB":
            pipeline = Pipeline(
                [("tfidf", TfidfTransformer()), ("chi2", SelectKBest(chi2, k=1000)), ("nb", BernoulliNB())]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == "MultinomialNB":
            pipeline = Pipeline(
                [("tfidf", TfidfTransformer()), ("chi2", SelectKBest(chi2, k=1000)), ("nb", MultinomialNB())]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        print CLASSIFIER
        CUR_CL = classifier
    else:
        print "Partial fitting.. \n\n"
        CUR_CL.train(train_data)
    f = open("%s/%s.pickle" % (pickles_dir, "news_based_" + CLASSIFIER), "wb")
    pickle.dump(CUR_CL, f)
    f.close()
    print "%s/%s.pickle saved" % (pickles_dir, "news_based_" + CLASSIFIER)

    gc.collect()
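Note that neither branch above truly partial-fits: NLTK's NaiveBayesClassifier.train builds a new model from scratch, and SklearnClassifier.train refits on just the latest batch. A sketch of genuine incremental learning, under the assumption that switching to scikit-learn's MultinomialNB with partial_fit is acceptable (toy data below):

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

# toy batches of (featureset, label) pairs standing in for successive calls
first_batch = [({'good': 1, 'nice': 1}, 'pos'), ({'bad': 1}, 'neg')]
next_batch = [({'mean': 1, 'bad': 1}, 'neg')]

vec = DictVectorizer()
X = vec.fit_transform([feats for feats, label in first_batch])
nb = MultinomialNB()
nb.partial_fit(X, [label for feats, label in first_batch], classes=['pos', 'neg'])

# later batches update the same model in place instead of replacing it
X = vec.transform([feats for feats, label in next_batch])
nb.partial_fit(X, [label for feats, label in next_batch])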
	def train(self,training_set=None):
		"""
		Trains the BOW NaiveBayes classifier.
		"""
		if training_set is None:
			training_set = [(sent, sent.certainty) for sent in self._corpus.sents()]
		#training_set = training_set[0:10]  # to compare with the earlier results
		#build features		
		self._build_bow_features(training_set)
		
		#build featuresets for each sentence
		labeled_featuresets = []
		for sent in training_set:
			featureset = self.sentenceFeatures(sent)
			labeled_featuresets.append((featureset,sent.certainty))

		debug('Size of training set: '+str(len(labeled_featuresets)))
		#pp = pprint.PrettyPrinter(indent=4)
		#pp.pprint(labeled_featuresets)
		#train the NaiveBayes
		self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
def create_classifier(feature_select, filename):    
    posFeatures = []
    negFeatures = []
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    
      
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        elif tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)

    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    
    # uses all of the features for training (no test split is held out here)
    trainFeatures = negFeatures[:] + posFeatures[:]

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)
    f = open(filename, 'wb')
    pickle.dump(classifier, f)
    f.close()
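A counterpart sketch (not in the original) for loading the classifier that create_classifier pickles; the filename is whatever was passed in:

import pickle

def load_classifier(filename):
    # inverse of the pickle.dump above; returns the trained classifier
    with open(filename, 'rb') as f:
        return pickle.load(f)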
Example #15
# fileids_ = corpus_dir + '/rt-polarity*'

corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'

cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)

encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])

# NOTE: para views are not working to be looked into later

# classification: NaiveBayesClassifier.train expects (featureset, label) pairs,
# so build a bag-of-words featureset for each sentence
train = ([({w: True for w in sent}, 'pos') for sent in pos_sents] +
         [({w: True for w in sent}, 'neg') for sent in neg_sents])
classifier = NaiveBayesClassifier.train(train)
negcutoff, poscutoff = len(clean_tweets) * 4 // 5, len(insult_tweets) * 4 // 5
insult_train, insult_test = insult_tweets[:poscutoff], insult_tweets[poscutoff:]
clean_train, clean_test = clean_tweets[:negcutoff], clean_tweets[negcutoff:]

insult_feats_train = get_train_features_from_tweets(insult_train, 'insult')
clean_feats_train = get_train_features_from_tweets(clean_train, 'clean')

train_feats = insult_feats_train + clean_feats_train

#########################
# Classifier
# I tried the SVM and Naive Bayes classifiers that come with NLTK
# Since Naive Bayes retains the original feature set, I found it worked well here
# Naive Bayes also performed better in the evaluation
#########################
classifier = NaiveBayesClassifier.train(train_feats)
#classifier = SvmClassifier.train(train_feats)

#########################
# Evaluation
# Use the classifier on the test data and see how it did
#########################
correct, wrong = 0, 0

for tweet in insult_test:
    features = get_features_from_tweet(tweet)
    result = classifier.classify(features)
    if result == "insult":
        correct += 1
    else:
        wrong += 1
def extract_features(document):
    # binary 'contains(word)' features over the global word_features list
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


def classify_tweet(tweet):
    return classifier.classify(extract_features(nltk.word_tokenize(tweet)))


pos_tweets = read_tweets('Training_Data/Social_Ranter.txt', 'positive')
neg_tweets = read_tweets('Training_Data/Negative.txt', 'negative')

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

word_features = get_word_features(get_words_in_tweets(tweets))

training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)

test_tweets = read_tweets('Test_Tweets/Tweets_Positive.txt', 'positive')
test_tweets.extend(read_tweets('Test_Tweets/Tweets_Negative.txt', 'negative'))
total = float(len(test_tweets))
correct = total

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        correct -= 1
result = correct / total * 100
 def train(self):
     training_set = classify.apply_features(self.extract_features,
                                            self.training)
     classifier = NaiveBayesClassifier.train(training_set)
     joblib.dump(classifier, 'naive_classifier.pkl', 3)
Example #19

print(gender_features('Gary'))

featuresets = [(gender_features(n), g) for (n, g) in names]

print(len(featuresets))

print(featuresets[0:10])

train_set, test_set = featuresets[500:], featuresets[:500]

print(len(train_set))
print(len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))

print(classify.accuracy(nb_classifier, test_set))
nb_classifier.show_most_informative_features(5)  # prints directly; returns None

me_classifier = MaxentClassifier.train(train_set)

print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))

print(classify.accuracy(me_classifier, test_set))

me_classifier.show_most_informative_features(5)
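Example #19 never defines names or gender_features; a minimal sketch in the style of the classic NLTK name-gender example (the last-letter feature is an assumption):

import random
from nltk.corpus import names as names_corpus

def gender_features(word):
    # a single character-level feature, as in the NLTK book
    return {'last_letter': word[-1].lower()}

names = ([(n, 'male') for n in names_corpus.words('male.txt')] +
         [(n, 'female') for n in names_corpus.words('female.txt')])
random.shuffle(names)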
Example #20
 def train(self, pairs):
     features = [(self.features(x,y), judgment) for x,y,judgment in pairs]
     self.model = NaiveBayesClassifier.train(features)
     #self.model = MaxentClassifier.train(features, max_iter=10)
     print self.model.most_informative_features()
Example #21
def start():
    global classifications_collection, tweets_collection, global_count
    sw = stopwords.words('english')
    thr = 5
    refactored_tweets = {}
    records = tweets_collection.find()
    for record in records:
        tweet = record['text']
        tmp_classifiers = record['classifications']
        for clasfId, classId in tmp_classifiers.iteritems():
            if clasfId not in refactored_tweets.keys():
                refactored_tweets[clasfId] = []
            refactored_tweets[clasfId].append({'text': tweet, 'classId':classId})
    
    records = None

    gc.collect()    

    for classification in classifications_collection.find():
        tweets = []
        classification_name = classification['classification']
        classification_id = str(classification["_id"])
        
        classes = classification['classes']
        
        #records = tweets_collection.find({"clasfId":classification_id})

        records = []
        try:
            records = refactored_tweets[classification_id]
        except KeyError:
            print "No tweets for classification ", classification_name
            continue
        records_count = len(records)
        print classification_name, records_count

        if classification_id in global_count.keys():
            if int(records_count/thr)>global_count[classification_id]:
                print "Exceeded threshold. Training started"
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle"%classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()
                global_count[classification_id] = int(records_count/thr)
            else:
                pass
        else:
            global_count[classification_id] = int(records_count/thr)
            if global_count[classification_id] >=1:
                print "New classification or just started monitor"
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle"%classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()
Example #22
import base64
import pickle
import zlib

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")
classifier = NaiveBayesClassifier.train(training)
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)
compScore = 0
animalScore = 0
for word in testing:
    # strip one trailing punctuation mark before classifying
    if word[-1] in ".,?!":
        word = word[:-1]
    result = classifier.classify({word: word})
    if result == "comp":
        compScore += 1
    elif result == "animal":
        animalScore += 1
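To reuse the compressed pickle written above, reverse the steps: base64-decode, zlib-decompress, then unpickle (a sketch assuming the same file):

with open("PickledClassifier.txt", "rb") as inobj:
    compressed = inobj.read()
classifier = pickle.loads(zlib.decompress(base64.b64decode(compressed)))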
iteracao = iteracao + 1
arquivoMedicoes = open('medicoes_analise_threads_' + str(iteracao) + '.txt', 'w')
precisao = accuracy(classificador, featuresClassificados) * 100
arquivoMedicoes.write('Execution time = ' + str(tempo) + '\nAccuracy = {0:.2f}%'.format(precisao))
arquivoMedicoes.close()
features = resultadoPositivos.get() + resultadoNegativos.get() + resultadosNeutros.get()
pool1.terminate()
pool1.close()
pool2.terminate()
pool2.close()
pool3.terminate()
pool3.close()
if precisao > 50:
	features.extend(featuresClassificados)
	shuffle(features)
	classificador = NaiveBayesClassifier.train(features)
	arquivoClassificador = open('classificador.pickle', 'wb')
	dump(classificador, arquivoClassificador, protocol=HIGHEST_PROTOCOL)
	arquivoClassificador.close()
	arquivoPositivos = open('positivos.json', 'w')
	ujson.dump(positivos, arquivoPositivos)
	arquivoPositivos.close()
	arquivoNegativos = open('negativos.json', 'w')
	ujson.dump(negativos, arquivoNegativos)
	arquivoNegativos.close()
	arquivoNeutros = open('neutros.json', 'w')
	ujson.dump(neutros, arquivoNeutros)
	arquivoNeutros.close()
arquivoResultados = open('resultados_sem_stopwords' + str(iteracao) + '.csv', 'w', newline='')
w = writer(arquivoResultados, delimiter=',')
linhas = [['Response', 'Score', 'Sentiment - Naive Bayes', 'Sentiment - AlchemyAPI']]
Example #24
 def build_classifier(self):
     self.labeled_features = self.build_informal_set()
     self.labeled_features.extend(self.build_formal_set())
     classifier = learner.train(self.labeled_features)
     classifier.show_most_informative_features()
     return classifier
 def _train(self):
     train_set = [(self._extract_features(tweet), tweet['label'])
                  for tweet_id, tweet in self._tweets.items()]
     self._classifier = NaiveBayesClassifier.train(train_set)
Example #26
    tweets.append((words_filtered, sentiment))


# extract the word features out from the training data
word_features = get_word_features(get_words_in_tweets(tweets))

# get the training set and train the Naive Bayes Classifier
print("Aplicando o treino com o Naive Bayes Classifier (by NLTK)...\n")
training_set = nltk.classify.util.apply_features(extract_features, tweets)
cv = cross_validation.KFold(len(training_set), n_folds=number_cross, shuffle=False, random_state=None)

totalaccuracy = 0
test = { 'positive': 0, 'negative': 0, 'totpos': 0, 'totneg': 0 }
for traincv, testcv in cv:
    # pick each fold's instances by index; the train indices are not one
    # contiguous block, so a first-to-last slice would leak the test fold
    classifier  = NaiveBayesClassifier.train([training_set[i] for i in traincv])
    accuracy    = nltk.classify.util.accuracy(classifier, [training_set[i] for i in testcv])

    totalaccuracy += accuracy

    classified     = classify_tweet(in_tweets[testcv[0]][0])
    # print 'accuracy:', accuracy
    # print ("Tweet: ... : Pre-class: %s || Classificado como: %s" % (in_tweets[testcv[0]][1], classified))
    if classified == 'positive':
        test['positive'] += 1
    else:
        test['negative'] += 1

    if in_tweets[testcv[0]][1] == 'positive':
        test['totpos'] += 1
    else:
        test['totneg'] += 1
Example #27
 def __init__(self, *args, **kwargs):
     self.load_training_data()
     # train classifier
     self.word_features = nltk.FreqDist(self.all_words).keys()
     training_set = nltk.classify.util.apply_features(self.extract_features, self.training_tweets)
     self.classifier = NaiveBayesClassifier.train(training_set)
Example #28

# Collect all the words in the training examples
vocabulary = set()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        vocabulary.add(word)

# Try a feature set of 500 random words
vocabulary = list(vocabulary)
random.shuffle(vocabulary)
random_featureset = vocabulary[:500]

train_set = format_dataset(train_fileids, random_featureset)
test_set = format_dataset(test_fileids, random_featureset)
bayes = NaiveBayesClassifier.train(train_set)

print("Random words: ", random_featureset)
print("Naive Bayes accuracy:", accuracy(bayes, test_set))

# Try a feature set of the 500 words that appear most often in the training examples
common_words = dict()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        if word not in common_words:
            common_words[word] = 1
        else:
            common_words[word] += 1

sorted_common = sorted(common_words.items(),
                       key=operator.itemgetter(1), reverse=True)[:500]
Example #29
        for key, value in {'noun': nouns, 'verb': verbs, 'adj': adj, 'adv': adv}.items():
            value.sort()
            for idx, word in enumerate(value[:3]):
                features[key + '-' + str(idx)] = word[1].lower()

    return features

train_set = []
for sent in train_data:
    tagged_sent = [(word[2], word[0]) for word in sent]
    for idx, word in enumerate(sent):
        features = wsd_features(tagged_sent, idx)
        sense = word[1]
        train_set.append((features, sense))

classifier = NaiveBayesClassifier.train(train_set)

code.interact(local=locals())


class Concept(object):
    def __init__(self, *args):
        if args:
            synsets = [wordnet.synsets(x) for x in args]
            self.synsets = self._common_synsets(synsets)
            if len(args) > 1:
                isas = [self._isa_synsets(synsets, x) for x in synsets]
                self.synsets = set.union(self.synsets, *isas)
        else:
            self.synsets = set()
Example #30
word_features = get_word_features(get_words_in_tweets(tweets))

# get the training set and train the Naive Bayes Classifier
print("Aplicando o treino com o Naive Bayes Classifier (by NLTK)...\n")
training_set = nltk.classify.util.apply_features(extract_features, tweets)
cv = cross_validation.KFold(len(training_set),
                            n_folds=number_cross,
                            shuffle=False,
                            random_state=None)

totalaccuracy = 0
test = {'positive': 0, 'negative': 0, 'totpos': 0, 'totneg': 0}
for traincv, testcv in cv:
    # pick each fold's instances by index; the train indices are not one
    # contiguous block, so a first-to-last slice would leak the test fold
    classifier = NaiveBayesClassifier.train(
        [training_set[i] for i in traincv])
    accuracy = nltk.classify.util.accuracy(
        classifier, [training_set[i] for i in testcv])

    totalaccuracy += accuracy

    classified = classify_tweet(in_tweets[testcv[0]][0])
    # print 'accuracy:', accuracy
    # print ("Tweet: ... : Pre-class: %s || Classificado como: %s" % (in_tweets[testcv[0]][1], classified))
    if classified == 'positive':
        test['positive'] += 1
    else:
        test['negative'] += 1

    if in_tweets[testcv[0]][1] == 'positive':
        test['totpos'] += 1