Example #1
    def train_nlp(self, features_train, labels_train):
        """
        Trains two Naive Bayes classifiers, one for author names
        and one for titles.

        :param features_train: Training feature records; each record is a
            (title, author) pair.
        :param labels_train: The labels corresponding to features_train.
        :return: None. Sets self.nlp_title (the title classifier) and
            self.nlp_author (the author classifier).
        """
        # Organize title and author data
        train_data = list(zip(features_train, labels_train))

        title_data = [(title_prep(record[0][0]), record[1]) for record in
                      train_data]

        author_data = [(author_prep(record[0][1]), record[1]) for
                       record in train_data]

        # Train the classifiers using the training data

        # Title Classifier
        self.nlp_title = NaiveBayesClassifier.train(title_data)
        # nltk.classify.util.accuracy(clf, title_data[divide:])

        # Author Classifier
        self.nlp_author = NaiveBayesClassifier.train(author_data)
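
The helpers title_prep and author_prep are not shown in this example. A minimal sketch, assuming they only need to map a raw string to the feature-dict half of the (features, label) pairs that NaiveBayesClassifier.train expects (both bodies below are hypothetical):

from nltk import word_tokenize

def title_prep(title):
    # Hypothetical: boolean "word present" features over the tokenized title
    return {token.lower(): True for token in word_tokenize(title)}

def author_prep(author):
    # Hypothetical: author strings are short, so whitespace splitting suffices
    return {token.lower(): True for token in author.split()}
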
Example #2
    def train(self, clf_type):
        print('Training classifier...')

        words, labels = self.load_data(self.train_path)

        self.pos = [t[1] for t in nltk.pos_tag(words)]

        self.previous_labels = ["O"] + labels
        # next_labels = labels[1:] + ['O']

        features = [self.features(words, i) for i in range(len(words))]
        train_samples = [(f, l) for (f, l) in zip(features, labels)]
        if clf_type == 'SVM':
            # classifier = SklearnClassifier( make_pipeline(StandardScaler(with_mean=False), SVC(kernel='rbf',
            # probability=True, max_iter=1000))).train(train_samples)
            classifier = SklearnClassifier(LinearSVC()).train(train_samples)
        elif clf_type == 'MLP':
            classifier = SklearnClassifier(
                MLPClassifier()).train(train_samples)
        elif clf_type == 'Naive Bayes':
            classifier = NaiveBayesClassifier.train(train_samples)
        else:
            classifier = MaxentClassifier.train(train_samples,
                                                max_iter=self.max_iter)
        self.dict_classifiers[clf_type] = classifier
        self.pos = self.previous_labels = None
Example #3
def analyze_data(pos_train, neg_train, pos_test, neg_test):
    global tweets

    pos_tweets = read_tweets(pos_train, 'positive')
    neg_tweets = read_tweets(neg_train, 'negative')

    # filter away words that are less than 3 letters to form the training data
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    # get the training set and train the Naive Bayes Classifier
    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    classifier = NaiveBayesClassifier.train(training_set)

    # read in the test tweets and check accuracy
    # to add your own test tweets, add them in the respective files
    test_tweets = read_tweets(pos_test, 'positive')
    test_tweets.extend(read_tweets(neg_test, 'negative'))
    total = accuracy = float(len(test_tweets))

    for tweet in test_tweets:
        if classify_tweet(tweet[0], classifier) != tweet[1]:
            accuracy -= 1
    tot_accuracy = accuracy / total * 100

    print("\n\nResults:")
    print("######################################")
    print(" Total accuracy: ", end="")
    print('%.3f' % tot_accuracy, end="")
    print("%", end="")
    print(' (%d/%d)!  ' % (accuracy, total))
    print("######################################")
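
read_tweets, extract_features, and classify_tweet are defined elsewhere in that script. A minimal sketch of the feature extractor, assuming the usual module-level word_features vocabulary (both the helper body and word_features are assumptions here; Example #38 below shows the same closing lines):

def extract_features(document):
    # Mark which known feature words the tweet contains
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
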
Example #4
    def train(self):
        print('Classifier Training in progress....')
        poscutoff = len(self.positiveFeatures)
        negcutoff = len(self.negativeFeatures)
        print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))
        trainfeats = self.positiveFeatures[:poscutoff] + self.negativeFeatures[:negcutoff]

        testfeats = self.test()
        print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print('accuracy:', accuracy(self.classifier, testfeats))

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = self.classifier.classify(feats)
            #print(label, observed)
            testsets[observed].add(i)

        print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
        print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
        print('pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos']))
        print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
        print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
        print('neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg']))
Example #5
    def __init__(self, classList, featureMatrix):
        super(NaiveBayes, self).__init__()
        print("\n-------------------------\nNaive Bayes:\n-------------------------\n")

        self.classes = classList
        self.featureMatrix = featureMatrix
        self.nb = NB.train(zip(featureMatrix, classList))
        self.showMostInformativeFeatures()
Example #6
 def build_classifier(self):
     #print "Informal"
     self.labeled_features = self.build_informal_set()
     #print "Formal"
     self.labeled_features.extend(self.build_formal_set())
     classifier = learner.train(self.labeled_features)
     #classifier.show_most_informative_features()
     return classifier
Example #7
 def __init__(self, classList, featureMatrix):
     super(NaiveBayes, self).__init__()
     print("\n-------------------------\nNaive Bayes:\n-------------------------\n")
     
     self.classes = classList
     self.featureMatrix = featureMatrix
     self.nb = NB.train(zip(featureMatrix, classList))
     self.showMostInformativeFeatures()
Example #8
 def build_classifier(self):
     #print "Informal"
     self.labeled_features = self.build_informal_set()
     #print "Formal"
     self.labeled_features.extend(self.build_formal_set())
     classifier = learner.train(self.labeled_features)
     #classifier.show_most_informative_features()
     return classifier
Example #9
 def train(self):
     self._test_set = [
         ({word: (word in pt.applyTokenizer(x[0]))
           for word in _LEXICON}, x[1]) for x in __TRAIN_SET__
     ]
     #print("> Test Set: ", self._test_set)
     #self._training_set = apply_features(self.extractFeature, self._test_set)
     self._classifier = NaiveBayesClassifier.train(self._test_set)
Example #10
def evaluate_features(feature_select, best_words):
    posFeatures = []
    negFeatures = []

    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        elif tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords,best_words), 'pos']
        posFeatures.append(posWords)

    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords,best_words), 'neg']
        negFeatures.append(negWords)


    
    # selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)    

    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)    

    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)    

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
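
feature_select is passed in with a (words, best_words) signature. A minimal sketch of a matching selector, assuming best_words is a precomputed set of high-information words (the helper below is hypothetical):

def best_word_features(words, best_words):
    # Keep only the tokens that appear in the precomputed best_words set
    return {word: True for word in words if word in best_words}
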
Example #11
def modelTrainingLexicon(trainingData, testData):
    print("--Lexicon Model--")
    tab = []
    dataLexiconFeature = []
    dataLexiconFeatureT = []
    for data in trainingData:
        booleanNeg = False
        pos_score = neg_score = obj_score = 0
        tagData = pos_tag(data[0])
        negationData = mark_negation(data[0])
        pos_score, neg_score, obj_score =tagCount(data,tagData,negationData,pos_score,neg_score,obj_score,booleanNeg)
        total = int(pos_score) - int(neg_score)
        if (total < 0):
            overall = 'neg'
        elif (total > 0):
            overall = 'pos'
        elif (total == 0):
            overall = 'neutre'
        tab.append(pos_score)
        tab.append(neg_score)
        tab.append(obj_score)
        feats = ({'positive': pos_score, 'negative': neg_score}, data[1])
        dataLexiconFeature.append(feats)

    for dataT in testData:
        booleanNegT = False
        pos_scoreT = neg_scoreT = obj_scoreT = 0
        tagData = pos_tag(dataT[0])
        negationDataT = mark_negation(dataT[0])
        pos_scoreT, neg_scoreT, obj_scoreT = tagCount(dataT, tagData, negationDataT, pos_scoreT, neg_scoreT, obj_scoreT,
                                                      booleanNegT)
        total = int(pos_scoreT) - int(neg_scoreT)

        tab.append(pos_scoreT)
        tab.append(neg_scoreT)
        tab.append(obj_scoreT)
        featsT = ({'positive': pos_scoreT, 'negative': neg_scoreT}, dataT[1])
        dataLexiconFeatureT.append(featsT)


    classifier = NaiveBayesClassifier.train(dataLexiconFeature)
    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)

    tabPr = []
    tabOut = []

    for i, (feat, ovAll) in enumerate(dataLexiconFeatureT):
        realSet[ovAll].add(i)
        predicted = classifier.classify(feat)
        tabOut.append(predicted)
        tabPr.append(predicted)
        testSet[predicted].add(i)


    print("Accuracy Naive Bayes for Lexicon Model : ", nltk.classify.util.accuracy(classifier, dataLexiconFeatureT))

    return realSet, testSet, tabPr, tabOut
Example #12
 def trainCustom(self, trainSet):
     #print("\n> Train set custom", trainSet)
     self._test_set = [
         ({word: (word in pt.applyTokenizer(x[0]))
           for word in trainSet}, x[1]) for x in __TRAIN_SET__
     ]
     #print("> Test Set: ", self._test_set)
     #self._training_set = apply_features(self.extractFeature, self._test_set)
     self._classifier = NaiveBayesClassifier.train(self._test_set)
Example #13
def create_model(pos_tweets, neg_tweets, neu_tweets, classifier_param='LinearSVC'):

    # filter away words that are less than 3 letters to form the training training_data
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets + neu_tweets:
        words = util.clean_text(words, True)
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        #words_filtered = [' '.join(w) for w in [ x for x in nltk.bigrams(words.split())]]
        tweets.append((words_filtered, sentiment))

    # make sure tweets are shuffled randomly
    shuffle(tweets)

    # get the training set and train the Classifier
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    max_specificity = -1
    best_classifier = None
    average_accuracy = 0.0

    # perform 10-fold cross validation
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:

        if classifier_param == "LinearSVC":
            classifier = SklearnClassifier(LinearSVC()).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "Tfid":
            # does TF-IDF weighting,
            # chooses the 1000 best features based on a chi2 statistic,
            # and then passes that into a multinomial naive Bayes classifier.
            pipeline = Pipeline([('tfidf', TfidfTransformer()), \
                                   ('chi2', SelectKBest(chi2, k=1000)), \
                                   ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "Bernoulli":
            classifier = SklearnClassifier(BernoulliNB()).train(training_set[traincv[0]:traincv[len(traincv)-1]])
        elif classifier_param == "NaiveBayes":
            classifier = NaiveBayesClassifier.train(training_set[traincv[0]:traincv[len(traincv)-1]])
        else:
            print("Classifier option not available: ", classifier_param)
            sys.exit(1)

        accuracy_of_classifier, specificity = \
            util.accuracy(classifier, tweets[testcv[0]:testcv[len(testcv)-1]])

        average_accuracy += accuracy_of_classifier
        if specificity > max_specificity:
            max_specificity = specificity
            best_classifier = classifier

    print("\naverage accuracy: ", average_accuracy/cv.n_folds)

    # save the classifier
    joblib.dump(best_classifier, "model/%s_classifier.pkl" % classifier_param)

    print("saved classifier")
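
Note: sklearn.cross_validation was removed in scikit-learn 0.20. On a current install the equivalent 10-fold loop would use sklearn.model_selection; a sketch under that assumption (not the original author's code):

from sklearn.model_selection import KFold

indices = list(range(len(training_set)))
cv = KFold(n_splits=10, shuffle=False)
for train_idx, test_idx in cv.split(indices):
    train_fold = [training_set[i] for i in train_idx]
    test_fold = [training_set[i] for i in test_idx]
    fold_classifier = NaiveBayesClassifier.train(train_fold)
    print(nltk.classify.util.accuracy(fold_classifier, test_fold))
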
Example #14
	def _loadClassifier(self):
		# Choose estimator
		estimator = ELEProbDist
		# Create the P(label) distribution 
		label_probdist = estimator(self._label_freqdist)	
		# Create the P(fval|label, fname) distribution 
		feature_probdist = {} 
		for ((label, fname), freqdist) in self._feature_freqdist.items(): 
			probdist = estimator(freqdist, bins=len(self._feature_values[fname])) 
			feature_probdist[label,fname] = probdist 		
		self._classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
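
The attributes read here (self._label_freqdist, self._feature_freqdist, self._feature_values) are assumed to hold the same counts NaiveBayesClassifier.train builds internally. A simplified standalone sketch of that bookkeeping (it skips NLTK's handling of feature names missing from a featureset):

from collections import defaultdict
from nltk import NaiveBayesClassifier
from nltk.probability import FreqDist, ELEProbDist

train = [({'nice': True}, 'pos'), ({'mean': True}, 'neg')]

label_freqdist = FreqDist()
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
for featureset, label in train:
    label_freqdist[label] += 1
    for fname, fval in featureset.items():
        feature_freqdist[label, fname][fval] += 1
        feature_values[fname].add(fval)

label_probdist = ELEProbDist(label_freqdist)
feature_probdist = {
    (label, fname): ELEProbDist(freqdist, bins=len(feature_values[fname]))
    for (label, fname), freqdist in feature_freqdist.items()
}
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
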
Example #15
 def run_train(self, mode='agreeable'):
     # mode in ['gender', 'age_group', 'extroverted', 'stable',
     # 'agreeable', 'conscientious', 'openness']
     train_input = []
     print(f"making train_input: {mode}")
     for infos in tqdm(self.train.values()):
         for info in infos['text']:  # process same label for 100 texts
             train_input.append((self.get_feature_dict(info), infos[mode]))
     print(f"running trainer... {mode}")
     self.classifier[mode] = NB.train(train_input)
     print("running trainer done")
Example #16
def main():
    rdr = CategorizedPlaintextCorpusReader('/home/mel/workspace/datascience/assignment5_kaggle/data/', r'.*\.txt', cat_pattern=r'(.*)\.txt')
    clf = NaiveBayesClassifier.train(list(make_training_data(rdr)))
    clf.show_most_informative_features(10)
    
    review_file = open("/home/mel/workspace/datascience/assignment5_kaggle/data/yelp_test_set/yelp_test_set_review.json")
    lines = review_file.readlines()
    output_file = open('/home/mel/workspace/datascience/assignment5_kaggle/output.csv', 'w+')
    
    for word in ('good', 'service'):
        print('probability {w!r} is useful: {p:.2%}'.format(
                                                              w = word, p = clf.prob_classify({word : True}).prob('useful')))
Example #17
    def evaluateclassifier(self, featureselection):
        positivecount=0
        negativecount=0
        negativetweets = []
        positivetweets = []
        #print('Evaluating Classifier')
        print(featureselection)
        with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'r', newline='') as f:
            #print 'Opening corpus file'
            reader = csv.reader(f)
            for row in reader:
                #Positive sentiment tweets
                if(row[0] == '4' and positivecount < self.corpuslength):
                    positivetweets.append(row[5])        
                    positivecount+=1        
                #Negative sentiment tweets
                if(row[0] == '0' and negativecount < self.corpuslength):
                    negativetweets.append(row[5])
                    negativecount+=1
        
        #print 'Generating Features' 
        self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positivetweets]
        self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativetweets]
        
        poscutoff = len(self.positivefeatures)
        negcutoff = len(self.negativefeatures)
        print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))
        trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
        
        testfeats = self.test(featureselection) 
        #testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]       
        print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
        classifier = NaiveBayesClassifier.train(trainfeats)
        print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
        
        #classifier.show_most_informative_features(20)
        
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set) 
        
        for i, (feats, label) in enumerate(testfeats):    
            refsets[label].add(i)    
            observed = classifier.classify(feats)  
            #print label, observed  
            testsets[observed].add(i)

        print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
        print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
        print('pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos']))
        print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
        print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
        print('neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg']))
Example #18
    def _train(self):
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            self._classifier = NaiveBayesClassifier.train(train_set)

            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)
Example #19
def train(filename):

  print('Reading data from the file ' + filename)
  labeled_featuresets = []
  with open(filename) as f:
    for line in f:
        sentence, category = line.split(' ,,, ',  1)
        labeled_featuresets.append((extract_features(sentence), category.strip()))

  print('Training started')
  classifier = NaiveBayesClassifier.train(labeled_featuresets)

  print('Training completed\n')
  return classifier
Example #20
    def __init__(self):
        """
        Gather data
        """
        positive = twitter_samples.strings('positive_tweets.json')
        negative = twitter_samples.strings('negative_tweets.json')
        self.stop_words = list(set(stopwords.words('english')))

        positive_tokens = twitter_samples.tokenized('positive_tweets.json')
        negative_tokens = twitter_samples.tokenized('negative_tweets.json')
        """
        Clean the data
        """
        positive_clean = []
        negative_clean = []

        for token in positive_tokens:
            positive_clean.append(self.clean(token))

        for token in negative_tokens:
            negative_clean.append(self.clean(token))

        positive_model_tokens = self.final_token_generator(positive_clean)
        negative_model_tokens = self.final_token_generator(negative_clean)
        """
        Use generator to make datasets
        """
        positive_dataset = [(token, "Positive")
                            for token in positive_model_tokens]

        negative_dataset = [(token, "Negative")
                            for token in negative_model_tokens]

        dataset = positive_dataset + negative_dataset
        """
        Shake it all about
        """
        random.shuffle(dataset)
        random.shuffle(dataset)
        random.shuffle(dataset)
        """
        Split them up
        """
        training = dataset[:7000]
        testing = dataset[7000:]
        """
        Train the classifier
        """
        self.classifier = NaiveBayesClassifier.train(training)
        """
Example #21
 def train_classifier(self, data):
     try:
         positive = self.cleaner.clean_tweets(data["positive"])
         negative = self.cleaner.clean_tweets(data["negative"])
         dataset = []
         for tokens in positive:
             dataset.append((dict([t, True] for t in tokens), +1))
         for tokens in negative:
             dataset.append((dict([t, True] for t in tokens), -1))
         random.shuffle(dataset)
         model = NaiveBayesClassifier.train(dataset)
         with open(self.get_model_path(), "wb") as analyser:
             analyser.write(pickle.dumps(model))
     except Exception as ex:
         print(ex)
Example #22
    def test_simple(self):
        training_features = [
            ({'nice': True, 'good': True}, 'positive'),
            ({'bad': True, 'mean': True}, 'negative'),
        ]

        classifier = NaiveBayesClassifier.train(training_features)

        result = classifier.prob_classify({'nice': True})
        self.assertTrue(result.prob('positive') > result.prob('negative'))
        self.assertEqual(result.max(), 'positive')

        result = classifier.prob_classify({'bad': True})
        self.assertTrue(result.prob('positive') < result.prob('negative'))
        self.assertEqual(result.max(), 'negative')
Example #23
 def run_naive_bayes(self, language):
     self.__check_language(language)
     util.time_log("starting nb...")
     ret_list = []
     self.load_data_reviews(language)
     for k_iter in range(0, self.k):
         util.time_log("learning...")
         classifier = NaiveBayesClassifier.train(
             self.training_data_text_vectorized_nb(language, k_iter))
         util.time_log("classifying")
         ret_list.append([
             classifier.classify(x)
             for x in self.test_data_text_vectorized_nb(language, k_iter)
         ])
     return ret_list
Example #24
    def build_classifier(self):

        #print "Creating a list of labels. If this is done, the previous init doesn't have to be"
        labels = ['arts','business','computers','home','recreation','science','shopping','knowledge']

        self.labeled_features = []
        for label in labels:
            print(label.upper())
            self.labeled_features.extend(self.build_data_set(label))
            print(self.labeled_features)

        print(self.labeled_features)

        print("Labeled Features: ", self.labeled_features)
        classifier = learner.train(self.labeled_features)
        classifier.show_most_informative_features()
        return classifier
Example #25
def main():
    mainDir="/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/corpus2"
    input="/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/tweets/cache"
    logger.info("Start app")
    documents = [(list(w.lower() for w in my_corpus.words(fileid)), categoryMapper(category))
                 for category in my_corpus.categories()
                 for fileid in my_corpus.fileids(category)]
    random.shuffle(documents)

    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[250:], featuresets[:50]
    clf = NaiveBayesClassifier.train(train_set)
    logger.info("Accuracy: " + str(nltk.classify.accuracy(clf, test_set)))
    ref = [cat for features, cat in test_set]
    test = [clf.classify(features) for features, cat in test_set]
    logger.info(clf.show_most_informative_features(20))
    logger.info("\n" + nltk.ConfusionMatrix(ref, test).pp())
    logger.info("Exit app")
Example #26
    def test_simple(self):
        training_features = [({
            'nice': True,
            'good': True
        }, 'positive'), ({
            'bad': True,
            'mean': True
        }, 'negative')]

        classifier = NaiveBayesClassifier.train(training_features)

        result = classifier.prob_classify({'nice': True})
        self.assertTrue(result.prob('positive') > result.prob('negative'))
        self.assertEqual(result.max(), 'positive')

        result = classifier.prob_classify({'bad': True})
        self.assertTrue(result.prob('positive') < result.prob('negative'))
        self.assertEqual(result.max(), 'negative')
Example #27
    def finalclassification(self):
        negative_words=[]
        positive_words=[]
        with open('positive.txt', 'r') as posSentences:
            for i in posSentences:
                posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posWords = [negativevalues.makeadict(posWords), 'pos']
                positive_words.append(posWords)
        with open('negative.txt', 'r') as negSentences:
            for i in negSentences:
                negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negWords = [negativevalues.makeadict(negWords), 'neg']
                negative_words.append(negWords)

        trainFeatures = positive_words[:] + negative_words[:]


        classifier = NaiveBayesClassifier.train(trainFeatures)    
        return classifier
Example #28
def train(records):
    global CUR_CL
    train_data = []
    for record in records:
        text = record[1]
        class_label = record[0]
        feats = features_from_text(text, class_label, stopwords=sw)
        train_data.append(feats)
    if CUR_CL is None:
        if CLASSIFIER == "NaiveBayesClassifier":
            classifier = NaiveBayesClassifier.train(train_data)
        elif CLASSIFIER == "sklearnLinSVC":
            pipeline = Pipeline(
                [
                    ("tfidf", TfidfTransformer()),
                    ("chi2", SelectKBest(chi2, k=1000)),
                    ("nb", LinearSVC(multi_class="ovr")),
                ]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == "BernoulliNB":
            pipeline = Pipeline(
                [("tfidf", TfidfTransformer()), ("chi2", SelectKBest(chi2, k=1000)), ("nb", BernoulliNB())]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == "MultinomialNB":
            pipeline = Pipeline(
                [("tfidf", TfidfTransformer()), ("chi2", SelectKBest(chi2, k=1000)), ("nb", MultinomialNB())]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print("Partial fitting.. \n\n")
        # Note: for NLTK's NaiveBayesClassifier, train() is a classmethod that
        # returns a brand-new model, so this call does not update CUR_CL in place;
        # the SklearnClassifier wrappers simply refit on train_data alone.
        CUR_CL.train(train_data)
    f = open("%s/%s.pickle" % (pickles_dir, "news_based_" + CLASSIFIER), "wb")
    pickle.dump(CUR_CL, f)
    f.close()
    print("%s/%s.pickle saved" % (pickles_dir, "news_based_" + CLASSIFIER))

    gc.collect()
Example #29
    def test_simple(self):
        training_features = [
            ({
                "nice": True,
                "good": True
            }, "positive"),
            ({
                "bad": True,
                "mean": True
            }, "negative"),
        ]

        classifier = NaiveBayesClassifier.train(training_features)

        result = classifier.prob_classify({"nice": True})
        self.assertTrue(result.prob("positive") > result.prob("negative"))
        self.assertEqual(result.max(), "positive")

        result = classifier.prob_classify({"bad": True})
        self.assertTrue(result.prob("positive") < result.prob("negative"))
        self.assertEqual(result.max(), "negative")
Example #30
def modelUnigram(trainData, testData):

    print("--MODEL UNIGRAM--")
    tab = []
    classifier = NaiveBayesClassifier.train(trainData)
    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)
    tabOut = []
    tabOver = []

    for i, (wordFeat, overall) in enumerate(testData):
        realSet[overall].add(i)
        predicted = classifier.classify(wordFeat)
        tabOut.append(predicted)
        tabOver.append(overall)
        tab.append(predicted)
        testSet[predicted].add(i)

    print("Accuracy Naive Bayes for Unigram Model : ", nltk.classify.util.accuracy(classifier, testData))

    return realSet, testSet, tab, tabOut, tabOver
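
The returned realSet/testSet pairs have the same shape as the reference/test sets used elsewhere in this collection, so (assuming 'pos'/'neg' labels) they can be scored with nltk.metrics:

realSet, testSet, tab, tabOut, tabOver = modelUnigram(trainData, testData)
print("pos precision:", nltk.metrics.precision(realSet['pos'], testSet['pos']))
print("pos recall:", nltk.metrics.recall(realSet['pos'], testSet['pos']))
print("pos F-measure:", nltk.metrics.f_measure(realSet['pos'], testSet['pos']))
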
Example #31
	def train(self,training_set=None):
		"""
		Trains the BOW NaiveBayes classifier.
		"""
		if (training_set == None):			
			training_set = [(sent, sent.certainty) for sent in self._corpus.sents()]
		#training_set = training_set[0:10]  # to compare with the earlier results
		#build features		
		self._build_bow_features(training_set)
		
		#build featuresets for each sentence
		labeled_featuresets = []
		for sent in training_set:
			featureset = self.sentenceFeatures(sent)
			labeled_featuresets.append((featureset,sent.certainty))

		debug('Size of training set: '+str(len(labeled_featuresets)))
		#pp = pprint.PrettyPrinter(indent=4)
		#pp.pprint(labeled_featuresets)
		#train the NaiveBayes
		self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
Example #32
def start():

    reviews = get_reviews()
    top_words = [i[0] for i in get_top_words(reviews, 2000)]

    # Generate Features Sets
    print ("Generate Feature_set for all documents: Started")
    feature_set = []
    for review, category in reviews:
        feature_set.append((get_features(review, top_words), category))

    print("Generate Feature_set for all documents: Completed")

    test_set, train_set = feature_set[:20000], feature_set[20000:]

    print("Training Started")
    classifier = NaiveBayesClassifier.train(train_set)
    print("Training Completed")

    print("Testing Now....")
    print(nltk.classify.accuracy(classifier, test_set))
Example #33
def create_classifier(feature_select, filename):
    posFeatures = []
    negFeatures = []
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    
      
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        elif tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)

    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    
    # uses all of the features for training; the classifier is pickled below rather than tested here
    trainFeatures = negFeatures[:] + posFeatures[:]

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)
    f = open(filename, 'wb')
    pickle.dump(classifier, f)
    f.close()
Example #34
    def train(self, training_set=None):
        """
		Trains the BOW NaiveBayes classifier.
		"""
        if (training_set == None):
            training_set = [(sent, sent.certainty)
                            for sent in self._corpus.sents()]
        #training_set = training_set[0:10]  # to compare with the earlier results
        #build features
        self._build_bow_features(training_set)

        #build featuresets for each sentence
        labeled_featuresets = []
        for sent in training_set:
            featureset = self.sentenceFeatures(sent)
            labeled_featuresets.append((featureset, sent.certainty))

        debug('Size of training set: ' + str(len(labeled_featuresets)))
        #pp = pprint.PrettyPrinter(indent=4)
        #pp.pprint(labeled_featuresets)
        #train the NaiveBayes
        self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
Example #35
def get_baseline_method(x_train,
                        y_train,
                        x_test,
                        y_test,
                        method=None,
                        keywords=None):
    def transform_features(sentence):
        words = sentence.lower().split()
        return dict(('contains(%s)' % w, True) for w in words)

    if 'nb' in method:
        x_train = list(map(transform_features, x_train))
        x_test = list(map(transform_features, x_test))
        train_set = list(zip(x_train, y_train))
        clf = NaiveBayesClassifier.train(train_set)
        score_test = np.array([clf.prob_classify(t).prob(1.0) for t in x_test])
        score_train = np.array(
            [clf.prob_classify(t).prob(1.0) for t in x_train])
    else:
        x_train = [extract_glove_feature(text) for text in x_train]
        x_test = [extract_glove_feature(text) for text in x_test]
        if 'randomforest' in method:
            clf = RandomForestClassifier(max_depth=5,
                                         n_estimators=10,
                                         max_features=1).fit(x_train, y_train)
            score_train = clf.predict_proba(x_train)[:, 1]
            score_test = clf.predict_proba(x_test)[:, 1]
        elif 'knn' in method:
            clf = KNeighborsClassifier(10).fit(x_train, y_train)
            score_train = clf.predict_proba(x_train)[:, 1]
            score_test = clf.predict_proba(x_test)[:, 1]
        elif 'gloverank' in method:
            from sklearn.metrics.pairwise import cosine_similarity
            keyword_doc = extract_glove_feature(keywords).reshape(1, 50)
            score_train = cosine_similarity(keyword_doc, x_train)[0]
            score_test = cosine_similarity(keyword_doc, x_test)[0]

    return score_test, y_test, score_train, y_train
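
A usage sketch for the Naive Bayes branch only; the labels are assumed to be the floats 0.0/1.0 because the scores above are read with prob(1.0):

x_train = ["great service and friendly staff", "terrible food, never again"]
y_train = [1.0, 0.0]
x_test = ["friendly staff and great food"]
y_test = [1.0]
score_test, y_test, score_train, y_train = get_baseline_method(
    x_train, y_train, x_test, y_test, method='nb')
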
Example #36
    def train(self):
        print('Classifier Training in progress....')
        poscutoff = len(self.positiveFeatures)
        negcutoff = len(self.negativeFeatures)
        print("Train Pos Cutoff: " + str(poscutoff) +
              " Train Neg Cutoff: " + str(negcutoff))
        trainfeats = self.positiveFeatures[:poscutoff] + \
            self.negativeFeatures[:negcutoff]

        testfeats = self.test()
        print('Train on %d instances, test on %d instances' %
              (len(trainfeats), len(testfeats)))
        self.classifier = NaiveBayesClassifier.train(trainfeats)
        print('accuracy:', accuracy(self.classifier, testfeats))

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = self.classifier.classify(feats)
            #print(label, observed)
            testsets[observed].add(i)

        print('pos precision:', nltk.metrics.precision(refsets['pos'],
                                                        testsets['pos']))
        print('pos recall:', nltk.metrics.recall(refsets['pos'],
                                                 testsets['pos']))
        print('pos F-measure:', nltk.metrics.f_measure(refsets['pos'],
                                                       testsets['pos']))
        print('neg precision:', nltk.metrics.precision(refsets['neg'],
                                                       testsets['neg']))
        print('neg recall:', nltk.metrics.recall(refsets['neg'],
                                                 testsets['neg']))
        print('neg F-measure:', nltk.metrics.f_measure(refsets['neg'],
                                                       testsets['neg']))
Example #37
def train(records):
    global CUR_CL
    train_data = []
    for record in records:
        text = record[1]
        class_label = record[0]
        feats = features_from_text(text, class_label, stopwords=sw)
        train_data.append(feats)
    if CUR_CL is None:
        if CLASSIFIER == 'NaiveBayesClassifier':
            classifier = NaiveBayesClassifier.train(train_data)
        elif CLASSIFIER == 'sklearnLinSVC':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', LinearSVC(multi_class='ovr'))])
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == 'BernoulliNB':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', BernoulliNB())])
            classifier = SklearnClassifier(pipeline).train(train_data)
        elif CLASSIFIER == 'MultinomialNB':
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print('Partial fitting.. \n\n')
        CUR_CL.train(train_data)
    f = open("%s/%s.pickle" % (pickles_dir, 'news_based_' + CLASSIFIER), 'wb')
    pickle.dump(CUR_CL, f)
    f.close()
    print("%s/%s.pickle saved" % (pickles_dir, 'news_based_' + CLASSIFIER))

    gc.collect()
Example #38
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


def classify_tweet(tweet):
    return classifier.classify(extract_features(nltk.word_tokenize(tweet)))


pos_tweets = read_tweets('Training_Data/Social_Inspirer.txt', 'positive')
neg_tweets = read_tweets('Training_Data/Negative.txt', 'negative')

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

word_features = get_word_features(get_words_in_tweets(tweets))

training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)

test_tweets = read_tweets('Test_Tweets/Tweets_Positive.txt', 'positive')
test_tweets.extend(read_tweets('Test_Tweets/Tweets_Negative.txt', 'negative'))
total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
result = accuracy / total * 100
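
get_word_features and get_words_in_tweets are not shown. A minimal sketch of the helpers this style of script usually pairs with extract_features (both bodies are assumptions, not the original definitions):

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # FreqDist orders words by frequency; its keys become the feature vocabulary
    return list(nltk.FreqDist(wordlist).keys())
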
Example #39
        for key, value in {'noun': nouns, 'verb': verbs, 'adj': adj, 'adv': adv}.items():
            value.sort()
            for idx, word in enumerate(value[:3]):
                features[key + '-' + str(idx)] = word[1].lower()

    return features

train_set = []
for sent in train_data:
    tagged_sent = [(word[2], word[0]) for word in sent]
    for idx, word in enumerate(sent):
        features = wsd_features(tagged_sent, idx)
        sense = word[1]
        train_set.append((features, sense))

classifier = NaiveBayesClassifier.train(train_set)

code.interact(local=locals())


class Concept(object):
    def __init__(self, *args):
        if args:
            synsets = [wordnet.synsets(x) for x in args]
            self.synsets = self._common_synsets(synsets)
            if len(args) > 1:
                isas = [self._isa_synsets(synsets, x) for x in synsets]
                self.synsets = set.union(self.synsets, *isas)
        else:
            self.synsets = set()
Example #40
    tweets.append((words_filtered, sentiment))


# extract the word features out from the training data
word_features = get_word_features(get_words_in_tweets(tweets))

# get the training set and train the Naive Bayes Classifier
print("Training with the Naive Bayes Classifier (by NLTK)...\n")
training_set = nltk.classify.util.apply_features(extract_features, tweets)
cv = cross_validation.KFold(len(training_set), n_folds=number_cross, indices=True, shuffle=False, random_state=None, k=None)

totalaccuracy = 0
test = { 'positive': 0, 'negative': 0, 'totpos': 0, 'totneg': 0 }
for tweet, testcv in cv:
    classifier  = NaiveBayesClassifier.train(training_set[tweet[0]:tweet[len(tweet)-1]])
    accuracy    = nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]])

    totalaccuracy += accuracy

    classified     = classify_tweet(in_tweets[testcv[0]][0])
    # print 'accuracy:', accuracy
    # print ("Tweet: ... : Pre-class: %s || Classificado como: %s" % (in_tweets[testcv[0]][1], classified))
    if classified == 'positive':
        test['positive'] += 1
    else:
        test['negative'] += 1

    if in_tweets[testcv[0]][1] == 'positive':
        test['totpos'] += 1
    else:
Example #41
 def __init__(self, *args, **kwargs):
     self.load_training_data()
     # train classifier
     self.word_features = nltk.FreqDist(self.all_words).keys()
     training_set = nltk.classify.util.apply_features(self.extract_features, self.training_tweets)
     self.classifier = NaiveBayesClassifier.train(training_set)
Example #42
 def train(self, pairs):
     features = [(self.features(x,y), judgment) for x,y,judgment in pairs]
     self.model = NaiveBayesClassifier.train(features)
     #self.model = MaxentClassifier.train(features, max_iter=10)
     print(self.model.most_informative_features())
Example #43
for (n, g) in names:
    print(n)
    print(g)
    break

featuresets = [(gender_features(n), g) for (n, g) in names]
featuresets

len(featuresets)
train_set, test_set = featuresets[500:], featuresets[:500] 
train_set


from nltk import NaiveBayesClassifier

nb_classifier = NaiveBayesClassifier.train(train_set)

nb_classifier.classify(gender_features('Gary'))

nb_classifier.classify(gender_features('Grace'))

from nltk import classify
 
classify.accuracy(nb_classifier, test_set)

nb_classifier.show_most_informative_features(5)
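
gender_features is not defined in this snippet; the classic NLTK-book version keys on the final letter of the name. A minimal sketch under that assumption:

def gender_features(name):
    # The last letter is a surprisingly strong cue for English first names
    return {'last_letter': name[-1].lower()}
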




Example #44
iteracao = iteracao + 1
arquivoMedicoes = open('medicoes_analise_threads_' + str(iteracao) + '.txt', 'w')
precisao = accuracy(classificador, featuresClassificados) * 100
arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(precisao))
arquivoMedicoes.close()
features = resultadoPositivos.get() + resultadoNegativos.get() + resultadosNeutros.get()
pool1.terminate()
pool1.close()
pool2.terminate()
pool2.close()
pool3.terminate()
pool3.close()
if precisao > 50:
	features.extend(featuresClassificados)
	shuffle(features)
	classificador = NaiveBayesClassifier.train(features)
	arquivoClassificador = open('classificador.pickle', 'wb')
	dump(classificador, arquivoClassificador, protocol=HIGHEST_PROTOCOL)
	arquivoClassificador.close()
	arquivoPositivos = open('positivos.json', 'w')
	ujson.dump(positivos, arquivoPositivos)
	arquivoPositivos.close()
	arquivoNegativos = open('negativos.json', 'w')
	ujson.dump(negativos, arquivoNegativos)
	arquivoNegativos.close()
	arquivoNeutros = open('neutros.json', 'w')
	ujson.dump(neutros, arquivoNeutros)
	arquivoNeutros.close()
arquivoResultados = open('resultados_sem_stopwords' + str(iteracao) + '.csv', 'w', newline='')
w = writer(arquivoResultados, delimiter=',')
linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
Example #45
import base64
import pickle
import zlib

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")
classifier = NaiveBayesClassifier.train(training)
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)
compScore = 0
animalScore = 0
for word in testing:
    if (
        word[len(word) - 1] == "."
        or word[len(word) - 1] == ","
        or word[len(word) - 1] == "?"
        or word[len(word) - 1] == "!"
    ):
        word = word[: len(word) - 1]
    result = classifier.classify({word: word})
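
Reading the classifier back just reverses the compression steps above:

with open("PickledClassifier.txt", "rb") as inobj:
    compressed = inobj.read()
restored = pickle.loads(zlib.decompress(base64.b64decode(compressed)))
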
Example #46
def start():
    global classifications_collection, tweets_collection, global_count
    sw = stopwords.words('english')
    thr = 5
    refactored_tweets = {}
    records = tweets_collection.find()
    for record in records:
        tweet = record['text']
        tmp_classifiers = record['classifications']
        for clasfId, classId in tmp_classifiers.items():
            if clasfId not in refactored_tweets.keys():
                refactored_tweets[clasfId] = []
            refactored_tweets[clasfId].append({'text': tweet, 'classId':classId})
    
    records = None

    gc.collect()    

    for classification in classifications_collection.find():
        tweets = []
        classification_name = classification['classification']
        classification_id = str(classification["_id"])
        
        classes = classification['classes']
        
        #records = tweets_collection.find({"clasfId":classification_id})

        records = []
        try:
            records = refactored_tweets[classification_id]
        except KeyError:
            print("No tweets for classification ", classification_name)
            continue
        records_count = len(records)
        print(classification_name, records_count)

        if classification_id in global_count.keys():
            if int(records_count/thr)>global_count[classification_id]:
                print("Exceeded threshold. Training started")
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle"%classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()
                global_count[classification_id] = int(records_count/thr)
            else:
                pass
        else:
            global_count[classification_id] = int(records_count/thr)
            if global_count[classification_id] >=1:
                print("New classification or just started monitor")
                for record in records:
                    tweet = record['text']
                    class_id = record['classId']
                    class_label = get_class_label(class_id, classes)
                    feats = features_from_tweet(tweet, class_label, word_indicator, stopwords=sw)
                    
                    tweets.append(feats)
                classifier = NaiveBayesClassifier.train(tweets)
                f = open("%s.pickle"%classification_name, 'wb')
                pickle.dump(classifier, f)
                f.close()