def train(self, training_corpus):
    """Train ``self.classifier`` on the non-denied entries of a corpus.

    ``training_corpus`` must be a non-empty list or tuple whose first item
    is a dict. Each entry is read through its ``"text"``, ``"polarity"``
    and ``"denied"`` keys; only entries whose ``"denied"`` value equals 0
    contribute a ``(features, polarity)`` training pair.
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)
    featureset = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            featureset.append(
                (twit_features(entry["text"]), entry["polarity"]))
    self.classifier = NaiveBayesClassifier.train(featureset)
def get_sentiment_data(query, training_set):
    """Classify the tweets matching ``query`` and summarise them per timestamp.

    A naive Bayes classifier is trained from the tab-separated file
    ``training/<training_set>/training.txt`` (label in the first column,
    text in the second). Every tweet returned by ``grab_tweets(query)`` is
    classified and grouped by its ``created_at`` value.

    Returns a dict mapping each ``created_at`` value to the fraction of its
    tweets that were labelled ``'1'``.
    """
    labelled = []
    with open('training/' + training_set + '/training.txt') as handle:
        for row in handle:
            fields = row.split('\t')
            labelled.append((get_features(fields[1]), fields[0]))
    clf = NaiveBayesClassifier.train(labelled)

    tweets = grab_tweets(query)
    print("HERE")
    classified = {}
    for tweet in tweets:
        stamp = tweet.created_at
        label = clf.classify(get_features(tweet.text))
        classified.setdefault(stamp, []).append(label)
    print(classified)

    returndata = {}
    for stamp, labels in classified.items():
        # share of tweets at this timestamp labelled '1'
        returndata[stamp] = float(labels.count('1')) / len(labels)
    print(returndata)
    return returndata
Example #3
0
def nltk_model():
    """Train and evaluate nltk's naive Bayes classifier on the names dataset.

    Names are read from ``MALE_FILE`` and ``FEMALE_FILE`` (one per line),
    shuffled, featurized with ``nltk_featurize`` and split at ``TRAIN_PCT``.
    The test accuracy and the 10 most informative features are printed.
    """
    # (name, gender) tuples: all male names first, then all female names
    all_names = []
    for path, gender in ((MALE_FILE, "male"), (FEMALE_FILE, "female")):
        with open(path, "r") as handle:
            # rstrip drops the trailing newline / whitespace of each line
            all_names.extend((line.rstrip(), gender) for line in handle)

    # sanity check on the expected dataset size
    assert len(all_names) == 7944

    random.shuffle(all_names)

    # each element is ({'feature_type': feature_value}, gender)
    features = []
    for name, gender in all_names:
        features.append((nltk_featurize(name), gender))

    split_pt = int(TRAIN_PCT * len(features))
    train_set = features[:split_pt]
    test_set = features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)
    accuracy_pct = int(100 * nltk.classify.accuracy(nb, test_set))
    print("accuracy = {0} %".format(accuracy_pct))
    nb.show_most_informative_features(10)
def test_raw_mail(org_email):
	"""Build a bag-of-words feature dict for one raw email.

	The text is tokenized with ``word_tokenize``; each token is lower-cased
	and passed through ``word_limit.lemmatize``. Every resulting token that
	is not in ``stpwords`` becomes a key mapped to ``True``.

	Returns the ``{token: True}`` dict.
	"""

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower()) for key in
	word_tokenize(org_email)]
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	# NOTE(review): everything below follows the unconditional return above
	# at the same indentation, so it is unreachable as written. It looks like
	# module-level script code that ended up inside this function - confirm
	# against the original file before relying on it.

	#Extracting the features (tokenized, stemmed, non-stopword emails) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	#Splitting the test and training data sets from the whole email set features
	# (first 10% of feature_sets is the test set, the remainder is the training set)
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	#Printing the accuracy of the machine
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	
	#Printing the top 50 features
	classifier.show_most_informative_features(50) 

	#Printing the spam and ham labels
	print ('labels:',classifier.labels())

	#Classification of user entered email (loops forever; there is no exit condition)
	while(True):
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
Example #5
0
    def __init__(self, chatbot, **kwargs):
        """Train a naive Bayes classifier that recognises time questions.

        Keyword arguments:
            positive: optional list of sentences that ask for the time;
                replaces the built-in examples when given.
            negative: optional list of sentences that do not ask for the
                time; replaces the built-in examples when given.

        The trained classifier is stored in ``self.classifier``; label 1
        means "time question", label 0 means anything else.
        """
        super().__init__(chatbot, **kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = kwargs.get('positive', [
            'what time is it',
            'hey what time is it',
            'do you have the time',
            'do you know the time',
            'do you know what time it is',
            'what is the time',
        ])

        # Each entry must end with a comma: adjacent string literals are
        # silently concatenated into a single training example otherwise.
        self.negative = kwargs.get('negative', [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'thyme is my favorite herb',
            'do you have time to look at my essay',
            'how do you have the time to do all this',
            'what is it',
        ])

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        train_set = [
            (self.time_question_features(text), n) for (text, n) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
    def __init_naive_bayes( self ):
        """
        Create and train the NaiveBayes classifier.

        The files listed in ``corpora/corpus2/spam`` and
        ``corpora/corpus2/ham`` are turned into labelled feature sets by
        ``self.__make_featured_set`` and used to train ``self.classifier``.

        Raises:
            Exception: wraps any error raised while building or training;
                its args are the prefix message, the original exception
                type name, the file name, the line number and the original
                message.
        """
        try:
            corpus = 'corpus2'

            path = os.path.join('corpora/', corpus)
            spam_path = os.path.join(path, 'spam')
            ham_path = os.path.join(path, 'ham')

            spam_dir = os.listdir(spam_path)
            ham_dir = os.listdir(ham_path)

            train_spam_filelist = [os.path.join(spam_path, f) for f in spam_dir]
            train_ham_filelist = [os.path.join(ham_path, f) for f in ham_dir]

            train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
            train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
            train_set = train_spam_set + train_ham_set

            self.classifier = NaiveBayesClassifier.train(train_set)

        except Exception:
            # Only wrap ordinary errors; KeyboardInterrupt / SystemExit must
            # propagate untouched instead of being converted to Exception.
            exc_type, exc_value, exc_tb = sys.exc_info()
            raise Exception(
                "Unexpected error in SpamFilter: __spamFilter:",
                exc_type.__name__,
                os.path.basename(exc_tb.tb_frame.f_code.co_filename),
                exc_tb.tb_lineno,
                # str() instead of the deprecated ``.message`` attribute
                str(exc_value))
def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.

    feature_extractor and **kwargs are forwarded unchanged to
    make_train_test_sets, which supplies the training set and the two test
    sets (spam and ham) of (features, label) data.
    '''

    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train on the combined training set, then score each test set separately
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print ('Test Spam accuracy: {0:.2f}%'
       .format(100 * spam_accuracy))
    print ('Test Ham accuracy: {0:.2f}%'
       .format(100 * ham_accuracy))

    # Show the top 20 informative features. The method prints the table
    # itself and returns None, so its result must not be printed.
    classifier.show_most_informative_features(20)
Example #8
0
def train_nltk(data, labels):
    '''
    Returns the trained nltk naive Bayes model with the best fold accuracy.

    Inputs
    ---------
    data -- np.array of tuples
    labels -- indexed with the same fold indices as data

    The data is split into N_FOLDS shuffled folds; one model is trained per
    fold, its accuracy on the held-out part is printed, and the model with
    the highest accuracy is kept. Its 30 most informative features are shown
    before it is returned.
    '''
    # Folds are shuffled: only the post text itself is assumed to matter for
    # now, not which users interact with each other.
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model, max_acc = None, float('-inf')
    for train_index, test_index in kf:
        posts_train, tags_train = data[train_index], labels[train_index]
        posts_test, tags_test = data[test_index], labels[test_index]

        feats_train = bulk_extract_features(posts_train)
        feats_test = bulk_extract_features(posts_test)

        train_set = zip(feats_train, tags_train)
        test_set = zip(feats_test, tags_test)

        model = nbc.train(train_set)

        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))
        if acc > max_acc:
            best_model, max_acc = model, acc
    best_model.show_most_informative_features(30)
    return best_model
Example #9
0
    def __init__(self, **kwargs):
        """Train a naive Bayes classifier on built-in time-question examples.

        Sentences in ``self.positive`` are labelled 1 and sentences in
        ``self.negative`` are labelled 0; each sentence is featurized with
        ``self.time_question_features`` and the trained classifier is stored
        in ``self.classifier``.
        """
        super(TimeLogicAdapter, self).__init__(**kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = [
            'what time is it',
            'do you know the time',
            'do you know what time it is',
            'what is the time'
        ]

        self.negative = [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'what is'
        ]

        # Negative examples come first, then the positive ones.
        train_set = []
        for label, sentences in ((0, self.negative), (1, self.positive)):
            for sentence in sentences:
                train_set.append(
                    (self.time_question_features(sentence), label))

        self.classifier = NaiveBayesClassifier.train(train_set)
Example #10
0
def category_by_movie():
    """Train a naive Bayes classifier on the movie_reviews corpus.

    Each review becomes a dict of ``contains(<word>)`` booleans over the
    first 2000 keys of the corpus frequency distribution. The first 100
    shuffled documents are held out as the test set, the rest is used for
    training, and the accuracy on the held-out set is printed.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # NOTE(review): this presumably relies on keys() being ordered by
    # decreasing frequency - confirm for the installed NLTK version.
    word_features = list(all_words.keys())[:2000]

    def document_features(document):
        document_words = set(document)  # set for O(1) membership tests
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Score on the held-out documents; scoring on train_set would only
    # measure how well the training data was memorised.
    print(classify.accuracy(classifier, test_set))
    def train(self):
        """Train ``self.classifier`` from the catalog's noun terms and subjects.

        For every catalog id present in both the ``noun_terms`` and the
        ``Subject`` index, a feature dict is built that maps every known noun
        to 0, with the nouns of that object set to 1. One training pair is
        emitted per subject tag of the object. Nothing is trained when no
        training pair was produced.
        """
        catalog = getToolByName(self, "portal_catalog")

        # Baseline features: every noun known to the catalog, marked absent.
        presentNouns = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

        subjectIndex = catalog._catalog.getIndex("Subject")
        nounTermsIndex = catalog._catalog.getIndex("noun_terms")

        # Internal catalog ids of the objects that have noun terms ...
        idsWithNouns = IISet(nounTermsIndex._unindex.keys())
        # ... and of the objects that have subjects.
        idsWithSubjects = IISet(subjectIndex._unindex.keys())

        trainingData = []
        for cid in intersection(idsWithSubjects, idsWithNouns):
            nouns = nounTermsIndex._unindex[cid]
            tags = subjectIndex._unindex[cid]
            nounPresence = presentNouns.copy()
            nounPresence.update((noun, 1) for noun in nouns)
            # The same feature dict is paired with each of the object's tags.
            trainingData.extend((nounPresence, tag) for tag in tags)

        if trainingData:
            self.classifier = NaiveBayesClassifier.train(trainingData)
Example #12
0
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Train each model ``n`` times on reshuffled data and report performance.

    For every model name, ``featureslist`` is shuffled in place and split
    with ``buildsets`` before each of the ``n`` runs; the per-run measures
    from ``evaluate`` are summed element-wise and handed to
    ``printperformance``.

    Returns a list holding the last classifier built for each model.
    """
    allclassifiers = []
    for name in ('Naive Bayes', 'Logistic Regression', 'Linear SCV'):
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            elif name == 'Linear SCV':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)

            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            # Running element-wise sum of the measures over the n runs
            perfmeasures_n = (perfmeasures_i if i == 0
                              else map(add, perfmeasures_n, perfmeasures_i))

        # Store last classifier built per model
        allclassifiers.append(spamclassifier)

        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)

    return allclassifiers
Example #13
0
def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute averaged evaluation metrics over the K-fold splits of the data.

    For every (train_set, test_spam_set, test_ham_set) triple produced by
    utils.get_kfold_data, a naive Bayes classifier is trained and evaluated.
    A predicted label of 0 on a spam test item counts as a true positive and
    a predicted label of 1 on a ham test item counts as a true negative.

    Returns a tuple (precision, recall, F1, spam accuracy in percent,
    ham accuracy in percent), each averaged over num_folds.

    A ratio whose denominator is zero (for example precision when nothing
    was predicted as spam) contributes 0.0 instead of raising
    ZeroDivisionError.
    '''
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing the run.
        return numerator / float(denominator) if denominator else 0.0

    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0

        for test in test_spam_set:
            if classifier.classify(test[0]) == 0:
                true_positive += 1
            else:
                false_negative += 1

        for test in test_ham_set:
            if classifier.classify(test[0]) == 1:
                true_negative += 1
            else:
                false_positive += 1

        precision = _ratio(true_positive, true_positive + false_positive)
        recall = _ratio(true_positive, true_positive + false_negative)
        F1 += _ratio(2 * precision * recall, precision + recall)
        # Spam accuracy uses the same formula as recall: TP / (TP + FN).
        spam_accuracy += recall
        ham_accuracy += _ratio(true_negative, true_negative + false_positive)
        total_precision += precision
        total_recall += recall

    return total_precision/num_folds, total_recall/num_folds, F1/num_folds, spam_accuracy*100/num_folds, ham_accuracy*100/num_folds
Example #14
0
 def train_classifiers(self):
     """Train one naive Bayes classifier per word in ``self.senses``.

     ``self.senses[word][senseId]`` is iterated as a collection of vectors;
     each vector is converted with ``dict(...)`` and paired with its sense
     id. The classifier for a word is stored in ``self.classifiers[word]``.
     """
     for word in self.senses:
         vectors_by_sense = self.senses[word]
         train_set = [
             [dict(lsa_vector), senseId]
             for senseId in vectors_by_sense
             for lsa_vector in vectors_by_sense[senseId]
         ]
         self.classifiers[word] = NaiveBayesClassifier.train(train_set)
Example #15
0
def training(features, method, proportion_training):
    """Split ``features`` and train a classifier on the training part.

    Args:
        features: sequence of labelled feature sets; it is sliced, so it
            must support ``len`` and slicing.
        method: name of the classifier to train; only ``'NaiveBayes'`` is
            supported.
        proportion_training: fraction of ``features`` (taken from the
            front) used for training; the remainder is the testing set.

    Returns:
        A tuple ``(training_set, testing_set, classifier)``.

    Raises:
        ValueError: if ``method`` is not a supported classifier name.
    """
    # Fail with a clear message instead of an UnboundLocalError at the
    # return statement when no branch assigned ``classifier``.
    if method != 'NaiveBayes':
        raise ValueError("Unsupported training method: %r" % (method,))

    split_index = int(proportion_training * len(features))
    training_set = features[:split_index]
    testing_set = features[split_index:]

    classifier = NaiveBayesClassifier.train(training_set)

    return training_set, testing_set, classifier
Example #16
0
  def train(self, foldPercent=.8):
    """Split the built features and train ``self.classifier``.

    The first ``foldPercent`` share of ``self.buildFeatures()`` is stored in
    ``self.setTrain`` and used for training; the rest goes to
    ``self.setTest``.
    """
    features = self.buildFeatures()
    foldIndex = int(foldPercent * len(features))

    self.setTrain, self.setTest = features[:foldIndex], features[foldIndex:]

    self.classifier = nbc.train(self.setTrain)
Example #17
0
def train(features, samples_proportion):
    """Split ``features`` and train a naive Bayes classifier.

    The leading ``samples_proportion`` share of ``features`` is the training
    set (handed to the classifier as a tuple) and the rest is the test set.
    Both sizes are printed.

    Returns ``(train_set, test_set, classifier)``.
    """
    train_size = int(len(features) * samples_proportion)
    train_set = features[:train_size]
    test_set = features[train_size:]
    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')
    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
Example #18
0
def textClass():
    """Train rating and usefulness classifiers from the records in all.txt.

    Up to 150000 records are read with ``readRec``; reading stops early at
    the first empty record. For each record the review words (stripped of
    punctuation) are turned into a feature dict with ``word_feats`` and
    paired once with the record's score and once with the class computed by
    ``parse4ftrs``. The average and maximum record length are printed.

    Returns:
        ``(rate_cl, use_cl)``: the classifier trained on the ratings and the
        classifier trained on the usefulness classes.
    """
    max_records = 150000

    ratings = []      # (word features, rating label) pairs
    usefulness = []   # (word features, usefulness label) pairs

    tot_recs = 0      # records successfully read so far
    len_tot = 0
    mlen = 0

    # parse the file and create the lists to be passed to the NBClassifiers;
    # the with-block closes the file even if parsing a record fails
    with open("all.txt") as dbFile:
        while tot_recs < max_records:
            if tot_recs % 1000 == 0:
                print("num records: %s" % tot_recs)
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                break
            # Count only records that were actually read, so the average
            # below is not skewed by the failed end-of-file attempt.
            tot_recs += 1

            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])

            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str(prs_rec["class"])

            # word feature dictionary, shared by both training pairs
            wfd = word_feats(review_text)

            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))

    avg_len = len_tot / tot_recs if tot_recs else 0
    print("avg length: %s" % avg_len)
    print("max len: %s" % mlen)

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """Train a naive Bayes classifier and report its test accuracy.

    The classifier is trained on ``train_set`` (spam + ham) with NLTK's
    ``NaiveBayesClassifier.train``. Its accuracy on ``test_spam`` and on
    ``test_ham`` is printed, followed by the 20 most informative features.
    """
    classifier = NaiveBayesClassifier.train(train_set)
    print ("Test Spam accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print ("Test Ham accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_ham)))
    # The method prints the table itself and returns None; wrapping it in a
    # print would emit a stray "None" line.
    classifier.show_most_informative_features(20)
Example #20
0
def train(features, samples_proportion):
    """Split ``features`` and train a naive Bayes classifier on the first part.

    ``samples_proportion`` is the share of ``features`` (from the front)
    used for training; the remainder becomes the test set. The size of both
    sets is printed.

    Returns ``(train_set, test_set, classifier)``.
    """
    cut = int(len(features) * samples_proportion)
    train_set = features[:cut]
    test_set = features[cut:]
    for title, subset in (('Training', train_set), ('Test', test_set)):
        print (title + ' set size = ' + str(len(subset)) + ' emails')
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
Example #21
0
def buildClassifier(hamDir, spamDir):
    """Evaluate a 70:30 split, then train and save a full-size classifier.

    Every file matched by ``<spamDir>/*`` is labelled ``'spam'`` and every
    file matched by ``<hamDir>/*`` is labelled ``'ham'``. The emails are
    shuffled and featurized with ``emailFeatures``. A classifier trained on
    the first 70% is scored on the remaining 30% and the accuracy is
    printed; a second classifier trained on all emails is then passed to
    ``saveClassifier`` under the name ``full-classifier.pickle``.
    """
    def _read_all(directory):
        # glob instead of os.listdir so hidden files are ignored; the
        # with-block closes each file even if reading it fails.
        contents = []
        for path in glob.glob(directory + "/*"):
            with open(path) as handle:
                contents.append(handle.read())
        return contents

    allEmails = [(text, 'spam') for text in _read_all(spamDir)]
    allEmails += [(text, 'ham') for text in _read_all(hamDir)]

    # Shuffle so that the 70:30 accuracy check sees both labels in each part.
    random.shuffle(allEmails)

    # One (features, label) pair per email
    features = [(emailFeatures(text), label) for (text, label) in allEmails]

    # 70:30 ratio for training:testing
    print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
    totalSize = int(len(features) * 0.7)
    trainingEmails, testingEmails = features[:totalSize], features[totalSize:]

    print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
    classifier = NaiveBayesClassifier.train(trainingEmails)
    print(classify.accuracy(classifier, testingEmails))

    print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
    classifier = NaiveBayesClassifier.train(features)

    saveClassifier(classifier, "full-classifier.pickle")
0
    def __init__(self,classifierType):
        """Train a naive Bayes classifier from ``sfIsGood.csv``.

        The CSV file next to this module is read, skipping its first row.
        Column 0 is used as the title, column 3 as the body, column 6 is
        compared against ``'invalid'`` and column 10 is the driver label.

        Args:
            classifierType: ``'driver'`` trains on the rows that are not
                marked invalid, using the driver column as label;
                ``'invalid'`` trains on all rows, using ``'True'`` /
                ``'False'`` (whether the row is marked invalid) as label.

        Raises:
            ValueError: if ``classifierType`` is neither ``'driver'`` nor
                ``'invalid'``.

        Sets ``self.word_features``, ``self.training_set`` and
        ``self.classifier``.
        """
        if classifierType not in ('driver', 'invalid'):
            # Previously this path ended in a NameError on ``documents``.
            raise ValueError("Unknown classifierType: %r" % (classifierType,))

        samples = []  # (title, body, label) triples for the chosen type

        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            next(spamreader, None)  # skip the first (header) row
            for row in spamreader:
                is_invalid = (row[6] == 'invalid')
                if classifierType == 'invalid':
                    # Every row is used, including the invalid ones.
                    samples.append((row[0], row[3], str(is_invalid)))
                elif not is_invalid:
                    samples.append((row[0], row[3], row[10]))

        words = []
        documents = []
        for title, body, label in samples:
            # Tokenize once and reuse the tokens for both the vocabulary
            # and the document itself.
            tokens = nltk.word_tokenize(title) + nltk.word_tokenize(body)
            words += tokens
            documents.append((tokens, label))
        random.shuffle(documents)

        all_words = nltk.FreqDist(w.lower() for w in words)
        # NOTE(review): presumably relies on keys() being ordered by
        # decreasing frequency - confirm for the installed NLTK version.
        self.word_features = list(all_words.keys())[:500]
        self.training_set = [(self.document_features(d), c) for (d,c) in documents]
        self.classifier = NaiveBayesClassifier.train(self.training_set)
0
    def naives_classifier(self, training_set, dev_set, log=0):
        """Train a naive Bayes classifier and print its dev-set accuracy.

        When ``log`` equals 1 the 20 most informative features are shown as
        well. Returns the trained classifier.
        """
        classifier = NaiveBayesClassifier.train(training_set)
        dev_percent = classify.accuracy(classifier, dev_set) * 100

        print('Naive Bayes accuracy dev percent: ', dev_percent)
        show_features = (log == 1)
        if show_features:
            classifier.show_most_informative_features(20)

        return classifier
def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes classifier.

    A classifier is trained on the labelled names of the ``names`` corpus
    (``male.txt`` and ``female.txt``), skipping the first 500 entries, and
    then applied to ``user_name``.

    Args:
        user_name: the name to classify.
        classifier: accepted for backward compatibility only; as before, a
            fresh classifier is trained on every call and this argument is
            not used.

    Returns:
        The predicted label, ``'male'`` or ``'female'``.
    """
    def _name_features(name):
        # The classifier needs a feature dict, not a raw string. The last
        # letter is the feature used in the NLTK book's gender example.
        return {'last_letter': name[-1:].lower()}

    # A distinct local name: assigning to ``names`` here would shadow the
    # corpus and raise UnboundLocalError on ``names.words``.
    labeled_names = (
        [(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')]
    )
    features = [(_name_features(name), gender) for (name, gender) in labeled_names]
    training_set = features[500:]
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier.classify(_name_features(user_name))
Example #25
0
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
    """Train a sentiment classifier from two tab-separated tweet files.

    Args:
        positiveFile: path of the file whose rows are labelled ``"pos"``.
        negativeFile: path of the file whose rows are labelled ``"neg"``.
        nOccurrences: minimum number of occurrences a feature needs to be
            kept in the saved feature list.
        trainProportion: share of the shuffled tweets used for training;
            the remainder is used to measure accuracy.

    The text is taken from the second column of each row and featurized with
    ``featurify``. Side effects: writes ``features.lst``,
    ``classifier.pickle`` and ``features.pickle`` in the working directory
    and prints progress, accuracy and the most informative features.
    """
    tweetfeats = []
    masterfeats = {}
    for fn in (positiveFile, negativeFile):
        theclass = "neg" if fn == negativeFile else "pos"
        # The with-block closes the input file; it used to stay open.
        with open(fn, 'r') as infile:
            for line in csv.reader(infile, delimiter='\t'):
                text = line[1]
                if len(line) != 9:
                    # rows without exactly 9 columns are echoed
                    print(text)
                feat = featurify(text)
                for name in feat:
                    # The first occurrence counts as 1 (it used to be
                    # recorded as 0, undercounting every feature by one).
                    masterfeats[name] = masterfeats.get(name, 0) + 1
                if len(feat) > 0:
                    tweetfeats.append((feat, theclass))

    # Keep only the features seen at least nOccurrences times.
    masterfeats = dict(
        (name, count) for name, count in masterfeats.items()
        if count >= nOccurrences)
    with open("features.lst", "w") as listing:
        listing.write('\n'.join(list(masterfeats.keys())))
    print("Number of Features = %i" % len(masterfeats))

    train_cut = int(len(tweetfeats) * trainProportion)
    random.shuffle(tweetfeats)
    trainfeats = tweetfeats[:train_cut]
    testfeats = tweetfeats[train_cut:]

    print("Training sentiment classifier...")
    sys.stdout.flush()
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    classifier.show_most_informative_features()
    sys.stdout.flush()

    # SAVE the classifier & features. Pickle data is binary, so the files
    # are opened in binary mode.
    with open("classifier.pickle", 'wb') as out:
        pickle.dump(classifier, out)
    with open("features.pickle", 'wb') as out:
        pickle.dump(masterfeats, out)
def classify(text, sender=None, subject=None):
    """Classify ``text`` and return the name of its most probable category.

    A classifier is trained from ``load_training_set()`` on every call. The
    features are the bag of words over the bigrams extracted from ``text``;
    ``sender`` and ``subject``, when given, are added as extra features.
    The probability of every category is pretty-printed before returning.
    """
    classifier = NaiveBayesClassifier.train(load_training_set())

    test_data = bag_of_words(extract_bigrams(text))
    for extra in (sender, subject):
        if extra is not None:
            test_data[extra] = True

    classified = classifier.prob_classify(test_data)
    probabilities = {}
    for sample in classified.samples():
        probabilities[categories[sample]] = classified.prob(sample)
    pprint(probabilities)

    return categories[classified.max()]
Example #27
0
 def train(self, data):
     """Train ``self.classifier`` on windows of the represented data.

     ``data`` is converted with ``self._represent`` and stored in
     ``self.result_string``; ``self.labels`` becomes the frequency
     distribution of its items. Windows of ``self.n_w`` items are taken
     every ``self.n_w - 1`` positions, and each window is combined with the
     item that follows it by ``self._gen_feature`` to form one training
     example.
     """
     self.result_string = self._represent(data)
     self.labels = FreqDist(self.result_string)
     train = []
     for start in range(0, len(self.result_string) - self.n_w, self.n_w - 1):
         window = self.result_string[start:start + self.n_w]
         # the item right after the window
         x_key = self.result_string[start + self.n_w]
         train.append(self._gen_feature(window, x_key))
     self.classifier = NaiveBayesClassifier.train(train)
Example #28
0
   def buildRevClassifier(self, features, normalize, validity):
      """Train a naive Bayes classifier that predicts a review's reviewer.

      Args:
         features: callable mapping a review to its feature dict.
         normalize: currently unused.
         validity: currently unused.

      Returns:
         The classifier trained on ``(features(rev), rev.reviewer)`` pairs
         built from the shuffled values of ``self``.
      """
      revs = list(self.values())
      random.shuffle(revs)

      # Build from the shuffled list; the original re-read self.values()
      # here and discarded the shuffle.
      featureSets = [(features(rev), rev.reviewer) for rev in revs]

      return NaiveBayesClassifier.train(featureSets)
def cross_validate():
    """Run a 10-fold cross-validation over the shuffled training set.

    For every fold a naive Bayes classifier is trained on the training
    indices and scored on the evaluation indices. The evaluation range and
    accuracy are printed per fold, followed by the average accuracy.
    """
    training_set = load_training_set()
    random.shuffle(training_set)
    folds = KFold(len(training_set), n_folds=10, indices=True, shuffle=False, random_state=None)

    total = 0
    fold_count = 0
    for traincv, evalcv in folds:
        # Select by index. The training indices are not contiguous (the
        # evaluation fold sits between them), so slicing from the first to
        # the last index would pull the evaluation fold into the training
        # data and drop the last element.
        train_part = [training_set[i] for i in traincv]
        eval_part = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_part)
        acc = accuracy(classifier, eval_part)
        print('Range:  %s to %s' % (evalcv[0], evalcv[-1]))
        print('Accuracy: %4.2f' % acc)
        total += acc
        fold_count += 1

    # Average over the folds actually evaluated instead of a hard-coded 10.
    average = total / fold_count if fold_count else 0.0
    print('Average accuracy: %4.2f' % average)
Example #30
0
ts = ts[:2]

# Materialise the pairs: training_data is iterated more than once below, and
# a lazy zip object would be exhausted after the first pass.
training_data = list(zip(tl, ts))

# Every lower-cased token that occurs in any training sentence
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))


def _bag_of_words(sentence):
    """Map every vocabulary word to whether it occurs in ``sentence``."""
    # Tokenize once per sentence instead of once per vocabulary word.
    tokens = set(word_tokenize(sentence.lower()))
    return {word: (word in tokens) for word in vocabulary}


feature_set = [(_bag_of_words(sentence), tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

# Classify a sentence with the trained classifier
test_sentence = tl[1]
featurized_test_sentence = _bag_of_words(test_sentence)

print("test_sent: %s" % (test_sentence,))
print("tag: %s" % (classifier.classify(featurized_test_sentence),))
Example #31
0
 def train_topic_classifier(self, train_set):
     """Return a naive Bayes classifier trained on ``train_set``."""
     return NaiveBayesClassifier.train(train_set)
Example #32
0
 def train(self, trainingData):
     """Train a naive Bayes classifier on ``trainingData`` and keep it.

     The trained classifier is stored in ``self.classifier``; nothing is
     returned.
     """
     self.classifier = NaiveBayesClassifier.train(trainingData)
Example #33
0
    # ...rather than a truly random sequence. The seed is the first value the algorithm starts computing from, so as long as the seed is the same, all subsequent "random" results and their order are exactly the same.
    random.seed(5)  # fix the seed so the generated pseudo-random numbers are reproducible
    random.shuffle(data)  # shuffle the sequence in place

    # Create the test input data
    input_names=['Alexander','Danielle','David','Cheryl']

    # Number of samples used for training (80%); the rest is used for testing
    num_train = int(0.8*len(data))

    # Loop over different numbers of end letters and compare the accuracy
    for i in range(1,6):
        print('\nNumber of end letters:',i)
        features = [(extract_features(n,i),gender) for (n,gender) in data]
        # Split the data into training and test sets
        train_data,test_data = features[:num_train],features[num_train:]
        # Build the naive Bayes classifier from the training data
        calssifier =NaiveBayesClassifier.train(train_data)
        # Compute the classifier's accuracy, as a percentage rounded to 2 decimals
        accuracy = round(100*nltk_accuracy(calssifier,test_data),2)
        print('Accuracy = '+str(accuracy)+'%')

        # Use the trained model to predict the output for the input names
        for name in input_names:
            print(name,'==>',calssifier.classify(extract_features(name,i)))
    




Example #34
0
def main():
    """Train a tweet sentiment classifier and apply it to fetched tweets.

    Steps:
      1. Optionally download the nltk libraries (asked interactively).
      2. Load the cleaned training tokens from the cache files, or build and
         cache them when the negative cache file does not exist.
      3. Train a naive Bayes classifier on the first 70% of the shuffled
         data and print its accuracy on the remaining 30%.
      4. Classify the tweets returned by ``fetch_featured_tweets`` and print
         those labelled ``'Positive'`` together with summary statistics.

    Returns:
        0, unconditionally.
    """
    should_download = input("Do you need to download nltk libraries? [y/n] ")
    if should_download == "y":
        download_nltk_libraries()

    analysis = SentimentAnalysis()

    positive_cache = 'cache/cleaned_training_data_positive_cache.csv'
    negative_cache = 'cache/cleaned_training_data_negative_cache.csv'

    # If the cleaned and tokenized data is already cached, pull from that
    if os.path.isfile(negative_cache):
        cleaned_positive_content = read_cache(positive_cache)
        cleaned_negative_content = read_cache(negative_cache)
        print("Read from cache")
    else:
        # Otherwise, clean and tokenize the data and then cache it.
        split_training_file()

        positive_tokens = analysis.tokenize_training_model(positive_tweets)
        negative_tokens = analysis.tokenize_training_model(negative_tweets)
        cleaned_positive_content = analysis.clean_content(positive_tokens)
        cleaned_negative_content = analysis.clean_content(negative_tokens)

        write_header(positive_cache)
        write_header(negative_cache)
        write_cache(positive_cache, cleaned_positive_content)
        write_cache(negative_cache, cleaned_negative_content)

    positive_content_for_model = analysis.prepare_content_for_model(
        cleaned_positive_content)
    negative_content_for_model = analysis.prepare_content_for_model(
        cleaned_negative_content)

    # Pair every feature dict with its sentiment label.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_content_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_content_for_model]

    # Combine both halves again and randomize the order.
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # Train on the first 70%, test on the last 30%. The split point is
    # derived from the dataset size (integer arithmetic, so 1.6 million
    # tweets still split at exactly 1,120,000) instead of being hard-coded.
    split_at = len(dataset) * 7 // 10
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    print("Training using dataset")
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # Prints the table itself and returns None, so it is not wrapped in print.
    classifier.show_most_informative_features(10)

    # After training, we can repeat the process using real data.
    tokenized_tweets, og_tweets, num_original_tweets = fetch_featured_tweets()
    assert len(tokenized_tweets) == len(og_tweets)
    cleaned_drug_tokens = analysis.clean_content(tokenized_tweets)

    print("Running network on real tweets")
    num_positives = 0
    for idx, tokens in enumerate(cleaned_drug_tokens):
        original_tweet = og_tweets[idx]
        token_dict = dict.fromkeys(tokens, True)
        try:
            # Classify each tweet and only output Positive sentiment tweets.
            classified = classifier.classify(token_dict)
            if classified == 'Positive':
                num_positives += 1
                print(original_tweet, "=>", classified)
        except Exception as exc:
            # Best effort: keep going, but say what went wrong.
            print("exception", exc)

    num_drug_tweets = len(cleaned_drug_tokens)
    # Guard the ratios so an empty fetch does not end in ZeroDivisionError.
    drug_ratio = (num_drug_tweets / num_original_tweets
                  if num_original_tweets else 0.0)
    positive_ratio = (num_positives / num_drug_tweets
                      if num_drug_tweets else 0.0)

    print("\nTotal original tweets:", num_original_tweets)
    print("Total drug related tweets:", num_drug_tweets)
    print("Percent of original tweets that are drug related:", drug_ratio)
    print("Total number of positive sentiment tweets:", num_positives)
    print("Percent of drug related tweets with positive sentiment:",
          positive_ratio)

    return 0
Example #35
0
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    # Build a list of (token dict, "Positive") pairs, one per positive tweet
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    # Build a list of (token dict, "Negative") pairs, one per negative tweet
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    #Merging the list of data
    dataset = positive_dataset + negative_dataset
    #Randomize their position
    random.shuffle(dataset)
    #split dataset in 80% training and 20% as testing
    value = 0.8 * len(dataset) + 1
    train_dataset = dataset[:int(value)]
    test_dataset = dataset[int(value):]

    # Train the naive Bayes classifier on the training split
    classifier = NaiveBayesClassifier.train(train_dataset)
    #Check and print the accuracy with the testing data
    print("Accuracy is:", classify.accuracy(classifier, test_dataset))
    # Show the 10 most informative features
    print(classifier.show_most_informative_features(10))
    #Create and run a testing tweet
    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet,
          classifier.classify(dict([token, True] for token in custom_tokens)))
Example #36
0
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


# Creating features for each review: one (feature dict, category) pair per document
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Shuffling the feature sets in place
random.shuffle(featuresets)

# The first 20000 feature sets are used for training, the rest for testing
training_set = featuresets[:20000]
testing_set = featuresets[20000:]

# NLTK naive Bayes classifier and its accuracy on the held-out feature sets
classifier = NaiveBayesClassifier.train(training_set)
accuracy = classify.accuracy(classifier, testing_set)

# The scikit-learn models below are trained on the same training set.
# NOTE(review): the commented-out accuracy prints refer to `test_set`, but
# the held-out split defined above is named `testing_set`.
MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(training_set)
#print("MNB_classifier accuracy percent:", (classify.accuracy(MNB_clf, test_set))*100)

BNB_clf = SklearnClassifier(BernoulliNB())
BNB_clf.train(training_set)
#print("BernoulliNB_classifier accuracy percent:", (classify.accuracy(BNB_clf, test_set))*100)

LogReg_clf = SklearnClassifier(LogisticRegression())
LogReg_clf.train(training_set)
#print("LogisticRegression_classifier accuracy percent:", (classify.accuracy(LogReg_clf, test_set))*100)

SGD_clf = SklearnClassifier(SGDClassifier())
Example #37
0
features_data = np.array(sentences)
features_data_test = np.array(testSentences)

k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
accuracy_data_scores = []
for train_set, test_set in k_fold.split(features_data):
    word_features = get_word_features(
        get_words_in_sentences(features_data[train_set].tolist()))
    train_features = apply_features(extract_features,
                                    features_data[train_set].tolist())
    test_features = apply_features(extract_features,
                                   features_data[test_set].tolist())
    classifier = NaiveBayesClassifier.train(train_features)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    testdata_features = apply_features(extract_features,
                                       features_data_test.tolist())
    refdatasets = collections.defaultdict(set)
    testdatasets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_features):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    for i, (feats, label) in enumerate(testdata_features):
Example #38
0
 def train_model(self, data):
     """Fit a Naive Bayes classifier on ``data`` and store it as ``self.model``."""
     fitted = NaiveBayesClassifier.train(data)
     self.model = fitted
 def trainModel(self, train_data, test_data):
     """Return a Naive Bayes classifier trained on ``train_data``.

     ``test_data`` is accepted but not used by this method.
     """
     classifier = NaiveBayesClassifier.train(train_data)
     return classifier
Example #40
0
def train(all_features, ratio):
    """Split ``all_features`` at ``ratio`` and train on the leading part.

    The first ``int(len(all_features) * ratio)`` items form the training set
    and the remainder the test set; no shuffling is done here.

    Returns:
        A ``(train_set, test_set, classifier)`` tuple.
    """
    cut = int(len(all_features) * ratio)
    train_set = all_features[:cut]
    test_set = all_features[cut:]
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
def main():
    """Train a news-sentiment model and label last week's business headlines.

    Reads labelled rows from ``INPUT_PATH/newsSentiment.csv`` (column 0 is the
    label, column 1 the text), trains an NLTK Naive Bayes classifier on a
    shuffled 75/25 split, prints its accuracy and most informative features,
    then classifies the ``title`` of every ``business`` news item from the
    past seven days and writes ``title``/``sentiment`` to
    ``TEST_OUTPUT_PATH/data3.csv``.

    Raises:
        KeyError: if a CSV row carries a label other than ``positive``,
            ``neutral`` or ``negative``.
        IndexError: if a CSV row has fewer than two columns.
    """
    model_csv = INPUT_PATH + '/newsSentiment.csv'
    stop_words = stopwords.words('english')
    model_data = {'positive': [], 'neutral': [], 'negative': []}
    with open(model_csv, newline='', encoding="ISO-8859-1") as csv_file:
        for row in csv.reader(csv_file):
            tokens = remove_noise(word_tokenize(row[1]), stop_words)
            model_data[row[0]].append(tokens)

    # use UTC time
    to_datetime = datetime.utcnow()
    from_datetime = to_datetime - timedelta(days=7)
    news_data = load_news_data(from_datetime, to_datetime)
    # Take an explicit copy of the filtered rows: the 'sentiment' column is
    # assigned below, and assigning into a filtered slice is ambiguous
    # (chained assignment / SettingWithCopyWarning in pandas).
    news_data = news_data[news_data['category'] == 'business'].copy()

    # One (feature dict, label) pair per training text, for every label.
    dataset = []
    for key, token_list in model_data.items():
        tokens_for_model = get_tweets_for_model(token_list)
        dataset.extend((tweet_dict, key) for tweet_dict in tokens_for_model)

    random.shuffle(dataset)

    # First three quarters train the model, the last quarter tests it.
    partition_number = len(dataset) * 3 // 4
    train_data = dataset[:partition_number]
    test_data = dataset[partition_number:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its table itself and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(10)

    news_data['sentiment'] = news_data['title'].apply(classify_string,
                                                      args=(classifier, ))
    save_path = TEST_OUTPUT_PATH + '/data3.csv'
    results = news_data[['title', 'sentiment']]
    results.to_csv(save_path, index=False, encoding='utf-8')
Example #42
0
def main():
    """Train, evaluate and deploy a Positive/Negative tweet classifier.

    Steps, in order:
      1. clean the positive and negative training tweets;
      2. train an NLTK Naive Bayes classifier on a shuffled 70/30 split;
      3. print accuracy, per-label precision/recall/F-measure and the
         confusion matrix, then save and show the matrix as a heat map;
      4. clean the tweets to predict, classify each of them and hand the
         labels to ``write_csv``.

    Inputs and accumulators are module-level names (the token lists,
    ``output_list`` and the ``*_dir`` / ``*_index`` settings). The confusion
    matrix array is published through the global ``temp_matrix``.
    """
    # Only temp_matrix is rebound in this function; the other module-level
    # lists are just passed along or appended to, which needs no declaration.
    global temp_matrix

    # get cleaned up tokens
    print("......Cleaning up Dataset......")
    print("...tokenizing...")
    print("...normalizing...")
    print("...Lemmatizing...")
    print("...removing stop words...\n")
    clean_up_tweets(positive_input_file_dir, train_text_column_index, positive_tokens, cleaned_positive_tokens)
    print("Done: clean up positive tweets")
    clean_up_tweets(negative_input_file_dir, train_text_column_index, negative_tokens, cleaned_negative_tokens)
    print("Done: clean up negative tweets\n")

    # Converting Tokens to a Dictionary:
    positive_tokens_for_model = get_tweets_for_model(cleaned_positive_tokens)
    negative_tokens_for_model = get_tweets_for_model(cleaned_negative_tokens)
    print("Done: Convert tokens to dictionaries.\n")

    # create a dataset by joining the positive and negative tweets.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    print("Done: Combine dataset by joining the positive and negative tweets.")
    # random shuffle
    random.shuffle(dataset)

    print(f"positive dataset: {len(positive_dataset)} tweets.")
    print(f"negative dataset: {len(negative_dataset)} tweets.")
    print(f"combine positive & negative dataset: {len(dataset)} tweets.\n")
    print("......Training Data......")

    # splits the shuffled data into a ratio of 7:3 for training and testing
    split_at = round(len(dataset) * 0.7)
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]
    print(f"train data: {len(train_data)} tweets")
    print(f"test data: {len(test_data)} tweets\n")

    print("Build & Test Naive_Bayes_Classifier Model: ")
    classifier = NaiveBayesClassifier.train(train_data)
    print("=============Accuracy====================")
    print(f"Accuracy is:{classify.accuracy(classifier, test_data)}\n")

    # show_most_informative_features() prints its table itself and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(10)

    # build confusion matrix: refsets/testsets map each label to the indices
    # of the test items that truly have / were predicted to have that label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    labels = []
    tests = []

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
        labels.append(label)
        tests.append(observed)
    print("=============Precision and Recall====================")
    for label in ("Positive", "Negative"):
        print(f"{label} precision: {nltk.precision(refsets[label], testsets[label])}")
        print(f"{label} recall: {nltk.recall(refsets[label], testsets[label])}")
        print(f"{label} F-measure: {nltk.f_measure(refsets[label], testsets[label])}")

    print("=============Confusion Matrix====================")
    confusion_matrix_result = nltk.ConfusionMatrix(labels, tests)
    print(confusion_matrix_result)

    # now visualize the confusion matrix using matplotlib.pyplot
    # The plot needs a numpy array, taken from the matrix object's private
    # ``_confusion`` attribute.
    confusion_matrix_result = np.array(confusion_matrix_result._confusion)
    temp_matrix = confusion_matrix_result

    # NOTE(review): the tick labels assume the matrix rows/columns are ordered
    # Negative then Positive -- confirm against ConfusionMatrix's ordering.
    classes = ["Negatives", "Positives"]
    plt.figure()
    plt.imshow(confusion_matrix_result, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Write each count into its cell, in white on dark cells and black on
    # light ones so the number stays readable.
    text_format = 'd'
    thresh = confusion_matrix_result.max()/2
    for row, column in itertools.product(range(confusion_matrix_result.shape[0]),
                                         range(confusion_matrix_result.shape[1])):
        plt.text(column, row, format(confusion_matrix_result[row, column], text_format),
                 horizontalalignment='center',
                 color='white' if confusion_matrix_result[row, column] > thresh else "black")

    plt.ylabel("True Values")
    plt.xlabel("Predicted Values")
    plt.tight_layout()
    # needs a high resolution image
    # NOTE(review): the output path is hard-coded to one user's machine.
    plt.savefig("/Users/Han/Downloads/web project data/confusion_matrix.png", dpi=1200)
    plt.show()

    # =======================================now predict new tweets=======================================
    print("......Now Cleaning up new Dataset......")
    print("...tokenizing...")
    print("...normalizing...")
    print("...Lemmatizing...")
    print("...removing stop words...\n")
    clean_up_tweets(predict_input_file_dir, predict_text_column_index, predict_tokens, cleaned_predict_tokens)
    print("Done: clean up predict tweets\n")

    print("...Now Deploy Bayes Classifier on new dataset...")
    # One single-element row per tweet, holding its predicted label.
    output_list.extend(
        [classifier.classify({token: True for token in current_tweet_tokens})]
        for current_tweet_tokens in cleaned_predict_tokens)

    write_csv(output_list, output_file_dir)
    print("Done! ")
Example #43
0
 def train(self, train_set):
     """Train a Naive Bayes classifier on ``train_set``, keep it and return it."""
     model = NaiveBayesClassifier.train(train_set)
     self.classifier = model
     return self.classifier
Example #44
0
# Show one labelled sample from each class as a sanity check.
print("Dictionary with Positive class : ", positiveReviewDataset[7])
print("Dictionary with Negative class : ", negativeReviewDataset[7])
#print("tagged neg :",negative_dataset[0])

# Merge both classes into a single labelled dataset.
dataset = positiveReviewDataset + negativeReviewDataset

print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))

# Shuffle so the fixed-index split below mixes both classes.
random.shuffle(dataset)

# First 7000 items train the model; whatever remains is used for testing.
# NOTE(review): the split point is hard-coded and does not scale with
# len(dataset).
trainData = dataset[:7000]
testData = dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)

print("Accuracy of the model : ", classify.accuracy(trainedModel, testData))

# Classify one hand-written review, using the same token -> True feature
# dictionary form that is built on the next statement.
review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))

# Test print
print(review, " : ",
      trainedModel.classify(dict([token, True] for token in reviewTokens)))

#Text = "j@nittha"
#Text = re.sub("@", "a", Text)
#print(Text)

Example #45
0
    for palavra_unica_base_tratada in palavras_unicas_base_tratada:
        resultado_linha_palavra['%s' % palavra_unica_base_tratada] = (
            palavra_unica_base_tratada in palavras_unicas_da_frase)

    # print(f'{frase}: {palavras_unicas_da_frase} : {resultado_linha_palavra}\n')

    return resultado_linha_palavra


# Classified base: apply the per-sentence feature extractor to the base that
# has already had stop words removed and been stemmed.
base_classificada = classify.apply_features(extrator_linha_nltk,
                                            base_sem_stop_words_stemmed)

# Build the Naive Bayes probability classifier
classificador = NaiveBayesClassifier.train(base_classificada)

# Classifier statistics: the known labels and the 5 most informative features
print(
    f'As classes existentes na base classificada são {classificador.labels()}\n'
)
print(f'As 5 principais características são:')
classificador.show_most_informative_features(5)

print_space_between_logs()

# Using the classifier
print(f'Utilizando classificador Naive Bayes para obter a classe\n')


def imprimir_classificacao_frase(frase):
def get_classifier():
    """Train a Positive/Negative Naive Bayes classifier from stored reviews.

    Reviews scoring 5 or more count as positive, the rest as negative. Up to
    100 review texts are sampled at random from each side, tokenized, cleaned
    with ``remove_noise`` and converted with ``get_for_model``; the classifier
    is trained on the whole shuffled dataset (no hold-out split).

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    stop_words = stopwords.words("english")

    positive_reviews = _sample_review_texts(Review.score >= 5,
                                            Review.score.desc())
    negative_reviews = _sample_review_texts(Review.score < 5, Review.score)

    dataset = (_label_reviews(positive_reviews, "Positive", stop_words) +
               _label_reviews(negative_reviews, "Negative", stop_words))

    random.shuffle(dataset)

    return NaiveBayesClassifier.train(dataset)


def _sample_review_texts(score_filter, ordering, sample_size=100):
    """Return the text of up to ``sample_size`` random reviews matching a filter."""
    review_ids = [
        row.reviewid for row in session.query(Review).filter(
            score_filter).order_by(ordering).all()
    ]
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of matching reviews.
    chosen_ids = random.sample(review_ids, min(sample_size, len(review_ids)))

    texts = []
    for review_id in chosen_ids:
        content_row = session.query(Content).filter(
            Content.reviewid == review_id).first()
        # A review without a stored Content row is skipped instead of failing
        # on ``None.content``.
        if content_row is not None:
            texts.append(content_row.content)
    return texts


def _label_reviews(texts, label, stop_words):
    """Tokenize and clean ``texts`` and pair each feature dict with ``label``."""
    cleaned_tokens_list = [
        remove_noise(nltk.word_tokenize(text), stop_words) for text in texts
    ]
    return [(word_dict, label)
            for word_dict in get_for_model(cleaned_tokens_list)]
Example #47
0
def mine_tweets(infile: str, tweetout: str, gramout: str) -> None:
    """Classify, prune, and atomize Tweets.

    Trains a Naive Bayes sentiment model on the NLTK ``twitter_samples``
    corpus, scores every row of the CSV at ``infile``, keeps the Tweets whose
    positive/negative probability gap exceeds ``SUBJECTIVITY_THRESHOLD``,
    pickles them to ``tweetout`` and pickles 1- to 4-gram counts over their
    cleaned tokens to ``gramout``.

    Args:
        infile: Path of the CSV file holding the Tweets to classify.
        tweetout: Path the list of accepted Tweets is pickled to.
        gramout: Path the n-gram counts are pickled to.
    """
    logger = logging.getLogger("miner")

    logger.info("Gathering and tokenizing positive tweets")
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")

    logger.info("Gathering and tokenizing negative tweets")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    logger.info("Cleaning model tokens")
    positive_cleaned_tokens_list = [
        normalize(tokens) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        normalize(tokens) for tokens in negative_tweet_tokens
    ]

    logger.info("Building Tweet corpus")
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)  # type: ignore
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)  # type: ignore

    # Mark positive Tweets as such
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # Mark negative Tweets as such
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # Create unified dataset and shuffle it
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # Train on the first 70% and test on the last 30%. The split point is
    # derived from the dataset size so it matches the logged ratio; for a
    # 10,000-item dataset this is the 7000 that used to be hard-coded.
    logger.info("70% training, 30% testing")
    split_at = len(dataset) * 7 // 10
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    logger.info("Training...")
    classifier = NaiveBayesClassifier.train(train_data)

    logger.info("Accuracy is: %s", classify.accuracy(classifier, test_data))

    logger.info("Classifying Tweets")
    tweets = []

    with open(infile, "r") as csv_file:
        logger.info("Opened %s", infile)

        csv_reader = csv.reader(csv_file, delimiter=",")
        logger.info("Attached CSV reader to %s successfully", infile)

        # Counts processed Tweets and rejected ones
        counter: int = 0
        subject_reject: int = 0

        # Iterate
        for tweet in csv_reader:

            # Progress report every DIVISION rows
            if not counter % DIVISION:
                logger.info("Read in %s Tweets so far...", counter)

            # For debugging: stop after MAX_TWEETS rows
            if counter == MAX_TWEETS:
                break

            # Classify Tweet
            new_tweet = Tweet(tweet)
            dist = classifier.prob_classify(
                {token: True
                 for token in new_tweet.cleaned_tokens}  # type: ignore
            )
            new_tweet.positivity = dist.prob("Positive")
            new_tweet.negativity = dist.prob("Negative")
            new_tweet.difference = abs(new_tweet.positivity -
                                       new_tweet.negativity)

            # Keep only Tweets the model is sufficiently decided about
            if new_tweet.difference > SUBJECTIVITY_THRESHOLD:
                tweets.append(new_tweet)
            else:
                subject_reject += 1

            # Count
            counter += 1

    logger.info("Processed %s Tweets", len(tweets))
    logger.info("%s Tweets were rejected for not being subjective enough",
                subject_reject)

    # Pickle Tweets; the ``with`` block closes the file even if dumping fails.
    with open(tweetout, "wb") as tweetout_fp:
        pickle.dump(tweets, tweetout_fp)
    logger.info("Pickled %s Tweets", len(tweets))

    # Storing our n-gram occurrences: gram_scores[n] maps each n-gram (a
    # tuple of tokens) to its count. Index 0 stays empty so the list index
    # equals the n-gram size.
    gram_scores: List[Dict[tuple, int]] = [{}, {}, {}, {}, {}]

    # Counting n-grams
    for i in range(1, 5):
        logger.info("Creating %s-grams", i)
        scores = gram_scores[i]

        # Iterate
        for tweet in tweets:  # type: ignore

            # Count every gram of this Tweet
            for gram in ngrams(tweet.cleaned_tokens, i):  # type: ignore
                scores[gram] = scores.get(gram, 0) + 1

    # Serialize n-grams to file
    with open(gramout, "wb") as gramout_fp:
        pickle.dump(gram_scores, gramout_fp)
# Label every review's bag-of-words features with its polarity.
pos_features = [(bag_of_words(words), 'pos') for words in pos_reviews]

# negative reviews feature set
neg_features = [(bag_of_words(words), 'neg') for words in neg_reviews]

shuffle(pos_features)
shuffle(neg_features)

# The first 200 feature sets of each class are held out for testing; the rest
# are used for training.
test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
print(accuracy)
#f = open('unigram_classifier.pickle', 'wb')
#pickle.dump(classifier, f)
#f.close()

# Interactive loop: classify reviews typed by the user until an empty line.
while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if not custom_review:
        break
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
    print(classifier.classify(custom_feature_set))
Example #49
0
# Load the English stop-word list once and keep it as a set: calling
# stopwords.words("english") inside the filter rebuilt and linearly scanned
# the list for every one of the 10000 candidate words.
_english_stopwords = frozenset(stopwords.words("english"))

# The most frequent words that are neither stop words nor punctuation.
common_words = [
    word for word, _freq in words_freqs.most_common(10000)
    if word not in _english_stopwords and word not in ponctuation
]
print(common_words[:100])


# -------Funtions---------------------------------------------------------
def find_features(document, com_words=common_words):
    """Map every word in ``com_words`` to whether it occurs in ``document``.

    ``document`` is any iterable of words; membership is tested against the
    set of its items.
    """
    present = set(document)
    return {w: (w in present) for w in com_words}


# ---------------------------------------------------------------------------

# Turn every (text, category) document into a (feature dict, category) pair.
feature_sets = [(find_features(text), category)
                for (text, category) in documents]
# Fixed split: the first 1900 feature sets train, the rest test.
# NOTE(review): nothing in this section shuffles `documents` before the
# split -- confirm it is shuffled upstream.
data = {}
data["train"] = feature_sets[:1900]
data["test"] = feature_sets[1900:]
clf = NaiveBayesClassifier.train(data["train"])  # acc: 85.095
# acc = classify.accuracy(clf, data["test"])*100
clf.show_most_informative_features(10)

#------TEST------------------------------------
# Classify one review taken from the "neg" file ids; the predicted label is
# not stored or printed here.
rev_name = movie_reviews.fileids("neg")[11]
text = movie_reviews.words(rev_name)
clf.classify(find_features(text))
Example #50
0
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    sem_classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(sem_classifier, test_data))

    print(sem_classifier.show_most_informative_features(10))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(
        custom_tweet,
        sem_classifier.classify(dict([token, True]
                                     for token in custom_tokens)))

custom_tweet = "My daughter has been at MEM airport for almost 7 hours trying to fly #unitedAIRLINES to houston. #unitedair what are you going to do???"
Example #51
0
### Convert each review into the {word: True} feature dict NLTK trains on

negative_featurized = [dict.fromkeys(review, True)
                       for review in negative_min_df]
positive_featurized = [dict.fromkeys(review, True)
                       for review in positive_min_df]

### Attach the class label to every feature dict, negatives first

negative_tagged = [(features, 'negative') for features in negative_featurized]
positive_tagged = [(features, 'positive') for features in positive_featurized]

all_tagged = negative_tagged + positive_tagged

### Fit the classifier on the full tagged set

classifier = NaiveBayesClassifier.train(all_tagged)

### Import, process, featurize new set of movie reviews

ebert_path = 'movie_reviews/ebert/'
ebert_files = os.listdir(ebert_path)
# Read each review inside a ``with`` block so its file handle is closed
# promptly instead of being left open.
ebert_reviews = []
for name in ebert_files:
    with open(ebert_path + name) as review_file:
        ebert_reviews.append(review_file.read())
ebert_tokenized = [word_tokenize(review.lower()) for review in ebert_reviews]
ebert_no_stops = [[word for word in review if word not in stopword_set]
                  for review in ebert_tokenized]
# Lemmatize the stop-word-filtered tokens. Lemmatizing `ebert_tokenized`
# here would throw away the filtering done on the previous statement.
ebert_lemmatized = [[wnl.lemmatize(word) for word in review]
                    for review in ebert_no_stops]
# Keep each word once per review, then only words in `more_than_once_set`.
ebert_set = [set(review) for review in ebert_lemmatized]
ebert_min_df = [[word for word in review if word in more_than_once_set]
                for review in ebert_set]
# A generator, not a list: the feature dicts can be iterated only once.
ebert_featurized = ({word: True for word in review} for review in ebert_min_df)
    features['topic'] = document[0]
    for word in document_words:
        # features['contains(%s)' % word] = (word in document_words)
        features[word] = (word in document_words)

    return features


# Load the tweet corpus from the CSV bundled with the sentiment app.
tweets = file_handler.load_data(
    settings.BASE_DIR + '/sentiment_app/analyzer/dataset/full-corpus-lite.csv')
# Apply extract_features over the loaded tweets.
data_set = nltk.classify.apply_features(extract_features, tweets)
# training_set = data_set[:len(data_set)/2]
# testing_set = data_set[len(data_set)/2:]

# make classifier
# The whole data set is used for training; the train/test split above is
# commented out, so no held-out data exists here.
classifier = NaiveBayesClassifier.train(data_set)


def anaylze(tweet):
    # Build the feature dict for one tweet (the function name is spelled
    # "anaylze" in the source). Python 2 syntax: `print` is used as a statement.
    # The visible body stops after the features are computed; nothing is
    # classified or returned in the part shown here.
    print tweet
    # tweet = ("topic", "tweet string post")

    # accuracy & informative features
    # print nltk.classify.accuracy(classifier, testing_set)
    # print classifier.show_most_informative_features(30)
    # print classifier._labels

    # Test Classify
    # Preprocess the tweet text (tweet[1]), then extract features from the
    # (topic, preprocessed text) pair.
    data = preprocess(tweet[1])
    feature = extract_features((tweet[0], data))
Example #53
0
    def train(self, language):
        """Train a Naive Bayes classifier on SentiWordNet glosses.

        Every row of ``./dataset/SentiWordNet_3.0.0.tsv`` becomes one training
        item: its tokenized ``Gloss`` as a ``{token: True}`` feature dict,
        labelled with a score derived from ``PosScore`` and ``NegScore``. The
        shuffled items are split at index 7000 into training and test data.

        Sets ``self.classifier``, ``self.total_accuracy``, ``self.refsets`` and
        ``self.testsets`` (the last two as empty ``defaultdict(set)``), and
        prints the total accuracy.

        Args:
            language: Language name passed to ``stopwords.words``.
        """
        df = read_csv("./dataset/SentiWordNet_3.0.0.tsv",
                      sep="\t",
                      header=0,
                      index_col='ID')

        labeled = []
        for _, row in df.iterrows():
            # Score is tanh(PosScore), divided by NegScore when that is > 0.
            pos_score = float(numpy.tanh(row['PosScore']))
            neg_score = float(row['NegScore'])
            score = pos_score / neg_score if neg_score > 0 else pos_score

            try:
                tokenized = word_tokenize(row['Gloss'])
            except (TypeError, AttributeError):
                # The gloss is not a string (e.g. a missing value): skip the
                # row. Other errors, such as missing tokenizer data, are no
                # longer silently swallowed.
                continue

            # NLTK classifiers take a feature *dict* per item; a bare token
            # list has no .items() and cannot be trained on.
            labeled.append(({token: True for token in tokenized}, score))

        # NOTE(review): the stop words are loaded but never applied to the
        # glosses; the call is kept so an unsupported `language` still raises.
        stop_words = stopwords.words(language)

        dataset = labeled

        random.shuffle(dataset)

        # NOTE(review): labels are continuous float scores, so each distinct
        # score is treated as its own class -- confirm this is intended.
        train_data = dataset[:7000]
        test_data = dataset[7000:]
        self.classifier = NaiveBayesClassifier.train(train_data)
        self.total_accuracy = classify.accuracy(self.classifier, test_data)

        self.refsets = collections.defaultdict(set)
        self.testsets = collections.defaultdict(set)
        print('Total accuracy: ', self.total_accuracy)
def main():
    """Build and return a Positive/Negative tweet sentiment classifier.

    Combines the NLTK ``twitter_samples`` tweets with optional custom tweets,
    tokenizes and cleans them, prints the ten most common words of each class,
    trains a Naive Bayes classifier on a shuffled 70:30 split and prints its
    accuracy and most informative features.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    stop_words = stopwords.words('english')

    # get cleaned positive and negative tokens
    positive_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in negative_tweet_tokens
    ]

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(
        negative_cleaned_tokens_list)

    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # split the data into a 70:30 ratio. The split point follows the dataset
    # size so the ratio still holds once custom tweets are added; with only
    # the 10K sample tweets it is the 7000 that used to be hard-coded.
    split_at = len(dataset) * 7 // 10
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its table itself and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(10)
    print('Model complete!\n')

    return classifier
Example #55
0
    return palavras


def encontrarpalavrasunicas(frequencia):
    """Return the keys view of ``frequencia``, i.e. its unique words."""
    return frequencia.keys()


palavrasunicas = encontrarpalavrasunicas(buscafrequencia(palavras))


def extratorpalavras(documento):
    """Map every word in ``palavrasunicas`` to whether ``documento`` has it.

    Keys are the string form of each unique word; values are booleans.
    """
    presentes = set(documento)
    return {
        '%s' % palavra: (palavra in presentes)
        for palavra in palavrasunicas
    }


# Train on the feature sets extracted from the stop-word-free base sentences.
frases_sem_stopwords = removestopwords(frases_padrao)
classificador = NaiveBayesClassifier.train(
    apply_features(extratorpalavras, frases_sem_stopwords))

# Stem every whitespace-separated word of the sentence under test.
stemmer = RSLPStemmer()
testestemming = [str(stemmer.stem(palavra)) for palavra in sujeito.split()]

# Classify the stemmed sentence and report the predicted reaction.
reacao = classificador.classify(extratorpalavras(testestemming))
print('individuo: %s -  <reação da vitima = %s> ' % (sujeito, reacao))
Example #56
0
def _bag_of_words(tokens):
    """Return a presence-feature dict mapping every token in ``tokens`` to True."""
    return {token: True for token in tokens}


def _train_sentiment_classifier(positive_docs, negative_docs):
    """Label, shuffle and fit a NaiveBayesClassifier on two tokenised corpora."""
    dataSet = [(_bag_of_words(doc), "Positive") for doc in positive_docs]
    dataSet += [(_bag_of_words(doc), "Negative") for doc in negative_docs]
    random.shuffle(dataSet)
    return NaiveBayesClassifier.train(dataSet)


def _save_model(classifier, dataType):
    """Pickle ``classifier`` to ./python/models/<dataType>BayesModel.txt."""
    # The file is a binary pickle despite its ".txt" extension; the name is
    # kept unchanged so existing loaders still find it.
    modelName = "./python/models/" + dataType + "BayesModel.txt"
    with open(modelName, 'wb') as f:
        pickle.dump(classifier, f)


def trainModel(dataType, save=True):
    """Train a Positive/Negative Naive Bayes classifier for one corpus.

    Args:
        dataType: ``"Twitter"`` to train on the ``twitter_samples`` positive
            and negative tweets (run through ``preprocess``), or ``"Movie"``
            to train on the raw ``movie_reviews`` words of the ``pos`` and
            ``neg`` categories.
        save: when true, pickle the trained classifier to
            ``./python/models/<dataType>BayesModel.txt``.

    Returns:
        The trained classifier, or ``None`` when ``dataType`` is neither
        ``"Twitter"`` nor ``"Movie"`` (nothing is trained or written then).
    """
    if dataType == "Twitter":
        pTweets = twitter_samples.strings('positive_tweets.json')
        nTweets = twitter_samples.strings('negative_tweets.json')
        positive_docs = preprocess(pTweets, dataType)
        negative_docs = preprocess(nTweets, dataType)
    elif dataType == "Movie":
        positive_docs = [movie_reviews.words(fileid)
                         for fileid in movie_reviews.fileids('pos')]
        negative_docs = [movie_reviews.words(fileid)
                         for fileid in movie_reviews.fileids('neg')]
    else:
        # Unsupported corpus: keep the historical contract of returning None
        # instead of raising, since callers may rely on it.
        return None

    classifier = _train_sentiment_classifier(positive_docs, negative_docs)
    if save:
        _save_model(classifier, dataType)
    return classifier
Example #57
0
def nbtrain(train_set):
    """Train a NaiveBayesClassifier on ``train_set`` and return it."""
    return NaiveBayesClassifier.train(train_set)
Example #58
0
 def __init__(self, feat_sets, train_size=9500):
     """Split the feature sets and set up the three Naive Bayes classifiers.

     Args:
         feat_sets: sliceable sequence of labelled feature sets.
         train_size: number of leading items used for training; everything
             after that index becomes the test set. Defaults to 9500, the
             previously hard-coded split point. If ``feat_sets`` has no more
             than ``train_size`` items the test set is empty.

     The multinomial and Bernoulli scikit-learn wrappers are only
     constructed here (not fitted); the NaiveBayesClassifier is trained
     immediately on the training slice.
     """
     self.train_set = feat_sets[:train_size]
     self.test_set = feat_sets[train_size:]
     self.Multinomial_classifier = SklearnClassifier(MultinomialNB())
     self.bernoulli_classifier = SklearnClassifier(BernoulliNB())
     self.naivebayes_classifier = NaiveBayesClassifier.train(self.train_set)
Example #59
0
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk import classify

# Hand-labelled (name, label) pairs that make up the whole dataset for this
# script; the label is either 'boy' or 'girl'.
# NOTE: this assignment rebinds `names`, so the `names` corpus imported from
# nltk.corpus above is shadowed and never used.
names = [('Aidar', 'boy'), ('Marat', 'boy'), ('Aslan', 'boy'),
         ('Nurbek', 'boy'), ('Nurlan', 'boy'), ('Rakhman', 'boy'),
         ('Rustam', 'boy'), ('Islam', 'boy'), ('Daulet', 'boy'),
         ('Yerkebulan', 'boy'), ('Gaziz', 'boy'), ('Aigerim', 'girl'),
         ('Aidana', 'girl'), ('Zhansaya', 'girl'), ('Karina', 'girl'),
         ('Zarina', 'girl'), ('Aiman', 'girl'), ('Sholpan', 'girl'),
         ('Kamshat', 'girl'), ('Aisulu', 'girl'), ('Alina', 'girl'),
         ('Rauan', 'boy'), ('Raikhan', 'girl')]


def gender_features(word):
    """Return the feature dict for ``word``: its final character only.

    Raises IndexError when ``word`` is empty.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)


# Turn every (name, label) pair into a (features, label) pair.
featuresets = [(gender_features(n), g) for (n, g) in names]
# The first 17 entries train the model; the remaining ones are held out.
# The list is not shuffled, so the split follows the order of `names`.
train_set, test_set = featuresets[:17], featuresets[17:]

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Leyla')))
print(classify.accuracy(nb_classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would only emit a stray "None" line.
nb_classifier.show_most_informative_features(5)
Example #60
-1
def category_by_pos():
    """Train a suffix-based Naive Bayes classifier on Brown 'news' and print its accuracy.

    Counts the 1-, 2- and 3-character suffixes of every lower-cased Brown
    word, keeps 100 of them as features, turns each item of
    ``brown.tagged_words(categories='news')`` into an ``endswith(...)``
    feature dict paired with its second element (presumably the POS tag),
    and evaluates on the first 10% of the data after training on the rest.

    NOTE(review): written for Python 2 (print statements) and the legacy
    NLTK FreqDist API (``inc`` and a sliceable ``keys()``) -- confirm the
    target interpreter/NLTK version before running.
    """
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    # Tally the last one, two and three characters of every word in the
    # whole Brown corpus (all categories, not just 'news').
    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    # Keep the first 100 keys; presumably these are the most frequent
    # suffixes under the legacy FreqDist key ordering -- TODO confirm.
    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        # One boolean feature per retained suffix: does the lower-cased word
        # end with it?
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    # Hold out the first 10% as the test set and train on the remaining 90%.
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)