def train(self, training_corpus):
    """Fit ``self.classifier`` on the non-denied entries of *training_corpus*.

    training_corpus -- non-empty list or tuple of dicts. Each dict is read for
    its "text", "polarity" and "denied" keys; only entries whose "denied"
    value equals 0 contribute a (features, polarity) training pair.
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)
    labeled_features = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            labeled_features.append(
                (twit_features(entry["text"]), entry["polarity"]))
    self.classifier = NaiveBayesClassifier.train(labeled_features)
def get_sentiment_data(query, training_set):
    """Return, per tweet timestamp, the share of tweets classified as '1'.

    A Naive Bayes classifier is trained from
    ``training/<training_set>/training.txt`` (one tab-separated sample per
    line: label in the first column, text in the second) and then applied to
    every tweet returned by ``grab_tweets(query)``.

    Args:
        query: value handed to ``grab_tweets``.
        training_set: name of the sub-directory of ``training/`` to train on.

    Returns:
        dict mapping each ``tweet.created_at`` value to the fraction (0.0-1.0)
        of the tweets with that timestamp whose predicted label is ``'1'``.
    """
    train = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            temp = line.split('\t')
            train.append((get_features(temp[1]), temp[0]))
    clf = NaiveBayesClassifier.train(train)

    tweets = grab_tweets(query)
    print("HERE")
    classified = {}
    for tweet in tweets:
        # Collect every label that shares a timestamp. The previous code
        # re-assigned a fresh one-element list unconditionally, so only the
        # last tweet of each timestamp was ever counted.
        label = clf.classify(get_features(tweet.text))
        classified.setdefault(tweet.created_at, []).append(label)
    print(classified)

    returndata = {}
    for key, labels in classified.items():
        # Fraction of labels equal to '1' for this timestamp.
        returndata[key] = float(sum(1 for v in labels if v == '1')) / len(labels)
    print(returndata)
    return returndata
def nltk_model():
    """Fits the (non-parametric) naive Bayes classifier from nltk on the names.

    Reads one name per line from MALE_FILE and FEMALE_FILE, shuffles the
    labelled names, trains on the first TRAIN_PCT share of the featurized
    data and prints the accuracy measured on the remainder.
    """
    # Local import so the block does not depend on a module-level import.
    import random

    # each elt of all_names will be a (name, gender) tuple
    all_names = []

    with open(MALE_FILE, "r") as f:
        # rstrip removes trailing whitespace (including the newline)
        all_names.extend((line.rstrip(), "male") for line in f)

    with open(FEMALE_FILE, "r") as g:
        all_names.extend((line.rstrip(), "female") for line in g)

    # assert stmts can be useful for debugging etc
    assert len(all_names) == 7944

    # shuffle all_names in place; without this the split below would train on
    # (mostly) male names and test on female names only
    random.shuffle(all_names)

    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))

    train_set, test_set = features[:split_pt], features[split_pt:]
    nb = NaiveBayesClassifier.train(train_set)

    print("accuracy = {0} %".format(int(100 * nltk.classify.accuracy(nb, test_set))))
def test_raw_mail(org_email):
	# Builds a bag-of-words feature dict ({token: True}) from lemmatized,
	# lower-cased tokens, skipping every token found in `stpwords`.
	# NOTE(review): the comprehension below stops after "for key in" -- its
	# iterable (presumably the tokens of org_email) and the closing bracket
	# are missing from this copy; restore them from the original source.

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower()) for key in
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	# NOTE(review): everything below follows the `return` at the same
	# indentation, so it is unreachable where it stands. It reads like
	# script-level training/evaluation code -- confirm its intended place.

	# Extract the features (tokenized, lemmatized, non-stop-word tokens) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	# Hold out the first 10% of the feature sets for testing; train on the rest
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	# Print the classifier's accuracy on the held-out set, as a percentage
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	# Printing the top 50 features
	# NOTE(review): no statement follows the comment above in this copy.

	# Print the labels known to the classifier
	print ('labels:',classifier.labels())

	# Classification of user entered email
	# NOTE(review): the two lines below are indented one level deeper than
	# their surroundings; the enclosing statement (presumably a loop header)
	# is missing from this copy.
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
    def __init__(self, chatbot, **kwargs):
        """Train a Naive Bayes classifier that tells time questions apart.

        Args:
            chatbot: forwarded to the parent initializer.
            **kwargs: forwarded to the parent initializer. The optional keys
                ``positive`` and ``negative`` replace the default lists of
                example sentences that are (label 1) / are not (label 0)
                questions about the current time.
        """
        super().__init__(chatbot, **kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = kwargs.get('positive', [
            'what time is it',
            'hey what time is it',
            'do you have the time',
            'do you know the time',
            'do you know what time it is',
            'what is the time',
        ])

        # The last two samples used to be separated by whitespace only, which
        # Python silently concatenates into a single string; the comma keeps
        # them as two distinct training sentences.
        self.negative = kwargs.get('negative', [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'thyme is my favorite herb',
            'do you have time to look at my essay',
            'how do you have the time to do all this',
            'what is it',
        ])

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        train_set = [
            (self.time_question_features(text), n) for (text, n) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
    def __init_naive_bayes( self ):
        """
        Create and train the NaiveBayes classifier.

        Every file found in the ``spam`` and ``ham`` sub-directories of
        ``corpora/corpus2`` is turned into a labelled feature set through
        ``self.__make_featured_set``; the classifier trained on the combined
        set is stored in ``self.classifier``.

        Raises:
            Exception: wraps any failure; its args carry a fixed prefix, the
                original exception's type name, the source file name, the
                line number and the original message.
        """
        try:
            # The corpus is fixed; it used to be selectable interactively.
            corpus = 'corpus2'
            path = os.path.join('corpora/', corpus)
            spam_path = os.path.join(path, 'spam')
            ham_path = os.path.join(path, 'ham')
            spam_dir = os.listdir(spam_path)
            ham_dir = os.listdir(ham_path)
            train_spam_filelist = [os.path.join(spam_path, f) for f in spam_dir]
            train_ham_filelist = [os.path.join(ham_path, f) for f in ham_dir]

            train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
            train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
            train_set = train_spam_set + train_ham_set
            self.classifier = NaiveBayesClassifier.train( train_set )
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            # `.message` only exists on Python 2 exceptions; fall back to
            # str() so building the report cannot itself raise.
            raise Exception( "Unexpected error in SpamFilter: __spamFilter:", exc_type.__name__,
                os.path.basename( exc_tb.tb_frame.f_code.co_filename ),
                exc_tb.tb_lineno,
                getattr( exc_value, 'message', str( exc_value ) ) )
def check_classifier(feature_extractor, **kwargs):
    """
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.

    Args:
        feature_extractor: forwarded to ``make_train_test_sets``.
        **kwargs: forwarded to ``make_train_test_sets``.
    """
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train the classifier on the training set, then measure accuracy on the
    # spam and ham test sets separately
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features. The method prints its own report
    # and returns None, so wrapping it in print only added a stray "None".
    classifier.show_most_informative_features(20)
def train_nltk(data, labels):
    """
    Returns a trained nltk.NaiveBayesClassifier

    Runs N_FOLDS-fold cross-validation and returns the model of the fold with
    the highest held-out accuracy (None when there are no folds). The accuracy
    of every fold is printed.

    data -- np.array of tuples
    labels -- labels for ``data``, indexed with the same fold index arrays
              (presumably an np.array of equal length -- TODO confirm)
    """
    # For now, shuffle, since for now assuming that only the post language
    # itself is all that's needed for offensive measure, though in the future,
    # 2 anti-something users may actually not be offended by one another if
    # they are both negative about something
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model = None
    max_acc = float('-inf')
    for train_index, test_index in kf:
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]

        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)

        # Materialize the pairs: a lazy zip (Python 3) would be exhausted
        # after its first traversal, and the accuracy helper reads the gold
        # data more than once.
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))
        model = nbc.train(train_set)

        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))
        if acc > max_acc:
            max_acc = acc
            best_model = model
    return best_model
    def __init__(self, **kwargs):
        """Train a Naive Bayes classifier that recognises time questions.

        Args:
            **kwargs: forwarded unchanged to the parent initializer.
        """
        super(TimeLogicAdapter, self).__init__(**kwargs)
        from nltk import NaiveBayesClassifier

        # Sentences that ask for the current time (label 1)
        self.positive = [
            'what time is it',
            'do you know the time',
            'do you know what time it is',
            'what is the time',
        ]

        # Sentences that do not ask for the current time (label 0)
        self.negative = [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'what is',
        ]

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        # Each pair is (sentence, label); the names now say so (they were
        # swapped before, although the values were used in the right order).
        train_set = [
            (self.time_question_features(text), label)
            for (text, label) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
def category_by_movie():
    """Train a Naive Bayes review classifier and print its held-out accuracy.

    Each review of the nltk ``movie_reviews`` corpus becomes a dict of
    ``contains(<word>)`` booleans over the 2000 most frequent corpus words.
    The first 100 (shuffled) documents are held out for testing and the rest
    are used for training.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    # The corpus is listed category by category; shuffle so that both the
    # training and the test slice contain every category.
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # Pick the 2000 most frequent words explicitly instead of relying on the
    # ordering (and sliceability) of FreqDist.keys().
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

    def document_features(document):
        """Map a token list to its {'contains(word)': bool} feature dict."""
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Report accuracy on the held-out documents; scoring the training data
    # (as before) overstates how well the classifier generalizes.
    print(classify.accuracy(classifier, test_set))
    def train(self):
        """Train ``self.classifier`` from catalog noun terms and subjects.

        For every catalog id present in both the "noun_terms" and the
        "Subject" index, one training pair is produced per subject tag: a
        dict holding 1 for each noun of that object (0 for every other known
        noun) together with the tag. Nothing is trained when no pair exists.
        """
        portal_catalog = getToolByName(self, "portal_catalog")
        samples = []
        # Template with every known noun marked as absent
        absent_nouns = dict.fromkeys(
            portal_catalog.uniqueValuesFor("noun_terms"), 0)

        subject_index = portal_catalog._catalog.getIndex("Subject")
        noun_index = portal_catalog._catalog.getIndex("noun_terms")

        # Internal catalog ids of the objects carrying noun terms ...
        ids_with_nouns = IISet(noun_index._unindex.keys())
        # ... and of the objects carrying subjects
        ids_with_subjects = IISet(subject_index._unindex.keys())

        for catalog_id in intersection(ids_with_subjects, ids_with_nouns):
            presence = absent_nouns.copy()
            object_nouns = noun_index._unindex[catalog_id]
            object_tags = subject_index._unindex[catalog_id]
            presence.update((noun, 1) for noun in object_nouns)
            samples.extend((presence, tag) for tag in object_tags)

        if samples:
            self.classifier = NaiveBayesClassifier.train(samples)
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Build and evaluate each classifier type ``n`` times.

    For every model name a fresh train/test split is drawn ``n`` times via
    ``buildsets``; the performance measures returned by ``evaluate`` are
    summed element-wise over the runs and handed to ``printperformance``.

    Args:
        featureslist: forwarded to ``buildsets``.
        SAMPLE_PROPORTION: forwarded to ``buildsets``.
        n: number of runs per model.

    Returns:
        list with the last classifier built for each model, in the order
        'Naive Bayes', 'Logistic Regression', 'Linear SCV'.
    """
    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
            else:  # 'Linear SCV'
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
            # NOTE(review): the sklearn wrappers are not trained here;
            # presumably `evaluate` fits them -- confirm.
            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # Accumulate element-wise; list() keeps the running total
                # reusable where map() returns a one-shot iterator.
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
def get_matrix(spam_set, ham_set, num_folds):
    """
    Generate different matrix by taking the average of K Fold data

    For every fold produced by ``utils.get_kfold_data`` a classifier is
    trained and scored. A spam sample predicted as label 0 counts as a true
    positive (otherwise a false negative); a ham sample predicted as label 1
    counts as a true negative (otherwise a false positive).

    Returns:
        Tuple ``(precision, recall, F1, spam_accuracy, ham_accuracy)``, each
        averaged over ``num_folds``; the two accuracies are percentages.
        A ratio with an empty denominator within a fold counts as 0.0.
    """
    def ratio(numerator, denominator):
        # Avoid ZeroDivisionError for degenerate folds (e.g. nothing was
        # predicted as spam); such a fold contributes 0.0.
        return numerator / float(denominator) if denominator else 0.0

    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0
        for test in test_spam_set:
            if classifier.classify(test[0]) == 0:
                true_positive += 1
            else:
                false_negative += 1
        for test in test_ham_set:
            if classifier.classify(test[0]) == 1:
                true_negative += 1
            else:
                false_positive += 1
        precision = ratio(true_positive, true_positive + false_positive)
        recall = ratio(true_positive, true_positive + false_negative)
        F1 += ratio(2 * precision * recall, precision + recall)
        spam_accuracy += ratio(true_positive, true_positive + false_negative)
        ham_accuracy += ratio(true_negative, true_negative + false_positive)
        total_precision += precision
        total_recall += recall

    return total_precision/num_folds, total_recall/num_folds, F1/num_folds, spam_accuracy*100/num_folds, ham_accuracy*100/num_folds
 def train_classifiers(self):
     """Train one Naive Bayes classifier per word in ``self.senses``.

     For each word, every vector stored under each sense id becomes a
     ``[dict(vector), sense_id]`` training item; the resulting classifier is
     stored in ``self.classifiers[word]``.
     """
     for word in self.senses:
         sense_vectors = self.senses[word]
         labeled = [
             [dict(vector), sense_id]
             for sense_id in sense_vectors
             for vector in sense_vectors[sense_id]
         ]
         self.classifiers[word] = NaiveBayesClassifier.train(labeled)
def training(features, method, proportion_training):
    """Split *features* and train a classifier on the leading part.

    Args:
        features: sequence of labelled feature sets, already in the desired
            order (no shuffling happens here).
        method: name of the learning method; only 'NaiveBayes' is supported.
        proportion_training: fraction of *features* used for training
            (e.g. 2/3 for training and 1/3 for testing).

    Returns:
        Tuple ``(training_set, testing_set, classifier)``.

    Raises:
        ValueError: if *method* is not a supported method name (previously
            this surfaced as an UnboundLocalError at the return statement).
    """
    if method != 'NaiveBayes':
        raise ValueError("Unsupported training method: %r" % (method,))

    split_at = int(proportion_training * len(features))
    training_set = features[:split_at]
    testing_set = features[split_at:]
    classifier = NaiveBayesClassifier.train(training_set)
    return training_set, testing_set, classifier
  def train(self, foldPercent=.8):
    """Split the built features and train ``self.classifier`` on the head.

    The first ``foldPercent`` share of ``self.buildFeatures()`` is stored in
    ``self.setTrain`` and used for training; the rest goes to ``self.setTest``.
    """
    featureSets = self.buildFeatures()
    cut = int(foldPercent * len(featureSets))

    self.setTrain, self.setTest = featureSets[:cut], featureSets[cut:]
    self.classifier = nbc.train(self.setTrain)
def train(features, samples_proportion):
    """Split *features* at ``samples_proportion`` and train on the head.

    Prints the sizes of both parts and returns
    ``(train_set, test_set, classifier)``; the classifier is trained on a
    tuple copy of the training part.
    """
    cutoff = int(len(features) * samples_proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]
    for label, subset in (('Training', train_set), ('Test', test_set)):
        print (label + ' set size = ' + str(len(subset)) + ' emails')
    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
def textClass():
    """Train two Naive Bayes classifiers from the review records in all.txt.

    Records are read with ``readRec`` until an empty record is returned or
    150000 read attempts were made. Each review's words (stripped of
    surrounding punctuation) are turned into a feature dict that is paired
    once with the review score and once with the usefulness class.

    Returns:
        Tuple ``(rate_cl, use_cl)``: the rating classifier and the
        usefulness classifier.
    """
    ratings = []     # (word features, rating given)
    usefulness = []  # (word features, review classification)

    tot_recs = 0
    len_tot = 0
    mlen = 0

    # parse the file and create the lists to be passed to the NBClassifiers;
    # the context manager closes the file even if parsing fails
    with open("all.txt") as dbFile:
        while tot_recs < 150000:
            if tot_recs % 1000 == 0:
                print("num records: %s" % tot_recs)
            tot_recs += 1
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                # an empty record marks the end of the data
                break
            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])
            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str(prs_rec["class"])

            # word feature dictionary, shared by both training sets
            wfd = word_feats(review_text)

            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))

    # NOTE(review): tot_recs counts read attempts, so it includes the final
    # empty record when the data ends before the cap -- confirm intent.
    print("avg length: %s" % (len_tot / tot_recs))
    print("max len: %s" % mlen)

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """Train on *train_set* and report accuracy on the spam and ham test sets.

    Uses NaiveBayesClassifier.train() from NLTK to train on train_set
    (spam + ham); the classifier is then used to evaluate the accuracy on
    test_spam and test_ham. Finally, the most informative features are shown.
    """
    classifier = NaiveBayesClassifier.train(train_set)
    print("Test Spam accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print("Test Ham accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_ham)))
    # The method prints its own report and returns None; printing its result
    # only added a stray "None" line.
    classifier.show_most_informative_features(20)
def train(features, samples_proportion):
    """Train a Naive Bayes classifier on the leading share of *features*.

    The first ``int(len(features) * samples_proportion)`` items form the
    training set and the remainder the test set; both sizes are printed.
    Returns ``(train_set, test_set, classifier)``.
    """
    boundary = int(len(features) * samples_proportion)
    # carve out the training head and the test tail
    train_set = features[:boundary]
    test_set = features[boundary:]
    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')
    # fit the model on the training head only
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
def buildClassifier(hamDir, spamDir):
    """Train, evaluate and save a spam/ham Naive Bayes classifier.

    Every file in *spamDir* and *hamDir* is read and labelled, the labelled
    emails are shuffled, and a 70:30 train/test split is used to print an
    accuracy figure. A classifier trained on all emails is then saved to
    "full-classifier.pickle" via ``saveClassifier``.

    Args:
        hamDir: directory holding the ham emails.
        spamDir: directory holding the spam emails.
    """
    # Local import so the block does not depend on a module-level import.
    import random

    spamEmails = []
    hamEmails = []
    allEmails = []
    features = []

    # Using glob instead of os.listdir to ignore hidden files.
    # The files were previously opened but never read or closed, leaving
    # both lists empty and the handles leaked.
    for email in glob.glob(spamDir + "/*"):
        with open(email) as f:
            spamEmails.append(f.read())

    for email in glob.glob(hamDir + "/*"):
        with open(email) as f:
            hamEmails.append(f.read())

    for email in spamEmails:
        allEmails.append((email, 'spam'))

    for email in hamEmails:
        allEmails.append((email, 'ham'))

    # Shuffle to get the accuracy of the 70:30 ratio: without it the test
    # slice would contain ham only. The full-size classifier below does not
    # depend on the order.
    random.shuffle(allEmails)

    # Make a list of features per email
    for (email, label) in allEmails:
        features.append((emailFeatures(email), label))

    # 70:30 ratio for training:testing
    print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
    totalSize = int(len(features) * 0.7)
    trainingEmails, testingEmails = features[:totalSize], features[totalSize:]

    print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
    classifier = NaiveBayesClassifier.train(trainingEmails)
    print(classify.accuracy(classifier, testingEmails))

    print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
    classifier = NaiveBayesClassifier.train(features)

    saveClassifier(classifier, "full-classifier.pickle")
    def __init__(self,classifierType):
        # Reads sfIsGood.csv (located next to this module) and trains
        # self.classifier as either a 'driver' or an 'invalid' classifier,
        # depending on classifierType.
        # NOTE(review): several lines are missing from this copy (see the
        # notes below), so the block does not parse as it stands.

        titles = []
        bodies = []
        invalids = []
        drivers = []
        fromFields = []
        toFields = []
        ctitles = []
        cbodies = []
        cdrivers = []

        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            # i counts rows; row 0 (presumably the header) is skipped below
            i = -1
            for row in spamreader:
                i += 1
                if (i > 0):
                    # column 6 marks a row as 'invalid'
                    invalids.append(row[6] == 'invalid')
                    # NOTE(review): the body of this `if` is missing, and so
                    # are the statements that fill titles, bodies, drivers,
                    # ctitles, cbodies and cdrivers; those lists stay empty.
                    if not row[6] == 'invalid':

        # All tokens of the selected titles and bodies, used for the
        # frequency distribution below
        words = []
        if classifierType == 'driver':
            for i in range(len(ctitles)):
                words += nltk.word_tokenize(ctitles[i])
                words += nltk.word_tokenize(cbodies[i])

            # NOTE(review): the right-hand operand of "+" (presumably the
            # tokenized body) is missing between the next two lines.
            documents = [((nltk.word_tokenize(ctitles[i]) +
                          , cdrivers[i]) for i in range(len(ctitles))]

        elif classifierType == 'invalid':
            for i in range(len(titles)):
                words += nltk.word_tokenize(titles[i])
                words += nltk.word_tokenize(bodies[i])

            # NOTE(review): same truncation as above; also this iterates over
            # len(ctitles) while indexing titles/invalids -- confirm whether
            # len(titles) was intended.
            documents = [((nltk.word_tokenize(titles[i]) +
                          , str(invalids[i])) for i in range(len(ctitles))]
        # NOTE(review): `documents` is unbound for any other classifierType.
        all_words = nltk.FreqDist(w.lower() for w in words)
        # First 500 keys of the frequency distribution; slicing keys()
        # requires it to return a list (Python 2 behaviour)
        self.word_features = all_words.keys()[:500]
        self.training_set = [(self.document_features(d), c) for (d,c) in documents]
        self.classifier = NaiveBayesClassifier.train(self.training_set)
    def naives_classifier(self, training_set, dev_set, log=0):
        # Trains a Naive Bayes classifier on training_set, prints its
        # accuracy on dev_set as a percentage and returns the classifier.

        classifier = NaiveBayesClassifier.train(training_set)
        accuracy = classify.accuracy(classifier, dev_set)

        print('Naive Bayes accuracy dev percent: ', (accuracy * 100))
        # NOTE(review): the body of this `if` is missing from this copy, so
        # what log == 1 is meant to output cannot be told from here.
        if log == 1:

        return classifier
def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes classifier.

    A classifier is trained on the nltk ``names`` corpus on every call and
    applied to *user_name*.

    Args:
        user_name: the name to classify.
        classifier: ignored; kept for interface compatibility (the original
            code also replaced it with a freshly trained classifier).

    Returns:
        'male' or 'female'.
    """
    def name_features(name):
        # nltk classifiers need a feature dict rather than a raw string;
        # the final letter is the feature used by the NLTK book's gender
        # identification example.
        return {'last_letter': name[-1:].lower()}

    # Use a distinct local name: assigning to `names` inside the function
    # made it local and raised UnboundLocalError on the corpus lookup.
    labeled_names = (
        [(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')]
    )
    features = [(name_features(name), gender) for (name, gender) in labeled_names]
    # As before, the first 500 entries are left out of the training data.
    training_set = features[500:]
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier.classify(name_features(user_name))
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
    """Train and pickle a tweet sentiment classifier.

    Both input files are read as tab-separated rows; rows that do not have
    exactly 9 columns are skipped and the text is taken from the second
    column. Rows from *negativeFile* are labelled "neg", the others "pos".

    Args:
        positiveFile: path of the tab-separated file with positive tweets.
        negativeFile: path of the tab-separated file with negative tweets.
        nOccurrences: minimum stored count a feature needs to be kept in the
            saved feature table.
        trainProportion: leading share of the samples used for training; the
            remainder is used to print an accuracy figure.

    Side effects:
        Writes "classifier.pickle" and "features.pickle" and creates or
        truncates "features.lst".
    """
    tweetfeats = []
    masterfeats = {}
    for fn in (positiveFile, negativeFile):
        theclass = "neg" if fn == negativeFile else "pos"
        with open(fn, 'r') as infile:
            for line in csv.reader(infile, delimiter='\t'):
                # Check the row shape before indexing into it
                if len(line) != 9:
                    continue
                text = line[1]
                # break up into tokens removing all non-word chars
                feat = featurify(text)
                for name in feat:
                    if name in masterfeats:
                        masterfeats[name] += 1
                    else:
                        # NOTE: counting starts at 0, so the stored value is
                        # "occurrences - 1"; kept as in the original so the
                        # nOccurrences threshold keeps its meaning.
                        masterfeats[name] = 0
                if len(feat) > 0:
                    tweetfeats.append((feat, theclass))

    # Keep only the features that reach the threshold
    masterfeats = {name: count for name, count in masterfeats.items()
                   if count >= nOccurrences}
    # NOTE(review): features.lst is created/truncated but nothing is written
    # to it in the visible code -- confirm whether a feature dump is missing.
    with open("features.lst", "w"):
        pass
    print("Number of Features = %i" % len(masterfeats))

    train_cut = int(len(tweetfeats) * trainProportion)
    trainfeats = tweetfeats[:train_cut]
    testfeats = tweetfeats[train_cut:]

    print("Training sentiment classifier...")
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

    # SAVE the classifier & features (binary mode is what pickle expects)
    with open("classifier.pickle", 'wb') as out:
        pickle.dump(classifier, out)
    with open("features.pickle", 'wb') as out:
        pickle.dump(masterfeats, out)
def classify(text, sender=None, subject=None):
    """Classify *text* and return the name of its most probable category.

    A classifier is trained on ``load_training_set()`` for every call. The
    features are the bag of words over the bigrams extracted from *text*;
    *sender* and *subject*, when given, are added as extra features. The
    probability of every category is pretty-printed before returning.
    """
    model = NaiveBayesClassifier.train(load_training_set())
    featureset = bag_of_words(extract_bigrams(text))
    for extra in (sender, subject):
        if extra is not None:
            featureset[extra] = True
    distribution = model.prob_classify(featureset)
    probabilities = {}
    for sample in distribution.samples():
        probabilities[categories[sample]] = distribution.prob(sample)
    pprint(probabilities)
    return categories[distribution.max()]
 def train(self, data):
     """Train ``self.classifier`` on sliding windows over the encoded data.

     ``self._represent(data)`` is stored as ``self.result_string`` and its
     symbol frequencies as ``self.labels``. Windows of ``self.n_w`` symbols
     are taken every ``self.n_w - 1`` positions; each window, together with
     the symbol that follows it, is turned into a training item by
     ``self._gen_feature``.
     """
     self.result_string = self._represent(data)
     self.labels = defaultdict(int)
     self.labels = FreqDist(self.result_string)
     samples = []
     position = 0
     stop = len(self.result_string) - self.n_w
     stride = self.n_w - 1
     for position in range(0, stop, stride):
         following = position + self.n_w
         samples.append(self._gen_feature(
             self.result_string[position:following],
             self.result_string[following]))
     self.classifier = NaiveBayesClassifier.train(samples)
   def buildRevClassifier(self, features, normalize, validity):
      """Train a Naive Bayes classifier that predicts a review's reviewer.

      features -- callable turning a review into a feature dict
      normalize, validity -- accepted but not used by this method

      Returns the classifier trained on one (features(rev), rev.reviewer)
      pair per value stored in this container.
      """
      labeled = []
      for rev in self.values():
         labeled.append((features(rev), rev.reviewer))
      return NaiveBayesClassifier.train(labeled)
def cross_validate():
    """Run 10-fold cross-validation over the training set and print results.

    For each fold the evaluated index range and the accuracy are printed,
    followed by the average accuracy over all folds.
    """
    n_folds = 10
    training_set = load_training_set()
    total = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # Select exactly the indexed samples. Slicing from the first to the
        # last training index (as before) pulled the evaluation fold into
        # the training data whenever that fold lay in the middle, and always
        # dropped the last element of each range.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]
        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print('Range:  %s to %s' % (evalcv[0], evalcv[len(evalcv) - 1]))
        print('Accuracy: %4.2f' % acc)
        total += acc
    print('Average accuracy: %4.2f' % (total / n_folds))
ts = ts[:2]

# Pair every sentence with its tag. The pairs are walked twice below
# (vocabulary, then features), so they are materialized instead of being
# left as a one-shot zip iterator.
training_data = list(zip(tl, ts))

# Every distinct lower-cased token that occurs in a training sentence
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

# One ({word: is_present}, tag) pair per training sentence; each sentence is
# tokenized once rather than once per vocabulary word
feature_set = []
for sentence, tag in training_data:
    sentence_words = set(word_tokenize(sentence.lower()))
    feature_set.append(
        ({i: (i in sentence_words) for i in vocabulary}, tag))

classifier = nbc.train(feature_set)

# for classifying a new sentence

test_sentence = tl[1]
test_words = set(word_tokenize(test_sentence.lower()))
featurized_test_sentence = {
    i: (i in test_words)
    for i in vocabulary
}

print("test_sent: %s" % test_sentence)
print("tag: %s" % classifier.classify(featurized_test_sentence))

#print nltk.classify.accuracy(classifier,test_set)
 def train_topic_classifier(self, train_set):
     """Return a Naive Bayes classifier trained on *train_set*."""
     return NaiveBayesClassifier.train(train_set)
 def train(self, trainingData):
     """Train a Naive Bayes classifier on *trainingData* and keep it on self."""
     fitted = NaiveBayesClassifier.train(trainingData)
     self.classifier = fitted
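    # ...rather than a truly random sequence. The seed is the first value the algorithm starts computing from, so as long as the seed is the same, all subsequent "random" results and their order are exactly the same.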


    # Fragment of a function whose header is not part of this copy.
    # The first 80% of the featurized data is used for training and the
    # remainder for testing.
    num_train = int(0.8*len(data))

    # Repeat the experiment with i = 1..5, the value passed to
    # extract_features (printed as "Number of end letters")
    for i in range(1,6):
        print('\nNumber of end letters:',i)
        features = [(extract_features(n,i),gender) for (n,gender) in data]
        train_data,test_data = features[:num_train],features[num_train:]
        calssifier =NaiveBayesClassifier.train(train_data)
        # Accuracy on the held-out part, as a percentage rounded to 2 decimals
        accuracy = round(100*nltk_accuracy(calssifier,test_data),2)
        print('Accuracy = '+str(accuracy)+'%')

        # NOTE(review): the body of this loop is missing from this copy
        # (presumably it classifies each input name -- confirm).
        for name in input_names:

def main():
    # Trains a Positive/Negative Naive Bayes tweet classifier, prints its
    # accuracy, then classifies the fetched tweets and prints statistics.
    # NOTE(review): several lines are missing from this copy (bodies of
    # `if`, call arguments, `else:`/`try:` lines); see the notes below.
    should_download = input("Do you need to download nltk libraries? [y/n] ")
    # NOTE(review): the body of this `if` is missing from this copy.
    if should_download == "y":

    analysis = SentimentAnalysis()

    # If the cleaned and tokenized data is already cached, pull from that
    if os.path.isfile('cache/cleaned_training_data_negative_cache.csv'):
        # NOTE(review): the arguments of both read_cache calls are cut off.
        cleaned_positive_content = read_cache(
        cleaned_negative_content = read_cache(
        print("Read from cache")
        # Otherwise, clean and tokenize the data and then cache it.
        # NOTE(review): the `else:` line, the loading of positive_tweets /
        # negative_tweets and the cache write are not present in this copy.

        positive_tokens = analysis.tokenize_training_model(positive_tweets)
        negative_tokens = analysis.tokenize_training_model(negative_tweets)
        cleaned_positive_content = analysis.clean_content(positive_tokens)
        cleaned_negative_content = analysis.clean_content(negative_tokens)


    # NOTE(review): the arguments of both calls below are cut off.
    positive_content_for_model = analysis.prepare_content_for_model(
    negative_content_for_model = analysis.prepare_content_for_model(

    # Pair every prepared item with its sentiment label, the form the
    # classifier is trained on.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_content_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_content_for_model]

    # The positive and negative sentiment halves to train off of should be combined again, and the order randomized.
    # NOTE(review): no shuffling statement is present in this copy; as it
    # stands the split below is ordered positives first, then negatives.
    dataset = positive_dataset + negative_dataset

    # train the first 70%, test the last 30%. We have 1.6 million tweets in our training data.
    train_data = dataset[:1120000]
    test_data = dataset[1120000:]

    print("Training using dataset")
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    # After training, we can repeat the process using real data.
    tokenized_tweets, og_tweets, num_original_tweets = fetch_featured_tweets()
    assert len(tokenized_tweets) == len(og_tweets)
    cleaned_drug_tokens = analysis.clean_content(tokenized_tweets)

    print("Running network on real tweets")
    num_positives = 0
    for idx, tokens in enumerate(cleaned_drug_tokens):
        original_tweet = og_tweets[idx]
        # Same {token: True} feature form as the training data
        token_dict = dict([token, True] for token in tokens)
            # Classify each tweet with the trained classifier and only output Positive sentiment tweets.
            # NOTE(review): the `try:` line that opens this deeper-indented
            # block and the body of the `except` below are missing.
            classified = classifier.classify(token_dict)
            if classified == 'Positive':
                num_positives += 1
                print(original_tweet, "=>", classified)
        except Exception:

    print("\nTotal original tweets:", num_original_tweets)
    print("Total drug related tweets:", len(cleaned_drug_tokens))
    print("Percent of original tweets that are drug related:",
          len(cleaned_drug_tokens) / num_original_tweets)
    print("Total number of positive sentiment tweets:", num_positives)
    print("Percent of drug related tweets with positive sentiment:",
          num_positives / len(cleaned_drug_tokens))

    return 0
    # Fragment of a function whose header is not part of this copy.
    # NOTE(review): the argument of the call below is cut off.
    negative_tokens_for_model = get_tweets_for_model(

    # Pair every positive token dictionary with the label "Positive"
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    # Pair every negative token dictionary with the label "Negative"
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    # Merge both labelled lists
    dataset = positive_dataset + negative_dataset
    # Randomize their position
    # NOTE(review): no shuffling statement is present in this copy; as it
    # stands the split below is ordered positives first, then negatives.
    # Split the dataset: roughly 80% for training, the rest for testing
    value = 0.8 * len(dataset) + 1
    train_dataset = dataset[:int(value)]
    test_dataset = dataset[int(value):]

    # Train the Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_dataset)
    # Check and print the accuracy on the testing data
    print("Accuracy is:", classify.accuracy(classifier, test_dataset))
    # Show the 10 most informative words
    # NOTE(review): no statement follows the comment above in this copy.
    # Create and classify a sample tweet
    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    # NOTE(review): the line that opens the call closed below (presumably a
    # print of the tweet) is missing from this copy.
          classifier.classify(dict([token, True] for token in custom_tokens)))
    # Tail of a function whose header is not part of this copy (presumably
    # the find_features helper used below -- confirm).
    # Maps every word in word_features to whether it occurs in `words`.
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

# One (feature dict, category) pair per review
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# NOTE(review): the documents are meant to be shuffled at this point, but no
# shuffling statement is present in this copy.

# The first 20000 feature sets train the model; the remainder tests it
training_set, testing_set = featuresets[:20000], featuresets[20000:]

classifier = NaiveBayesClassifier.train(training_set)
accuracy = classify.accuracy(classifier, testing_set)

# scikit-learn estimators wrapped for the nltk classifier interface; they are
# only constructed here, not trained or evaluated
MNB_clf = SklearnClassifier(MultinomialNB())

BNB_clf = SklearnClassifier(BernoulliNB())

LogReg_clf = SklearnClassifier(LogisticRegression())

SGD_clf = SklearnClassifier(SGDClassifier())
# Script fragment: 10-fold cross-validation of a Naive Bayes classifier.
# NOTE(review): several call argument lists are cut off and the body of the
# last loop is missing from this copy; see the notes below.
features_data = np.array(sentences)
features_data_test = np.array(testSentences)

# Shuffled 10-fold split with a fixed seed
k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
accuracy_data_scores = []
for train_set, test_set in k_fold.split(features_data):
    # NOTE(review): the arguments of the next three calls are cut off.
    word_features = get_word_features(
    train_features = apply_features(extract_features,
    test_features = apply_features(extract_features,
    classifier = NaiveBayesClassifier.train(train_features)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    # NOTE(review): the arguments of the call below are cut off.
    testdata_features = apply_features(extract_features,
    refdatasets = collections.defaultdict(set)
    testdatasets = collections.defaultdict(set)

    # Classify every feature set of the fold's test split
    # NOTE(review): nothing visible here stores `observed` or fills
    # refsets/testsets.
    for i, (feats, label) in enumerate(test_features):
        observed = classifier.classify(feats)

    # NOTE(review): the body of this loop is missing from this copy.
    for i, (feats, label) in enumerate(testdata_features):
 def train_model(self, data):
     """Train a Naive Bayes classifier on *data* and store it as ``self.model``."""
     trained = NaiveBayesClassifier.train(data)
     self.model = trained
 def trainModel(self, train_data, test_data):
     """Return a Naive Bayes classifier trained on *train_data*.

     *test_data* is accepted but not used by this method.
     """
     model = NaiveBayesClassifier.train(train_data)
     return model
def train(all_features, ratio):
    """Split *all_features* at *ratio* and train on the leading part.

    Returns ``(train_set, test_set, clf)`` where ``train_set`` holds the first
    ``int(len(all_features) * ratio)`` items and ``test_set`` the remainder.
    """
    head_count = int(len(all_features) * ratio)
    train_set = all_features[:head_count]
    test_set = all_features[head_count:]
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
def main():
    """Train a news-sentiment classifier and label the past week's business headlines.

    Reads labelled rows (label in column 0, text in column 1) from
    ``INPUT_PATH/newsSentiment.csv``, trains an NLTK Naive Bayes classifier on
    a shuffled 75/25 split, prints the held-out accuracy, classifies the
    title of every ``business`` article loaded for the last seven days, and
    writes the ``title`` and ``sentiment`` columns to
    ``TEST_OUTPUT_PATH/data3.csv``.
    """
    # Local import so this function does not depend on the file's import block.
    import random

    model_csv = INPUT_PATH + '/newsSentiment.csv'
    stop_words = stopwords.words('english')

    # Cleaned token lists grouped by sentiment label. Previously this dict was
    # never filled, so the classifier was trained on an empty dataset.
    model_data = {'positive': [], 'neutral': [], 'negative': []}
    with open(model_csv, newline='', encoding="ISO-8859-1") as csv_file:
        for row in csv.reader(csv_file):
            label, text = row[0], row[1]
            # Rows whose label is not one of the three known classes
            # (for example a header line) cannot be used for training.
            if label not in model_data:
                continue
            model_data[label].append(
                remove_noise(word_tokenize(text), stop_words))

    # use UTC time
    to_datetime = datetime.utcnow()
    from_datetime = to_datetime - timedelta(days=7)
    news_data = load_news_data(from_datetime, to_datetime)
    news_data = news_data[news_data['category'] == 'business']
    # TODO: remove tiny snippets (the original filter on length > 140 was left unfinished).

    dataset = []
    for key, token_list in model_data.items():
        tokens_for_model = get_tweets_for_model(token_list)
        dataset.extend((tweet_dict, key) for tweet_dict in tokens_for_model)

    # The dataset is assembled one label at a time, so it must be shuffled
    # before the positional split; otherwise the test quarter would contain
    # only the last label.
    random.shuffle(dataset)

    partition_number = len(dataset) * 3 // 4
    train_data = dataset[:partition_number]
    test_data = dataset[partition_number:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    # Label every headline with the trained classifier and persist the result.
    news_data['sentiment'] = news_data['title'].apply(classify_string,
                                                      args=(classifier, ))
    save_path = TEST_OUTPUT_PATH + '/data3.csv'
    results = news_data[['title', 'sentiment']]
    results.to_csv(save_path, index=False, encoding='utf-8')
def main():
    # End-to-end driver: cleans the positive/negative training tweets, trains
    # and scores a Naive Bayes classifier, builds and plots a confusion
    # matrix, then classifies the tweets read from `predict_input_file_dir`
    # and hands the labels to write_csv. Working state lives in the
    # module-level globals declared below.
    global positive_tokens
    global cleaned_positive_tokens
    global negative_tokens
    global cleaned_negative_tokens
    global predict_tokens
    global cleaned_predict_tokens
    global output_list

    global temp_matrix

    # get cleaned up tokens
    # presumably clean_up_tweets fills the two list arguments in place - verify in its definition
    print("......Cleaning up Dataset......")
    print("...removing stop words...\n")
    clean_up_tweets(positive_input_file_dir, train_text_column_index, positive_tokens, cleaned_positive_tokens)
    print("Done: clean up positive tweets")
    clean_up_tweets(negative_input_file_dir, train_text_column_index, negative_tokens, cleaned_negative_tokens)
    print("Done: clean up negative tweets\n")


    # Converting Tokens to a Dictionary:
    positive_tokens_for_model = get_tweets_for_model(cleaned_positive_tokens)
    negative_tokens_for_model = get_tweets_for_model(cleaned_negative_tokens)
    print("Done: Convert tokens to dictionaries.\n")

    # create a dataset by joining the positive and negative tweets.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    print("Done: Combine dataset by joining the positive and negative tweets.")
    # random shuffle
    # NOTE(review): no shuffle call follows the comment above in this view, so
    # the 70/30 split below is taken from positives-then-negatives order.

    print(f"positive dataset: {len(positive_dataset)} tweets.")
    print(f"negative dataset: {len(negative_dataset)} tweets.")
    print(f"combine positive & negative dataset: {len(dataset)} tweets.\n")
    print("......Training Data......")

    # splits the shuffled data into a ratio of 7:3 for training and testing
    train_data = dataset[:round(len(dataset)*0.7)]
    test_data = dataset[round(len(dataset)*0.7):]
    print(f"train data: {len(train_data)} tweets")
    print(f"test data: {len(test_data)} tweets\n")

    print("Build & Test Naive_Bayes_Classifier Model: ")
    classifier = NaiveBayesClassifier.train(train_data)
    print(f"Accuracy is:{classify.accuracy(classifier, test_data)}\n")


    # build confusion matrix
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    labels = []
    tests = []

    # NOTE(review): the loop only computes `observed`; nothing visible here adds
    # to refsets/testsets/labels/tests, so the metrics below see empty inputs.
    for i, (feats, label) in enumerate(test_data):
        observed = classifier.classify(feats)
    print("=============Precision and Recall====================")
    print(f"Positive precision: {nltk.precision(refsets['Positive'], testsets['Positive'])}")
    print(f"Positive recall: {nltk.recall(refsets['Positive'], testsets['Positive'])}")
    print(f"Positive F-measure: {nltk.f_measure(refsets['Positive'], testsets['Positive'])}")
    print(f"Negative precision: {nltk.precision(refsets['Negative'], testsets['Negative'])}")
    print(f"Negative recall: {nltk.recall(refsets['Negative'], testsets['Negative'])}")
    print(f"Negative F-measure: {nltk.f_measure(refsets['Negative'], testsets['Negative'])}")

    print("=============Confusion Matrix====================")
    confusion_matrix_result = nltk.ConfusionMatrix(labels, tests)

    # now visualize the confusion matrix using matplotlib.pyplot
    #=============Visualize Confusion Matrix====================
    # matrix needs to be saved as np.array()
    # also, needs to extract ._confusion first
    # (._confusion is a private attribute of the ConfusionMatrix object)
    confusion_matrix_result = np.array(confusion_matrix_result._confusion)
    temp_matrix = confusion_matrix_result

    classes = ["Negatives", "Positives"]
    # NOTE(review): the imshow call below is missing its remaining arguments
    # and closing parenthesis in this view.
    plt.imshow(confusion_matrix_result, interpolation='nearest',
    plt.title("Confusion Matrix")
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Write each cell's count onto the plot; white text on cells above half
    # the maximum count, black otherwise.
    text_format = 'd'
    thresh = confusion_matrix_result.max()/2
    # NOTE(review): the itertools.product call below is also left unclosed in this view.
    for row, column in itertools.product(range(confusion_matrix_result.shape[0]),
        plt.text(column, row, format(confusion_matrix_result[row, column], text_format),
                 color='white' if confusion_matrix_result[row, column] > thresh else "black")

    plt.ylabel("True Values")
    plt.xlabel("Predicted Values")
    # needs a high resolution image
    # NOTE(review): hard-coded absolute, user-specific output path.
    plt.savefig("/Users/Han/Downloads/web project data/confusion_matrix.png", dpi=1200)

    # =======================================now predict new tweets=======================================
    print("......Now Cleaning up new Dataset......")
    print("...removing stop words...\n")
    clean_up_tweets(predict_input_file_dir, predict_text_column_index, predict_tokens, cleaned_predict_tokens)
    print("Done: clean up predict tweets\n")

    print("...Now Deploy Bayes Classifier on new dataset...")
    # Each prediction is appended as a one-element list (one row per tweet).
    for current_tweet_tokens in cleaned_predict_tokens:
        output_list.append([classifier.classify(dict([token, True] for token in current_tweet_tokens))])

    write_csv(output_list, output_file_dir)
    print("Done! ")
 def train(self, train_set):
     """Fit a Naive Bayes classifier on *train_set*, store it on ``self.classifier`` and return it."""
     trained = NaiveBayesClassifier.train(train_set)
     self.classifier = trained
     return self.classifier
import random

# Show one labelled example from each class as a sanity check.
print("Dictionary with Positive class : ", positiveReviewDataset[7])
print("Dictionary with Negative class : ", negativeReviewDataset[7])

dataset = positiveReviewDataset + negativeReviewDataset

print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))

# The dataset is positives followed by negatives, so it has to be shuffled
# before the positional split; otherwise the held-out slice is drawn only
# from the end of the list and does not reflect both classes.
random.shuffle(dataset)

trainData = dataset[:7000]
testData = dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)

print("Accuracy of the model : ", classify.accuracy(trainedModel, testData))

review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))

# Test print: classify the sample review using the same {token: True}
# feature format as the training data.
print(review, " : ",
      trainedModel.classify({token: True for token in reviewTokens}))

#Text = "j@nittha"
#Text = re.sub("@", "a", Text)

    # Tail of a feature-extraction function whose header is outside this view:
    # for every known unique word, record whether it occurs in the sentence.
    for palavra_unica_base_tratada in palavras_unicas_base_tratada:
        resultado_linha_palavra['%s' % palavra_unica_base_tratada] = (
            palavra_unica_base_tratada in palavras_unicas_da_frase)

    # print(f'{frase}: {palavras_unicas_da_frase} : {resultado_linha_palavra}\n')

    return resultado_linha_palavra

# Labelled (classified) dataset
# NOTE(review): this call is missing its second argument and closing
# parenthesis in this view.
base_classificada = classify.apply_features(extrator_linha_nltk,

# Build the Naive Bayes probabilistic classifier
classificador = NaiveBayesClassifier.train(base_classificada)

# Classifier statistics
# NOTE(review): the indented f-string below looks like the argument of a
# print call whose opening line is missing here.
    f'As classes existentes na base classificada são {classificador.labels()}\n'
print(f'As 5 principais características são:')


# Using the classifier
print(f'Utilizando classificador Naive Bayes para obter a classe\n')

# NOTE(review): this function has no body in this view.
def imprimir_classificacao_frase(frase):
def get_classifier():
    # Builds and returns a Naive Bayes classifier trained on review texts
    # sampled through `session`: reviews with score >= 5 are labelled
    # "Positive", reviews with score < 5 "Negative".
    # positive_tweets = twitter_samples.strings("positive_tweets.json")
    # negative_tweets = twitter_samples.strings("negative_tweets.json")
    # text = twitter_samples.strings("tweets.20150430-223406.json")
    # tokens = twitter_samples.tokenized("positive_tweets.json")[0]

    stop_words = stopwords.words("english")

    # NOTE(review): the list comprehension below is missing its closing
    # bracket, and the following loop body is missing its first line, in this
    # view (same for the negative branch further down).
    positive_reviewids = [
        x.reviewid for x in session.query(Review).filter(
            Review.score >= 5).order_by(Review.score.desc()).all()
    positive_reviews = []
    # 100 randomly chosen review ids (`id` shadows the builtin).
    for id in random.sample(positive_reviewids, 100):
                Content.reviewid == id).first().content)

    negative_reviewids = [
        x.reviewid for x in session.query(Review).filter(
            Review.score < 5).order_by(Review.score).all()
    negative_reviews = []
    for id in random.sample(negative_reviewids, 100):
                Content.reviewid == id).first().content)

    # positive_tokens = twitter_samples.tokenized("positive_tweets.json")
    # negative_tokens = twitter_samples.tokenized("negative_tweets.json")
    positive_tokens = [nltk.word_tokenize(x) for x in positive_reviews]
    negative_tokens = [nltk.word_tokenize(x) for x in negative_reviews]

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    # Clean every token list with remove_noise, passing the English stop words.
    for tokens in positive_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # all_pos_words = get_all_words(positive_cleaned_tokens_list)

    # freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    # presumably get_for_model yields one feature dict per review - verify in its definition
    positive_tokens_for_model = get_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(word_dict, "Positive")
                        for word_dict in positive_tokens_for_model]

    negative_dataset = [(word_dict, "Negative")
                        for word_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset


    # The whole dataset is used for training; no test split is held out.
    # train_data = dataset[:7000]
    # test_data = dataset[7000:]
    train_data = dataset

    return NaiveBayesClassifier.train(train_data)
def mine_tweets(infile: str, tweetout: str, gramout: str) -> None:
    """Classify, prune, and atomize Tweets."""
    # Visible flow: train a Positive/Negative Naive Bayes classifier on the
    # NLTK twitter samples, score every row of the CSV at `infile`, pickle the
    # collected tweets to `tweetout`, then count n-grams and pickle the counts
    # to `gramout`.
    # NOTE(review): many statements below have a string fragment fused onto
    # their end (e.g. `..."Cleaning model tokens")`); these look like the
    # remains of logging calls whose opening was lost. Several loop/if bodies
    # and closing parentheses are missing too, so the function does not parse
    # as written.
    logger = logging.getLogger("miner")"Gathering and tokenizing positive tweets")
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")"Gathering and tokenizing negative tweets")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")"Cleaning model tokens")
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    # Clean tokens
    # NOTE(review): this loop has no body in this view.
    for tokens in positive_tweet_tokens:

    # Clean tokens
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(normalize(tokens))"Building Tweet corpus")
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)  # type: ignore
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)  # type: ignore

    # Mark positive Tweets as such
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # Mark negative Tweets as such
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # Create unified dataset and shuffle it
    # NOTE(review): no shuffle call is visible after the concatenation.
    dataset = positive_dataset + negative_dataset

    # Train the data using the first 70% as
    # training data, and the last 30% as
    # testing data."70% training, 30% testing")
    # (the split point is the fixed index 7000, not a computed percentage)
    train_data = dataset[:7000]
    test_data = dataset[7000:]"Training...")
    classifier = NaiveBayesClassifier.train(train_data)"Accuracy is: %s", classify.accuracy(classifier, test_data))"Classifying Tweets")
    tweets = []

    with open(infile, "r") as csv_file:"Opened %s", infile)

        csv_reader = csv.reader(csv_file, delimiter=",")"Attached CSV reader to %s successfully", infile)

        # Counts processed Tweets and rejected ones
        counter: int = 0
        subject_reject: int = 0

        # Iterate
        for tweet in csv_reader:

            # Printing
            # (true every DIVISION-th row, including the first, since counter starts at 0)
            if not counter % DIVISION:
      "Read in %s Tweets so far...", counter)

            # For debugging
            # NOTE(review): this `if` has no body in this view.
            if counter == MAX_TWEETS:

            # Classify Tweet
            new_tweet = Tweet(tweet)
            dist = classifier.prob_classify(
                dict([token, True]
                     for token in new_tweet.cleaned_tokens)  # type: ignore
            new_tweet.positivity = dist.prob("Positive")
            new_tweet.negativity = dist.prob("Negative")
            new_tweet.difference = abs(new_tweet.positivity -

            # Assess the subjectivity of the Tweet
            # NOTE(review): as visible, the rejection counter is incremented when
            # the difference EXCEEDS the threshold, and nothing here appends to
            # `tweets`; confirm the intended condition against the original file.
            if new_tweet.difference > SUBJECTIVITY_THRESHOLD:
                subject_reject += 1

            # Count
            counter += 1"Processed %s Tweets", len(tweets))"%s Tweets were rejected for not being subjective enough",

    # Pickle Tweets
    # NOTE(review): the file object opened here is never explicitly closed.
    pickle.dump(tweets, open(tweetout, "wb"))"Pickled %s Tweets", len(tweets))

    # Storing our n-gram occurrences
    # Five dicts indexed by n; the loop below uses n = 1..4, so index 0 stays empty.
    gram_scores: List[Dict[str, int]] = [{}, {}, {}, {}, {}]

    # Counting n-grams
    for i in range(1, 5):"Creating %s-grams", i)

        # Iterate
        for tweet in tweets:  # type: ignore

            # Create n-grams
            grams = ngrams(tweet.cleaned_tokens, i)  # type: ignore

            # Count every gram
            for gram in grams:

                # Create record for new n-gram
                if gram not in gram_scores[i]:
                    gram_scores[i][gram] = 1

                # Update existing record
                # NOTE(review): the `else:` that should introduce the line below is missing in this view.
                    gram_scores[i][gram] += 1

    # Serialize n-grams to file
    with open(gramout, "wb") as gramout_fp:
        pickle.dump(gram_scores, gramout_fp)
# positive reviews feature set: (bag-of-words features, 'pos') pairs
pos_features = []
for words in pos_reviews:
    pos_features.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_features = []
for words in neg_reviews:
    neg_features.append((bag_of_words(words), 'neg'))


# Hold out the first 200 examples of each class for testing and train on the
# rest, so both splits contain positive and negative reviews.
test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

# NOTE(review): `NBC` is defined outside this view (presumably an alias of
# NaiveBayesClassifier - confirm at the import).
classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
#f = open('unigram_classifier.pickle', 'wb')
#pickle.dump(classifier, f)

# Interactive loop: read a review from stdin and featurize it.
while (1):
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    # NOTE(review): the body of this `if` (the exit on empty input) is missing
    # in this view, and `custom_feature_set` is not used after it is built.
    if (len(custom_review) < 1):
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
# Vocabulary: the 10000 most frequent words, excluding English stop words and
# anything contained in `ponctuation`.
# NOTE(review): the closing bracket of this list is missing in this view, and
# stopwords.words("english") is re-evaluated for every candidate word.
common_words = [
    word for word, freq in words_freqs.most_common(10000)
    if (word not in stopwords.words("english")) and (word not in ponctuation)

# -------Functions--------------------------------------------------------
def find_features(document, com_words=common_words):
    """Return a ``{word: bool}`` dict flagging which of *com_words* occur in *document*.

    *document* is any iterable of hashable tokens; it is converted to a set
    once so each membership test is cheap.
    """
    present = set(document)
    return {w: (w in present) for w in com_words}

# ---------------------------------------------------------------------------

# One (feature dict, label) pair per labelled document.
feature_sets = [(find_features(doc), label) for (doc, label) in documents]
# Positional split: the first 1900 examples train the model, the rest test it.
data = {
    "train": feature_sets[:1900],
    "test": feature_sets[1900:],
}
clf = NaiveBayesClassifier.train(data["train"])  # acc: 85.095
# acc = classify.accuracy(clf, data["test"])*100

# Pick one negative review (index 11) from the corpus and load its words.
rev_name = movie_reviews.fileids("neg")[11]
text = movie_reviews.words(rev_name)
    # Body of a function whose header is outside this view: label the feature
    # dicts, train a Naive Bayes classifier and try it on one sample tweet.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset


    # NOTE(review): no shuffle is visible before this positional split, so the
    # slice after index 7000 comes from the end of the negatives.
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    sem_classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(sem_classifier, test_data))


    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    # NOTE(review): the opening of the statement below (presumably a print
    # call) is missing in this view; the extra indentation and the surplus
    # closing parenthesis are what remain of it.
        sem_classifier.classify(dict([token, True]
                                     for token in custom_tokens)))

custom_tweet = "My daughter has been at MEM airport for almost 7 hours trying to fly #unitedAIRLINES to houston. #unitedair what are you going to do???"
### Get our texts into the format NLTK expects for its classifier

# NLTK expects each example as a feature dict; here every word present in a
# review becomes a True-valued feature.
negative_featurized = [{word: True
                        for word in review} for review in negative_min_df]
positive_featurized = [{word: True
                        for word in review} for review in positive_min_df]

negative_tagged = [(review, 'negative') for review in negative_featurized]
positive_tagged = [(review, 'positive') for review in positive_featurized]

all_tagged = negative_tagged + positive_tagged

### Train the classifier

classifier = NaiveBayesClassifier.train(all_tagged)

### Import, process, featurize new set of movie reviews

ebert_path = 'movie_reviews/ebert/'
ebert_files = os.listdir(ebert_path)
# Read each review inside a context manager so the file handle is closed
# promptly instead of being left to the garbage collector.
ebert_reviews = []
for name in ebert_files:
    with open(ebert_path + name) as review_file:
        ebert_reviews.append(review_file.read())
ebert_tokenized = [word_tokenize(review.lower()) for review in ebert_reviews]
ebert_no_stops = [[word for word in review if word not in stopword_set]
                  for review in ebert_tokenized]
# Lemmatize the stop-word-filtered tokens. The previous version lemmatized
# `ebert_tokenized`, which silently discarded the stop-word removal above.
ebert_lemmatized = [[wnl.lemmatize(word) for word in review]
                    for review in ebert_no_stops]
ebert_set = [set(review) for review in ebert_lemmatized]
# Keep only words contained in `more_than_once_set`.
ebert_min_df = [[word for word in review if word in more_than_once_set]
                for review in ebert_set]
# Generator expression: the feature dicts are produced lazily and can be
# consumed only once.
ebert_featurized = ({word: True for word in review} for review in ebert_min_df)
    # Tail of a feature-extraction function whose header is outside this view.
    # The first element of `document` is stored as the 'topic' feature.
    features['topic'] = document[0]
    # NOTE(review): `word` is drawn from document_words itself, so for an
    # ordinary container the membership test below is always True.
    for word in document_words:
        # features['contains(%s)' % word] = (word in document_words)
        features[word] = (word in document_words)

    return features

# Load the labelled tweet corpus once, at import time.
tweets = file_handler.load_data(
    settings.BASE_DIR + '/sentiment_app/analyzer/dataset/full-corpus-lite.csv')
# Apply extract_features to the loaded tweets via NLTK's apply_features helper.
data_set = nltk.classify.apply_features(extract_features, tweets)
# training_set = data_set[:len(data_set)/2]
# testing_set = data_set[len(data_set)/2:]

# make classifier
# The full data set is used for training; no test split is held out.
classifier = NaiveBayesClassifier.train(data_set)

def anaylze(tweet):
    # Preprocesses the tweet text and builds its feature dict.
    # NOTE(review): uses the Python 2 print statement; the visible body stops
    # after building `feature` without classifying or returning anything, so
    # the rest of the function is presumably outside this view.
    print tweet
    # tweet = ("topic", "tweet string post")

    # accuracy & informative features
    # print nltk.classify.accuracy(classifier, testing_set)
    # print classifier.show_most_informative_features(30)
    # print classifier._labels

    # Test Classify
    # tweet[1] is the text and tweet[0] the topic, per the tuple layout noted above.
    data = preprocess(tweet[1])
    feature = extract_features((tweet[0], data))
    def train(self, language):
        # Trains self.classifier on data derived from the SentiWordNet TSV and
        # stores the held-out accuracy on self.total_accuracy.
        # NOTE(review): several lines are missing in this view (the read_csv
        # call and the score expression are left unclosed, and nothing visible
        # appends `item` to `labeled`), so the method does not parse as written.
        df = read_csv("./dataset/SentiWordNet_3.0.0.tsv",

        labeled = []
        # iterrows() yields (index, row) pairs, hence the row[1][...] lookups.
        for row in df.iterrows():
            score = 0
            if float(row[1]['NegScore']) > 0:
                score = float(
                    numpy.tanh(row[1]['PosScore']) /
                score = float(numpy.tanh(row[1]['PosScore']))
                tokenized = word_tokenize(row[1]['Gloss'])

            item = (tokenized, score)

        # NOTE(review): stop_words is not used by the remaining live code.
        stop_words = stopwords.words(language)

        # positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
        # negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

        # positive_cleaned_tokens_list = []
        # negative_cleaned_tokens_list = []

        # for tokens in positive_tweet_tokens:
        #     positive_cleaned_tokens_list.append(self.remove_noise(tokens, stop_words))

        # for tokens in negative_tweet_tokens:
        #     negative_cleaned_tokens_list.append(self.remove_noise(tokens, stop_words))

        # labeled_cleaned_tokens_list = self.remove_noise(labeled, stop_words)
        # all_pos_words = self.get_all_words(labeled_cleaned_tokens_list)

        # freq_dist_pos = FreqDist(all_pos_words)

        # positive_tokens_for_model = self.get_tweets_for_model(positive_cleaned_tokens_list)
        # negative_tokens_for_model = self.get_tweets_for_model(negative_cleaned_tokens_list)

        # positive_dataset = [(tweet_dict, "Positive")
        #                         for tweet_dict in positive_tokens_for_model]

        # negative_dataset = [(tweet_dict, "Negative")
        # for tweet_dict in negative_tokens_for_model]

        dataset = labeled


        # Positional split at the fixed index 7000.
        train_data = dataset[:7000]
        test_data = dataset[7000:]
        self.classifier = NaiveBayesClassifier.train(train_data)
        self.total_accuracy = classify.accuracy(self.classifier, test_data)

        # Empty label -> set containers; nothing visible here fills them.
        self.refsets = collections.defaultdict(set)
        self.testsets = collections.defaultdict(set)
        print('Total accuracy: ', self.total_accuracy)
def main():
    # Builds a Positive/Negative tweet classifier from the NLTK twitter
    # samples (plus optional custom tweets), prints its accuracy and returns it.
    # NOTE(review): several statements are incomplete in this view (the two
    # cleaning loops and the two get_tweets_for_model calls), so the function
    # does not parse as written.
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    # set cleaned tokens lists
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    stop_words = stopwords.words('english')

    # get cleaned positive tokens
    # NOTE(review): the first line of each loop body below (presumably the
    # `.append(` call) is missing in this view.
    for tokens in positive_tweet_tokens:
            helpers.remove_noise(tokens, stop_words))

    # get cleaned negative tokens
    for tokens in negative_tweet_tokens:
            helpers.remove_noise(tokens, stop_words))

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    # NOTE(review): only the heading is printed; no line here prints the
    # frequency distributions themselves.
    print('Top 10 positive and negative words:')

    # convert tokens to a dictionary for modelling
    # NOTE(review): both calls below are missing their argument and closing parenthesis.
    positive_tokens_for_model = helpers.get_tweets_for_model(
    negative_tokens_for_model = helpers.get_tweets_for_model(

    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model
    # NOTE(review): no randomizing call is visible after the concatenation.
    dataset = positive_dataset + negative_dataset

    # split the data into a 70:30 ratio among 10K tweets
    # (fixed index 7000; the ratio only holds if the dataset has 10000 items)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    print('Model complete!\n')

    return classifier
    return palavras

def encontrarpalavrasunicas(frequencia):
    """Return the keys of *frequencia*, i.e. the distinct words it has counts for."""
    return frequencia.keys()

# Vocabulary of distinct words taken from the frequency table of `palavras`.
palavrasunicas = encontrarpalavrasunicas(buscafrequencia(palavras))

def extratorpalavras(documento):
    """Return a ``{word: bool}`` dict flagging which vocabulary words occur in *documento*.

    Every entry of the module-level ``palavrasunicas`` becomes a key (formatted
    with ``'%s'``); its value is True when that word is present in *documento*.
    """
    presentes = set(documento)
    return {'%s' % palavra: (palavra in presentes) for palavra in palavrasunicas}

# Train on the labelled base phrases after stop-word removal, using
# extratorpalavras as the feature function.
classificador = NaiveBayesClassifier.train(
    apply_features(extratorpalavras, removestopwords(frases_padrao)))

testestemming = []
stemmer = RSLPStemmer()
# NOTE(review): `comstem` is rebuilt on every iteration but never stemmed or
# appended to `testestemming`, and `stemmer` is unused in the visible code;
# the classifier below therefore receives an empty word list. Lines appear to
# be missing here.
for (palavrastreinamento) in sujeito.split():
    comstem = [p for p in palavrastreinamento.split()]

# Report the predicted class for `sujeito`.
print('individuo: %s -  <reação da vitima = %s> ' %
      (sujeito, classificador.classify(extratorpalavras(testestemming))))
def trainModel(dataType, save=True):
    # Trains a Positive/Negative Naive Bayes classifier for the given
    # `dataType` ("Twitter" or "Movie"), optionally pickles it to
    # ./python/models/<dataType>BayesModel.txt, and returns it.
    # Any other `dataType` falls through both branches and returns None.
    # NOTE(review): lines appear to be missing in this view: the feature-dict
    # loops never append `tempDict` to pDict/nDict (so both stay empty), and
    # the two movie_reviews loops have no body, so the function does not parse.
    if dataType == "Twitter":
        pTweets = twitter_samples.strings('positive_tweets.json')
        nTweets = twitter_samples.strings('negative_tweets.json')
        cleanPTweets = preprocess(pTweets, dataType)
        cleanNTweets = preprocess(nTweets, dataType)
        pDict = []
        nDict = []

        # Build a {token: True} feature dict per tweet.
        for tweet in cleanPTweets:
            tempDict = {}
            for token in tweet:
                tempDict[token] = True
        for tweet in cleanNTweets:
            tempDict = {}
            for token in tweet:
                tempDict[token] = True

        pData = [(tweet, "Positive") for tweet in pDict]
        nData = [(tweet, "Negative") for tweet in nDict]
        dataSet = pData + nData

        # The whole data set is used for training; nothing is held out.
        classifier = NaiveBayesClassifier.train(dataSet)

        if save:
            # The model is pickled (binary) despite the .txt extension.
            modelName = "./python/models/" + dataType + "BayesModel.txt"
            with open(modelName, 'wb') as f:
                pickle.dump(classifier, f)

        return classifier

    if dataType == "Movie":
        cleanPReviews = []
        cleanNReviews = []
        # NOTE(review): both loops below have no body in this view.
        for file in movie_reviews.fileids('pos'):
        for file in movie_reviews.fileids('neg'):
        pDict = []
        nDict = []

        # Build a {token: True} feature dict per review.
        for review in cleanPReviews:
            tempDict = {}
            for token in review:
                tempDict[token] = True
        for review in cleanNReviews:
            tempDict = {}
            for token in review:
                tempDict[token] = True

        pData = [(review, "Positive") for review in pDict]
        nData = [(review, "Negative") for review in nDict]
        dataSet = pData + nData

        classifier = NaiveBayesClassifier.train(dataSet)

        if save:
            modelName = "./python/models/" + dataType + "BayesModel.txt"
            with open(modelName, 'wb') as f:
                pickle.dump(classifier, f)

        return classifier
def nbtrain(train_set):
    """Return a Naive Bayes classifier trained on *train_set*."""
    return NaiveBayesClassifier.train(train_set)
 def __init__(self, feat_sets):
     """Split *feat_sets* at index 9500 and set up the classifiers.

     The first 9500 items become ``self.train_set`` and the rest
     ``self.test_set``. The two scikit-learn wrappers are only constructed
     here, while the NLTK Naive Bayes classifier is trained immediately on
     the training split.
     """
     split_at = 9500
     self.train_set = feat_sets[:split_at]
     self.test_set = feat_sets[split_at:]
     multinomial = MultinomialNB()
     self.Multinomial_classifier = SklearnClassifier(multinomial)
     bernoulli = BernoulliNB()
     self.bernoulli_classifier = SklearnClassifier(bernoulli)
     self.naivebayes_classifier = NaiveBayesClassifier.train(self.train_set)
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk import classify

# Hand-labelled (name, gender) examples used as the whole data set.
# NOTE(review): this assignment rebinds `names`, replacing the
# `nltk.corpus.names` object imported above.
names = [('Aidar', 'boy'), ('Marat', 'boy'), ('Aslan', 'boy'),
         ('Nurbek', 'boy'), ('Nurlan', 'boy'), ('Rakhman', 'boy'),
         ('Rustam', 'boy'), ('Islam', 'boy'), ('Daulet', 'boy'),
         ('Yerkebulan', 'boy'), ('Gaziz', 'boy'), ('Aigerim', 'girl'),
         ('Aidana', 'girl'), ('Zhansaya', 'girl'), ('Karina', 'girl'),
         ('Zarina', 'girl'), ('Aiman', 'girl'), ('Sholpan', 'girl'),
         ('Kamshat', 'girl'), ('Aisulu', 'girl'), ('Alina', 'girl'),
         ('Rauan', 'boy'), ('Raikhan', 'girl')]

def gender_features(word):
    """Return the single feature used for gender guessing: the last character of *word*.

    Raises:
        IndexError: if *word* is empty.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(name), gender) for name, gender in names]
# Positional split: the first 17 examples train the model, the rest test it.
train_set = featuresets[:17]
test_set = featuresets[17:]

nb_classifier = NaiveBayesClassifier.train(train_set)
# Report accuracy on the held-out examples.
print(classify.accuracy(nb_classifier, test_set))
def category_by_pos():
    # Trains a Naive Bayes classifier that predicts the tag of Brown 'news'
    # words from word-suffix features, and prints its accuracy.
    # NOTE(review): written for Python 2 (print statement, slicing the result
    # of dict.keys()); the suffix-counting line inside the first loop is
    # garbled in this view and does not parse.
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()[-1:])[-2:])[-3:])

    # First 100 keys of the frequency distribution.
    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    # One boolean feature per common suffix: does the lower-cased word end with it?
    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    # The first 10% of the examples are held out for testing; the rest train the model.
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)