コード例 #1
 def train(self, training_corpus):
     """Train the Naive Bayes classifier on a corpus of tweet records.

     ``training_corpus`` must be a list or tuple whose items are dicts; each
     dict is read for its "text", "polarity" and "denied" keys.  The trained
     model is stored on ``self.classifier``.
     """
     assert isinstance(training_corpus, (list, tuple))
     assert isinstance(training_corpus[0], dict)
     labelled = []
     for entry in training_corpus:
         # Only entries whose "denied" value equals 0 are used for training.
         if entry["denied"] == 0:
             labelled.append((twit_features(entry["text"]), entry["polarity"]))
     self.classifier = NaiveBayesClassifier.train(labelled)
コード例 #2
def get_sentiment_data(query, training_set):
	"""Classify tweets matching ``query`` and summarise them per timestamp.

	A Naive Bayes classifier is trained from the tab-separated file
	``training/<training_set>/training.txt`` (label in column 0, text in
	column 1).  Every tweet returned by ``grab_tweets(query)`` is classified
	and the predictions are grouped by ``tweet.created_at``.

	Returns a dict mapping each ``created_at`` value to the fraction of its
	tweets whose predicted label is the string '1'.
	"""
	train = []
	with open('training/' + training_set + '/training.txt') as f:
		for line in f:
			temp = line.split('\t')
			train.append((get_features(temp[1]), temp[0]))
	clf = NaiveBayesClassifier.train(train)

	tweets = grab_tweets(query)
	print("HERE")
	classified = {}
	for tweet in tweets:
		# Start a new list for an unseen timestamp, otherwise extend the
		# existing one (the original overwrote the list inside the `if`
		# and never created entries for new timestamps).
		label = clf.classify(get_features(tweet.text))
		classified.setdefault(tweet.created_at, []).append(label)
	print(classified)

	returndata = {}
	for key, labels in classified.items():
		# share of predictions labelled '1' for this timestamp
		returndata[key] = float(sum(1 for v in labels if v == '1')) / len(labels)
	print(returndata)
	return returndata
コード例 #3
ファイル: nbayes.py プロジェクト: CBaader/science
def nltk_model():
    """Fit NLTK's naive Bayes classifier on the name lists and print accuracy.

    Reads one name per line from MALE_FILE and FEMALE_FILE, labels each name
    "male" or "female", shuffles the combined list, trains on the first
    TRAIN_PCT share of the featurized names and prints the accuracy measured
    on the remaining ones.
    """
    import random  # local import: only needed for the shuffle below

    # each elt of all_names will be a (name, gender) tuple
    all_names = []

    with open(MALE_FILE, "r") as f:
        for line in f:
            all_names.append((line.rstrip(), "male"))  # rstrip removes trailing whitespace

    with open(FEMALE_FILE, "r") as g:
        for line in g:
            all_names.append((line.rstrip(), "female"))

    # assert stmts can be useful for debugging etc
    assert len(all_names) == 7944

    # shuffle all_names in place; without it the list is every male name
    # followed by every female name, so the held-out tail has one class only
    random.shuffle(all_names)

    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))

    train_set, test_set = features[:split_pt], features[split_pt:]
    nb = NaiveBayesClassifier.train(train_set)

    print("accuracy = {0} %".format(int(100 * nltk.classify.accuracy(nb, test_set))))
def test_raw_mail(org_email):
	# Builds a bag-of-words feature dict: each lemmatized, lower-cased token
	# that is not in `stpwords` is mapped to True, and the dict is returned.
	# NOTE(review): this snippet looks truncated -- the comprehension below
	# has no iterable, and everything after `return features_test` is
	# unreachable here (presumably it belonged to another function; confirm
	# against the original file).

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower()) for key in
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	#Extracting the features (tokenized, stemmed and non-stopword emails) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	#Splitting the test and training data sets from the whole email set features:
	#the first 10% of the feature sets is the test set, the rest is for training
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	#Printing the accuracy of the machine (as a percentage)
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	#Printing the top 50 features
	# NOTE(review): the statement that printed the features is missing here.

	#Printing the spam and ham labels
	print ('labels:',classifier.labels())

	#Classification of user entered email
	# NOTE(review): the enclosing loop/condition for the two lines below is missing.
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
コード例 #5
ファイル: time_adapter.py プロジェクト: hundredrab/ChatterBot
    def __init__(self, chatbot, **kwargs):
        """Initialise the adapter and train its time-question classifier.

        Keyword arguments:
            positive: list of example statements labelled 1 (defaults to a
                built-in list of questions asking for the time).
            negative: list of example statements labelled 0 (defaults to a
                built-in list of statements that are not such questions).

        The classifier trained on those examples is stored on
        ``self.classifier``.
        """
        super().__init__(chatbot, **kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = kwargs.get('positive', [
            'what time is it',
            'hey what time is it',
            'do you have the time',
            'do you know the time',
            'do you know what time it is',
            'what is the time',
        ])

        # The comma after '...to do all this' matters: without it Python
        # silently concatenates the last two examples into one string.
        self.negative = kwargs.get('negative', [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'thyme is my favorite herb',
            'do you have time to look at my essay',
            'how do you have the time to do all this',
            'what is it',
        ])

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        train_set = [
            (self.time_question_features(text), n) for (text, n) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
コード例 #6
def check_classifier(feature_extractor, **kwargs):
    """
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.

    ``feature_extractor`` and any keyword arguments are passed straight to
    ``make_train_test_sets``, which supplies the training set and the two
    test sets.
    """
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train the classifier on the training set
    classifier = NaiveBayesClassifier.train(train_set)

    # Accuracy is measured separately on the spam and the ham test emails
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features.  The method prints its own
    # report and returns None, so it must not be wrapped in print (that
    # would emit an extra "None" line).
    classifier.show_most_informative_features(20)
コード例 #7
    def train(self):
        """Train ``self.classifier`` from the portal catalog.

        For every catalog object that has both "noun_terms" and "Subject"
        entries, one training sample is produced per subject tag: a dict
        covering every known noun (1 if the object has it, otherwise 0)
        paired with the tag.  If no sample is produced, ``self.classifier``
        is left untouched.
        """
        catalog = getToolByName(self, "portal_catalog")

        # Template feature dict: every noun known to the catalog, set to 0.
        nounTemplate = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

        subjectIndex = catalog._catalog.getIndex("Subject")
        nounTermsIndex = catalog._catalog.getIndex("noun_terms")

        # Internal catalog ids of the objects that have noun terms ...
        idsWithNouns = IISet(nounTermsIndex._unindex.keys())
        # ... and of the objects that have subjects.
        idsWithSubjects = IISet(subjectIndex._unindex.keys())

        samples = []
        for cid in intersection(idsWithSubjects, idsWithNouns):
            presence = nounTemplate.copy()
            objectNouns = nounTermsIndex._unindex[cid]
            objectTags = subjectIndex._unindex[cid]
            for noun in objectNouns:
                presence[noun] = 1
            # The same feature dict is paired with each of the object's tags.
            samples.extend((presence, tag) for tag in objectTags)

        if samples:
            self.classifier = NaiveBayesClassifier.train(samples)
コード例 #8
ファイル: insulter.py プロジェクト: arizonat/data-science
def train_nltk(data, labels):
    """
    Returns a trained nltk.NaiveBayesClassifier

    Runs shuffled N_FOLDS-fold cross-validation over the samples, trains one
    model per fold, prints each fold's held-out accuracy and returns the
    model with the highest one (None when there are no folds).

    data -- np.array of tuples
    labels -- labels for `data`, indexed with the same fold index arrays
    """
    # For now, shuffle, since for now assuming that only the post language itself is all that's needed for offensive measure, though in the future, 2 anti-something users may actually not be offended by one another if they are both negative about something
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model = None
    max_acc = float('-inf')
    for k, (train_index, test_index) in enumerate(kf):
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]

        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)

        # list(...) keeps the pairs re-iterable: a bare zip() is a one-shot
        # iterator on Python 3.
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))
        model = nbc.train(train_set)

        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))
        if acc > max_acc:
            max_acc = acc
            best_model = model
    return best_model
コード例 #9
    def __init_naive_bayes( self ):
        """
        Create and train the NaiveBayes classifier.

        The training set is built from every file under
        ``corpora/corpus2/spam`` and ``corpora/corpus2/ham``; the trained
        classifier is stored on ``self.classifier``.

        Raises:
            Exception: wrapping any error raised while building the sets or
                training, carrying the original error's type name, file
                name, line number and message.
        """
        try:
            corpus = 'corpus2'
            path = os.path.join('corpora/', corpus)
            spam_path = os.path.join(path, 'spam')
            ham_path = os.path.join(path, 'ham')
            train_spam_filelist = [os.path.join(spam_path, f) for f in os.listdir(spam_path)]
            train_ham_filelist = [os.path.join(ham_path, f) for f in os.listdir(ham_path)]

            train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
            train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
            train_set = train_spam_set + train_ham_set
            self.classifier = NaiveBayesClassifier.train(train_set)
        except Exception as exc:
            # str(exc) replaces the Python-2-only `.message` attribute.
            tb = sys.exc_info()[2]
            raise Exception("Unexpected error in SpamFilter: __spamFilter:",
                            type(exc).__name__,
                            os.path.basename(tb.tb_frame.f_code.co_filename),
                            tb.tb_lineno,
                            str(exc))
コード例 #10
ファイル: spam_filter.py プロジェクト: shwetgarg/spam_filter
def get_matrix(spam_set, ham_set, num_folds):
	"""
	Generate different matrix by taking the average of K Fold data

	For each fold supplied by ``utils.get_kfold_data`` a classifier is
	trained and scored on that fold's spam and ham test sets.

	Returns a 5-tuple averaged over ``num_folds``: precision, recall, F1,
	spam accuracy (in percent) and ham accuracy (in percent).
	"""
	total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

	for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
		classifier = NaiveBayesClassifier.train(train_set)
		true_positive = false_positive = true_negative = false_negative = 0
		# Spam samples: a predicted label of 0 counts as a hit, anything
		# else as a miss (the original counted both for every hit).
		for test in test_spam_set:
			if classifier.classify(test[0]) == 0:
				true_positive += 1
			else:
				false_negative += 1
		# Ham samples: a predicted label of 1 counts as a hit.
		for test in test_ham_set:
			if classifier.classify(test[0]) == 1:
				true_negative += 1
			else:
				false_positive += 1
		precision = true_positive / float(true_positive + false_positive)
		recall = true_positive / float(true_positive + false_negative)
		F1 += (2 * precision * recall) / (precision + recall)
		spam_accuracy += true_positive / float(true_positive + false_negative)
		ham_accuracy += true_negative / float(true_negative + false_positive)
		total_precision += precision
		total_recall += recall

	return total_precision/num_folds, total_recall/num_folds, F1/num_folds, spam_accuracy*100/num_folds, ham_accuracy*100/num_folds
コード例 #11
    def __init__(self, **kwargs):
        """Initialise the adapter and train its time-question classifier.

        ``self.positive`` holds example statements labelled 1 and
        ``self.negative`` example statements labelled 0; the classifier
        trained on their features is stored on ``self.classifier``.
        """
        super(TimeLogicAdapter, self).__init__(**kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = [
            'what time is it',
            'do you know the time',
            'do you know what time it is',
            'what is the time',
        ]

        self.negative = [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'what is',
        ]

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        # Each pair is (statement text, label); features are computed from
        # the text and the label is passed through unchanged.
        train_set = [
            (self.time_question_features(text), label)
            for (text, label) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
コード例 #12
ファイル: classifyspam.py プロジェクト: Vermeij/Spamfilter
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
	"""Build and evaluate each classifier type ``n`` times.

	For every model name a fresh train/test split is drawn ``n`` times via
	``buildsets``; the performance measures returned by ``evaluate`` are
	summed element-wise over the runs and handed to ``printperformance``.

	Returns a list with the last classifier built for each model, in the
	order Naive Bayes, Logistic Regression, Linear SVC.
	"""
	classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
	allclassifiers = []
	for name in classnames:
		for i in range(n):
			train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

			if name == 'Naive Bayes':
				spamclassifier = NaiveBayesClassifier.train(train_set)
			elif name == 'Logistic Regression':
				spamclassifier = SklearnClassifier(LogisticRegression())
			elif name == 'Linear SCV':
				spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
			# NOTE(review): the sklearn wrappers are not trained here;
			# presumably evaluate() does that -- confirm.
			perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
			if i == 0:
				perfmeasures_n = perfmeasures_i
			else:
				# accumulate run i onto the running totals
				perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
		# Store last classifier built per model
		allclassifiers.append(spamclassifier)
		# Print performance measures per classifier
		printperformance(name, perfmeasures_n, n)
	return allclassifiers
コード例 #13
 def train_classifiers(self):
     """Train one Naive Bayes classifier per word in ``self.senses``.

     For each word, every vector stored under each of its sense ids becomes
     a ``[dict(vector), sense_id]`` training sample; the resulting
     classifier is stored in ``self.classifiers[word]``.
     """
     for word in self.senses:
         by_sense = self.senses[word]
         samples = [
             [dict(vector), sense_id]
             for sense_id in by_sense
             for vector in by_sense[sense_id]
         ]
         self.classifiers[word] = NaiveBayesClassifier.train(samples)
コード例 #14
ファイル: category_nltk.py プロジェクト: brenden17/infinity
def category_by_movie():
    """Train a Naive Bayes classifier on the NLTK movie_reviews corpus.

    Each review becomes a dict of 'contains(word)' booleans over 2000
    corpus words.  The shuffled reviews are split into 100 held-out test
    documents and a training remainder; the accuracy on the held-out
    documents is printed.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    # Documents are generated category by category; shuffle so the first
    # 100 (the test set) do not all share one label.
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # list(...) instead of keys()[:2000]: dict views cannot be sliced on
    # Python 3.
    word_features = list(all_words)[:2000]

    def document_features(document):
        # Map every feature word to whether it occurs in the document.
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Score on the held-out documents, not on the data used for training.
    print(classify.accuracy(classifier, test_set))
コード例 #15
def training(features, method, proportion_training):
	"""Split ``features`` and train a classifier on the leading part.

	features -- sequence of training samples; the first
	    ``proportion_training`` share is used for training, the rest for
	    testing
	method -- name of the classifier to train; only 'NaiveBayes' is
	    supported
	proportion_training -- fraction (0..1) of ``features`` used for training

	Returns (training_set, testing_set, classifier).

	Raises ValueError for an unsupported ``method`` (previously this
	surfaced as an UnboundLocalError at the return statement).
	"""
	if method != 'NaiveBayes':
		raise ValueError("unsupported training method: %r" % (method,))
	split = int(proportion_training * len(features))
	training_set = features[:split]
	testing_set = features[split:]
	classifier = NaiveBayesClassifier.train(training_set)
	return training_set, testing_set, classifier
コード例 #16
  def train(self, foldPercent=.8):
    """Split the built features and train the classifier on the first part.

    The leading ``foldPercent`` share of ``self.buildFeatures()`` is kept as
    ``self.setTrain``, the remainder as ``self.setTest``; the classifier
    trained on ``self.setTrain`` is stored on ``self.classifier``.
    """
    featureSets = self.buildFeatures()
    cut = int(foldPercent * len(featureSets))

    self.setTrain, self.setTest = featureSets[:cut], featureSets[cut:]
    self.classifier = nbc.train(self.setTrain)
コード例 #17
ファイル: filter.py プロジェクト: amitrai1095/Spam-Filter
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier.

    The first ``samples_proportion`` share of ``features`` is the training
    set, the rest the test set; both sizes are printed.  The classifier is
    trained on a tuple copy of the training set.

    Returns (train_set, test_set, classifier).
    """
    cutoff = int(len(features) * samples_proportion)
    train_set = features[:cutoff]
    test_set = features[cutoff:]
    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')
    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
コード例 #18
ファイル: txtCls.py プロジェクト: adams-n-d/Miners
def textClass():
    """Train rating and usefulness classifiers from the records in all.txt.

    Reads up to 150000 records with ``readRec``, stopping early when an
    empty record is returned.  Each record's punctuation-stripped words are
    turned into a feature dict by ``word_feats`` and paired once with the
    record's score and once with the class computed by ``parse4ftrs``.

    Returns (rate_cl, use_cl): the classifier trained on scores and the one
    trained on usefulness classes.
    """
    max_recs = 150000
    ratings = []     # (word features, rating) pairs
    usefulness = []  # (word features, usefulness class) pairs

    tot_recs = 0
    len_tot = 0
    mlen = 0

    # parse the file and create the lists to be passed to the NBClassifiers
    with open("all.txt") as dbFile:
        while tot_recs < max_recs:
            if tot_recs % 1000 == 0:
                print("num records: %s" % tot_recs)
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                # nothing more to read
                break
            # count only records that were actually read
            tot_recs += 1
            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])
            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str(prs_rec["class"])

            # word feature dictionary, shared by both training lists
            wfd = word_feats(review_text)

            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))

    if tot_recs:
        print("avg length: %s" % (len_tot / tot_recs))
    print("max len: %s" % mlen)

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """Train on ``train_set`` and report accuracy on the spam and ham tests.

    Uses NaiveBayesClassifier.train() from NLTK to train on the train_set
    (spam + ham), then the classifier is used to evaluate the accuracy on
    the test spam and test ham sets.  Finally, the 20 most informative
    features are shown.
    """
    classifier = NaiveBayesClassifier.train(train_set)
    print ("Test Spam accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print ("Test Ham accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_ham)))
    # The method prints its own report and returns None; wrapping it in
    # print would add a stray "None" line.
    classifier.show_most_informative_features(20)
コード例 #20
ファイル: try.py プロジェクト: bharatkashyap/clickbait-repel
def train(features, samples_proportion):
    """Split ``features`` into train/test parts and fit a Naive Bayes model.

    The leading ``samples_proportion`` share of ``features`` is used for
    training and the rest for testing; the size of each part is printed.

    Returns (train_set, test_set, classifier).
    """
    boundary = int(len(features) * samples_proportion)
    # everything before the boundary trains the model, the rest tests it
    train_set = features[:boundary]
    test_set = features[boundary:]
    for message in ('Training set size = ' + str(len(train_set)) + ' emails',
                    'Test set size = ' + str(len(test_set)) + ' emails'):
        print (message)
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
コード例 #21
ファイル: classifer.py プロジェクト: enmalik/Spam-Filter
def buildClassifier(hamDir, spamDir):
	"""Train spam/ham classifiers from two directories of emails.

	Every file directly under ``spamDir`` is labelled 'spam' and every file
	under ``hamDir`` 'ham'.  A first classifier is trained on 70% of the
	shuffled emails and its accuracy on the remaining 30% is printed; a
	second classifier is then trained on all emails and saved to
	"full-classifier.pickle" via ``saveClassifier``.
	"""
	import random  # local import: only needed for the shuffle below

	def readEmails(directory):
		# Using glob instead of os.listdir to ignore hidden files.
		# NOTE(review): assumes emailFeatures() expects the raw file text.
		contents = []
		for path in glob.glob(directory + "/*"):
			# `with` closes each file; the original opened them without
			# reading or closing, leaving the email lists empty.
			with open(path) as f:
				contents.append(f.read())
		return contents

	allEmails = [(email, 'spam') for email in readEmails(spamDir)]
	allEmails += [(email, 'ham') for email in readEmails(hamDir)]

	# Shuffle to get the accuracy of the 70:30 ratio. Otherwise the test
	# part would contain ham only.
	random.shuffle(allEmails)

	# Make a list of feature per email
	features = [(emailFeatures(email), label) for (email, label) in allEmails]

	# 70:30 ratio for training:testing
	print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
	totalSize = int(len(features) * 0.7)
	trainingEmails, testingEmails = features[:totalSize], features[totalSize:]

	print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
	classifier = NaiveBayesClassifier.train(trainingEmails)
	print(classify.accuracy(classifier, testingEmails))

	print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
	classifier = NaiveBayesClassifier.train(features)

	saveClassifier(classifier, "full-classifier.pickle")
コード例 #22
    def __init__(self,classifierType):
        # Builds a Naive Bayes classifier from the rows of sfIsGood.csv
        # (located next to this module).  `classifierType` selects the
        # label: 'driver' pairs documents with cdrivers, 'invalid' pairs
        # them with the stringified "invalid" flag.
        # NOTE(review): this snippet looks truncated -- the body of the
        # `if not row[6] == 'invalid':` branch and the second operand of
        # both `documents` expressions are missing, and most of the lists
        # below are never filled in the visible code.

        titles = []
        bodies = []
        invalids = []
        drivers = []
        fromFields = []
        toFields = []
        ctitles = []
        cbodies = []
        cdrivers = []

        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            i = -1
            for row in spamreader:
                i += 1
                # i == 0 is the first row, which is skipped
                if (i > 0):
                    # column 6 holds the literal 'invalid' for invalid rows
                    invalids.append(row[6] == 'invalid')
                    if not row[6] == 'invalid':

        words = []
        if classifierType == 'driver':
            for i in range(len(ctitles)):
                words += nltk.word_tokenize(ctitles[i])
                words += nltk.word_tokenize(cbodies[i])

            documents = [((nltk.word_tokenize(ctitles[i]) +
                          , cdrivers[i]) for i in range(len(ctitles))]

        elif classifierType == 'invalid':
            for i in range(len(titles)):
                words += nltk.word_tokenize(titles[i])
                words += nltk.word_tokenize(bodies[i])

            # NOTE(review): iterates over len(ctitles) although it indexes
            # titles/invalids -- confirm this is intended.
            documents = [((nltk.word_tokenize(titles[i]) +
                          , str(invalids[i])) for i in range(len(ctitles))]
        # The first 500 keys of the lower-cased word frequency distribution
        # serve as the feature vocabulary.
        all_words = nltk.FreqDist(w.lower() for w in words)
        self.word_features = all_words.keys()[:500]
        self.training_set = [(self.document_features(d), c) for (d,c) in documents]
        self.classifier = NaiveBayesClassifier.train(self.training_set)
コード例 #23
ファイル: trainer.py プロジェクト: gcvalderrama/Palantir
    def naives_classifier(self, training_set, dev_set, log=0):
        # Trains a Naive Bayes classifier on `training_set`, prints its
        # accuracy on `dev_set` as a percentage and returns the classifier.

        classifier = NaiveBayesClassifier.train(training_set)
        accuracy = classify.accuracy(classifier, dev_set)

        print('Naive Bayes accuracy dev percent: ', (accuracy * 100))
        # NOTE(review): the body of this branch is missing from the snippet;
        # presumably it emitted extra diagnostics when log == 1 -- confirm.
        if log == 1:

        return classifier
コード例 #24
def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes classifier

    A classifier is trained on the labelled names of the NLTK ``names``
    corpus (all but the first 500 entries) and used to classify
    ``user_name``.

    Returns the label predicted for ``user_name``.
    """
    # A distinct local name is required here: assigning to `names` inside
    # the function would shadow the corpus reader and make the
    # names.words(...) calls raise UnboundLocalError.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    # NOTE(review): the training samples are raw name strings, and
    # user_name is classified as a raw string too; NLTK classifiers
    # normally expect feature dicts -- confirm which extractor was intended.
    training_set = labeled_names[500:]
    # NOTE(review): the `classifier` argument is replaced by a freshly
    # trained model, as in the original -- confirm this is intended.
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier.classify(user_name)
コード例 #25
def classify(text, sender=None, subject=None):
    """Classify ``text`` with a freshly trained Naive Bayes model.

    The model is trained on ``load_training_set()``.  The sample's features
    are the bag of words over the bigrams extracted from ``text``; when
    given, ``sender`` and ``subject`` are each added as a True-valued
    feature.  The probability of every category is pretty-printed.

    Returns the ``categories`` entry of the most probable sample.
    """
    model = NaiveBayesClassifier.train(load_training_set())
    sample_features = bag_of_words(extract_bigrams(text))
    for extra in (sender, subject):
        if extra is not None:
            sample_features[extra] = True
    distribution = model.prob_classify(sample_features)
    probabilities = {}
    for sample in distribution.samples():
        probabilities[categories[sample]] = distribution.prob(sample)
    pprint(probabilities)
    return categories[distribution.max()]
コード例 #26
ファイル: realization.py プロジェクト: vstu-cad-stuff/ProEQ
 def train(self, data):
     """Train the classifier on sliding windows over the represented data.

     ``data`` is converted by ``self._represent`` into
     ``self.result_string``; the frequency distribution of its items is kept
     in ``self.labels``.  Windows of ``self.n_w`` items, taken every
     ``self.n_w - 1`` positions, are each paired with the item that follows
     them and turned into a training sample by ``self._gen_feature``.  The
     trained model is stored on ``self.classifier``.
     """
     self.result_string = self._represent(data)
     self.labels = FreqDist(self.result_string)
     train = []
     # Stop n_w items before the end so the item after each window exists.
     for start in range(0, len(self.result_string) - self.n_w, self.n_w - 1):
         window = self.result_string[start:start + self.n_w]
         x_key = self.result_string[start + self.n_w]
         train.append(self._gen_feature(window, x_key))
     self.classifier = NaiveBayesClassifier.train(train)
コード例 #27
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
  """Train a sentiment classifier from two tab-separated tweet files.

  positiveFile / negativeFile -- files whose rows are labelled "pos" and
      "neg" respectively; column 1 of each 9-column row is the tweet text
  nOccurrences -- features whose counter is below this value are dropped
      from the saved feature table
  trainProportion -- leading share of the samples used for training; the
      rest is used to print an accuracy figure

  The classifier is pickled to "classifier.pickle" and the filtered feature
  counters to "features.pickle".
  """
  tweetfeats = []
  masterfeats = {}
  for fn in (positiveFile, negativeFile):
    theclass = "neg" if fn == negativeFile else "pos"
    with open(fn, 'r') as src:
      for line in csv.reader(src, delimiter='\t'):
        # skip rows that do not have exactly 9 columns
        if len(line) != 9:
          continue
        # break up into tokens removing all non-word chars
        feat = featurify(line[1])
        for name in feat:
          if name in masterfeats:
            masterfeats[name] += 1
          else:
            # NOTE(review): counters start at 0 as in the original, so a
            # feature seen k times is stored as k - 1 -- confirm intent.
            masterfeats[name] = 0
        if len(feat) > 0:
          tweetfeats.append((feat, theclass))

  # keep only the features that reach the occurrence threshold
  masterfeats = {name: count for name, count in masterfeats.items()
                 if count >= nOccurrences}
  # NOTE(review): the original opened features.lst for writing but the
  # visible code never wrote to it; the file is still created (empty) here.
  with open("features.lst", "w"):
    pass
  print("Number of Features = %i" % len(masterfeats))

  train_cut = int(len(tweetfeats) * trainProportion)
  trainfeats = tweetfeats[:train_cut]
  testfeats = tweetfeats[train_cut:]

  print("Training sentiment classifier...")
  classifier = NaiveBayesClassifier.train(trainfeats)
  print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

  # SAVE the classifier & features (binary mode is required for pickle)
  with open("classifier.pickle", 'wb') as out:
    pickle.dump(classifier, out)
  with open("features.pickle", 'wb') as out:
    pickle.dump(masterfeats, out)
コード例 #28
def cross_validate():
    """Run 10-fold cross-validation over the loaded training set.

    For every fold a Naive Bayes classifier is trained on the fold's
    training indices and scored on its evaluation indices; the evaluation
    index range and the accuracy are printed per fold, followed by the
    average accuracy over all folds.
    """
    n_folds = 10
    training_set = load_training_set()
    total = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # Select exactly the samples of each index array.  Slicing from the
        # first to the last training index would pull the evaluation fold
        # into the training data and drop the final sample.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]
        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print('Range:  %s to %s' % (evalcv[0], evalcv[-1]))
        print('Accuracy: %4.2f' % acc)
        total += acc
    print('Average accuracy: %4.2f' % (total / n_folds))
コード例 #29
   def buildRevClassifier(self, features, normalize, validity):
      """Train a Naive Bayes classifier that predicts a review's reviewer.

      features -- callable mapping a review to its feature dict
      normalize, validity -- accepted for interface compatibility; not used
          by the current implementation

      Returns the classifier trained on one (features(rev), rev.reviewer)
      pair per value in ``self.values()``.
      """
      featureSets = [(features(rev), rev.reviewer) for rev in self.values()]
      return NaiveBayesClassifier.train(featureSets)
コード例 #30
# positive reviews feature set
pos_features = [(bag_of_words(words), 'pos') for words in pos_reviews]

# negative reviews feature set
neg_features = [(bag_of_words(words), 'neg') for words in neg_reviews]

# The first 200 reviews of each class are held out for testing; the rest
# is used for training.
test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)

while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if len(custom_review) < 1:
        # empty input: leave the loop, as the prompt promises
        break
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
コード例 #31
def nbtrain(train_set):
    """Return a Naive Bayes classifier trained on ``train_set``."""
    return NaiveBayesClassifier.train(train_set)
コード例 #32
ts = ts[:2]

# Materialise the (sentence, tag) pairs: they are iterated more than once
# below and a bare zip() is a one-shot iterator on Python 3.
training_data = list(zip(tl, ts))

# every lower-cased token that occurs in any training sentence
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

# One feature dict per sentence: vocabulary word -> present in the sentence.
feature_set = []
for sentence, tag in training_data:
    # tokenize once per sentence instead of once per vocabulary word
    sentence_words = set(word_tokenize(sentence.lower()))
    feature_set.append(({i: (i in sentence_words) for i in vocabulary}, tag))

classifier = nbc.train(feature_set)

#for classifying a new sentence

test_sentence = tl[1]
test_words = set(word_tokenize(test_sentence.lower()))
featurized_test_sentence = {i: (i in test_words) for i in vocabulary}

print("test_sent: %s" % (test_sentence,))
print("tag: %s" % (classifier.classify(featurized_test_sentence),))

#print nltk.classify.accuracy(classifier,test_set)
コード例 #33
        feature[u_word] = (u_word in doc)
    return feature

extract = extract_words(['admir', 'med', 'pesso'])

# Returns all the words of the document, checks whether the words passed as
# parameter occur in the document and reports its class at the end
# (joy or fear)
dataset_train = apply_features(extract_words, words_stemmer_train)
dataset_test = apply_features(extract_words, words_stemmer_test)


# builds a probability table
classifier = NaiveBayesClassifier.train(dataset_train)
#print(accuracy(classifier, dataset_test))

# Collect every test sample whose predicted class differs from its target.
errors = []
for feature, target in dataset_test:
    result = classifier.classify(feature)
    if result != target:
        errors.append((target, result, feature))

for (target, result, feature) in errors:
    print(target, result, feature)

# using the confusion matrix to see how the data stands in terms of errors
# and correct predictions
y_test = []
コード例 #34
from collections import Counter

negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [
    (tweet_dict, "Positive")  # creating the dictionary
    for tweet_dict in positive_tokens_for_model
]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset  # total dataset, includes positives and negatives
random.shuffle(dataset)  # shuffling it

# 70% of the dataset is used for training and the remaining 30% for
# testing; integer arithmetic keeps the split at 7000 for 10000 samples.
split_at = len(dataset) * 7 // 10
train_data = dataset[:split_at]
test_data = dataset[split_at:]
classifier = NaiveBayesClassifier.train(
    train_data)  # classifying with Naive Bayes

print("Accuracy is:", classify.accuracy(classifier,
                                        test_data))  # accuracy of testing
# most informative 20 words of dataset; the method prints its own report
classifier.show_most_informative_features(20)

custom_tokens = remove_noise(word_tokenize(data))  # using our data
print(classifier.classify(dict([token, True] for token in custom_tokens)))

unique_words = set(custom_tokens)
# [count, word] pairs, counted in a single pass over the tokens
freq_list = [[count, word] for word, count in Counter(custom_tokens).items()]
コード例 #35
 def train(self, train_set):
     """Train a Naive Bayes classifier on ``train_set``.

     The trained classifier is stored on ``self.classifier`` and returned.
     """
     self.classifier = NaiveBayesClassifier.train(train_set)
     trained = self.classifier
     return trained
コード例 #36
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import random

# (name, gender) pairs for every name in the corpus.  From here on `names`
# refers to this list, not to the nltk.corpus.names reader.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
# Shuffle before splitting: the list is all male names followed by all
# female names, so the first 500 entries would otherwise be male only.
random.shuffle(names)

feature_sets = [(gender_features(n), g) for (n, g) in names]
# first 500 pairs are held out for testing, the rest is used for training
train_set, test_set = feature_sets[500:], feature_sets[:500]

classifier = NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

# apply_features is used here to build the same split without first
# materialising a list of every feature dict.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

# The features can end up biased toward the training data -> overfitting

from collections import OrderedDict
コード例 #37
ファイル: emotions.py プロジェクト: nvbn/mrw.wtf
def get_classifier():
    """Return a Naive Bayes classifier trained on ``get_trains_set()``."""
    return NaiveBayesClassifier.train(get_trains_set())
コード例 #38
# 10-fold cross-validation of a Naive Bayes classifier over `sentences`,
# with `testSentences` kept as a separate data set.
# NOTE(review): this snippet looks truncated -- several calls below are cut
# off after their first argument and both enumerate() loops have lost
# (part of) their bodies.
features_data = np.array(sentences)
features_data_test = np.array(testSentences)

# shuffled 10-fold split with a fixed seed
k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
accuracy_data_scores = []
for train_set, test_set in k_fold.split(features_data):
    word_features = get_word_features(
    train_features = apply_features(extract_features,
    test_features = apply_features(extract_features,
    classifier = NaiveBayesClassifier.train(train_features)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    testdata_features = apply_features(extract_features,
    refdatasets = collections.defaultdict(set)
    testdatasets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_features):
        observed = classifier.classify(feats)

    for i, (feats, label) in enumerate(testdata_features):
コード例 #39
ファイル: practice.py プロジェクト: VigneshMohan1/analysis
def train(labeled_featuresets, estimator=ELEProbDist):
    """Build a NaiveBayesClassifier from labelled feature sets.

    labeled_featuresets -- iterable of (featureset, label) pairs, where
        each featureset is a dict mapping feature names to values
    estimator -- callable turning a frequency distribution into a
        probability distribution; it is also called with a ``bins``
        keyword for the per-feature distributions

    Returns NaiveBayesClassifier(label_probdist, feature_probdist).
    """
    from collections import defaultdict
    from nltk.probability import FreqDist

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count how often each label and each (label, feature, value) occurs.
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            feature_freqdist[label, fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # A feature absent from a sample counts as the value None for that
    # sample's label.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            if num_samples - count > 0:
                feature_freqdist[label, fname][None] += num_samples - count
                feature_values[fname].add(None)

    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    for (label, fname), freqdist in feature_freqdist.items():
        feature_probdist[label, fname] = estimator(
            freqdist, bins=len(feature_values[fname]))
    return NaiveBayesClassifier(label_probdist, feature_probdist)
コード例 #40
def evaluate_model(dataset, train_percentage=0.9):
    """Train on the leading part of ``dataset`` and score on the rest.

    dataset -- iterable of (item, label) pairs; each item is turned into
        features with ``get_features``
    train_percentage -- share of the samples (from the start) used for
        training

    Returns the accuracy of the trained classifier on the remaining samples.
    """
    labelled = []
    for sample, label in dataset:
        labelled.append((get_features(sample), label))
    cutoff = int(len(labelled) * train_percentage)
    model = NaiveBayesClassifier.train(labelled[:cutoff])
    return nltk.classify.accuracy(model, labelled[cutoff:])
コード例 #41
    def train_test_model(self):
        """
        Train a Naive Bayes model for text sentiment analysis on NLTK's
        labelled sample tweets.

        Importing more positive and negative classified tweets could be used
        to improve the model.

        The results are stored in the self.trained_model variable for the
        DataTransform class, and the accuracy on the held-out samples is
        printed.
        """
        print('Preprocessing classified tweets for model.')
        from nltk.corpus import twitter_samples
        import random

        positive_tweets = twitter_samples.strings('positive_tweets.json')
        negative_tweets = twitter_samples.strings('negative_tweets.json')

        positive_df = pd.DataFrame(positive_tweets).rename(columns={0: 'text'})
        negative_df = pd.DataFrame(negative_tweets).rename(columns={0: 'text'})

        # NOTE(review): positive_df / negative_df are never handed to
        # DataTransform() in the visible code -- confirm how each
        # transformer receives its tweets.
        datatransform_positive = DataTransform()
        # one {token: True} feature dict per tweet, labelled 'Positive'
        positive = [
            ({token: True for token in tokens}, 'Positive')
            for tokens in datatransform_positive.output_df['token_text']
        ]

        datatransform_negative = DataTransform()
        negative = [
            ({token: True for token in tokens}, 'Negative')
            for tokens in datatransform_negative.output_df['token_text']
        ]

        dataset = positive + negative
        # Shuffle so both slices below contain a mix of the two labels.
        random.shuffle(dataset)

        train_data = dataset[:7000]
        test_data = dataset[7000:]

        self.trained_model = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(self.trained_model, test_data))

コード例 #42
 def trainModel(self, train_data, test_data):
     """Return a Naive Bayes classifier trained on ``train_data``.

     ``test_data`` is accepted but not used here.
     """
     model = NaiveBayesClassifier.train(train_data)
     return model
コード例 #43
    all_words += tweet[0]
freq = fd(all_words)
common = freq.most_common(200)
features = [i[0] for i in common]

def get_feature_dict(words):
    """Build a presence-flag feature dict for one document.

    Every entry of the module-level ``features`` list becomes a key, mapped
    to ``True`` when that entry occurs in ``words`` and ``False`` otherwise.
    Keys follow the order of ``features``.
    """
    present = set(words)  # set gives O(1) membership tests
    return {feature: feature in present for feature in features}

training_data = [(get_feature_dict(tweet), sentiment)
                 for tweet, sentiment in clean_words_train]
testing_data = [(get_feature_dict(tweet)) for tweet in clean_words_test]
classifier = nb.train(training_data)
output = []
# for tweet_words in testing_data:
#     print("--------------------------------")
#     print(tweet_words)
output = [classifier.classify(tweet_words) for tweet_words in testing_data]
           delimiter=" ")
コード例 #44
def train_test_evaluation():
    """Train, evaluate and pickle a Naive Bayes sentiment classifier.

    The positive and negative tweets from NLTK's ``twitter_samples`` corpus
    are cleaned with ``remove_noise``, converted to feature dicts with
    ``get_tweets_for_model`` and labelled "Positive" / "Negative". The
    labelled samples are shuffled, the first 9000 are used for training and
    the remainder for the accuracy report. The trained classifier is written
    to ``tweeter_trained_cls.pickle`` in the current directory.

    :returns: the trained ``NaiveBayesClassifier``
    """
    import random  # local import: needed only for the shuffle below

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    print('Total number of positive_tweets are : ', len(positive_tweets))
    print('Total number of negative_tweets are : ', len(negative_tweets))
    print('one smaple of positive_tweets : ', positive_tweets[0])
    print('one smaple of negative_tweets : ', negative_tweets[0])

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
    print('Total number of positive_tweet_tokens are : ',
          len(positive_tweet_tokens))
    print('Total number of negative_tweet_tokens are : ',
          len(negative_tweet_tokens))
    print('one smaple of positive_tweet_tokens : ', positive_tweet_tokens[0])
    print('one smaple of negative_tweet_tokens : ', negative_tweet_tokens[0])

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # The concatenation is all "Positive" followed by all "Negative". Without
    # shuffling, the held-out tail would come from one end of that list and
    # the accuracy figure would say nothing about the other label.
    random.shuffle(dataset)

    train_data = dataset[:9000]
    test_data = dataset[9000:]

    print('Length of Train Data is : ', len(train_data))
    print(' A sample of Traing Data : ', train_data[0])
    # Report the size of the test split (the original printed the train size).
    print('Length of Test Data is : ', len(test_data))
    print(' A sample of Test Data : ', test_data[0])

    classifier = NaiveBayesClassifier.train(train_data)

    print("\n\n Accuracy is:", classify.accuracy(classifier, test_data))

    # Context manager guarantees the pickle is flushed and the handle closed.
    with open('tweeter_trained_cls.pickle', 'wb') as f:
        pickle.dump(classifier, f)

    return classifier
コード例 #45
def predict():

    import nltk

    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize
    from nltk import FreqDist, classify, NaiveBayesClassifier
    import re, string, random
    import pickle

    def remove_noise(tweet_tokens, stop_words=()):

        cleaned_tokens = []

        for token, tag in pos_tag(tweet_tokens):
            token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                           '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
            token = re.sub("(@[A-Za-z0-9_]+)", "", token)

            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
                pos = 'a'

            lemmatizer = WordNetLemmatizer()
            token = lemmatizer.lemmatize(token, pos)

            if len(token
                   ) > 0 and token not in string.punctuation and token.lower(
                   ) not in stop_words:
        return cleaned_tokens

    def get_all_words(cleaned_tokens_list):
        for tokens in cleaned_tokens_list:
            for token in tokens:
                yield token

    def get_tweets_for_model(cleaned_tokens_list):
        for tweet_tokens in cleaned_tokens_list:
            yield dict([token, True] for token in tweet_tokens)

    if __name__ == "__main__":

        positive_tweets = twitter_samples.strings('positive_tweets.json')
        negative_tweets = twitter_samples.strings('negative_tweets.json')
        text = twitter_samples.strings('tweets.20150430-223406.json')
        tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

        stop_words = stopwords.words('english')

        positive_tweet_tokens = twitter_samples.tokenized(
        negative_tweet_tokens = twitter_samples.tokenized(

        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []

        for tokens in positive_tweet_tokens:
                remove_noise(tokens, stop_words))

        for tokens in negative_tweet_tokens:
                remove_noise(tokens, stop_words))

        all_pos_words = get_all_words(positive_cleaned_tokens_list)

        freq_dist_pos = FreqDist(all_pos_words)

        positive_tokens_for_model = get_tweets_for_model(
        negative_tokens_for_model = get_tweets_for_model(

        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset


        train_data = dataset[:7000]
        test_data = dataset[7000:]

        classifier = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(classifier, test_data))


        custom_tweet = ""

        if request.method == 'POST':
            custom_tweet = request.form['text']

        custom_tokens = remove_noise(word_tokenize(custom_tweet))

        NB_Cls = classifier.classify(
            dict([token, True] for token in custom_tokens))

        print(custom_tweet, NB_Cls)

        pickle.dump(NB_Cls, open('sentimental_101.pkl', 'wb'))

        return render_template('results.html', result=NB_Cls)
コード例 #46
    word_features = list(set(all_words))[:2000]

    def find_features(wordList):
        """Return ``{word: bool}`` for every word in ``word_features``.

        The flag is ``True`` when the word occurs in ``wordList``.
        """
        present = set(wordList)
        return {candidate: (candidate in present) for candidate in word_features}

    training_set = []

    for wordList, category in documents:
        training_set.append((find_features(wordList), category))

    classifier = NaiveBayesClassifier.train(training_set)

while True:
    choose = 0

    print("Opinion List")

    if (len(opinionList) > 0):
        for index, opinion in enumerate(opinionList):
            print(str(index + 1) + ". " + opinion)
        print("No opinion inserted")

    print("Opinion Analysis")
    print("1. Insert Opinion")
コード例 #47
class NBClassifier(TransformerMixin):
    """Naive Bayes classifier for part-of-text classification.

    The classifier creates a wrapper around NLTK NaiveBayesClassifier
    and implements `transform` and `fit_transform` methods suitable for
    pipeline integration.

        :param label_probdist:
            P(label), the probability distribution over labels.

            It is expressed as a ``ProbDistI`` whose samples are labels.
            I.e., P(label) = ``label_probdist.prob(label)``.

        :param feature_probdist:
            P(fname=fval|label), the probability distribution for feature values, given labels.

            It is expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature values.
            I.e., P(fname=fval|label) = ``feature_probdist[label,fname].prob(fval)``.
            If a given ``(label,fname)`` is not a key in ``feature_probdist``,
            then it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
    def __init__(self,

        self._estimator = estimator

        # in case arguments are specified (ie. when restoring the classifier)
        if all([label_probdist, feature_probdist]):
            self._classifier = NaiveBayesClassifier(
            self._classifier = None

    def features(self):
        """Return the wrapped classifier's most informative features.

        Returns ``None`` when no classifier has been set on the instance yet.
        """
        classifier = self._classifier
        return None if classifier is None else classifier.most_informative_features()

    # noinspection PyPep8Naming, PyUnusedLocal
    def fit(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Fits the classifier to the given data set.

        :param X: Iterable, output of FeatureExtractor

            The X is expected to be an iterable of tuples (tagged_word, feature_set, label),
            where feature set is a dictionary of evaluated features.
            The format of X matches the output of `FeatureExtractor`.

        :param y: redundant (included to preserve base class method definition)

        # NLTK classifier expects stacked featuresets for the training,
        # so we need to reduce the dimenstionality
        labeled_featuresets = list()
        for entry in X:
                (featureset, feature_label)
                for _, featureset, feature_label in entry

        # initialize the NLTK classifier
        self._classifier = NaiveBayesClassifier.train(
            labeled_featuresets, estimator=self._estimator)

        return self

    # noinspection PyPep8Naming, PyUnusedLocal
    def transform(self, X):  # pylint: disable=invalid-name,unused-argument
        """Auxiliary function to be used in pipeline.

        :param X: ignored; accepted only so the method can be called with data
        :returns: the classifier instance itself, unchanged
        """

        return self

    # noinspection PyPep8Naming
    def evaluate(
            X: typing.Iterable,  # pylint: disable=invalid-name
            y: typing.Iterable,
        """Perform evaluation of the classifier instance.

        :param X: Iterable, test data

            Same shape as for `fit` and `fit_predict` methods

        :param y: Iterable, of labels
        :param sample:

        one of labels to get the prediction for (for example,
                                                 if labels are ['class_A', 'class_B', 'class_C'], the sample
        could be 'class_A'.

        :param n: int, number of candidates to output
        # noinspection PyTypeChecker,PyTypeChecker
        if len(X) != len(y):
            raise ValueError("`X` and `y` must be of the same length.")

        candidate_arr = self.fit_predict(X, n=n, sample=sample)

        correctly_predicted = 0
        for candidates, label in zip(candidate_arr, y):
            pred = self._valid_candidates(candidates, label)
            correctly_predicted += int(pred)

        # return the accuracy score
        # noinspection PyTypeChecker
        return precision(total=len(y), correct=correctly_predicted)

    # noinspection PyPep8Naming
    def fit_predict(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Makes prediction about the given data.

        :param X: Iterable, prediction data

            The prediction data is expected to be of type List[(name_tuple, feature_set [,feature,label)]
            where feature_set corresponds to the output of FeatureExtractor and feature labels (if provided)
            should be None (will be ignored anyway).

        :param y: redundant (included to preserve bace class method definition)
        :param fit_params: kwargs, fit parameters

            n: number of candidates to output
            sample: one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.
        # get fit parameters
        n = fit_params.get('n', 3)
        sample = fit_params.get('sample', None)

        # do not allow sample to be `None` (wouldn't be possible to sort
        # the candidates in a logical way)
        if sample is None:
            raise ValueError("`fit_parameter` `sample` was not specified."
                             " This is not allowed in `fit_predict` method")

        if not all([hasattr(var, '__len__') for var in [X, y or []]]):
            raise TypeError("`X` and `y` must implement `__len__` method")

        # noinspection PyTypeChecker
        predictions = [None] * len(X)
        for i, x in enumerate(X):
            candidate_pred = [None] * len(x)
            for j, candidate in enumerate(x):
                if len(candidate) == 3:
                    # feature label was provided as part of X set (usual case), ignore it
                    name_tuple, features, _ = candidate
                    name_tuple, features = candidate
                candidate_pred[j] = (name_tuple,
                                     self.predict(features, sample=sample))

            sorted_pred = sorted(candidate_pred,
                                 key=lambda t: t[1],
            predictions[i] = sorted_pred[:n]

        return np.array(predictions)

    def predict(self, features: dict, sample=None) -> typing.Any:
        """Make predictions based on given features.

        :param features: dict, features to be used for prediction

            Dictionary of (feature_key, feature_value)

        :param sample:

            one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :returns: Union[float, dict]

            If `sample` is specified, returns P(sample|features),
            ie the probability of `sample` given features,
            where `sample` is one of labels.
            Otherwise returns dict of (label: max_prob) for all
            known labels.
        if self._classifier is None:
            raise ValueError("Unable to make predictions. "
                             "Classifier has not been trained yet!")

        prob_dist = self._classifier.prob_classify(features)
        # sort by the probability

        if sample is not None:
            probs = prob_dist.prob(sample)
            probs = {s: prob_dist.prob(s) for s in self._classifier.labels()}

        return probs

    def show_most_informative_features(self):
        if self._classifier is None:


    def export(self, export_dir=None, export_name=None) -> str:
        """Exports timestamped pickled classifier to the given directory.

        :returns: path to the timestamped .checkpoint file
        export_dir = export_dir or 'export/'
        export_name = export_name or 'classifier'

        if export_name.endswith('.checkpoint'):
            export_name = ".".join(export_name.split('.')[:-1])

        time_stamp = str(datetime.datetime.now().timestamp())

        # create export directory
        os.makedirs(export_dir, exist_ok=True)

        time_stamped_fname = ".".join([export_name, time_stamp, 'checkpoint'])
        time_stamped_fpath = os.path.join(export_dir, time_stamped_fname)

        # pickle and export the classifier
        with open(time_stamped_fpath, 'wb') as exp_file:
            pickle.dump(self, exp_file)

        return time_stamped_fname

    def restore(checkpoint) -> "NBClassifier":
        """Restores the classifier from a checkpoint file.

        :param checkpoint: path to directory or specific checkpoint

            If path to directory provided, the newest checkpoint
            is restored.
        def _restore_checkpoint(fp):
            with open(fp, 'rb') as checkpoint_file:
                # load the exported classifier
                return pickle.load(checkpoint_file)

        if os.path.isdir(checkpoint):
            checkpoint_dir = checkpoint
            checkpoints = [
                os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint)
                if f.endswith('.checkpoint')
            # find the latest
            if not checkpoints:
                raise ValueError(
                    "No checkpoints were found in `{}`.".format(checkpoint))
            latest_checkpoint = sorted(checkpoints)[-1]
            clf = _restore_checkpoint(latest_checkpoint)

            clf = _restore_checkpoint(checkpoint)

        return clf

    def _valid_candidates(candidates: typing.Iterable, label):
        """Check whether the correct label is among candidates."""
        for candidate, _ in candidates:
            # FIXME: a bug here, NLTK lets weird things like '**' go through -> causes crash
            candidate_name, _ = candidate
                if re.search(candidate_name, label, flags=re.IGNORECASE):
                    return True
                return False

        return False
コード例 #48
 def __init__(self, feat_sets):
     """Split ``feat_sets`` into train/test parts and build the classifiers.

     The first 9500 entries become the training set and the rest the test
     set. The two scikit-learn wrappers are created untrained; the NLTK
     Naive Bayes classifier is trained on the training set immediately.
     """
     split_at = 9500
     training_part, testing_part = feat_sets[:split_at], feat_sets[split_at:]
     self.train_set = training_part
     self.test_set = testing_part
     self.Multinomial_classifier = SklearnClassifier(MultinomialNB())
     self.bernoulli_classifier = SklearnClassifier(BernoulliNB())
     self.naivebayes_classifier = NaiveBayesClassifier.train(training_part)
コード例 #49
def sentim(self, data):
    stop_words = ['the', 'an', 'the', 'i', 'a', 'and', 'to'] #, 'none'] #, 'heartworm', ' distemper/parvo'] #stopwords.words('english')

    path_csv = '../data/csv/tf_idf_adoptable_csv.csv'
    df = read_df_csv(path_csv)
    X_negative = df["description"] #data
    corpus_dirty = []
    for doc in range(len(X_negative)):
        str_corpus = str(X_negative[doc])

    negative_documents = []
    for doc in range(len(X_negative)):
        record = X_negative[doc]
        record = (record.lower())
        replaced = record.replace(", '...'", "").replace("...", '').replace('\d+', '') 
        remove_digits = str.maketrans('', '', digits) 
        replaced = replaced.translate(remove_digits) 
        clean = replaced.replace(", '...'", "").replace("...", '')
    # print(documents)
# #     # 2. Create a set of tokenized documents.
    negative_descriptions = [word_tokenize(content) for content in negative_documents]

    negative_cleaned_tokens_list = []
    for tokens in negative_descriptions:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_neg_words = get_all_words(negative_cleaned_tokens_list)
    freq_dist_neg = FreqDist(all_neg_words)
    print("most common ADOPTABLE words: ", freq_dist_neg.most_common(10))


    path_csv = '../data/csv/tf_idf_adopted_csv.csv'
    df = read_df_csv(path_csv)
    X_positive = df["description"] #data
    corpus_dirty = []
    for doc in range(len(X_positive)):
        str_corpus = str(X_positive[doc])

    positive_documents = []
    for doc in range(len(X_positive)):
        record = X_positive[doc]
        record = (record.lower())
        replaced = record.replace(", '...'", "").replace("...", '').replace('\d+', '') 
        remove_digits = str.maketrans('', '', digits) 
        replaced = replaced.translate(remove_digits) 
        clean = replaced.replace(", '...'", "").replace("...", '')
    # print(documents)
# #     # 2. Create a set of tokenized documents.
    positive_descriptions = [word_tokenize(content) for content in positive_documents]
    # print("\n\nPositive Descriptions Tokenized: ", positive_descriptions)
    # ['dora', 'female', 'shep', 'mix', 'brindle', 'dhpp', 'kc', '//', 'no', 'puppy', 'hi', 'cathleen', ',', 'she', 'is', 'doing', 'great', 'and', 'really', 'starting'], ['meet', 'nova', '!', 'now', 'that', 'she', 'is', 'done', 'raising', 'her', 'pups', 'she', 'is', 'looking', 'for', 'a', 'home', 'of', 'her', 'own', 'where']]
    positive_cleaned_tokens_list = []
    for tokens in positive_descriptions:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    # save_documents = open("pickled_algos/all_pos_words.pickle","wb")
    # pickle.dump(positive_cleaned_tokens_list, save_documents)
    # save_documents.close()

    freq_dist_pos = FreqDist(all_pos_words)
    print("most common ADOPTED words: ", freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    # positive_tokens_for_model = all_pos_words.pickle
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(description_dict, "Positive")
                    for description_dict in positive_tokens_for_model]

    negative_dataset = [(description_dict, "Negative")
                        for description_dict in negative_tokens_for_model]
    # print("positive_dataset: ", positive_dataset)
    # print("negative_dataset: ", negative_dataset)

    dataset = positive_dataset + negative_dataset
    seventy_percent_of_data = int(len(dataset) * .7)
    thirty_percent_of_data = int(len(dataset) * .3)
    # print(thirty_percent_of_data) #361

    random.shuffle(dataset) #to avoid bias

    train_data = dataset[:seventy_percent_of_data]
    test_data = dataset[thirty_percent_of_data:]

    classifier = NaiveBayesClassifier.train(train_data)
    # classifier = MultinomialNB.fit(train_data)
    save_classifier = open("naivebayes_pet.pickle","wb")
    pickle.dump(classifier, save_classifier)

    print("%%%%%%%%%%%%%%%%%%%Accuracy is:", classify.accuracy(classifier, test_data))

    # from nltk.corpus import twitter_samples
    # print("&&&&&&&&&&&&&&&&&&&&&&&&&")
    # print(twitter_samples)
    data = str(data)
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for ele in data:  
        if ele in punc:  
            data = data.replace(ele, "")
    data = data.split()
    # print("tokenized data: ", data)
    #breakdown parts of speech
    parts_of_speech = [] 
    print("parts of speech tagging: ", parts_of_speech) 
    #lemmatized data:
    stop_words = [] #left here in case I want to add words in the future
    cleaned_tokens = []

    for token, tag in nltk.pos_tag(data):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos) 

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
    custom_tokens = remove_noise(word_tokenize(str(data)))

    print(str(data), classifier.classify(dict([token, True] for token in custom_tokens)))

    sentiment_result = [classifier.classify(dict([token, True] for token in custom_tokens))]

    print("sentiment_result: ", type(sentiment_result), sentiment_result)

    data = sentiment_result
    return data
コード例 #50

sad_token = get_tweets_for_model(negative)
joy_token = get_tweets_for_model(positive)

negative_dataset = [(tweet_dict, "negative") for tweet_dict in sad_token]

positive_dataset = [(tweet_dict, "positive") for tweet_dict in joy_token]

dataset = positive_dataset + negative_dataset

train_data = dataset[:900]
test_data = dataset[900:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))


# Connect to MariaDB Platform
    conn = mariadb.connect(
        user="******",  #- enter your username
        #password="******" - enter your password
        database="tcsproject"  # - enter your database name
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
コード例 #51
ファイル: train_nbc.py プロジェクト: ravenscroftj/partridge
            for word in features:

                if word not in labelled_features:
                    labelled_features[word.lower()] = label_count

                labelled_features[word.lower()][label] += features[word]

            print "Currently at %d distinct tokens and %d papers" % (
                len(labelled_features), samplecount)

    label_probdist = get_label_probdist(labelled_features)

    feature_probdist = get_feature_probdist(labelled_features)

    classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

    for samplefile in test_samples:
        features = {}

        p = PaperParser()

        for sentence in p.extractRawSentences():
            tokens = nltk.word_tokenize(sentence)

            for word in tokens:
                features[word] = True

        dirname = os.path.basename(os.path.dirname(samplefile))
        label = labels[dirname]
コード例 #52
def train(all_features, ratio):
    """Split ``all_features`` by ``ratio`` and train a Naive Bayes classifier.

    The first ``int(len(all_features) * ratio)`` entries form the training
    set; everything after them forms the test set.

    :returns: ``(train_set, test_set, classifier)``
    """
    cut = int(len(all_features) * ratio)
    train_set = all_features[:cut]
    test_set = all_features[cut:]
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
コード例 #53
# Show one labelled sample from each class as a sanity check.
print("Dictionary with Positive class : ", positiveReviewDataset[7])
print("Dictionary with Negative class : ", negativeReviewDataset[7])
#print("tagged neg :",negative_dataset[0])

# Concatenate the labelled lists: every positive sample, then every negative.
dataset = positiveReviewDataset + negativeReviewDataset

print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))


# Order-based split: the first 7000 entries train the model, the rest test it.
# NOTE(review): nothing shuffles `dataset` between the concatenation above and
# these slices, so the test slice is taken from the tail of the negative
# samples only if the positive list has fewer than 7000 entries - confirm the
# class balance of both slices.
trainData = dataset[:7000]
testData = dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)

print("Accuracy of the model : ", classify.accuracy(trainedModel, testData))

# Classify one hand-written review with the trained model.
review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))

# Test print
# The classifier is given a {token: True} dict built from the cleaned tokens.
print(review, " : ",
      trainedModel.classify(dict([token, True] for token in reviewTokens)))

#Text = "j@nittha"
#Text = re.sub("@", "a", Text)

コード例 #54
 def train_topic_classifier(self, train_set):
     """Return a Naive Bayes classifier trained on ``train_set``."""
     return NaiveBayesClassifier.train(train_set)
コード例 #55
    print("Also see: Hindu Marriage Act")
elif resultc != -1 or y == "Christian":
    f1 = open("Christian.txt")
    f2 = open("christian01.txt")
    l1 = f1.read()
    arr = sent_tokenize(l1)
    l2 = f2.read()
    arr2 = word_tokenize(l2)
    for i in range(0, len(arr)):
        li1.append(tuple((arr[i], arr2[i])))
    print("Also see: Indian Divorce Act")
mycase = sys.argv[3]
#mycase=input("enter your case ")
c1 = 0
c2 = 0
model = NaiveBayesClassifier(li1)
case = sent_tokenize(mycase)
for i in range(0, len(case)):
    temp = model.classify(case[i])

    if temp == "0":
        c1 = c1 + 1
        c2 = c2 + 1
print("Probability of winning case", (c1 / (c1 + c2)) * 100)
コード例 #56
def sentim_twitter(self, data):
    '''Classify ``data`` as "Positive" or "Negative" with a Naive Bayes model.

    Heavily borrowed from
    https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
    to show a functioning model.

    A classifier is trained on NLTK's ``twitter_samples`` corpus on every
    call. Its accuracy on the held-out samples is printed, then ``str(data)``
    is tokenized, cleaned with ``remove_noise`` and classified.

    :param self: unused
    :param data: text to classify; converted with ``str()`` before use
    :returns: the label the classifier predicts for ``data``
    '''
    import random  # local import: needed only for the shuffle below

    stop_words = stopwords.words('english')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words)
        for tokens in twitter_samples.tokenized('positive_tweets.json')
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words)
        for tokens in twitter_samples.tokenized('negative_tweets.json')
    ]

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # The concatenation is all "Positive" followed by all "Negative". Taking
    # the first 700 entries unshuffled trains on a single label whenever the
    # positive corpus holds at least 700 tweets, so mix the samples first.
    random.shuffle(dataset)

    train_data = dataset[:700]
    test_data = dataset[700:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("twitter data **********************************")

    print("%%%%%%%%%%%%%%%%%%% Twitter Accuracy is:", classify.accuracy(classifier, test_data))
    print("twitter data **********************************")

    print("twitter data **********************************")
    print("twitter data **********************************")
    print("is this reading data correctly???: ", type(str(data)))
    custom_tweet = str(data)
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print("twitter data **********************************")

    # Classify once and reuse the label for both the log line and the result.
    twitter = classifier.classify({token: True for token in custom_tokens})
    print(custom_tweet, twitter)
    return twitter
コード例 #57
 def train_model(self, data):
     """Train a Naive Bayes classifier on ``data`` and store it on ``self.model``."""
     fitted = NaiveBayesClassifier.train(data)
     self.model = fitted
コード例 #58
    def train(self, corpus, selected_feats):
        train_set = self.parse_corpus(corpus)
        print('Train set:', len(train_set))

        # # unigram
        self.unigrams = Counter([
            word for chat, win, duration, extra in train_set for word in chat
        self.common_unigrams = [
            unigram for unigram, value in self.unigrams.items() if value > 1
        # print(len(self.unigrams), len(self.common_unigrams))

        # # bigram
        self.bigrams = Counter([
            ' '.join((word, chat[i + 1]))
            for chat, win, duration, extra in train_set
            for i, word in enumerate(chat[:-1])
        self.common_bigrams = [
            bigram for bigram, value in self.bigrams.items() if value > 1
        # print(len(self.bigrams), len(self.common_bigrams))
        # # trigram
        self.trigrams = Counter([
            ' '.join((word, chat[i + 1], chat[i + 2]))
            for chat, win, duration, extra in train_set
            for i, word in enumerate(chat[:-2])
        self.common_trigrams = [
            trigram for trigram, value in self.trigrams.items() if value > 1
        # print(len(self.trigrams), len(self.common_trigrams))
        # # fourgram
        self.fourgrams = Counter([
            ' '.join((word, chat[i + 1], chat[i + 2], chat[i + 3]))
            for chat, win, duration, extra in train_set
            for i, word in enumerate(chat[:-3])
        self.common_fourgrams = [
            fourgram for fourgram, value in self.fourgrams.items() if value > 1
        # print(len(self.fourgrams), len(self.common_fourgrams))
        # # fivegram
        self.fivegrams = Counter([
            ' '.join(
                (word, chat[i + 1], chat[i + 2], chat[i + 3], chat[i + 4]))
            for chat, win, duration, extra in train_set
            for i, word in enumerate(chat[:-4])
        self.common_fivegrams = [
            fivegram for fivegram, value in self.fivegrams.items() if value > 1
        # print(len(self.fivegrams), len(self.common_fivegrams))

        ###### WP30 PLOT #######
        # wp30s = [len(chat) // (duration / 1800) for chat,win,duration,extra in train_set]
        # n, bins, patches = plt.hist(wp30s, 100,alpha=0.75)
        # plt.show()
        # self.doclen = Counter([len(chat) for chat,win,duration in train_set])

        ###### CHATTER PLOT ######
        # data = []
        # for chat, win, duration,extra in w8m8.iterate(train_set, out='Training'):
        #     nchars = [0,0,0,0,0]
        #     for player, message in extra:
        #         nchars[player] += len(message)
        #     avg = sum(nchars) / 5
        #     data.append(max(nchars) / avg)
        # n, bins, patches = plt.hist(data, 1000,alpha=0.75)
        # plt.show()

        t = []
        for chat, win, duration, extra in w8m8.iterate(train_set,
            features = self.get_features(chat, duration, extra, selected_feats)
            t.append((features, win))
        self.classifier = NaiveBayesClassifier.train(t)
コード例 #59
def main():
    """Train and return a Naive Bayes sentiment classifier for tweets.

    Loads the positive/negative tweets from NLTK's ``twitter_samples``
    corpus (plus optional custom tweets), tokenizes them, cleans the tokens
    with ``helpers.remove_noise``, prints the ten most frequent words of
    each class, then trains on a shuffled 70:30 train/test split and prints
    the accuracy measured on the held-out part.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    # Local import so the block does not depend on a file-level import.
    import random

    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    stop_words = stopwords.words('english')

    # clean every tokenized tweet (one cleaned token list per tweet)
    positive_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in negative_tweet_tokens
    ]

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(
        negative_cleaned_tokens_list)

    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model; without the shuffle the
    # tail of the list (the test split) would contain only negative tweets
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # split the data into a 70:30 ratio; computed from the dataset size so
    # the ratio still holds when custom tweets are added (7000 for 10K)
    split_index = len(dataset) * 7 // 10
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    print('Model complete!\n')

    return classifier
コード例 #60
ファイル: category_nltk.py プロジェクト: brenden17/infinity
def category_by_pos():
    """Train a Naive Bayes part-of-speech classifier on word-suffix features.

    Counts the final one, two and three characters of every lower-cased
    word in the Brown corpus, keeps the 100 most frequent suffixes, and
    represents each tagged word of the 'news' category by one boolean
    ``endswith(<suffix>)`` feature per kept suffix. The first 10% of the
    feature sets are held out as the test set, the rest is used for
    training, and the accuracy on the test set is printed.
    """
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the 1-, 2- and 3-character suffixes; for words shorter than
        # the suffix length the slice is simply the whole word.
        for length in (1, 2, 3):
            suffix_fdist[word[-length:]] += 1

    # The 100 most frequent suffixes. Sorting explicitly by count avoids
    # relying on keys() being frequency-ordered or sliceable.
    common_suffixes = sorted(
        suffix_fdist, key=suffix_fdist.get, reverse=True)[:100]

    def pos_features(word):
        """Return the ``endswith(<suffix>)`` feature dict for *word*."""
        lowered = word.lower()  # hoisted: invariant across suffixes
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = lowered.endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    # Hold out the first 10% for testing; train on the remainder.
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    classifier = NaiveBayesClassifier.train(train_set)
    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))