def train(self, training_corpus):
    """Train ``self.classifier`` on the accepted entries of a labelled corpus.

    ``training_corpus`` must be a non-empty list or tuple of dicts.  Each
    entry is read through its ``"text"``, ``"polarity"`` and ``"denied"``
    keys; only entries whose ``"denied"`` value equals ``0`` are used.
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)

    labelled_features = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            labelled_features.append(
                (twit_features(entry["text"]), entry["polarity"])
            )

    self.classifier = NaiveBayesClassifier.train(labelled_features)
def get_sentiment_data(query, training_set):
    """Classify tweets matching ``query`` and summarise them per timestamp.

    A Naive Bayes classifier is trained from
    ``training/<training_set>/training.txt`` (tab-separated: label first,
    text second).  Tweets returned by ``grab_tweets(query)`` are grouped by
    their ``created_at`` value.

    Returns a dict mapping each ``created_at`` value to the fraction of its
    tweets that were classified with the label ``'1'``.
    """
    labelled = []
    with open('training/' + training_set + '/training.txt') as handle:
        for row in handle:
            fields = row.split('\t')
            labelled.append((get_features(fields[1]), fields[0]))
    clf = NaiveBayesClassifier.train(labelled)

    tweets = grab_tweets(query)
    print("HERE")

    # Group the predicted labels by tweet timestamp.
    classified = {}
    for tweet in tweets:
        bucket = classified.setdefault(tweet.created_at, [])
        bucket.append(clf.classify(get_features(tweet.text)))
    print(classified)

    returndata = {}
    for key in classified:
        labels = classified[key]
        positives = sum(1 for label in labels if label == '1')
        returndata[key] = float(positives) / len(labels)
    print(returndata)
    return returndata
def nltk_model():
    """Fit nltk's naive Bayes classifier on the names dataset and report on it.

    Names are read from MALE_FILE and FEMALE_FILE (one per line), shuffled,
    featurized with nltk_featurize and split at TRAIN_PCT.  The held-out
    accuracy and the ten most informative features are printed.
    """
    # Each element is a (name, gender) tuple; male names are read first.
    labelled_names = []
    for path, gender in ((MALE_FILE, "male"), (FEMALE_FILE, "female")):
        with open(path, "r") as handle:
            # rstrip drops the trailing newline/whitespace of each line
            labelled_names.extend((row.rstrip(), gender) for row in handle)

    # Sanity check on the expected size of the dataset.
    assert len(labelled_names) == 7944

    random.shuffle(labelled_names)

    # Features are ({'feature_type': feature_value}, gender) tuples.
    features = [(nltk_featurize(name), gender) for name, gender in labelled_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set = features[:split_pt]
    test_set = features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)
    accuracy_pct = int(100 * nltk.classify.accuracy(nb, test_set))
    print("accuracy = {0} %".format(accuracy_pct))
    nb.show_most_informative_features(10)
def test_raw_mail(org_email):
    """Return a bag-of-words feature dict for one raw email text.

    The text is tokenized, each token is lower-cased and lemmatized, and
    every resulting word that is not in ``stpwords`` is mapped to ``True``.
    """
    lemmas = [
        word_limit.lemmatize(token.lower())
        for token in word_tokenize(org_email)
    ]
    return {lemma: True for lemma in lemmas if lemma not in stpwords}


# Extract the features (tokenized, lemmatized, stop-word-free) from every
# email.  NOTE(review): this script calls raw_mail, not test_raw_mail;
# raw_mail is presumably defined elsewhere in the file.
feature_sets = [(raw_mail(n), g) for (n, g) in mail_shuffle]

# Hold out the first 10% of the feature sets for testing.
size_feature = int(len(feature_sets) * 0.10)
test_set = feature_sets[:size_feature]
train_set = feature_sets[size_feature:]

classifier = NaiveBayesClassifier.train(train_set)

# Report held-out accuracy, the top 50 features and the known labels.
print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100)
classifier.show_most_informative_features(50)
print ('labels:',classifier.labels())

# Classify user-entered emails until the process is interrupted.
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print (classifier.classify(featset))
def __init__(self, chatbot, **kwargs):
    """Set up the example sentences and train the time-question classifier.

    ``chatbot`` and ``kwargs`` are forwarded to the parent initializer.
    The optional ``positive`` and ``negative`` keyword arguments replace
    the default example sentences (label 1 and label 0 respectively).
    """
    super().__init__(chatbot, **kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = kwargs.get('positive', [
        'what time is it',
        'hey what time is it',
        'do you have the time',
        'do you know the time',
        'do you know what time it is',
        'what is the time',
    ])

    # Each sentence needs its own trailing comma: a missing one silently
    # concatenates two adjacent string literals into a single example.
    self.negative = kwargs.get('negative', [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'thyme is my favorite herb',
        'do you have time to look at my essay',
        'how do you have the time to do all this',
        'what is it',
    ])

    labeled_data = (
        [(name, 0) for name in self.negative] +
        [(name, 1) for name in self.positive]
    )

    train_set = [
        (self.time_question_features(text), n) for (text, n) in labeled_data
    ]

    self.classifier = NaiveBayesClassifier.train(train_set)
def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its
    accuracy on the test data, and show the classifier's most informative
    features.

    feature_extractor and **kwargs are forwarded unchanged to
    make_train_test_sets, which supplies the training set and the two
    held-out test sets (spam and ham).  Nothing is returned; the results
    are printed.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # show_most_informative_features() prints its own report and returns
    # None, so it is called directly; wrapping it in print adds a "None" line.
    classifier.show_most_informative_features(20)
def train(self):
    """Train ``self.classifier`` to predict Subject tags from noun terms.

    For every catalog object that appears in both the "Subject" and the
    "noun_terms" index, one training sample per tag is created.  The
    features map every known noun term to 1 (present on the object) or 0.
    The classifier is only (re)built when at least one sample exists.
    """
    catalog = getToolByName(self, "portal_catalog")

    # Template feature dict: every known noun term marked as absent.
    noun_template = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

    subject_index = catalog._catalog.getIndex("Subject")
    noun_index = catalog._catalog.getIndex("noun_terms")

    # Internal catalog ids of the objects that have noun terms / subjects.
    ids_with_nouns = IISet(noun_index._unindex.keys())
    ids_with_subjects = IISet(subject_index._unindex.keys())

    samples = []
    for cid in intersection(ids_with_subjects, ids_with_nouns):
        presence = noun_template.copy()
        object_nouns = noun_index._unindex[cid]
        object_tags = subject_index._unindex[cid]
        presence.update(dict.fromkeys(object_nouns, 1))
        # All tags of one object share the same feature dict.
        samples.extend((presence, tag) for tag in object_tags)

    if samples:
        self.classifier = NaiveBayesClassifier.train(samples)
def train_nltk(data, labels):
    '''
    Returns the best nltk Naive Bayes model found across shuffled K folds.

    Inputs
    ---------
    data -- np.array of tuples
    labels -- indexed with the same fold index arrays as data

    Each fold's model is scored on its held-out part; the accuracy is
    printed and the model with the highest accuracy is kept.  The most
    informative features of the winner are shown before it is returned.
    '''
    # Folds are shuffled: for now only the post text itself is assumed to
    # matter for the offensiveness measure, not who is talking to whom.
    folds = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model, max_acc = None, float('-inf')
    for train_index, test_index in folds:
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]

        train_set = zip(bulk_extract_features(X_train), Y_train)
        test_set = zip(bulk_extract_features(X_test), Y_test)

        model = nbc.train(train_set)
        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))

        if acc > max_acc:
            max_acc, best_model = acc, model

    best_model.show_most_informative_features(30)
    return best_model
def __init_naive_bayes( self ):
    """
    Create and train the NaiveBayes classifier from the 'corpus2' corpus.

    Every file under corpora/corpus2/spam and corpora/corpus2/ham is turned
    into a labelled feature set, and the combined set trains
    ``self.classifier``.

    Raises:
        Exception: any error is re-raised as a generic Exception whose args
            hold a fixed prefix, the original exception type name, the file
            name and line number where it was caught, and its message.
    """
    try:
        corpus = 'corpus2'
        path = os.path.join('corpora/', corpus)
        spam_path = os.path.join(path, 'spam')
        ham_path = os.path.join(path, 'ham')

        train_spam_filelist = [os.path.join(spam_path, f) for f in os.listdir(spam_path)]
        train_ham_filelist = [os.path.join(ham_path, f) for f in os.listdir(ham_path)]

        train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
        train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')

        self.classifier = NaiveBayesClassifier.train(train_spam_set + train_ham_set)
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate untouched.
        exc_type, exc_value, exc_tb = sys.exc_info()
        # Exceptions only carry ``.message`` on Python 2; fall back to str()
        # so the handler itself cannot fail with AttributeError.
        message = getattr(exc_value, 'message', str(exc_value))
        raise Exception(
            "Unexpected error in SpamFilter: __spamFilter:",
            exc_type.__name__,
            os.path.basename(exc_tb.tb_frame.f_code.co_filename),
            exc_tb.tb_lineno,
            message,
        )
def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute K-fold averaged metrics for a Naive Bayes spam classifier.

    Folds come from utils.get_kfold_data(spam_set, ham_set, num_folds).
    Spam is the positive class: a prediction of 0 on a spam test item is a
    true positive, and a prediction of 1 on a ham test item is a true
    negative.

    Returns a tuple
        (precision, recall, F1, spam accuracy in %, ham accuracy in %),
    each averaged over num_folds.  A ratio whose denominator is zero (for
    example a fold with no positive predictions) counts as 0.0 instead of
    raising ZeroDivisionError.
    '''
    def ratio(numerator, denominator):
        # Guarded division shared by all per-fold metrics.
        return numerator / float(denominator) if denominator else 0.0

    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0

        for test in test_spam_set:
            if classifier.classify(test[0]) == 0:
                true_positive += 1
            else:
                false_negative += 1

        for test in test_ham_set:
            if classifier.classify(test[0]) == 1:
                true_negative += 1
            else:
                false_positive += 1

        precision = ratio(true_positive, true_positive + false_positive)
        recall = ratio(true_positive, true_positive + false_negative)

        F1 += ratio(2 * precision * recall, precision + recall)
        # Spam accuracy is the share of spam test items recognised as spam.
        spam_accuracy += ratio(true_positive, true_positive + false_negative)
        ham_accuracy += ratio(true_negative, true_negative + false_positive)
        total_precision += precision
        total_recall += recall

    return (total_precision / num_folds,
            total_recall / num_folds,
            F1 / num_folds,
            spam_accuracy * 100 / num_folds,
            ham_accuracy * 100 / num_folds)
def __init__(self, **kwargs):
    """Initialise the adapter and train its time-question classifier.

    ``kwargs`` are forwarded to the parent initializer.  The built-in
    example sentences are labelled 1 (asks for the time) or 0 (does not)
    and featurized with ``self.time_question_features``.
    """
    super(TimeLogicAdapter, self).__init__(**kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = [
        'what time is it',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ]

    self.negative = [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'what is'
    ]

    # Negative examples (label 0) come first, then positive ones (label 1).
    train_set = []
    for label, sentences in ((0, self.negative), (1, self.positive)):
        for sentence in sentences:
            train_set.append((self.time_question_features(sentence), label))

    self.classifier = NaiveBayesClassifier.train(train_set)
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Train and evaluate three classifier types, n runs each.

    For every model name the feature list is shuffled in place, split with
    buildsets(), and a classifier is trained and evaluated; this is
    repeated ``n`` times and the per-run measures are summed element-wise
    before being handed to printperformance().

    Args:
        featureslist: labelled feature sets; shuffled in place on each run.
        SAMPLE_PROPORTION: split proportion forwarded to buildsets().
        n: number of runs per model; must be positive.

    Returns:
        A list with the last classifier built for each model, in the order
        Naive Bayes, Logistic Regression, Linear SVC.

    Raises:
        ValueError: if ``n`` is not positive (there would be no classifier
            or measures to report).
    """
    if n <= 0:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
    allclassifiers = []
    for name in classnames:
        perfmeasures_n = None
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            elif name == 'Linear SCV':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)

            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # Materialise the sums: on Python 3 map() is lazy, and the
                # running total would otherwise become nested iterators.
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))

        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
def train_classifiers(self):
    """Train one Naive Bayes classifier per word in ``self.senses``.

    ``self.senses[word]`` maps each sense id to an iterable of LSA vectors;
    every vector becomes a ``[dict(vector), sense_id]`` training sample.
    The resulting classifier is stored in ``self.classifiers[word]``.
    """
    for word in self.senses:
        vectors_by_sense = self.senses[word]
        train_set = [
            [dict(lsa_vector), senseId]
            for senseId in vectors_by_sense
            for lsa_vector in vectors_by_sense[senseId]
        ]
        self.classifiers[word] = NaiveBayesClassifier.train(train_set)
def category_by_movie():
    """Train a Naive Bayes classifier on movie reviews and print its accuracy.

    Every review of the NLTK ``movie_reviews`` corpus becomes a
    (word list, category) document.  Each document is described by
    ``contains(<word>)`` booleans for 2000 vocabulary words.  The first 100
    shuffled documents are held out and the accuracy on them is printed.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # list(...) rather than keys()[:2000]: dict views cannot be sliced on
    # Python 3.
    # NOTE(review): this takes the first 2000 words in FreqDist iteration
    # order; confirm whether the 2000 most frequent words were intended.
    word_features = list(all_words)[:2000]

    def document_features(document):
        # Set membership keeps each of the 2000 lookups O(1).
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)

    # Score on the held-out documents; scoring on train_set would only
    # measure how well the model memorised its own training data.
    print(classify.accuracy(classifier, test_set))
def training(features, method, proportion_training):
    """Split ``features`` and train a classifier on the first part.

    Args:
        features: sequence of labelled feature sets.
        method: name of the learning method; only ``'NaiveBayes'`` is
            supported.
        proportion_training: fraction of ``features`` (taken from the
            front) used for training; the remainder is the testing set.

    Returns:
        ``(training_set, testing_set, classifier)``.

    Raises:
        ValueError: if ``method`` is not a supported method name.
    """
    # Fail with a clear message instead of an UnboundLocalError on return.
    if method != 'NaiveBayes':
        raise ValueError("unsupported training method: %r" % (method,))

    split = int(proportion_training * len(features))
    training_set = features[:split]
    testing_set = features[split:]

    classifier = NaiveBayesClassifier.train(training_set)
    return training_set, testing_set, classifier
def train(self, foldPercent=.8):
    """Split the built features and train ``self.classifier`` on the head.

    The first ``foldPercent`` share of ``self.buildFeatures()`` is stored
    in ``self.setTrain`` and used for training; the rest is stored in
    ``self.setTest``.
    """
    all_features = self.buildFeatures()
    cut = int(foldPercent * len(all_features))
    self.setTrain, self.setTest = all_features[:cut], all_features[cut:]
    self.classifier = nbc.train(self.setTrain)
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier on the head.

    The first ``samples_proportion`` share of ``features`` is the training
    set and the remainder the test set; both sizes are printed.  The
    classifier is trained on a tuple copy of the training set.

    Returns ``(train_set, test_set, classifier)``.
    """
    cut = int(len(features) * samples_proportion)
    train_set = features[:cut]
    test_set = features[cut:]

    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')

    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
def textClass():
    """Train rating and usefulness classifiers from the reviews in all.txt.

    Up to 150000 records are read with readRec().  Each review's words are
    stripped of surrounding punctuation and turned into a word-feature
    dict, which is paired once with the review's score and once with the
    class computed by parse4ftrs().  Progress, the average review length
    and the maximum review length are printed.

    Returns:
        ``(rate_cl, use_cl)``: classifiers for the rating and for the
        usefulness class.
    """
    max_records = 150000
    ratings = []      # (word features, rating given)
    usefulness = []   # (word features, review classification)
    tot_recs = 0
    len_tot = 0
    mlen = 0

    # The with-block closes the file even if parsing a record fails.
    with open("all.txt") as dbFile:
        while tot_recs < max_records:
            if tot_recs % 1000 == 0:
                print("num records: %s" % tot_recs)
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                break
            # Count only records that were actually read, so the average
            # below is not skewed by the final empty read.
            tot_recs += 1

            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])
            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str(prs_rec["class"])

            # Word feature dictionary shared by both training sets.
            wfd = word_feats(review_text)
            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))

    avg_len = len_tot / tot_recs if tot_recs else 0
    print("avg length: %s" % avg_len)
    print("max len: %s" % mlen)

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """
    Train an NLTK Naive Bayes classifier on train_set (spam + ham), print
    its accuracy on the spam and ham test sets separately, and finally show
    the 20 most informative features.  Nothing is returned.
    """
    classifier = NaiveBayesClassifier.train(train_set)

    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)
    print("Test Spam accuracy: {0:.2f} %".format(100 * spam_accuracy))
    print("Test Ham accuracy: {0:.2f} %".format(100 * ham_accuracy))

    # show_most_informative_features() prints its own report and returns
    # None, so it is called directly; wrapping it in print adds a "None" line.
    classifier.show_most_informative_features(20)
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier on the head.

    The first ``samples_proportion`` share of ``features`` becomes the
    training set and the remainder the test set; both sizes are printed.

    Returns ``(train_set, test_set, classifier)``.
    """
    boundary = int(len(features) * samples_proportion)
    train_set = features[:boundary]
    test_set = features[boundary:]

    for caption, subset in (('Training set size = ', train_set),
                            ('Test set size = ', test_set)):
        print (caption + str(len(subset)) + ' emails')

    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
def buildClassifier(hamDir, spamDir):
    """Evaluate a 70:30 spam classifier, then save one trained on all mail.

    Every file in ``spamDir`` and ``hamDir`` is read and labelled 'spam' or
    'ham'.  After shuffling, 70% of the emails train a classifier whose
    accuracy on the remaining 30% is printed.  A second classifier trained
    on all emails is saved to "full-classifier.pickle" via saveClassifier().
    """
    def read_emails(directory):
        # glob instead of os.listdir so that hidden files are ignored; the
        # with-block closes each file even if reading it fails.
        texts = []
        for path in glob.glob(directory + "/*"):
            with open(path) as handle:
                texts.append(handle.read())
        return texts

    allEmails = [(text, 'spam') for text in read_emails(spamDir)]
    allEmails += [(text, 'ham') for text in read_emails(hamDir)]

    # Shuffle so that the 70:30 split mixes both classes; it would not be
    # needed if only the full-size classifier were built.
    random.shuffle(allEmails)

    # One (features, label) pair per email.
    features = [(emailFeatures(email), label) for (email, label) in allEmails]

    print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
    totalSize = int(len(features) * 0.7)
    trainingEmails, testingEmails = features[:totalSize], features[totalSize:]
    print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
    classifier = NaiveBayesClassifier.train(trainingEmails)
    print(classify.accuracy(classifier, testingEmails))

    print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
    classifier = NaiveBayesClassifier.train(features)
    saveClassifier(classifier, "full-classifier.pickle")
def __init__(self,classifierType):
    """Load sfIsGood.csv and train a Naive Bayes classifier on its rows.

    The CSV is read from the directory of this module; its first row is
    skipped as a header.  Column 0 is used as the title, column 3 as the
    body, column 6 marks a row as 'invalid' and column 10 holds the driver.

    Args:
        classifierType: ``'driver'`` trains on the non-invalid rows with
            the driver as label; ``'invalid'`` trains on all rows with
            ``str(is_invalid)`` as label.

    Raises:
        ValueError: if ``classifierType`` is neither of the two values.

    Sets ``self.word_features``, ``self.training_set`` and
    ``self.classifier``.
    """
    if classifierType not in ('driver', 'invalid'):
        raise ValueError("unknown classifierType: %r" % (classifierType,))

    titles, bodies, invalids = [], [], []
    ctitles, cbodies, cdrivers = [], [], []

    dirname = os.path.dirname(__file__)
    # NOTE(review): 'rb' matches the Python 2 csv module; Python 3 would
    # need text mode with newline='' - confirm the target interpreter.
    with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        for i, row in enumerate(spamreader):
            if i == 0:
                continue  # header row
            is_invalid = row[6] == 'invalid'
            titles.append(row[0])
            bodies.append(row[3])
            invalids.append(is_invalid)
            if not is_invalid:
                ctitles.append(row[0])
                cbodies.append(row[3])
                cdrivers.append(row[10])

    if classifierType == 'driver':
        labelled_rows = zip(ctitles, cbodies, cdrivers)
    else:
        # Iterate over ALL rows here; bounding this by the number of valid
        # rows silently dropped the tail of the data.
        labelled_rows = zip(titles, bodies, [str(flag) for flag in invalids])

    words = []
    documents = []
    for title, body, label in labelled_rows:
        # Tokenize each document once and reuse the tokens for both the
        # vocabulary and the document list.
        tokens = nltk.word_tokenize(title) + nltk.word_tokenize(body)
        words += tokens
        documents.append((tokens, label))
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in words)
    self.word_features = all_words.keys()[:500]
    self.training_set = [(self.document_features(d), c) for (d,c) in documents]
    self.classifier = NaiveBayesClassifier.train(self.training_set)
def naives_classifier(self, training_set, dev_set, log=0):
    """Train a Naive Bayes classifier and print its dev-set accuracy.

    When ``log`` equals 1 the 20 most informative features are shown as
    well.  Returns the trained classifier.
    """
    model = NaiveBayesClassifier.train(training_set)
    dev_accuracy = classify.accuracy(model, dev_set)
    print('Naive Bayes accuracy dev percent: ', (dev_accuracy * 100))

    show_features = (log == 1)
    if show_features:
        model.show_most_informative_features(20)
    return model
def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes
    classifier trained on the NLTK ``names`` corpus.

    Args:
        user_name: the name to classify.
        classifier: kept for interface compatibility only; as before, a
            fresh classifier is trained on every call and this argument is
            not used.

    Returns:
        ``'male'`` or ``'female'``.
    """
    def name_features(name):
        # NLTK classifiers need a feature dict, not a raw string.
        # NOTE(review): the choice of features (whole name plus last
        # letter) is an assumption - confirm against the intended model.
        lowered = name.lower()
        return {'name': lowered, 'last_letter': lowered[-1:]}

    # Use a distinct local name: assigning to ``names`` inside the function
    # would shadow the corpus reader and raise UnboundLocalError.
    labeled_names = (
        [(name, 'male') for name in names.words('male.txt')]
        + [(name, 'female') for name in names.words('female.txt')]
    )
    # Train on every name: the old held-out slice was never evaluated and,
    # without shuffling, only removed male names from the training data.
    training_set = [(name_features(name), gender)
                    for (name, gender) in labeled_names]

    trained = NaiveBayesClassifier.train(training_set)
    return trained.classify(name_features(user_name))
def classify(text, sender=None, subject=None):
    """Classify ``text`` and return the name of its most probable category.

    A classifier is trained on ``load_training_set()`` on every call.  The
    features are the bag of words of the text's bigrams; ``sender`` and
    ``subject``, when given, are added as extra ``True`` features.  The
    per-category probabilities are pretty-printed before returning.
    """
    model = NaiveBayesClassifier.train(load_training_set())

    test_data = bag_of_words(extract_bigrams(text))
    for extra_feature in (sender, subject):
        if extra_feature is not None:
            test_data[extra_feature] = True

    distribution = model.prob_classify(test_data)
    probabilities = {}
    for sample in distribution.samples():
        probabilities[categories[sample]] = distribution.prob(sample)
    pprint(probabilities)

    return categories[distribution.max()]
def train(self, data):
    """Train ``self.classifier`` on sliding windows over the represented data.

    ``data`` is converted with ``self._represent`` and stored in
    ``self.result_string``; ``self.labels`` holds the frequency
    distribution of its items.  Windows of ``self.n_w`` items are taken
    every ``self.n_w - 1`` positions, and each window is paired with the
    item that follows it via ``self._gen_feature``.
    """
    sequence = self._represent(data)
    self.result_string = sequence
    self.labels = FreqDist(sequence)

    width = self.n_w
    samples = []
    for start in range(0, len(sequence) - width, width - 1):
        stop = start + width
        # The item right after the window is what the window should predict.
        samples.append(self._gen_feature(sequence[start:stop], sequence[stop]))

    self.classifier = NaiveBayesClassifier.train(samples)
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
    """Train, evaluate and save a tweet sentiment classifier.

    Both input files are tab-separated; the tweet text is read from the
    second column.  Each tweet is turned into features with featurify()
    and labelled "pos" or "neg" according to the file it came from.

    Args:
        positiveFile: path of the positive examples.
        negativeFile: path of the negative examples.
        nOccurrences: a feature is kept in the saved feature table only if
            it occurs at least this many times.
        trainProportion: share of the shuffled tweets used for training;
            the rest is used to print the accuracy.

    Side effects: writes "features.lst", "classifier.pickle" and
    "features.pickle" in the current directory.
    """
    tweetfeats = []
    masterfeats = {}
    for fn in (positiveFile, negativeFile):
        theclass = "neg" if fn == negativeFile else "pos"
        # A dedicated handle name keeps the file from being shadowed by the
        # feature loop below, and the with-block guarantees it is closed.
        with open(fn, 'r') as handle:
            for line in csv.reader(handle, delimiter='\t'):
                text = line[1]
                if len(line) != 9:
                    # Rows with an unexpected column count are echoed.
                    print(text)
                # break up into tokens removing all non-word chars
                feat = featurify(text)
                for token in feat:
                    # The first sighting counts as 1 so that nOccurrences
                    # is compared with the true number of occurrences.
                    masterfeats[token] = masterfeats.get(token, 0) + 1
                if len(feat) > 0:
                    tweetfeats.append((feat, theclass))

    # Drop the features that are too rare.
    masterfeats = dict((token, count) for token, count in masterfeats.items()
                       if count >= nOccurrences)

    with open("features.lst", "w") as out:
        out.write('\n'.join(list(masterfeats.keys())))
    print("Number of Features = %i" % len(masterfeats))

    train_cut = int(len(tweetfeats) * trainProportion)
    random.shuffle(tweetfeats)
    trainfeats = tweetfeats[:train_cut]
    testfeats = tweetfeats[train_cut:]

    print("Training sentiment classifier...")
    sys.stdout.flush()
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    classifier.show_most_informative_features()
    sys.stdout.flush()

    # SAVE the classifier & features; pickle data must go to binary files.
    with open("classifier.pickle", 'wb') as out:
        pickle.dump(classifier, out)
    with open("features.pickle", 'wb') as out:
        pickle.dump(masterfeats, out)
def cross_validate():
    """Run a 10-fold cross-validation of the Naive Bayes classifier.

    The training set from load_training_set() is shuffled once and split
    with KFold.  For every fold the evaluated index range and the accuracy
    are printed, followed by the average accuracy over all folds.
    """
    n_folds = 10
    training_set = load_training_set()
    random.shuffle(training_set)

    total_accuracy = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # Select items by index.  The training indices are NOT contiguous
        # (the evaluation fold sits inside them), so slicing from the first
        # to the last index would leak evaluation data into training and
        # drop the final item.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print('Range:  %s to %s' % (evalcv[0], evalcv[len(evalcv) - 1]))
        print('Accuracy: %4.2f' % acc)
        total_accuracy += acc

    print('Average accuracy: %4.2f' % (total_accuracy / n_folds))
def buildRevClassifier(self, features, normalize, validity):
    """Train a Naive Bayes classifier that predicts a review's reviewer.

    Args:
        features: callable turning one review into a feature dict.
        normalize: currently unused.
        validity: currently unused.

    Returns:
        The classifier trained on ``(features(rev), rev.reviewer)`` for
        every value stored in ``self``, in shuffled order.
    """
    # Copy into a list first: a dict view cannot be shuffled on Python 3.
    # The feature sets are built from this shuffled copy; previously they
    # were rebuilt from self.values(), which discarded the shuffle.
    revs = list(self.values())
    random.shuffle(revs)
    featureSets = [(features(rev), rev.reviewer) for rev in revs]
    return NaiveBayesClassifier.train(featureSets)
# Labelled bag-of-words feature sets for the positive and negative reviews.
pos_features = [(bag_of_words(words), 'pos') for words in pos_reviews]
neg_features = [(bag_of_words(words), 'neg') for words in neg_reviews]

shuffle(pos_features)
shuffle(neg_features)

# The first 200 reviews of each class are held out for testing.
held_out = 200
test_feature_set = pos_features[:held_out] + neg_features[:held_out]
train_feature_set = pos_features[held_out:] + neg_features[held_out:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
print(accuracy)

# Classify reviews typed by the user until an empty line is entered.
while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if not custom_review:
        break
    custom_feature_set = bag_of_words(word_tokenize(custom_review))
    print(classifier.classify(custom_feature_set))
def nbtrain(train_set):
    """Train a Naive Bayes classifier on ``train_set`` and return it."""
    return NaiveBayesClassifier.train(train_set)
# Only the first two tags are used.
ts = ts[:2]

# Materialise the (sentence, tag) pairs: they are iterated twice below, and
# on Python 3 a bare zip() would already be exhausted after the first pass.
training_data = list(zip(tl, ts))

# Tokenize every training sentence once instead of once per vocabulary word.
tokenized_training_data = [
    (set(word_tokenize(sentence.lower())), tag)
    for sentence, tag in training_data
]

vocabulary = set(chain(*[tokens for tokens, _ in tokenized_training_data]))

# One boolean "word is present" feature per vocabulary word.
feature_set = [
    ({word: (word in tokens) for word in vocabulary}, tag)
    for tokens, tag in tokenized_training_data
]

classifier = nbc.train(feature_set)

# Classify a sentence (here: the second training sentence).
test_sentence = tl[1]
test_tokens = set(word_tokenize(test_sentence.lower()))
featurized_test_sentence = {word: (word in test_tokens) for word in vocabulary}

print("test_sent: %s" % test_sentence)
print("tag: %s" % classifier.classify(featurized_test_sentence))
feature[u_word] = (u_word in doc) return feature extract = extract_words(['admir', 'med', 'pesso']) #print(extract) # Retorna todas as palavras do documento, verifica se as palavras passada por parametro tem no documento e informe ao final sua classe(alegria ou medo) dataset_train = apply_features(extract_words, words_stemmer_train) dataset_test = apply_features(extract_words, words_stemmer_test) #print(dataset) # FAZENDO O MODELO COM NAIVE BAYES # constroi uma tabela de probabilidade classifier = NaiveBayesClassifier.train(dataset_train) #print(classifier.labels()) #print(classifier.show_most_informative_features()) #print(accuracy(classifier, dataset_test)) errors = [] for feature, target in dataset_test: result = classifier.classify(feature) if result != target: errors.append((target, result, feature)) for (target, result, feature) in errors: print(target, result, feature) # usando a matrix de confução para saber como está os dados em relação de erros e acertos y_test = []
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Pair every tweet feature dict with its sentiment label.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

# Full dataset with both classes, shuffled before splitting.
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# The first 7000 samples train the model; the remainder is used for testing.
train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own report and returns None,
# so it is called directly; wrapping it in print() adds a "None" line.
classifier.show_most_informative_features(20)

# Classify the caller-supplied text in ``data``.
custom_tokens = remove_noise(word_tokenize(data))
print(classifier.classify(dict([token, True] for token in custom_tokens)))
print(custom_tokens)

# [count, word] pairs for every distinct token of the custom text.
unique_words = set(custom_tokens)
freq_list = []
for words in unique_words:
    freq_list.append([custom_tokens.count(words), words])
def train(self, train_set):
    """Train a Naive Bayes classifier, store it on ``self`` and return it."""
    trained = NaiveBayesClassifier.train(train_set)
    self.classifier = trained
    return trained
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import random

# Labelled (name, gender) pairs.  NOTE: this rebinds ``names`` from the
# corpus reader to a plain list; the name is kept because the code below
# slices it.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
print(names[:3])

feature_sets = [(gender_features(n), g) for (n, g) in names]
print(feature_sets[:3])

# The first 500 shuffled names are held out for testing.
train_set, test_set = feature_sets[500:], feature_sets[:500]
classifier = NaiveBayesClassifier.train(train_set)

print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print(nltk.classify.accuracy(classifier, test_set))

# show_most_informative_features() prints its own report and returns None,
# so it is called directly; wrapping it in print() adds a "None" line.
classifier.show_most_informative_features(5)

# apply_features builds the feature sets lazily instead of holding them all
# in memory at once.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

# Improve the feature-extraction function: features that are biased toward
# the training data lead to overfitting.
from collections import OrderedDict
def get_classifier():
    """Return a Naive Bayes classifier trained on ``get_trains_set()``."""
    return NaiveBayesClassifier.train(get_trains_set())
features_data = np.array(sentences) features_data_test = np.array(testSentences) k_fold = KFold(n_splits=10, random_state=1992, shuffle=True) word_features = None accuracy_scores = [] accuracy_data_scores = [] for train_set, test_set in k_fold.split(features_data): word_features = get_word_features( get_words_in_sentences(features_data[train_set].tolist())) train_features = apply_features(extract_features, features_data[train_set].tolist()) test_features = apply_features(extract_features, features_data[test_set].tolist()) classifier = NaiveBayesClassifier.train(train_features) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) testdata_features = apply_features(extract_features, features_data_test.tolist()) refdatasets = collections.defaultdict(set) testdatasets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_features): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) for i, (feats, label) in enumerate(testdata_features):
def train(labeled_featuresets, estimator=ELEProbDist):
    """Build a NaiveBayesClassifier from an estimated label distribution.

    ``estimator`` is applied to ``label_freqdist`` to obtain the label
    probabilities; the classifier is created with an empty feature
    probability mapping.  ``labeled_featuresets`` is not read by the
    visible code.
    """
    # NOTE(review): label_freqdist is not defined in this function; unless
    # a module-level name with that spelling exists, this line raises
    # NameError - confirm.
    label_probdist = estimator(label_freqdist)
    # No per-feature distributions are computed here.
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)
def evaluate_model(dataset, train_percentage=0.9):
    """Train on the head of ``dataset`` and return the accuracy on the tail.

    ``dataset`` holds ``(item, label)`` pairs; every item is featurized
    with ``get_features``.  The first ``train_percentage`` share is used
    for training and the remainder for scoring.
    """
    labelled_features = []
    for item, label in dataset:
        labelled_features.append((get_features(item), label))

    boundary = int(len(labelled_features) * train_percentage)
    train_set = labelled_features[:boundary]
    test_set = labelled_features[boundary:]

    model = NaiveBayesClassifier.train(train_set)
    return nltk.classify.accuracy(model, test_set)
def train_test_model(self):
    '''
    Train and evaluate a self-contained Naive Bayes sentiment model.

    The positive and negative tweets of the NLTK twitter_samples corpus are
    cleaned with DataTransform.clean_text, turned into {token: True}
    feature dicts and labelled 'Positive' / 'Negative'.  After shuffling,
    the first 7000 samples train the model and the rest is used to print
    its accuracy (reported by the original author as 75.467%; not verified
    here).  Importing more classified tweets could improve the model.

    The trained model is stored in self.trained_model; nothing is returned.
    '''
    print('Preprocessing classified tweets for model.')
    from nltk.corpus import twitter_samples
    import random

    def labelled_samples(frame, label):
        # Clean one DataFrame of tweets and build its (features, label) list.
        transformer = DataTransform()
        transformer.set_df(frame)
        transformer.clean_text('text','token_text')
        samples = []
        for i in range(len(transformer.output_df.index)):
            tokens = transformer.output_df['token_text'][i]
            token_flags = {tokens[j]: True for j in range(len(tokens))}
            samples.append((token_flags, label))
        return samples

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    positive_df = pd.DataFrame(positive_tweets).rename(columns={0: 'text'})
    negative_df = pd.DataFrame(negative_tweets).rename(columns={0: 'text'})

    positive = labelled_samples(positive_df, 'Positive')
    negative = labelled_samples(negative_df, 'Negative')

    dataset = positive + negative
    random.shuffle(dataset)
    train_data, test_data = dataset[:7000], dataset[7000:]

    self.trained_model = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(self.trained_model, test_data))
    return
def trainModel(self, train_data, test_data):
    """Return a Naive Bayes classifier trained on ``train_data``.

    ``test_data`` is accepted but not used by this method.
    """
    model = NaiveBayesClassifier.train(train_data)
    return model
all_words += tweet[0] freq = fd(all_words) common = freq.most_common(200) features = [i[0] for i in common] def get_feature_dict(words): current_features = {} words_set = set(words) for w in features: current_features[w] = w in words_set return current_features training_data = [(get_feature_dict(tweet), sentiment) for tweet, sentiment in clean_words_train] testing_data = [(get_feature_dict(tweet)) for tweet in clean_words_test] print(training_data) print(testing_data[0]) classifier = nb.train(training_data) output = [] # for tweet_words in testing_data: # print("--------------------------------") # print(tweet_words) output = [classifier.classify(tweet_words) for tweet_words in testing_data] print(output) np.savetxt("predictions_twitter_sentimental.csv", output, fmt="%s", delimiter=" ")
def train_test_evaluation():
    """Train, evaluate and pickle a Naive Bayes tweet sentiment classifier.

    Loads the positive and negative NLTK twitter samples, prints corpus
    statistics, cleans the tokenized tweets with ``remove_noise``, builds a
    labelled dataset, shuffles it and splits it into the first 9000 examples
    (training) and the remainder (testing).  The fitted classifier is written
    to ``tweeter_trained_cls.pickle`` in the working directory.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    print('Total number of positive_tweets are : ', len(positive_tweets))
    print('Total number of negative_tweets are : ', len(negative_tweets))
    print('-------------------------')
    print('one smaple of positive_tweets : ', positive_tweets[0])
    print('one smaple of negative_tweets : ', negative_tweets[0])
    print('-------------------------\n\n')

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    print('Total number of positive_tweet_tokens are : ', len(positive_tweet_tokens))
    print('Total number of negative_tweet_tokens are : ', len(negative_tweet_tokens))
    print('-------------------------')
    print('one smaple of positive_tweet_tokens : ', positive_tweet_tokens[0])
    print('one smaple of negative_tweet_tokens : ', negative_tweet_tokens[0])
    print('-------------------------\n\n')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:9000]
    test_data = dataset[9000:]

    print('Length of Train Data is : ', len(train_data))
    print(' A sample of Traing Data : ', train_data[0])
    print('-------------------------')
    # Report the size of the test split (the original printed the training size here).
    print('Length of Test Data is : ', len(test_data))
    print(' A sample of Test Data : ', test_data[0])
    print('-------------------------')

    classifier = NaiveBayesClassifier.train(train_data)

    print("\n\n Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features prints its own table and returns None,
    # so it must not be wrapped in print().
    classifier.show_most_informative_features(10)

    # The context manager guarantees the pickle file is closed even if dumping fails.
    with open('tweeter_trained_cls.pickle', 'wb') as f:
        pickle.dump(classifier, f)

    return classifier
def predict():
    """Train a tweet sentiment classifier and classify the submitted text.

    Downloads the required NLTK resources, trains a Naive Bayes classifier on
    the NLTK twitter samples (first 7000 shuffled examples for training, the
    rest for testing), and classifies the text taken from ``request.form['text']``
    when the request method is POST (an empty string otherwise).  The predicted
    label is pickled to ``sentimental_101.pkl`` and rendered through
    ``results.html``.

    Returns:
        The value of ``render_template('results.html', result=<label>)``.
    """
    import nltk
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('punkt')

    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize
    from nltk import FreqDist, classify, NaiveBayesClassifier

    import re
    import string
    import random
    import pickle

    # Compiled once; raw strings avoid the invalid "\(" escape of the original literal.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")
    # One lemmatizer for all tokens instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    def remove_noise(tweet_tokens, stop_words=()):
        """Strip URLs and @mentions, lemmatize, lowercase, drop punctuation and stop words."""
        cleaned_tokens = []
        for token, tag in pos_tag(tweet_tokens):
            token = url_pattern.sub('', token)
            token = mention_pattern.sub('', token)

            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'

            token = lemmatizer.lemmatize(token, pos)

            if (len(token) > 0 and token not in string.punctuation
                    and token.lower() not in stop_words):
                cleaned_tokens.append(token.lower())
        return cleaned_tokens

    def get_all_words(cleaned_tokens_list):
        """Yield every token of every cleaned tweet."""
        for tokens in cleaned_tokens_list:
            for token in tokens:
                yield token

    def get_tweets_for_model(cleaned_tokens_list):
        """Yield one ``{token: True}`` feature dict per cleaned tweet."""
        for tweet_tokens in cleaned_tokens_list:
            yield {token: True for token in tweet_tokens}

    # NOTE: the original wrapped the training below in
    # ``if __name__ == "__main__":``.  Inside a view function that guard left
    # ``classifier`` undefined whenever the module was imported rather than run
    # as a script, so the training now always runs.
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # Prints its own table and returns None, so no surrounding print().
    classifier.show_most_informative_features(10)

    custom_tweet = ""
    if request.method == 'POST':
        custom_tweet = request.form['text']

    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    NB_Cls = classifier.classify({token: True for token in custom_tokens})
    print(custom_tweet, NB_Cls)

    # NOTE(review): this pickles the predicted label, not the classifier —
    # confirm that is what consumers of sentimental_101.pkl expect.
    with open('sentimental_101.pkl', 'wb') as label_file:
        pickle.dump(NB_Cls, label_file)

    return render_template('results.html', result=NB_Cls)
word_features = list(set(all_words))[:2000] def find_features(wordList): words = set(wordList) features = {} for w in word_features: features[w] = (w in words) return features training_set = [] for wordList, category in documents: training_set.append((find_features(wordList), category)) classifier = NaiveBayesClassifier.train(training_set) while True: choose = 0 print("Opinion List") print("============") if (len(opinionList) > 0): for index, opinion in enumerate(opinionList): print(str(index + 1) + ". " + opinion) else: print("No opinion inserted") print("Opinion Analysis") print("1. Insert Opinion")
class NBClassifier(TransformerMixin):
    """Naive Bayes classifier for part-of-text classification.

    The classifier creates a wrapper around NLTK NaiveBayesClassifier
    and implements `transform` and `fit_transform` methods suitable for
    pipeline integration.

    :param label_probdist: P(label), the probability distribution
        over labels.  It is expressed as a ``ProbDistI`` whose
        samples are labels.  I.e., P(label) =
        ``label_probdist.prob(label)``.

    :param feature_probdist: P(fname=fval|label), the probability
        distribution for feature values, given labels.  It is
        expressed as a dictionary whose keys are ``(label, fname)``
        pairs and whose values are ``ProbDistI`` objects over feature
        values.  I.e., P(fname=fval|label) =
        ``feature_probdist[label,fname].prob(fval)``.  If a given
        ``(label,fname)`` is not a key in ``feature_probdist``, then
        it is assumed that the corresponding P(fname=fval|label)
        is 0 for all values of ``fval``.

    :param estimator: probability estimator handed to
        ``NaiveBayesClassifier.train`` by `fit`.
    """

    def __init__(self, label_probdist=None, feature_probdist=None, estimator=ELEProbDist):
        self._estimator = estimator

        # in case arguments are specified (ie. when restoring the classifier)
        if all([label_probdist, feature_probdist]):
            self._classifier = NaiveBayesClassifier(
                label_probdist=label_probdist,
                feature_probdist=feature_probdist,
            )
        else:
            # not trained yet, `fit` creates the underlying classifier
            self._classifier = None

    @property
    def features(self):
        """Most informative features of the trained classifier, None if untrained."""
        if self._classifier is None:
            return None

        return self._classifier.most_informative_features()

    # noinspection PyPep8Naming, PyUnusedLocal
    def fit(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Fits the classifier to the given data set.

        :param X: Iterable, output of FeatureExtractor

            The X is expected to be an iterable of tuples
            (tagged_word, feature_set, label), where feature set
            is a dictionary of evaluated features.
            The format of X matches the output of `FeatureExtractor`.

        :param y: redundant (included to preserve base class method definition)

        :returns: self
        """
        # NLTK classifier expects stacked featuresets for the training,
        # so we need to reduce the dimensionality
        labeled_featuresets = list()
        for entry in X:
            labeled_featuresets.extend([
                (featureset, feature_label)
                for _, featureset, feature_label in entry
            ])

        # initialize the NLTK classifier
        self._classifier = NaiveBayesClassifier.train(
            labeled_featuresets, estimator=self._estimator)

        return self

    # noinspection PyPep8Naming, PyUnusedLocal
    def transform(self, X):  # pylint: disable=invalid-name,unused-argument
        """Auxiliary function to be used in pipeline."""
        return self

    # noinspection PyPep8Naming
    def evaluate(
            self,
            X: typing.Iterable,  # pylint: disable=invalid-name
            y: typing.Iterable,
            sample,
            n=3):
        """Perform evaluation of the classifier instance.

        :param X: Iterable, test data

            Same shape as for `fit` and `fit_predict` methods

        :param y: Iterable, of labels
        :param sample:

            one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :param n: int, number of candidates to output

        :returns: result of `precision(total=len(y), correct=<hits>)`
        :raises ValueError: if `X` and `y` differ in length
        """
        # noinspection PyTypeChecker,PyTypeChecker
        if len(X) != len(y):
            raise ValueError("`X` and `y` must be of the same length.")

        candidate_arr = self.fit_predict(X, n=n, sample=sample)

        # an entry counts as a hit when any of its top `n` candidates matches the label
        correctly_predicted = 0
        for candidates, label in zip(candidate_arr, y):
            pred = self._valid_candidates(candidates, label)
            correctly_predicted += int(pred)

        # return the accuracy score
        # noinspection PyTypeChecker
        return precision(total=len(y), correct=correctly_predicted)

    # noinspection PyPep8Naming
    def fit_predict(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Makes prediction about the given data.

        :param X: Iterable, prediction data

            The prediction data is expected to be of type
            List[(name_tuple, feature_set [,feature,label)]
            where feature_set corresponds to the output of FeatureExtractor and
            feature labels (if provided) should be None (will be ignored anyway).

        :param y: redundant (included to preserve base class method definition)
        :param fit_params: kwargs, fit parameters

            n: number of candidates to output
            sample:

                one of labels to get the prediction for (for example,
                if labels are ['class_A', 'class_B', 'class_C'], the sample
                could be 'class_A'.

        :returns: numpy array with, for each entry of `X`, its `n` best
            (name_tuple, probability) pairs sorted by descending probability
        :raises ValueError: if `sample` is not given
        :raises TypeError: if `X` or `y` has no `__len__`
        """
        # get fit parameters
        n = fit_params.get('n', 3)
        sample = fit_params.get('sample', None)

        # do not allow sample to be `None` (wouldn't be possible to sort
        # the candidates in a logical way)
        if sample is None:
            raise ValueError("`fit_parameter` `sample` was not specified."
                             " This is not allowed in `fit_predict` method")

        # compare against None explicitly: `y or []` raises for numpy arrays,
        # whose truth value is ambiguous
        labels = [] if y is None else y
        if not all([hasattr(var, '__len__') for var in [X, labels]]):
            raise TypeError("`X` and `y` must implement `__len__` method")

        # noinspection PyTypeChecker
        predictions = [None] * len(X)
        for i, x in enumerate(X):
            candidate_pred = [None] * len(x)
            for j, candidate in enumerate(x):
                if len(candidate) == 3:
                    # feature label was provided as part of X set (usual case), ignore it
                    name_tuple, features, _ = candidate
                else:
                    name_tuple, features = candidate

                candidate_pred[j] = (name_tuple,
                                     self.predict(features, sample=sample))

            sorted_pred = sorted(candidate_pred,
                                 key=lambda t: t[1], reverse=True)
            predictions[i] = sorted_pred[:n]

        return np.array(predictions)

    def predict(self, features: dict, sample=None) -> typing.Any:
        """Make predictions based on given features.

        :param features: dict, features to be used for prediction

            Dictionary of (feature_key, feature_value)

        :param sample:

            one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :returns: Union[float, dict]

            If `sample` is specified, returns P(sample|features), ie the
            probability of `sample` given features, where `sample` is one
            of labels.
            Otherwise returns dict of (label: prob) for all known labels.

        :raises ValueError: if the classifier has not been trained yet
        """
        if self._classifier is None:
            raise ValueError("Unable to make predictions. "
                             "Classifier has not been trained yet!")

        prob_dist = self._classifier.prob_classify(features)

        if sample is not None:
            probs = prob_dist.prob(sample)
        else:
            probs = {s: prob_dist.prob(s) for s in self._classifier.labels()}

        return probs

    def show_most_informative_features(self):
        """Print the most informative features; does nothing if untrained."""
        if self._classifier is None:
            return

        self._classifier.show_most_informative_features()

    def export(self, export_dir=None, export_name=None) -> str:
        """Exports timestamped pickled classifier to the given directory.

        :param export_dir: target directory, defaults to 'export/'
            (created if it does not exist)
        :param export_name: base name of the file, defaults to 'classifier';
            a trailing '.checkpoint' extension is stripped

        :returns: file name (without the directory part) of the written
            `<export_name>.<timestamp>.checkpoint` file
        """
        export_dir = export_dir or 'export/'
        export_name = export_name or 'classifier'

        if export_name.endswith('.checkpoint'):
            export_name = ".".join(export_name.split('.')[:-1])

        time_stamp = str(datetime.datetime.now().timestamp())

        # create export directory
        os.makedirs(export_dir, exist_ok=True)

        time_stamped_fname = ".".join([export_name, time_stamp, 'checkpoint'])
        time_stamped_fpath = os.path.join(export_dir, time_stamped_fname)

        # pickle and export the classifier
        with open(time_stamped_fpath, 'wb') as exp_file:
            pickle.dump(self, exp_file)

        return time_stamped_fname

    @staticmethod
    def restore(checkpoint) -> "NBClassifier":
        """Restores the classifier from a checkpoint file.

        :param checkpoint: path to directory or specific checkpoint

            If path to directory provided, the checkpoint whose path sorts
            last lexicographically is restored.

        :returns: the unpickled classifier
        :raises ValueError: if a directory holds no '.checkpoint' file
        """
        def _restore_checkpoint(fp):
            # SECURITY: unpickling executes arbitrary code, only restore
            # checkpoints that come from a trusted source
            with open(fp, 'rb') as checkpoint_file:
                # load the exported classifier
                return pickle.load(checkpoint_file)

        if os.path.isdir(checkpoint):
            checkpoint_dir = checkpoint
            checkpoints = [
                os.path.join(checkpoint_dir, f)
                for f in os.listdir(checkpoint) if f.endswith('.checkpoint')
            ]

            # find the latest
            if not checkpoints:
                raise ValueError(
                    "No checkpoints were found in `{}`.".format(checkpoint))

            latest_checkpoint = sorted(checkpoints)[-1]

            clf = _restore_checkpoint(latest_checkpoint)

        else:
            clf = _restore_checkpoint(checkpoint)

        return clf

    @staticmethod
    def _valid_candidates(candidates: typing.Iterable, label):
        """Check whether the correct label is among candidates.

        Every candidate name is used as a case-insensitive regular expression
        that is searched for in `label`.

        :param candidates: iterable of (candidate, score) pairs, where
            candidate is itself a pair whose first item is the name
        :param label: text the candidate names are searched in
        :returns: True if any candidate name matches, False otherwise
        """
        for candidate, _ in candidates:
            candidate_name, _ = candidate
            try:
                if re.search(candidate_name, label, flags=re.IGNORECASE):
                    return True
            except (re.error, TypeError):
                # NLTK lets tokens like '**' through, which are not valid
                # patterns; skip them instead of giving up on the remaining
                # candidates (the previous bare `except` returned False here)
                continue

        return False
def __init__(self, feat_sets):
    """Split ``feat_sets`` and prepare the Naive Bayes classifiers.

    The first 9500 entries of ``feat_sets`` become ``self.train_set`` and the
    remainder ``self.test_set``.  The two scikit-learn wrappers are only
    constructed here; the NLTK classifier is trained on the training slice
    immediately.
    """
    boundary = 9500
    self.train_set, self.test_set = feat_sets[:boundary], feat_sets[boundary:]

    multinomial = SklearnClassifier(MultinomialNB())
    self.Multinomial_classifier = multinomial

    bernoulli = SklearnClassifier(BernoulliNB())
    self.bernoulli_classifier = bernoulli

    self.naivebayes_classifier = NaiveBayesClassifier.train(self.train_set)
def sentim(self, data):
    """Classify ``data`` as "Positive" or "Negative" with a freshly trained model.

    A Naive Bayes classifier is trained on the ``description`` column of two
    CSV files: ``tf_idf_adopted_csv.csv`` supplies the "Positive" examples and
    ``tf_idf_adoptable_csv.csv`` the "Negative" ones.  The shuffled dataset is
    split 70/30 into disjoint training and test parts, the accuracy is printed
    and the classifier is pickled to ``naivebayes_pet.pickle``.

    ``data`` is then converted to a string, stripped of punctuation, tokenized
    and classified.

    Args:
        data: the text to classify; it is passed through ``str()`` first.

    Returns:
        A one-element list holding the predicted label.
    """
    stop_words = ['the', 'an', 'the', 'i', 'a', 'and', 'to']
    remove_digits = str.maketrans('', '', digits)

    def cleaned_token_lists(path_csv):
        """Load the descriptions of ``path_csv`` and return one cleaned token list per row."""
        descriptions = read_df_csv(path_csv)["description"]
        token_lists = []
        for record in descriptions:
            text = record.lower().replace(", '...'", "").replace("...", '')
            # Digits are removed here; the original's extra
            # ``.replace('\d+', '')`` was a literal (non-regex) no-op.
            text = text.translate(remove_digits)
            # Removing digits can create new "..." runs, so strip them again.
            text = text.replace(", '...'", "").replace("...", '')
            token_lists.append(remove_noise(word_tokenize(text), stop_words))
        return token_lists

    negative_cleaned_tokens_list = cleaned_token_lists(
        '../data/csv/tf_idf_adoptable_csv.csv')
    freq_dist_neg = FreqDist(get_all_words(negative_cleaned_tokens_list))
    print("most common ADOPTABLE words: ", freq_dist_neg.most_common(10))

    positive_cleaned_tokens_list = cleaned_token_lists(
        '../data/csv/tf_idf_adopted_csv.csv')
    freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
    print("most common ADOPTED words: ", freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(description_dict, "Positive")
                        for description_dict in positive_tokens_for_model]
    negative_dataset = [(description_dict, "Negative")
                        for description_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    seventy_percent_of_data = int(len(dataset) * .7)

    random.shuffle(dataset)  # to avoid bias

    train_data = dataset[:seventy_percent_of_data]
    # The test part starts where the training part ends.  The original started
    # it at the 30% mark, so most test examples had also been trained on and
    # the reported accuracy was inflated.
    test_data = dataset[seventy_percent_of_data:]

    classifier = NaiveBayesClassifier.train(train_data)

    with open("naivebayes_pet.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

    print("%%%%%%%%%%%%%%%%%%%Accuracy is:", classify.accuracy(classifier, test_data))
    # Prints its own table and returns None, so no surrounding print().
    classifier.show_most_informative_features(10)

    # Strip punctuation from the input in a single pass, then split into words.
    punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    data = str(data).translate(str.maketrans('', '', punc)).split()

    # breakdown parts of speech
    parts_of_speech = [nltk.pos_tag(data)]
    print("parts of speech tagging: ", parts_of_speech)

    # Tokenize the cleaned text itself.  The original tokenized ``str(data)``,
    # i.e. the repr of the word list, which glued quotes onto the tokens so
    # they no longer matched the trained features.
    custom_tokens = remove_noise(word_tokenize(" ".join(data)))
    label = classifier.classify({token: True for token in custom_tokens})
    print(str(data), label)

    sentiment_result = [label]
    print("sentiment_result: ", type(sentiment_result), sentiment_result)

    return sentiment_result
#print(positive) sad_token = get_tweets_for_model(negative) joy_token = get_tweets_for_model(positive) negative_dataset = [(tweet_dict, "negative") for tweet_dict in sad_token] positive_dataset = [(tweet_dict, "positive") for tweet_dict in joy_token] dataset = positive_dataset + negative_dataset random.shuffle(dataset) train_data = dataset[:900] test_data = dataset[900:] classifier = NaiveBayesClassifier.train(train_data) print("Accuracy is:", classify.accuracy(classifier, test_data)) print(classifier.show_most_informative_features(10)) # Connect to MariaDB Platform try: conn = mariadb.connect( user="******", #- enter your username #password="******" - enter your password database="tcsproject" # - enter your database name ) except mariadb.Error as e: print(f"Error connecting to MariaDB Platform: {e}") sys.exit(1)
for word in features: if word not in labelled_features: labelled_features[word.lower()] = label_count labelled_features[word.lower()][label] += features[word] print "Currently at %d distinct tokens and %d papers" % ( len(labelled_features), samplecount) label_probdist = get_label_probdist(labelled_features) feature_probdist = get_feature_probdist(labelled_features) classifier = NaiveBayesClassifier(label_probdist, feature_probdist) for samplefile in test_samples: features = {} p = PaperParser() p.parsePaper(samplefile) for sentence in p.extractRawSentences(): tokens = nltk.word_tokenize(sentence) for word in tokens: features[word] = True dirname = os.path.basename(os.path.dirname(samplefile)) label = labels[dirname]
def train(all_features, ratio):
    """Split ``all_features`` by ``ratio`` and train a Naive Bayes classifier.

    The first ``int(len(all_features) * ratio)`` items form the training set
    and the remaining items the test set.

    Returns:
        A ``(train_set, test_set, classifier)`` tuple, where the classifier
        was trained on ``train_set`` only.
    """
    cut = int(len(all_features) * ratio)
    train_set = all_features[:cut]
    test_set = all_features[cut:]
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
print("Dictionary with Positive class : ", positiveReviewDataset[7]) print("Dictionary with Negative class : ", negativeReviewDataset[7]) #print("tagged neg :",negative_dataset[0]) dataset = positiveReviewDataset + negativeReviewDataset print("Dataset[0] :", dataset[0]) print("Dataset length", len(dataset)) random.shuffle(dataset) trainData = dataset[:7000] testData = dataset[7000:] trainedModel = NaiveBayesClassifier.train(trainData) print("Accuracy of the model : ", classify.accuracy(trainedModel, testData)) review = "This is a bad product." reviewTokens = noiseRemoval(word_tokenize(review)) # Test print print(review, " : ", trainedModel.classify(dict([token, True] for token in reviewTokens))) #Text = "j@nittha" #Text = re.sub("@", "a", Text) #print(Text)
def train_topic_classifier(self, train_set):
    """Return a Naive Bayes classifier trained on ``train_set``.

    ``train_set`` is forwarded unchanged to ``NaiveBayesClassifier.train``;
    nothing is stored on the instance.
    """
    return NaiveBayesClassifier.train(train_set)
print("Also see: Hindu Marriage Act") elif resultc != -1 or y == "Christian": f1 = open("Christian.txt") f2 = open("christian01.txt") l1 = f1.read() arr = sent_tokenize(l1) l2 = f2.read() arr2 = word_tokenize(l2) for i in range(0, len(arr)): li1.append(tuple((arr[i], arr2[i]))) f1.close() f2.close() print("Also see: Indian Divorce Act") mycase = sys.argv[3] #mycase=input("enter your case ") c1 = 0 c2 = 0 model = NaiveBayesClassifier(li1) #model=nltk.NaiveBayesClassifier.train(li1) #print(model.classify(mycase)) case = sent_tokenize(mycase) print(mycase) for i in range(0, len(case)): temp = model.classify(case[i]) if temp == "0": c1 = c1 + 1 else: c2 = c2 + 1 print("Probability of winning case", (c1 / (c1 + c2)) * 100)
def sentim_twitter(self, data):
    '''Classify ``data`` with a Naive Bayes model trained on NLTK's twitter samples.

    heavily borrowed from https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
    to show functioning model

    The positive and negative sample tweets are cleaned with ``remove_noise``,
    labelled, shuffled and split into the first 700 examples (training) and
    the remainder (testing).  ``data`` is converted to a string, tokenized,
    cleaned and classified.

    Returns:
        The predicted label, "Positive" or "Negative".
    '''
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # NOTE(review): only 700 examples are used for training (the referenced
    # tutorial uses 7000) — confirm this small split is intentional.
    train_data = dataset[:700]
    test_data = dataset[700:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("twitter data **********************************")
    print("%%%%%%%%%%%%%%%%%%% Twitter Accuracy is:", classify.accuracy(classifier, test_data))
    print("twitter data **********************************")
    # Prints its own table and returns None, so no surrounding print().
    classifier.show_most_informative_features(10)

    print("twitter data **********************************")
    print("twitter data **********************************")
    print("is this reading data correctly???: ", type(str(data)))
    custom_tweet = str(data)

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    # Classify once and reuse the label for both the log line and the return value.
    twitter = classifier.classify({token: True for token in custom_tokens})
    print("twitter data **********************************")
    print(custom_tweet, twitter)

    return twitter
def train_model(self, data):
    """Train a Naive Bayes classifier on ``data`` and keep it in ``self.model``.

    ``data`` is forwarded unchanged to ``NaiveBayesClassifier.train``.
    Nothing is returned.
    """
    fitted = NaiveBayesClassifier.train(data)
    self.model = fitted
def train(self, corpus, selected_feats):
    """Collect n-gram statistics from ``corpus`` and train the classifier.

    ``self.parse_corpus(corpus)`` yields the training set, whose items are
    unpacked as ``(chat, win, duration, extra)``.  For n = 1..5 the n-grams of
    every ``chat`` are counted (``self.unigrams`` ... ``self.fivegrams``) and
    those seen more than once are kept (``self.common_unigrams`` ...
    ``self.common_fivegrams``).  Each item is then converted by
    ``self.get_features`` and the ``(features, win)`` pairs are used to train
    ``self.classifier``, whose 20 most informative features are printed.

    Args:
        corpus: input handed to ``self.parse_corpus``.
        selected_feats: forwarded unchanged to ``self.get_features``.
    """
    train_set = self.parse_corpus(corpus)
    print('Train set:', len(train_set))

    def count_ngrams(n):
        """Count the space-joined runs of ``n`` consecutive words of every chat."""
        return Counter([
            ' '.join(chat[i:i + n])
            for chat, win, duration, extra in train_set
            # Chats shorter than ``n`` give an empty range and contribute nothing.
            for i in range(len(chat) - n + 1)
        ])

    def repeated(counter):
        """Return the keys of ``counter`` that occurred more than once."""
        return [gram for gram, value in counter.items() if value > 1]

    # unigram (single words are counted as they are, without joining)
    self.unigrams = Counter([
        word for chat, win, duration, extra in train_set for word in chat
    ])
    self.common_unigrams = repeated(self.unigrams)

    # bigram
    self.bigrams = count_ngrams(2)
    self.common_bigrams = repeated(self.bigrams)

    # trigram
    self.trigrams = count_ngrams(3)
    self.common_trigrams = repeated(self.trigrams)

    # fourgram
    self.fourgrams = count_ngrams(4)
    self.common_fourgrams = repeated(self.fourgrams)

    # fivegram
    self.fivegrams = count_ngrams(5)
    self.common_fivegrams = repeated(self.fivegrams)

    # NOTE(review): w8m8.iterate is not visible here; presumably a
    # progress-reporting wrapper around iteration — confirm.
    t = []
    for chat, win, duration, extra in w8m8.iterate(train_set, out='Training'):
        features = self.get_features(chat, duration, extra, selected_feats)
        t.append((features, win))

    self.classifier = NaiveBayesClassifier.train(t)
    self.classifier.show_most_informative_features(20)
def main():
    """Build, evaluate and return a Naive Bayes tweet sentiment classifier.

    The NLTK twitter samples (optionally extended with custom tweets, see the
    UPDATE HERE note below) are tokenized, cleaned with
    ``helpers.remove_noise``, labelled "Positive" / "Negative", shuffled and
    split 70:30 into training and test data.  The top words, the model
    accuracy and the most informative features are printed.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    stop_words = stopwords.words('english')

    # get cleaned positive and negative tokens
    positive_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in negative_tweet_tokens
    ]

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(
        negative_cleaned_tokens_list)

    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # split the data into a 70:30 ratio; computed from the dataset size so the
    # ratio also holds when custom tweets are added (7000 for the 10K samples)
    split_at = len(dataset) * 7 // 10
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    # prints its own table and returns None, so no surrounding print()
    classifier.show_most_informative_features(10)

    print('Model complete!\n')

    return classifier
def category_by_pos(): from nltk.corpus import brown from nltk import FreqDist from nltk import DecisionTreeClassifier from nltk import NaiveBayesClassifier from nltk import classify suffix_fdist = FreqDist() for word in brown.words(): word = word.lower() suffix_fdist.inc(word[-1:]) suffix_fdist.inc(word[-2:]) suffix_fdist.inc(word[-3:]) common_suffixes = suffix_fdist.keys()[:100] # print common_suffixes def pos_features(word): features = {} for suffix in common_suffixes: features['endswith(%s)' % suffix] = word.lower().endswith(suffix) return features tagged_words = brown.tagged_words(categories='news') featuresets = [(pos_features(n), g) for (n, g) in tagged_words] size = int(len(featuresets) * 0.1) train_set, test_set = featuresets[size:], featuresets[:size] # classifier = DecisionTreeClassifier.train(train_set) # print 'Decision Tree %f' % classify.accuracy(classifier, test_set) classifier = NaiveBayesClassifier.train(train_set) print 'NaiveBay %f' % classify.accuracy(classifier, test_set)