def set_classifier(self):
    """Build and return the classifier selected by ``self.name``.

    'SLP' returns a single-layer perceptron trained on ``self.train_data``
    for ``self.iterations`` iterations; 'NB' returns a Naive Bayes model
    trained on ``self.train_data``. Any other name prints a warning and
    falls through, returning None implicitly.
    """
    if self.name == 'SLP':
        return SLP(train=self.train_data, iterations=self.iterations)
    elif self.name == 'NB':
        return NB(train=self.train_data)
    else:
        # Parenthesized form works on both Python 2 and 3; the original
        # bare print statement is a SyntaxError under Python 3.
        print("Unknown classifier name")
def resolve_certainty(certainty_info):
    """Resolve certainty with Naive Bayes.

    Trains a Naive Bayes classifier from
    'library/templatetags/c_training_data.csv' (observation, certainty)
    rows, then classifies *certainty_info* and returns the predicted
    certainty label. Empty or missing input returns a sentinel string.
    """
    # Original only tested `== ''`; any falsy value (None, '') would
    # otherwise crash when wrapped in Document(), so treat all as "no info".
    if not certainty_info:
        return 'No certainty info.'
    nb = NB()
    # NOTE(review): the model is retrained from CSV on every call —
    # consider caching the trained classifier if this path is hot.
    for observation, certainty in csv('library/templatetags/c_training_data.csv'):
        nb.train(Document(observation, type=int(certainty), stopwords=True))
    return nb.classify(Document(certainty_info))
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file,
    analyzes the sentence, and returns the tone.
    """
    classifier = NB()
    # Each CSV row is (review text, numeric rating); the rating becomes
    # the document's class label.
    for review, rating in csv("reviews.csv"):
        classifier.train(Document(review, type=int(rating), stopwords=True))
    tones = defaultdict(list)
    for character, sentences in characterSentences.items():
        tones[character].extend(classifier.classify(str(sentence))
                                for sentence in sentences)
    return tones
def classifyTweets(filename, trainingSet):
    """Classify every tweet in a preprocessed CSV and write the results.

    Reads '<INPUT_PATH>/processed<Filename>' (one tweet per row, text in
    column 0), classifies each tweet with a Naive Bayes model trained on
    *trainingSet*, and writes (tweet, label) rows to
    '<OUTPUT_PATH>/results.csv'.
    """
    print('Classifying {}...\n'.format(filename))
    classifier = NB(train=trainingSet, alpha=0.0001)
    tweets = []
    # 'with' guarantees the handle is closed (the original leaked it).
    # NOTE(review): 'rb'/'wb+' with the csv module is the Python 2 idiom;
    # under Python 3 these should be text mode with newline='' — confirm
    # which interpreter this runs on.
    with open('{}/processed{}'.format(INPUT_PATH, filename.capitalize()), 'rb') as data:
        for row in csv.reader(data):
            tweet = row[0]
            tweets.append([tweet, classifier.classify(Document(tweet))])
    # Write all tweets to file
    with open('{}/results.csv'.format(OUTPUT_PATH), 'wb+') as f:
        csv.writer(f).writerows(tweets)
def normal_test(data, type):
    """Train several classifiers on the first half of *data* and report
    accuracy, precision, recall, F1 and wall-clock time on the second half.

    *data* is a sequence of pattern.vector Documents; *type* is a label
    used only in the progress output. Results are printed, not returned.
    """
    # All prints use the parenthesized single-argument form, which behaves
    # identically on Python 2 and Python 3 (the original used py2 statements).
    print('----------------------------------------------------')
    print('TEST FUNCTION STARTED FOR ' + type + '!')
    total_data_size = len(data)
    # int(round(...)) keeps this an int under Python 3's true division too.
    training_size = int(round(total_data_size / 2))
    test_size = training_size
    print('Total Size: ' + str(total_data_size))
    print('Training Size: ' + str(training_size))
    print('Test Size: ' + str(test_size))
    print('Training Started for ' + type + '!')
    classification_methods = {
        # uncomment based on what classification algorithm you would like to test
        'NB': NB(train=data[:training_size], baseline=MAJORITY, method=MULTINOMIAL),
        'KNN2': KNN(train=data[:training_size], baseline=MAJORITY, k=2, distance=COSINE),
        'KNN3': KNN(train=data[:training_size], baseline=MAJORITY, k=3, distance=COSINE),
        'KNN4': KNN(train=data[:training_size], baseline=MAJORITY, k=4, distance=COSINE),
        'KNN5': KNN(train=data[:training_size], baseline=MAJORITY, k=5, distance=COSINE),
        'KNN6': KNN(train=data[:training_size], baseline=MAJORITY, k=6, distance=COSINE),
        'KNN7': KNN(train=data[:training_size], baseline=MAJORITY, k=7, distance=COSINE),
        'KNN8': KNN(train=data[:training_size], baseline=MAJORITY, k=8, distance=COSINE),
        'KNN9': KNN(train=data[:training_size], baseline=MAJORITY, k=9, distance=COSINE),
        'KNN10': KNN(train=data[:training_size], baseline=MAJORITY, k=10, distance=COSINE),
        'SLP1': SLP(train=data[:training_size], baseline=MAJORITY, iterations=1),
        'SLP2': SLP(train=data[:training_size], baseline=MAJORITY, iterations=2),
        'SLP3': SLP(train=data[:training_size], baseline=MAJORITY, iterations=3),
        'SVM': SVM(train=data[:training_size], type=CLASSIFICATION, kernel=POLYNOMIAL),
    }
    print('Normal Testing Started!')
    # uncomment to start the normal test
    for name, classifier in classification_methods.items():
        # measure the time it takes to classify!
        start = timeit.default_timer()
        # normal test
        accuracy, precision, recall, f1 = classifier.test(
            data[training_size:training_size + test_size])
        stop = timeit.default_timer()
        print('*' + name + '*')
        print('Accuracy: ' + str(accuracy))
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F1-score: ' + str(f1))
        print('Time: ' + str(stop - start))
        print('')
def learnCategories(tn):
    """Train and persist a Naive Bayes category classifier for *tn*.

    Reads 'resource/<tn>.catdata' (semicolon-separated, with headers) as
    (category, content) rows, skipping rows where either field is empty,
    and saves the trained model to 'resource/<tn>.classifier'.
    """
    nb = NB()
    for cat, content in csv('resource/%s.catdata' % tn, separator=';', headers=True):
        if not cat or not content:
            continue
        # French text: no stemmer, stopwords kept.
        nb.train(Document(content, type=cat, stemmer=None,
                          stopwords=False, language='fr'))
    logging.info('TRAINED %s on %d categories', tn, len(nb.classes))
    nb.save('resource/%s.classifier' % tn)
# Wrap each (score, message) pair as a Document whose type flag is
# True for a positive score and False otherwise.
for score, message in data:
    documents.append(Document(message, type=int(score) > 0))

m = Model(documents)
print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for doc in m:
    classifier.train(doc)

# We can now ask it questions about unknown e-mails:
print(classifier.classify("win money"))  # expected False: most likely spam
print(classifier.classify("fix bug"))    # expected True: most likely real
print()
print(classifier.classify("customer"))   # presumably False on a developer list
print(classifier.classify("guys"))       # presumably True
print()
# Inspect the LSA decomposition of document d: weight of each feature in
# each concept, skipping zero contributions.
# NOTE(review): this two-argument print outputs a tuple under Python 2 —
# confirm which interpreter this demo targets.
for concept, w1 in m.lsa.vectors[d.id].items():
    for feature, w2 in m.lsa.concepts[concept].items():
        if w1 != 0 and w2 != 0:
            print(feature, w1 * w2)

# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
# Single-argument print() calls below behave identically on Python 2 and 3
# (the original py2 print statements are SyntaxErrors under Python 3).
print(m.cluster(method=HIERARCHICAL, k=2))

# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print(cluster.depth)
print(cluster.flatten(1))

# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print(nb.classes)
print(nb.classify(Document('A good movie!')))

# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True)
    for review, rating in data
]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print(accuracy)

# binary classification
# Stdlib imports first, then third-party (pattern), per PEP 8.
import xml.etree.ElementTree as xmlTree
from collections import Counter

from pattern.db import csv
from pattern.vector import Document, NB, count, words
from pattern.web import plaintext

# Classifier plus running word/opinion tallies for the whole corpus.
nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

# Train on tab-separated (grade, opinion) pairs; the integer grade is
# the document's class label.
for grade, opinion in csv('trainData.csv', separator='\t'):
    nb.train(Document(opinion, type=int(grade), stopwords=True))

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()
for row in root:
    # Strip markup from the post body; keep only alphabetic words of
    # length > 1 (apostrophes trimmed first).
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    # Labels are the integer grades used in training: > 0 means positive.
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

# Parenthesized single-argument prints run identically on Python 2 and 3
# (the original bare print statements fail under Python 3).
print(wordStats.most_common(10))
print(opinionStats)