def resolve_certainty(certainty_info):
    """Classify a certainty description with a Naive Bayes model.

    An empty string short-circuits to the fixed message
    'No certainty info.'. Otherwise a fresh NB classifier is trained on
    every (observation, certainty) row of
    'library/templatetags/c_training_data.csv' and the label it predicts
    for `certainty_info` is returned.
    """
    # Guard clause: nothing to classify, so skip training entirely.
    if certainty_info == '':
        return 'No certainty info.'

    classifier = NB()
    training_rows = csv('library/templatetags/c_training_data.csv')
    for observation, certainty in training_rows:
        # The certainty column is stored as text; the class label is its int value.
        example = Document(observation, type=int(certainty), stopwords=True)
        classifier.train(example)

    return classifier.classify(Document(certainty_info))
def csvGrab(fileName):
    """Load `fileName` with csv() and return its cleaned data rows.

    For every row, columns 1-7, 9 and 10 are kept (in that order). The
    first row is then dropped, and the eighth kept value (source column 9)
    is re-formatted from "%m/%d/%Y" to '%Y-%m-%d'. Each returned row is a
    list of nine values.
    """
    wanted_columns = (1, 2, 3, 4, 5, 6, 7, 9, 10)
    # Project every row first (the first row included), exactly as the
    # column selection was applied before the first row was discarded.
    projected = [[row[i] for i in wanted_columns] for row in csv(fileName)]

    cleaned = []
    for row in projected[1:]:
        iso_date = datetime.datetime.strptime(row[7], "%m/%d/%Y").strftime('%Y-%m-%d')
        cleaned.append(row[:7] + [iso_date, row[8]])
    return cleaned
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file,
    classifies every sentence of every character, and returns the results.

    characterSentences: a mapping of key -> iterable of sentences; each
        sentence is converted with str() before being classified.

    Returns a defaultdict(list) mapping each key to the list of labels the
    classifier predicted for its sentences, in input order. Keys whose
    sentence iterable is empty do not appear in the result.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        # The rating column is read as text; the class label is its int value.
        nb.train(Document(review, type=int(rating), stopwords=True))
    # items() is available on both Python 2 and Python 3 mappings, whereas
    # iteritems() only exists on Python 2 dicts.
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
def extractSentiment(characterSentences):
    """
    Build a Naive Bayes classifier from reviews.csv and use it to label
    each character's sentences.

    characterSentences maps a key to an iterable of sentences. The result
    is a defaultdict(list) mapping each key to the labels predicted for its
    sentences (each sentence is passed through str() first). Keys with no
    sentences are not added to the result.
    """
    classifier = NB()
    for review, rating in csv("reviews.csv"):
        labelled = Document(review, type=int(rating), stopwords=True)
        classifier.train(labelled)

    tonesByCharacter = defaultdict(list)
    for character, sentences in characterSentences.items():
        tones = [classifier.classify(str(sentence)) for sentence in sentences]
        # Only touch the defaultdict when there is something to store, so
        # characters without sentences never get an (empty) entry.
        if tones:
            tonesByCharacter[character].extend(tones)
    return tonesByCharacter
from pattern.web import Wiktionary, DOM
from pattern.db import csv

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        # One appendix page per gender and initial letter.
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            # Links that point at other appendix pages are not names.
            if not name.startswith("Appendix:"):
                # Only the first letter of the gender is stored ("m" or "f").
                f.append((name, gender[0]))
        # The file is rewritten after every letter.
        f.save("given-names.csv")
        # Single-argument call form: prints "<letter> <gender>" on both
        # Python 2 and Python 3 (the bare print statement only parses on 2).
        print("%s %s" % (ch, gender))

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv


class GenderByName(SVM):
    # NOTE(review): self.vector() is not defined in the visible part of this
    # class - presumably it is declared further down; confirm.

    def train(self, name, gender=None):
        # Train the underlying SVM on the feature vector of the name
        # rather than on the raw string.
        SVM.train(self, self.vector(name), gender)
# NOTE(review): `m`, `concept` and `w1` are bound before this point, outside
# the visible code - confirm against the enclosing statements.
for feature, w2 in m.lsa.concepts[concept].items():
    # Only report features where both weights are non-zero.
    if w1 != 0 and w2 != 0:
        print(feature, w1 * w2)

# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
# All print statements below use the call form so that the script parses on
# Python 3 as well; with a single argument the output is unchanged on Python 2.
print(m.cluster(method=HIERARCHICAL, k=2))

# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print(cluster.depth)
print(cluster.flatten(1))

# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    # The rating column is read as text; the class label is its int value.
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print(nb.classes)
print(nb.classify(Document('A good movie!')))

# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True)
    for review, rating in data
]
# Train on the first 500 documents and evaluate on the remainder.
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print(accuracy)

# binary classification
data = csv('data/input/reviews.csv')
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in "abcdefghijklmnopqrstuvwxyz":
        # One appendix page per gender and initial letter.
        query = "Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize())
        p = w.search(query, cached=True)
        for name in p.links:
            # Links to other appendix pages are not names: skip them.
            if name.startswith("Appendix:"):
                continue
            f.append((name, gender[0]))
        # The file is rewritten after every letter.
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.
import xml.etree.ElementTree as xmlTree
from pattern.vector import Document, NB, count, words
from pattern.web import plaintext
from pattern.db import csv
from collections import Counter

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

# Train the classifier on tab-separated (grade, opinion) rows.
for grade, opinion in csv('trainData.csv', separator = '\t'):
    # The grade column is read as text; the class label is its int value.
    comment = Document(opinion, type=int(grade), stopwords = True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()

# Classify the 'Body' attribute of every child element of the XML root.
for row in root:
    # Keep only words that are alphabetic once surrounding apostrophes are
    # stripped and that are longer than one character.
    doc = Document(plaintext(row.attrib['Body']), filter = lambda w: w.strip("'").isalpha() and len(w) > 1, stopwords = False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    # A predicted label above zero counts as positive, anything else as negative.
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

# Call-form print parses on Python 3 as well; with a single argument the
# output is unchanged on Python 2.
print(wordStats.most_common(10))
print(opinionStats)
# -*- coding: utf-8 -*-
from pattern.vector import Document, Model, TFIDF, SVM, kfoldcv,REGRESSION,RADIAL,CLASSIFICATION
from pattern.db import csv
from sys import argv
import jieba
import json
import python_db

# extraversion, agreeable, conscientiousness, neuroticism, openness
category = ['ext', 'agr', 'con', 'neu', 'ope']

# open the corpus file
data = csv('./csv/corpus.csv')

# create the document.vector
# data_doc maps each category to the list of Documents labelled for it.
data_doc = {}
for cate in category:
    data_doc[cate] = []
for text, ext, agr, con, neu, ope in data:
    # Segment the text once per row and reuse the result for all five
    # categories instead of re-running jieba for each of them.
    segmented = ' '.join(jieba.cut(text))
    # The label columns follow the same order as `category`.
    for cate, label in zip(category, (ext, agr, con, neu, ope)):
        # The document type is True when the column value equals 1, else False.
        data_doc[cate].append(Document(segmented, type = int(label)==1))

# create the TFIDF model
# m maps each category to a Model built from that category's documents.
m = {}
for cate in category:
    m[cate] = Model(documents = data_doc[cate], weight=TFIDF)
# Training this classifier requires the pattern, nltk (including its corpus), re and csv modules.
from pattern.vector import Model, Document, BINARY, SVM, kfoldcv, IG, SLP,KNN, NB
from pattern.db import csv
from pattern.en import ngrams
from pattern.vector import stem, PORTER, LEMMA
from nltk.corpus import stopwords
import re
import csv as csv1

# The file 'FbTrainingData.csv' should be in the same directory.
# Each row is a (message, side_effect_indicator) pair; the indicator is
# converted to int.
data = []
for message, side_effect_indicator in csv('FbTrainingData.csv'):
    data.append([message, int(side_effect_indicator)])

# List of nltk stopwords.
stop = stopwords.words('english')

# Medicine names and other obvious names to be added to the stop words.
medlist1 = [
    "diabetes",
    "actos",
    "pioglitazone hydrochloride",
    "pioglitazone",
    "glustin",
    "glizone",
    "pioz",
    "zactos",
]
medlist2 = [
    "medformin",
    "glucophage",
    "metformin",
    "glucophage xr",
    "metformin hydrochloride",
    "carbophage sr",
    "riomet",
    "fortamet",
    "glumetza",
    "obimet",
    "gluformin",
    "dianben",
    "diabex",
    "diaformin",
    "siofor",
    "metfogamma",
    "riomet",
]
medlist3 = [
    "byetta",
    "bydureon",
    "exenatide",
]