Example #1
from pattern.vector import NB, Document
from pattern.db import csv


def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv(
                'library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
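
# A minimal usage sketch for the function above; the sentence is hypothetical
# and the training CSV referenced inside the function is assumed to exist.
print(resolve_certainty('These results are almost certainly reproducible.'))
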
import datetime

from pattern.db import csv


def csvGrab(fileName):
    """Read a CSV file, keep selected columns, and normalize the date column to YYYY-MM-DD."""
    data = csv(fileName)
    # Keep columns 1-7, 9 and 10 of every row.
    data = [[d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[9], d[10]] for d in data]
    # Drop the header row.
    data = data[1:]
    # Reformat the date column (now at index 7) from m/d/Y to Y-m-d.
    data = [[d[0], d[1], d[2], d[3], d[4], d[5], d[6],
             datetime.datetime.strptime(d[7], "%m/%d/%Y").strftime('%Y-%m-%d'),
             d[8]] for d in data]
    return data
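
# A hypothetical call: "events.csv" is a placeholder file name, assumed to
# have a header row, at least eleven columns, and an m/d/Y date at index 9.
rows = csvGrab('events.csv')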
Example #4
from collections import defaultdict

from pattern.vector import NB, Document
from pattern.db import csv


def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier on the reviews.csv file, analyzes each
    character's sentences, and returns their tones.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
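
# A minimal usage sketch for the function above. The character names and
# sentences are hypothetical, and "reviews.csv" is assumed to contain
# (review, rating) rows as in the training loop.
exampleSentences = {
    "Alice": ["I loved every minute of it.", "What a wonderful day."],
    "Bob": ["This was a complete waste of time."],
}
exampleTones = extractSentiment(exampleSentences)
print(exampleTones["Alice"])  # predicted rating labels for Alice's sentences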
Example #5
from pattern.web import Wiktionary, DOM
from pattern.db import csv

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv() # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save("given-names.csv")
        print(ch, gender)

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)
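
    # The original class is cut off after train(). Since chngrams and count
    # are imported above, the feature vector is presumably built from
    # character n-grams of the name; the two methods below are a hypothetical
    # sketch of that idea, not the original implementation.
    def classify(self, name, **kwargs):
        return SVM.classify(self, self.vector(name), **kwargs)

    def vector(self, name):
        # Count character bigrams, e.g. "anna" -> {"an": 1, "nn": 1, "na": 1}.
        return count(chngrams(name.lower(), n=2))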

# latent semantic analysis (fragment): m, concept and w1 are defined in the
# setup and enclosing loops that were omitted from this excerpt.
for feature, w2 in m.lsa.concepts[concept].items():
    if w1 != 0 and w2 != 0:
        print(feature, w1 * w2)

from pattern.vector import Document, Model, Cluster, HIERARCHICAL, NB
from pattern.db import csv

# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print(m.cluster(method=HIERARCHICAL, k=2))
# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print(cluster.depth)
print(cluster.flatten(1))
# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print(nb.classes)
print(nb.classify(Document('A good movie!')))
# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [
    Document(review, type=rating, stopwords=True) for review, rating in data
]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print(accuracy)
# binary classification
data = csv('data/input/reviews.csv')
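
# The binary-classification snippet above breaks off after loading the CSV.
# A hypothetical continuation: collapse the integer ratings into a True/False
# label and cross-validate a Naive Bayes classifier on the result. The 3-star
# threshold and the use of kfoldcv are assumptions, not the original code.
from pattern.vector import kfoldcv

data = [Document(review, type=int(rating) >= 3, stopwords=True)
        for review, rating in data]
print(kfoldcv(NB, data, folds=10))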
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" %
                     (gender.capitalize(), ch.capitalize()),
                     cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print(ch, gender)

# Create a classifier that predicts gender based on name.
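
# The example breaks off here. A hypothetical sketch of the classifier step,
# modelled on the imports in the earlier copy of this example (SVM, chngrams,
# kfoldcv): represent each name by its character bigrams and cross-validate
# an SVM on the saved CSV. This is an assumption, not the original file's code.
from pattern.vector import Document, SVM, chngrams, kfoldcv

data = [Document(" ".join(chngrams(name.lower(), n=2)), type=gender, stopwords=True)
        for name, gender in csv(pd("given-names.csv"))]
print(kfoldcv(SVM, data, folds=3))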
Example #8
import xml.etree.ElementTree as xmlTree
from pattern.vector import Document, NB, count, words
from pattern.web import plaintext
from pattern.db import csv
from collections import Counter

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

for grade, opinion in csv('trainData.csv', separator='\t'):
    comment = Document(opinion, type=int(grade), stopwords=True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()

for row in root:
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

print(wordStats.most_common(10))
print(opinionStats)
Example #9
# -*- coding: utf-8 -*-


from pattern.vector import Document, Model, TFIDF, SVM, kfoldcv, REGRESSION, RADIAL, CLASSIFICATION
from pattern.db import csv
from sys import argv
import jieba
import json
import python_db

# extraversion, agreeableness, conscientiousness, neuroticism, openness
category = ['ext', 'agr', 'con', 'neu', 'ope']

# open the corpus file
data = csv('./csv/corpus.csv')

# create the document.vector
data_doc = {}
for cate in category:
    data_doc[cate] = []
for text, ext, agr, con, neu, ope in data:
    data_doc['ext'].append(Document(' '.join(jieba.cut(text)), type=int(ext) == 1))
    data_doc['agr'].append(Document(' '.join(jieba.cut(text)), type=int(agr) == 1))
    data_doc['con'].append(Document(' '.join(jieba.cut(text)), type=int(con) == 1))
    data_doc['neu'].append(Document(' '.join(jieba.cut(text)), type=int(neu) == 1))
    data_doc['ope'].append(Document(' '.join(jieba.cut(text)), type=int(ope) == 1))

# create the TFIDF model
m = {}
for cate in category:
    m[cate] = Model(documents=data_doc[cate], weight=TFIDF)
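
# The example breaks off after building the TFIDF models. A hypothetical next
# step, suggested by the SVM and kfoldcv imports above: cross-validate one
# classifier per personality trait on each model's documents. The fold count
# and the default SVM settings (rather than the imported RADIAL/CLASSIFICATION
# constants) are assumptions, not the original code.
for cate in category:
    print(cate, kfoldcv(SVM, m[cate].documents, folds=10))
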
# For training this classifier we need the pattern, nltk (including the corpus), re and csv modules.

from pattern.vector import Model, Document, BINARY, SVM, kfoldcv, IG, SLP, KNN, NB
from pattern.db import csv
from pattern.en import ngrams
from pattern.vector import stem, PORTER, LEMMA
from nltk.corpus import stopwords
import re
import csv as csv1

# The file 'FbTrainingData.csv' should be in the same directory.

data = csv('FbTrainingData.csv')
data = [[message, int(side_effect_indicator)] for message, side_effect_indicator in data]

# List of NLTK stop words.
stop = stopwords.words('english')

# Add medicine names and obvious names to the stop words.
medlist1 = ["diabetes","actos", "pioglitazone hydrochloride", "pioglitazone",  "glustin", "glizone", "pioz", "zactos"]

medlist2 = ["medformin","glucophage", "metformin", "glucophage xr", "metformin hydrochloride", "carbophage sr", "riomet", "fortamet", "glumetza", "obimet", "gluformin", "dianben", "diabex", "diaformin", "siofor","metfogamma", "riomet"]

medlist3 = ["byetta", "bydureon", "exenatide"]