Example #1
# Only the __init__ was given; a minimal runnable wrapper (the class name is hypothetical):
from libgreader import GoogleReader, ClientAuthMethod
from pattern.vector import Corpus


class FeedClusterer:

    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)   # authenticate with Google Reader
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()           # fetch the user's subscriptions
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()                        # empty corpus for feed documents
        self.method = method                          # clustering method, e.g. 'kmeans'
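
# Hypothetical usage (credentials are placeholders):
# clusterer = FeedClusterer("user@example.com", "secret", method='kmeans')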
Example #2
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word does not occur
# in D4's description. However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of D2's keywords.
# A search on "curiosity" now also yields document D3 as a result.
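
# For intuition, the SVD step that LSA relies on can be sketched with numpy
# alone. This is an illustrative sketch, not pattern's implementation; numpy
# and the toy term-document matrix below are assumptions.
import numpy

# Rows are documents, columns are word counts (a toy term-document matrix).
M = numpy.array([[1.0, 1.0, 0.0, 0.0],
                 [0.0, 0.0, 1.0, 1.0],
                 [1.0, 0.0, 1.0, 0.0],
                 [0.0, 1.0, 0.0, 1.0]])

U, sigma, Vt = numpy.linalg.svd(M, full_matrices=False)
sigma[2:] = 0  # keep the 2 strongest "concepts", filter out the weaker "noise"
reduced = numpy.dot(U, numpy.dot(numpy.diag(sigma), Vt))
print reduced  # the documents re-expressed in the reduced concept space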
Example #3
'''
This program trains a kNN classifier to recognize adjectives taken from Twitter.
Adjectives are classified as #win or #fail.
The adjective vectors are put in a corpus to train the classifier.
For example, "damn" and "sucks" are classified as #fail, while "awesome" and "cool" are classified as #win.
Results vary according to real-time tweets.

'''

from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()  # a collection of texts

for i in range(1, 15):
    # Search 14*100=1400 tweets for these two classes of hashtags.
    for tweet in Twitter().search('#win OR #fail', start=i, count=100):
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()
        # parse() annotates the string with part-of-speech tags;
        # Sentence() wraps the tagged string for searching.
        s = Sentence(parse(s))
        s = search('JJ', s)  # extract the adjectives (JJ = adjective tag)
        s = [match[0].string for match in s]
        s = ' '.join(s)
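        # The example breaks off here; following the docstring, a plausible
        # completion stores each adjective vector with its class label and
        # then trains the kNN classifier:
        corpus.append(Document(s, type=p))

classifier = KNN()
for document in corpus:
    classifier.train(document)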
Example #4
import os

from pattern.vector import Document, Corpus, Bayes
from pattern.db import Datasheet  # in older pattern releases: from pattern.table import Datasheet

# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for corpora
# that have many documents and many words.
# It is outperformed by KNN and SVM, but useful for running tests.

# We'll test it with a corpus of spam e-mail messages
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = Datasheet.load(
    os.path.join("..", "..", "test", "corpora", "apache-spam.csv"))

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)
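
# The trained classifier can now label unseen messages. A hedged usage sketch
# (the sample message is an invented placeholder):
print "predicted type:", classifier.classify(Document("win a free iPad, click this link now"))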
Example #5
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

import glob
from pattern.vector import Document, Corpus, Bayes, KNN, features, distance, Vector, _distance, COSINE, kdtree


#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
#print xxx

corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        # Review files appear to be labeled by filename: "yes" marks a positive review.
        polarity = "yes" in review
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
#print xxx

n = 10
x = 0
t1 = 0
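
# The file breaks off here; x and t1 read like counters for an accuracy/timing
# loop (n perhaps a fold or run count). A hedged sketch of such a test,
# assuming an 80/20 train/test split:
import time
from random import shuffle

documents = list(corpus.documents)
shuffle(documents)
split = int(len(documents) * 0.8)
train, test = documents[:split], documents[split:]

classifier = KNN()
for document in train:
    classifier.train(document)

for document in test:
    t0 = time.time()
    if classifier.classify(document) == document.type:
        x += 1  # count correct predictions
    t1 += time.time() - t0  # accumulate classification time

print "accuracy: %.2f" % (x / float(len(test)))
print "time: %.2f seconds" % t1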