Example #1
# Only the __init__ was given; a minimal runnable wrapper (the class name is hypothetical):
from libgreader import GoogleReader, ClientAuthMethod
from pattern.vector import Corpus


class FeedClusterer:

    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)   # authenticate with Google Reader
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()           # fetch the user's subscriptions
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()                        # empty corpus for feed documents
        self.method = method                          # clustering method, e.g. 'kmeans'
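
# Hypothetical usage (credentials are placeholders):
# clusterer = FeedClusterer("user@example.com", "secret", method='kmeans')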
Example #2
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word does not occur
# in D4's description. However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of D2's keywords.
# A search on "curiosity" now also yields document D3 as a result.
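
# For intuition, the SVD step that LSA relies on can be sketched with numpy
# alone. This is an illustrative sketch, not pattern's implementation; numpy
# and the toy term-document matrix below are assumptions.
import numpy

# Rows are documents, columns are word counts (a toy term-document matrix).
M = numpy.array([[1.0, 1.0, 0.0, 0.0],
                 [0.0, 0.0, 1.0, 1.0],
                 [1.0, 0.0, 1.0, 0.0],
                 [0.0, 1.0, 0.0, 1.0]])

U, sigma, Vt = numpy.linalg.svd(M, full_matrices=False)
sigma[2:] = 0  # keep the 2 strongest "concepts", filter out the weaker "noise"
reduced = numpy.dot(U, numpy.dot(numpy.diag(sigma), Vt))
print reduced  # the documents re-expressed in the reduced concept space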
Example #3
'''
This program trains a kNN classifier to recognize adjectives taken from Twitter.
Adjectives are classified as #win or #fail.
The adjective vectors are put in a corpus to train the classifier.
For example, "damn" and "sucks" are classified as #fail, while "awesome" and "cool" are classified as #win.
Results vary according to real-time tweets.

'''

from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()  # a collection of texts

for i in range(1, 15):
    # Search 14*100=1400 tweets for these two classes of hashtags.
    for tweet in Twitter().search('#win OR #fail', start=i, count=100):
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()
        # parse() annotates the string with part-of-speech tags;
        # Sentence() wraps the tagged string for searching.
        s = Sentence(parse(s))
        s = search('JJ', s)  # extract the adjectives (JJ = adjective tag)
        s = [match[0].string for match in s]
        s = ' '.join(s)
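        # The example breaks off here; following the docstring, a plausible
        # completion stores each adjective vector with its class label and
        # then trains the kNN classifier:
        corpus.append(Document(s, type=p))

classifier = KNN()
for document in corpus:
    classifier.train(document)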
Example #4
import os

from pattern.vector import Document, Corpus, Bayes
from pattern.db import Datasheet  # in older pattern releases: from pattern.table import Datasheet

# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for corpora
# that have many documents and many words.
# It is outperformed by KNN and SVM, but useful for running tests.

# We'll test it with a corpus of spam e-mail messages
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = Datasheet.load(
    os.path.join("..", "..", "test", "corpora", "apache-spam.csv"))

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)
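
# The trained classifier can now label unseen messages. A hedged usage sketch
# (the sample message is an invented placeholder):
print "predicted type:", classifier.classify(Document("win a free iPad, click this link now"))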
Example #5
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

import glob
from pattern.vector import Document, Corpus, Bayes, KNN, features, distance, Vector, _distance, COSINE, kdtree


#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
#print xxx

corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        # Review files appear to be labeled by filename: "yes" marks a positive review.
        polarity = "yes" in review
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
#print xxx

n = 10
x = 0
t1 = 0
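
# The file breaks off here; x and t1 read like counters for an accuracy/timing
# loop (n perhaps a fold or run count). A hedged sketch of such a test,
# assuming an 80/20 train/test split:
import time
from random import shuffle

documents = list(corpus.documents)
shuffle(documents)
split = int(len(documents) * 0.8)
train, test = documents[:split], documents[split:]

classifier = KNN()
for document in train:
    classifier.train(document)

for document in test:
    t0 = time.time()
    if classifier.classify(document) == document.type:
        x += 1  # count correct predictions
    t1 += time.time() - t0  # accumulate classification time

print "accuracy: %.2f" % (x / float(len(test)))
print "time: %.2f seconds" % t1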