Example #1
    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method
Example #2
    def suggest_tags(self):
        """ Use cosine similarity to suggest tags for untagged notes.
        IDEAS:
            more weight for same notebook
            more weight for relative creation time
        """
        corrected_notes = {}

        if not self.mongo.users.find_one({'_id':self.user_id, 'bool_lsa':True},{'bool_lsa':1}):
            # LSA has not been run for this user yet; run it now so that KNN is fast.
            self._lsa_extract()
        
        corpus = Corpus.load('/data/corpus/' + str(self.user_id))
        # only untagged notes
        untagged_notes = self.mongo.notes.find({'_id_tags': None}, {})
        for note in untagged_notes:
            suggested_tags =  set()
            # get the doc from the corpus
            for weight, doc in corpus.nearest_neigbors(corpus[(note['_id'])], top=5):
                # get the similar doc
                tags = self.mongo.notes.find_one(
                        {'_id':doc.id},{'str_tags':1}).get('str_tags')
                if tags:
                    suggested_tags.update(tags)
            corrected_notes[(note['_id'])] = suggested_tags
        return corrected_notes
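
# For reference, a minimal pure-Python sketch of the cosine similarity used above,
# over sparse {word: weight} dictionaries. This is only an illustration of the
# measure, not the Corpus implementation.
import math

def cosine_similarity(v1, v2):
    """Cosine of the angle between two sparse vectors given as dicts."""
    dot = sum(w * v2.get(k, 0.0) for k, w in v1.items())
    n1 = math.sqrt(sum(w * w for w in v1.values()))
    n2 = math.sqrt(sum(w * w for w in v2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0

# cosine_similarity({'cat': 1, 'dog': 2}, {'dog': 1}) => ~0.89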
Example #4
    def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only data from the last `days` days.
        time_threshold = calendar.timegm((datetime.date.today() - datetime.timedelta(days=days)).timetuple())

        i = 1

        while i < (max_articles / 20):

            self.target_category_content = self.reader.getCategoryContent(self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in self.target_category_content and self.target_category_content[u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1

        self.corpus.save(path, update=True)
Example #5
import os, sys; sys.path.append(os.path.join("..", "..", ".."))

from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight,
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# A better measure is term frequency - inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so their words can be compared to calculate TF-IDF.

corpus = Corpus.build(os.path.join("corpus", "*.txt"))
d = corpus.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf(
    "food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are,
# based on the "vector" of word tf-idf frequencies of each document.
# This is called cosine-similarity:
d1 = corpus.document(name="lion")
d2 = corpus.document(name="tiger")
d3 = corpus.document(name="dolphin")
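
# A minimal pure-Python sketch of the standard TF-IDF weighting described above,
# for illustration only; Pattern's exact weighting and normalization may differ.
import math

def tf(word, document_words):
    return document_words.count(word) / float(len(document_words))

def idf(word, all_documents):
    n = sum(1 for doc in all_documents if word in doc)
    return math.log(len(all_documents) / float(n or 1))

def tfidf(word, document_words, all_documents):
    # A word scores high if it is frequent in this document but rare elsewhere.
    return tf(word, document_words) * idf(word, all_documents)

docs = [["food", "lion", "hunt"], ["food", "tiger"], ["dolphin", "swim"]]
print tfidf("lion", docs[0], docs)  # higher: "lion" appears only in this document
print tfidf("food", docs[0], docs)  # lower: "food" also appears in another document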
Example #6
# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for corpora
# that have many documents and many words.
# It is outperformed by KNN and SVM, but useful for running tests.

# We'll test it with a corpus of spam e-mail messages
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
import os
from pattern.db import Datasheet  # in older Pattern releases: pattern.table
from pattern.vector import Document, Corpus, Bayes

data = Datasheet.load(
    os.path.join("..", "..", "test", "corpora", "apache-spam.csv"))

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)
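
# A short usage sketch (not in the original snippet): once trained, the classifier
# predicts the type of an unseen document. The message below is invented;
# classify() returns True (real e-mail) or False (spam).
print classifier.classify(Document("win a free credit card offer now"))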
Example #7
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

import os
import glob
from pattern.vector import Document, Corpus, Bayes, KNN, features, distance, Vector, _distance, COSINE, kdtree, HIERARCHICAL


#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
#print xxx

corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        polarity = "yes" in review
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
#print xxx

n = 10
x = 0
t1 = 0

## k-means clustering with NLTK.
## NOTE: `clusterer` and `vectors` are not defined in this snippet; they would
## come from NLTK's clustering API, e.g.:
# clusterer = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance)
# clusters = clusterer.cluster(U, assign_clusters=True, trace=False)
clusters = clusterer.cluster(vectors, True)

print "clusterer: ", clusterer
print "clustered: ", vectors
print "As: ", clusters
# print "Means: ", clusterer.means()

# show the dendrogram
clusterer.dendrogram().show(leaf_labels=[str(i) for i in range(1, 28)])

## LSA analysis on the review corpus built above.
corpus.reduce(10)



# for document in corpus:
# 	print
# 	print document.name
# 	for concept, w1 in corpus.lsa.vectors[document.id].items():
# 		for word, w2 in corpus.lsa.concepts[concept].items():
# 			if w1 !=0 and w2 !=0:
# 				print (word, w1*w2)

## clustering analysis with Pattern's hierarchical clustering
patterncluster = corpus.cluster(method=HIERARCHICAL, k=10, iterations=1000)
print patterncluster, patterncluster.depth
'''
This program trains a kNN classifier to recognize adjectives taken from Twitter.
Adjectives are classified and identified as #win or #fail.
The adjective vectors are put in a corpus to train the classifier.
"damn" and "sucks" are classified as #fail; "awesome" and "cool" are classified as #win.
Results vary according to real-time tweets.

'''

from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()  #collection of texts

for i in range(1, 15):
    for tweet in Twitter().search(
            '#win OR #fail', start=i, count=100
    ):  # searches up to 14*100=1400 tweets for these two hashtags
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()
        s = Sentence(parse(s))  # parse() analyzes the tweet and returns a string annotated with part-of-speech tags
        s = search('JJ', s)     # searches for adjectives in the tweet (JJ = adjective)
        s = [match[0].string for match in s]
        s = ' '.join(s)
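        # The snippet is cut off here. Based on the description above and on
        # Examples #13 and #27, the loop presumably continues along these lines
        # (a sketch, not the original code):
        if len(s) > 0:
            corpus.append(Document(s, type=p, stemmer=None))
        # A KNN classifier would then be trained on the corpus, as in Example #13.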
Example #10
import os, time
from pattern.db import Datasheet  # in older Pattern releases: pattern.table
from pattern.vector import Document, Corpus, KNN

# Latent Semantic Analysis (LSA) discovers semantically related words across documents,
# groups these into different "concepts",
# and creates a "concept vector" instead of a word vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger.

# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 200 positive reviews and 200 negative reviews:
data = Datasheet.load(
    os.path.join("..", "..", "test", "corpora", "pang&lee-polarity.csv"))
data = data[:200] + data[-200:]

# Build a corpus of review documents.
# Each document consists of the top 30 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, type=int(score) > 0, top=30)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
Example #11
class DM_GReader():

    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method

    def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only data from the last `days` days.
        time_threshold = calendar.timegm((datetime.date.today() - datetime.timedelta(days=days)).timetuple())

        i = 1

        while i < (max_articles / 20):

            self.target_category_content = self.reader.getCategoryContent(self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in self.target_category_content and self.target_category_content[u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1

        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by KMPP method.
        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS, k=k, seed=KMPP, p=p, iterations=10)
            doc_list = []
            # For each cluster, calculate the centroid, and calculate the doc (vector) which is nearest to the centroid.
            for cluster in self.clusters:
                c = centroid(cluster)
                doc_min = cluster[0]
                d_min = distance(cluster[0].vector, c)
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if d < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster] for cluster in self.clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(k)

    def generate_repr_ids(self, k):
        """
        For each cluster, we choose an arbitary article as the cluster's representative.

        Return the ids of the article, here the document name is the article's id.
        Google Reader is using "i=http://www.google.com/reader/api/0/stream/items/contents" to get the content of a specific data.
        Now we use the centroid to represent the documents

        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(self.corpus.document(doc).vector, self.corpus.document(center).vector)

        return cost

    def get_article_content(self, ids):
        """
        Use the ids to find the content of the articles through google web content API
        """
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'

        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            contents.append(r.json)
        return contents

    def generate_htmls(self, k, ids):
        """
        Use the ids and k to generate htmls
        """
        htmls = {}
        for i in self.get_article_content(ids):
            feed = i['items'][0]
            for content in [u'content', u'summary']:
                if content in feed:
                    title = feed['title']
                    url = feed['alternate'][0]['href']
                    htmls[title] = url
        return htmls
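
# A usage sketch of the class above (not from the original source). The credentials
# and pickle path are placeholders; Google Reader has since been discontinued, so
# this is illustrative only.
reader = DM_GReader('user@example.com', 'password', method='kmeans')
reader.import_category(category_id=0, path='/tmp/greader_corpus.pickle', days=3)
ids = reader.generate_repr_ids(k=10)   # one representative article id per cluster
print reader.generate_htmls(10, ids)   # {title: url} for each representative article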
Example #12
    def import_category(self,
                        category_id=0,
                        path=None,
                        local=False,
                        max_articles=2000,
                        days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only data from the last `days` days.
        time_threshold = calendar.timegm(
            (datetime.date.today() -
             datetime.timedelta(days=days)).timetuple())

        i = 1

        while i < (max_articles / 20):

            self.target_category_content = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in self.target_category_content and self.target_category_content[
                    u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1

        self.corpus.save(path, update=True)
Example #13
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

# This example trains a simple classifier with Twitter messages.
# The idea is that if you have a number of texts with a "type" 
# (e.g., positive/negative opinion, language, author, ...),
# you can predict the type of other "unknown" texts.
# The k-nearest neighbor algorithm classifies texts according
# to the types of known texts that are most similar to the given input text.
# Different similarity measures can be used (e.g., how many characters are the same,
# how many words are the same, ...), by default cosine similarity is used (see the docs).

corpus = Corpus()

# First, we mine a corpus of tweets (9 pages of 100 results).
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in Twitter().search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()        # tweet in lowercase
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, stemmer=None))
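
# The example stops after building the corpus. A plausible continuation, based on
# Pattern's documented KNN classifier (a sketch, not part of the original example):
classifier = KNN()
for document in corpus:
    classifier.train(document)

# Classify adjectives that carry no explicit #win/#fail hashtag:
print classifier.classify('sweet')     # likely 'WIN'
print classifier.classify('horrible')  # likely 'FAIL'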
Example #14
import os
from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# Another interesting measure is term frequency-inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so that their words can be compared to calculate TF-IDF.

# You can build a corpus from a folder of text files.
# We supply a naming function that names individual documents based on the file path.
corpus = Corpus.build(os.path.join("corpus", "*.txt"),
                      name=lambda path: os.path.basename(path)[:-4])

d = corpus.document(name="lion")  # Filename is now used as document name.

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf(
    "food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are.
# The corpus can be represented as a matrix with documents as rows
# and words as columns. Each document row is called a "vector",
# containing the TF-IDF scores for word columns.
# We can compare two vectors by calculating their angle,
# this is called "cosine-similarity":
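
# A short sketch of that comparison (not in the original snippet). Corpus.similarity()
# reports the cosine similarity between two of its documents; the document names
# assume the same corpus folder as above.
print corpus.similarity(corpus.document(name="lion"),
                        corpus.document(name="tiger"))    # related documents
print corpus.similarity(corpus.document(name="lion"),
                        corpus.document(name="dolphin"))  # less related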
Example #15
from pattern.vector import Document, Corpus

# Latent Semantic Analysis (LSA) is a statistical machine learning method 
# based on a matrix calculation called "singular value decomposition" (SVD).
# It discovers semantically related words across documents.
# It groups these into different "concepts" 
# and creates a "concept vector" instead of a word vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger. 

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1,D2,D3,D4])

print corpus.search("curiosity")
print

corpus.reduce()

# A search on the reduced concept space also yields D3 ("pet") as a result,
# since D2 and D3 are slightly similar even though D3 does not explicitly contain "curiosity".
# Note how the results also yield stronger similarity scores (noise was filtered out).
print corpus.search("curiosity")
print

# The concept vector for document D1:
#print corpus.lsa.vectors[D1.id]
#print
from pattern.vector import Bayes, PORTER, TFIDF

f_neg = open('Negative_pr', 'r')
f_pos = open('Positive_pr', 'r')
neg_lines = f_neg.readlines()
pos_lines=f_pos.readlines()
print 'Number of Negative Tweets:',len(neg_lines)
print 'Number of Positive Tweets:',len(pos_lines)

documents = []
for line in neg_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type=0)
    documents.append(document)

for line in pos_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type=1)
    documents.append(document)

corpus = Corpus(documents,weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

classifier=Bayes(aligned=True)
for document in corpus:
    classifier.train(document,type=document.type)
print 'Done training'

# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'Bayes Classifier'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print Bayes.test(corpus, folds=10)
# Classify the preprocessed tweets file into Positive or Negative categories.
# Sample with a 20-tweet file.
from pattern.vector import Document,Bayes,LSA, Corpus,PORTER,TFIDF
from numpy import diag,dot
from numpy.linalg import svd
filename='sample_20'
f=open(filename,'r')
lines=f.readlines()
Positive=lines[9:]
Negative=lines[:9]
Type=[0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]
docs=[]
for text in lines:
	vec=Document(text,stopword=True,stemmer=PORTER,type='1')
	docs.append(vec)

corpus=Corpus(documents=docs,weight=TFIDF)
print corpus.vectors
corpus.reduce(3)
print "hi"
print corpus.lsa.vectors

f.close()
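
# A short inspection sketch (adapted from the commented-out code in Example #7,
# not part of the original snippet): map each document's concept weights back
# to the words that make up each LSA concept.
for document in corpus:
    print document.id
    for concept, w1 in corpus.lsa.vectors[document.id].items():
        for word, w2 in corpus.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print (word, w1 * w2)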
Example #18
class DM_GReader():
    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method

    def import_category(self,
                        category_id=0,
                        path=None,
                        local=False,
                        max_articles=2000,
                        days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only data from the last `days` days.
        time_threshold = calendar.timegm(
            (datetime.date.today() -
             datetime.timedelta(days=days)).timetuple())

        i = 1

        while i < (max_articles / 20):

            self.target_category_content = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in self.target_category_content and self.target_category_content[
                    u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1

        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by KMPP method.
        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS,
                                                k=k,
                                                seed=KMPP,
                                                p=p,
                                                iterations=10)
            doc_list = []
            # For each cluster, calculate the centroid, and calculate the doc (vector) which is nearest to the centroid.
            for cluster in self.clusters:
                c = centroid(cluster)
                doc_min = cluster[0]
                d_min = distance(cluster[0].vector, c)
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if d < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster]
                             for cluster in self.clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(
                k)

    def generate_repr_ids(self, k):
        """
        For each cluster, we choose an arbitary article as the cluster's representative.

        Return the ids of the article, here the document name is the article's id.
        Google Reader is using "i=http://www.google.com/reader/api/0/stream/items/contents" to get the content of a specific data.
        Now we use the centroid to represent the documents

        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(
                    self.corpus.document(doc).vector,
                    self.corpus.document(center).vector)

        return cost

    def get_article_content(self, ids):
        """
        Use the ids to find the content of the articles through google web content API
        """
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'

        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            contents.append(r.json)
        return contents

    def generate_htmls(self, k, ids):
        """
        Use the ids and k to generate htmls
        """
        htmls = {}
        for i in self.get_article_content(ids):
            feed = i['items'][0]
            for content in [u'content', u'summary']:
                if content in feed:
                    title = feed['title']
                    url = feed['alternate'][0]['href']
                    htmls[title] = url
        return htmls
Example #19
import os
from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# Another interesting measure is term frequency-inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so that their words can be compared to calculate TF-IDF.

# You can build a corpus from a folder of text files.
# We supply a naming function that names individual documents based on the file path.
corpus = Corpus.build(os.path.join("corpus", "*.txt"), name=lambda path: os.path.basename(path)[:-4])

d = corpus.document(name="lion") # Filename is now used as document name.

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf("food") # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are.
# The corpus can be represented as a matrix with documents as rows
# and words as columns. Each document row is called a "vector",
# containing the TF-IDF scores for word columns.
# We can compare two vectors by calculating their angle,
# this is called "cosine-similarity":
from pattern.vector import Document, Corpus, Bayes, PORTER, TFIDF

f_neg = open('Negative_pr', 'r')
f_pos = open('Positive_pr', 'r')
neg_lines = f_neg.readlines()
pos_lines = f_pos.readlines()

print 'Number of Positive Tweets:',len(pos_lines)
print 'Number of Negative Tweets:',len(neg_lines)

documents = []
for line in neg_lines:
    document = Document(line,stopword=False,stemmer=PORTER,type=0)
    documents.append(document)


for line in pos_lines:
    document = Document(line,stopword=False,stemmer=PORTER,type=1)
    documents.append(document)

corpus = Corpus(documents,weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print



# Train Naive Bayes on all documents.
# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'Bayes Classifier'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print Bayes.test(corpus,folds=10)

from pattern.vector import Document, Corpus, KNN, PORTER, TFIDF, COSINE, IG  # IG = information gain

f_neg = open('Negative_pr', 'r')
f_pos=open('Positive_pr','r')
neg_lines=f_neg.readlines()
pos_lines=f_pos.readlines()
print 'Number of Negative Tweets:',len(neg_lines)
print 'Number of Positive Tweets:',len(pos_lines)

documents = []
for line in neg_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type='0')
    documents.append(document)
for line in pos_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type='1')
    documents.append(document)

corpus = Corpus(documents,weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Filter the top 1000 features using the Information Gain criterion.
corpus = corpus.filter(features=corpus.feature_selection(top=1000, method=IG))

# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'Classifying using KNN'
print '-------------------------'
print '(Accuracy, Precision, Recall, F-Measure)'
print KNN.test(corpus, k=100, folds=10, distance=COSINE)
Example #22
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus, Vectorspace

# Latent Semantic Analysis (LSA) is a statistical machine learning method 
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents. 
# The idea is to group the document vectors in a matrix 
# (each document is a row, each word in the corpus is a column), 
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1,D2,D3,D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word was not in D4's description.
# However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of the keywords from D2.
# Performing a search on curiosity now also yields document D3 as a result.
Example #23
# and creates a "concept vector" instead of a word vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger. 

# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 200 positive reviews and 200 negative reviews:
data = Datasheet.load(os.path.join("..","..","test","corpora","pang&lee-polarity.csv"))
data = data[:200] + data[-200:]

# Build a corpus of review documents.
# Each document consists of the top 30 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, type=int(score) > 0, top=30)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
Example #24
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus, Vectorspace

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word was not in D4's description.
# However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of the keywords from D2.
# Performing a search on curiosity now also yields document D3 as a result.
Example #25
    def load_corpus(self):
        """ Load the user's corpus. Centralized here so that if corpus saving and
        loading ever change, other methods are not affected.
        """
        return Corpus.load('/data/corpus/' + str(self.user_id))
Example #26
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight,
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# A better measure is term frequency - inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so their words can be compared to calculate TF-IDF.

corpus = Corpus.build(os.path.join("corpus", "*.txt"))
d = corpus.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf("food") # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are,
# based on the "vector" of word tf-idf frequencies of each document.
# This is called cosine-similarity:
d1 = corpus.document(name="lion")
d2 = corpus.document(name="tiger")
d3 = corpus.document(name="dolphin")
d4 = corpus.document(name="shark")
Example #27
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

# This example trains a simple classifier with Twitter messages.
# The idea is that if you have a number of texts with a "type" 
# (e.g., positive/negative opinion, language, author, ...),
# you can predict the type of other "unknown" texts.
# The k-nearest neighbor algorithm classifies texts according
# to the types of known texts that are most similar to the given input text.
# Different similarity measures can be used (e.g., how many characters are the same,
# how many words are the same, ...), by default cosine similarity is used (see the docs).

corpus = Corpus()

# First, we mine a corpus of tweets.
# We'll use hashtags as type.
for page in range(1,6):
    for tweet in Twitter().search('#win OR #fail', start=page, count=100, cached=False):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()        # tweet in lowercase
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            corpus.append(Document(s, type=p, threshold=0, stemmer=None))