Example #1
# Imports needed to run this snippet (the cover-tree helper module name is
# assumed from the surrounding project; the rest are the libraries the code
# visibly relies on: libgreader, pattern.vector, bs4, requests).
import calendar
import datetime

import requests
from bs4 import BeautifulSoup
from libgreader import GoogleReader, ClientAuthMethod
from pattern.vector import Corpus, Document, LEMMA, centroid, distance
from covertree import Covertree, myTree  # local helper module, name assumed


class DM_GReader():
    def __init__(self, username, password, method='kmeans'):
        auth = ClientAuthMethod(username, password)
        self.reader = GoogleReader(auth)
        self.reader.buildSubscriptionList()
        self.categories = self.reader.getCategories()
        self.corpus = Corpus()
        self.method = method

    def import_category(self,
                        category_id=0,
                        path=None,
                        local=False,
                        max_articles=2000,
                        days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only the data published within the last `days` days
        time_threshold = calendar.timegm(
            (datetime.date.today() -
             datetime.timedelta(days=days)).timetuple())

        i = 1

        # Each query returns roughly 20 items, so cap the number of pages.
        while i < (max_articles / 20):

            self.target_category_content = self.reader.getCategoryContent(
                self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(
                            Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            if u'continuation' in self.target_category_content and self.target_category_content[
                    u'continuation'] is not None:
                continuation = self.target_category_content[u'continuation']
            else:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i = i + 1

        self.corpus.save(path, update=True)

    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by KMPP method.
        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS,
                                                k=k,
                                                seed=KMPP,
                                                p=p,
                                                iterations=10)
            doc_list = []
            # For each cluster, calculate the centroid, and calculate the doc (vector) which is nearest to the centroid.
            for cluster in self.clusters:
                c = centroid(cluster)
                doc_min = cluster[0]
                d_min = distance(doc_min.vector, c)
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if d < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster]
                             for cluster in self.clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(
                k)

    def generate_repr_ids(self, k):
        """
        For each cluster, we choose an arbitary article as the cluster's representative.

        Return the ids of the article, here the document name is the article's id.
        Google Reader is using "i=http://www.google.com/reader/api/0/stream/items/contents" to get the content of a specific data.
        Now we use the centroid to represent the documents

        """
        self._generate_clusters(k)
        return self.centroids

    def cost(self):
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(
                    self.corpus.document(doc).vector,
                    self.corpus.document(center).vector)

        return cost

    def get_article_content(self, ids):
        """
        Use the ids to fetch the content of the articles through the Google Reader items API.
        """
        url = 'http://www.google.com/reader/api/0/stream/items/contents'
        id_handle = 'tag:google.com,2005:reader/item/%s'
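        # The %s is the short item id: the last 16 characters of the feed
        # entry's id, which import_category stored as the Document name.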

        contents = []
        for _id in ids:
            r = requests.post(url, data={'i': (id_handle % _id)})
            contents.append(r.json())  # note: .json was a property, not a method, in requests < 1.0
        return contents

    def generate_htmls(self, k, ids):
        """
        Use the ids to build a {title: url} mapping for the selected articles (k is currently unused).
        """
        htmls = {}
        for i in self.get_article_content(ids):
            feed = i['items'][0]
            for content in [u'content', u'summary']:
                if content in feed:
                    title = feed['title']
                    url = feed['alternate'][0]['href']
                    htmls[title] = url
        return htmls
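
A minimal driving sketch, assuming valid Google Reader credentials and a writable pickle path (the credential strings, file name, and k value below are placeholders, not part of the original snippet): import one category, pick the articles nearest each cluster centroid, and print their titles and links.

if __name__ == '__main__':
    # Placeholder credentials and path; substitute real values.
    reader = DM_GReader('user@example.com', 'secret-password', method='kmeans')

    # Crawl up to three days of the first category and cache the corpus locally.
    reader.import_category(category_id=0, path='corpus.pickle', days=3)

    # Cluster into 5 groups and take the article nearest each centroid.
    repr_ids = reader.generate_repr_ids(5)
    print 'Total clustering cost: %f' % reader.cost()

    # Resolve the representative ids to (title, url) pairs.
    for title, url in reader.generate_htmls(5, repr_ids).items():
        print '%s -> %s' % (title, url)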