def suggest_tags(self):
    """Suggest tags for every untagged note using cosine similarity.

    Ensures the user's LSA-reduced corpus exists (building it if needed
    so nearest-neighbour lookups are a fast KNN), then for each note
    without tags collects the tags of its five nearest-neighbour
    documents in the corpus.

    Returns:
        dict mapping note ``_id`` -> set of suggested tag strings.

    IDEAS: more weight for same notebook,
           more weight for relative creation time.
    """
    corrected_notes = {}
    if not self.mongo.users.find_one({'_id': self.user_id, 'bool_lsa': True},
                                     {'bool_lsa': 1}):
        # we have not done lsa before, do it now; we want a fast KNN
        self._lsa_extract()
    # NOTE(review): the original passed a stray, undefined `cls` as the
    # first argument to Corpus.load() (NameError in an instance method);
    # elsewhere in this file Corpus.load() takes only the path.
    corpus = Corpus.load('/data/corpus/' + str(self.user_id))
    # Only consider notes that have no tags yet.
    untagged_notes = self.mongo.notes.find({'_id_tags': None}, {})
    for note in untagged_notes:
        suggested_tags = set()
        # Walk the nearest neighbours of this note's corpus document.
        for weight, doc in corpus.nearest_neigbors(corpus[note['_id']], top=5):
            # Fetch the similar doc's tags; guard against a missing note
            # (find_one returns None), which previously raised
            # AttributeError on .get().
            similar = self.mongo.notes.find_one({'_id': doc.id},
                                                {'str_tags': 1})
            tags = similar.get('str_tags') if similar else None
            if tags:
                suggested_tags.update(tags)
        corrected_notes[note['_id']] = suggested_tags
    return corrected_notes
def import_category(self, category_id=0, path=None, local=False,
                    max_articles=2000, days=3):
    """Import one GReader category into a Pattern Corpus for later use.

    category_id: index into self.categories selecting which category to crawl.
    path: location to store/load the pickled Pattern Corpus (required).
    local: if True, load the previously pickled corpus instead of crawling.
    max_articles: upper bound on articles fetched (~20 per request).
    days: only keep batches whose 'updated' stamp is within this many days.
    """
    if path is None:
        print("Please provide with a path to store/load local pickle file.")
        return
    if local:
        self.corpus = Corpus.load(path)
        return
    self.target_category = self.categories[category_id]
    continuation = None
    # Epoch timestamp (UTC) of the oldest data we still accept:
    # `days` days before today.  (Original local was misspelled
    # `time_threadshold` and the comment claimed "one day".)
    time_threshold = calendar.timegm(
        (datetime.date.today() - datetime.timedelta(days=days)).timetuple())
    i = 1
    # Each request returns ~20 items, so cap the number of requests.
    # (Dropped the redundant `1 and` from the original condition.)
    while i < (max_articles / 20):
        self.target_category_content = self.reader.getCategoryContent(
            self.target_category, continuation=continuation)
        feeds = self.target_category_content[u'items']
        if self.target_category_content[u'updated'] < time_threshold:
            break
        feeds_docs = []
        for feed in feeds:
            doc_name = feed[u'id'][-16:]
            # Prefer the full content; fall back to the summary.
            for content in (u'content', u'summary'):
                if content in feed:
                    feed_soup = BeautifulSoup(feed[content][u'content'])
                    feed_text = feed_soup.get_text()
                    feeds_docs.append(
                        Document(feed_text, stemmer=LEMMA, name=doc_name))
                    break
        self.corpus.extend(feeds_docs)
        # .get() returns None both when the key is absent and when its
        # value is None -- equivalent to the original two-part check.
        continuation = self.target_category_content.get(u'continuation')
        if continuation is None:
            print('Finished!')
            break
        print('Retrieving %d articles...' % (i * 20))
        i += 1
    self.corpus.save(path, update=True)
def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3): """Import the specific category to a Pattern Corpus for future calculation. category_id: the integer indicates which category to use. cont: the integer tells how many queries to issue to continuously crawl the GReader. path: the location for storing the pickle of the Pattern Corpus. local: to use the local stored corpus? max_articles: the number of max articles we try to crawl if one day's subscriptions is too much.""" if path is None: print "Please provide with a path to store/load local pickle file." return if local: self.corpus = Corpus.load(path) return self.target_category = self.categories[category_id] continuation = None # Crawl only the data within one day time_threadshold = calendar.timegm( (datetime.date.today() - datetime.timedelta(days=days)).timetuple()) i = 1 while 1 and i < (max_articles / 20): self.target_category_content = self.reader.getCategoryContent( self.target_category, continuation=continuation) feeds = self.target_category_content[u'items'] if self.target_category_content['updated'] < time_threadshold: break feeds_docs = [] for feed in feeds: doc_name = feed[u'id'][-16:] for content in [u'content', u'summary']: if content in feed: feed_soup = BeautifulSoup(feed[content][u'content']) feed_text = feed_soup.get_text() feeds_docs.append( Document(feed_text, stemmer=LEMMA, name=doc_name)) break self.corpus.extend(feeds_docs) if u'continuation' in self.target_category_content and self.target_category_content[ u'continuation'] is not None: continuation = self.target_category_content[u'continuation'] else: print 'Finished!' break print 'Retrieving %d articles...' % (i * 20) i = i + 1 self.corpus.save(path, update=True)
def load_corpus(self):
    """Load this user's pickled corpus from its canonical location.

    Centralised so that changes to how the corpus is saved/retrieved
    only need to happen here and cannot affect other methods.

    Returns the loaded Corpus instance.
    """
    # Corpus.load() takes only the path (see import_category above); the
    # original passed an undefined `cls` as the first argument, which
    # would raise NameError in this instance method.
    return Corpus.load('/data/corpus/' + str(self.user_id))