Example #1
    def suggest_tags(self):
        """Uses cosine similarity to suggest tags for untagged notes.
        IDEAS:
            more weight for same notebook
            more weight for relative creation time
        """
        corrected_notes = {}

        if not self.mongo.users.find_one({'_id': self.user_id, 'bool_lsa': True}, {'bool_lsa': 1}):
            # LSA has not been run for this user yet; run it now so the
            # nearest-neighbor lookups below are fast.
            self._lsa_extract()

        corpus = Corpus.load('/data/corpus/' + str(self.user_id))
        # Only consider notes that have no tags yet.
        untagged_notes = self.mongo.notes.find({'_id_tags': None}, {'_id': 1})
        for note in untagged_notes:
            suggested_tags = set()
            # Find the note's document in the corpus, then collect the tags
            # of its nearest neighbors.
            for weight, doc in corpus.nearest_neighbors(corpus[note['_id']], top=5):
                tags = self.mongo.notes.find_one(
                        {'_id': doc.id}, {'str_tags': 1}).get('str_tags')
                if tags:
                    suggested_tags.update(tags)
            corrected_notes[note['_id']] = suggested_tags
        return corrected_notes
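
A minimal sketch of the weighting ideas listed in the docstring above, boosting a neighbor's cosine similarity when it comes from the same notebook and when the two notes were created close together in time. Everything here is hypothetical: the notebook flag, the timestamp delta, and the decay constants are assumptions rather than part of the original class.

    import math

    def weighted_similarity(weight, same_notebook, seconds_apart,
                            notebook_boost=1.5, half_life=7 * 24 * 3600.0):
        """Scale a raw cosine similarity by notebook and creation-time factors.
        (Hypothetical helper, not in the original code.)"""
        if same_notebook:
            weight *= notebook_boost
        # Exponential decay with a one-week half-life: notes created around
        # the same time count for more.
        return weight * math.exp(-math.log(2) * seconds_apart / half_life)

    # A neighbor with cosine similarity 0.6, same notebook, created a day apart:
    print weighted_similarity(0.6, True, 24 * 3600)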
Example #2
    def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3):
        """Import the specific category to a Pattern Corpus for future calculation.
        category_id: the integer indicates which category to use.
        cont: the integer tells how many queries to issue to continuously crawl the GReader.
        path: the location for storing the pickle of the Pattern Corpus.
        local: to use the local stored corpus?
        max_articles: the number of max articles we try to crawl if one day's subscriptions is too much."""

        if path is None:
            print "Please provide with a path to store/load local pickle file."
            return

        if local:
            self.corpus = Corpus.load(path)
            return

        self.target_category = self.categories[category_id]
        continuation = None

        # Crawl only the data from the last `days` days
        time_threshold = calendar.timegm((datetime.date.today() - datetime.timedelta(days=days)).timetuple())

        i = 1

        while i < max_articles / 20:  # each page returns roughly 20 items

            self.target_category_content = self.reader.getCategoryContent(self.target_category, continuation=continuation)
            feeds = self.target_category_content[u'items']

            if self.target_category_content['updated'] < time_threshold:
                break

            feeds_docs = []
            for feed in feeds:
                doc_name = feed[u'id'][-16:]
                for content in [u'content', u'summary']:
                    if content in feed:
                        feed_soup = BeautifulSoup(feed[content][u'content'])
                        feed_text = feed_soup.get_text()
                        feeds_docs.append(Document(feed_text, stemmer=LEMMA, name=doc_name))
                        break

            self.corpus.extend(feeds_docs)

            # get() covers both the missing key and the explicit None case.
            continuation = self.target_category_content.get(u'continuation')
            if continuation is None:
                print 'Finished!'
                break

            print 'Retrieving %d articles...' % (i * 20)
            i += 1

        self.corpus.save(path, update=True)
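
A standalone sketch of the continuation-style pagination loop used above. fetch_page is a hypothetical stand-in for reader.getCategoryContent: each page carries an items list and, until the feed is exhausted, a continuation token for the next request. The fake two-page feed is for demonstration only.

    def crawl_all(fetch_page, max_pages=100):
        """Follow continuation tokens until the feed runs out of pages."""
        items, continuation = [], None
        for _ in range(max_pages):
            page = fetch_page(continuation)
            items.extend(page[u'items'])
            continuation = page.get(u'continuation')
            if continuation is None:
                break
        return items

    # Fake two-page feed for demonstration.
    pages = {None: {u'items': [1, 2], u'continuation': 'p2'},
             'p2': {u'items': [3], u'continuation': None}}
    print crawl_all(lambda c: pages[c])  # -> [1, 2, 3]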
Example #3
    def load_corpus(self):
        """Load the user's corpus. Kept in one place because corpus saving
        and retrieving might change, and this way any change won't affect
        other methods.
        """
        return Corpus.load('/data/corpus/' + str(self.user_id))
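
A hedged sketch of the save-side counterpart this docstring implies: if every corpus read and write funnels through one pair of methods, a change of storage layout only ever touches these two places. The method name save_corpus is an assumption; the path layout and the Corpus.save call match the examples above.

    def save_corpus(self, corpus):
        """Persist the user's corpus; counterpart to load_corpus above.
        (Hypothetical helper, not in the original code.)"""
        corpus.save('/data/corpus/' + str(self.user_id), update=True)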