def suggest_tags(self):
    """Suggest tags for every untagged note using cosine similarity.

    Ensures the user's LSA-reduced corpus exists (building it if needed
    so nearest-neighbour lookups are a fast KNN), then for each note
    without tags collects the tags of its five nearest-neighbour
    documents in the corpus.

    Returns:
        dict mapping note ``_id`` -> set of suggested tag strings.

    IDEAS: more weight for same notebook,
           more weight for relative creation time.
    """
    corrected_notes = {}
    if not self.mongo.users.find_one({'_id': self.user_id, 'bool_lsa': True},
                                     {'bool_lsa': 1}):
        # we have not done lsa before, do it now; we want a fast KNN
        self._lsa_extract()
    # NOTE(review): the original passed a stray, undefined `cls` as the
    # first argument to Corpus.load() (NameError in an instance method);
    # elsewhere in this file Corpus.load() takes only the path.
    corpus = Corpus.load('/data/corpus/' + str(self.user_id))
    # Only consider notes that have no tags yet.
    untagged_notes = self.mongo.notes.find({'_id_tags': None}, {})
    for note in untagged_notes:
        suggested_tags = set()
        # Walk the nearest neighbours of this note's corpus document.
        for weight, doc in corpus.nearest_neigbors(corpus[note['_id']], top=5):
            # Fetch the similar doc's tags; guard against a missing note
            # (find_one returns None), which previously raised
            # AttributeError on .get().
            similar = self.mongo.notes.find_one({'_id': doc.id},
                                                {'str_tags': 1})
            tags = similar.get('str_tags') if similar else None
            if tags:
                suggested_tags.update(tags)
        corrected_notes[note['_id']] = suggested_tags
    return corrected_notes
def import_category(self, category_id=0, path=None, local=False,
                    max_articles=2000, days=3):
    """Import one GReader category into a Pattern Corpus for later use.

    category_id: index into self.categories selecting which category to crawl.
    path: location to store/load the pickled Pattern Corpus (required).
    local: if True, load the previously pickled corpus instead of crawling.
    max_articles: upper bound on articles fetched (~20 per request).
    days: only keep batches whose 'updated' stamp is within this many days.
    """
    if path is None:
        print("Please provide with a path to store/load local pickle file.")
        return
    if local:
        self.corpus = Corpus.load(path)
        return
    self.target_category = self.categories[category_id]
    continuation = None
    # Epoch timestamp (UTC) of the oldest data we still accept:
    # `days` days before today.  (Original local was misspelled
    # `time_threadshold` and the comment claimed "one day".)
    time_threshold = calendar.timegm(
        (datetime.date.today() - datetime.timedelta(days=days)).timetuple())
    i = 1
    # Each request returns ~20 items, so cap the number of requests.
    # (Dropped the redundant `1 and` from the original condition.)
    while i < (max_articles / 20):
        self.target_category_content = self.reader.getCategoryContent(
            self.target_category, continuation=continuation)
        feeds = self.target_category_content[u'items']
        if self.target_category_content[u'updated'] < time_threshold:
            break
        feeds_docs = []
        for feed in feeds:
            doc_name = feed[u'id'][-16:]
            # Prefer the full content; fall back to the summary.
            for content in (u'content', u'summary'):
                if content in feed:
                    feed_soup = BeautifulSoup(feed[content][u'content'])
                    feed_text = feed_soup.get_text()
                    feeds_docs.append(
                        Document(feed_text, stemmer=LEMMA, name=doc_name))
                    break
        self.corpus.extend(feeds_docs)
        # .get() returns None both when the key is absent and when its
        # value is None -- equivalent to the original two-part check.
        continuation = self.target_category_content.get(u'continuation')
        if continuation is None:
            print('Finished!')
            break
        print('Retrieving %d articles...' % (i * 20))
        i += 1
    self.corpus.save(path, update=True)
def import_category(self, category_id=0, path=None, local=False, max_articles=2000, days=3): """Import the specific category to a Pattern Corpus for future calculation. category_id: the integer indicates which category to use. cont: the integer tells how many queries to issue to continuously crawl the GReader. path: the location for storing the pickle of the Pattern Corpus. local: to use the local stored corpus? max_articles: the number of max articles we try to crawl if one day's subscriptions is too much.""" if path is None: print "Please provide with a path to store/load local pickle file." return if local: self.corpus = Corpus.load(path) return self.target_category = self.categories[category_id] continuation = None # Crawl only the data within one day time_threadshold = calendar.timegm( (datetime.date.today() - datetime.timedelta(days=days)).timetuple()) i = 1 while 1 and i < (max_articles / 20): self.target_category_content = self.reader.getCategoryContent( self.target_category, continuation=continuation) feeds = self.target_category_content[u'items'] if self.target_category_content['updated'] < time_threadshold: break feeds_docs = [] for feed in feeds: doc_name = feed[u'id'][-16:] for content in [u'content', u'summary']: if content in feed: feed_soup = BeautifulSoup(feed[content][u'content']) feed_text = feed_soup.get_text() feeds_docs.append( Document(feed_text, stemmer=LEMMA, name=doc_name)) break self.corpus.extend(feeds_docs) if u'continuation' in self.target_category_content and self.target_category_content[ u'continuation'] is not None: continuation = self.target_category_content[u'continuation'] else: print 'Finished!' break print 'Retrieving %d articles...' % (i * 20) i = i + 1 self.corpus.save(path, update=True)
def load_corpus(self):
    """Load this user's pickled corpus from its canonical location.

    Centralised so that changes to how the corpus is saved/retrieved
    only need to happen here and cannot affect other methods.

    Returns the loaded Corpus instance.
    """
    # Corpus.load() takes only the path (see import_category above); the
    # original passed an undefined `cls` as the first argument, which
    # would raise NameError in this instance method.
    return Corpus.load('/data/corpus/' + str(self.user_id))