Exemple #1
0
 def save_page(self, pid, last_id, retry=30):
     """Store ``last_id`` in the ``in_database`` field of the page ``pid``.

     The page is loaded with ``Page().get_from_id(pid)``, updated and saved.
     Any exception raised while loading or saving triggers another attempt,
     up to ``retry`` attempts in total.  The outcome is reported through
     ``self.log`` when the instance has such an attribute.

     Args:
         pid: identifier handed to ``Page().get_from_id``.
         last_id: value written to ``in_database``; converted with ``int()``.
             The value ``-1`` is reported as a failed insert/update.
         retry: maximum number of attempts.  Zero or a negative number
             means no attempt is made.

     Raises:
         ValueError, TypeError: if ``last_id`` cannot be converted by ``int()``.
     """
     last_id = int(last_id)
     saved = False
     last_error = None
     # ``retry > 0`` rather than plain truthiness: a negative count must not
     # turn a persistently failing save into an endless loop.
     while retry > 0:
         try:
             pageObj = Page().get_from_id(pid)
             pageObj['in_database'] = last_id
             pageObj.save()
             saved = True
             break
         except Exception as exc:
             # ``Exception`` instead of a bare ``except`` so that
             # KeyboardInterrupt / SystemExit still propagate.
             last_error = exc
             retry -= 1

     if hasattr(self, 'log'):
         if last_id == -1:
             self.log.error('Fail to insert or update %s, %d'%(pid, last_id))
         elif not saved:
             # Every attempt failed: do not claim the page was inserted.
             self.log.error('Fail to save %s, %d: %r'%(pid, last_id, last_error))
         else:
             self.log.info('Inserted %s, %d'%(pid, last_id))
Exemple #2
0
 def write(self, page):
     """Insert or refresh the stored ``Page`` document for ``page``.

     This is Python 2 code (it relies on ``str.decode`` and ``unicode``).

     Steps, in order:
       1. Every ``str`` value of ``page`` is decoded from UTF-8 in place.
       2. The document id is the MD5 hex digest of ``page['effective_url']``.
       3. A dict ``page['wrapper']`` has its unicode values encoded to UTF-8
          in place and is then pickled; any other wrapper is used as is.
       4. If no document with that id exists, a new one is filled in and
          saved.  If one exists and its wrapper differs, it is updated and
          ``update_freq`` / ``rank`` are recomputed.  If the wrapper is
          unchanged, ``page`` is returned immediately.
       5. When the document was saved and ``self.urltrie_label`` is set, the
          ``URLTrie`` entry for this url/label (if found) gets
          ``in_database = 1``.

     Each database step is attempted up to 30 times; any exception raised
     inside an attempt triggers the next one.

     Args:
         page: dict read for the keys ``url``, ``effective_url``, ``body``,
             ``etag``, ``last_modified`` and ``wrapper``.  It is modified
             in place.

     Returns:
         The same ``page`` dict.

     Raises:
         StorageError: if a ``str`` value of ``page`` cannot be decoded.
     """
     for k, v in page.items():
         try:
             if isinstance(v, str):
                 page[k] = v.decode('utf-8')
         except Exception:
             # Diagnostics kept from the original; print(x) with a single
             # argument prints the same thing as the Python 2 statement.
             print(k)
             print(page['effective_url'])
             raise StorageError(Traceback())

     if isinstance(self.label, str):
         label = self.label.decode('utf-8')
     else:
         label = self.label

     effective_url = page['effective_url']
     if isinstance(effective_url, unicode):
         # Hash the UTF-8 bytes, as is already done for the URLTrie id below.
         # NOTE(review): assuming md5 is hashlib.md5, passing unicode makes
         # Python 2 encode it as ASCII and fail on non-ASCII URLs.
         effective_url = effective_url.encode('utf-8')
     url_hash = md5(effective_url).hexdigest().decode('utf-8')

     if isinstance(page['wrapper'], dict):
         for k, v in page['wrapper'].items():
             if isinstance(v, unicode):
                 page['wrapper'][k] = v.encode('utf-8')
         wrapper = pickle.dumps(page['wrapper']).decode('utf-8')
     else:
         wrapper = page['wrapper']

     saved = False
     last_error = None
     for _ in range(30):
         try:
             pageObj = Page().get_from_id(url_hash)
             if not pageObj:
                 pageObj = New(Page())
                 pageObj['_id'] = url_hash
                 pageObj.label = label
                 pageObj.url = page['url']
                 pageObj.effective_url = page['effective_url']
                 pageObj.url_hash = url_hash
                 # NOTE(review): the update branch below writes the body to
                 # ``pageObj.body`` while this branch uses ``pageObj.page``;
                 # confirm against the Page schema which one is intended.
                 pageObj.page = page['body']
                 pageObj.etag = page['etag']
                 pageObj.last_modified = page['last_modified']
                 pageObj.wrapper = wrapper
             elif md5(wrapper.encode('utf-8')).hexdigest() != md5(pageObj.wrapper.encode('utf-8')).hexdigest():
                 pageObj.last_updated_at = datetime.utcnow()
                 pageObj.label = label
                 pageObj.url = page['url']
                 pageObj.body = page['body']
                 pageObj.etag = page['etag']
                 pageObj.last_modified = page['last_modified']
                 pageObj.wrapper = wrapper
                 pageObj.updated_times += 1
                 # +1 so a page updated on its insertion day does not
                 # divide by zero.
                 days = (pageObj.last_updated_at - pageObj.inserted_at).days + 1
                 pageObj.update_freq = 1.0 * pageObj.updated_times / days
                 pageObj.rank = int(30.0 * pageObj.update_freq)
             else:
                 # Wrapper unchanged: nothing to store.
                 return page
             pageObj.save()
             saved = True
             break
         except Exception as exc:
             # ``Exception`` instead of a bare ``except`` so that
             # KeyboardInterrupt / SystemExit still propagate.
             last_error = exc

     if not saved:
         # Every attempt failed: report it when a logger is available and do
         # not flag the URL as stored.
         if hasattr(self, 'log'):
             self.log.error('Fail to write page %s: %r'%(url_hash, last_error))
         return page

     if hasattr(self, 'urltrie_label') and self.urltrie_label:
         trie_label = self.urltrie_label.encode('utf-8')
         ident = md5('url:%s, label:%s'%(page['url'].encode('utf-8'), trie_label)).hexdigest().decode('utf-8')
         for _ in range(30):
             try:
                 urlTrieObj = URLTrie().get_from_id(ident)
                 if urlTrieObj:
                     urlTrieObj['in_database'] = 1
                     urlTrieObj.save()
                 break
             except Exception:
                 continue
     return page