def save_page(self, pid, last_id, retry=30):
    """Mark the Page record *pid* as persisted by storing *last_id* in its
    ``in_database`` field, retrying transient storage failures.

    :param pid: document id of the Page record to update.
    :param last_id: value written to ``in_database``; by convention -1 means
        the insert/update failed (logged as an error below).
    :param retry: maximum number of attempts against the backing store.
    """
    last_id = int(last_id)
    while retry:
        try:
            pageObj = Page().get_from_id(pid)
            pageObj['in_database'] = last_id
            pageObj.save()
            break
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any storage error just consumes one retry.
        except Exception:
            retry -= 1
            continue
    # Logging is best-effort: only if the owner object carries a logger.
    if hasattr(self, 'log'):
        if last_id == -1:
            # last_id is already an int here; the redundant int() calls of the
            # original were dropped without changing the message text.
            self.log.error('Fail to insert or update %s, %d' % (pid, last_id))
        else:
            self.log.info('Inserted %s, %d' % (pid, last_id))
def write(self, page):
    """Upsert a crawled *page* dict into the Page store, then flag the
    matching URLTrie entry as persisted.

    Returns the (possibly mutated) *page* dict. Raises StorageError when a
    field cannot be decoded as UTF-8. Python 2 semantics throughout:
    ``str`` is bytes and ``unicode`` is text.
    """
    # Normalize every byte-string field to unicode. Py2 dict.items() returns
    # a list, so mutating page[] while iterating is safe here.
    for k, v in page.items():
        try:
            if isinstance(page[k], str):
                page[k] = page[k].decode('utf-8')
        except:
            # Undecodable field: dump the key and URL for debugging, then
            # wrap the traceback in the project's storage exception.
            print k
            print page['effective_url']
            raise StorageError(Traceback())
    # self.label may arrive as bytes or unicode; coerce to unicode.
    if isinstance(self.label, str):
        label = self.label.decode('utf-8')
    else:
        label = self.label
    # The page's document id is the md5 of its effective (post-redirect) URL.
    url_hash = md5(page['effective_url']).hexdigest().decode('utf-8')
    if isinstance(page['wrapper'], dict):
        # Re-encode unicode values to bytes so the pickle of the wrapper
        # dict is itself decodable as UTF-8 below.
        for k, v in page['wrapper'].items():
            if isinstance(v, unicode):
                page['wrapper'][k] = v.encode('utf-8')
        wrapper = pickle.dumps(page['wrapper']).decode('utf-8')
    else:
        wrapper = page['wrapper']
    retry = 30
    while retry:
        try:
            pageObj = Page().get_from_id(url_hash)
            if not pageObj:
                # First sighting: create a new record.
                # NOTE(review): the body is stored as `pageObj.page` here but
                # as `pageObj.body` in the update branch below — looks
                # inconsistent; confirm which field the schema expects.
                pageObj = New(Page())
                pageObj['_id'] = url_hash
                pageObj.label = label
                pageObj.url = page['url']
                pageObj.effective_url = page['effective_url']
                pageObj.url_hash = url_hash
                pageObj.page = page['body']
                pageObj.etag = page['etag']
                pageObj.last_modified = page['last_modified']
                pageObj.wrapper = wrapper
            elif md5(wrapper.encode('utf-8')).hexdigest() != md5(pageObj.wrapper.encode('utf-8')).hexdigest():
                # Existing record whose wrapper content changed: refresh the
                # stored fields and the update-frequency bookkeeping.
                pageObj.last_updated_at = datetime.utcnow()
                pageObj.label = label
                pageObj.url = page['url']
                pageObj.body = page['body']
                pageObj.etag = page['etag']
                pageObj.last_modified = page['last_modified']
                pageObj.wrapper = wrapper
                pageObj.updated_times += 1
                # +1 keeps the day count >= 1 for same-day updates.
                days = (pageObj.last_updated_at - pageObj.inserted_at).days + 1
                pageObj.update_freq = 1.0 * pageObj.updated_times / days
                pageObj.rank = int(30.0 * pageObj.update_freq)
            else:
                # Unchanged wrapper: nothing to persist.
                return page
            pageObj.save()
            break
        except:
            # Any storage failure consumes one retry attempt.
            retry -= 1
            continue
    # Optionally mark the corresponding URLTrie node as stored in the db.
    if hasattr(self, 'urltrie_label') and self.urltrie_label:
        label = self.urltrie_label.encode('utf-8')
        ident = md5('url:%s, label:%s' % (page['url'].encode('utf-8'), label)).hexdigest().decode('utf-8')
        retry = 30
        while retry:
            try:
                urlTrieObj = URLTrie().get_from_id(ident)
                # NOTE(review): statement grouping reconstructed from a
                # whitespace-mangled source; save() is taken as conditional on
                # the object existing, with break at try level so a missing
                # entry does not loop forever — confirm against history.
                if urlTrieObj:
                    urlTrieObj['in_database'] = 1
                    urlTrieObj.save()
                break
            except:
                retry -= 1
                continue
    return page