def calc_item_content(doc, spider):
    """Build the text content of *doc* from its HTML attachments.

    Every entry of ``doc['attachments']`` is looked up through
    ``spider.db_adapter``: ``ObjectId`` instances via
    ``find_attachment_by_id``, anything else via
    ``find_attachment_by_url``.  Lookups that return a falsy value, and
    attachments whose ``mime_type`` is not ``'text/html'``, are ignored.

    Returns the ``get_content`` output for each remaining attachment's
    ``data`` field (``''`` when the field is missing), joined in
    attachment order without a separator.
    """
    html_pages = []
    for ref in doc['attachments']:
        # Pick the adapter lookup that matches the kind of reference.
        if isinstance(ref, ObjectId):
            lookup = spider.db_adapter.find_attachment_by_id
        else:
            lookup = spider.db_adapter.find_attachment_by_url
        record = lookup(ref)
        if not record:
            continue  # nothing stored for this reference
        if record.get('mime_type', '') == 'text/html':
            html_pages.append(record.get('data', ''))
    return ''.join(get_content(page) for page in html_pages)
def process_item(self, spider):
    """Fill in computed and default fields on this item's document.

    The document is read from ``self['doc']`` and modified in place:

    * ``'detail_url'`` is removed if present.
    * ``'url_hash'`` (``url_digest`` of ``doc['url']``) and ``'content'``
      (``get_content`` of every entry in ``doc['detail_pages']``, joined
      without a separator) are computed.
    * ``'spider_name'``, ``'data_type'``, ``'crawl_time'`` and ``'query'``
      get default values.

    All of these are applied with ``setdefault``, so a value already
    present in the document is never overwritten.

    ``doc['url']`` and ``self['query']`` are required; ``'detail_pages'``
    is optional and treated as empty when missing.
    """
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)

    # Computed fields.
    site_url_hash = url_digest(doc['url'])
    content = ''.join(
        get_content(page) for page in doc.get('detail_pages', [])
    )
    computed_fields = {
        'url_hash': site_url_hash,
        'content': content,
    }

    # Default values filled in for the extracted data.
    default_fields = {
        'spider_name': spider.name,
        'data_type': '新闻',
        'crawl_time': now,
        'query': self['query'],
    }

    # dict.items() is used instead of the Python 2-only dict.iteritems()
    # so this runs on both Python 2 and Python 3.  Both dicts are built
    # completely before the first setdefault, so a failure while building
    # them leaves the document without partially applied defaults.
    for fields in (computed_fields, default_fields):
        for key, value in fields.items():
            doc.setdefault(key, value)