Beispiel #1
0
def calc_item_content(doc, spider):
    """Build the text content of ``doc`` from its HTML attachments.

    Every entry of ``doc['attachments']`` is resolved through
    ``spider.db_adapter``: ``ObjectId`` entries are looked up with
    ``find_attachment_by_id``, every other entry with
    ``find_attachment_by_url``. Only attachments whose ``mime_type`` is
    exactly ``'text/html'`` contribute to the result.

    Args:
        doc: Mapping describing the crawled document. A missing or empty
            ``'attachments'`` entry is treated as "no attachments".
        spider: Object exposing ``db_adapter`` with the two lookup methods
            named above.

    Returns:
        The concatenation of ``get_content(page)`` for each HTML attachment's
        ``'data'`` value, in attachment order; ``''`` when there are none.
    """
    pages = []
    # ``or []`` covers both a missing key and an explicit None, so documents
    # without attachments produce empty content instead of raising.
    for attach_id in doc.get('attachments') or []:
        if isinstance(attach_id, ObjectId):
            attach = spider.db_adapter.find_attachment_by_id(attach_id)
        else:
            attach = spider.db_adapter.find_attachment_by_url(attach_id)
        if not attach:
            # The lookup returned nothing usable; skip this reference.
            continue
        if attach.get('mime_type', '') == 'text/html':
            pages.append(attach.get('data', ''))
    return ''.join(get_content(page) for page in pages)
Beispiel #2
0
 def process_item(self, spider):
     """Fill computed and default fields into ``self['doc']`` in place.

     The ``'detail_url'`` key is removed from the document if present. Then
     the fields below are added with ``setdefault``, so values already
     present in the document are never overwritten:

     * ``url_hash``    -- ``url_digest(doc['url'])``
     * ``content``     -- concatenation of ``get_content(page)`` over
       ``doc['detail_pages']`` (empty when that key is absent)
     * ``spider_name`` -- ``spider.name``
     * ``data_type``   -- the fixed label ``'新闻'``
     * ``crawl_time``  -- naive UTC timestamp taken at the start of the call
     * ``query``       -- ``self['query']``

     Args:
         spider: Object whose ``name`` attribute is recorded in the document.

     Returns:
         None. The document stored under ``self['doc']`` is mutated.

     Note:
         ``self['doc']``, ``self['query']`` and ``doc['url']`` are read by
         subscript; if one is missing, the lookup error propagates
         (``KeyError`` for dict-like containers).
     """
     now = datetime.datetime.utcnow()
     doc = self['doc']
     doc.pop('detail_url', None)
     # Computed fields.
     site_url_hash = url_digest(doc['url'])
     content = ''.join([get_content(page) for page in doc.get('detail_pages', [])])
     calc_doc = {
         'url_hash': site_url_hash,
         'content': content,
     }
     # Default values used to fill in the extracted data.
     default_doc1 = {
         'spider_name': spider.name,
         'data_type': '新闻',
         'crawl_time': now,
         'query': self['query'],
     }
     # ``items()`` instead of the Python-2-only ``iteritems()``: it behaves
     # the same here and also works on Python 3.
     all_doc = chain(calc_doc.items(), default_doc1.items())
     for k, v in all_doc:
         doc.setdefault(k, v)