Example #1
def is_duplicate_request_by_url(db_adapter, request, item):
    # Look up the hash of the request URL in the item's target collection.
    db = db_adapter.db_mod.db
    collection = item['collection']
    url_hash = url_digest(request.url)
    spec = {'url_hash': url_hash}
    # A matching document means this URL has already been crawled.
    return db[collection].find_one(spec) is not None
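Every example below keys deduplication on url_digest(), which is not shown in these snippets. A minimal sketch of what such a helper typically looks like, assuming it simply returns an MD5 hex digest of the UTF-8 encoded URL (the real project may normalize the URL first):

import hashlib

def url_digest(url):
    # Assumed behavior: a stable MD5 hex digest of the URL, usable as a dedup key.
    return hashlib.md5(url.encode('utf-8')).hexdigest()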
Example #2
def is_duplicate_request(self, request, item):
    # Same check as Example #1, but as a pipeline method using self.db_mod.
    db = self.db_mod.db
    collection = item["collection"]
    url_hash = url_digest(request.url)
    spec = {"url_hash": url_hash}
    # A matching document means this URL has already been crawled.
    return db[collection].find_one(spec) is not None
def save_attachment(self, attach):
    # noinspection PyUnresolvedReferences
    db = self.db_mod.db
    collection = db["attachment_data"]
    attach_url = attach.get("url", "")
    url_hash = attach.get("url_hash", "")
    if attach_url:
        # Prefer a caller-supplied url_hash; otherwise derive one from the URL.
        attach["url_hash"] = url_digest(attach_url)
        if url_hash:
            attach["url_hash"] = url_hash
        spec = {"url_hash": attach["url_hash"]}
        # Unacknowledged upsert keyed on url_hash (legacy PyMongo 2.x API).
        collection.update(spec, attach, upsert=True, w=0)
        return attach["url"]
    else:
        # No URL: plain insert, return the generated document id.
        doc_id = collection.insert(attach, w=0)
        return str(doc_id)
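collection.update(..., upsert=True, w=0) and collection.insert(..., w=0) are the legacy PyMongo 2.x write methods, removed in PyMongo 4. A minimal sketch of the same upsert-or-insert logic on PyMongo 3+ (save_attachment_v3 and its db argument are illustrative names, not part of the original code):

from pymongo import WriteConcern

def save_attachment_v3(db, attach):
    # Keep the unacknowledged writes (w=0) used by the original code.
    collection = db.get_collection("attachment_data", write_concern=WriteConcern(w=0))
    attach_url = attach.get("url", "")
    if attach_url:
        # Caller-supplied url_hash wins; otherwise derive it from the URL.
        attach["url_hash"] = attach.get("url_hash") or url_digest(attach_url)
        collection.replace_one({"url_hash": attach["url_hash"]}, attach, upsert=True)
        return attach["url"]
    result = collection.insert_one(attach)
    return str(result.inserted_id)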
Example #4
def process_item(self, spider):
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)
    # computed fields
    site_url_hash = url_digest(doc['url'])
    calc_doc = {
        'url_hash': site_url_hash,
    }
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '视频',  # "video"
        'crawl_time': now,
        'query': self['query'],
    }
    # Fill in computed values and defaults without overwriting scraped fields.
    all_doc = chain(calc_doc.iteritems(), default_doc1.iteritems())
    for k, v in all_doc:
        doc.setdefault(k, v)
    doc['attachments'] = self['attachments']
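The iteritems() calls mark these process_item methods as Python 2 code. The same merge-with-defaults pattern on Python 3, pulled out as a standalone sketch (fill_defaults is an illustrative name, not a function from the original project):

import datetime
from itertools import chain

def fill_defaults(doc, spider_name, query):
    # Computed fields first, then static defaults; setdefault never
    # overwrites a value the spider already scraped into doc.
    calc_doc = {'url_hash': url_digest(doc['url'])}
    default_doc = {
        'spider_name': spider_name,
        'data_type': '视频',  # "video"
        'crawl_time': datetime.datetime.utcnow(),
        'query': query,
    }
    for k, v in chain(calc_doc.items(), default_doc.items()):
        doc.setdefault(k, v)
    return doc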
Example #5
def process_item(self, spider):
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)

    # computed fields
    site_url_hash = url_digest(doc['url'])
    calc_doc = {
        'url_hash': site_url_hash,
    }
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '专利',  # "patent"
        'crawl_time': now,
        'query': self['query'],

        'patent_name': '',          # patent name
        'patent_type': '',          # patent type
        'patent_state': '',         # patent status
        'inventor': '',             # inventor
        'applicant': '',            # applicant
        'applicant_address': '',    # applicant address
        'application_number': '',   # patent (application) number
        'application_time': '',     # application date
        'publication_number': '',   # publication number
        'publication_time': '',     # publication date
        'filing_time': '',          # grant/certificate date
        'agent_person': '',         # patent agent
        'agent_institution': '',    # patent agency
        'classification': '',       # classification code
        'abstract': '',             # abstract
        'description': '',          # description
        'claims': '',               # principal claims
        'attachments': [],
    }
    # Fill in computed values and defaults without overwriting scraped fields.
    all_doc = chain(calc_doc.iteritems(), default_doc1.iteritems())
    for k, v in all_doc:
        doc.setdefault(k, v)
    doc['attachments'] = self['attachments']
    content = calc_item_content(doc, spider)
    doc['content'] = content
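calc_item_content() is another project helper that is not shown here. Judging from Example #6, which builds content inline by joining the extracted text of each detail page, it plausibly does something along these lines (a guess for orientation only, not the project's actual implementation):

def calc_item_content(doc, spider):
    # Hypothetical stand-in: concatenate the extracted text of every detail
    # page, mirroring the inline content computation in Example #6.
    return ''.join(get_content(page) for page in doc.get('detail_pages', []))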
Example #6
def process_item(self, spider):
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)
    # computed fields
    site_url_hash = url_digest(doc['url'])
    content = ''.join([get_content(page) for page in doc.get('detail_pages', [])])
    calc_doc = {
        'url_hash': site_url_hash,
        'content': content,
    }
    # default values filled into the extracted data
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '新闻',  # "news"
        'crawl_time': now,
        'query': self['query'],
    }
    all_doc = chain(calc_doc.iteritems(), default_doc1.iteritems())
    for k, v in all_doc:
        doc.setdefault(k, v)
def find_attachment_by_url(self, url):
    # noinspection PyUnresolvedReferences
    db = self.db_mod.db
    # Attachments are stored keyed on the hash of their URL (see save_attachment).
    spec = {"url_hash": url_digest(url)}
    return db["attachment_data"].find_one(spec)
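Taken together, save_attachment and find_attachment_by_url give a hash-keyed round trip over the attachment_data collection. A usage sketch (pipeline stands for any object exposing these two methods over a connected db_mod.db; the URL is made up):

attach = {
    "url": "http://example.com/files/report.pdf",
    "filename": "report.pdf",
}
pipeline.save_attachment(attach)   # upsert keyed on url_hash
doc = pipeline.find_attachment_by_url("http://example.com/files/report.pdf")
assert doc is not None and doc["url_hash"] == url_digest(attach["url"])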