def is_duplicate_request_by_url(db_adapter, request, item):
    """Return True if a document with the same URL hash is already stored.

    Deduplicates by hashing ``request.url`` and probing the item's target
    collection for a matching ``url_hash``.

    :param db_adapter: object exposing a pymongo database as ``db_mod.db``
    :param request: request object with a ``url`` attribute
    :param item: mapping naming the target ``collection``
    :return: True when a matching document exists, False otherwise
    """
    db = db_adapter.db_mod.db
    collection = item['collection']
    spec = {'url_hash': url_digest(request.url)}
    # find_one returns the matched document or None; the redundant
    # if/else returning True/False is collapsed into one expression.
    return db[collection].find_one(spec) is not None
def is_duplicate_request(self, request, item):
    """Return True if this request's URL was already stored.

    Mirrors ``is_duplicate_request_by_url`` but reads the database handle
    from ``self.db_mod.db``.

    :param request: request object with a ``url`` attribute
    :param item: mapping naming the target ``collection``
    :return: True when a document with the same ``url_hash`` exists
    """
    db = self.db_mod.db
    collection = item["collection"]
    spec = {"url_hash": url_digest(request.url)}
    # find_one yields the stored document or None; return the existence
    # check directly instead of the verbose if/else True/False form.
    return db[collection].find_one(spec) is not None
def save_attachment(self, attach):
    """Persist an attachment document, keyed by URL hash when possible.

    If the attachment has a ``url``, upsert it into ``attachment_data``
    keyed on ``url_hash`` and return the URL; otherwise insert a new
    document and return its string id.

    :param attach: mutable mapping with optional ``url`` / ``url_hash``
    :return: ``attach['url']`` on upsert, otherwise ``str`` of the new id
    """
    # noinspection PyUnresolvedReferences
    db = self.db_mod.db
    collection = db["attachment_data"]
    attach_url = attach.get("url", "")
    url_hash = attach.get("url_hash", "")
    if attach_url:
        # Prefer a caller-supplied hash; fall back to hashing the URL.
        # (The original computed the digest and then immediately
        # overwrote it when url_hash was present — same final value,
        # without the wasted digest computation.)
        attach["url_hash"] = url_hash or url_digest(attach_url)
        spec = {"url_hash": attach["url_hash"]}
        # NOTE(review): Collection.update/insert with w=0 are pymongo
        # 2.x APIs (removed in pymongo 4: replace_one/insert_one);
        # kept as-is for compatibility with the driver this project
        # appears to pin — confirm before upgrading pymongo.
        collection.update(spec, attach, upsert=True, w=0)
        return attach["url"]
    else:
        doc_id = collection.insert(attach, w=0)
        return str(doc_id)
def process_item(self, spider):
    """Fill computed and default fields on a video item's doc in place.

    Existing keys in ``doc`` win; defaults only fill the gaps via
    ``setdefault``. ``attachments`` is always overwritten from the item.
    """
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)
    # Computed part
    calc_doc = {
        'url_hash': url_digest(doc['url']),
    }
    # Default fill for extracted data
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '视频',
        'crawl_time': now,
        'query': self['query'],
    }
    for key, value in chain(calc_doc.iteritems(), default_doc1.iteritems()):
        doc.setdefault(key, value)
    doc['attachments'] = self['attachments']
def process_item(self, spider):
    """Populate a patent item's doc with computed and default fields.

    Defaults only fill keys missing from ``doc`` (``setdefault``);
    ``attachments`` and ``content`` are then written unconditionally.
    """
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)
    # Computed part
    calc_doc = {
        'url_hash': url_digest(doc['url']),
    }
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '专利',
        'crawl_time': now,
        'query': self['query'],
        'patent_name': '',          # patent name
        'patent_type': '',          # patent type
        'patent_state': '',         # patent status
        'inventor': '',             # inventor
        'applicant': '',            # applicant
        'applicant_address': '',    # address
        'application_number': '',   # patent (application) number
        'application_time': '',     # application date
        'publication_number': '',   # publication number
        'publication_time': '',     # publication date
        'filing_time': '',          # certificate-issue date
        'agent_person': '',         # agent
        'agent_institution': '',    # agency
        'classification': '',       # classification number
        'abstract': '',             # abstract
        'description': '',          # description
        'claims': '',               # principal claims
        'attachments': [],
    }
    for key, value in chain(calc_doc.iteritems(), default_doc1.iteritems()):
        doc.setdefault(key, value)
    doc['attachments'] = self['attachments']
    doc['content'] = calc_item_content(doc, spider)
def process_item(self, spider):
    """Fill computed and default fields on a news item's doc in place.

    ``content`` is assembled by concatenating ``get_content`` over the
    doc's ``detail_pages``; defaults only fill missing keys.
    """
    now = datetime.datetime.utcnow()
    doc = self['doc']
    doc.pop('detail_url', None)
    # Computed part: URL hash plus content joined from the detail pages.
    pages = doc.get('detail_pages', [])
    calc_doc = {
        'url_hash': url_digest(doc['url']),
        'content': ''.join(get_content(page) for page in pages),
    }
    # Default fill for extracted data
    default_doc1 = {
        'spider_name': spider.name,
        'data_type': '新闻',
        'crawl_time': now,
        'query': self['query'],
    }
    for key, value in chain(calc_doc.iteritems(), default_doc1.iteritems()):
        doc.setdefault(key, value)
def find_attachment_by_url(self, url):
    """Look up a stored attachment document by the hash of *url*.

    :param url: attachment URL to search for
    :return: the matching document from ``attachment_data``, or None
    """
    # noinspection PyUnresolvedReferences
    db = self.db_mod.db
    query = {"url_hash": url_digest(url)}
    return db["attachment_data"].find_one(query)