def extract_results(self, query_resp):
    ret = []
    for pid, pid_dict in query_resp.get('pages', {}).iteritems():
        page_info = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_info)
    return ret

def extract_results(self, query_resp):
    ret = []
    for k, pid_dict in query_resp['pages'].iteritems():
        page = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page)
    return ret

def extract_results(self, query_resp):
    ret = []
    for k, pid_dict in query_resp['pages'].iteritems():
        page_info = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_info)
    return ret

def extract_results(self, query_resp):
    ret = []
    for pid, pid_dict in query_resp['pages'].iteritems():
        if pid.startswith('-'):
            continue
        page_ident = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_ident)
    return ret

def extract_results(self, query_resp):
    ret = []
    for pid, pid_dict in query_resp["pages"].iteritems():
        if pid.startswith("-"):
            pid_dict["pageid"] = None  # TODO: breaks consistency :/
        page_ident = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_ident)
    return ret

def extract_results(self, query_resp):
    ret = []
    for pid, pid_dict in query_resp['pages'].iteritems():
        if pid.startswith('-'):
            pid_dict['pageid'] = None  # TODO: breaks consistency :/
        page_ident = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_ident)
    return ret

def extract_results(self, query_resp):
    ret = []
    for k, pid_dict in query_resp['pages'].iteritems():
        try:
            page_ident = PageInfo.from_query(pid_dict, source=self.source)
        except ValueError:
            continue
        ret.append(page_ident)
    return ret

def extract_results(self, query_resp):
    ret = []
    for pid, pid_dict in query_resp['pages'].iteritems():
        if pid.startswith('-'):
            pid_dict['pageid'] = None  # TODO: breaks consistency :/
        try:
            page_ident = PageInfo.from_query(pid_dict, source=self.source)
        except ValueError:
            continue
        ret.append(page_ident)
    return ret

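# For context (a hedged sketch, not from the original source): in a MediaWiki
# API query response, pages that could not be resolved are keyed by negative
# ids and carry a 'missing' marker with no 'pageid', which is what the
# startswith('-') check above guards against. The literal values below are
# illustrative only.
sample_query_resp = {
    'pages': {
        '1365': {'pageid': 1365, 'ns': 0, 'title': 'Coffee'},
        '-1': {'ns': 0, 'title': 'No_such_page', 'missing': ''},
    }
}
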
def process_item(self, item, spider):
    try:
        existence = self.session.query(exists().where(
            PageInfo.currenturl == item['currenturl'])).scalar()
        if not existence:
            # No row for this URL yet: insert a new record.
            self.session.add(
                PageInfo(baseurl=item['baseurl'],
                         currenturl=item['currenturl'],
                         content=item['content'],
                         fetchtime=item['fetchtime'],
                         contentmd5=item['contentmd5'],
                         contenttype=item['contenttype'],
                         prevfetchtime=item['prevfetchtime'],
                         domain_name=item['domain_name'],
                         page_title=item['page_title'],
                         page_body=item['page_body']))
        else:
            if not self.session.query(exists().where(
                    PageInfo.contentmd5 == item['contentmd5'])).scalar():
                # Row exists but the content changed: update it.
                self.session.query(PageInfo).filter(
                    PageInfo.currenturl == item['currenturl']).update({
                        'content': item['content'],
                        'contentmd5': item['contentmd5'],
                        'contenttype': item['contenttype'],
                        'page_title': item['page_title'],
                        'fetchtime': item['fetchtime'],
                        'page_body': item['page_body']
                    })
        self.session.commit()
    except Exception as e:
        print(e)
        self.session.rollback()
    return item

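# A minimal sketch of how such a pipeline is typically wired up; the class
# name PageInfoPipeline, the SQLite URL, and the open/close hooks below are
# illustrative assumptions, not part of the original snippet.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


class PageInfoPipeline(object):
    def open_spider(self, spider):
        # Create one session per spider run; process_item above assumes
        # self.session already exists.
        engine = create_engine('sqlite:///pages.db')  # hypothetical DSN
        self.session = sessionmaker(bind=engine)()

    def close_spider(self, spider):
        self.session.close()

# Enabled in the Scrapy project's settings.py, e.g.:
# ITEM_PIPELINES = {'myproject.pipelines.PageInfoPipeline': 300}
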
def extract_results(self, query_resp):
    ret = []
    for k, pid_dict in query_resp.get('pages', {}).items():
        page_ident = PageInfo.from_query(pid_dict, source=self.source)
        ret.append(page_ident)
    return ret