Exemple #1
0
 def extract_results(self, query_resp):
     """Build a PageInfo for every page entry in a query response.

     Args:
         query_resp: Mapping holding the query result. Its optional
             'pages' entry is a mapping whose values are handed to
             ``PageInfo.from_query``.

     Returns:
         A list with one ``PageInfo.from_query`` result per page entry;
         empty when 'pages' is absent or empty.
     """
     # .values() instead of the Python 2-only .iteritems(): works on both
     # Python 2 and 3, and the keys were never used here anyway.
     pages = query_resp.get('pages', {})
     return [PageInfo.from_query(pid_dict, source=self.source)
             for pid_dict in pages.values()]
Exemple #2
0
 def extract_results(self, query_resp):
     """Convert the page entries of a query response into PageInfo objects.

     Args:
         query_resp: Mapping holding the query result. The values of its
             'pages' mapping are passed to ``PageInfo.from_query``.

     Returns:
         A list of ``PageInfo.from_query`` results, one per page entry.
         A response without a 'pages' entry yields an empty list instead
         of raising KeyError.
     """
     # dict.iteritems() does not exist on Python 3; .values() works on both
     # major versions and drops the unused key.
     page_dicts = query_resp.get('pages', {}).values()
     return [PageInfo.from_query(pid_dict, source=self.source)
             for pid_dict in page_dicts]
Exemple #3
0
 def extract_results(self, query_resp):
     """Return PageInfo objects for all page entries in ``query_resp``.

     Args:
         query_resp: Mapping holding the query result; each value of its
             'pages' mapping is fed to ``PageInfo.from_query``.

     Returns:
         A list of the resulting objects. Empty when the response has no
         'pages' entry (previously a KeyError) or no page entries.
     """
     ret = []
     # .values() replaces the Python 2-only .iteritems(); the key was unused.
     for pid_dict in query_resp.get('pages', {}).values():
         ret.append(PageInfo.from_query(pid_dict, source=self.source))
     return ret
Exemple #4
0
 def extract_results(self, query_resp):
     """Build PageInfo objects for page entries whose key is not negative.

     Args:
         query_resp: Mapping holding the query result. Its 'pages'
             mapping is keyed by strings; entries whose key starts with
             '-' are skipped.

     Returns:
         A list of ``PageInfo.from_query`` results for the remaining
         entries. A response without 'pages' yields an empty list rather
         than a KeyError.
     """
     # NOTE(review): '-'-prefixed keys are presumably placeholder ids for
     # pages that do not exist -- confirm against the API being queried.
     # .items() replaces the Python 2-only .iteritems().
     return [PageInfo.from_query(pid_dict, source=self.source)
             for pid, pid_dict in query_resp.get('pages', {}).items()
             if not pid.startswith('-')]
Exemple #5
0
 def extract_results(self, query_resp):
     """Build PageInfo objects for every page entry, blanking negative ids.

     Entries whose key starts with "-" get their "pageid" overwritten
     with None before conversion. This mutates the dicts inside
     ``query_resp`` in place.

     Args:
         query_resp: Mapping holding the query result; its "pages"
             mapping is keyed by strings and holds per-page dicts.

     Returns:
         A list of ``PageInfo.from_query`` results, one per page entry.
         A response without "pages" yields an empty list rather than a
         KeyError.
     """
     ret = []
     # .items() replaces the Python 2-only .iteritems(). Only the inner
     # dicts are modified during iteration, never the mapping itself.
     for pid, pid_dict in query_resp.get("pages", {}).items():
         if pid.startswith("-"):
             pid_dict["pageid"] = None  # TODO: breaks consistency :/
         ret.append(PageInfo.from_query(pid_dict, source=self.source))
     return ret
Exemple #6
0
 def extract_results(self, query_resp):
     """Convert all page entries to PageInfo, clearing ids of '-' keys.

     For an entry whose key starts with '-', ``pid_dict['pageid']`` is
     set to None (in place, on the caller's data) before the entry is
     passed to ``PageInfo.from_query``.

     Args:
         query_resp: Mapping holding the query result, with a 'pages'
             mapping from string keys to per-page dicts.

     Returns:
         A list of the converted entries. Empty when 'pages' is missing
         (previously a KeyError) or has no entries.
     """
     pages = query_resp.get('pages', {})
     ret = []
     # .items() works on Python 2 and 3; .iteritems() is Python 2-only.
     for pid, pid_dict in pages.items():
         if pid.startswith('-'):
             pid_dict['pageid'] = None  # TODO: breaks consistency :/
         page_ident = PageInfo.from_query(pid_dict, source=self.source)
         ret.append(page_ident)
     return ret
Exemple #7
0
 def extract_results(self, query_resp):
     """Return PageInfo objects for page entries, skipping '-' keys.

     Args:
         query_resp: Mapping holding the query result. Entries of its
             'pages' mapping whose string key starts with '-' are
             ignored.

     Returns:
         A list of ``PageInfo.from_query`` results for the entries that
         were kept. Empty when 'pages' is missing (previously a
         KeyError).
     """
     ret = []
     # .items() replaces the Python 2-only .iteritems().
     for pid, pid_dict in query_resp.get('pages', {}).items():
         # NOTE(review): presumably '-' marks ids of nonexistent pages --
         # confirm against the API that produces query_resp.
         if pid.startswith('-'):
             continue
         ret.append(PageInfo.from_query(pid_dict, source=self.source))
     return ret
Exemple #8
0
 def extract_results(self, query_resp):
     """Build PageInfo objects, dropping entries that fail conversion.

     Args:
         query_resp: Mapping holding the query result; the values of its
             'pages' mapping are passed to ``PageInfo.from_query``.

     Returns:
         A list of the successfully converted entries. Entries for which
         ``PageInfo.from_query`` raises ValueError are skipped. Empty
         when 'pages' is missing (previously a KeyError).
     """
     ret = []
     # .values() replaces the Python 2-only .iteritems(); the key was unused.
     for pid_dict in query_resp.get('pages', {}).values():
         try:
             page_ident = PageInfo.from_query(pid_dict,
                                              source=self.source)
         except ValueError:
             # Best-effort: an entry that cannot be converted is ignored.
             continue
         ret.append(page_ident)
     return ret
Exemple #9
0
 def extract_results(self, query_resp):
     """Build PageInfo objects, blanking '-' ids and skipping bad entries.

     For an entry whose key starts with '-', ``pid_dict['pageid']`` is
     set to None in place before conversion. Entries for which
     ``PageInfo.from_query`` raises ValueError are skipped.

     Args:
         query_resp: Mapping holding the query result, with a 'pages'
             mapping from string keys to per-page dicts.

     Returns:
         A list of the successfully converted entries. Empty when
         'pages' is missing (previously a KeyError).
     """
     ret = []
     # .items() replaces the Python 2-only .iteritems(). Only the inner
     # dicts are mutated while iterating, not the mapping itself.
     for pid, pid_dict in query_resp.get('pages', {}).items():
         if pid.startswith('-'):
             pid_dict['pageid'] = None  # TODO: breaks consistency :/
         try:
             page_ident = PageInfo.from_query(pid_dict,
                                              source=self.source)
         except ValueError:
             # Best-effort: an entry that cannot be converted is ignored.
             continue
         ret.append(page_ident)
     return ret
Exemple #10
0
    def process_item(self, item, spider):
        """Insert or refresh the PageInfo row for a scraped item.

        If no PageInfo row has the item's ``currenturl``, a new row is
        added. Otherwise the rows with that URL are updated, but only when
        no PageInfo row at all carries the item's ``contentmd5``. The
        session is then committed; on any exception the error is printed
        and the session is rolled back.

        Args:
            item: Mapping providing the page fields read below.
            spider: Not used by this method.

        Returns:
            The ``item`` that was passed in, whether or not the write
            succeeded.
        """
        inserted_fields = (
            'baseurl', 'currenturl', 'content', 'fetchtime', 'contentmd5',
            'contenttype', 'prevfetchtime', 'domain_name', 'page_title',
            'page_body',
        )
        refreshed_fields = (
            'content', 'contentmd5', 'contenttype', 'page_title',
            'fetchtime', 'page_body',
        )
        try:
            url_known = self.session.query(exists().where(
                PageInfo.currenturl == item['currenturl'])).scalar()
            if url_known:
                # NOTE(review): this md5 check is not restricted to the
                # current URL, so identical content stored under another
                # URL also suppresses the update -- confirm this is intended.
                md5_known = self.session.query(exists().where(
                    PageInfo.contentmd5 == item['contentmd5'])).scalar()
                if not md5_known:
                    # The URL is already stored: update its row(s).
                    stored = self.session.query(PageInfo).filter(
                        PageInfo.currenturl == item['currenturl'])
                    stored.update(
                        {name: item[name] for name in refreshed_fields})
            else:
                # The URL is not stored yet: insert a new row.
                self.session.add(PageInfo(
                    **{name: item[name] for name in inserted_fields}))
            self.session.commit()
        except Exception as e:
            print(e)
            self.session.rollback()
        return item
Exemple #11
0
 def extract_results(self, query_resp):
     """Turn each page entry of a query response into a PageInfo.

     Args:
         query_resp: Mapping holding the query result; the values of its
             optional 'pages' mapping are passed to
             ``PageInfo.from_query``.

     Returns:
         A list with one ``PageInfo.from_query`` result per page entry;
         empty when 'pages' is absent or empty.
     """
     page_dicts = query_resp.get('pages', {}).values()
     return [PageInfo.from_query(entry, source=self.source)
             for entry in page_dicts]