# Both snippets are parse() methods of a scrapy.Spider subclass and assume
# these module-level imports:
import json
import logging

import scrapy


def parse(self, response):
     info = json.loads(response.body.decode('utf-8'))
     logging.debug(json.dumps(info))
     if 'items' not in info:
         # No result payload: hand the request's meta to the error handler and stop.
         self.err_after(response.meta)
         return
     item_info = info['items']
     keyword = response.meta['keyword']
     sort = response.meta['sort']  # rank of the last product on the previous page
     p_time = response.meta['p_time']
     item_list = []
     page = response.meta['page']
     proxy = response.meta['proxy']
     # If data came back, process it.
     if len(item_info) > 0:
         for value in item_info:
             sort = sort + 1
             # Check whether the listing is a promoted (ad) item.
             suggest_keyword = ''
             if 'ad' in value:
                 mall_id = value['ad']['mall_id']
                 is_ad = 1
             else:
                 mall_id = 0
                 is_ad = 0
             goods_info = value
             goods_info['keyword'] = keyword
             goods_info['sort'] = sort
             goods_info['p_time'] = p_time
             goods_info['mall_id'] = mall_id
             goods_info['is_ad'] = is_ad
             goods_info['suggest_keyword'] = suggest_keyword
             item_list.append(goods_info)
         # Emit the rankings of all products under this keyword.
         item = KeywordGoodsList()
         item['goods_list'] = item_list
         item['page'] = page
         yield item
         page += 1  # Data returned: advance the page; if nothing is returned, the same page is re-crawled.
         if page <= self.max_page:
             url = self.build_search_url(page, self.size, keyword, '')
             headers = self.make_headers()
             meta = {
                 'proxy': proxy,
                 'page': page,
                 'keyword': keyword,
                 'sort': sort,
                 'p_time': p_time
             }
             yield scrapy.Request(url,
                                  meta=meta,
                                  callback=self.parse,
                                  headers=headers,
                                  dont_filter=True,
                                  errback=self.errback_httpbin)
Example #2
def parse(self, response):
     info = json.loads(response.body.decode('utf-8'))
     item_info = info.get('items', [])  # unlike Example #1, tolerate a missing 'items' key
     keyword = response.meta['keyword']
     sort = response.meta['sort']  # rank of the last product on the previous page
     item_list = []
     page = response.meta['page']
     proxy = response.meta['proxy']
     # If data came back, process it.
     if len(item_info) > 0:
         for value in item_info:
             sort = sort + 1
             # Check whether the listing is a promoted (ad) item.
             suggest_keyword = ''
             if 'ad' in value:
                 mall_id = value['ad']['mall_id']
                 is_ad = 1
             else:
                 mall_id = 0
                 is_ad = 0
             item_list.append({
                 'keyword': keyword,
                 'sort': sort,
                 'goods_id': value['goods_id'],
                 'p_time': self.p_time,
                 'mall_id': mall_id,
                 'is_ad': is_ad,
                 'suggest_keyword': suggest_keyword
             })
         item = KeywordGoodsList()
         # For compatibility, record the current page before advancing it.
         item['page'] = page
         page += 1  # Data returned: advance the page; if nothing is returned, the same page is re-crawled.
         # The rankings of all products under this keyword.
         item['goods_list'] = item_list
         yield item
         if page <= self.max_page:
             url = self.build_search_url(page, self.size, keyword)
             headers = self.make_headers()
             meta = {
                 'proxy': proxy,
                 'page': page,
                 'keyword': keyword,
                 'sort': sort
             }
             yield scrapy.Request(url,
                                  meta=meta,
                                  callback=self.parse,
                                  headers=headers,
                                  dont_filter=True,
                                  errback=self.errback_httpbin)