コード例 #1
0
 def parse1(self, response):
     """Poll an ad-feed endpoint in a loop.

     Checks that the response body is valid JSON (item extraction is
     not implemented yet — see TODO), then re-issues the identical
     request via ``meta`` so the endpoint is polled continuously.

     :param response: scrapy Response whose ``meta`` carries
         ``url``, ``id`` and ``header`` (a stringified headers dict).
     :yields: a follow-up ``scrapy.Request`` to the same URL.
     """
     if check_json_format(response.body.decode("utf-8")):
         # TODO: parse the JSON payload and yield items here.
         pass
     back_url = response.meta['url']
     back_id = str(response.meta['id'])
     back_header = str(response.meta['header'])
     # literal_eval only accepts Python literals; eval() would execute
     # arbitrary code smuggled into the stringified header dict.
     from ast import literal_eval
     yield scrapy.Request(
         url=back_url,
         headers=literal_eval(back_header),
         callback=self.parse1,
         priority=0,
         dont_filter=True,
         meta={
             'dont_redirect': True,
             'id': back_id,
             'header': back_header,
             'url': back_url,
             # Treat these statuses as normal responses instead of
             # letting the retry/redirect middlewares eat them.
             'handle_httpstatus_list':
                 [302, 404, 403, 407, 500, 502, 503, 504, 408, 416, 400],
         })
コード例 #2
0
 def parse1(self, response):
     """Extract Sina app-feed ads from a JSON response and re-poll.

     Yields one ``SinaAppItem`` per entry under ``data.ad.feed``, then
     re-issues the identical request so the endpoint is polled again.

     :param response: scrapy Response whose ``meta`` carries
         ``url``, ``id`` and ``header`` (a stringified headers dict).
     :yields: ``SinaAppItem`` instances, then a follow-up request.
     """
     # BUG FIX: the original `html = response.body.text()` was removed —
     # response.body is bytes and has no .text() method, so it raised
     # AttributeError on every call; the variable was also unused.
     body = response.body.decode("utf-8")
     if check_json_format(body):
         jsonresponse = json.loads(body)
         data = jsonresponse['data']
         if 'ad' in data:
             for n in data['ad']['feed']:
                 item = SinaAppItem()
                 for key in ('pos', 'newsId', 'title', 'link', 'pic',
                             'showTag', 'articlePreload', 'adid',
                             'dislikeTags'):
                     item[key] = str(n.get(key, ""))
                 # BUG FIX: guard on the key actually read. The original
                 # checked 'commentStatus' in n but then accessed
                 # n['commentCountInfo'], which could raise KeyError.
                 if 'commentCountInfo' in n:
                     item['commentStatus'] = str(
                         n['commentCountInfo']['commentStatus'])
                 else:
                     item['commentStatus'] = ''
                 item['parameter_id'] = str(response.meta['id'])
                 yield item
     back_url = response.meta['url']
     back_id = str(response.meta['id'])
     back_header = str(response.meta['header'])
     # literal_eval only accepts Python literals; eval() would execute
     # arbitrary code smuggled into the stringified header dict.
     from ast import literal_eval
     yield scrapy.Request(
         url=back_url,
         headers=literal_eval(back_header),
         callback=self.parse1,
         priority=0,
         dont_filter=True,
         meta={
             'dont_redirect': True,
             'id': back_id,
             'header': back_header,
             'url': back_url,
             # Accept these statuses so error pages still reach us.
             'handle_httpstatus_list':
                 [302, 404, 403, 407, 500, 502, 503, 504, 408, 416, 400],
         })
コード例 #3
0
 def parse(self, response):
     """Collect ad entries from the Fenghuang feed and keep polling.

     Yields a ``FenghuangAppItem`` for every feed entry whose
     ``style.attribute`` marks it as an ad ("广告"), then re-schedules
     the same request so the endpoint is polled again.
     """
     body = response.body.decode("utf-8", "ignore")
     if check_json_format(body):
         for advert in json.loads(body)[0]["item"]:
             # Guard clauses replace the original nested-if pyramid.
             if "style" not in advert:
                 continue
             style = advert["style"]
             if "attribute" not in style:
                 continue
             if "广告" not in style["attribute"]:
                 continue
             item = FenghuangAppItem()
             for key in ("thumbnail", "title", "appSource", "intro",
                         "adId", "adPositionId", "type", "source"):
                 item[key] = str(advert[key])
             item['weburl'] = str(advert["link"].get("weburl", ""))
             item['view'] = str(style.get("view", ""))
             item['images'] = str(style.get("images", ""))
             item['parameter_id'] = str(response.meta['id'])
             yield item
     back_url = response.meta['url']
     back_id = str(response.meta['id'])
     back_header = str(response.meta['header'])
     # NOTE(review): unlike the sibling callbacks, this request is
     # re-issued WITHOUT headers=; the header string only rides along
     # in meta. Confirm that dropping the headers here is intended.
     yield scrapy.Request(
         url=back_url,
         callback=self.parse,
         priority=0,
         dont_filter=True,
         meta={
             'dont_redirect': True,
             'id': back_id,
             'header': back_header,
             'url': back_url,
             'handle_httpstatus_list':
                 [302, 404, 403, 407, 500, 502, 503, 504, 408, 416, 400],
         })
コード例 #4
0
 def parse(self, response):
     """Extract Souhu app ads from the recommendArticles feed, re-poll.

     Yields a ``SouhuAppItem`` for every recommended article whose
     ``iconText`` equals "广告" and whose payload is not an error slot,
     then re-issues the identical request so the endpoint is polled
     continuously.

     :param response: scrapy Response whose ``meta`` carries
         ``url``, ``id`` and ``header`` (a stringified headers dict).
     :yields: ``SouhuAppItem`` instances, then a follow-up request.
     """
     body = response.body.decode("utf-8")
     if check_json_format(body):
         jsonresponse = json.loads(body)
         for ad in jsonresponse.get('recommendArticles', []):
             if ad.get('iconText') != '广告':
                 continue
             data = ad['data']
             # Upstream flags broken slots with error == '1'; skip them
             # (the original had an empty pass branch for this case).
             if 'error' in data and str(data['error']) == '1':
                 continue
             item = SouhuAppItem()
             # Fields copied from the ad's 'data' payload.
             for key in ('monitorkey', 'resource', 'resource2', 'weight',
                         'itemspaceid', 'resource1', 'impressionid',
                         'special', 'offline', 'adid', 'viewmonitor',
                         'size', 'online', 'position', 'tag'):
                 item[key] = str(data.get(key, ""))
             # Fields copied from the ad entry itself.
             for key in ('editNews', 'statsType', 'isPreload', 'newsType',
                         'gbcode', 'commentNum', 'isHasSponsorships',
                         'recomTime', 'newsId', 'iconNight', 'isWeather',
                         'isRecom', 'iconText', 'link', 'iconDay',
                         'abposition', 'adType', 'playTime', 'adp_type',
                         'isFlash', 'isHasTv', 'newschn'):
                 item[key] = str(ad.get(key, ""))
             item['parameter_id'] = str(response.meta['id'])
             yield item
     back_url = response.meta['url']
     back_id = str(response.meta['id'])
     back_header = str(response.meta['header'])
     # literal_eval only accepts Python literals; eval() would execute
     # arbitrary code smuggled into the stringified header dict.
     from ast import literal_eval
     yield scrapy.Request(
         url=back_url,
         headers=literal_eval(back_header),
         callback=self.parse,
         priority=0,
         dont_filter=True,
         meta={
             'dont_redirect': True,
             'id': back_id,
             'header': back_header,
             'url': back_url,
             # Accept these statuses so error pages still reach us.
             'handle_httpstatus_list':
                 [302, 404, 403, 407, 500, 502, 503, 504, 408, 416, 400],
         })
コード例 #5
0
 def parse(self, response):
     """Extract Toutiao feed ads and keep polling the endpoint.

     Each entry of the top-level ``data`` list carries a JSON-encoded
     ``content`` payload; entries whose ``label`` contains "广告" are
     converted into ``ToutiaoAppItem`` instances. Afterwards the same
     request is re-issued so the feed is polled continuously.

     :param response: scrapy Response whose ``meta`` carries
         ``url``, ``id`` and ``header`` (a stringified headers dict).
     :yields: ``ToutiaoAppItem`` instances, then a follow-up request.
     """
     body = response.body.decode("utf-8", "ignore")
     if check_json_format(body):
         jsonresponse = json.loads(body)
         for n in jsonresponse['data']:
             dict_news = json.loads(n['content'])
             if '广告' not in str(dict_news.get('label', '')):
                 continue
             item = ToutiaoAppItem()
             # 'article_tpye' reproduces an upstream/item-schema typo;
             # renaming the key would break the item class.
             for key in (
                     'abstract', 'action_list', 'aggr_type',
                     'allow_download', 'article_sub_type', 'article_tpye',
                     'article_url', 'ban_comment', 'behot_time',
                     'bury_count', 'cell_flag', 'cell_layout_style',
                     'cell_type', 'comment_count', 'digg_count',
                     'display_url', 'filter_words', 'group_flags',
                     'has_video', 'hot', 'ignore_web_transform',
                     'is_subject', 'item_id', 'item_version', 'label',
                     'label_style', 'large_image_list', 'level', 'log_pb',
                     'natant_level', 'preload_web', 'publish_time',
                     'raw_ad_data', 'read_count', 'repin_count', 'rid',
                     'share_count', 'share_info', 'share_url',
                     'show_dislike', 'show_portrait',
                     'show_portrait_article', 'source', 'source_avatar',
                     'tag', 'group_id', 'tag_id', 'title', 'url',
                     'user_repin', 'user_verified', 'video_detail_info',
                     'video_duration', 'video_id', 'video_play_info',
                     'video_proportion_article', 'video_style'):
                 item[key] = str(dict_news.get(key, ""))
             # content_decoration is deliberately stored un-stringified,
             # matching the original assignment.
             item['content_decoration'] = dict_news.get(
                 'content_decoration', "")
             # BUG FIX: the original did eval(str(...)) over a value that
             # json.loads already parsed — a pointless round-trip through
             # eval — and crashed with IndexError on an empty list.
             images = dict_news.get('large_image_list')
             if images:
                 item['image_urls'] = images[0]['url']
             # NOTE(review): not str()-wrapped, unlike sibling spiders —
             # kept as-is to preserve the stored type.
             item['parameter_id'] = response.meta['id']
             yield item
     back_url = response.meta['url']
     back_id = str(response.meta['id'])
     back_header = str(response.meta['header'])
     # literal_eval only accepts Python literals; eval() would execute
     # arbitrary code smuggled into the stringified header dict.
     from ast import literal_eval
     yield scrapy.Request(
         url=back_url,
         headers=literal_eval(back_header),
         callback=self.parse,
         priority=0,
         dont_filter=True,
         meta={
             'dont_redirect': True,
             'id': back_id,
             'header': back_header,
             'url': back_url,
             # Accept these statuses so error pages still reach us.
             'handle_httpstatus_list':
                 [302, 404, 403, 407, 500, 502, 503, 504, 408, 416, 400],
         })