def parse1(self, response):
    """Validate the JSON payload, then re-issue the identical request to keep polling.

    NOTE(review): the extraction branch is a stub (`pass  # yield item`) in this
    variant — only the self-re-scheduling request is live.

    :param response: scrapy Response whose meta carries 'url', 'id', 'header'.
    :yields: a scrapy.Request that calls back into this method (dont_filter=True
             so the duplicate filter never stops the polling loop).
    """
    import ast  # local import: stdlib literal parser for the stored header dict

    if check_json_format(response.body.decode("utf-8")):
        pass  # yield item
    back_url = response.meta['url']
    back_id = str(response.meta['id'])
    back_header = str(response.meta['header'])
    # ast.literal_eval replaces eval(): str(dict) only ever produces Python
    # literals, and literal_eval cannot execute arbitrary code if the meta
    # payload is ever attacker-influenced.
    yield scrapy.Request(
        url=back_url,
        headers=ast.literal_eval(back_header),
        callback=self.parse1,
        priority=0,
        dont_filter=True,
        meta={
            'dont_redirect': True,
            'id': back_id,
            'header': back_header,
            'url': back_url,
            'handle_httpstatus_list': [302, 404, 403, 407, 500, 502,
                                       503, 504, 408, 416, 400],
        })
def parse1(self, response):
    """Extract ad entries ('ad' -> 'feed') from a Sina feed JSON response,
    yield one SinaAppItem per entry, then re-issue the same request to poll.

    Fixes vs. original:
    - removed `html = response.body.text()`: bytes has no .text() method, so
      that (unused) line raised AttributeError on every call;
    - the commentStatus branch indexed n['commentCountInfo'] directly after
      only checking for 'commentStatus', which raised KeyError when the
      sibling key was absent — now a guarded .get() chain;
    - body decoded once instead of twice; eval() -> ast.literal_eval().
    """
    import ast  # stdlib literal parser for the stored header dict

    body = response.body.decode("utf-8")
    if check_json_format(body):
        jsonresponse = json.loads(body)
        data = jsonresponse['data']
        if 'ad' in data:
            for n in data['ad']['feed']:
                item = SinaAppItem()
                # Plain string fields: same str(n.get(key, "")) treatment.
                for key in ('pos', 'newsId', 'title', 'link', 'pic',
                            'showTag', 'articlePreload'):
                    item[key] = str(n.get(key, ""))
                if 'commentStatus' in n:
                    # NOTE(review): guard key and indexed key differ in the
                    # original ('commentStatus' vs 'commentCountInfo') —
                    # preserved, but made KeyError-safe.
                    item['commentStatus'] = str(
                        n.get('commentCountInfo', {}).get('commentStatus', ''))
                else:
                    item['commentStatus'] = ''
                for key in ('adid', 'dislikeTags'):
                    item[key] = str(n.get(key, ""))
                item['parameter_id'] = str(response.meta['id'])
                yield item
    back_url = response.meta['url']
    back_id = str(response.meta['id'])
    back_header = str(response.meta['header'])
    # literal_eval only accepts Python literals (all str(dict) produces)
    # and, unlike eval(), cannot execute code.
    yield scrapy.Request(
        url=back_url,
        headers=ast.literal_eval(back_header),
        callback=self.parse1,
        priority=0,
        dont_filter=True,
        meta={
            'dont_redirect': True,
            'id': back_id,
            'header': back_header,
            'url': back_url,
            'handle_httpstatus_list': [302, 404, 403, 407, 500, 502,
                                       503, 504, 408, 416, 400],
        })
def parse(self, response):
    """Extract adverts (style.attribute containing '广告') from a Fenghuang
    feed JSON response, yield one FenghuangAppItem each, then re-poll.

    NOTE(review): every sibling spider in this file passes
    headers=<parsed back_header> on the re-issued request; this one computed
    back_header but never used it — that omission looked accidental and the
    header is now forwarded (via ast.literal_eval, not eval). Confirm against
    the target site if header-less retries were deliberate.
    """
    import ast  # stdlib literal parser for the stored header dict

    body = response.body.decode("utf-8", "ignore")
    if check_json_format(body):
        jsonresponse = json.loads(body)
        for advert in jsonresponse[0]["item"]:
            # Guard clauses replace the original three-deep nested ifs.
            if "style" not in advert:
                continue
            style = advert["style"]
            if "attribute" not in style or "广告" not in style["attribute"]:
                continue
            item = FenghuangAppItem()
            # Required fields: direct indexing preserved (missing key is
            # a hard error, as in the original).
            for key in ("thumbnail", "title", "appSource", "intro",
                        "adId", "adPositionId", "type", "source"):
                item[key] = str(advert[key])
            item['weburl'] = str(advert["link"].get("weburl", ""))
            item['view'] = str(style.get("view", ""))
            item['images'] = str(style.get("images", ""))
            item['parameter_id'] = str(response.meta['id'])
            yield item
    back_url = response.meta['url']
    back_id = str(response.meta['id'])
    back_header = str(response.meta['header'])
    yield scrapy.Request(
        url=back_url,
        headers=ast.literal_eval(back_header),
        callback=self.parse,
        priority=0,
        dont_filter=True,
        meta={
            'dont_redirect': True,
            'id': back_id,
            'header': back_header,
            'url': back_url,
            'handle_httpstatus_list': [302, 404, 403, 407, 500, 502,
                                       503, 504, 408, 416, 400],
        })
def parse(self, response):
    """Extract advert entries (iconText == '广告') from a Souhu
    recommendArticles JSON response, yield one SouhuAppItem each, re-poll.

    The two long copy-paste runs of str(x.get(key, "")) assignments are
    compressed into key-tuple loops; insertion order of item fields is
    preserved. eval() on the stored header dict is replaced with
    ast.literal_eval(). Entries whose data carries error == '1' are skipped,
    as in the original.
    """
    import ast  # stdlib literal parser for the stored header dict

    # Fields copied from ad['data'] — order preserved from the original.
    _DATA_KEYS = (
        'monitorkey', 'resource', 'resource2', 'weight', 'itemspaceid',
        'resource1', 'impressionid', 'special', 'offline', 'adid',
        'viewmonitor', 'size', 'online', 'position', 'tag')
    # Fields copied from the ad entry itself — order preserved.
    _AD_KEYS = (
        'editNews', 'statsType', 'isPreload', 'newsType', 'gbcode',
        'commentNum', 'isHasSponsorships', 'recomTime', 'newsId',
        'iconNight', 'isWeather', 'isRecom', 'iconText', 'link',
        'iconDay', 'abposition', 'adType', 'playTime', 'adp_type',
        'isFlash', 'isHasTv', 'newschn')

    body = response.body.decode("utf-8")
    if check_json_format(body):
        jsonresponse = json.loads(body)
        if 'recommendArticles' in jsonresponse:
            for ad in jsonresponse['recommendArticles']:
                if ad.get('iconText') != '广告':
                    continue
                item = SouhuAppItem()
                data = ad['data']
                # Skip entries the server flagged as errored (error == '1').
                if 'error' in data and '1' == str(data['error']):
                    continue
                for key in _DATA_KEYS:
                    item[key] = str(data.get(key, ""))
                for key in _AD_KEYS:
                    item[key] = str(ad.get(key, ""))
                item['parameter_id'] = str(response.meta['id'])
                yield item
    back_url = response.meta['url']
    back_id = str(response.meta['id'])
    back_header = str(response.meta['header'])
    # literal_eval only accepts Python literals and cannot execute code.
    yield scrapy.Request(
        url=back_url,
        headers=ast.literal_eval(back_header),
        callback=self.parse,
        priority=0,
        dont_filter=True,
        meta={
            'dont_redirect': True,
            'id': back_id,
            'header': back_header,
            'url': back_url,
            'handle_httpstatus_list': [302, 404, 403, 407, 500, 502,
                                       503, 504, 408, 416, 400],
        })
def parse(self, response):
    """Extract advert entries (label containing '广告') from a Toutiao feed
    JSON response, yield one ToutiaoAppItem each, then re-issue the request.

    Fixes vs. original:
    - `eval(str(dict_news['large_image_list']))` removed: the value is already
      a parsed JSON list, so the round-trip was a no-op that could both
      execute code and break on str() output eval() can't re-read — the list
      is now used directly;
    - eval() on the stored header dict replaced with ast.literal_eval();
    - the two long copy-paste runs of str(dict_news.get(key, "")) assignments
      are compressed into key-tuple loops, field insertion order preserved.
    """
    import ast  # stdlib literal parser for the stored header dict

    # str()-wrapped fields, first run (order preserved from the original).
    # NOTE(review): 'article_tpye' looks like a typo for 'article_type' but
    # matches the ToutiaoAppItem field name — left as-is.
    _KEYS_A = (
        'abstract', 'action_list', 'aggr_type', 'allow_download',
        'article_sub_type', 'article_tpye', 'article_url', 'ban_comment',
        'behot_time', 'bury_count', 'cell_flag', 'cell_layout_style',
        'cell_type', 'comment_count')
    # str()-wrapped fields, second run (after raw content_decoration).
    _KEYS_B = (
        'digg_count', 'display_url', 'filter_words', 'group_flags',
        'has_video', 'hot', 'ignore_web_transform', 'is_subject',
        'item_id', 'item_version', 'label', 'label_style',
        'large_image_list', 'level', 'log_pb', 'natant_level',
        'preload_web', 'publish_time', 'raw_ad_data', 'read_count',
        'repin_count', 'rid', 'share_count', 'share_info', 'share_url',
        'show_dislike', 'show_portrait', 'show_portrait_article',
        'source', 'source_avatar', 'tag', 'group_id', 'tag_id', 'title',
        'url', 'user_repin', 'user_verified', 'video_detail_info',
        'video_duration', 'video_id', 'video_play_info',
        'video_proportion_article', 'video_style')

    body = response.body.decode("utf-8", "ignore")
    if check_json_format(body):
        jsonresponse = json.loads(body)
        for n in jsonresponse['data']:
            # Each entry's real payload is itself a JSON string.
            dict_news = json.loads(n['content'])
            if 'label' not in dict_news:
                continue
            if '广告' not in str(dict_news['label']):
                continue
            item = ToutiaoAppItem()
            for key in _KEYS_A:
                item[key] = str(dict_news.get(key, ""))
            # content_decoration is stored raw (no str()) — preserved.
            item['content_decoration'] = dict_news.get(
                'content_decoration', "")
            for key in _KEYS_B:
                item[key] = str(dict_news.get(key, ""))
            if 'large_image_list' in dict_news:
                # Already a parsed list of dicts; index it directly instead
                # of the original eval(str(...)) round-trip.
                item['image_urls'] = dict_news['large_image_list'][0]['url']
            # parameter_id intentionally not str()-wrapped (as original).
            item['parameter_id'] = response.meta['id']
            yield item
    back_url = response.meta['url']
    back_id = str(response.meta['id'])
    back_header = str(response.meta['header'])
    # literal_eval only accepts Python literals and cannot execute code.
    yield scrapy.Request(
        url=back_url,
        headers=ast.literal_eval(back_header),
        callback=self.parse,
        priority=0,
        dont_filter=True,
        meta={
            'dont_redirect': True,
            'id': back_id,
            'header': back_header,
            'url': back_url,
            'handle_httpstatus_list': [302, 404, 403, 407, 500, 502,
                                       503, 504, 408, 416, 400],
        })