import datetime
import json
import logging


def parse(self, response):
    try:
        # A 404 means the page no longer exists; skip it quietly.
        if response.status == 404:
            return
        r = json.loads(response.body)
        for each in r:
            aid = each['aid']
            author = each['name']
            mid = each['mid']
            view = each['playTotal']
            favorite = each['favoritesTotal']
            danmaku = each['danmakuTotal']
            # This endpoint does not expose coin/share/like counts.
            coin = None
            share = None
            like = None
            # The crawl date arrives via request meta as 'YYYYMMDD'.
            date = response.meta['date']
            date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = None
            tid = None
            title = each['title']
            # 'created' looks like '2018-01-01T00:00:00.000'; strip the
            # trailing milliseconds before parsing.
            date = each['created']
            date = datetime.datetime.strptime(date[0:-5], '%Y-%m-%dT%H:%M:%S')
            pic = 'http:' + each['pic']

            # Skip videos posted by the official bangumi accounts.
            if author == '腾讯动漫' or author == '哔哩哔哩番剧':
                continue

            item = VideoItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date

            # Look up the stored record for this video; a single query suffices.
            d = self.coll.find_one({'aid': aid})
            flag = 0
            if d is not None and 'data' in d:
                if 'subChannel' in d:
                    item['subChannel'] = d['subChannel']
                if 'channel' in d:
                    item['channel'] = d['channel']
                for each_data in d['data']:
                    data_date = each_data['datetime'].strftime("%Y-%m-%d")
                    if data_date == date_str:
                        flag = 1
                        break
            # If a snapshot for this date already exists, yield without a new
            # data point so only the current_* fields get refreshed.
            if flag != 0:
                item['data'] = None
            yield item
    except Exception as error:
        # Swallowing the exception silently hides failures; at least log it.
        logging.error("Error occurred while parsing in the video spider")
        logging.error(error)
import datetime
import json
import logging


def parse(self, response):
    # Bind item up front so the error handler below can always reference it.
    item = None
    try:
        if response.status == 404:
            return
        r = json.loads(response.body)
        for each in r:
            aid = each['aid']
            author = each['name']
            mid = each['mid']
            view = each['playTotal']
            favorite = each['favoritesTotal']
            danmaku = each['danmakuTotal']
            # This endpoint does not expose coin/share/like counts.
            coin = None
            share = None
            like = None
            # The crawl date arrives via request meta as 'YYYYMMDD'.
            date = response.meta['date']
            date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = None
            tid = None
            title = each['title']
            date = each['created']
            pic = 'http:' + each['pic']

            # Skip videos posted by the official bangumi accounts.
            if author == '腾讯动漫' or author == '哔哩哔哩番剧':
                continue

            item = VideoItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date

            # Look up the stored record for this video; a single query suffices.
            d = self.coll.find_one({'aid': aid})
            flag = 0
            if d is not None and 'data' in d:
                if 'subChannel' in d:
                    item['subChannel'] = d['subChannel']
                if 'channel' in d:
                    item['channel'] = d['channel']
                for each_data in d['data']:
                    data_date = each_data['datetime'].strftime("%Y-%m-%d")
                    if data_date == date_str:
                        flag = 1
                        break
            # Only yield when no snapshot exists yet for this date.
            if flag == 0:
                yield item
    except Exception as error:
        # Mail an alert and log the error details when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Error occurred while parsing in the video spider")
        logging.error(item)
        logging.error(response.url)
        logging.error(error)
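Both variants above guard against duplicate snapshots by scanning the stored `data` array for an entry from the same calendar day. Below is a minimal sketch of that check pulled out as a standalone helper, assuming the document shape used above; `has_snapshot_for_date` is a hypothetical name, not something the repo defines.

from datetime import datetime


def has_snapshot_for_date(doc, date_str):
    """Return True if the stored document already holds a data point
    whose 'datetime' falls on the given 'YYYY-MM-DD' day."""
    if doc is None or 'data' not in doc:
        return False
    return any(each['datetime'].strftime("%Y-%m-%d") == date_str
               for each in doc['data'])


# Example: a record crawled earlier the same day counts as a duplicate.
doc = {'data': [{'view': 100, 'datetime': datetime(2019, 1, 1, 8, 30)}]}
print(has_snapshot_for_date(doc, '2019-01-01'))  # True
print(has_snapshot_for_date(doc, '2019-01-02'))  # False

Keeping the check pure like this makes it easy to unit-test without a live MongoDB connection.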
import json
import logging
from datetime import datetime, timedelta


def parse(self, response):
    # Bind these up front so the error handler can always reference them.
    r = {}
    item = None
    try:
        self.task.crawl_count += 1
        r = json.loads(response.body)
        d = r["data"]
        for each_key in d:
            aid = d[each_key]['stat']['aid']
            author = d[each_key]['owner']['name']
            mid = d[each_key]['owner']['mid']
            view = d[each_key]['stat']['view']
            favorite = d[each_key]['stat']['favorite']
            danmaku = d[each_key]['stat']['danmaku']
            coin = d[each_key]['stat']['coin']
            share = d[each_key]['stat']['share']
            like = d[each_key]['stat']['like']
            # Timestamps are stored in UTC+8 (Beijing time).
            current_date = datetime.utcnow() + timedelta(hours=8)
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = d[each_key]['tname']
            title = d[each_key]['title']
            date = d[each_key]['pubdate']
            tid = d[each_key]['tid']
            pic = d[each_key]['pic']

            item = VideoItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date

            # Map the sub-channel (tname) to its parent channel. Videos filed
            # under '资讯' (news) are reassigned by tid, so check that case
            # before the generic lookup.
            if subChannel == '资讯':
                if tid == 51:
                    item['channel'] = '番剧'
                elif tid == 170:
                    item['channel'] = '国创'
                elif tid == 159:
                    item['channel'] = '娱乐'
                else:
                    item['channel'] = sub_channel_2_channel[subChannel]
            elif subChannel != '':
                item['channel'] = sub_channel_2_channel[subChannel]
            else:
                item['channel'] = None

            # The object id rides in the second query parameter, if present.
            url_list = response.url.split('&')
            if len(url_list) == 2:
                item['object_id'] = url_list[1]
            else:
                item['object_id'] = None
            yield item
    except Exception as error:
        self.task.crawl_failed += 1
        # code -404 means the video has been deleted; not worth logging.
        if r.get('code') == -404:
            return
        logging.error("Error occurred while parsing in the video spider")
        logging.error(item)
        logging.error(response.url)
        logging.error(error)
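The channel-resolution branch is the part that changes between spider versions, so it helps to see it in isolation. Here is a sketch under the same assumptions as `parse` above; `resolve_channel` and the example `mapping` entries are illustrative, while the real `sub_channel_2_channel` table lives elsewhere in the repo.

def resolve_channel(sub_channel, tid, mapping):
    """Map a sub-channel name (tname) to its parent channel, mirroring the
    '资讯' (news) special cases in parse() above."""
    if sub_channel == '资讯':
        special = {51: '番剧', 170: '国创', 159: '娱乐'}
        if tid in special:
            return special[tid]
        return mapping.get(sub_channel)
    if sub_channel != '':
        return mapping.get(sub_channel)
    return None


# Illustrative mapping entries only; the real table is much larger.
mapping = {'资讯': '生活', '手机游戏': '游戏'}
print(resolve_channel('资讯', 51, mapping))     # '番剧'
print(resolve_channel('手机游戏', 0, mapping))  # '游戏'
print(resolve_channel('', 0, mapping))          # None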
def parse(self, response): try: r = json.loads(response.body) d = r["data"] keys = list(d.keys()) for each_key in keys: aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] mid = d[each_key]['owner']['mid'] view = d[each_key]['stat']['view'] favorite = d[each_key]['stat']['favorite'] danmaku = d[each_key]['stat']['danmaku'] coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] current_date = datetime.now() data = { 'view': view, 'favorite': favorite, 'danmaku': danmaku, 'coin': coin, 'share': share, 'like': like, 'datetime': current_date } subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] tid = d[each_key]['tid'] pic = d[each_key]['pic'] item = VideoItem() item['current_view'] = view item['current_favorite'] = favorite item['current_danmaku'] = danmaku item['current_coin'] = coin item['current_share'] = share item['current_like'] = like item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid item['pic'] = pic item['author'] = author item['data'] = data item['title'] = title item['subChannel'] = subChannel item['datetime'] = date if subChannel != '': if (subChannel not in sub_channel_2_channel): item['channel'] = '未知' else: item['channel'] = sub_channel_2_channel[subChannel] elif subChannel == '资讯': if tid == 51: item['channel'] == '番剧' if tid == 170: item['channel'] == '国创' if tid == 159: item['channel'] == '娱乐' else: item['channel'] = None yield item except Exception as error: # 出现错误时打印错误日志 if r['code'] == -404: return mailer.send( to=["*****@*****.**"], subject="BiliobSpiderError", body="{}\n{}\n{}".format(item, response.url, error), ) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) logging.error(error)