def parse(self, response):
    """Parse a Xigua feed response and probe qualifying videos in the browser.

    Every feed entry after the first two is decoded and its fields are copied
    onto the shared ``item`` taken from ``response.meta['item']``.  Entries
    that reach the view or comment threshold and that
    ``Iduoliao.redis_check`` reports as new are opened in ``self.broser``;
    if a ``<video>`` element becomes visible its ``src`` is printed.

    Entries whose ``display_url`` does not have the expected
    ``http://toutiao.com/group/<id>/`` shape are logged and skipped instead
    of aborting the whole batch.  The browser is shut down when the method
    finishes, whether or not an error occurred.

    Args:
        response: Response whose ``text`` is the JSON feed and whose
            ``meta['item']`` carries the thresholds
            (``view_cnt_compare`` / ``cmt_cnt_compare``).
    """
    isotimeformat = '%Y-%m-%d'
    try:
        try:
            item = response.meta['item']
            video_info = json.loads(response.text)['data']
            # The first two feed entries are skipped.
            for video in video_info[2:]:
                video = json.loads(video['content'])
                url = video['display_url']

                group = re.search(r'http://toutiao\.com/group/(.*)/', url)
                if group is None:
                    Print.error('西瓜视频爬虫错误: 无法识别的视频地址 {}'.format(url))
                    continue

                # video_play_info is itself a JSON string; decode it once.
                play_info = json.loads(
                    video['video_play_info'])['video_list']['video_1']

                item['id'] = video['group_id']
                item['download_url'] = url
                item['like_cnt'] = video['video_like_count']
                item['cmt_cnt'] = video['comment_count']
                item['sha_cnt'] = video['share_count']
                item['view_cnt'] = video['video_detail_info']['video_watch_count']
                item['thumbnails'] = video['large_image_list'][0]['url']
                item['title'] = video['title']
                item['video_height'] = play_info['vheight']
                item['video_width'] = play_info['vwidth']
                item['spider_time'] = time.strftime(
                    isotimeformat, time.localtime(time.time()))
                item['from'] = '西瓜视频'
                item['url'] = 'https://www.ixigua.com/i' + group.group(1) + '/'
                # The md5 of the canonical page URL is the de-duplication key.
                item['osskey'] = hashlib.md5(
                    str(item['url']).encode()).hexdigest()

                popular = (item['view_cnt'] >= item['view_cnt_compare']
                           or item['cmt_cnt'] >= item['cmt_cnt_compare'])
                if not popular:
                    continue
                if Iduoliao.redis_check(item['osskey']) is not True:
                    continue

                self.broser.get(item['download_url'])
                if self.is_visible('//video') is True:
                    url = self.broser.find_element_by_xpath(
                        '//video').get_attribute("src")
                    print(url)
        finally:
            # Release the browser even when parsing failed part-way.
            self.broser.quit()
    except Exception as f:
        Print.error(f)
        print('错误所在的行号:', f.__traceback__.tb_lineno)
def parse(self, response):
    """Fetch the Piaoquan long-video list and upload every returned video.

    The incoming ``response`` is only used for ``response.meta['item']``;
    the video list itself is requested here with ``item['data']`` as the
    POST body.  Each video's fields are copied onto the shared ``item``,
    the item is printed, and the video is handed to ``Iduoliao.upload``.

    Request failures, malformed JSON and missing fields are logged through
    ``Print.error`` rather than raised.  A video whose ``videoPath`` does
    not contain an ``https://….m3u8`` address is logged and skipped.

    NOTE(review): the view/share threshold and the Redis de-duplication
    check were commented out in this parser, so every video in the list is
    uploaded on every run — confirm that this is intended.
    """
    isotimeformat = '%Y-%m-%d'
    item = response.meta['item']
    # Piaoquan endpoint that lists the videos of a category.
    url = 'https://longvideoapi.qingqu.top/longvideoapi/video/distribute/category/videoList'
    try:
        res = requests.post(
            url,
            headers=pq_headers,
            data=item['data'],
            timeout=30,
        )
        videos = json.loads(res.text)['data']
        for video in videos:
            # Keep the address up to and including ".m3u8", dropping any
            # query string that follows it.
            playlist = re.match(r'https://.*\.m3u8', video['videoPath'])
            if playlist is None:
                Print.error('票圈长视频爬虫错误: 无法识别的视频地址 {}'.format(
                    video['videoPath']))
                continue

            item['url'] = playlist.group()
            item['download_url'] = ''
            item['like_cnt'] = 0
            item['cmt_cnt'] = 0
            item['sha_cnt'] = video['shareCount']
            item['view_cnt'] = video['playCount']
            item['thumbnails'] = video['coverImg']['coverImgPath']
            try:
                item['title'] = video['title']
            except KeyError:
                # Some entries only carry a share title.
                item['title'] = video['shareTitle']
            item['id'] = video['id']
            item['video_height'] = video['height']
            item['video_width'] = video['width']
            item['spider_time'] = time.strftime(
                isotimeformat, time.localtime(time.time()))
            item['from'] = '票圈长视频'
            # The md5 of the playlist URL is the storage / de-duplication key.
            item['osskey'] = hashlib.md5(str(item['url']).encode()).hexdigest()
            print(item)

            Iduoliao.upload(item['url'], item['thumbnails'], item['osskey'],
                            '票圈长视频', item['title'], item['old_type'])
    except Exception as f:
        Print.error(f)
def redis_check(md5_name):
    """Register ``md5_name`` in the ``spider`` sorted set if it is new.

    The membership test and the insert are done with a single ``ZADD NX``
    command, so two concurrent callers cannot both be told that the same
    key is new.

    Args:
        md5_name: Key identifying a video (callers pass an md5 hex digest).

    Returns:
        True if the key was not present and has just been added with score
        10; False if it was already present or if Redis could not be
        reached (the error is logged through ``Print.error``).
    """
    try:
        redis_db = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
        # With nx=True existing members are left untouched and the reply is
        # the number of members that were actually added (0 or 1 here).
        added = redis_db.zadd('spider', {md5_name: 10}, nx=True)
    except Exception as f:
        Print.error(f)
        return False

    if not added:
        return False
    Print.info('添加 {} 到redis当中'.format(md5_name))
    return True
def parse(self, response):
    """Fetch the Xiaoniangao recommended-trends list and upload popular videos.

    The list is requested with ``item['data']`` (from
    ``response.meta['item']``) as the POST body.  Each entry is copied onto
    the shared item; entries whose view count reaches
    ``item['view_cnt_compare']`` and that ``Iduoliao.redis_check`` reports
    as new are passed to ``Iduoliao.upload``.  Any exception is logged
    through ``Print.error`` and ends the run.
    """
    date_format = '%Y-%m-%d'
    item = response.meta['item']
    endpoint = 'https://api.xiaoniangao.cn/trends/get_recommend_trends'
    try:
        reply = requests.post(endpoint, headers=xng_zf_headers,
                              proxies=self.proxies, data=item['data'],
                              timeout=30)
        entries = json.loads(reply.text)['data']['list']
        for entry in entries:
            item['url'] = item['download_url'] = entry['v_url']
            item['like_cnt'] = entry['favor']['total']
            item['cmt_cnt'] = 0
            item['sha_cnt'] = 0
            item['view_cnt'] = entry['views']
            item['thumbnails'] = entry['url']
            item['title'] = entry['title']
            item['id'] = entry['album_id']
            # NOTE(review): height is read from 'vw' and width from 'w' —
            # confirm these field names against the API.
            item['video_height'] = entry['vw']
            item['video_width'] = entry['w']
            item['spider_time'] = time.strftime(date_format)
            item['from'] = '小年糕祝福'
            # The md5 of the video URL is the storage / de-duplication key.
            item['osskey'] = hashlib.md5(str(item['url']).encode()).hexdigest()

            # Only sufficiently viewed, not yet seen videos are uploaded.
            if not (item['view_cnt'] >= item['view_cnt_compare']):
                continue
            if Iduoliao.redis_check(item['osskey']) is not True:
                continue
            Iduoliao.upload(item['url'], item['thumbnails'], item['osskey'],
                            '小年糕祝福', item['title'], item['old_type'])
    except Exception as err:
        Print.error('小年糕祝福爬虫错误:{}'.format(err))
def tangdou(self, response):
    """Parse a Tangdou list response and upload popular, unseen videos.

    Each entry of ``datas.list`` is copied onto the shared item from
    ``response.meta['item']``.  Entries whose ``hits_total`` reaches
    ``item['view_cnt_compare']`` and that ``Iduoliao.redis_check`` reports
    as new are passed to ``Iduoliao.upload``.

    Decoding the response now happens inside the error handler, so a
    malformed or unexpected payload is logged through ``Print.error``
    instead of escaping the callback.
    """
    isotimeformat = '%Y-%m-%d'
    item = response.meta['item']
    try:
        video_info = json.loads(response.text)['datas']['list']
        for video in video_info:
            # Page URL and download URL are the same CDN address.
            video_url = 'http://aqiniu.tangdou.com/' + video['videourl'] + '-20.mp4'
            item['url'] = video_url
            item['download_url'] = video_url
            item['like_cnt'] = 0
            item['cmt_cnt'] = 0
            item['sha_cnt'] = 0
            item['view_cnt'] = video['hits_total']
            item['thumbnails'] = 'https://aimg.tangdou.com' + video['pic']
            item['title'] = video['title']
            item['id'] = video['vid']
            # The list response carries no dimensions; they are stored as 0.
            item['video_height'] = 0
            item['video_width'] = 0
            item['spider_time'] = time.strftime(
                isotimeformat, time.localtime(time.time()))
            item['from'] = '糖豆'
            # The md5 of the video URL is the storage / de-duplication key.
            item['osskey'] = hashlib.md5(str(item['url']).encode()).hexdigest()

            if not (item['view_cnt'] >= item['view_cnt_compare']):
                continue
            if Iduoliao.redis_check(item['osskey']) is not True:
                continue
            Iduoliao.upload(item['url'], item['thumbnails'], item['osskey'],
                            '糖豆', item['title'], item['old_type'])
    except Exception as f:
        Print.error('糖豆爬虫错误:{}'.format(f))
def parse(self, response):
    """Fetch the Haokan feed and upload popular, unseen videos.

    The feed is requested with ``item['data']`` (from
    ``response.meta['item']``) as the POST body.  Each entry's ``content``
    is copied onto the shared item; qualifying entries that
    ``Iduoliao.redis_check`` reports as new are passed to
    ``Iduoliao.upload``.

    The request now has a 30 second timeout, and the request and JSON
    decoding happen inside the error handler, so a stalled or malformed
    reply is logged through ``Print.error`` instead of hanging or escaping
    the callback.
    """
    isotimeformat = '%Y-%m-%d'
    item = response.meta['item']
    url = 'https://sv.baidu.com/haokan/api?tn=1008350o&ctn=1008350o&imei=02B4B04B-2F2E-49DB-AF2D-AFFC79A3B0D2&cuid=3E8B5CD30DC5CF707754338AB6C6B1B408204C669OMPAQEKPQC&os=ios&osbranch=i0&ua=750_1334_326&ut=iPhone8%2C1_12.2&net_type=1&apiv=4.10.3.10&appv=1&version=4.10.3.10&life=1551235144&clife=1551235144&sids=2518_4-2540_1-2583_1-2627_2-2604_2-2635_1-2659_4-2665_2-2673_1-2685_1-2686_2-2691_2-2694_2-2697_2-2704_1-2717_3-2731_2-2732_4-2739_1-2743_2-2745_2-2498_1-2750_1-2753_1-2761_2-2772_1-2776_1-2782_2-2787_1-2796_1-2803_2&idfa=AB9793B9-CEE3-4EB2-9994-6DB2632BF4E6&hid=E0D63A86979B6633AB05F6AE72350416&log=vhk&location=&cmd=feed'
    try:
        res = requests.post(url, headers=hk_headers, proxies=self.proxies,
                            data=item['data'], timeout=30)
        video_info = json.loads(res.text)['feed']['data']['list']
        for video in video_info:
            content = video['content']
            item['url'] = ''
            item['download_url'] = content['video_src']
            item['like_cnt'] = content['praiseNum']
            item['cmt_cnt'] = content['comment_cnt']
            item['sha_cnt'] = 0
            item['view_cnt'] = content['playcnt']
            item['thumbnails'] = content['thumbnails']
            item['title'] = content['title']
            item['id'] = content['vid']
            item['video_height'] = content['height']
            item['video_width'] = content['width']
            item['spider_time'] = time.strftime(
                isotimeformat, time.localtime(time.time()))
            item['from'] = '好看视频'
            # item['url'] is empty for this source, so the md5 of the
            # download URL is used as the storage / de-duplication key.
            item['osskey'] = hashlib.md5(
                str(item['download_url']).encode()).hexdigest()

            # NOTE(review): the second clause compares sha_cnt (always 0
            # here) with cmt_cnt_compare; it looks like cmt_cnt was meant.
            # Behaviour is left unchanged — confirm before fixing.
            qualifies = (item['view_cnt'] >= item['view_cnt_compare']
                         or item['sha_cnt'] >= item['cmt_cnt_compare'])
            if not qualifies:
                continue
            if Iduoliao.redis_check(item['osskey']) is not True:
                continue
            Iduoliao.upload(item['download_url'], item['thumbnails'],
                            item['osskey'], '好看视频', item['title'],
                            item['old_type'])
    except Exception as f:
        Print.error(f)
def upload(url, img_url, filename, videofrom, title, old_type):
    """Download a scraped video and post-process it according to its source.

    ``videofrom`` selects the processing branch:

    * ``西瓜视频`` / ``糖豆`` / ``小年糕`` / ``好看视频``: download the video and
      its cover through ``IduoliaoTool``, probe the frame size, run
      ``IduoliaoTool.dewatermark`` with the source-specific rectangle, then
      delete the cover and size-probe files.  For ``糖豆`` this is only
      done when the video is wider than it is tall.
    * ``票圈长视频``: convert the remote playlist to
      ``./<videofrom>/<old_type>/<date>/<title>.mp4`` with ffmpeg.
    * ``开眼视频``: stream the file to
      ``Z:\\爬虫储存\\爬虫储存1.0\\<videofrom>\\<old_type>\\<date>\\<title>.mp4``.
    * ``UC浏览器`` / ``小年糕祝福``: plain download through
      ``IduoliaoTool.video_download`` without watermark removal.

    Any other value is ignored.

    NOTE: the OSS upload step (``IduoliaoTool.oss_upload`` of the processed
    video and its cover) is currently disabled; the watermark branches stop
    after ``dewatermark``.

    Args:
        url: Address the video is fetched from.
        img_url: Address of the cover image.
        filename: Base name passed to the ``IduoliaoTool`` download helpers.
        videofrom: Source site name, see above.
        title: Video title; forwarded to the helpers and used as the output
            file name in the ffmpeg and 开眼视频 branches.
        old_type: Original category; used as a directory component.
    """
    # Local import: only the ffmpeg branch needs it.
    import subprocess

    def _remove_if_present(path):
        """Delete ``path`` when a helper actually produced a file there."""
        # Helpers may hand back a falsy value on failure; os.path.exists()
        # must not be called with those.
        if path and os.path.exists(path):
            os.remove(path)

    def _today():
        """Return the current local date as YYYY-MM-DD."""
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))

    if videofrom == "西瓜视频":
        new_filename = IduoliaoTool.video_download(
            filename, url, title, old_type, videofrom, ifdewatermark=True)
        # Frame width/height are needed to position the watermark rectangle.
        size_filename, width, height = IduoliaoTool.get_video_size(url)
        img_filename = IduoliaoTool.img_download(img_url, filename)
        if new_filename and size_filename and img_filename:
            IduoliaoTool.dewatermark(width, height, 20, 200, 55, 204,
                                     new_filename, title, old_type, videofrom)
        _remove_if_present(img_filename)
        _remove_if_present(size_filename)

    elif videofrom == "票圈长视频":
        day = _today()
        os.makedirs('./{}/{}/{}'.format(videofrom, old_type, day), exist_ok=True)
        filename = './{}/{}/{}/{}'.format(videofrom, old_type, day, title) + '.mp4'
        # An argument list (no shell) keeps '&', spaces or quotes in the
        # scraped URL/title from being interpreted as shell syntax.
        try:
            subprocess.run(['ffmpeg', '-i', url, filename])
        except OSError as f:
            # e.g. ffmpeg is not installed; log instead of raising.
            Print.error(f)

    elif videofrom in ("UC浏览器", "小年糕祝福"):
        IduoliaoTool.video_download(
            filename, url, title, old_type, videofrom, ifdewatermark=False)

    elif videofrom == "糖豆":
        size_filename, width, height = IduoliaoTool.get_video_size(url)
        img_filename = None
        # Only landscape videos are downloaded and processed.
        if int(width) > int(height):
            new_filename = IduoliaoTool.video_download(
                filename, url, title, old_type, videofrom, ifdewatermark=True)
            img_filename = IduoliaoTool.img_download(img_url, filename)
            if new_filename and size_filename and img_filename:
                IduoliaoTool.dewatermark(width, height, 10, 100, 50, 110,
                                         new_filename, title, old_type,
                                         videofrom)
        _remove_if_present(img_filename)
        _remove_if_present(size_filename)

    elif videofrom == "开眼视频":
        day = _today()
        os.makedirs(
            'Z:\\爬虫储存\\爬虫储存1.0\\{}\\{}\\{}'.format(videofrom, old_type, day),
            exist_ok=True)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.3.2.1000 Chrome/30.0.1599.101 Safari/537.36"}
        filename = 'Z:\\爬虫储存\\爬虫储存1.0\\{}\\{}\\{}\\{}'.format(
            videofrom, old_type, day, title) + '.mp4'
        # The timeout keeps a stalled server from blocking the crawl.
        with closing(requests.get(url, stream=True, headers=headers,
                                  timeout=30)) as r:
            with open(filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
        Print.info('下载视频: {}'.format(filename))

    elif videofrom == "小年糕":
        # Unlike 糖豆, portrait videos are processed as well.
        size_filename, width, height = IduoliaoTool.get_video_size(url)
        new_filename = IduoliaoTool.video_download(
            filename, url, title, old_type, videofrom, ifdewatermark=True)
        img_filename = IduoliaoTool.img_download(img_url, filename)
        if new_filename and size_filename and img_filename:
            # The first rectangle argument is derived from the frame height.
            IduoliaoTool.dewatermark(width, height, int(height) - 70, 100, 50,
                                     120, new_filename, title, old_type,
                                     videofrom)
        _remove_if_present(img_filename)
        _remove_if_present(size_filename)

    elif videofrom == "好看视频":
        new_filename = IduoliaoTool.video_download(
            filename, url, title, old_type, videofrom, ifdewatermark=True)
        size_filename, width, height = IduoliaoTool.get_video_size(url)
        img_filename = IduoliaoTool.img_download(img_url, filename)
        if new_filename and size_filename and img_filename:
            IduoliaoTool.dewatermark(width, height, 10, 150, 50, 160,
                                     new_filename, title, old_type, videofrom)
        _remove_if_present(img_filename)
        _remove_if_present(size_filename)
def parse(self, response):
    """Parse a UC Browser feed response and store new videos in MySQL.

    Feed entries whose id is 20 characters long and whose first video has a
    non-zero ``view_cnt`` are collected into ``item['video_datas']``.  Each
    collected video is inserted as a ``Work`` row unless a row with the
    same ``url_md5`` already exists.  Afterwards the ``Url`` row with
    ``item['id']`` is marked with status ``"1"``.

    The database session and the engine created for this call are released
    when the method finishes, whether or not an error occurred.  Errors are
    logged through ``Print.error``.

    NOTE(review): the connection URL literal contains ``[email protected]``,
    which looks like a redacted credential/host — confirm the real value.
    NOTE(review): ``url_md5`` is the md5 of the *thumbnail* URL, not of the
    video URL; existing rows depend on that, so it is left unchanged.
    """
    isotimeformat = '%Y-%m-%d'
    item = response.meta['item']
    session = None
    engine = None
    try:
        json_data = json.loads(response.text)
        articles = json_data['data']['articles']
        video_datas = []
        for entry in json_data['data']['items']:
            if len(entry['id']) != 20:
                continue
            article = articles[entry['id']]
            first_video = article['videos'][0]
            if not first_video['view_cnt']:
                continue
            video_datas.append({
                # video id
                'id': article['id'],
                # video address
                'url': article['url'],
                # video title
                'title': article['title'],
                # category and original category share the same source field
                'category': article['category'][0],
                'old_type': article['category'][0],
                # cover image address
                'thumbnails': first_video['poster']['url'],
                'video_width': first_video['video_width'],
                'video_height': first_video['video_height'],
                # play count
                'view_cnt': first_video['view_cnt'],
                # comment count
                'cmt_cnt': article['cmt_cnt'],
                'from': 'UC浏览器',
                'spider_time': time.strftime(
                    isotimeformat, time.localtime(time.time())),
            })
        item['video_datas'] = video_datas

        # The attributes are still assigned for any code that reads them.
        self.engine = engine = create_engine(
            "mysql+pymysql://root:[email protected]/UC?charset=utf8")
        self.session = sessionmaker(self.engine)
        self.mySession = session = self.session()

        for video in video_datas:
            url_md5 = hashlib.md5(str(video['thumbnails']).encode()).hexdigest()
            existing = session.query(Work).filter_by(url_md5=url_md5).first()
            if existing is not None:
                pprint('视频已存在')
                continue
            print('添加视频:{}'.format(video['title']))
            session.add(Work(url=video['url'],
                             thumbnails=video['thumbnails'],
                             title=video['title'],
                             url_md5=url_md5,
                             video_height=video['video_height'],
                             video_width=video['video_width'],
                             status=0))
            session.commit()

        session.query(Url).filter(Url.id == item['id']).update(
            {"status": "1"})
        session.commit()
        # NOTE(review): this marks every Url with id < 1000000 as done on
        # each call, which makes the targeted update above redundant.  It
        # looks like a leftover; behaviour is kept — confirm before removing.
        session.query(Url).filter(Url.id < 1000000).update(
            {"status": "1"})
        session.commit()
    except Exception as f:
        Print.error('UC浏览器爬虫错误:{}'.format(f))
    finally:
        # close() rolls back anything uncommitted and returns the connection;
        # dispose() then closes the pool of this per-call engine.
        if session is not None:
            session.close()
        if engine is not None:
            engine.dispose()
def parse(self, response):
    """Parse a Xigua feed response and resolve qualifying videos via the parsing page.

    Every feed entry after the first two is decoded and copied onto the
    shared item from ``response.meta['item']``.  Entries that reach the view
    or comment threshold and that ``Iduoliao.redis_check`` reports as new are
    typed into ``self.url_box`` on the page driven by ``self.broser``; when a
    result link appears its ``href`` is passed to ``Iduoliao.upload``.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed.  In particular ``self.broser.quit()`` is assumed to run
    once after the loop — confirm against the real file.
    """
    try:
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']
        json_data = json.loads(response.text)
        video_info = json_data['data']
        # The first two feed entries are skipped.
        for video in video_info[2:]:
            # Each entry's 'content' is itself a JSON string.
            video = json.loads(video['content'])
            item['id'] = video['group_id']
            url = video['display_url']
            item['download_url'] = video['display_url']
            item['like_cnt'] = video['video_like_count']
            item['cmt_cnt'] = video['comment_count']
            item['sha_cnt'] = video['share_count']
            item['view_cnt'] = video['video_detail_info']['video_watch_count']
            item['thumbnails'] = video['large_image_list'][0]['url']
            item['title'] = video['title']
            # 'video_play_info' is a nested JSON string (decoded twice here).
            item['video_height'] = json.loads(video['video_play_info'])['video_list']['video_1']['vheight']
            item['video_width'] = json.loads(video['video_play_info'])['video_list']['video_1']['vwidth']
            item['spider_time'] = time.strftime(isotimeformat, time.localtime(time.time()))
            item['from'] = '西瓜视频'
            # Self-assignment: has no effect beyond requiring the key to exist.
            item['category'] = item['category']
            # Raises AttributeError (caught below) if display_url has no
            # '/group/<id>/' part, which ends processing of the whole feed.
            rep = re.search(r'http://toutiao.com/group/(.*)/', url).group(1)
            item['url'] = 'https://www.ixigua.com/i' + rep + '/'
            md = hashlib.md5()  # build an md5 hasher
            md.update(str(item['url']).encode())
            # The md5 of the page URL is the storage / de-duplication key.
            item['osskey'] = md.hexdigest()
            if item['view_cnt'] >= item['view_cnt_compare'] or item['cmt_cnt'] >= item['cmt_cnt_compare']:
                is_ture = Iduoliao.redis_check(item['osskey'])
                if is_ture is True:
                    try:
                        # Type the address to be resolved.
                        self.url_box.send_keys(item['url'])
                        # Click the parse button.
                        click_button = self.broser.find_element_by_css_selector('[class="nya-btn"]')
                        click_button.click()
                        # Check whether a parse-failure dialog appeared.
                        exists = self.is_visible('//*[@id="__layout"]/div/div[1]/div/div[2]/div[2]/button')
                        if exists is True:
                            click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
                            click_button.click()
                            self.url_box.clear()
                        # Check whether a result was obtained.
                        exists = self.is_visible('//*[@id="__layout"]/div/main/div[3]/fieldset[2]/legend/span')
                        if exists is True:
                            url = self.broser.find_element_by_xpath(
                                '//*[@id="__layout"]/div/main/div[3]/fieldset[2]/div/p/a').get_attribute('href')
                            # Start watermark removal and upload.
                            Iduoliao.upload(url, item['thumbnails'], item['osskey'], '西瓜视频', item['title'], item['old_type'])
                            self.url_box.clear()
                    except Exception as f:
                        # A failure for one video is printed and the loop goes on.
                        print(f)
        self.broser.quit()
    except Exception as f:
        Print.error(f)
        # Prints the label "line number of the error" followed by the line.
        print('错误所在的行号:', f.__traceback__.tb_lineno)
        # Check whether a parse-failure dialog is showing and dismiss it.
        # NOTE(review): the browser is not quit on this path — confirm intended.
        exists = self.is_visible('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[2]')
        if exists is True:
            click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
            click_button.click()
            self.url_box.clear()
        pass