Example #1
0
    def parse(self, response):
        """Parse a Xigua feed response and print the playable video source.

        Every feed entry after the first two is decoded and copied into the
        shared ``item`` taken from ``response.meta``. Entries that reach the
        view or comment threshold and that ``Iduoliao.redis_check`` reports
        as new are opened in the Selenium browser, and the ``src`` attribute
        of the page's ``<video>`` element is printed.

        Args:
            response: Response whose ``text`` is the JSON feed and whose
                ``meta['item']`` provides ``view_cnt_compare`` and
                ``cmt_cnt_compare``.

        Returns:
            None. Failures are reported through ``Print.error`` and are not
            re-raised.
        """
        isotimeformat = '%Y-%m-%d'
        try:
            try:
                item = response.meta['item']
                json_data = json.loads(response.text)
                video_info = json_data['data']

                # The first two feed entries are skipped.
                for video in video_info[2:]:
                    video = json.loads(video['content'])
                    display_url = video['display_url']
                    item['id'] = video['group_id']
                    item['download_url'] = display_url
                    item['like_cnt'] = video['video_like_count']
                    item['cmt_cnt'] = video['comment_count']
                    item['sha_cnt'] = video['share_count']
                    item['view_cnt'] = video['video_detail_info'][
                        'video_watch_count']
                    item['thumbnails'] = video['large_image_list'][0]['url']
                    item['title'] = video['title']
                    # The play info is itself a JSON string; decode it once
                    # and read both dimensions from the result.
                    play_info = json.loads(
                        video['video_play_info'])['video_list']['video_1']
                    item['video_height'] = play_info['vheight']
                    item['video_width'] = play_info['vwidth']
                    item['spider_time'] = time.strftime(
                        isotimeformat, time.localtime(time.time()))
                    item['from'] = '西瓜视频'

                    matched = re.search(r'http://toutiao.com/group/(.*)/',
                                        display_url)
                    if matched is None:
                        # Skip only this entry: calling .group() on a None
                        # match used to abort the whole batch.
                        Print.error(
                            'unexpected display_url, entry skipped: {}'.format(
                                display_url))
                        continue
                    item['url'] = ('https://www.ixigua.com/i' +
                                   matched.group(1) + '/')

                    # The MD5 of the canonical URL is the de-duplication key.
                    md = hashlib.md5()
                    md.update(str(item['url']).encode())
                    item['osskey'] = md.hexdigest()

                    if item['view_cnt'] >= item['view_cnt_compare'] or item[
                            'cmt_cnt'] >= item['cmt_cnt_compare']:
                        is_new = Iduoliao.redis_check(item['osskey'])
                        if is_new is True:
                            self.broser.get(item['download_url'])
                            if self.is_visible('//video') is True:
                                src = self.broser.find_element_by_xpath(
                                    '//video').get_attribute("src")
                                print(src)
            finally:
                # Release the browser on failure as well; previously an
                # exception anywhere above skipped quit().
                self.broser.quit()

        except Exception as f:
            Print.error(f)
            print('错误所在的行号:', f.__traceback__.tb_lineno)
Example #2
0
    def parse(self, response):
        """Fetch the Piaoquan long-video list and upload every listed video.

        Posts ``item['data']`` to the category video-list endpoint, copies
        each returned video into the shared ``item`` and hands it to
        ``Iduoliao.upload``.

        Args:
            response: Response whose ``meta['item']`` provides ``data`` (the
                POST body) and ``old_type``.

        Returns:
            None. Request, decoding and per-video errors are reported through
            ``Print.error`` and are not re-raised.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']

        # Piaoquan endpoint that returns the video list for a category.
        url = 'https://longvideoapi.qingqu.top/longvideoapi/video/distribute/category/videoList'
        try:
            # The request is inside the try so that network failures and
            # timeouts go through the same handler as decoding errors.
            res = requests.post(
                url,
                headers=pq_headers,
                data=item['data'],
                timeout=30,
            )
            videos = json.loads(res.text)['data']
            for video in videos:
                matched = re.match(r'https://.*.m3u8?', video['videoPath'])
                if matched is None:
                    # Skip only this video: calling .group() on a None match
                    # used to abort the whole batch.
                    Print.error(
                        'unexpected videoPath, video skipped: {}'.format(
                            video['videoPath']))
                    continue
                item['url'] = matched.group()
                item['download_url'] = ''
                item['like_cnt'] = 0
                item['cmt_cnt'] = 0
                item['sha_cnt'] = video['shareCount']
                item['view_cnt'] = video['playCount']
                item['thumbnails'] = video['coverImg']['coverImgPath']
                try:
                    item['title'] = video['title']
                except KeyError:
                    # Fall back to the share title when no title is present.
                    item['title'] = video['shareTitle']

                item['id'] = video['id']
                item['video_height'] = video['height']
                item['video_width'] = video['width']
                item['spider_time'] = time.strftime(
                    isotimeformat, time.localtime(time.time()))
                item['from'] = '票圈长视频'
                # The MD5 of the stream URL is the storage key.
                md = hashlib.md5()
                md.update(str(item['url']).encode())
                item['osskey'] = md.hexdigest()

                print(item)
                # NOTE: the view/share threshold and the Redis de-duplication
                # check are disabled here (they were commented out), so every
                # listed video is uploaded.
                Iduoliao.upload(item['url'], item['thumbnails'],
                                item['osskey'], '票圈长视频', item['title'],
                                item['old_type'])
        except Exception as f:
            Print.error(f)
Example #3
0
    def redis_check(md5_name):
        """Register ``md5_name`` in the ``spider`` sorted set if it is new.

        NOTE(review): the function takes no ``self``; presumably it is
        decorated as a static method above this view or is only ever called
        on the class - confirm.

        Args:
            md5_name: Member to register (presumably an MD5 hex digest,
                going by the name).

        Returns:
            True if the member was absent and has now been added with score
            10, False if it was already present, None if talking to Redis
            failed (the error is reported through ``Print.error``).
        """
        try:
            redis_db = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
            # ZADD with NX inserts only missing members and returns how many
            # were inserted, so the lookup and the insert are a single atomic
            # command rather than a ZRANK followed by a separate ZADD.
            added = redis_db.zadd('spider', {md5_name: 10}, nx=True)
            if added:
                Print.info('添加 {} 到redis当中'.format(md5_name))
                return True
            return False

        except Exception as f:
            Print.error(f)
            return None
Example #4
0
    def parse(self, response):
        """Fetch the Xiaoniangao recommend feed and upload popular videos.

        Posts ``item['data']`` to the recommend-trends endpoint, copies each
        returned video into the shared ``item`` and uploads those whose view
        count reaches ``item['view_cnt_compare']`` and that
        ``Iduoliao.redis_check`` reports as new.

        Args:
            response: Response whose ``meta['item']`` provides ``data``,
                ``view_cnt_compare`` and ``old_type``.

        Returns:
            None. Any error is reported through ``Print.error`` and is not
            re-raised.
        """
        date_format = '%Y-%m-%d'
        item = response.meta['item']
        api_url = 'https://api.xiaoniangao.cn/trends/get_recommend_trends'
        try:
            reply = requests.post(
                api_url,
                headers=xng_zf_headers,
                proxies=self.proxies,
                data=item['data'],
                timeout=30,
            )
            entries = json.loads(reply.text)['data']['list']

            for entry in entries:
                item['url'] = entry['v_url']
                item['download_url'] = entry['v_url']
                item['like_cnt'] = entry['favor']['total']
                item['cmt_cnt'] = 0
                item['sha_cnt'] = 0
                item['view_cnt'] = entry['views']
                item['thumbnails'] = entry['url']
                item['title'] = entry['title']
                item['id'] = entry['album_id']
                # NOTE(review): height is read from 'vw' and width from 'w';
                # confirm these keys against the API response schema.
                item['video_height'] = entry['vw']
                item['video_width'] = entry['w']
                item['spider_time'] = time.strftime(date_format,
                                                    time.localtime())
                item['from'] = '小年糕祝福'

                # The MD5 of the video URL is the de-duplication/storage key.
                item['osskey'] = hashlib.md5(
                    str(item['url']).encode()).hexdigest()

                # Only videos that reach the view threshold are considered.
                if not item['view_cnt'] >= item['view_cnt_compare']:
                    continue
                # Only keys that Redis has not seen before are uploaded.
                if Iduoliao.redis_check(item['osskey']) is not True:
                    continue
                Iduoliao.upload(item['url'], item['thumbnails'],
                                item['osskey'], '小年糕祝福', item['title'],
                                item['old_type'])

        except Exception as err:
            Print.error('小年糕祝福爬虫错误:{}'.format(err))
Example #5
0
    def tangdou(self, response):
        """Parse a Tangdou list response and upload popular videos.

        Copies every video in ``datas.list`` into the shared ``item`` and
        uploads those whose hit count reaches ``item['view_cnt_compare']``
        and that ``Iduoliao.redis_check`` reports as new.

        Args:
            response: Response whose ``text`` is the JSON list and whose
                ``meta['item']`` provides ``view_cnt_compare`` and
                ``old_type``.

        Returns:
            None. Decoding and per-video errors are reported through
            ``Print.error`` and are not re-raised.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']
        try:
            # Decoding is inside the try so that a malformed response is
            # reported through the same handler as per-video errors.
            json_data = json.loads(response.text)
            video_info = json_data['datas']['list']
            for video in video_info:
                # The page URL and the download URL are the same address.
                video_url = ('http://aqiniu.tangdou.com/' +
                             video['videourl'] + '-20.mp4')
                item['url'] = video_url
                item['download_url'] = video_url
                item['like_cnt'] = 0
                item['cmt_cnt'] = 0
                item['sha_cnt'] = 0
                item['view_cnt'] = video['hits_total']
                item['thumbnails'] = 'https://aimg.tangdou.com' + video['pic']
                item['title'] = video['title']
                item['id'] = video['vid']
                item['video_height'] = 0
                item['video_width'] = 0
                item['spider_time'] = time.strftime(
                    isotimeformat, time.localtime(time.time()))
                item['from'] = '糖豆'
                # The MD5 of the video URL is the de-duplication/storage key.
                md = hashlib.md5()
                md.update(str(item['url']).encode())
                item['osskey'] = md.hexdigest()
                # Only videos that reach the view threshold and are new to
                # Redis are uploaded.
                if item['view_cnt'] >= item['view_cnt_compare']:
                    is_new = Iduoliao.redis_check(item['osskey'])
                    if is_new is True:
                        Iduoliao.upload(item['url'], item['thumbnails'],
                                        item['osskey'], '糖豆', item['title'],
                                        item['old_type'])

        except Exception as f:
            Print.error('糖豆爬虫错误:{}'.format(f))
Example #6
0
    def parse(self, response):
        """Fetch the Haokan feed and upload videos that pass the thresholds.

        Posts ``item['data']`` to the feed endpoint, copies each returned
        video into the shared ``item`` and uploads those that reach the view
        or comment threshold and that ``Iduoliao.redis_check`` reports as
        new.

        Args:
            response: Response whose ``meta['item']`` provides ``data``,
                ``view_cnt_compare``, ``cmt_cnt_compare`` and ``old_type``.

        Returns:
            None. Request, decoding and per-video errors are reported through
            ``Print.error`` and are not re-raised.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']

        url = 'https://sv.baidu.com/haokan/api?tn=1008350o&ctn=1008350o&imei=02B4B04B-2F2E-49DB-AF2D-AFFC79A3B0D2&cuid=3E8B5CD30DC5CF707754338AB6C6B1B408204C669OMPAQEKPQC&os=ios&osbranch=i0&ua=750_1334_326&ut=iPhone8%2C1_12.2&net_type=1&apiv=4.10.3.10&appv=1&version=4.10.3.10&life=1551235144&clife=1551235144&sids=2518_4-2540_1-2583_1-2627_2-2604_2-2635_1-2659_4-2665_2-2673_1-2685_1-2686_2-2691_2-2694_2-2697_2-2704_1-2717_3-2731_2-2732_4-2739_1-2743_2-2745_2-2498_1-2750_1-2753_1-2761_2-2772_1-2776_1-2782_2-2787_1-2796_1-2803_2&idfa=AB9793B9-CEE3-4EB2-9994-6DB2632BF4E6&hid=E0D63A86979B6633AB05F6AE72350416&log=vhk&location=&cmd=feed'

        try:
            # The request is inside the try and has a timeout: without one a
            # stalled connection blocks forever, and a network error used to
            # escape this method unhandled.
            res = requests.post(url, headers=hk_headers, proxies=self.proxies,
                                data=item['data'], timeout=30)
            json_data = json.loads(res.text)
            video_info = json_data['feed']['data']['list']
            for video in video_info:
                content = video['content']
                item['url'] = ''
                item['download_url'] = content['video_src']
                item['like_cnt'] = content['praiseNum']
                item['cmt_cnt'] = content['comment_cnt']
                item['sha_cnt'] = 0
                item['view_cnt'] = content['playcnt']
                item['thumbnails'] = content['thumbnails']
                item['title'] = content['title']
                item['id'] = content['vid']
                item['video_height'] = content['height']
                item['video_width'] = content['width']
                item['spider_time'] = time.strftime(isotimeformat, time.localtime(time.time()))
                item['from'] = '好看视频'
                # item['url'] is empty here, so the download URL is hashed
                # to form the de-duplication/storage key.
                md = hashlib.md5()
                md.update(str(item['download_url']).encode())
                item['osskey'] = md.hexdigest()

                # The comment threshold is checked against the comment
                # count; it used to be compared with sha_cnt, which is
                # always 0 here, so that half of the filter never applied.
                if item['view_cnt'] >= item['view_cnt_compare'] or item['cmt_cnt'] >= item['cmt_cnt_compare']:
                    is_new = Iduoliao.redis_check(item['osskey'])
                    if is_new is True:
                        Iduoliao.upload(item['download_url'], item['thumbnails'], item['osskey'], '好看视频', item['title'],
                                        item['old_type'])

        except Exception as f:
            Print.error(f)
Example #7
0
    def parse(self, response):
        """Parse a UC Browser feed response and store unseen videos in MySQL.

        Builds one dict per article whose item id is 20 characters long and
        whose first video has a truthy ``view_cnt``, stores the list in
        ``item['video_datas']``, then inserts a ``Work`` row for every entry
        whose thumbnail-URL MD5 is not yet in the database and updates
        ``Url`` status rows.

        Args:
            response: Response whose ``text`` is the JSON feed and whose
                ``meta['item']`` provides ``id``.

        Returns:
            None. Any error is reported through ``Print.error`` and is not
            re-raised.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']

        try:
            # UC Browser feed payload.
            json_data = json.loads(response.text)
            candidates = [
                entry for entry in json_data['data']['items']
                if len(entry['id']) == 20
            ]

            video_datas = []
            for entry in candidates:
                article = json_data['data']['articles'][entry['id']]
                first_video = article['videos'][0]
                # Articles whose first video has no views are left out.
                if not first_video['view_cnt']:
                    continue
                video_datas.append({
                    # Video id
                    'id': article['id'],
                    # Video page address
                    'url': article['url'],
                    # Video title
                    'title': article['title'],
                    # Category
                    'category': article['category'][0],
                    # Original category
                    'old_type': article['category'][0],
                    # Cover image address
                    'thumbnails': first_video['poster']['url'],
                    # Video width
                    'video_width': first_video['video_width'],
                    # Video height
                    'video_height': first_video['video_height'],
                    # View count
                    'view_cnt': first_video['view_cnt'],
                    # Comment count
                    'cmt_cnt': article['cmt_cnt'],
                    'from': 'UC浏览器',
                    'spider_time': time.strftime(
                        isotimeformat, time.localtime(time.time())),
                })

            item['video_datas'] = video_datas

            # NOTE(review): a new engine and session are created on every
            # call and never closed in this method, and the credentials are
            # hard-coded (the URL also looks mangled) - confirm.
            self.engine = create_engine(
                "mysql+pymysql://root:[email protected]/UC?charset=utf8")

            # Open a session.
            self.session = sessionmaker(self.engine)
            self.mySession = self.session()

            for record in item['video_datas']:
                page_url = record['url']
                thumbnails = record['thumbnails']
                title = record['title']
                # The value is unused, but a non-numeric id raises here and
                # ends the batch, so the conversion is kept.
                int(record['id'])
                height = record['video_height']
                width = record['video_width']
                # The MD5 of the thumbnail URL is the de-duplication key.
                url_md5 = hashlib.md5(str(thumbnails).encode()).hexdigest()

                existing = self.mySession.query(Work).filter_by(
                    url_md5=url_md5).first()
                if existing is not None:
                    pprint('视频已存在')
                else:
                    print('添加视频:{}'.format(title))
                    self.mySession.add(
                        Work(url=page_url,
                             thumbnails=thumbnails,
                             title=title,
                             url_md5=url_md5,
                             video_height=height,
                             video_width=width,
                             status=0))
                    self.mySession.commit()

                # NOTE(review): both updates run once per video, and the
                # second marks every Url row with id < 1000000 - confirm
                # this is intended.
                self.mySession.query(Url).filter(Url.id == item['id']).update(
                    {"status": "1"})
                self.mySession.commit()

                self.mySession.query(Url).filter(Url.id < 1000000).update(
                    {"status": "1"})
                self.mySession.commit()

        except Exception as err:
            Print.error('UC浏览器爬虫错误:{}'.format(err))
Example #8
0
    def parse(self, response):
        """Parse a Xigua feed response and upload videos via the parser page.

        Every feed entry after the first two is decoded and copied into the
        shared ``item`` taken from ``response.meta``. For entries that reach
        the view or comment threshold and that ``Iduoliao.redis_check``
        reports as new, the canonical URL is typed into the page open in the
        Selenium browser, the parse button is clicked, and the resulting
        link is passed to ``Iduoliao.upload``.

        Args:
            response: Response whose ``text`` is the JSON feed and whose
                ``meta['item']`` provides ``view_cnt_compare``,
                ``cmt_cnt_compare`` and ``old_type``.

        Returns:
            None. Failures are printed or reported through ``Print.error``.
        """
        try:
            isotimeformat = '%Y-%m-%d'
            item = response.meta['item']
            json_data = json.loads(response.text)
            video_info = json_data['data']

            # The first two feed entries are skipped.
            for video in video_info[2:]:
                video = json.loads(video['content'])
                display_url = video['display_url']
                item['id'] = video['group_id']
                item['download_url'] = display_url
                item['like_cnt'] = video['video_like_count']
                item['cmt_cnt'] = video['comment_count']
                item['sha_cnt'] = video['share_count']
                item['view_cnt'] = video['video_detail_info']['video_watch_count']
                item['thumbnails'] = video['large_image_list'][0]['url']
                item['title'] = video['title']
                # The play info is itself a JSON string; decode it once and
                # read both dimensions from the result.
                play_info = json.loads(video['video_play_info'])['video_list']['video_1']
                item['video_height'] = play_info['vheight']
                item['video_width'] = play_info['vwidth']
                item['spider_time'] = time.strftime(isotimeformat, time.localtime(time.time()))
                item['from'] = '西瓜视频'

                matched = re.search(r'http://toutiao.com/group/(.*)/', display_url)
                if matched is None:
                    # Skip only this entry: calling .group() on a None match
                    # used to abort the whole batch.
                    Print.error('unexpected display_url, entry skipped: {}'.format(display_url))
                    continue
                item['url'] = 'https://www.ixigua.com/i' + matched.group(1) + '/'

                # The MD5 of the canonical URL is the de-duplication key.
                md = hashlib.md5()
                md.update(str(item['url']).encode())
                item['osskey'] = md.hexdigest()

                if item['view_cnt'] >= item['view_cnt_compare'] or item['cmt_cnt'] >= item['cmt_cnt_compare']:
                    is_new = Iduoliao.redis_check(item['osskey'])
                    if is_new is True:
                        try:
                            # Type the address to resolve into the input box.
                            self.url_box.send_keys(item['url'])
                            # Click the parse button.
                            click_button = self.broser.find_element_by_css_selector('[class="nya-btn"]')
                            click_button.click()

                            # Dismiss the failure dialog if one is visible.
                            exists = self.is_visible('//*[@id="__layout"]/div/div[1]/div/div[2]/div[2]/button')
                            if exists is True:
                                click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
                                click_button.click()
                                self.url_box.clear()

                            # Check whether a result section is visible.
                            exists = self.is_visible('//*[@id="__layout"]/div/main/div[3]/fieldset[2]/legend/span')
                            if exists is True:
                                resolved_url = self.broser.find_element_by_xpath(
                                    '//*[@id="__layout"]/div/main/div[3]/fieldset[2]/div/p/a').get_attribute('href')

                                # Upload using the resolved link.
                                Iduoliao.upload(resolved_url, item['thumbnails'], item['osskey'], '西瓜视频', item['title'], item['old_type'])
                            self.url_box.clear()
                        except Exception as f:
                            # A browser failure for one video is printed and
                            # the loop moves on to the next entry.
                            print(f)
            # NOTE(review): quit() is reached only when the loop finishes
            # without an exception; the handler below still uses the browser
            # afterwards, so the placement is left unchanged - confirm whether
            # the browser should also be closed on failure.
            self.broser.quit()

        except Exception as f:
            Print.error(f)
            print('错误所在的行号:', f.__traceback__.tb_lineno)
            # Dismiss the dialog if it is visible and reset the input box.
            exists = self.is_visible('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[2]')
            if exists is True:
                click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
                click_button.click()
                self.url_box.clear()