Example #1
0
    def parse(self, response):
        """Parse one movie list page.

        Visits every movie detail page linked from the list, builds a
        MovieItem (metadata plus play sources) and yields it, skipping
        movies that are excluded by category or already crawled.

        NOTE(review): `movie_ids` is built below but never read in this
        method — it may be leftover from an older duplicate check.
        """
        # NOTE(review): this check is a no-op (`pass`); it looks like it
        # was meant to skip pages already recorded in the spider history.
        # Confirm the intent before turning it into an early return.
        if check_spider_history(self.type, response.url) == True:
            pass
        # Start time, used for the duration report at the end.
        start = time.time()
        # Collect the ids of movies whose sources are already stored.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        # Filter: non-empty `sources` array; projection: only `id`.
        # (renamed from `dict`, which shadowed the builtin)
        query = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(query)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the list-page URL.
        curr_page = url.split('/?m=vod-index-pg-')[1].split('.html')[0]

        # Each <ul> row links to one movie detail page.
        count = -1
        for each in reverse_arr(response.xpath('//div[@class="xing_vb"]/ul')):
            self.index = self.index + 1
            count = count + 1
            # Rows 0 and 51 are layout rows (header/footer), not movies.
            # BUG FIX: a nested loop below used to reset `count`, which
            # broke this skip logic after the first movie; that nested
            # counter was unused and has been removed.
            if count == 0 or count == 51:
                continue
            try:
                url2 = each.xpath("./li/span[2]/a/@href").extract()[0]
            except IndexError:
                # Row without a detail link: log it and move on.
                print(each)
                continue
            url3 = self.domain + url2
            print(url3)
            html = get_one_page(url3)
            html = etree.HTML(html)
            try:
                each = html.xpath('//div[@class="vodBox"]')[0]
            except IndexError:
                # Detail page without the expected container: skip it.
                print('跳过 -> ' + url3)
                continue
            # id, src, name, update_time, actors, type, score, release_date, description
            movie_id = url2.split('id-')[1].split('.html')[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(
                each.xpath('./div/img/@src'))
            movie_item['name'] = get_str_from_xpath(
                each.xpath('./div[2]/div[1]/h2/text()'))
            movie_item['update_status'] = get_str_from_xpath(
                each.xpath('./div[2]/div[1]/span/text()'))
            movie_item['score'] = get_str_from_xpath(
                each.xpath('./div[2]/div[1]/label/text()'))
            # Fall back to the movie name when there is no nickname.
            nickname = get_str_from_xpath(
                each.xpath('./div[2]/div[2]/ul/li[1]/span/text()'))
            if nickname == '':
                nickname = movie_item['name']
            movie_item['nickname'] = nickname
            # Names are separated by '/' on some pages and ',' on others.
            tmp_directors = get_str_from_xpath(
                each.xpath('./div[2]/div[2]/ul/li[2]/span/text()'))
            if '/' in tmp_directors:
                directors = tmp_directors.split('/')
            else:
                directors = tmp_directors.split(',')
            movie_item['directors'] = directors
            tmp_actors = get_str_from_xpath(
                each.xpath('./div[2]/div[2]/ul/li[3]/span/text()'))
            # BUG FIX: the '/' branch previously split `tmp_directors`
            # instead of `tmp_actors`.
            if '/' in tmp_actors:
                actors = tmp_actors.split('/')
            else:
                actors = tmp_actors.split(',')
            movie_item['actors'] = actors
            type2 = get_str_from_xpath(
                each.xpath('./div[2]/div[2]/ul/li[4]/span/text()'))
            type = get_type_from_type2(type2)
            # Skip sub-categories that are explicitly excluded.
            if is_exclude_type2(type2):
                continue
            # Variety-show and anime sub-types are normalised to end in '片'.
            if type in ('综艺', '动漫') and not type2.endswith('片'):
                type2 = type2 + '片'
            movie_item['type2'] = reverse_type2(type2)
            movie_item['type'] = type
            movie_item['region'] = reverse_region(
                get_str_from_xpath(
                    each.xpath('./div[2]/div[2]/ul/li[5]/span/text()')))
            movie_item['language'] = get_str_from_xpath(
                each.xpath('./div[2]/div[2]/ul/li[6]/span/text()'))
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(
                    each.xpath('./div[2]/div[2]/ul/li[7]/span/text()')))
            movie_item['duration'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[8]/span/text()'))
            movie_item['update_time'] = reverse_update_time(
                get_str_from_xpath(
                    each.xpath('./div[2]/div[2]/ul/li[8]/span/text()')))
            movie_item['description'] = get_str_from_xpath(
                html.xpath('//div[@class="vodplayinfo"]')[1].xpath(
                    './span/text()'))
            # Parse the play sources: one <ul> per source, with its name in
            # the matching <h3> sibling (tracked by `index`).  A dead
            # `flag`/empty-playlist check at the original loop tail was a
            # no-op (the source was already appended) and has been removed.
            sources = []
            index = 1
            for each in html.xpath('/html/body/div[5]/div[4]/div[2]/div/ul'):
                source = {'name': '', 'types': []}
                source['name'] = get_str_from_xpath(
                    html.xpath('/html/body/div[5]/div[4]/div[2]/div/h3[' +
                               str(index) + ']/text()'))
                types = []
                for each2 in each.xpath('./li'):
                    # Each <li> is "name$url": visible text gives the name,
                    # the hidden <input> holds the play URL.
                    type = {'name': '', 'url': ''}
                    type['name'] = get_str_from_xpath(
                        each2.xpath('./text()')).split('$')[0]
                    type['url'] = get_str_from_xpath(
                        each2.xpath('./input/@value'))
                    print('正在爬取 ' + curr_page + '/' + str(self.total_page) +
                          ' ' + str(self.index) + '/' + str(self.total) +
                          ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                          type['name'])
                    types.append(type)
                index = index + 1
                source['types'] = types
                sources.append(source)
            movie_item['sources'] = sources
            if movie_item['update_status'] == '':
                # Presumably at least one source with one playlist entry
                # exists here; this raises IndexError otherwise — confirm.
                movie_item['update_status'] = sources[0]['types'][0]['name']
            # Skip videos that were already crawled and have not updated.
            if not is_need_source(movie_item, 'movie'):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Report how many items this page produced and how long it took.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Example #2
0
    def parse(self, response):
        """Parse an iqiyi channel: page through the channel's list API,
        fetch each video's detail and play pages, and yield a MovieItem
        with its play sources.

        NOTE(review): `movie_ids` is built below but never read in this
        method — it may be leftover from an older duplicate check.
        """
        # Start time, used for the duration report at the end.
        start = time.time()

        # Collect the ids of movies whose sources are already stored.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        # Filter: non-empty `sources` array; projection: only `id`.
        # (renamed from `dict`, which shadowed the builtin)
        query = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(query)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        # Channel information derived from the request URL.
        origin_url = response.url
        # Numeric channel id.
        type_num = origin_url.split('channel_id=')[1].split('&')[0]
        # Human-readable channel type.
        movie_type = self.type_dic.get(type_num)
        # Suffix appended to sub-category names for this channel.
        type_suffix = self.type_suffix_dic.get(type_num)
        # In "latest" mode only the first page of each channel is crawled.
        if self.target == 'latest':
            self.totalPage = 1
            self.total = self.totalPage * self.pageSize * len(self.start_urls)

        # Build the paged list URLs (newest page first).
        for i in reverse_arr(range(1, self.totalPage + 1)):
            url = (origin_url.split('&pageNum=')[0] + '&pageNum=' + str(i) +
                   '&pageSize=' + str(self.pageSize) +
                   '&site=iqiyi&source_type=&three_category_id=&without_qipu=1')
            print(url)
            response = get_response(url)
            # Keep the progress index in sync even when a page is empty.
            if len(response.json()['data']['list']) == 0:
                self.index = self.index + self.pageSize
            # Parse the video list (loop variable renamed from `list`,
            # which shadowed the builtin).
            for item in reverse_arr(response.json()['data']['list']):
                self.index = self.index + 1
                movie_item = MovieItem()
                # e.g. .../a_100131281_m_601_m1_260_360.webp
                #      .../a_100131281_m_601_m1.jpg
                movie_item['src'] = item['imageUrl'].split(
                    '.jpg')[0] + '_260_360.jpg'
                movie_item['name'] = item['name'].replace(' ', '')
                try:
                    movie_item['nickname'] = item['focus']
                except KeyError:
                    movie_item['nickname'] = movie_item['name']
                try:
                    score = item['score']
                except KeyError:
                    # No rating: use a random placeholder score.
                    score = get_random_str()
                movie_item['score'] = score
                movie_item['duration'] = item['duration']
                movie_item['type'] = movie_type
                type2_list = item['categories']
                if type2_list == []:
                    type2 = '其他'
                else:
                    type2 = type2_list[0]['name']
                    # Skip per-episode categories (containing '话').
                    if '话' in type2:
                        type2 = type2_list[1]['name']
                    if not type2.endswith(type_suffix):
                        type2 = type2 + type_suffix
                if is_exclude_type2(type2):
                    continue
                movie_item['type2'] = reverse_type2(type2)
                try:
                    movie_item['release_date'] = reverse_release_date(
                        item['formatPeriod'].split('-')[0])
                except Exception:
                    # No usable release date: record the skip once in the
                    # spider history, then drop this video.
                    history_type = 'iqiyi'
                    history_url = url
                    history_text = '跳过'
                    if (check_spider_history(history_type, history_url,
                                             history_text) == False):
                        write_spider_history(history_type, history_url,
                                             history_text)
                    continue
                try:
                    description = item['description']
                except KeyError:
                    description = movie_item['name']
                movie_item['description'] = description
                # Actor names follow '主演:' and are separated by '/' or ','.
                actors = item['secondInfo']
                if '主演:' in actors:
                    if '/' in actors:
                        actors = strip_arr(actors.split('主演:')[1].split('/'))
                    elif ',' in actors:
                        actors = strip_arr(actors.split('主演:')[1].split(','))
                    else:
                        actors = strip_arr([actors.split('主演:')[1]])
                else:
                    actors = ['未知']
                movie_item['actors'] = actors
                movie_item['update_time'] = get_current_time()
                # Fetch the video detail page.
                videoLink = item['playUrl']
                html = get_one_page(videoLink)
                html = etree.HTML(html)
                print(videoLink)
                if 'a_' in videoLink:
                    # Album detail page: http://www.iqiyi.com/a_19rrh8dq1x.html
                    movie_id = videoLink.split('.html')[0].split('a_')[1]
                    movie_item['id'] = movie_id
                    if (type_num == '1'
                            and movie_id not in self.type_num_2_1_id_list):
                        self.type_num_2_1_id_list.append(movie_id)
                    directors = get_arr_from_xpath(
                        html.xpath(
                            '//p[@class="episodeIntro-director"]/a/text()'))
                    if directors == []:
                        directors = ['未知']
                    movie_item['directors'] = directors
                    region = get_str_from_xpath(
                        html.xpath('//p[@class="episodeIntro-area"]/a/text()'))
                    movie_item['region'] = reverse_region(region)
                    movie_item['language'] = get_str_from_xpath(
                        html.xpath('//p[@class="episodeIntro-lang"]/a/text()'))
                    videoLink = 'https:' + get_str_from_xpath(
                        html.xpath('//a[@class="albumPlayBtn"]/@href'))
                else:
                    # Video detail page: https://www.iqiyi.com/v_19ru2jih7w.html
                    splits = videoLink.split('.html')[0]
                    if 'v_' not in splits:
                        continue
                    movie_id = splits.split('v_')[1]
                    movie_item['id'] = movie_id
                    directors = get_arr_from_xpath(
                        html.xpath('//a[@itemprop="director"]/text()'))
                    if directors == []:
                        directors = ['未知']
                    movie_item['directors'] = directors
                    # Region/language tags scraped from the title row.
                    arr = get_arr_from_xpath(
                        html.xpath(
                            '//*[@id="titleRow"]/div[1]/div/div[2]/a/text()'))
                    region = ''
                    language = ''
                    for tmp_str in arr:
                        # BUG FIX: the original condition was
                        # `('语' or '话' in str)`, which is always truthy
                        # (and referenced the `str` builtin), so every tag
                        # overwrote `language`.
                        if '语' in tmp_str or '话' in tmp_str:
                            language = tmp_str
                        if '国' in tmp_str:
                            region = tmp_str
                        elif '语' in tmp_str:
                            region = tmp_str.split('语')[0] + '国'
                    if region == '':
                        region = '其他'
                    if language == '':
                        language = '其他'
                    movie_item['region'] = reverse_region(region)
                    movie_item['language'] = language
                # Read the raw update status from channel-specific widgets.
                if type_num == '1':
                    # Original comment said "TV series", but '1' is treated
                    # as movie below — TODO confirm which is right.
                    update_status = get_str_from_xpath(
                        html.xpath(
                            '//*[@id="widget-tab-2"]/div[1]/span[2]/a/span/text()'
                        )) + '期'
                if type_num == '2' or type_num == '4' or type_num == '15':
                    # TV series / anime / kids.
                    update_status = get_str_from_xpath(
                        html.xpath(
                            '//*[@id="widget-tab-3"]/div[1]/span[2]/a/text()')
                    ) + get_str_from_xpath(
                        html.xpath(
                            '//*[@id="widget-tab-3"]/div[1]/span[2]/a/i/text()'
                        )) + '集'
                if type_num == '6':
                    # Variety shows.
                    update_status = get_str_from_xpath(
                        html.xpath(
                            '//*[@id="widget-tab-3"]/div[1]/span[2]/a/text()')
                    ) + get_str_from_xpath(
                        html.xpath(
                            '//*[@id="widget-tab-2"]/div[1]/span[2]/a/span/text()'
                        )) + '期'
                # The play page embeds a JS `param` object holding albumId;
                # a real browser is needed to evaluate it.
                driver = get_driver()
                driver.get(videoLink)
                param = driver.execute_script('return param')
                html = driver.page_source
                html = etree.HTML(html)
                driver.quit()
                albumId = param['albumId']
                # NOTE(review): this `break` abandons the whole page, not
                # just this video — confirm it should not be `continue`.
                if albumId == '0' and type_num != '1':
                    break
                sources = []
                source = {'name': '爱奇艺视频', 'types': []}
                types = []
                if type_num == '1':
                    # Movies: a single play entry (plus an optional series).
                    each = html.xpath(
                        '//*[@id="rightPlayList"]/div[1]/ul/li')[0]
                    type = {'name': '', 'url': ''}
                    type['name'] = get_str_from_xpath(
                        each.xpath('./div[1]/a/@title'))
                    type['url'] = videoLink
                    print('正在爬取 ' + movie_type + ' ' + str(i) + '/' +
                          str(self.totalPage) + ' ' + str(self.index) +
                          '/' + str(self.total) + ' -> ' + movie_id + ' ' +
                          source['name'] + ' ' + type['name'])
                    types.append(type)
                    xpath_length = len(
                        each.xpath(
                            '//*[@id="widget-movie-superseries"]/ul/li'))
                    if xpath_length > 0:
                        type = {'name': '', 'url': ''}
                        type['name'] = get_str_from_xpath(
                            each.xpath(
                                '//*[@id="widget-movie-superseries"]/div/h3/text()'
                            ))
                        type['url'] = get_str_from_xpath(
                            each.xpath(
                                '//*[@id="widget-movie-superseries"]/ul/li/div[1]/a/@href'
                            )).split('?')[0]
                        print('正在爬取 ' + movie_type + ' ' + str(i) + '/' +
                              str(self.totalPage) + ' ' + str(self.index) +
                              '/' + str(self.total) + ' -> ' + movie_id +
                              ' ' + source['name'] + ' ' + type['name'])
                        types.append(type)
                else:
                    # Episodic channels: page through the episode list API
                    # (at most 59 pages).
                    for page in range(1, 60):
                        if type_num != '6':
                            url = ('https://pcw-api.iqiyi.com/albums/album/avlistinfo?aid='
                                   + albumId + '&page=' + str(page) +
                                   '&size=30')
                            print(url)
                            json_str = json.loads(get_one_page(url))
                            data = json_str['data']
                            epsodelist = data['epsodelist']
                            if epsodelist == []:
                                break
                            for each in epsodelist:
                                type = {'name': '', 'url': ''}
                                type['name'] = reverse_type_name(
                                    str(each['order']))
                                type['url'] = each['playUrl']
                                print('正在爬取 ' + movie_type + ' ' + str(i) +
                                      '/' + str(self.totalPage) + ' ' +
                                      str(self.index) + '/' +
                                      str(self.total) + ' -> ' + movie_id +
                                      ' ' + source['name'] + ' ' +
                                      type['name'])
                                types.append(type)
                        else:
                            # Variety shows use the top-list API instead.
                            url = ('https://pcw-api.iqiyi.com/album/album/fytoplist?cid='
                                   + type_num + '&dim=hour&page=' + str(page)
                                   + '&size=10&type=realTime')
                            print(url)
                            json_str = json.loads(get_one_page(url))
                            data = json_str['data']
                            if data == []:
                                break
                            for each in data:
                                print(each)
                                type = {'name': '', 'url': ''}
                                type['name'] = each['period'] + '期'
                                type['url'] = each['episode_play_url']
                                print('正在爬取 ' + movie_type + ' ' + str(i) +
                                      '/' + str(self.totalPage) + ' ' +
                                      str(self.index) + '/' +
                                      str(self.total) + ' -> ' + movie_id +
                                      ' ' + source['name'] + ' ' +
                                      type['name'])
                                types.append(type)
                source['types'] = types
                sources.append(source)
                movie_item['sources'] = sources
                # Skip videos with an empty playlist.
                if len(types) == 0:
                    continue
                # Fix up the update status now that the playlist is known.
                if type_num == '1':
                    # Movies.
                    if len(types) < 4:
                        movie_item['update_status'] = '爱奇艺视频'
                    else:
                        movie_item['update_status'] = types[0]['name']
                elif (type_num == '2' or type_num == '6' or type_num == '4'
                      or type_num == '15'):
                    # Episodic: fall back to the last entry's name when the
                    # page gave only a bare suffix.
                    if update_status == '集' or update_status == '期':
                        update_status = types[len(types) - 1]['name']
                    movie_item['update_status'] = update_status
                # Skip videos already crawled and not updated.
                if not is_need_source(movie_item, 'movie'):
                    print(movie_id + ' 已爬取')
                    continue
                yield movie_item
                self.total_valid += 1
        # Report how many items were produced and how long it took.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Example #3
0
    def parse(self, response):
        """Parse one 88ys list page: visit every movie detail page, build
        a MovieItem with its play sources and yield it.

        NOTE(review): `movie_ids` is built below but never read in this
        method — it may be leftover from an older duplicate check.
        """
        # Start time, used for the duration report at the end.
        start = time.time()
        # Collect the ids of movies whose sources are already stored.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        # Filter: non-empty `sources` array; projection: only `id`.
        # (renamed from `dict`, which shadowed the builtin)
        query = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(query)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)

        # Each <li> is one movie entry on the list page.  (A dead `count`
        # local that was incremented but never read has been removed.)
        for each in reverse_arr(
                response.xpath('//div[@class="list_lis"]/ul/li')):
            self.index = self.index + 1
            movie_item = MovieItem()
            url2 = self.domain + each.xpath("./div/a").attrib['href']
            print(url2)
            # Movie id is the last path segment of the detail URL.
            splits = url2.split('.html')[0].split('/')
            movie_id = splits[len(splits) - 1]
            update_status = get_str_from_xpath(
                each.xpath('./div/p[2]/span/text()'))
            if update_status != '':
                update_status = update_status.split("'")[3]
            movie_item['update_status'] = update_status
            movie_item['region'] = reverse_region(
                get_str_from_xpath(
                    each.xpath('./div/p[3]/span/text()')).split("'")[3])
            # NOTE(review): unlike the region above, `.split("'")[3]` is
            # applied AFTER reverse_release_date here; the parenthesisation
            # looks suspicious — confirm against the page markup before
            # changing it.
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(each.xpath(
                    './div/p[3]/span[2]/text()')).split('年')[0]).split("'")[3]
            # id, src, name, update_time, actors, type, score, release_date, description
            # Fetch and parse the detail page, e.g.
            # https://www.88ys.com/guochanju/202002/79916.html
            html = get_one_page(url2)
            html = etree.HTML(html)
            each = html.xpath('//div[@class="vod"]')[0]
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(
                each.xpath('./div/img/@src'))
            movie_item['name'] = get_str_from_xpath(
                each.xpath('./div[2]/dl/dd/h1/a/text()'))
            # The site has no rating; use a random placeholder score.
            movie_item['score'] = get_random_str()
            movie_item['nickname'] = movie_item['name']
            movie_item['directors'] = ['未知']
            movie_item['actors'] = each.xpath('./div[2]/dl/div/p[2]/a/text()')
            movie_item['type'] = get_str_from_xpath(
                each.xpath('//*[@id="daphang"]/a[2]/text()'))
            type2 = get_str_from_xpath(
                each.xpath('//*[@id="daphang"]/a[3]/text()'))
            # Skip sub-categories that are explicitly excluded.
            if is_exclude_type2(type2):
                continue
            movie_item['type2'] = reverse_type2(type2)
            movie_item['language'] = get_str_from_xpath(
                each.xpath('./div[2]/dl/dd[6]/text()'))
            movie_item['duration'] = ''
            movie_item['update_time'] = reverse_update_time(
                get_str_from_xpath(each.xpath('./div[2]/dl/div/p/text()')))
            movie_item['description'] = get_str_from_xpath(
                html.xpath('//div[@class="des"]/span[2]/text()'))
            # Parse the play sources (names only; urls are filled in from
            # the separate /play/ page below).
            sources = []
            for each in html.xpath('//div[@class="p_list"]'):
                source_name = get_str_from_xpath(each.xpath('./div/h3/text()'))
                if source_name == '':
                    break
                source = {'name': source_name, 'types': []}
                types = []
                for each2 in each.xpath('./div[2]/li'):
                    type = {
                        'name': get_str_from_xpath(each2.xpath('./a/text()')),
                        'url': ''
                    }
                    types.append(type)
                source['types'] = types
                sources.append(source)
            source_url = self.domain + '/play/' + (
                movie_id.split('id')[1]) + '-0-0.html'
            html = get_one_page(source_url)
            html = etree.HTML(html)
            # Playlist format:
            # "第01集$https://www.iqiyi.com/v_19rw197vvg.html#第02集$..."
            try:
                video_info_list = parse_unicode(
                    html.xpath('//script/text()')[1].split('(')[1].split(')')
                    [0])
                print(video_info_list)
                sources_temp = handle_with_video_info_list(video_info_list)
            except Exception:
                # Unexpected script layout: dump it and skip this movie.
                print(html.xpath('//script/text()'))
                continue
            # Match each parsed source by name and copy the decoded urls
            # over (works around mojibake in video_info_list names).
            for i in range(0, len(sources)):
                for j in range(0, len(sources_temp)):
                    if sources_temp[j]['name'] == sources[i]['name']:
                        for k in range(0, len(sources[i]['types'])):
                            sources[i]['types'][k]['url'] = sources_temp[j][
                                'types'][k]['url']
                        break
            # BUG FIX: the original removed items from `sources` while
            # iterating over it, which silently skips the element that
            # follows each removal; rebuild the list instead.
            sources = [
                source for source in sources
                if source['types'][0]['url'] != ''
            ]
            movie_item['sources'] = sources
            # Skip videos with no playable source at all.
            if len(sources) == 0:
                continue
            if movie_item['update_status'] == '':
                # Fall back to the last playlist entry of the first source.
                movie_item['update_status'] = sources[0]['types'][
                    len(sources[0]['types']) - 1]['name']
            # Skip videos already crawled and not updated.
            if not is_need_source(movie_item, 'movie'):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Report how many items were produced and how long it took.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Example #4
0
    def parse(self, response):
        """Parse one movie list page: visit each movie detail page, build
        a MovieItem with its play sources and yield it.

        NOTE(review): the history check below is a no-op (`pass`) —
        presumably it was meant to skip already-crawled pages; confirm the
        intent.  `movie_ids` below is also built but never read here.
        """
        if check_spider_history(self.type, response.url) == True:
            pass
        # Start time, used for the duration report at the end.
        start = time.time()
        # Collect the ids of movies whose sources are already stored.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        # Filter: non-empty `sources` array; projection: only `id`.
        dict = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(dict)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the list-page URL.
        curr_page = url.split('/?m=vod-index-pg-')[1].split('.html')[0]

        # Each <tr> links to one movie detail page.
        count = -1
        for each in reverse_arr(response.xpath('//*[@id="data_list"]/tr')):
            self.index = self.index + 1
            count = count + 1
            # Rows 0 and 51 are presumably layout rows, not movies —
            # TODO confirm against the page markup.
            if count == 0 or count == 51:
                continue
            url2 = each.xpath("./td[1]/a/@href").extract()[0]
            movie_id = url2.split('id-')[1].split('.html')[0]
            # id, src, name, update_time, actors, type, score, release_date, description
            # Fetch and parse the detail page.
            url3 = self.domain + url2
            print(url3)
            html = get_one_page(url3)
            html = etree.HTML(html)
            each = html.xpath('//div[@class="contentMain"]')[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(
                each.xpath('./div/img/@src'))
            movie_item['name'] = get_str_from_xpath(
                each.xpath('./div[2]/li[1]/text()[2]'))
            movie_item['update_status'] = get_str_from_xpath(
                each.xpath('./div[2]/li[3]/text()[2]'))
            movie_item['score'] = get_str_from_xpath(
                each.xpath('./div[2]/li[9]/div[2]/text()[2]'))
            movie_item['nickname'] = get_str_from_xpath(
                each.xpath('./div[2]/li[2]/text()[2]'))
            movie_item['directors'] = get_str_from_xpath(
                each.xpath('./div[2]/li[5]/text()[2]')).split(',')
            movie_item['actors'] = get_str_from_xpath(
                each.xpath('./div[2]/li[4]/text()[2]')).split(',')
            type2 = get_str_from_xpath(
                each.xpath('./div[2]/li[6]/div[1]/text()[2]'))
            type = get_type_from_type2(type2)
            # Skip sub-categories that are explicitly excluded.
            if (is_exclude_type2(type2) == True):
                continue
            # Variety-show and anime sub-types are normalised to end in '片'.
            if (type == '综艺' or type == '动漫'):
                if (type2.endswith('片') == False):
                    type2 = type2 + '片'
            movie_item['type2'] = reverse_type2(type2)
            movie_item['type'] = type
            movie_item['region'] = reverse_region(
                get_str_from_xpath(
                    each.xpath('./div[2]/li[7]/div[2]/text()[2]')))
            movie_item['language'] = get_str_from_xpath(
                each.xpath('./div[2]/li[7]/div[1]/text()[2]'))
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(
                    each.xpath('./div[2]/li[8]/div[2]/text()[2]')))
            movie_item['duration'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[8]/span/text()'))
            movie_item['update_time'] = reverse_update_time(
                get_str_from_xpath(
                    each.xpath('./div[2]/li[9]/div[1]/text()[2]')))
            movie_item['description'] = get_str_from_xpath(
                html.xpath('/html/body/div[5]/div/div/p[2]/text()'))
            # Parse the play sources.  The <li> list is one flat sequence:
            # a li WITHOUT an <input> starts a new source (its text is the
            # source name); a li WITH an <input> is one playlist entry of
            # the form "name$url".  `count` counts entries in the current
            # source and `flag` counts source headers seen so far.
            sources = []
            flag = 0
            count = 1
            for each2 in html.xpath('//div[@class="movievod"]/ul/li'):
                if (len(each2.xpath('./input')) == 0):
                    # New source header: flush the previous source first
                    # (only if it accumulated at least one entry).
                    if (count > 1):
                        source['types'] = types
                        sources.append(source)
                        count = 1
                    source = {'name': '', 'types': []}
                    source['name'] = get_str_from_xpath(
                        each2.xpath('./text()'))
                    types = []
                    flag = flag + 1
                    continue
                else:
                    full_name = get_str_from_xpath(
                        each2.xpath('./input/@value'))
                    # A trailing 'checkbox' input marks the end of the list;
                    # flush the last source and stop.
                    if (full_name == 'checkbox'):
                        source['types'] = types
                        sources.append(source)
                        break
                    print(full_name)
                    # presumably flushes the current source when a value
                    # without the "name$url" shape appears — TODO confirm
                    # this branch (it also falls through to the '$' split
                    # below, which would raise IndexError on such a value).
                    if (flag % 2 == 0 and '$' not in full_name):
                        source['types'] = types
                        sources.append(source)
                        count = 1
                # Split "name$url" into a playlist entry.
                type = {'name': '', 'url': ''}
                type['name'] = full_name.split('$')[0]
                type['url'] = full_name.split('$')[1]
                print('正在爬取 ' + curr_page + '/' + (str)(self.total_page) +
                      ' ' + (str)(self.index) + '/' + (str)(self.total) +
                      ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                      type['name'])
                types.append(type)
                count = count + 1
            movie_item['sources'] = sources
            # Skip videos whose last playlist came out empty.
            # (NOTE(review): `types` is unbound here if the <li> loop never
            # ran, and only the LAST source's playlist is checked.)
            flag = 0
            if (len(types) == 0):
                continue
            # Skip videos already crawled and not updated.
            if (is_need_source(movie_item, 'movie') == False):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Report how many items were produced and how long it took.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Exemple #5
0
    def parse(self, response):
        """Parse one movie list page and yield a MovieItem per detail page.

        Walks the list entries in reverse, fetches each movie's detail
        page, extracts metadata and play sources, and skips movies with an
        excluded type, an empty play list, or an unchanged update status.
        """
        # NOTE(review): the result of check_spider_history is ignored —
        # both branches fall through; confirm whether a `return` was meant.
        if check_spider_history(self.type, response.url) == True:
            pass
        # Start time for the summary printed at the end.
        start = time.time()
        # Fetch ids of movies that already have non-empty sources.
        # NOTE(review): movie_ids is collected but never used below.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        query = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(query)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the list-page URL.
        curr_page = url.split('/?m=vod-index-pg-')[1].split('.html')[0]

        # Iterate over the movie entries on the list page.
        count = -1
        for each in reverse_arr(response.xpath("/html/body/div[4]/ul")):
            self.index = self.index + 1
            count = count + 1
            # The first and the 52nd <ul> are layout rows, not movies.
            if count == 0 or count == 51:
                continue
            url2 = each.xpath("./li/span[2]/a/@href").extract()[0]
            movie_id = url2.split('id-')[1].split('.html')[0]
            dic = {'id': movie_id}
            # NOTE(review): movie_server is fetched but never used below.
            movie_server = db_utils.find(dic)
            # Fetch and parse the detail page.
            print(self.domain + url2)
            html = get_one_page(self.domain + url2)
            html = etree.HTML(html)
            each = html.xpath('/html/body/div[4]')[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            movie_item['src'] = (str)(
                each.xpath('./div[1]/div/div/div[1]/img/@src')[0])
            movie_item['name'] = (str)(
                each.xpath('./div[1]/div/div/div[2]/div[1]/h2/text()')[0])
            movie_item['update_status'] = get_str_from_xpath(
                each.xpath('./div[1]/div/div/div[2]/div[1]/span/text()'))
            movie_item['score'] = get_str_from_xpath(
                each.xpath('./div[1]/div/div/div[2]/div[1]/label/text()'))
            movie_item['nickname'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[1]/span/text()'))
            movie_item['directors'] = get_arr_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[2]/span/text()'))
            movie_item['actors'] = get_arr_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[3]/span/text()'))
            type2 = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[4]/span/text()'))
            # Excluded categories are dropped entirely.
            if (is_exclude_type2(type2) == True):
                continue
            movie_item['type2'] = reverse_type2(type2)
            movie_type = get_type_from_type2(type2)
            movie_item['type'] = movie_type
            movie_item['region'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[5]/span/text()'))
            movie_item['language'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[6]/span/text()'))
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(
                    each.xpath(
                        './div[1]/div/div/div[2]/div[2]/ul/li[7]/span/text()'))
            )
            movie_item['duration'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[8]/span/text()'))
            movie_item['update_time'] = reverse_update_time(
                get_str_from_xpath(
                    each.xpath(
                        './div[1]/div/div/div[2]/div[2]/ul/li[9]/span/text()'))
            )
            movie_item['description'] = get_str_from_xpath(
                each.xpath(
                    './div[1]/div/div/div[2]/div[2]/ul/li[14]/div/span[2]/text()'
                ))
            sources = []
            # Pre-initialize so the emptiness check below cannot raise
            # NameError when the page has no source blocks at all.
            types = []
            for source_div in each.xpath('./div[3]/div[2]/div/div'):
                source = {'name': '', 'types': []}
                source['name'] = (str)(source_div.xpath('./h3/span/text()')[0])
                types = []
                for episode_li in source_div.xpath('./ul/li'):
                    # Each episode entry is a "name$url" pair.
                    full_name = (str)(episode_li.xpath('./text()')[0])
                    episode = {'name': '', 'url': ''}
                    episode['name'] = full_name.split('$')[0]
                    episode['url'] = full_name.split('$')[1]
                    print('正在爬取 ' + curr_page + '/' + (str)(self.total_page) +
                          ' ' + (str)(self.index) + '/' + (str)(self.total) +
                          ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                          episode['name'])
                    types.append(episode)
                source['types'] = types
                sources.append(source)
            movie_item['sources'] = sources
            # Skip videos whose (last) play list is empty.
            if (len(types) == 0):
                continue
            # Skip videos already crawled whose sources are unchanged.
            if (is_need_source(movie_item, 'movie') == False):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Print a summary with the elapsed time.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Exemple #6
0
    def parse(self, response):
        """Parse a Tencent Video channel listing and yield MovieItems.

        Pages through the listing, opens each cover page with a Selenium
        driver to read the page's COVER_INFO JS object, and builds one
        MovieItem (metadata plus episode sources) per movie.
        """
        # Start time for the summary printed at the end.
        start = time.time()

        # Fetch ids of movies that already have non-empty sources.
        # NOTE(review): movie_ids is collected but never used below.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        dict = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(dict)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        origin_url = response.url
        # Channel name comes from the `channel=` query parameter.
        movie_type = origin_url.split('channel=')[1].split('&')[0]
        orign_html = get_one_page(origin_url)
        flag = 0
        html = etree.HTML(orign_html)
        # NOTE(review): flag is always 0 here, so this branch always runs.
        if (flag == 0):
            if (self.target == 'all'):
                total = 2000
            elif (self.target == 'latest'):
                total = (int)(get_str_from_xpath(
                    html.xpath(
                        '/html/body/div[5]/div/div[1]/div[1]/span/text()')))
            # Compute paging over the listing.
            page_size = 30
            start_page = 0
            if (total % page_size == 0):
                total_page = (int)(total / page_size)
            else:
                total_page = (int)(total / page_size) + 1
            if (self.target == 'latest'):
                total_page = 1
                total = total_page * page_size

            # Walk the listing pages (in reverse order).
            for offset_temp in reverse_arr(range(start_page, total,
                                                 page_size)):
                current_page = (int)(offset_temp / page_size) + 1
                url = origin_url.replace(
                    'listpage=1', 'listpage=' + (str)(current_page)).replace(
                        'offset=0', 'offset=' + (str)(offset_temp))
                print('当前页面:' + url)
                count = 0
                html = get_one_page(url)
                html = etree.HTML(html)
                for each in reverse_arr(
                        html.xpath("//div[@class='list_item']")):
                    count += 1
                    movie_id = get_str_from_xpath(each.xpath(
                        "./a/@href")).split('cover/')[1].split('.html')[0]
                    movie_item = MovieItem()
                    movie_item['id'] = movie_id
                    movie_item['src'] = get_str_from_xpath(
                        each.xpath("./a/img[1]/@src"))
                    movie_item['name'] = get_str_from_xpath(
                        each.xpath("./a/@title"))
                    if (movie_type == 'movie'):
                        update_status = '腾讯视频'
                    else:
                        update_status = get_str_from_xpath(
                            each.xpath("./div/text()"))
                    # Parse the detail (cover) page, e.g.
                    # https://v.qq.com/x/cover/mzc00200uj8xmtb.html
                    url = self.domain + '/x/cover/' + movie_id + '.html'
                    print(url)
                    # Read the COVER_INFO JS object via Selenium.
                    driver = get_driver()
                    driver.get(url)
                    cover_info = driver.execute_script('return COVER_INFO')
                    driver.quit()
                    # Each COVER_INFO field may be missing; fall back
                    # field by field.
                    try:
                        score = cover_info['score']['score']
                    except:
                        score = get_random_str()
                    movie_item['score'] = score
                    try:
                        nick_name = cover_info['second_title']
                    except:
                        nick_name = movie_item['name']
                    movie_item['nickname'] = nick_name
                    if (update_status == '' or update_status == None):
                        try:
                            update_status = cover_info['episode_updated']
                            if (update_status == None):
                                update_status = '腾讯视频'
                        except:
                            continue
                    movie_item['update_status'] = update_status
                    try:
                        directors = cover_info['director']
                    except:
                        try:
                            directors = cover_info['director_id']
                        except:
                            directors = []
                    movie_item['directors'] = directors
                    try:
                        actors = cover_info['leading_actor']
                    except:
                        try:
                            actors = cover_info['leading_actor_id']
                        except:
                            actors = []
                    movie_item['actors'] = actors
                    movie_item['type'] = self.type_name_dic[movie_type]
                    if (movie_type == 'tv'):
                        type2_suffix = '剧'
                    else:
                        type2_suffix = '片'
                    main_genre = cover_info['main_genre']
                    if (main_genre == None):
                        main_genre = self.type_name_dic[movie_type]
                    movie_item['type2'] = reverse_type2(main_genre +
                                                        type2_suffix)
                    movie_item['region'] = reverse_region(
                        cover_info['area_name'])
                    try:
                        language = cover_info['langue']
                    except:
                        language = '内详'
                    movie_item['language'] = language
                    movie_item['release_date'] = cover_info['year']
                    movie_item['duration'] = '0'
                    movie_item['update_time'] = get_current_time()
                    movie_item['description'] = cover_info['description']
                    sources = []
                    source = {'name': '腾讯视频', 'types': []}
                    types = []
                    type_name = ''
                    for each2 in cover_info['nomal_ids']:
                        type = {}
                        flag = each2['F']
                        # F values 0 and 4 are skipped — presumably
                        # non-playable entries (trailers/previews);
                        # TODO confirm against the site's data model.
                        if (flag == 0 or flag == 4):
                            continue
                        type['name'] = reverse_type_name((str)(each2['E']))
                        type['url'] = url.split(
                            '.html')[0] + '/' + each2['V'] + '.html'
                        print('正在爬取 ' + movie_type + ' ' +
                              (str)(current_page) + '/' + (str)(total_page) +
                              ' ' + (str)(count) + '/' + (str)(total) +
                              ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                              type['name'])
                        types.append(type)
                    # Skip movies with no playable episodes.
                    if (len(types) == 0): continue
                    source['types'] = types
                    sources.append(source)
                    movie_item['sources'] = sources
                    # Skip videos already crawled whose sources are
                    # unchanged.
                    if (is_need_source(movie_item, 'movie') == False):
                        print(movie_id + ' 已爬取')
                        continue
                    yield movie_item
                    self.total_valid = self.total_valid + 1
            # Print a summary with the elapsed time.
            end = time.time()
            process_time = end - start
            print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
                  str(process_time) + 's')
Exemple #7
0
    def parse(self, response):
        """Parse a movie list page (gb2312-encoded site) and yield MovieItems.

        Fetches each detail page, extracts metadata from a fixed-layout
        info table, and builds play sources from the episode tables.
        """
        # Start time for the summary printed at the end.
        start = time.time()
        # Fetch ids of movies that already have non-empty sources.
        # NOTE(review): movie_ids is collected but never used below.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        dict = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(dict)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the list-page URL.
        curr_page = url.split('-')[1].split('.html')[0]

        # Iterate over the movie rows on the list page.
        count = -1
        for each in reverse_arr(response.xpath('//tr[@class="row"]')):
            self.index = self.index + 1
            count = count + 1
            # The first and the 52nd rows are layout rows, not movies.
            if count == 0 or count == 51:
                continue
            url2 = each.xpath("./td/a/@href").extract()[0]
            movie_id = url2.split('?')[1].split('.html')[0]
            dic = {'id': movie_id}
            movie_server = db_utils.find(dic)
            # Fetch and parse the detail page (site is gb2312-encoded).
            html = get_one_page(self.domain + url2, encode='gb2312')
            html = etree.HTML(html)
            each = html.xpath('//table[@style="text-align:left"]')[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(
                each.xpath('./tr[1]/td[1]/div/img/@src'))
            movie_item['name'] = get_str_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[1]/td/font/text()'))
            movie_item['update_status'] = get_str_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[7]/td/font/text()'))
            # The site exposes no score; use a random placeholder.
            movie_item['score'] = get_random_str()
            # Skip movies already crawled whose update status is
            # unchanged since the last crawl.
            if (movie_server.count() > 0
                    and movie_server.__getitem__(0)['update_status']
                    == movie_item['update_status']):
                print(movie_id + ' 已爬取')
                continue
            movie_item['nickname'] = movie_item['name']
            movie_item['directors'] = get_arr_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[3]/td/font/a/text()'))
            movie_item['actors'] = get_arr_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[2]/td/font/text()'))
            type2 = get_str_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[4]/td/font/text()'))
            if (is_exclude_type2(type2) == True):
                continue
            movie_item['type2'] = reverse_type2(type2)
            # Map the sub-type string to a coarse type; adult categories
            # (福利片 / 伦理片) are dropped entirely via `continue`.
            if movie_item['type2'].find('综艺') != -1:
                movie_item['type'] = '综艺'
            elif movie_item['type2'].find('动漫') != -1:
                movie_item['type'] = '动漫'
            elif movie_item['type2'].find('福利片') != -1:
                movie_item['type'] = '福利片'
                continue
            elif movie_item['type2'].find('伦理片') != -1:
                movie_item['type'] = '伦理片'
                continue
            elif movie_item['type2'].find('音乐片') != -1:
                movie_item['type'] = '音乐片'
            elif movie_item['type2'].find(
                    '片') != -1 or movie_item['type2'].find('电影') != -1:
                movie_item['type'] = '电影'
            elif movie_item['type2'].find('剧') != -1:
                movie_item['type'] = '电视剧'
            movie_item['region'] = get_str_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[5]/td/font/text()'))
            movie_item['language'] = get_str_from_xpath(
                each.xpath('./tr[1]/td[2]/table/tr[8]/td/font/text()'))
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(
                    each.xpath('./tr[1]/td[2]/table/tr[9]/td/font/text()')))
            movie_item['duration'] = '无'
            movie_item['update_time'] = reverse_update_time(
                get_str_from_xpath(
                    each.xpath('./tr[1]/td[2]/table/tr[6]/td/font/text()')))
            movie_item['description'] = get_str_from_xpath(
                each.xpath('//div[@class="intro"]/font/text()'))
            sources = []
            for each2 in each.xpath('//td[@class="bt"]'):
                source = {'name': '', 'types': []}
                source['name'] = (str)(each2.xpath('./h1/text()')[0])
                types = []
                type_length = len(each2.xpath('.//tr'))
                count = 1
                # The last <tr> is not an episode row; stop before it.
                for each3 in each2.xpath('.//tr'):
                    if (count == type_length): break
                    count += 1
                    # Each episode entry is a "name$url" pair.
                    full_name = (str)(each3.xpath('./td/a/text()')[0])
                    type = {'name': '', 'url': ''}
                    type['name'] = full_name.split('$')[0]
                    type['url'] = full_name.split('$')[1]
                    print('正在爬取 ' + curr_page + '/' + (str)(self.total_page) +
                          ' ' + (str)(self.index) + '/' + (str)(self.total) +
                          ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                          type['name'])
                    types.append(type)
                source['types'] = types
                sources.append(source)
                # NOTE(review): this was meant to skip videos with an
                # empty play list, but it only continues the sources loop
                # after the source was already appended — it has no
                # effect; confirm and fix upstream.
                flag = 0
                if (len(types) == 0):
                    continue
            movie_item['sources'] = sources
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Print a summary with the elapsed time.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Exemple #8
0
    def parse(self, response):
        """Parse a list page of the 88ys-style site and yield MovieItems.

        For each movie card, fetches the detail page, extracts metadata,
        then resolves every episode URL by following its play page and
        reading the `url=` parameter of the embedded iframe.
        """
        # Start time for the summary printed at the end.
        start = time.time()
        # Fetch ids of movies that already have non-empty sources.
        # NOTE(review): movie_ids is collected but never used below.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        dict = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(dict)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the `page=` query parameter.
        curr_page = url.split('?page=')[1].split('&')[0]

        # Iterate over the movie cards on the list page.
        count = -1
        for each in reverse_arr(
                response.xpath('//ul[@class="stui-vodlist clearfix"]/li')):
            self.index = self.index + 1
            count = count + 1
            movie_item = MovieItem()
            url2 = self.domain + each.xpath("./a").attrib['href']
            print(url2)
            movie_id = url2.split('.html')[0].split('num')[1]
            # Fetch and parse the detail page, e.g.
            # https://www.88ys.com/guochanju/202002/79916.html
            html = get_one_page(url2)
            html = etree.HTML(html)
            each = html.xpath(
                '//div[@class="stui-content col-pd clearfix"]')[0]
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(
                each.xpath('./div/a/img/@data-original'))
            movie_item['name'] = get_str_from_xpath(
                each.xpath('./div[2]/h3/text()'))
            # The site exposes no score; use a random placeholder.
            movie_item['score'] = get_random_str()
            movie_item['nickname'] = movie_item['name']
            type2 = get_str_from_xpath(each.xpath('./div/p/a/text()'))
            if (is_exclude_type2(type2) == True):
                continue
            type2 = reverse_type2(type2)
            movie_item['type2'] = type2
            movie_item['type'] = get_type_from_type2(type2)
            movie_item['region'] = reverse_region(
                get_str_from_xpath(each.xpath('./div/p/a[2]/text()')))
            movie_item['release_date'] = reverse_release_date(
                get_str_from_xpath(each.xpath('./div/p/a[3]/text()')))
            # Directors/actors may come as one slash-separated string.
            directors = each.xpath('./div/p[3]/a/text()')
            if (len(directors) > 0 and '/' in directors[0]):
                directors = directors[0].split('/')
            movie_item['directors'] = directors
            actors = each.xpath('./div/p[2]/a/text()')
            if (len(actors) > 0 and '/' in actors[0]):
                actors = actors[0].split('/')
            movie_item['actors'] = actors
            movie_item['language'] = '内详'
            movie_item['duration'] = ''
            movie_item['update_time'] = '暂无'
            movie_item['description'] = get_str_from_xpath(
                html.xpath('//div[@class="stui-content__desc"]/text()'))
            count = 1
            sources = []
            for each in html.xpath('//div[@id="playlist"]'):
                source_name = get_str_from_xpath(
                    each.xpath('./div/h3/span/text()'))
                # An unnamed block marks the end of the source list.
                if (source_name == ''):
                    break
                source = {'name': source_name, 'types': []}
                types = []
                for each2 in each.xpath('./div[2]/div/ul/li'):
                    type = {
                        'name': get_str_from_xpath(each2.xpath('./a/text()')),
                        'url': ''
                    }
                    # Follow the play page and extract the stream URL
                    # from the iframe's `url=` parameter.
                    play_url_html = self.domain + get_str_from_xpath(
                        each2.xpath('./a/@href'))
                    print(play_url_html)
                    try:
                        html2 = get_one_page(play_url_html)
                        html2 = etree.HTML(html2)
                    except:
                        continue
                    play_url_m3u8 = get_str_from_xpath(
                        html2.xpath('//iframe/@src')).split('url=')[1]
                    type['url'] = play_url_m3u8
                    print('正在爬取 ' + curr_page + '/' + (str)(self.total_page) +
                          ' ' + (str)(self.index) + '/' + (str)(self.total) +
                          ' -> ' + movie_id + ' ' + source['name'] + ' ' +
                          type['name'])
                    types.append(type)
                source['types'] = types
                sources.append(source)
            movie_item['sources'] = sources
            # Skip videos with no play sources at all.
            if (len(sources) == 0):
                continue
            # Use the first source's last episode name as the update
            # status.
            movie_item['update_status'] = sources[0]['types'][
                len(sources[0]['types']) - 1]['name']
            # Skip videos already crawled whose sources are unchanged.
            if (is_need_source(movie_item, 'movie') == False):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Print a summary with the elapsed time.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' +
              str(process_time) + 's')
Exemple #9
0
    def parse(self, response):
        """Parse a vod-index list page (135zy-style site) and yield MovieItems.

        Rebuilds each detail URL from the movie id, extracts metadata, and
        pairs each episode <ul> with its sibling <h3> heading by index to
        name the play sources.
        """
        # Start time for the summary printed at the end.
        start = time.time()
        # Fetch ids of movies that already have non-empty sources.
        # NOTE(review): movie_ids is collected but never used below.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        dict = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(dict)
        movie_ids = []
        for movie_id in data:
            movie_ids.append(movie_id['id'])

        url = response.url
        print('当前页面:' + url)
        # Current page number, taken from the list-page URL.
        curr_page = url.split('/?m=vod-index-pg-')[1].split('.html')[0]

        # Iterate over the movie entries on the list page.
        count = -1
        for each in reverse_arr(response.xpath('//div[@class="xing_vb"]/ul')):
            self.index = self.index + 1
            count = count + 1
            # The first and the 52nd <ul> are layout rows, not movies.
            if count == 0 or count == 51:
                continue
            url2 = each.xpath("./li/span[2]/a").attrib['href']
            print(url2)
            id_splits = url2.split('id-')
            # Skip links that do not carry a movie id.
            if (len(id_splits) < 2): continue
            movie_id = id_splits[1].split('.html')[0]
            # Rebuild the detail URL from the id, e.g.
            # http://135zy0.com/?m=vod-detail-id-14.html
            url2 = self.domain + '/?m=vod-detail-id-'+movie_id+'.html'
            print(url2)
            try:
                html = get_one_page(url2)
                html = etree.HTML(html)
            except:
                # Record the failed URL so it can be retried later.
                write_spider_history(self.movie_type, url2)
                continue
            each = html.xpath('//div[@class="vodBox"]')[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            # The cover may be wrapped in a proxy URL; unwrap `pic=`.
            src = get_str_from_xpath(each.xpath('./div/img/@src'))
            if ('pic=' in src):
                src = src.split('pic=')[1]
            movie_item['src'] = src
            movie_item['name'] = get_str_from_xpath(each.xpath('./div[2]/div[1]/h2/text()'))
            movie_item['update_status'] = get_str_from_xpath(each.xpath('./div[2]/div[1]/span/text()'))
            movie_item['score'] = get_str_from_xpath(each.xpath('./div[2]/div[1]/label/text()'))
            nickname = get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[1]/span/text()'))
            if (nickname == ''):
                nickname = movie_item['name']
            movie_item['nickname'] = nickname
            movie_item['directors'] = get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[2]/span/text()')).split(',')
            movie_item['actors'] = get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[3]/span/text()')).split(',')
            type2 = get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[4]/span/text()'))
            type = get_type_from_type2(type2)
            if (is_exclude_type2(type2) == True):
                continue
            # Normalize variety/anime sub-types to end with '片'.
            if (type == '综艺' or type == '动漫'):
                if (type2.endswith('片') == False):
                    type2 = type2 + '片'
            movie_item['type2'] = reverse_type2(type2)
            movie_item['type'] = type
            movie_item['region'] = reverse_region(get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[5]/span/text()')))
            movie_item['language'] = get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[6]/span/text()'))
            # li[9] holds a timestamp; keep only the year as release date.
            movie_item['release_date'] = reverse_release_date(get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[9]/span/text()'))).split(' ')[0].split('-')[0]
            movie_item['duration'] = get_str_from_xpath(each.xpath('./div[1]/div/div/div[2]/div[2]/ul/li[8]/span/text()'))
            movie_item['update_time'] = reverse_update_time(get_str_from_xpath(each.xpath('./div[2]/div[2]/ul/li[9]/span/text()')))
            movie_item['description'] = get_str_from_xpath(html.xpath('//span[@class="more"]/text()'))
            sources = []
            count2 = 1
            # Each episode <ul> is paired with the h3 heading of the same
            # index to obtain the source name.
            index = 1
            for each in html.xpath('/html/body/div[5]/div[3]/div[2]/div/ul'):
                source = {'name': '', 'types': []}
                source['name'] = get_str_from_xpath(html.xpath('/html/body/div[5]/div[3]/div[2]/div/h3['+(str)(index)+']/text()')).split(':')[1].split('  ')[0]
                types = []
                for each2 in each.xpath('./li'):
                    # Each episode entry is a "name$..." pair; the URL is
                    # taken from the link's `url=` parameter.
                    type = {'name': '', 'url': ''}
                    type['name'] = get_str_from_xpath(each2.xpath('./text()')).split('$')[0]
                    type['url'] = get_str_from_xpath(each2.xpath('./a/@href')).split('url=')[1]
                    print('正在爬取 ' + curr_page + '/' + (str)(self.total_page) + ' ' + (str)(self.index) + '/' + (str)(
                        self.total) + ' -> ' + movie_id + ' ' + source['name'] + ' ' + type['name'])
                    types.append(type)
                    count2 = count2 + 1
                # Empty <ul>s do not consume an h3 index.
                if (len(types) == 0):
                    continue
                index = index + 1
                source['types'] = types
                sources.append(source)
            movie_item['sources'] = sources
            # Skip videos with no play sources at all.
            flag = 0
            if (len(sources) == 0):
                continue
            # Fall back to the first source's last episode name as the
            # update status.
            if (movie_item['update_status']) == '':
                movie_item['update_status'] = sources[0]['types'][len(sources[0]['types'])-1]['name']
            # Skip videos already crawled whose sources are unchanged.
            if (is_need_source(movie_item, 'movie') == False):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Print a summary with the elapsed time.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' + str(process_time) + 's')
Exemple #10
0
    def parse(self, response):
        """Parse one listing page of the movie index and yield ``MovieItem``s.

        For every movie link on the listing page this fetches the detail
        page, extracts the metadata fields, resolves the play sources by
        rendering the source page with a Selenium driver (reading the
        ``mac_url`` JS variable), and yields a populated ``MovieItem``.
        Entries with a broken detail page, an excluded type2, no play
        tabs, or an empty play list are skipped.

        :param response: Scrapy response for a listing page whose URL
            contains ``-pg-<page>.html``.
        """
        # Start timer for the per-page summary printed at the end.
        start = time.time()
        # NOTE(review): the driver is never quit in this method —
        # presumably get_driver() returns a shared instance; confirm.
        driver = get_driver()
        # Ids of movies that already have a non-empty `sources` field.
        collection = 'movie'
        db_utils = MongoDbUtils(collection)
        # Renamed from `dict`: the original shadowed the builtin.
        query = [{'sources': {'$elemMatch': {'$ne': []}}}, {'id': 1}]
        data = db_utils.find(query)
        # NOTE(review): movie_ids is built but never read below — looks like
        # leftover dedup logic (is_need_source does that check now); the DB
        # query is kept to avoid changing side effects.
        movie_ids = [doc['id'] for doc in data]

        url = response.url
        print('当前页面:' + url)
        # Page number embedded in the URL, e.g. ...-pg-3.html -> '3'.
        # NOTE(review): curr_page is unused below; kept for parity.
        curr_page = url.split('-pg-')[1].split('.html')[0]

        # Each <li> is one movie; reverse_arr iterates the list backwards.
        count = -1
        for each in reverse_arr(response.xpath('//div[@class="index-area clearfix"]/ul/li')):
            self.index = self.index + 1
            count = count + 1
            url2 = self.domain + each.xpath("./a").attrib['href']
            print(url2)
            # Movie id is the last path segment: .../202002/79916.html -> '79916'.
            splits = url2.split('.html')[0].split('/')
            movie_id = splits[-1]
            try:
                html = get_one_page(url2)
                html = etree.HTML(html)
            except Exception:
                # Record the failed detail page so it can be retried later.
                write_spider_history(self.movie_type, url2)
                continue
            # Guard the layout lookup: the original indexed [0] unguarded and
            # crashed the whole page parse on an unexpected detail layout.
            boxes = html.xpath('//div[@class="ct mb clearfix"]')
            if not boxes:
                write_spider_history(self.movie_type, url2)
                continue
            detail = boxes[0]
            movie_item = MovieItem()
            movie_item['id'] = movie_id
            movie_item['src'] = get_str_from_xpath(detail.xpath('./div/img/@src'))
            movie_item['name'] = get_str_from_xpath(detail.xpath('./div[2]/dl/h1/text()'))
            movie_item['update_status'] = get_str_from_xpath(detail.xpath('./div[2]/dl/dt/text()'))
            movie_item['score'] = get_random_str()
            movie_item['nickname'] = movie_item['name']
            # Directors / actors lists are either comma- or space-separated.
            directors_str = get_str_from_xpath(detail.xpath('./div[2]/dl/dd[3]/text()'))
            movie_item['directors'] = directors_str.split(',' if ',' in directors_str else ' ')
            actors_str = get_str_from_xpath(detail.xpath('./div[2]/dl/dt[2]/text()'))
            movie_item['actors'] = actors_str.split(',' if ',' in actors_str else ' ')
            type_type2_splits = get_str_from_xpath(detail.xpath('./div[2]/dl/dd/text()')).split(' ')
            movie_item['type'] = type_type2_splits[0]
            # Secondary type may be absent; fall back to '其他'.
            if len(type_type2_splits) > 1:
                type2 = type_type2_splits[1]
            else:
                type2 = '其他'
            if is_exclude_type2(type2):
                continue
            # BUG FIX: the original compared the *builtin* `type` against the
            # category strings (always False), so this suffix normalisation
            # never ran. It must test the scraped movie type.
            if movie_item['type'] == '综艺' or movie_item['type'] == '动漫':
                if not type2.endswith('片'):
                    type2 = type2 + '片'
            movie_item['type2'] = reverse_type2(type2)
            movie_item['region'] = reverse_region(get_str_from_xpath(detail.xpath('./div[2]/dl/dd[4]/text()')))
            movie_item['language'] = get_str_from_xpath(detail.xpath('./div[2]/dl/dd[6]/text()'))
            movie_item['release_date'] = reverse_release_date(get_str_from_xpath(detail.xpath('./div[2]/dl/dd[5]/text()')))
            movie_item['duration'] = ''
            movie_item['update_time'] = reverse_update_time(get_str_from_xpath(detail.xpath('./div[2]/dl/dd[2]/text()')))
            movie_item['description'] = get_str_from_xpath(detail.xpath('./div[2]/div/text()'))
            # Collect all play-source tab names; keep the id of the last tab
            # to locate its episode list.
            source_name_list = []
            # BUG FIX: source_id used to leak from the previous iteration
            # (or raise NameError) when a page had no play tabs.
            source_id = None
            for tab in html.xpath('//div[@class="playfrom tab8 clearfix"]/ul/li'):
                source_name_list.append(get_str_from_xpath(tab.xpath('./text()')))
                source_id = get_str_from_xpath(tab.xpath('./@id')).split('tab')[1]
            if source_id is None:
                # No play sources on this detail page at all — skip it.
                continue
            source_url = self.domain + get_str_from_xpath(html.xpath('//div[@id="stab' + source_id + '"]/div/ul/li/a[1]/@href'))
            # Render the play page so the `mac_url` JS variable is populated.
            driver.get(source_url)
            html = etree.HTML(driver.page_source)
            # mac_url format: "第01集$https://...#第02集$https://..."
            mac_url = driver.execute_script('return mac_url')
            sources = handle_with_mac_url(source_name_list, mac_url)
            movie_item['sources'] = sources
            # Skip (without recording) videos whose play list is empty.
            if not sources:
                continue
            if movie_item['update_status'] == '':
                # Default the update status to the last episode's name.
                movie_item['update_status'] = sources[0]['types'][-1]['name']
            # Already scraped and not updated since — nothing to do.
            if not is_need_source(movie_item, 'movie'):
                print(movie_id + ' 已爬取')
                continue
            yield movie_item
            self.total_valid = self.total_valid + 1
        # Stop timer and print the per-page summary.
        end = time.time()
        process_time = end - start
        print('本次共爬取 ' + str(self.total_valid) + ' 条数据,用时 ' + str(process_time) + 's')