Example #1
    def process_one_video(self, line):
        video_info = copy.deepcopy(self.video_data)
        try:
            video_info['title'] = line.find('a', {'target': 'video'})['title']
        except Exception:
            video_info['title'] = None
        try:
            url = line.find('a', {'target': 'video'})['href']
            video_info['url'] = 'https:' + url
        except Exception:
            video_info['url'] = None
        try:
            play_count_str = line.find('span', {'class': 'v-num'}).text
            video_info['play_count'] = trans_play_count(play_count_str)
        except Exception:
            video_info['play_count'] = 0
            # logging.warning("can't get play_count at page %s" % video_info['url'])
        try:
            release_time_str = line.find('span', {
                'class': 'v-publishtime'
            }).text
            video_info['release_time'] = trans_strtime_to_timestamp(
                input_time=release_time_str, missing_year=True)
        except Exception:
            video_info['release_time'] = 0
            # logging.warning("can't get release_time at page %s" % video_info['url'])
        try:
            dura_str = line.find('span', {'class': 'v-time'}).text
            video_info['duration'] = trans_duration(dura_str)
        except Exception:
            video_info['duration'] = 0
            # logging.warning("can't get duration at page %s" % video_info['url'])
        fetch_time = int(time.time() * 1e3)
        video_info['fetch_time'] = fetch_time
        return video_info
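The helpers trans_play_count and trans_strtime_to_timestamp are imported from elsewhere in the crawler package. As a rough illustration of the first one, here is a minimal sketch, assuming counts arrive as strings such as "3,456", "1.2万" or "3亿" (the suffix handling is an assumption, not the original implementation):

def trans_play_count(play_count_str):
    # normalize separators first, e.g. "3,456" -> "3456"
    play_count_str = play_count_str.strip().replace(',', '')
    if play_count_str.endswith('万'):  # 万 = 10**4
        return int(float(play_count_str[:-1]) * 1e4)
    if play_count_str.endswith('亿'):  # 亿 = 10**8
        return int(float(play_count_str[:-1]) * 1e8)
    return int(play_count_str)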
Example #2
 def parse_video_page_html(self, html):
     # upstream code joins the url and the page html with this sentinel token
     page_lst = html.split('fangyuchenggoalkeeper')
     url = page_lst[0]
     page = page_lst[1]
     soup = BeautifulSoup(page, 'html.parser')
     try:
         title = soup.find('h1', {'class': 'td-playbase__title'}).span.text
     except Exception:
         title = None
     try:
         releaser = soup.find('a', {
             'class': 'td-play__userinfo__name'
         }).text
     except Exception:
         releaser = None
     try:
         midsteptime = soup.find(
             'div', {
                 'class': 'td-play__videoinfo__details-box__time'
             }).text[:-2]
         release_time = int(
             datetime.datetime.strptime(
                 midsteptime, '%Y-%m-%d %H:%M:%S').timestamp() * 1e3)
     except Exception:
         release_time = None
     try:
         releaserUrl = soup.find(
             "a", {"class": "td-play__userinfo__name"})['href']
     except Exception:
         releaserUrl = None
     try:
         find_play_count = ' '.join(
             re.findall(r'total_vv.*stripe_bottom', page))
         replace_comma = find_play_count.replace(',', '')
         play_count_str = ' '.join(
             re.findall(r'total_vv":"\d+', replace_comma))
         play_count = int(' '.join(re.findall(r'\d+', play_count_str)))
     except Exception:
         play_count = 0
     try:
         find_dura = re.findall(r'stripe_bottom":"\d+:\d+', page)
         dura_str = ' '.join(find_dura).split('":"')[-1]
         duration = trans_duration(dura_str)
     except Exception:
         duration = 0
     fetch_time = int(time.time() * 1e3)
     info_dic = {
         'platform': self.platform,
         "title": title,
         'url': url,
         'duration': duration,
         "releaser": releaser,
         "release_time": release_time,
         "releaserUrl": releaserUrl,
         'play_count': play_count,
         'fetch_time': fetch_time
     }
     return info_dic
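This parser, like most of the examples below, delegates "MM:SS" and "HH:MM:SS" strings to trans_duration. A minimal sketch of such a helper, assuming it returns whole seconds:

def trans_duration(dura_str):
    # "03:25" -> 205, "1:02:03" -> 3723
    seconds = 0
    for part in dura_str.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds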
Example #3
 def video_page(self, url, channel=None):
     """
     Due to iqiyi import hot index instead of play count,
     the crawler is updated on 2018-11-23
     """
     url = self.rebuild_video_url(url)
     start = time.time()
     get_page = retry_get_url(url)
     end = time.time() - start
     print("first request costs %s seconds" % end)
     if get_page is None:
         print('Failed to get html page for url: %s' % url)
         return None
     get_page.encoding = 'utf-8'
     page = get_page.text
     soup = BeautifulSoup(page, 'html.parser')
     page_info = soup.find("div", {"is": "i71-play"})[":page-info"]
     page_dic = json.loads(page_info)
     title = page_dic["tvName"]
     url = page_dic["pageUrl"]
     dura_str = page_dic["duration"]
     duration = trans_duration(dura_str)
     try:
         releaser = page_dic["user"]["name"]
         releaserUrl = page_dic["user"]["profileUrl"]
     except Exception:
         releaser = None
         releaserUrl = None
     video_info = soup.find("div", {"is": "i71-play"})[":video-info"]
     video_dic = json.loads(video_info)
     release_time = video_dic["firstPublishTime"]
     tvId = video_dic["tvId"]
     start1 = time.time()
     hot_idx_url = "https://pub.m.iqiyi.com/jp/h5/count/hotDisplay/?qipuId=%s" % tvId
     get_hot_idx = retry_get_url(hot_idx_url)
     end2 = time.time() - start1
     print("second request costs %s seconds" % end2)
     hot_idx_str = get_hot_idx.text
     hot_idx = int(
         re.findall(r"\d+", ' '.join(re.findall(r'"count":\d+',
                                                hot_idx_str)))[0])
     fetch_time = int(
         datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
     video_page_dict = copy.deepcopy(self.video_data)
     video_page_dict["title"] = title
     video_page_dict["url"] = url
     video_page_dict["duration"] = duration
     video_page_dict["releaser"] = releaser
     video_page_dict["releaserUrl"] = releaserUrl
     video_page_dict["release_time"] = release_time
     video_page_dict["hot_idx"] = hot_idx
     video_page_dict["fetch_time"] = fetch_time
     video_page_dict["tvId"] = tvId
     if channel is not None:
         video_page_dict["channel"] = channel
     return video_page_dict
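retry_get_url is assumed to be a requests.get wrapper that returns None once its retries are exhausted, which is why the caller checks for None before touching the response. A minimal sketch (retry count and backoff are assumptions):

import time
import requests

def retry_get_url(url, retries=3, timeout=5, **kwargs):
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout, **kwargs)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # simple exponential backoff
    return None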
Example #4
 def video_page(self, url):
     video_info = copy.deepcopy(self.video_data)
     get_page = requests.get(url)
     page = get_page.text
     soup = BeautifulSoup(page, 'html.parser')
     try:
         video_info['title'] = soup.find('h1', {'class': 'td-playbase__title'}).span.text
     except Exception:
         video_info['title'] = None
     try:
         video_info['releaser'] = soup.find('a',{'class':'td-play__userinfo__name'}).text
     except Exception:
         video_info['releaser'] = None
     try:
         midsteptime = soup.find(
             'div',
             {'class': 'td-play__videoinfo__details-box__time'}).text[:-2]
         video_info['release_time'] = int(
             datetime.datetime.strptime(
                 midsteptime, '%Y-%m-%d %H:%M:%S').timestamp() * 1e3)
     except Exception:
         video_info['release_time'] = None
     try:
         video_info['releaserUrl'] = soup.find("a", {"class": "td-play__userinfo__name"})['href']
     except Exception:
         video_info['releaserUrl'] = None
     try:
         find_play_count = ' '.join(re.findall(r'total_vv.*stripe_bottom', page))
         replace_comma_pcnt = find_play_count.replace(',', '')
         play_count_str = ' '.join(re.findall(r'total_vv":"\d+', replace_comma_pcnt))
         video_info['play_count'] = int(' '.join(re.findall(r'\d+', play_count_str)))
     except Exception:
         video_info['play_count'] = 0
     try:
         find_comment_count = ' '.join(re.findall(r'total_comment.*recommend', page))
         replace_comma_ccnt = find_comment_count.replace(',', '')
         comment_count_str = ' '.join(re.findall(r'total_comment":"\d+', replace_comma_ccnt))
         video_info['comment_count'] = int(' '.join(re.findall(r'\d+', comment_count_str)))
     except Exception:
         video_info['comment_count'] = 0
     try:
         find_dura = re.findall(r'stripe_bottom":"\d+:\d+', page)
         dura_str = ' '.join(find_dura).split('":"')[-1]
         video_info['duration'] = trans_duration(dura_str)
     except Exception:
         video_info['duration'] = 0
     video_info['fetch_time'] = int(time.time()*1e3)
     video_info['url'] = url
     print("get video data at %s" % url)
     return video_info
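The total_vv / total_comment extraction above chains three findall calls per field. Precompiled patterns with a capturing group can do the same work in one step each; a sketch, not the original helpers:

import re

PLAY_COUNT_RE = re.compile(r'total_vv":"([\d,]+)')
COMMENT_COUNT_RE = re.compile(r'total_comment":"([\d,]+)')

def extract_count(pattern, page, default=0):
    match = pattern.search(page)
    if match is None:
        return default
    return int(match.group(1).replace(',', ''))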
Example #5
        def handle_one_video(one, video_info, releaser, releaserUrl, platform):
            video_data = copy.deepcopy(video_info)

            video_itemid = one['attr']['itemId']
            find_asyncData = one['asyncData']

            video_data['platform'] = platform
            video_data['releaser'] = releaser
            video_data['releaserUrl'] = releaserUrl
            video_data['title'] = one['title']
            video_data['url'] = ('https://sv.baidu.com/videoui/page/videoland?context='
                                 + parse.quote('{"nid":"sv_%s"}' % one['id'][3:]))
            video_data['duration'] = trans_duration(one['timeLong'])
            video_data['video_id'] = one['article_id']
            video_data['release_time'] = int(one['publish_at']) * 1000
            fetch_time = int(time.time() * 1e3)
            video_data['fetch_time'] = fetch_time

            params2 = {
                'params': json.dumps([find_asyncData]),
                'uk': uk,  # captured from the enclosing scope
                '_': str(int(time.time()) * 1000)
            }
            rq_get2 = requests.get(
                'https://mbd.baidu.com/webpage?type=homepage&action=interact&format=jsonp&callback=jsonp2',
                params=params2)
            page_info2 = json.loads(rq_get2.text[7:-1])
            try:
                video_data['play_count'] = int(
                    page_info2['data']['user_list'][video_itemid]['read_num'])
            except Exception:
                video_data['play_count'] = 0
            try:
                video_data['favorite_count'] = int(
                    page_info2['data']['user_list'][video_itemid]
                    ['praise_num'])
            except Exception:
                video_data['favorite_count'] = 0
            try:
                video_data['comment_count'] = int(
                    page_info2['data']['user_list'][video_itemid]
                    ['comment_num'])
            except Exception:
                video_data['comment_count'] = 0
            return video_data
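The slice rq_get2.text[7:-1] strips the fixed "jsonp2(" prefix and the trailing ")" before json.loads. A regex-based unwrapper, sketched here as an alternative, tolerates any callback name:

import json
import re

def strip_jsonp(text):
    # "callback({...});" -> parsed {...}
    match = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('not a JSONP payload')
    return json.loads(match.group(1))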
Example #6
 def video_page_seleium(self, task=0):
     # selenium: fetch individual episode videos; the vid field is not available here
     self.driver = webdriver.Chrome(options=self.chrome_options)
     self.driver.maximize_window()
     has_data = rds_get.dbsize()
     while has_data:
         keys = rds_get.randomkey()
         res = rds_get.hgetall(keys)
         has_data = rds_get.dbsize()
         # time.sleep(0.2)
         try:
             self.driver.get(res["url"])
             time.sleep(5)
             video_list = self.driver.find_elements_by_xpath("//div[@id='rightPlayList']//li")
             for count, video_obj in enumerate(video_list):
                 try:
                     ActionChains(self.driver).click(video_obj).perform()
                     time.sleep(2)
                 except Exception:
                     continue
                 self.driver.implicitly_wait(10)
                 title = self.driver.find_element_by_xpath("//h1[@class='player-title']").text
                 duration_str = self.driver.find_element_by_xpath("//iqpspan[@class='iqp-time-dur']").text
                 duration = trans_duration(duration_str)
                 print("task ", task)
                 one_video_dic = {
                         # "play_count_sum": play_count,
                         "url": res["url"],
                         "video_url": self.driver.current_url,
                         "video_title": title,
                         "album": res["title"],
                         "duration": duration
                 }
                 self.parse_single_data(one_video_dic, one_video_dic["video_url"])
             # self.detail_page_api(res)
             rds_get.delete(keys)
         except Exception as e:
             print(e, res["url"])
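The fixed time.sleep(5) after driver.get can be replaced by an explicit wait that returns as soon as the playlist renders. A sketch, assuming the same driver object and a 10-second ceiling:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

video_list = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//div[@id='rightPlayList']//li")))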
Example #7
 def one_video_page(self, title, url):
     video_obj_list = self.driver.find_elements_by_xpath(
         "//div[@id='eplist_module']//li")
     if video_obj_list:
         # time.sleep(0.1)
         action = ActionChains(self.driver)
         video_name_tags = self.driver.find_elements_by_xpath(
             "//i[@class='mode-change iconfont icon-ep-list-simple']")
         if video_name_tags:
             # time.sleep(0.1)
             action.move_to_element(video_name_tags[0]).click().perform()
             del action
         time.sleep(0.1)
         video_obj_list = self.driver.find_elements_by_xpath(
             "//div[@id='eplist_module']//li")
         for video_count, video_obj in enumerate(video_obj_list):
             self.driver.implicitly_wait(10)
             action = ActionChains(self.driver)
             action.click(video_obj).perform()
             del action
             self.driver.execute_script("window.scrollBy(0,1000)")
             time.sleep(0.2)
             video_title = video_obj.text
             if_pay = ""
             # print(video_title)
             if "\n" in video_title:
                 video_title, if_pay = video_title.split("\n", -1)
             comment_count_list = self.driver.find_elements_by_xpath(
                 "//span[@class='results']")
             if comment_count_list:
                 comment_count = comment_count_list[0].text
             else:
                 comment_count = 0
             video_id = self.driver.find_element_by_xpath(
                 "//a[@class='av-link']")
             video_url = video_id.get_attribute("href")
             barrage_count_list = self.driver.find_elements_by_xpath(
                 "//span[@class='bilibili-player-video-info-danmaku-number']"
             )
             if barrage_count_list:
                 barrage_count = barrage_count_list[0].text
             else:
                 barrage_count = "-"
             duration = self.driver.find_elements_by_xpath(
                 '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]'
             )
             try:
                 duration = trans_duration(duration[0].text)
                 print(video_count, duration)
             except Exception:
                 duration = 0
             project_name = "bilibili_%s_%s" % (title, video_title)
             dic = {
                 "title": title,
                 "video_title": video_title,
                 "if_pay": if_pay,
                 "comment_count": comment_count,
                 "url": url,
                 "video_url": video_url,
                 "video_id": video_id.text,
                 "barrage_count": barrage_count,
                 "duration": duration,
                 "video_count": video_count + 1
             }
             self.parse_single_data(dic, project_name)
     else:
         self.driver.execute_script("window.scrollBy(0,1000)")
         video_title = self.driver.find_element_by_xpath(
             '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[1]/div[1]').text
         if_pay = ""
         if "\n" in video_title:
             video_title, if_pay = video_title.split("\n", 1)
         comment_count = self.driver.find_element_by_xpath(
             "//span[@class='results']").text
         video_id = self.driver.find_element_by_xpath(
             "//a[@class='av-link']")
         video_url = video_id.get_attribute("href")
         barrage_count = self.driver.find_element_by_xpath(
             "//span[@class='bilibili-player-video-info-danmaku-number']"
         ).text
         duration = self.driver.find_elements_by_xpath(
             '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]'
         )
         try:
             duration = trans_duration(duration[0].text)
             print(duration)
         except Exception:
             duration = 0
         project_name = "bilibili_%s_%s" % (title, video_title)
         dic = {
             "title": title,
             "video_title": video_title,
             "if_pay": if_pay,
             "comment_count": comment_count,
             "url": url,
             "video_url": video_url,
             "video_id": video_id.text,
             "barrage_count": barrage_count,
             "duration": duration,
             "video_count": 1
         }
         self.parse_single_data(dic, project_name)
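The find_element_by_xpath / find_elements_by_xpath family used throughout these examples was removed in Selenium 4; the modern equivalents take a By locator. For instance, assuming the same driver object:

from selenium.webdriver.common.by import By

video_obj_list = driver.find_elements(By.XPATH, "//div[@id='eplist_module']//li")
video_id = driver.find_element(By.XPATH, "//a[@class='av-link']")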
Example #8
    def releaser_page_web(self,
                          releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          fetchFavoriteCommnt=True):
        pid = os.getpid()
        releaser_id = self.get_releaser_id(releaserUrl)
        print('releaser_id is %s' % releaser_id)
        result_lst = []
        # video_info = self.video_data
        page_num = 0
        has_more = True
        ctime = ""
        count_false = 0
        # proxies = None
        proxies = get_proxy_dic()
        while page_num <= releaser_page_num_max and has_more:

            post_url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10&video_type=media&sort_type=sort_by_time'.format(
                releaser_id, page_num, ctime)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                "referer":
                "https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh,zh-CN;q=0.9",
                "content-type": "application/x-www-form-urlencoded"
            }
            try:
                if page_num == 0:
                    for loop in range(5):
                        get_page = requests.get(releaserUrl,
                                                headers=headers,
                                                timeout=3,
                                                proxies=proxies)
                        # print(get_page.text)
                        page_dic, fans_num = self.web_first_pag(get_page.text)
                        if page_dic['apiData']['video']['results']:
                            page_num += 1
                            break
                else:
                    get_page = requests.get(post_url,
                                            headers=headers,
                                            timeout=3)
                    page_dic = get_page.json()
                    page_num += 1
                    # print(page_dic)
            except Exception:
                continue
            try:
                info_lst = page_dic['apiData']['video']['results']
            except Exception:
                info_lst = []
            try:
                ctime = page_dic['apiData']['video']['ctime']
                has_more = page_dic['apiData']['video']['has_more']
            except Exception:
                has_more = False
            if info_lst:
                count_false = 0
                print("Process %s is processing %s at page %s" %
                      (pid, releaser_id, page_num))
                time.sleep(int(random.uniform(1, 2)))
                for line in info_lst:
                    video_data = copy.deepcopy(self.video_data_template)
                    video_data['title'] = line['content']['title']
                    video_id = line['content']['vid']
                    video_data['video_id'] = video_id
                    # partial_url = '{"nid":"sv_%s"}' % video_id
                    # partial_url_encode = urllib.parse.quote_plus(partial_url)
                    video_data['url'] = line['content']["video_short_url"]
                    video_data['play_count'] = line['content']['playcnt']
                    video_data['favorite_count'] = int(
                        line['content']['praiseNum'])
                    try:
                        video_data['comment_count'] = int(
                            line['content']['commentNum'])
                    except Exception:
                        video_data['comment_count'] = 0
                    video_data['releaser_followers_count'] = int(fans_num)
                    # print('like num is %s' % video_data['favorite_count'])
                    try:
                        video_data['duration'] = trans_duration(
                            line['content']['duration'])
                    except Exception:
                        video_data['duration'] = 0
                    video_data['releaser'] = line['content']['author']
                    video_data['releaser_id_str'] = "haokan_%s" % (
                        line['content']['authorid'])
                    video_data['releaserUrl'] = ('https://haokan.baidu.com/haokan/wiseauthor?app_id='
                                                 + line['content']['authorid'])
                    fetch_time = int(time.time() * 1e3)
                    video_data['fetch_time'] = fetch_time
                    releaser_time_str = line['content']['publish_time']
                    video_data['release_time'] = trans_strtime_to_timestamp(
                        input_time=releaser_time_str)
                    print(
                        video_id, releaser_time_str,
                        datetime.datetime.fromtimestamp(
                            video_data['release_time'] / 1000), page_num)
                    yield video_data
            else:
                count_false += 1
                if count_false < 5:
                    continue
                else:
                    break
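releaser_page_web is a generator, so videos stream out while listing pages are still being fetched. A minimal consumption sketch (HaokanCrawler is a hypothetical name for the enclosing class):

crawler = HaokanCrawler()  # hypothetical class name
releaser_url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358'
for video_data in crawler.releaser_page_web(releaser_url, releaser_page_num_max=3):
    print(video_data['title'], video_data['play_count'])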
Example #9
 def search_page(self, title=None, *args, **kwargs):
     data_list = []
     timestamp = int(datetime.datetime.now().timestamp() * 1e3)
     url = "https://r.inews.qq.com/search?chlid=_qqnews_custom_search_all&search_from=&needSearchTabs=1&needSpreadAds=1&rtAd=1&new_user=0&uid=48a4725886d57203&omgid=&trueVersion=6.0.40&qimei=866174725888628&devid=866174725888628&appver=22_android_6.0.40&Cookie=lskey%3D;skey%3D;uin%3D;%20luin%3D;logintype%3D0;%20main_login%3D;%20&qn-sig=07db3b98ab9133d39b8b053fa1c51bd9&qn-rid=1002_2f55f6ab-2eb6-45e5-a4df-6dd9778c8b9d&qn-newsig=39b264b07173439d052ff2d6875cb7bc6aa47770dea55c7b64addee42138715a"
     post_json = {
         "search_type": "all",
         "query": title,
         "cp_type": "0",
         "disable_qc": "0",
         "searchStartFrom": "header",
         "launchSearchFrom": "billboard",
         "isDefault": "0",
         "searchTag": title,
         "adReqData":
         '{"adtype":0,"pf":"aphone","app_channel":"17","ext":{"mob":{"mobstr":"AdiIlDlcnKXLQu1Gx+HOa9fvgiA9BRLUAJ+RowxbYWkHaon9eDa0Qwt66FFNIY+xQHqSdGqfLc6p9ylswsJt1g4qWDeFDIxT6590GrPXznUizTPR0SutVVVQrHa1pbvX4WGx3yOrDNHGJCSrP38Gxej3\/ixgaVTB84d6i7sXgUhFCzcs3pS+DNShM79K7bIwO5U38eccvqle6nYKvELivuDIVr46chKdSokttQzbmf7OUSutGSHdn1+pihXvbFDkzgD+ut6PT\/G1E+O8eHwjZBf7K4Y8tpPABOH182j7JA6xpvoAP8r1WaHh73EtA5+T1M2dU3LtOMC0Sv\/Ngcf6btjefIkMDVoY+hWb8yKKd65UHSYvzpzLEdFNuEV8Sm33B789P9fCqLbnjf11OokPFjtC\/ORvR0dHItka56fkSNAZ2D+rmH8PPbMhZxSa\/bgOZywy2i8yu\/JRg8Rv8zRu4FkB6\/jIXkGCoWI1S7jUfnTIxCHu8iFOGo+Jr4VzMzqbnsi7XWhvKBye\/hPJkrISvw0wg5kg\/TPoj5Yu7aHH2pk31+uIbFRMFIzyj3p0I+yNmvpJECr4MuQmIXf8OP5OUlNVcDuZoXkyR4xy8ON1ou2Vtx+LQ\/x9xK2\/VR7up5apAPQMzmuzTOMcizdpO3FkrcXh0baOYJ7drGJWx4EO\/6nP9Y6J3GAU+YZsc+hCE3XHJpuZsfRsM2i7M4FnrZGz948VfFhY50Zk09eqK7y\/QsS++6su71tzvghFW0u3FOe1WMDvu3c4mMyYKIHkPQtGd5paAR81Xr6\/tGrhjh6CMcoHdppa9BV\/yM2s+NCTnxaZXoyuzljspI8x\/LjHLJuCLchAoPdOoND6mfoE7HGAajgdoFwR06I6zxN3RNQpB1RHIpmJCt+GcmAI4qld6qooO3lb\/8jkO8CBb69wapSAmvyzRvNVNPRa91ubAARkhW5DM62NjIDLN6COAWNEPZs6SfMbQ4jXNsIdXSR8ZZ8NuhO2uS9hU4+EadRYqVgn4yg1Z23d0HwQd0t0Gnw1X\/sAEIrR4sHyW0cVNMoWXkcfmM7UEq4oSCjLm6KTEhFuIR8EDm2HUEcUvcL+y0xr3Rr2YBuTVRR+bpnqffhYvyqRJILXaP2ddNrPt+a1Cl2sbL0INHVxfymPabok4Us8+jgbseBAf3iy8yOLDAQjG4z3iYVcLtgnoJnTLzTtAMC+wPYCbzoGi+hlXlBEF6FcxpU569ZT4YSIFI0xV8RXia+p7CnkaUWwmoKLBEwIG58rjqWO3+uyhvF0o\/\/RFi7QSF4U1DFy7qNQBPyoOiwEyKYZlbq4pQ6DjMYPWjBboU8NjY3qyoE\/CzwwSE75Gwk7w5DwYLs="}},"ver":"6.0.40","appversion":"200302","chid":2,"slot":[{"loid":"40,39","loid_watch_count":",0","channel":"_qqnews_custom_search_all","refresh_type":0,"recent_rot":["1,2,3"],"orders_info":["215508757,9693616,1554848392,1000,2801,110,2,CKsEMMrx8JcOOKTExfqioLSXG1AAYO32wqqapY+yEg==","215016046,9899501,1204054842,1000,4109,110,2,CKsEMMez\/wE4+fad\/47u\/sJLUNvVyZUNYKTQneXuxPaYngFyDAgBEI\/dwd33hde\/WXIECAIQAA==","214804999,14224364,2744407378,1000,606,110,2,CNkDMLuQydYFOJzXk9iVub73ZlC7rffuBGAAcgwIARDVn9eQtc2S6yNyBAgCEAA="]}],"launch":"0","wxversion":"0"}',
         "lon": "121.321859",
         "cityList": "news_news_sh",
         "loc_street": "申兰路",
         "village_name": "Unknown",
         "lastLocatingTime": str(int(timestamp / 1e3)),
         "provinceId": "12",
         "loc_city_name": "上海市",
         "loc_catalog": "基础设施:交通设施:火车站",
         "loc_province_name": "上海市",
         "loc_name": "上海虹桥站",
         "town_name": "新虹街道",
         "loc_district_name": "闵行区",
         "loc_addr": "上海市闵行区申贵路1500号",
         "lat": "31.194424",
         "cityId": "12",
         "adcode": "310112",
         "is_special_device": "0",
         "mid": "0",
         "dpi": "320",
         "qqnetwork": "wifi",
         "rom_type": "R11-user 5.1.1 NMF26X 500200210 release-keys",
         "isColdLaunch": "1",
         "real_device_width": "2.81",
         "net_proxy": "DIRECT@",
         "net_bssid": "48:A4:72:58:86:D5",
         "isMainUserLogin": "******",
         "currentChannelId": "_qqnews_custom_search_all",
         "isElderMode": "0",
         "apptype": "android",
         "islite": "0",
         "hw": "OPPO_OPPOR11",
         "global_session_id": str(timestamp),
         "screen_width": "900",
         "isClosePersonalized": "0",
         "videoAutoPlay": "1",
         "imsi": "460077203886213",
         "cpuabi": "armeabi-v7a",
         "isoem": "0",
         "currentTabId": "news_news",
         "startTimestamp": str(int(timestamp / 1e3)),
         "net_slot": "0",
         "qn-time": str(timestamp),
         "pagestartfrom": "icon",
         "mac": "48:A4:72:58:86:D5",
         "activefrom": "icon",
         "net_ssid": "R1148a4725886d57203",
         "store": "17",
         "screen_height": "1600",
         "top_activity": "NewsSearchResultListActivity",
         "real_device_height": "5",
         "origin_imei": "866174725888628",
         "network_type": "wifi",
         "origCurrentTab": "top",
         "global_info":
         "1|1|1|1|1|14|4|1|0|6|1|1|1||0|J309P000000000:J902P000000000:J601P900000000:A601P800217702:A601P700321102:B601P600286205:A601P500154501:A601P400161601:J601P300000000:B601P200096102:A601P100272502:A601P000261102:J601P904000000:J601P903000000:A601P902266601:A601P901291001:J601P811000000:A601P701226201:A601P622269601:A601P621294101:A601P620269601:J601P111000000:J601P110000000:A601P109107102:A601P105118803:A601P019237403:A601P016212405:J601P006000000:J603P000000000:J401P100000000:A401P000050901:J602P900000000:J602P800000000:J602P700000000:J602P600000000:A602P500267502:B602P400286004:J602P300000000:J602P200000000:J602P100000000:B602P000315504:A602P901257901:J602P616000000:A602P615304801:A602P613271701:A602P611253801:A602P516234601:A602P414259901:A602P307160708:J602P302000000:A602P208205801:J602P117000000:A602P007272801:A602P003136401:J304P000000000:J310P700000000:A310P200210802:J310P100000000:B310P020314103:A310P010301701:B310P000267107:B701P000323002:A703P000322204:A704P000309801:J702P000000000:J405P000000000:J064P400000000:J064P300000000:B064P100243802:B064P020290902:J064P010000000:J064P000000000:A085P000087701:B074P200238202:J074P040000000:B074P030315703:A074P020315602:A074P010315401:B074P000142402:J903P000000000:A267P300215801:A267P200263601:A267P100299801:B267P000300102:A073P040317201:B073P030314503:A073P020313801:J073P010000000:B073P000313603:J060P700000000:J060P300000000:J060P200000000:B060P100299703:A060P090287301:J060P020000000:J060P010000000:B060P000311102:J060P099000000:J060P016000000:A406P000313203:J403P700000000:J403P600000000:A403P200206702:B403P100246105:J403P010000000:A403P000310401:A403P602218702:B404P200262402:A404P000263407:J055P200000000:J055P090000000:J055P080000000:J055P070000000:J055P060000000:J055P050000000:J055P010000000:A055P000265801:J402P100000000:J402P090000000:J402P080000000:J402P060000000:J402P020000000:A402P000301403:J054P400000000:J054P300000000:J054P200000000:A054P100269701:B054P090289604:A054P080289702:J054P050000000:J054P040000000:A054P030288501:J054P010000000:A054P000319901:J056P000000000:A901P200252304:B901P100226405:B901P000232405:J407P000000000|1402|0|1|25|25|0|0|0||3|3|1|1|1|1|1|1|-1|0|0|5|2|0|0|0|3|0|0|1|3|0|2|0|0|2|0|0|1|0|1|1|0|0|1|0|4|0|1|1|11|20|1|0|1|1|0|0|1|4|0|1|1|41|2|51|60|0|1|0|0|1|5|1|0|0|71|0|0|1|71",
         "imsi_history": "460077203886213",
         "net_apn": "0",
     }
     res = requests.post(url, headers=self.headers, data=post_json)
     search_result = res.json()
     for one_video in search_result["secList"]:
         video_dic = {}
         try:
             one_video = one_video["newsList"][0]
             video_dic['title'] = one_video.get('title')
             video_dic['url'] = one_video.get("url")
             releaser_id = one_video.get('media_id')
             video_dic['releaser'] = one_video.get('chlname')
             video_dic['releaserUrl'] = "https://view.inews.qq.com/media/%s" % releaser_id
             release_time = int(one_video.get('timestamp'))
             video_dic['release_time'] = int(release_time * 1e3)
             video_dic['video_id'] = one_video.get('video_channel').get(
                 "video").get("vid")
             video_dic['duration'] = trans_duration(
                 one_video.get('video_channel').get("video").get(
                     "duration"))
             video_dic['play_count'] = one_video.get('readCount')
             video_dic['repost_count'] = one_video.get('shareCount')
             video_dic['comment_count'] = one_video.get('comments')
             video_dic['favorite_count'] = one_video.get('likeInfo')
             video_dic['fetch_time'] = int(
                 datetime.datetime.now().timestamp() * 1e3)
             video_dic['releaser_id_str'] = "腾讯新闻_%s" % releaser_id
             video_dic['video_img'] = one_video.get('miniProShareImage')
             video_dic['platform'] = self.platform
             video_dic["is_hot"] = 1
             video_dic["data_provider"] = "CCR"
         except Exception as e:
             print(e)
             continue
         data_list.append(video_dic)
     output_result(
         result_Lst=data_list,
         platform=self.platform,
         output_to_es_raw=True,
     )
     data_list.clear()
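output_result is an assumed sink shared by these crawlers. Given the output_to_es_raw flag here and the es_index="crawler-data-raw" default in the last example, a minimal sketch could bulk-index into Elasticsearch (client setup is an assumption):

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])  # assumed cluster address

def output_result(result_Lst, platform, output_to_es_raw=False,
                  es_index='crawler-data-raw', **kwargs):
    if output_to_es_raw and result_Lst:
        actions = ({'_index': es_index, '_source': doc} for doc in result_Lst)
        helpers.bulk(es, actions)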
Example #10
    def get_page_list(self, data):
        headers = {
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh,zh-CN;q=0.9",
                "cookie": "__ysuid=1553755022290AhL; juid=01d9bjgm0l2ngc; cna=2U8aFb1yaVcCAdr3nSX3f47K; __aryft=1557994090; __artft=1557994090; UM_distinctid=16eea2ee55739c-08e02b1a26b73c-2393f61-161012-16eea2ee55881b; ykss=f0cdf05d77e5a6dcebeb4c1c; __ayft=1576549468599; __aysid=1577241653062aoh; __ayscnt=5; yseid=1577241727003hUSqrH; yseidcount=4; ycid=0; __arycid=dz-3-00; __arcms=dz-3-00; referhost=https%3A%2F%2Flist.youku.com; _m_h5_c=60b9e2b4228097503d3975caca016d24_1577269476232%3B6030e92d9f896f1b7024ac8e5df7c81a; P_ck_ctl=70E1D32F5B5E92006640274BCF8D7371; _m_h5_tk=92bfeed90e6fedabcac24cf2fbc211de_1577268775512; _m_h5_tk_enc=9c860e1c7cdd927ab133515c7922f98e; CNZZDATA1277955961=1269611647-1575885450-https%253A%252F%252Flist.youku.com%252F%7C1577259749; seid=01dsu50o381d4f; __arpvid=15772649493186fuodZ-1577264949336; __aypstp=141; __ayspstp=80; seidtimeout=1577266751889; ypvid=1577264954182j6rlMt; ysestep=32; yseidtimeout=1577272154186; ystep=41; __ayvstp=95; __aysvstp=95; isg=BE1Nn-BVLy9TNInF8eGG1rJNXGkHgr-WrpQ5v4_I4OQxhm44V3_MzRVQ9FJFRpm0",
                "referer": "https://v.youku.com/",
                "sec-fetch-mode": "no-cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",

        }
        page = 1
        while True:
            page_info_text = requests.get("http:" + data["url"], headers=headers).text
            vid = re.findall(r"videoId: '(\d*)'", page_info_text)[0]
            showid = re.findall(r"showid: '(\d*)'", page_info_text)[0]
            encode_id = data["id"]
            pm = re.findall(r"playmode: '(\d*)'", page_info_text)[0]
            cat_id = re.findall(r"catId: '(\d*)'", page_info_text)[0]
            componentid = re.findall(r'"componentId":(\d*)', page_info_text)[0]
            isSimple = re.findall(r"isSimple: '(.*)'", page_info_text)[0]
            parser_dic = {
                    "l": "debug",
                    "pm": pm,
                    "vid": vid,
                    "fid": "0",
                    "showid": showid,
                    "sid": "0",
                    "componentid": componentid,
                    "videoCategoryId": cat_id,
                    "isSimple": isSimple,
                    "videoEncodeId": encode_id,
                    "page": page,
            }
            page_html = requests.get("https://v.youku.com/page/playlist?%s" % urllib.parse.urlencode(parser_dic),
                                     headers=headers)
            page += 1
            page_json = page_html.json()
            if page_json["html"] == "\n":
                break
            soup = BeautifulSoup(page_json["html"], 'lxml')
            dev_list = soup.find_all(attrs={"class": "item item-cover"})
            for dev in dev_list:
                video_title = dev.get("title")
                vid = dev.get("item-id")
                video_url = dev.a.get("href")
                dev_text = dev.text
                if "VIP" in dev_text:
                    if_pay = "VIP"
                else:
                    if_pay = ""
                play_count = re.findall(r"热度 (\d+)", dev_text)[0]
                try:
                    duration = re.findall(r'(\d+:\d+:\d+)', dev_text)[0]
                except IndexError:
                    duration = re.findall(r'(\d+:\d+)', dev_text)[0]
                dic = {
                        "video_title": video_title,
                        "duration": trans_duration(duration),
                        "play_count": play_count,
                        "if_pay": if_pay,
                        "video_url": "https:" + video_url,
                        "url": "https:" + data["url"],
                        "vid": vid,
                        "album": data["title"]

                }
                self.parse_single_data(dic, video_url)
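Each re.findall(...)[0] above raises IndexError as soon as one field is absent from the page markup. A small helper, sketched here, makes the fallback explicit:

import re

def first_match(pattern, text, default=None):
    found = re.findall(pattern, text)
    return found[0] if found else default

# e.g. vid = first_match(r"videoId: '(\d*)'", page_info_text, default='')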
Example #11
 def parse_video_page_single_process(self,
                                     output_to_file=False,
                                     filepath=None,
                                     push_to_redis=False,
                                     output_to_es_raw=True,
                                     es_index="crawler-data-raw",
                                     doc_type="doc",
                                     output_to_es_register=False):
     key = 'iqiyi_video_page_html'
     result_list = []
     pid = os.getpid()
     while connect_with_redis.length_of_lst(key) > 0:
         video_page_html = connect_with_redis.retrieve_video_page_html_from_redis(
             platform=self.platform)
         soup = BeautifulSoup(video_page_html, 'html.parser')
         try:
             page_info = soup.find("div", {"is": "i71-play"})[":page-info"]
             # the attribute value uses single quotes; normalize for json.loads
             page_info = page_info.replace("'", '"')
             page_dic = json.loads(page_info)
         except Exception:
             page_dic = None
         if page_dic is not None:
             title = page_dic["tvName"]
             url = page_dic["pageUrl"]
             dura_str = page_dic["duration"]
             duration = trans_duration(dura_str)
             try:
                 releaser = page_dic["user"]["name"]
                 releaserUrl = page_dic["user"]["profileUrl"]
             except Exception:
                 releaser = None
                 releaserUrl = None
         else:
             title = None
             url = None
             duration = None
             releaser = None
             releaserUrl = None
         try:
             video_info = soup.find("div",
                                    {"is": "i71-play"})[":video-info"]
             video_dic = json.loads(video_info)
         except Exception:
             video_dic = None
         release_time = None
         tvId = None
         hot_idx = None
         if video_dic is not None:
             if title is None:
                 title = video_dic['name']
             if url is None:
                 url = video_dic['url']
             if releaser is None:
                 try:
                     releaser = video_dic["user"]["name"]
                     releaserUrl = video_dic["user"]["profileUrl"]
                 except Exception:
                     releaser = None
                     releaserUrl = None
             release_time = video_dic["firstPublishTime"]
             tvId = video_dic["tvId"]
             hot_idx_url = "https://pub.m.iqiyi.com/jp/h5/count/hotDisplay/?qipuId=%s" % tvId
             get_hot_idx = retry_get_url(hot_idx_url)
             hot_idx_str = get_hot_idx.text
             hot_idx = int(
                 re.findall(
                     r"\d+", ' '.join(re.findall(r'"count":\d+',
                                                 hot_idx_str)))[0])
         fetch_time = int(
             datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
         if releaser is None:
             try:
                 releaser = soup.find('span', {
                     'class': 'intro-iterm__txt'
                 }).text
             except Exception:
                 releaser = None
         video_page_dict = copy.deepcopy(self.video_data)
         video_page_dict["title"] = title
         video_page_dict["url"] = url
         video_page_dict["duration"] = duration
         video_page_dict["releaser"] = releaser
         video_page_dict["releaserUrl"] = releaserUrl
         video_page_dict["release_time"] = release_time
         video_page_dict["hot_idx"] = hot_idx
         video_page_dict["fetch_time"] = fetch_time
         video_page_dict["tvId"] = tvId
         result_list.append(video_page_dict)
         print(
             "platform: %s, action: parse video page, process_id: %s, has done: %s"
             % (self.platform, pid, len(result_list)))
         if len(result_list) >= 1000:
             output_result(result_Lst=result_list,
                           platform=self.platform,
                           output_to_file=output_to_file,
                           filepath=filepath,
                           push_to_redis=push_to_redis,
                           output_to_es_raw=output_to_es_raw,
                           es_index=es_index,
                           doc_type=doc_type,
                           output_to_es_register=output_to_es_register)
             result_list.clear()
     if result_list:
         output_result(result_Lst=result_list,
                       platform=self.platform,
                       output_to_file=output_to_file,
                       filepath=filepath,
                       push_to_redis=push_to_redis,
                       output_to_es_raw=output_to_es_raw,
                       es_index=es_index,
                       doc_type=doc_type,
                       output_to_es_register=output_to_es_register)
         result_list.clear()
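connect_with_redis is an assumed module wrapping a Redis list of raw page HTML. A minimal sketch with redis-py (key naming and connection details are assumptions):

import redis

_rds = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)

def length_of_lst(key):
    return _rds.llen(key)

def retrieve_video_page_html_from_redis(platform):
    # pops one stored page, e.g. from the 'iqiyi_video_page_html' list
    return _rds.lpop('%s_video_page_html' % platform)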