def video_page(self, task=0):
    """Drain the Redis queue of bilibili media URLs and scrape each page.

    Pops random hashes from ``rds_get`` (each holding at least a ``url``
    field), opens the page in a fresh Chrome instance, extracts title /
    play counts / rating / description, persists via ``parse_data`` and
    recurses into the per-video page, then deletes the Redis key.

    :param task: worker index, only used for log output.
    """
    self.driver = webdriver.Chrome(options=self.chrome_options)
    self.driver.maximize_window()
    # dbsize() > 0 means there are still queued pages to process
    has_data = rds_get.dbsize()
    while has_data:
        keys = rds_get.randomkey()
        res = rds_get.hgetall(keys)
        has_data = rds_get.dbsize()
        # print(has_data)
        # print(res)
        # self.driver.execute_script('window.open("%s");' % res["url"])
        # close the current window
        # self.driver.close()
        time.sleep(0.2)
        # for handle in self.driver.window_handles:
        #     self.driver.switch_to.window(handle)
        self.driver.get(res["url"])
        # scroll so lazy-loaded page elements render before we query them
        self.driver.execute_script("window.scrollBy(0,1000)")
        self.driver.implicitly_wait(10)
        title = self.driver.find_element_by_xpath(
            "//a[@class='media-title']").text
        print("task ", str(task), title)
        detail_page_url_obj = self.driver.find_element_by_xpath(
            "//a[@class='media-title']")
        detail_page_url = detail_page_url_obj.get_attribute("href")
        print(detail_page_url)
        time.sleep(1)
        discribe_info = self.driver.find_element_by_xpath(
            "//div[@class='media-count']")
        discribe_info_str = discribe_info.text
        # counts block looks like "X plays · Y danmaku · Z favorites";
        # raises ValueError if the separator count differs — TODO confirm
        play_count, barrage_count, favorite_count = discribe_info_str.split(
            " · ")
        # print(discribe_info_str)
        play_count = trans_play_count(play_count)
        if not play_count:
            # unparseable count: log raw text and store a placeholder
            print(discribe_info_str)
            play_count = "--"
        rate_list = self.driver.find_elements_by_xpath(
            "//div[@class='media-rating']")
        if rate_list:
            rate = rate_list[0].text
        else:
            rate = ""
        # print(rate)
        describe = self.driver.find_element_by_xpath(
            "//a[@class='media-desc webkit-ellipsis']").text
        # keep only the first line of the description
        describe = describe.split("\n", -1)[0]
        # print(describe)
        dic = {
            "describe": describe,
            "rate": rate,
            "play_count": play_count,
            "barrage_count": barrage_count,
            "favorite_count": favorite_count,
        }
        project_name = "bilibili_%s" % title
        self.parse_data(dic, project_name)
        self.one_video_page(title, res["url"])
        # only remove the key once the page has been fully processed
        rds_get.delete(keys)
def get_releaser_page(self, releaserUrl):
    """Fetch a releaser's profile page and extract basic account info.

    :param releaserUrl: URL of the releaser's profile page.
    :returns: dict with ``signature``, ``follower_num`` and
        ``signature_type``, or ``None`` when the follower block cannot
        be located on the page.
    """
    get_page = requests.get(releaserUrl, timeout=3)
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        follower_str = soup.find('li', {'class': 'snum'}).em.text
        follower_num = trans_play_count(follower_str)
        print('%s follower number is %s' % (releaserUrl, follower_num))
        signature = soup.find('div', {'class': 'user-desc'}).div.span.text
        try:
            # the "signature type" badge is optional on profile pages
            signature_type = soup.find('div', {
                'class': 'user-name'
            }).a.next_sibling["title"]
        except Exception:
            # narrowed from bare except: still best-effort, but no longer
            # swallows SystemExit/KeyboardInterrupt
            signature_type = ""
        dic = {
            "signature": signature,
            "follower_num": follower_num,
            "signature_type": signature_type,
        }
        print(dic)
        return dic
    except Exception:
        # page layout changed or the element is missing — treat as "no data"
        print("can't find followers")
        return None
def process_one_video(self, line):
    """Parse one video entry (a BeautifulSoup node) from a list page.

    Every field is extracted best-effort: a missing element yields the
    field's neutral default instead of aborting the whole row.

    :param line: BeautifulSoup tag for one video list item.
    :returns: a dict cloned from ``self.video_data`` with title, url,
        play_count, release_time, duration and fetch_time filled in.
    """
    video_info = copy.deepcopy(self.video_data)
    try:
        video_info['title'] = line.find('a', {'target': 'video'})['title']
    except Exception:
        video_info['title'] = None
    try:
        url = line.find('a', {'target': 'video'})['href']
        video_info['url'] = 'https:' + url
    except Exception:
        video_info['url'] = None
    try:
        play_count_str = line.find('span', {'class': 'v-num'}).text
        video_info['play_count'] = trans_play_count(play_count_str)
    except Exception:
        video_info['play_count'] = 0
        # logging.warning("can't get play_count at page %s" % video_info['url'])
    try:
        release_time_str = line.find('span', {
            'class': 'v-publishtime'
        }).text
        video_info['release_time'] = trans_strtime_to_timestamp(
            input_time=release_time_str, missing_year=True)
    except Exception:
        # BUG FIX: the original assigned the dead local
        # ``release_time_str = 0`` here, leaving 'release_time' unset;
        # default the dict field like the sibling branches do.
        video_info['release_time'] = 0
        # logging.warning("can't get release_time at page %s" % video_info['url'])
    try:
        dura_str = line.find('span', {'class': 'v-time'}).text
        video_info['duration'] = trans_duration(dura_str)
    except Exception:
        video_info['duration'] = 0
        # logging.warning("can't get duration at page %s" % video_info['url'])
    # epoch milliseconds of when this row was scraped
    fetch_time = int(time.time() * 1e3)
    video_info['fetch_time'] = fetch_time
    return video_info
def releaser_video_sum(self, releaserUrl):
    """Return the total number of videos shown on a releaser's page.

    Reads the "(N)" counter next to the list title and normalizes it
    through ``trans_play_count``.
    """
    resp = retry_get_url(releaserUrl)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    raw_count = soup.find('div', {'class': 'title'}).span.text
    # strip the surrounding parentheses and thousands separators
    for noise in ('(', ')', ','):
        raw_count = raw_count.replace(noise, '')
    return trans_play_count(raw_count)
def search_page(self, title=None, search_json=None, **kwargs):
    """Query the Toutiao mobile search API for videos matching *title*
    and push the parsed hot-search results straight to ES.

    The enormous URL below is a captured mobile-app request; its two
    ``format`` slots receive the url-quoted keyword and a randomized
    device_id.  NOTE(review): ``search_json`` and ``timestamp`` are
    accepted/computed but never used — presumably kept for interface
    parity with sibling crawlers; confirm before removing.
    """
    data_list = []
    timestamp = int(datetime.datetime.now().timestamp() * 1e3)
    # url-encode the keyword so it can be embedded in the query string
    title = urllib.parse.quote(title)
    headers = {
        "Accept-Encoding": "gzip",
        # "X-SS-REQ-TICKET": "1587102750860",
        "passport-sdk-version": "14",
        "sdk-version": "2",
        #"Cookie": "odin_tt=d5d96b2812637e9d20681530fbbe4d52e8f76ae1b6afa8c0a173260321611c507ac6eca10991b21fc4f023e94371d457df784f959e94db673ef29a5bd2137091; qh[360]=1; history=alrvlFic6pJZXJCTWBmSmZt6KW6mevZSz5LU3OJ7DEKX42Zw%2Bc84wMR3iYGBweFy3EzZsPcNTLyXWN1AvLYP8%2BQPMLFfEpUA8bo%2F7nNtYOK7xNwC4k3XmMHe5MtzSTiM48DluNr01dkNTDyXuHrApsi4ejkwsV%2BSmAPmSeXoMzDxXhKcAuIVrRfWAJnJJwA25fG1DoezvFBTZrzZeg6kT%2BwWSG7Gx3UJB5h4L%2FH4gXlVn%2BtAtkvFMQRcjpv%2B%2Be9TBib2S%2BwcYBuUn8xsYGK%2FJKMAkptgfXrDASaOS4yHQHJVPy6UOjDxXuI4BeJN26Fs6MDEcYn%2FEoMDAAAA%2F%2F8%3D; install_id=112651077855; ttreq=1$0b37d53ca5c301ce96959dc97a67886da420b294",
        # "X-Gorgon": "0401007140017aae019cc2020b1c48dbab0ba42839014487648a",
        #"X-Khronos": "1587102750",
        "Host": "is.snssdk.com",
        "Connection": "Keep-Alive",
        "User-Agent": "okhttp/3.10.0.1",
    }
    url = "https://is.snssdk.com/api/search/content/?os_api=23&device_type=oneplus+a5010&from_search_subtab=synthesis&manifest_version_code=7690&source=search_subtab_switch&offset=0&is_ttwebview=0&action_type&is_incognito=0&keyword_type&rom_version=23&app_name=news_article&format=json&version_name=7.6.9&ac=wifi&host_abi=armeabi-v7a&update_version_code=76909&channel=baidu_0411&is_native_req=1&loadId=1&longitude=116.40717530841052&isIncognito=0&plugin=2050&forum=1&latitude=39.904680919672145&language=zh&pd=video&cur_tab_title=search_tab&aid=13&dpi=270&qrecImprId&fetch_by_ttnet=1&count=10&plugin_enable=3&search_position&ab_group=100167%2C94569%2C102754&keyword={0}&scm_version=1.0.2.830&search_json=%7B%22comment_ids%22%3A%5B%5D%2C%22event_discussion%22%3A74123%2C%22event_impression%22%3A17270790%2C%22forum_id%22%3A1664181806902302%2C%22forum_recall_wtt%22%3A%5B1664190666034183%2C1664192273575943%2C1664184430218253%2C1664185769175051%2C1664184985139212%2C1664196237152267%2C1664186792648732%2C1664188755414019%2C1664187055838215%2C1664184182571022%2C1664185938950148%2C1664188041995268%2C1664188322863172%2C1664190185024520%2C1664185602828300%2C1664184276484099%2C1664188211399684%2C1664187870713868%2C1664184484958211%2C1664183864289288%2C1664186825487371%2C1664195548700686%2C1664186585780228%2C1664197296210947%2C1664188146725901%2C1664191748459523%5D%2C%22group_source%22%3Anull%2C%22hot_gid%22%3A6816255461172445703%2C%22log_pb%22%3A%7B%22cluster_type%22%3A%220%22%2C%22entrance_hotspot%22%3A%22channel%22%2C%22hot_board_cluster_id%22%3A%226816091697949180424%22%2C%22hot_board_impr_id%22%3A%22202004171352010100140411610B1A7741%22%2C%22location%22%3A%22hot_board%22%2C%22rank%22%3A%225%22%2C%22source%22%3A%22trending_tab%22%2C%22style_id%22%3A%2210005%22%7D%2C%22mix_stick_ids%22%3A%5B1664190666034183%2C1664192273575943%2C1664184430218253%2C1664185769175051%2C1664184985139212%2C1664196237152267%2C1664186792648732%2C1664188755414019%2C1664187055838215%2C1664184182571022%2C1664185938950148%2C1664188041995268%2C1664188322863172%2C1664190185024520%2C1664185602828300%2C1664184276484099%2C1664188211399684%2C1664187870713868%2C1664184484958211%2C1664183864289288%2C1664186825487371%2C1664195548700686%2C1664186585780228%2C1664197296210947%2C1664188146725901%2C1664191748459523%5D%2C%22stick_group_ids%22%3A%5B%5D%7D&device_platform=android&search_id&has_count=0&version_code=769&from=video&device_id={1}&resolution=1080*1920&os_version=6.0.1&device_brand=Oneplus&search_sug=1&qc_query".format(
        title, random.randint(69418800000, 69418899999))
    res = retry_get_url(url, headers=headers, timeout=5, proxies=3)
    page_text = res.json()
    for one_video in page_text["data"]:
        video_dic = {}
        try:
            video_dic['title'] = one_video.get('title')
            video_dic['url'] = one_video.get('display').get("info").get(
                "url")
            # the numeric user id is only available inside user_source_url
            releaser_id = re.findall("user_id=(\d+)",
                                     one_video.get('user_source_url'))[0]
            video_dic['releaser'] = one_video.get('media_name')
            video_dic[
                'releaserUrl'] = "https://www.toutiao.com/c/user/%s/" % releaser_id
            # API returns epoch seconds; stored as epoch milliseconds
            release_time = int(one_video.get('create_time'))
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['duration'] = int(one_video.get('video_duration'))
            video_dic['play_count'] = trans_play_count(
                one_video.get('play_effective_count'))
            video_dic['repost_count'] = 0
            video_dic['comment_count'] = one_video.get('comment_count')
            video_dic['favorite_count'] = one_video.get('digg_count')
            video_dic['fetch_time'] = int(
                datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
            video_dic['video_img'] = one_video.get('display').get(
                'self_info').get('image_url')
            video_dic['platform'] = "toutiao"
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except Exception as e:
            # any malformed row is logged and skipped, not fatal
            print(e)
            continue
        data_list.append(video_dic)
    output_result(
        result_Lst=data_list,
        platform=self.platform,
        output_to_es_raw=True,
    )
    data_list.clear()
def get_releaser_follower_num(self, releaserUrl):
    """Fetch a releaser page and return (follower_num, releaser_img).

    :param releaserUrl: URL of the releaser's profile page.
    :returns: tuple ``(follower_num, releaser_img)``, or ``None``
        (implicitly) when the follower element cannot be found.
    """
    # FIX: added a timeout — the original call could hang forever;
    # 3s matches get_releaser_page's setting.
    get_page = requests.get(releaserUrl, timeout=3)
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        follower_str = soup.find('li', {'class': 'snum'}).em.text
        follower_num = trans_play_count(follower_str)
        print('%s follower number is %s' % (releaserUrl, follower_num))
        releaser_img = self.get_releaser_image(data=soup)
        return follower_num, releaser_img
    except Exception:
        # narrowed from bare except; message typo "can't can" fixed to
        # match get_releaser_page's wording
        print("can't find followers")
def login(self):
    """Poll an ixigua live room forever, pushing the viewer count
    to the ``toutiao`` Redis list every 200 seconds.

    Runs until externally interrupted.
    """
    self.driver.get("https://live.ixigua.com/room/6831736034540456716/")
    while True:
        sampled_at = datetime.datetime.now()
        matches = self.driver.find_elements_by_xpath(
            "//span[@class='action-text v-middle live-skin-normal-text dp-i-block']"
        )
        if matches:
            viewer_count = trans_play_count(matches[0].text)
            print(viewer_count)
            # NOTE: key "menber" (sic) is what downstream consumers read
            payload = {
                "menber": viewer_count,
                "fetch_time": int(sampled_at.timestamp() * 1e3)
            }
            rds.rpush("toutiao", json.dumps(payload))
        time.sleep(200)
def login(self):
    """Poll a yangshipin live stream forever, pushing the audience
    count to the ``toutiao`` Redis list every 200 seconds.

    Runs until externally interrupted.
    """
    self.driver.get(
        "https://m.yangshipin.cn/video?type=2&vid=2004011401&pid=600036243&ptag=4_1.4.2.20898_wxf"
    )
    while True:
        sampled_at = datetime.datetime.now()
        matches = self.driver.find_elements_by_xpath(
            "//span[@class='p-video-intro-person']")
        if matches:
            audience = trans_play_count(matches[0].text)
            print(audience)
            # NOTE: key "menber" (sic) is what downstream consumers read
            payload = {
                "menber": audience,
                "fetch_time": int(sampled_at.timestamp() * 1e3)
            }
            rds.rpush("toutiao", json.dumps(payload))
        time.sleep(200)
def process_lst_page(resp):
    """Parse a video list page into [{'url': ..., 'play_count': ...}, ...].

    :param resp: raw HTML of the list page.
    :returns: one dict per ``li.list_item`` entry; play_count falls
        back to 0 when the counter text cannot be normalized.
    """
    soup = BeautifulSoup(resp, 'html.parser')
    parsed = []
    for item in soup.find_all('li', {'class': 'list_item'}):
        link = item.a['href']
        # the play counter sits in the second-to-last child fragment
        counter_soup = BeautifulSoup(list(item)[-2], 'html.parser')
        counter_text = counter_soup.find('span', {'class': 'num'}).text.replace(' ', '')
        try:
            plays = trans_play_count(counter_text)
        except:
            plays = 0
        parsed.append({'url': link, 'play_count': plays})
    return parsed
def login(self):
    """Poll a bilibili live room forever, pushing the viewer count to
    the ``bilibili`` Redis list every 200 seconds.

    Runs until externally interrupted.
    """
    self.driver.get(
        "https://live.bilibili.com/21686237?from=search&seid=1739181021049557638"
    )
    while True:
        sampled_at = datetime.datetime.now()
        matches = self.driver.find_elements_by_xpath(
            "//span[@class='action-text v-middle live-skin-normal-text dp-i-block']"
        )
        if matches:
            viewer_count = trans_play_count(matches[0].text)
            print(viewer_count)
            # NOTE: key "menber" (sic) is what downstream consumers read
            payload = {
                "menber": viewer_count,
                "fetch_time": int(sampled_at.timestamp() * 1e3)
            }
            rds.rpush("bilibili", json.dumps(payload))
        time.sleep(200)
def releaser_page_pc(self, releaserUrl,
                     output_to_file=False,
                     filepath=None,
                     releaser_page_num_max=10000,
                     output_to_es_raw=False,
                     es_index=None,
                     doc_type=None,
                     output_to_es_register=False,
                     push_to_redis=False,
                     proxies_num=None):
    """
    get video info from api instead of web page html
    the most scroll page is 1000

    Generator: yields one video dict per post fetched from the
    live.kuaishou.com GraphQL feed, paging via ``pcursor`` until the
    API signals "no_more" or the page limit is reached.
    NOTE(review): most keyword args (output_to_file, filepath, es_*,
    push_to_redis) are unused here — presumably a shared crawler
    interface; confirm.
    """
    releaser = ""
    user_id = "153512{0}".format(random.randint(1000, 9000))
    proxies = get_proxy(proxies_num)
    headers = {
        "accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "content-type": "application/json",
        #"Cookie": "did=web_f6e24105905d4b0381d36220ad9ccda0 ;userId=%s" % (user_id),
        # hard-coded session cookie captured from a logged-in browser
        "Cookie": "client_key=65890b29; clientid=3; did=web_f6e24105905d4b0381d36220ad9ccda0; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1574822912; didv=1583802670821; userId=1535125321; kuaishou.live.bfb1s=477cb0011daca84b36b3a4676857e5a1; userId=1535125321; kuaishou.live.web_st=ChRrdWFpc2hvdS5saXZlLndlYi5zdBKgAfPHcR6LVRx3FRHYIe2X1-gdxEI8d1iJJnM7rTZaKtVo-54m5Bolw__9dpYJoPwvA5I2Qw_7Dgl3_8N_jicpbkpT__u6ZIxcSGC3hWmVXGufsv7zVvUALqMLknpSPVoGXlt8GFBIh4LVeEsST-ghGGWB5gpAEkU2nxVB2pXUREuQ6PEh9cc_bjoODqzcROsKFGyAYVg81qp9tnJesa1oODUaEk2hY_LIikBot7IUVtJ3ydB6KCIgUeaa89k7DGhBoXcPwlWtSUp4VbGECgvvOeIaTNFMoScoBTAB; kuaishou.live.web_ph=c41f68048b583530bfa89ab7150b24df445c",
        "Host": "live.kuaishou.com",
        "Origin": "https://live.kuaishou.com",
        "Referer": releaserUrl,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    count = 1
    # has_more = True
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    pcursor = ""
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    # page until API says "no_more" or limits are hit (hard cap 1000)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        time.sleep(random.randint(1, 2))
        # self.get_cookies_and_font(releaserUrl)
        url_dic = {
            "operationName": "publicFeedsQuery",
            "variables": {
                "principalId": releaser_id,
                "pcursor": pcursor,
                "count": 100
            },
            "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n  publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n    pcursor\n    live {\n      user {\n        id\n        avatar\n        name\n        __typename\n      }\n      watchingCount\n      poster\n      coverUrl\n      caption\n      id\n      playUrls {\n        quality\n        url\n        __typename\n      }\n      quality\n      gameInfo {\n        category\n        name\n        pubgSurvival\n        type\n        kingHero\n        __typename\n      }\n      hasRedPack\n      liveGuess\n      expTag\n      __typename\n    }\n    list {\n      id\n      thumbnailUrl\n      poster\n      workType\n      type\n      useVideoPlayer\n      imgUrls\n      imgSizes\n      magicFace\n      musicName\n      caption\n      location\n      liked\n      onlyFollowerCanComment\n      relativeHeight\n      timestamp\n      width\n      height\n      counts {\n        displayView\n        displayLike\n        displayComment\n        __typename\n      }\n      user {\n        id\n        eid\n        name\n        avatar\n        __typename\n      }\n      expTag\n      __typename\n    }\n    __typename\n  }\n}\n"
        }
        api_url = 'https://live.kuaishou.com/m_graphql'
        try:
            if proxies:
                get_page = requests.post(api_url,
                                         headers=headers,
                                         json=url_dic,
                                         timeout=5,
                                         proxies=proxies)
            else:
                get_page = requests.post(api_url,
                                         headers=headers,
                                         json=url_dic,
                                         timeout=5)
        except:
            # network/proxy failure: rotate proxy and retry same cursor
            proxies = get_proxy(proxies_num)
            continue
        #print(get_page.content)
        page_dic = get_page.json()
        data_list = page_dic.get("data").get("publicFeeds").get("list")
        #print(data_list)
        if data_list == []:
            # empty page: could be real end-of-feed OR a blocked proxy;
            # rotate proxy and give up after 3 consecutive empties
            print("no more data at releaser: %s page: %s " %
                  (releaser_id, count))
            # self.loginObj.delete_cookies(self.cookie_dic)
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("data").get("publicFeeds").get(
                "pcursor")
            print("get data at releaser: %s page: %s" %
                  (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('caption')
            # NOTE(review): this rebinds the outer releaser_id to the
            # post author's eid — intentional? verify
            releaser_id = info_dic.get('user').get("eid")
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (
                releaser_id, info_dic.get('id'))
            video_dic['releaser'] = info_dic.get('user').get("name")
            video_dic['release_time'] = info_dic.get('timestamp')
            video_dic['play_count'] = trans_play_count(
                info_dic.get('counts').get("displayView"))
            video_dic['comment_count'] = trans_play_count(
                info_dic.get('counts').get("displayComment"))
            video_dic['favorite_count'] = trans_play_count(
                info_dic.get('counts').get("displayLike"))
            video_dic['video_id'] = info_dic.get('id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % (releaser_id)
            video_dic[
                'releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
            video_dic['video_img'] = self.get_video_image(info_dic)
            # trans_play_count returns False on unparseable counts:
            # drop such rows rather than store bad numbers
            if video_dic['play_count'] is False or video_dic[
                    'comment_count'] is False or video_dic[
                        'favorite_count'] is False:
                print(info_dic)
                continue
            else:
                yield video_dic
def releaser_page_pc(self, releaserUrl,
                     output_to_file=False,
                     filepath=None,
                     releaser_page_num_max=10000,
                     output_to_es_raw=False,
                     es_index=None,
                     doc_type=None,
                     output_to_es_register=False,
                     push_to_redis=False,
                     proxies_num=None):
    """
    get video info from api instead of web page html
    the most scroll page is 1000

    Generator over a releaser's kuaishou GraphQL feed.
    NOTE(review): near-duplicate of the other releaser_page_pc above;
    this variant uses a randomized anonymous cookie instead of a
    captured logged-in session — consider consolidating.
    """
    releaser = ""
    # randomized pseudo user id embedded in the anonymous cookie
    user_id = "153512{0}".format(random.randint(1000, 9000))
    proxies = get_proxy(proxies_num)
    headers = {
        "accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Cookie": "did=web_504e72386a69c6d6172f1457b591415c ;userId=%s" % (user_id),
        "Host": "live.kuaishou.com",
        "Origin": "https://live.kuaishou.com",
        "Referer": releaserUrl,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    count = 1
    # has_more = True
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    pcursor = ""
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    # page until API says "no_more" or limits are hit (hard cap 1000)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        time.sleep(random.randint(1, 2))
        # self.get_cookies_and_font(releaserUrl)
        url_dic = {
            "operationName": "publicFeedsQuery",
            "variables": {
                "principalId": releaser_id,
                "pcursor": pcursor,
                "count": 100
            },
            "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n  publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n    pcursor\n    live {\n      user {\n        id\n        avatar\n        name\n        __typename\n      }\n      watchingCount\n      poster\n      coverUrl\n      caption\n      id\n      playUrls {\n        quality\n        url\n        __typename\n      }\n      quality\n      gameInfo {\n        category\n        name\n        pubgSurvival\n        type\n        kingHero\n        __typename\n      }\n      hasRedPack\n      liveGuess\n      expTag\n      __typename\n    }\n    list {\n      id\n      thumbnailUrl\n      poster\n      workType\n      type\n      useVideoPlayer\n      imgUrls\n      imgSizes\n      magicFace\n      musicName\n      caption\n      location\n      liked\n      onlyFollowerCanComment\n      relativeHeight\n      timestamp\n      width\n      height\n      counts {\n        displayView\n        displayLike\n        displayComment\n        __typename\n      }\n      user {\n        id\n        eid\n        name\n        avatar\n        __typename\n      }\n      expTag\n      __typename\n    }\n    __typename\n  }\n}\n"
        }
        api_url = 'https://live.kuaishou.com/m_graphql'
        try:
            if proxies:
                get_page = requests.post(api_url,
                                         headers=headers,
                                         json=url_dic,
                                         timeout=5,
                                         proxies=proxies)
            else:
                get_page = requests.post(api_url,
                                         headers=headers,
                                         json=url_dic,
                                         timeout=5)
        except:
            # network/proxy failure: rotate proxy and retry same cursor
            proxies = get_proxy(proxies_num)
            continue
        #print(get_page.content)
        page_dic = get_page.json()
        data_list = page_dic.get("data").get("publicFeeds").get("list")
        #print(data_list)
        if data_list == []:
            # empty page: real end-of-feed or blocked proxy; rotate and
            # give up after 3 consecutive empties
            print("no more data at releaser: %s page: %s " %
                  (releaser_id, count))
            # self.loginObj.delete_cookies(self.cookie_dic)
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("data").get("publicFeeds").get(
                "pcursor")
            print("get data at releaser: %s page: %s" %
                  (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('caption')
            # NOTE(review): rebinds releaser_id to the post author's eid
            releaser_id = info_dic.get('user').get("eid")
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (
                releaser_id, info_dic.get('id'))
            video_dic['releaser'] = info_dic.get('user').get("name")
            video_dic['release_time'] = info_dic.get('timestamp')
            video_dic['play_count'] = trans_play_count(
                info_dic.get('counts').get("displayView"))
            video_dic['comment_count'] = trans_play_count(
                info_dic.get('counts').get("displayComment"))
            video_dic['favorite_count'] = trans_play_count(
                info_dic.get('counts').get("displayLike"))
            video_dic['video_id'] = info_dic.get('id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % (releaser_id)
            video_dic[
                'releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
            video_dic['video_img'] = self.get_video_image(info_dic)
            # drop rows whose counts could not be normalized
            if video_dic['play_count'] is False or video_dic[
                    'comment_count'] is False or video_dic[
                        'favorite_count'] is False:
                print(info_dic)
                continue
            else:
                yield video_dic
def releaser_page_web(self,
                      releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=5000,
                      output_to_es_raw=False,
                      es_index=None,
                      doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None,
                      **kwargs):
    """
    get video info from api instead of web page html
    the most scroll page is 1000
    #

    Generator over a releaser's feed via the c.kuaishou.com mobile
    share API (18 items per request), yielding one video dict per post.
    """
    releaser = ""
    count = 1
    # has_more = True
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = None
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        # randomized userId makes each run look like a distinct visitor
        "Cookie": "clientid=3; did=web_549cd4825914642449695ddccf5bfa99; client_key=65890b29; userId=%s; didv=1589785882000; sid=a94d55c86bbbccd28b8e2a8d" % random.randint(861446000, 861449800),
        "Host": "c.kuaishou.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        "Origin": "https://c.kuaishou.com",
        "Referer": "https://c.kuaishou.com/fw/user/%s?fid=1535125321&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=176513752168&docABKey=share_textid_profile&shareToken=X6btjdy2izGxVqQ_A&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&appType=21&shareObjectId=1478754458&shareUrlOpened=0" % releaser_id,
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    # cookies = self.get_web_url_cookies(headers["Referer"])
    proxies = get_proxy(proxies_num)
    # page until API says "no_more" or limits are hit (hard cap 1000)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://c.kuaishou.com/rest/kd/feed/profile",
                    json={
                        "eid": releaser_id,
                        "count": 18,
                        "pcursor": pcursor
                    },
                    headers=headers,
                    timeout=10,
                    proxies=proxies)
            else:
                get_page = requests.post(
                    "https://c.kuaishou.com/rest/kd/feed/profile",
                    json={
                        "eid": releaser_id,
                        "count": 18,
                        "pcursor": pcursor
                    },
                    headers=headers,
                    timeout=10)
        except:
            # network/proxy failure: rotate proxy and retry same cursor
            proxies = get_proxy(proxies_num)
            continue
        # print(get_page.content)
        time.sleep(random.randint(3, 4))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        # print(data_list)
        # if not data_list:
        #     get_page = requests.post("https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
        #                              json={"eid": releaser_id, "count":50, "pcursor": pcursor},
        #                              headers=headers, timeout=10)
        #     page_dic = get_page.json()
        #     data_list = page_dic.get("feeds")
        #     time.sleep(2)
        if not data_list:
            # escalating retry: after 3 empties drop the proxy,
            # after 5 give up on this releaser
            print("no more data at releaser: %s page: %s " %
                  (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
            if retry_time > 5:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" %
                  (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id_ = info_dic.get("userEid")
                # the photo id is only present inside share_info's
                # query-string parameters
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic[
                    'url'] = "https://live.kuaishou.com/u/%s/%s" % (
                        releaser_id_, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(
                    info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(
                    info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(
                    info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(
                    info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    # ext_params.video is in milliseconds
                    video_dic['duration'] = int(
                        info_dic.get("ext_params").get("video") / 1000)
                except:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % (
                    releaser_id_)
                video_dic[
                    'releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id_
                video_dic['video_img'] = info_dic.get(
                    "coverUrls")[0].get("url")
            except Exception as e:
                # malformed row: log and skip
                print(e)
                continue
            # drop rows whose counts could not be normalized
            if video_dic['play_count'] is False or video_dic[
                    'comment_count'] is False or video_dic[
                        'favorite_count'] is False:
                print(info_dic)
                continue
            else:
                yield video_dic
def key_customer(self,
                 releaserUrl,
                 releaser_page_num_max=1000,
                 output_to_es_raw=False,
                 es_index='crawler-data-raw',
                 doc_type='doc'):
    """
    input releaserUrl must be strict as
    https://id.tudou.com/i/UMjc5MzI5NDA==/videos?
    end with /videos
    otherwise when scrolling it will make mistakes

    Crawls every list page of a tudou releaser and ships batches of
    parsed video dicts through ``output_result``.
    """
    releaser_id = self.get_releaser_id(releaserUrl)
    print("working on releaser: %s" % releaser_id)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    result_lst = []
    get_page = retry_get_url(releaserUrl)
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        releaser = soup.find('div', {'class': 'user-name'}).a.text
    except Exception:
        releaser = None
    try:
        total_video_num_str = soup.find('div', {
            'class': 'title'
        }).span.text
        total_video_num = total_video_num_str.replace('(', '').replace(
            ')', '').replace(',', '')
        total_video_num = trans_play_count(total_video_num)
    except Exception:
        # BUG FIX: the original left total_video_num unbound here and
        # crashed with NameError below; default to 0 so only the first
        # page is processed.
        print(releaserUrl)
        total_video_num = 0
    # 50 videos per list page -> ceil division for the page count
    if total_video_num % 50 == 0:
        total_page_num = int(total_video_num / 50)
    else:
        total_page_num = int(total_video_num / 50) + 1
    if releaser_page_num_max > total_page_num:
        releaser_page_num_max = total_page_num
    print("releaser page num max is %s" % releaser_page_num_max)
    video_lst = soup.find_all('div', {'class': 'v'})
    for line in video_lst:
        video_info = self.process_one_video(line)
        video_info['releaserUrl'] = releaserUrl
        video_info['releaser'] = releaser
        result_lst.append(video_info)
    if releaser_page_num_max >= 2:
        page_num = 2
        try:
            partial_releaserUrl = soup.find('li', {
                'class': 'next'
            }).a['href']
            new_releaserUrl = 'https://id.tudou.com%s' % partial_releaserUrl
        except Exception:
            # BUG FIX: the original printed the still-unbound
            # new_releaserUrl (NameError); fall back to the canonical
            # page-2 URL pattern used below.
            new_releaserUrl = (
                'https://id.tudou.com/i/%s/videos?order=1&page=2' %
                releaser_id)
            print(new_releaserUrl)
        while page_num <= releaser_page_num_max:
            get_page = retry_get_url(new_releaserUrl)
            get_page.encoding = 'utf-8'
            page = get_page.text
            soup = BeautifulSoup(page, 'html.parser')
            if page_num != releaser_page_num_max:
                try:
                    new_releaserUrl = 'https://id.tudou.com' + soup.find(
                        'li', {'class': 'next'}).a['href']
                except Exception:
                    # no "next" link — reconstruct the URL by pattern
                    new_releaserUrl = (
                        'https://id.tudou.com/i/%s/videos?order=1&page=%s'
                        % (releaser_id, page_num))
            video_lst = soup.find_all('div', {'class': 'v'})
            for line in video_lst:
                video_info = self.process_one_video(line)
                video_info['releaserUrl'] = releaserUrl
                video_info['releaser'] = releaser
                result_lst.append(video_info)
            print('get page %s list length is %s' %
                  (page_num, len(result_lst)))
            page_num += 1
            # flush each page's batch immediately
            output_result(result_Lst=result_lst,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type)
            result_lst.clear()
    if result_lst != []:
        # flush whatever remains (single-page releasers)
        output_result(result_Lst=result_lst,
                      platform=self.platform,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type)
        result_lst.clear()
def list_page(self,
              url=None,
              video_list_xpath=None,
              play_count_xpath=None,
              if_free=None,
              title=None,
              describe=None,
              project_tag_xpath=None,
              provider=None,
              year=None,
              next_page_xpath=None,
              roll=None,
              project_tag=None,
              style_tags=None):
    """Iterate a bilibili media list in the already-open driver, parsing
    each card and clicking "next page" until no next button remains.

    Each card's text splits into 2-4 newline-separated fields
    (play count, optional pay flag, title, optional video count).
    NOTE(review): several params (url, play_count_xpath, if_free, title,
    describe, project_tag_xpath, provider, roll) are unused — presumably
    a shared crawler interface; confirm before pruning.
    """
    # js = "var q=document.documentElement.scrollTop=%s" % roll
    # self.driver.execute_script(js)
    while True:
        # self.driver.get(url)
        # self.driver.implicitly_wait(5)
        time.sleep(0.2)
        # self.driver.execute_script(js)
        try:
            next_page_obj = self.driver.find_elements_by_xpath(
                next_page_xpath)
        except:
            # transient driver error: retry the whole iteration
            # self.driver.get(self.driver.current_url)
            continue
        vidoe_list_obj = self.driver.find_elements_by_xpath(
            video_list_xpath)
        try:
            for one_video in vidoe_list_obj:
                if_pay = ""
                str_res_list = one_video.text.split("\n")
                # card layouts vary: 4 fields include a pay badge,
                # 3 fields omit it, 2 fields omit the video count too
                if len(str_res_list) == 4:
                    play_count_str, if_pay, title_str, video_count_str = str_res_list
                elif len(str_res_list) == 3:
                    play_count_str, title_str, video_count_str = str_res_list
                else:
                    play_count_str, title_str = str_res_list
                    video_count_str = ""
                play_count = trans_play_count(play_count_str)
                project_name = "bilibili_%s" % title_str
                url = one_video.find_element_by_xpath(
                    "./a[1]").get_attribute('href')
                data_dic = {
                    "play_count": play_count,
                    "url": url,
                    "title": title_str,
                    "video_count": video_count_str,
                    "if_pay": if_pay
                }
                # optional enrichment fields supplied by the caller
                if style_tags:
                    temp_dic = {}
                    temp_dic["style_tags"] = style_tags
                    data_dic.update(temp_dic)
                if project_tag:
                    temp_dic = {}
                    temp_dic["project_tags"] = project_tag
                    data_dic.update(temp_dic)
                if year:
                    temp_dic = {}
                    temp_dic["year"] = year
                    data_dic.update(temp_dic)
                self.parse_data(data_dic, project_name)
            if next_page_obj:
                # next_page_obj[0].click()
                # ActionChains click works even when the button is
                # partially covered by overlays
                action = ActionChains(self.driver)
                action.click(next_page_obj[0]).perform()
            else:
                # no next button: finished the whole list
                # self.driver.close()
                break
        except Exception as e:
            print(e)
            self.driver.close()
def releaser_page_web(self,
                      releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=5000,
                      output_to_es_raw=False,
                      es_index=None,
                      doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None,
                      **kwargs):
    """
    get video info from api instead of web page html
    the most scroll page is 1000
    #

    Generator over a releaser's feed via the
    kpfbeijing.m.chenzhongtech.com share API (100 items per request).
    NOTE(review): near-duplicate of the c.kuaishou.com variant above;
    only the host, cookie and batch size differ — consider merging.
    """
    releaser = ""
    count = 1
    # has_more = True
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = None
    headers = {
        "Accept": "application/json",
        #"Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        #"Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        # randomized did/sid fragments to vary the apparent visitor
        "Cookie": "did=web_c7c42d62cbb24{0}4d1ca5ffca052c3; didv=1582271776000; sid=e12d2ec74ec7af3a24d{1}cd6;pua5rv=1"
        .format(random.randint(1000, 9000), random.randint(20, 99)),
        "Host": "kpfbeijing.m.chenzhongtech.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        # "Origin": "https://v.kuaishou.com",
        "Origin": "https://kpfbeijing.m.chenzhongtech.com",
        "Referer": "https://kpfbeijing.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (releaser_id, random.randint(1000, 9800)),
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36",
    }
    # cookies = self.get_web_url_cookies(headers["Referer"])
    proxies = get_proxy(proxies_num)
    # page until API says "no_more" or limits are hit (hard cap 1000)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={
                        "eid": releaser_id,
                        "count": 100,
                        "pcursor": pcursor
                    },
                    headers=headers,
                    timeout=10,
                    proxies=proxies)
            else:
                get_page = requests.post(
                    "https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={
                        "eid": releaser_id,
                        "count": 100,
                        "pcursor": pcursor
                    },
                    headers=headers,
                    timeout=10)
        except:
            # network/proxy failure: rotate proxy and retry same cursor
            proxies = get_proxy(proxies_num)
            continue
        # print(get_page.content)
        time.sleep(random.randint(3, 4))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        # print(data_list)
        # if not data_list:
        #     get_page = requests.post("https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
        #                              json={"eid": releaser_id, "count":50, "pcursor": pcursor},
        #                              headers=headers, timeout=10)
        #     page_dic = get_page.json()
        #     data_list = page_dic.get("feeds")
        #     time.sleep(2)
        if not data_list:
            # escalating retry: after 3 empties drop the proxy,
            # after 5 give up on this releaser
            print("no more data at releaser: %s page: %s " %
                  (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
            if retry_time > 5:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" %
                  (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id_ = info_dic.get("userEid")
                # the photo id is only present inside share_info's
                # query-string parameters
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic[
                    'url'] = "https://live.kuaishou.com/u/%s/%s" % (
                        releaser_id_, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(
                    info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(
                    info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(
                    info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(
                    info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    # ext_params.video is in milliseconds
                    video_dic['duration'] = int(
                        info_dic.get("ext_params").get("video") / 1000)
                except:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % (
                    releaser_id_)
                video_dic[
                    'releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id_
                video_dic['video_img'] = info_dic.get(
                    "coverUrls")[0].get("url")
            except Exception as e:
                # malformed row: log and skip
                print(e)
                continue
            # drop rows whose counts could not be normalized
            if video_dic['play_count'] is False or video_dic[
                    'comment_count'] is False or video_dic[
                        'favorite_count'] is False:
                print(info_dic)
                continue
            else:
                yield video_dic
def releaser_page_web(self,
                      releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      es_index=None,
                      doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None,
                      **kwargs):
    """Crawl a Kuaishou releaser's video list via the mobile share API.

    Pages through ``/rest/kd/feed/profile`` on the kpfshanghai mirror
    (at most ``releaser_page_num_max`` pages, hard-capped at 1000, or
    until the API signals ``pcursor == "no_more"``), maps every feed
    entry onto a copy of ``self.video_data``, and flushes results in
    batches of 100 via ``output_result``.

    Args:
        releaserUrl: profile URL; only the releaser id is extracted.
        releaser_page_num_max: maximum number of API pages to fetch.
        output_to_file / filepath / output_to_es_raw / es_index /
        doc_type / output_to_es_register: passed through to
            ``output_result`` to select the sink for each batch.
        push_to_redis: accepted for signature parity; unused here.
        proxies_num: when truthy, requests go through ``get_proxy``
            proxies; empty responses rotate the proxy.

    Returns:
        The residual ``result_list`` (normally empty, since batches are
        flushed and cleared before returning).
    """

    def _parse_item(info_dic, owner_id):
        """Map one API feed entry onto a fresh copy of self.video_data.

        May raise (e.g. on missing keys); the caller catches and skips.
        """
        video_dic = copy.deepcopy(self.video_data)
        video_dic['title'] = info_dic.get('caption')
        # The photo id is embedded in the share URL's query string.
        # Robustness fix: default to "" when no photoId= token exists
        # (the original leaked the last arbitrary &-fragment).
        photoid = ""
        for token in info_dic.get('share_info').split("&"):
            if "photoId=" in token:
                photoid = token.replace("photoId=", "")
                break
        video_dic['video_id'] = photoid
        video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (
            owner_id, photoid)
        video_dic['release_time'] = info_dic.get('timestamp')
        video_dic['releaser'] = info_dic.get("userName")
        video_dic['play_count'] = trans_play_count(
            info_dic.get("viewCount"))
        video_dic['comment_count'] = trans_play_count(
            info_dic.get("commentCount"))
        video_dic['favorite_count'] = trans_play_count(
            info_dic.get('likeCount'))
        video_dic['repost_count'] = trans_play_count(
            info_dic.get('forwardCount'))
        video_dic['fetch_time'] = int(time.time() * 1e3)
        try:
            # ext_params["video"] appears to be a duration in
            # milliseconds — TODO confirm against the API.
            video_dic['duration'] = int(
                info_dic.get("ext_params").get("video") / 1000)
        except Exception:
            video_dic['duration'] = 0
            print("duration error")
        video_dic['releaser_id_str'] = "kwai_%s" % owner_id
        video_dic['releaserUrl'] = (
            'https://live.kuaishou.com/profile/%s' % owner_id)
        video_dic['video_img'] = info_dic.get(
            "coverUrls")[0].get("url")
        return video_dic

    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = 0
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        "Cookie": "did=web_790b7bcefe7347c5937a39d34c49f7ed; didv=1583150714000; sid=ab0c3a5497ab3c8fb73c8bef",
        "Host": "kpfshanghai.m.chenzhongtech.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        "Origin": "https://kpfshanghai.m.chenzhongtech.com",
        # The shareId suffix is randomized to mimic distinct share links.
        "Referer": "https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (releaser_id, random.randint(1000, 9800)),
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    proxies = get_proxy(proxies_num)
    api_url = "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile"
    while (count <= releaser_page_num_max and count <= 1000
           and pcursor != "no_more"):
        payload = {"eid": releaser_id, "count": 100, "pcursor": pcursor}
        try:
            if proxies_num:
                get_page = requests.post(api_url,
                                         json=payload,
                                         headers=headers,
                                         timeout=10,
                                         proxies=proxies)
            else:
                get_page = requests.post(api_url,
                                         json=payload,
                                         headers=headers,
                                         timeout=10)
        except Exception:
            # Network failure: rotate the proxy and retry the same page.
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(random.randint(3, 5))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        if not data_list:
            # Empty page: rotate proxy; after 3 misses drop proxies
            # entirely, after 5 give up on this releaser.
            print("no more data at releaser: %s page: %s " %
                  (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
                print("no proxies")
            if retry_time > 5:
                pcursor = "no_more"
            continue
        pcursor = page_dic.get("pcursor")
        print("get data at releaser: %s page: %s" % (releaser_id, count))
        count += 1
        for info_dic in data_list:
            try:
                # BUG FIX: the original reassigned `releaser_id` here,
                # corrupting the "eid" used by every later page request.
                # The per-item owner id now stays in its own variable.
                video_dic = _parse_item(info_dic, info_dic.get("userEid"))
            except Exception as e:
                print(e)
                continue
            # trans_play_count returns False on unparsable counts; skip
            # entries with any unparsable metric.
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                print(len(result_list))
                result_list.clear()
    if result_list != []:
        # Flush whatever is left over after the last page.
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        print(len(result_list))
        result_list.clear()
    return result_list