def get_releaser_follower_num_web(self, releaserUrl):
    proxies = get_proxy(1)
    releaser_id = self.get_releaser_id(releaserUrl)
    url = "https://sv.baidu.com/haokan/api?cmd=baijia/authorInfo&log=vhk&tn=1008621v&ctn=1008621v&imei=&cuid=51BF00514510A03B32E6CA9D7443D8F8|504550857697800&bdboxcuid=&os=android&osbranch=a0&ua=810_1440_270&ut=MI%20NOTE%203_6.0.1_23_Xiaomi&apiv=4.6.0.0&appv=414011&version=4.14.1.10&life=1555296294&clife=1558350548&hid=02112F128209DD6BAF39CA37DE9C05E6&imsi=0&network=1&location={%22prov%22:%22%22,%22city%22:%22%22,%22county%22:%22%22,%22street%22:%22%22,%22latitude%22:39.911017,%22longitude%22:116.413562}&sids=1957_2-2193_3-2230_4-2320_1-2326_2-2353_1-2359_3-2376_1-2391_1-2433_4-2436_5-2438_1-2442_1-2443_2-2452_1-2457_2-2470_1-2480_2-2511_1-2525_4-2529_1-2537_1-2538_1-2540_1-2555_2-2563_1-2565_2-2568_1-2574_1-2575_1-2577_1-2582_1"
    headers = {
        "Host": "sv.baidu.com",
        "Connection": "keep-alive",
        "Content-Length": "60",
        "Charset": "UTF-8",
        "User-Agent": 'Mozilla/5.0 (Linux; Android 6.0.1; MI NOTE 3 Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36 haokan/4.14.1.10 (Baidu; P1 6.0.1)/imoaiX_32_1.0.6_3+ETON+IM/1008621v/51BF00514520A03B32E6CA9D7443D8F8%7C504550857697800/1/4.14.1.10/414011/1',
        "X-Bfe-Quic": "enable=1",
        "XRAY-REQ-FUNC-ST-DNS": "okHttp;1558350575755;0",
        "XRAY-TRACEID": "be54291d-c13a-4a88-8337-9e70ad75d7d8",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip, deflate"
    }
    # the api expects the command name itself as the form field key
    post_dic = {"baijia/authorInfo": "method=get&app_id=%s" % releaser_id}
    get_page = requests.post(url, data=post_dic, headers=headers, proxies=proxies)
    res = get_page.json()
    try:
        follower_num = res.get("baijia/authorInfo").get("data").get("fansCnt")
        print('%s follower number is %s' % (releaserUrl, follower_num))
        return follower_num
    except Exception:
        print("can't get followers")

def get_releaser_name(self, releaserUrl):
    """
    Because releaser_page can't get the releaser name from the api,
    this helper scrapes it from the web page instead.
    Posted by yucheng fang.
    """
    get_page = requests.get(releaserUrl, proxies=get_proxy(1))
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    releaser = None
    try:
        releaser = soup.find('div', {'class': 'user-name'}).a.text
    except Exception:
        print("can't get releaser name at soup.find('div', {'class': 'user-name'}).a.text")
    if releaser is not None:
        print("got releaser name at soup.find('div', {'class': 'user-name'}).a.text")
        return releaser
    # fall back to the avatar block when the user-name block is missing
    try:
        releaser = soup.find('div', {'class': 'head-avatar'}).a['title']
    except Exception:
        print("can't get releaser name at soup.find('div', {'class': 'head-avatar'}).a['title']")
    if releaser is not None:
        return releaser
    print("can't get releaser name at soup.find('div', {'class': 'head-avatar'}).a['title']")
    return None

def get_releaser_follower_num(self, releaserUrl):
    proxies = get_proxy(1)
    releaser_id = self.get_releaser_id(releaserUrl)
    url = "http://c.m.163.com/nc/subscribe/v2/topic/%s.html" % releaser_id
    res = requests.get(url, headers=self.headers, proxies=proxies)
    res_json = res.json()
    try:
        follower_num = self.forllower_num_to_int(res_json.get("subscribe_info").get("subnum"))
        releaser_img_url = self.get_releaser_image(data=res_json)
        print('%s follower number is %s' % (releaserUrl, follower_num))
        return follower_num, releaser_img_url
    except Exception:
        print("can't get followers")

def get_releaser_image(self, releaserUrl=None, data=None):
    if releaserUrl:
        proxies = get_proxy(proxies_num=1)
        releaser_id = self.get_releaser_id(releaserUrl)
        url = "http://c.m.163.com/nc/subscribe/v2/topic/%s.html" % releaser_id
        res = requests.get(url, headers=self.headers, proxies=proxies)
        res_json = res.json()
        try:
            return res_json.get("subscribe_info").get("topic_icons")
        except Exception:
            print("can't get releaser_img")
    else:
        return data.get("subscribe_info").get("topic_icons")

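# Usage sketch (illustrative): get_releaser_image works in two modes. Pass a
# releaserUrl to fetch the topic page itself, or pass `data` to reuse a response
# dict fetched elsewhere (as get_releaser_follower_num above does). `crawler` is
# a hypothetical instance of this class and the topic id is a placeholder.
#
#   img = crawler.get_releaser_image(releaserUrl="https://c.m.163.com/news/sub/T1234567890123.html")
#   img = crawler.get_releaser_image(data=res_json)  # res_json from a prior request
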
def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
    retry_c = 0
    while retry_c < retrys:
        try:
            if proxies:
                proxies_dic = get_proxy(proxies)
                if not proxies_dic:
                    get_resp = requests.get(url, timeout=timeout, **kwargs)
                else:
                    get_resp = requests.get(url, proxies=proxies_dic, timeout=timeout, **kwargs)
            else:
                get_resp = requests.get(url, timeout=timeout, **kwargs)
            return get_resp
        except Exception as e:
            retry_c += 1
            time.sleep(1)
            print(e)
    print('Failed to get page %s after %d retries, %s'
          % (url, retrys, datetime.datetime.now()))
    return None

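# A minimal usage sketch for retry_get_url, assuming get_proxy(n) returns a
# requests-style proxies dict (or a falsy value when no proxy is available),
# which matches how the crawlers here use it. The endpoint and headers are
# placeholders; the helper returns None once all retries are exhausted.
#
#   resp = retry_get_url("https://httpbin.org/get",
#                        retrys=3,      # up to 3 attempts, 1s apart
#                        proxies=1,     # forwarded to get_proxy(1)
#                        headers={"User-Agent": "Mozilla/5.0"})
#   if resp is not None:
#       print(resp.status_code)
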
def releaser_dynamic_page_web_by_time(self, releaserUrl,
                                      output_to_file=False,
                                      filepath=None,
                                      releaser_page_num_max=5000,
                                      output_to_es_raw=False,
                                      output_to_es_register=False,
                                      push_to_redis=False,
                                      es_index=None,
                                      doc_type=None,
                                      fetchFavoriteCommnt=True,
                                      proxies_num=None):
    releaser_id = get_releaser_id(platform=self.platfrom, releaserUrl=releaserUrl)
    proxies = get_proxy(proxies_num)
    result_lst = []
    url = 'https://webpage.mbd.baidu.com/home?context={%22app_id%22:%22' + releaser_id + '%22}'
    headers = {
        "Host": "webpage.mbd.baidu.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "******",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Sec-Fetch-Site": "none",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh,zh-CN;q=0.9"
    }
    video_list_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Cache-Control": "max-age=0",
        "Referer": url,
        "Host": "mbd.baidu.com",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-site",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36"
    }

    def get_page_info():
        # the home page embeds the releaser profile as window.runtime JSON
        page_info = requests.get(url, headers=headers, proxies=proxies, timeout=5)
        page_text = page_info.text
        cookies = page_info.cookies
        res_data = re.findall("window.runtime= (.*),window.runtime.pageType", page_text)
        if res_data:
            res_data = json.loads(res_data[0])
            return res_data, cookies

    res_data, cookies = get_page_info()
    ctime = None
    releaser_fans = res_data.get("user").get("fans_num")
    uk = res_data.get("user").get("uk")
    releaser = res_data.get("user").get("display_name")
    Hmery_Time = cookies["Hmery-Time"]
    page_num = 0
    count_false = 0
    has_more = True

    def get_data_list(uk, releaser, Hmery_Time, ctime):
        interact_list = []
        data_list_dic = {}
        data_dic = {
            "tab": "dynamic",
            "num": "10",
            "uk": uk,
            "type": "newhome",
            "action": "dynamic",
            "format": "jsonp",
            "Tenger-Mhor": Hmery_Time,
        }
        if ctime:
            data_dic["ctime"] = ctime
        video_list_url = "https://mbd.baidu.com/webpage?%s" % urlencode(data_dic)
        video_list_res = requests.get(video_list_url, headers=video_list_headers,
                                      cookies=cookies, proxies=proxies, timeout=5)
        # the response is jsonp: strip the callback wrapper before parsing
        video_list_res_json = json.loads(re.findall(r"\((.*)\)", video_list_res.text)[0])
        data_lis = video_list_res_json.get("data").get("list")
        has_more = video_list_res_json.get("data").get("hasMore")
        ctime = video_list_res_json.get("data").get("query").get("ctime")
        data_dic = {}
        if data_lis:
            for single_data in data_lis:
                # only video dynamics carry the fields we need
                if single_data["itemData"]["layout"] != "video_play":
                    continue
                _id = single_data["id"]
                interact_list.append(single_data["asyncParams"])
                data_dic["title"] = single_data["itemData"]["title"]
                data_dic["baijiahao_url"] = single_data["itemData"]["feed_url"]
                data_dic["url"] = "https://haokan.baidu.com/v?vid=%s" % single_data["feed_id"]
                data_dic["platform"] = self.platfrom
                data_dic["fetch_time"] = int(datetime.datetime.now().timestamp() * 1e3)
                data_dic["release_time"] = int(single_data["itemData"]["ctime"] * 1e3)
                data_dic["data_provider"] = "CCR"
                data_dic["releaser"] = releaser
                data_dic["data_sources"] = "baijiahao"
                data_dic["releaserUrl"] = url
                data_dic["video_id"] = single_data["dynamic_id"]
                data_dic["releaser_id_str"] = "haokan_%s" % releaser_id
                data_dic["duration"] = int(float(trans_duration(single_data["itemData"]["duration"])))
                data_dic['releaser_followers_count'] = int(releaser_fans)
                data_dic["video_img"] = single_data["itemData"]["img_400_200"]
                data_list_dic[_id] = copy.deepcopy(data_dic)
        return interact_list, data_list_dic, ctime, has_more

    def get_interact_list(uk, Hmery_Time, interact_list):
        interact_dic = {
            "uk": uk,
            "type": "homepage",
            "action": "interact",
            "format": "jsonp",
            "Tenger-Mhor": Hmery_Time,
        }
        params_str = str(interact_list).replace("'", '"').replace(" ", "")
        url = "https://mbd.baidu.com/webpage?%s&params=%s" % (urlencode(interact_dic), params_str)
        interact_res = requests.get(url, headers=video_list_headers, cookies=cookies,
                                    proxies=proxies, timeout=3)
        interact_res_json = json.loads(re.findall(r"\((.*)\)", interact_res.text)[0])
        return interact_res_json["data"]["user_list"]

    while page_num <= releaser_page_num_max and has_more and count_false <= 5:
        try:
            proxies = get_proxy(proxies_num)
            interact_list, data_list_dic, ctime, has_more = get_data_list(uk, releaser, Hmery_Time, ctime)
            if interact_list:
                interact_res_json = get_interact_list(uk, Hmery_Time, interact_list)
            else:
                continue
            page_num += 1
            # merge the interaction counts into the matching video records
            for interact in interact_res_json:
                data_list_dic[interact]["comment_count"] = int(interact_res_json[interact]["comment_num"])
                data_list_dic[interact]["favorite_count"] = int(interact_res_json[interact]["praise_num"])
                data_list_dic[interact]["play_count"] = int(interact_res_json[interact]["read_num"])
                data_list_dic[interact]["repost_count"] = int(interact_res_json[interact]["forward_num"])
                yield data_list_dic[interact]
        except Exception as e:
            print(e)
            proxies = get_proxy(1)
            count_false += 1
            if count_false <= 5:
                continue
            else:
                break

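# Usage sketch (illustrative): releaser_dynamic_page_web_by_time is a generator,
# so records stream out page by page with interaction counts already merged in.
# `crawler` is a hypothetical instance of this class and the app_id is a placeholder.
#
#   for video in crawler.releaser_dynamic_page_web_by_time(
#           "https://haokan.baidu.com/haokan/wiseauthor?app_id=1234567890",
#           proxies_num=1):
#       print(video["title"], video["play_count"])
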
def releaser_page_app(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=4000,
                      output_to_es_raw=False, es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    headers = {
        'Host': 'apis.tudou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cookie': ('isg=BIeH6gcJlwZw_xQESm9jlG-vFTuRJGXxikf0g1l0mJY9yKeKYVuAvzKJbkgzOzPm;'
                   'cna=XA2EFIGslWoCAWp4y3KXcZh7; ykss=cdbd115c102a68710215ad93;'
                   '__ysuid=1543316262167mjE; P_ck_ctl=62DE1D55DFE1C0F4F27A8662E6575F08;'
                   '__ayvstp=32'),
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaser = self.get_releaser_name(releaserUrl)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    self.video_data['releaser'] = releaser
    self.video_data['releaserUrl'] = releaserUrl
    url_dic = {"uid": releaser_id, "pL": "20"}
    print("working on releaser: %s releaser_id: %s" % (releaser, releaser_id))
    while count <= releaser_page_num_max and retry_time < 5:
        proxies = get_proxy(proxies_num)
        url_dic['pg'] = str(count)
        url_dic['pn'] = str(count)
        api_url = 'http://apis.tudou.com/subscribe/v1/video?%s' % urllib.parse.urlencode(url_dic)
        if proxies:
            get_page = requests.get(api_url, headers=headers, proxies=proxies, timeout=5)
        else:
            get_page = requests.get(api_url, headers=headers, timeout=5)
        page_dic = get_page.json()
        try:
            data_list = page_dic['entity']
        except KeyError:
            retry_time += 1
            proxies = get_proxy(1)
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s" % (releaser, count, retry_time))
            continue
        if data_list == []:
            retry_time += 1
            proxies = get_proxy(1)
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s" % (releaser, count, retry_time))
            continue
        else:
            retry_time = 0
            print("get data at releaser: %s page: %s" % (releaser, count))
            count += 1
            for info_dic in data_list:
                video_info = copy.deepcopy(self.video_data)
                one_video = info_dic.get('detail')
                if one_video is None:
                    continue
                get_title = one_video.get('base_detail')
                if get_title is not None:
                    video_info['title'] = get_title.get('title')
                detail_info = one_video.get('video_detail')
                if detail_info is None:
                    continue
                video_id = detail_info.get('video_id')
                if video_id is not None:
                    video_info['video_id'] = video_id
                    video_info['url'] = 'https://video.tudou.com/v/%s.html' % video_id
                video_info['duration'] = detail_info.get('duration')
                video_info['releaser_id_str'] = "new_tudou_%s" % releaser_id
                video_info['comment_count'] = int(detail_info.get('comment_count'))
                video_info['favorite_count'] = int(detail_info.get('praiseNumber'))
                # In the database favorite_count means the like count ("点赞数"),
                # which the web page calls praiseNumber; on the web page,
                # favorite_count means the bookmark count ("收藏数").
                video_info['shoucang_count'] = detail_info.get('favorite_count')
                video_info['play_count'] = detail_info.get('vv_count')
                video_info['video_img'] = self.get_video_image(detail_info)
                release_time_str = detail_info.get('publish_time')
                print(release_time_str)
                if '天前' in release_time_str:
                    # '天前' ("days ago") is a relative time; resolve the exact
                    # timestamp from the video page itself
                    video_info['release_time'] = self.video_page(video_info['url'])['release_time']
                else:
                    video_info['release_time'] = trans_strtime_to_timestamp(
                        input_time=release_time_str, missing_year=True)
                video_info['fetch_time'] = int(time.time() * 1e3)
                yield video_info

def releaser_page(self, releaserUrl,
                  output_to_file=False, filepath=None,
                  releaser_page_num_max=30,
                  output_to_es_raw=False, es_index=None, doc_type=None,
                  output_to_es_register=False, push_to_redis=False,
                  proxies_num=None):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    proxies = get_proxy(proxies_num)
    releaser = ""
    count = 1
    count_false = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    page_count = 0
    pcursor = None
    self.video_data['releaserUrl'] = releaserUrl
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        proxies = get_proxy(proxies_num)
        releaserUrl = 'http://c.m.163.com/nc/subscribe/list/%s/video/%s-20.html' % (releaser_id, page_count)
        try:
            if proxies:
                get_page = requests.get(releaserUrl, headers=self.headers, timeout=5, proxies=proxies)
            else:
                get_page = requests.get(releaserUrl, headers=self.headers, timeout=5)
            page_dic = get_page.json()
            data_list = page_dic.get("tab_list")
        except Exception:
            proxies = get_proxy(1)
            count_false += 1
            if count_false <= 5:
                continue
            else:
                break
        page_count += 20
        if data_list == []:
            print("no more data at releaser: %s page: %s " % (releaser, count))
            pcursor = "no_more"
            continue
        else:
            print("get data at releaser: %s page: %s" % (releaser, count))
            count += 1
            for info_dic in data_list:
                skipID = info_dic.get("skipID")
                page_data, release_url = self.one_video_page(skipID)
                video_dic = copy.deepcopy(self.video_data)
                video_dic['title'] = page_data.get('title')
                video_dic['url'] = release_url
                video_dic['releaser'] = page_data.get('topicName')
                video_dic['releaserUrl'] = "https://c.m.163.com/news/sub/%s.html" % releaser_id
                video_dic['release_time'] = int(datetime.datetime.strptime(
                    info_dic.get('ptime'), "%Y-%m-%d %H:%M:%S").timestamp() * 1e3)
                video_dic['play_count'] = page_data.get("playCount") or 0
                video_dic['favorite_count'] = page_data.get('voteCount') or 0
                video_dic['comment_count'] = page_data.get('replyCount')
                video_dic['video_id'] = skipID
                video_dic['fetch_time'] = int(time.time() * 1e3)
                video_dic['duration'] = page_data.get("length")
                video_dic['releaser_id_str'] = "网易新闻_%s" % releaser_id  # "NetEase News_<id>"
                video_dic['video_img'] = self.get_video_image(info_dic)
                result_list.append(video_dic)
                time.sleep(0.5)
                if len(result_list) >= 100:
                    output_result(result_Lst=result_list,
                                  platform=self.platform,
                                  output_to_file=output_to_file,
                                  filepath=filepath,
                                  output_to_es_raw=output_to_es_raw,
                                  es_index=es_index,
                                  doc_type=doc_type,
                                  output_to_es_register=output_to_es_register)
                    result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
    return result_list

def get_releaser_follower_num(self, releaserUrl):
    count_true = 0
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Host": "kpfshanghai.m.chenzhongtech.com",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36"
    }
    while count_true < 5:
        proxies = get_proxy(proxies_num=1)
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return None, None
        get_body = {
            "fid": str(random.randint(1535120000, 1535130000)),
            "cc": "share_copylink",
            "appType": "21",
            "shareType": "3",
            "et": "null",
            "timestamp": int(datetime.datetime.now().timestamp() * 1e3)
        }
        get_url = 'https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?%s' % (
            releaser_id, urllib.parse.urlencode(get_body))
        try:
            releaser_page = requests.get(get_url, headers=headers,
                                         proxies=proxies, timeout=5)
        except Exception:
            # retry once without the proxy
            releaser_page = requests.get(get_url, headers=headers, timeout=2)
        res_text = releaser_page.text
        try:
            releaser_follower_num_str = re.findall('<div class="fans-follows"> (.*?)<span', res_text)[0]
            releaser_follower_num = self.re_cal_count(releaser_follower_num_str)
            print(releaser_follower_num)
            releaser_img = re.findall(r'background-image:url\((.*?)\)', res_text)[0]
            return releaser_follower_num, releaser_img
        except Exception:
            count_true += 1
    return None, None

def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False, es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None):
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    json_headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        "referer": releaserUrl,
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "x-csrf-token": "ecd18a6d5d86a28b786b653356133cfb606dd1dc",
        "x-requested-with": "XMLHttpRequest",
    }
    json_cookies = {
        "cna": "W99aFOvX+QACAXL4fBJI3rAw",
        "__ysuid": "1541219939103JPW",
        "ykss": "e93bad5ef9c26af71c8e7ee5",
        "P_ck_ctl": "47F163FE35A5B1B2E479B158A12376A7",
        "__ayvstp": "16",
        "__aysvstp": "16",
        "_zpdtk": "ecd18a6d5d86a28b786b653356133cfb606dd1dc",
        "isg": "BOzsOnpUnhIGhYq8YxHgZ36EvcoepZBPH_JJJ0Yt-Rc6UY5bbrVJ3rr3dxdpWcin",
    }
    first_page_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        "referer": releaserUrl,
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    first_page_res = retry_get_url(releaserUrl, headers=first_page_headers, proxies=proxies_num)
    json_cookies.update(dict(first_page_res.cookies))
    user_id = re.findall(r'uid="(\d+)"', first_page_res.text)[0]
    zptk_url = "https://id.tudou.com/i/h5/id_%s/playlisttab?uid=%s" % (user_id, user_id)
    playlisttab_res = retry_get_url(zptk_url, headers=json_headers,
                                    proxies=proxies_num, cookies=json_cookies)
    json_cookies.update(dict(playlisttab_res.cookies))
    # the _zpdtk cookie rotates on every response and must be echoed back
    # as the x-csrf-token header of the next request
    json_headers["x-csrf-token"] = dict(playlisttab_res.cookies)["_zpdtk"]
    count = 1
    retry_time = 0
    result_list = []
    self.video_data['releaserUrl'] = releaserUrl
    print("working on releaser_id: %s" % releaser_id)
    while count <= releaser_page_num_max and retry_time < 5:
        proxies = get_proxy(proxies_num)
        api_url = 'https://id.tudou.com/i/h5/id_%s/videos?ajax=1&pn=%s&pl=20' % (user_id, count)
        print(api_url)
        if proxies:
            get_page = requests.get(api_url, headers=json_headers, proxies=proxies,
                                    timeout=3, cookies=json_cookies)
        else:
            get_page = requests.get(api_url, headers=json_headers,
                                    timeout=3, cookies=json_cookies)
        _zpdtk = dict(get_page.cookies)
        json_cookies.update(_zpdtk)
        json_headers["x-csrf-token"] = _zpdtk["_zpdtk"]
        page_dic = get_page.json()
        releaser_page_num_max = page_dic["page"]["pz"]
        releaser = page_dic['channelOwnerInfo']["data"]["nickname"]
        try:
            data_list = page_dic['data']["data"]
            time.sleep(0.25)
        except KeyError:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at page: %s try_time: %s" % (count, retry_time))
            continue
        if data_list == []:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at page: %s try_time: %s" % (count, retry_time))
            continue
        else:
            retry_time = 0
            print("get data at page: %s" % count)
            count += 1
            for info_dic in data_list:
                video_info = copy.deepcopy(self.video_data)
                video_info['video_id'] = info_dic["videoid"]
                video_info['title'] = info_dic["title"]
                video_info['releaser'] = releaser
                video_info['url'] = 'https://video.tudou.com/v/%s.html' % info_dic["videoid"]
                video_info['duration'] = int(info_dic.get('seconds') / 1e3)
                video_info['releaser_id_str'] = "new_tudou_%s" % releaser_id
                video_info['comment_count'] = int(info_dic.get('total_comment'))
                video_info['favorite_count'] = int(info_dic.get('total_up'))
                # favorite_count in the database means the like count; the web
                # page calls that praiseNumber and uses favorite_count for bookmarks
                video_info['video_img'] = info_dic.get('thumburl')
                video_info['play_count'] = info_dic.get('total_vv')
                video_info['release_time'] = int(info_dic.get('publishtime') * 1e3)
                video_info['fetch_time'] = int(time.time() * 1e3)
                yield video_info

def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False, es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None, **kwargs):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    releaser = ""
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = 0
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        "Cookie": "did=web_790b7bcefe7347c5937a39d34c49f7ed; didv=1583150714000; sid=ab0c3a5497ab3c8fb73c8bef",
        "Host": "kpfshanghai.m.chenzhongtech.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        "Origin": "https://kpfshanghai.m.chenzhongtech.com",
        "Referer": "https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (releaser_id, random.randint(1000, 9800)),
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    proxies = get_proxy(proxies_num)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10, proxies=proxies)
            else:
                get_page = requests.post(
                    "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10)
        except Exception:
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(random.randint(3, 5))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        if not data_list:
            print("no more data at releaser: %s page: %s " % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
                print("no proxies")
            if retry_time > 5:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id = info_dic.get("userEid")
                # the share_info query string carries the photoId
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    video_dic['duration'] = int(info_dic.get("ext_params").get("video") / 1000)
                except Exception:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % releaser_id
                video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
                video_dic['video_img'] = info_dic.get("coverUrls")[0].get("url")
            except Exception as e:
                print(e)
                continue
            # trans_play_count returns False when a count can't be parsed
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            else:
                result_list.append(video_dic)
                if len(result_list) >= 100:
                    output_result(result_Lst=result_list,
                                  platform=self.platform,
                                  output_to_file=output_to_file,
                                  filepath=filepath,
                                  output_to_es_raw=output_to_es_raw,
                                  es_index=es_index,
                                  doc_type=doc_type,
                                  output_to_es_register=output_to_es_register)
                    print(len(result_list))
                    result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        print(len(result_list))
        result_list.clear()
    return result_list

def detail_page(self, task=0):
    has_data = rds_get.dbsize()
    while True:
        try:
            if has_data == 0:
                time.sleep(5)
                has_data = rds_get.dbsize()
                continue
            keys = rds_get.randomkey()
            print(task, keys)
            res = rds_get.hgetall(keys)
            has_data = rds_get.dbsize()
            time.sleep(0.2)
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh,zh-CN;q=0.9",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Host": "movie.douban.com",
                "Referer": "https://movie.douban.com/explore",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "******",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
            }
            proxies = get_proxy(5)
            page_source = requests.get(res["url"], headers=headers, proxies=proxies, timeout=5)
            # the detail page embeds its metadata as JSON-LD in a <script> tag
            page_json = re.findall('json">(.*?)</script',
                                   page_source.text.replace("\n", ""),
                                   flags=re.DOTALL)[0]
            try:
                res_json = json.loads(page_json)
            except ValueError:
                print(page_json)
                continue
            director = ""
            for d in res_json.get("director"):
                director += d["name"] + ","
            author = ""
            for d in res_json.get("author"):
                author += d["name"] + ","
            actor = ""
            for d in res_json.get("actor"):
                actor += d["name"] + ","
            date = res_json.get("datePublished")
            style_tag = ""
            for d in res_json.get("genre"):
                style_tag += d + ","
            description = res_json.get("description")
            rate = res_json.get("aggregateRating").get("ratingValue")
            # the remaining fields only exist in the rendered info panel
            page_obj = etree.HTML(page_source.text)
            obj_list = page_obj.xpath("//span[@class='pl']")
            langrage = ""
            area = ""
            extra_name = ""
            duration = ""
            for tag_obj in obj_list:
                tag = tag_obj.text
                if tag == "语言:":            # language
                    langrage = tag_obj.tail
                elif tag == "制片国家/地区:":  # production country/region
                    area = tag_obj.tail
                elif tag == "又名:":          # alternative title
                    extra_name = tag_obj.tail
                elif tag == "片长:":          # running time
                    duration = tag_obj.tail
            dic = {
                "year": date,
                "duration": duration,
                "extra_name": extra_name,
                "area": area,
                "langrage": langrage,
                "description": description,
                "style_tags": style_tag,
                "directors": director,
                "author": author,
                "actor": actor,
            }
            if not author:
                dic.pop("author")
            if not director:
                dic.pop("directors")
            if not actor:
                dic.pop("actor")
            self.parse_data(dic, keys)
            rds_get.delete(keys)
        except Exception as e:
            print(e)

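# A sketch of driving the Redis-fed worker above, assuming rds_get is a redis
# client created with decode_responses=True (which the randomkey/hgetall usage
# implies) and `crawler` is a hypothetical instance of this class. Each thread
# pulls random keys until the hash store is drained.
#
#   import threading
#   for task_id in range(4):
#       threading.Thread(target=crawler.detail_page, args=(task_id,), daemon=True).start()
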
def get_releaser_follower_num(self, releaserUrl):
    count_true = 0
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "content-type": "application/json",
        "Referer": releaserUrl,
        "Origin": "https://live.kuaishou.com",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "live.kuaishou.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
    }
    while count_true < 5:
        proxies = get_proxy(proxies_num=1)
        self.get_cookies_and_font(releaserUrl)
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return None, None
        post_url = 'https://live.kuaishou.com/graphql'
        post_dic = {
            "operationName": "userInfoQuery",
            "variables": {"principalId": releaser_id},
            "query": "query userInfoQuery($principalId: String) {\n userInfo(principalId: $principalId) {\n id\n principalId\n kwaiId\n eid\n userId\n profile\n name\n description\n sex\n constellation\n cityName\n living\n watchingCount\n isNew\n privacy\n feeds {\n eid\n photoId\n thumbnailUrl\n timestamp\n __typename\n }\n verifiedStatus {\n verified\n description\n type\n new\n __typename\n }\n countsInfo {\n fan\n follow\n photo\n liked\n open\n playback\n private\n __typename\n }\n bannedStatus {\n banned\n defriend\n isolate\n socialBanned\n __typename\n }\n __typename\n }\n}\n"
        }
        try:
            releaser_page = requests.post(post_url, headers=headers,
                                          cookies=self.cookie_dic, json=post_dic,
                                          proxies=proxies, timeout=2)
        except Exception:
            releaser_page = requests.post(post_url, headers=headers,
                                          cookies=self.cookie_dic, json=post_dic)
        res_dic = releaser_page.json()
        print(res_dic)
        if res_dic.get("errors"):
            # the cookie was rejected; discard it so a fresh one is fetched
            self.loginObj.delete_cookies(self.cookie_dic)
        try:
            releaser_follower_num_str = res_dic["data"]["userInfo"]["countsInfo"]["fan"]
            releaser_follower_num = self.re_cal_count(self.unicode_to_num(releaser_follower_num_str))
            print(releaser_follower_num)
            releaser_img = self.get_releaser_image(data=res_dic)
            return releaser_follower_num, releaser_img
        except Exception:
            if count_true == 4:
                self.loginObj.delete_cookies(self.cookie_dic)
            count_true += 1
    return None, None

def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=5000,
                      output_to_es_raw=False, es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None, **kwargs):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    releaser = ""
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = None
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        "Cookie": "clientid=3; did=web_549cd4825914642449695ddccf5bfa99; client_key=65890b29; userId=%s; didv=1589785882000; sid=a94d55c86bbbccd28b8e2a8d" % random.randint(861446000, 861449800),
        "Host": "c.kuaishou.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        "Origin": "https://c.kuaishou.com",
        "Referer": "https://c.kuaishou.com/fw/user/%s?fid=1535125321&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=176513752168&docABKey=share_textid_profile&shareToken=X6btjdy2izGxVqQ_A&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&appType=21&shareObjectId=1478754458&shareUrlOpened=0" % releaser_id,
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    proxies = get_proxy(proxies_num)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://c.kuaishou.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 18, "pcursor": pcursor},
                    headers=headers, timeout=10, proxies=proxies)
            else:
                get_page = requests.post(
                    "https://c.kuaishou.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 18, "pcursor": pcursor},
                    headers=headers, timeout=10)
        except Exception:
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(random.randint(3, 4))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        if not data_list:
            print("no more data at releaser: %s page: %s " % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
            if retry_time > 5:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id_ = info_dic.get("userEid")
                # the share_info query string carries the photoId
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id_, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    video_dic['duration'] = int(info_dic.get("ext_params").get("video") / 1000)
                except Exception:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % releaser_id_
                video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id_
                video_dic['video_img'] = info_dic.get("coverUrls")[0].get("url")
            except Exception as e:
                print(e)
                continue
            # trans_play_count returns False when a count can't be parsed
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            else:
                yield video_dic

def list_page(self, releaserUrl=None, video_list_xpath=None, play_count_xpath=None,
              if_free=None, title=None, describe=None, project_tag="", provider=None,
              year=None, next_page_xpath=None, roll=None, style_tags=None, countries=""):
    offset = 30
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Cookie": '__mta=150368905.1577424190198.1577931921073.1577933054583.8; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; lt=dwim2AyVn0Nr4tMQ1qCHf87HvVwAAAAAsQkAAGKVo4UF5isSHZyJ2F-6Yypd0YqL-FIGGMTWixcuMN23AhelN_OPNDA2hAk5IuCtNg; lt.sig=0AWWI8aMHZfmuLzGDO9hoKoZqT8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110; mojo-session-id={"id":"8d8eb79ab4cbaf8082e721ba64b73f3a","time":1577935255982}; mojo-trace-id=1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577935256; __mta=150368905.1577424190198.1577933054583.1577935256193.9; _lxsdk_s=16f64452341-fac-102-6a1%7C265018624%7C3',
        "Host": "maoyan.com",
        "Referer": "https://maoyan.com/films?showType=3&offset=30",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "******",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    }
    count_false = 0
    if args.max_page:
        offset = args.max_page
    while True:
        try:
            time.sleep(0.5)
            print("page ", offset)
            url = "https://maoyan.com/films?showType=3&offset={0}".format(str(offset))
            proxies = get_proxy(4)
            requests_res = requests.get(url, headers=headers, proxies=proxies,
                                        allow_redirects=False)
            html = etree.HTML(requests_res.text)
            dev_list = html.xpath("//body//dd")
            for dev in dev_list:
                url_list = dev.xpath("./div[2]//a[1]/@href")
                url = "https://maoyan.com%s" % url_list[0]
                title = dev.xpath("./div[2]//a[1]/text()")[0]
                rate_list = dev.xpath("./div[3]//text()")
                rate_str = ""
                for rate in rate_list:
                    rate_str += rate
                data_dic = {
                    "url": url,
                    "title": title,
                    "rate": rate_str,
                }
                if style_tags:
                    data_dic["style_tags"] = style_tags
                if project_tag:
                    data_dic["project_tags"] = project_tag
                if year:
                    data_dic["year"] = year
                if provider:
                    data_dic["provider"] = provider
                self.parse_data(data_dic, url, new_data=True)
            offset += 30
        except Exception as e:
            print(e)

def detail_page(self, task=0):
    has_data = rds_get.dbsize()
    while True:
        try:
            if not has_data:
                time.sleep(5)
                has_data = rds_get.dbsize()
                continue
            keys = rds_get.randomkey()
            print(task, keys)
            res = rds_get.hgetall(keys)
            has_data = rds_get.dbsize()
            time.sleep(0.2)
            try:
                # movies released before 2010 get an empty box-office record
                if int(res["rt"][:4]) < 2010:
                    dic = {
                        "box_office": "",
                        "url": "https://maoyan.com/films/%s" % keys
                    }
                    self.parse_data(dic, keys)
                    rds_get.delete(keys)
                    continue
            except Exception:
                pass
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh,zh-CN;q=0.9",
                "Connection": "keep-alive",
                "Host": "m.maoyan.com",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
            }
            proxies = get_proxy(4)
            url = "http://m.maoyan.com/movie/{0}/box?_v_=yes&utm_campaign=AmovieBmovieD100&f=android&userid={1}".format(
                keys, random.randint(265011000, 265031000))
            page_source = requests.get(url, headers=headers, proxies=proxies,
                                       timeout=5, allow_redirects=False)
            try:
                page_json = re.findall('AppData = (.*?);</script>', page_source.text)[0]
                res_json = json.loads(page_json)
            except Exception:
                rds_get.delete(keys)
                continue
            box_office = res_json.get("summary").get("mbox").get("sumBox")
            dic = {
                "box_office": box_office,
                "url": "https://maoyan.com/films/%s" % keys
            }
            print(dic)
            self.parse_data(dic, keys)
            rds_get.delete(keys)
        except Exception:
            # swallow the error and move on to the next key
            pass

def list_page_api(self, cat, source, year, sort, releaserUrl=None,
                  video_list_xpath=None, play_count_xpath=None, if_free=None,
                  title=None, describe=None, project_tag="", provider=None,
                  next_page_xpath=None, roll=None, style_tags=None, countries=""):
    headers = {
        "Host": "api.maoyan.com",
        "Connection": "Keep-Alive",
        "Accept-Encoding": "gzip",
        "User-Agent": "AiMovie /Oneplus-6.0.1-oneplus a5010-0x0-0-null-0-000000000000000-null",
        "mtgdid": "AAAAAAAAAAAAACh9V5sO1zmQc71i5gjpKNuww8T-JnDVTQHuVQFINVu2yYO8FhnCWl_Cqj2TMCWI983qEk_Ha5ayk_tXytbMWi4",
    }
    count_false = 0
    print(cat, source, year, sort)
    if args.max_page:
        offset = args.max_page
    offset = 0  # always restart from the first page; the override above is unused
    while offset <= 2000:
        try:
            time.sleep(0.1)
            print("page ", offset)
            url = "http://api.maoyan.com/mmdb/search/movie/tag/list.json?cityId=1&limit=100&offset={0}&catId={1}&sourceId={2}&yearId={3}&sortId={4}&token=7SJTJRCOW4fNMlp_xZDfgeI8qL0AAAAAsAkAADq-Y4OtjaaVeiysSdZtMsWTuGb0liEIqBPrkrC5QNJ0xOlFWRhf__Rj4D5cDS9L9g&utm_campaign=AmovieBmovieCD-1&movieBundleVersion=8012031&utm_source=meituan&utm_medium=android&utm_term=8.12.3&utm_content=440000000189785&ci=1&net=1&dModel=oneplus%20a5010&uuid=0000000000000A10631E76CD844099D6694316F7616BBA157797426456628307&channelId=1&lat=0.0&lng=0.0&refer=c_boybi6x4&version_name=8.12.3&machine_type=0".format(
                str(offset), cat, source, year, sort)
            proxies = get_proxy(4)
            requests_res = requests.get(url, headers=headers, proxies=proxies,
                                        allow_redirects=False)
            dev_list = requests_res.json()
            for dev in dev_list["list"]:
                data_dic = copy.deepcopy(dev)
                if style_tags:
                    data_dic["style_tags"] = style_tags
                if project_tag:
                    data_dic["project_tags"] = project_tag
                if year:
                    data_dic["year"] = year
                if provider:
                    data_dic["provider"] = provider
                self.parse_data(data_dic, str(data_dic["id"]), new_data=True)
            offset += 100
            if offset >= 2000:
                break
        except Exception as e:
            print(e)

def releaser_page_pc(self, releaserUrl,
                     output_to_file=False, filepath=None,
                     releaser_page_num_max=10000,
                     output_to_es_raw=False, es_index=None, doc_type=None,
                     output_to_es_register=False, push_to_redis=False,
                     proxies_num=None):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    releaser = ""
    user_id = "153512{0}".format(random.randint(1000, 9000))
    proxies = get_proxy(proxies_num)
    headers = {
        "accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Cookie": "did=web_504e72386a69c6d6172f1457b591415c ;userId=%s" % user_id,
        "Host": "live.kuaishou.com",
        "Origin": "https://live.kuaishou.com",
        "Referer": releaserUrl,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    pcursor = ""
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        time.sleep(random.randint(1, 2))
        url_dic = {
            "operationName": "publicFeedsQuery",
            "variables": {
                "principalId": releaser_id,
                "pcursor": pcursor,
                "count": 100
            },
            "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n live {\n user {\n id\n avatar\n name\n __typename\n }\n watchingCount\n poster\n coverUrl\n caption\n id\n playUrls {\n quality\n url\n __typename\n }\n quality\n gameInfo {\n category\n name\n pubgSurvival\n type\n kingHero\n __typename\n }\n hasRedPack\n liveGuess\n expTag\n __typename\n }\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n}\n"
        }
        api_url = 'https://live.kuaishou.com/m_graphql'
        try:
            if proxies:
                get_page = requests.post(api_url, headers=headers, json=url_dic,
                                         timeout=5, proxies=proxies)
            else:
                get_page = requests.post(api_url, headers=headers, json=url_dic,
                                         timeout=5)
        except Exception:
            proxies = get_proxy(proxies_num)
            continue
        page_dic = get_page.json()
        data_list = page_dic.get("data").get("publicFeeds").get("list")
        if data_list == []:
            print("no more data at releaser: %s page: %s " % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("data").get("publicFeeds").get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('caption')
            releaser_id = info_dic.get('user').get("eid")
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id, info_dic.get('id'))
            video_dic['releaser'] = info_dic.get('user').get("name")
            video_dic['release_time'] = info_dic.get('timestamp')
            video_dic['play_count'] = trans_play_count(info_dic.get('counts').get("displayView"))
            video_dic['comment_count'] = trans_play_count(info_dic.get('counts').get("displayComment"))
            video_dic['favorite_count'] = trans_play_count(info_dic.get('counts').get("displayLike"))
            video_dic['video_id'] = info_dic.get('id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % releaser_id
            video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
            video_dic['video_img'] = self.get_video_image(info_dic)
            # trans_play_count returns False when a count can't be parsed
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            else:
                yield video_dic

def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=5000,
                      output_to_es_raw=False, es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None, **kwargs):
    """
    Get video info from the api instead of the web page html.
    The api can be scrolled back at most about 1000 pages.
    """
    releaser = ""
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = None
    headers = {
        "Accept": "application/json",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Content-Type": "application/json; charset=UTF-8",
        "Cookie": "did=web_c7c42d62cbb24{0}4d1ca5ffca052c3; didv=1582271776000; sid=e12d2ec74ec7af3a24d{1}cd6;pua5rv=1".format(
            random.randint(1000, 9000), random.randint(20, 99)),
        "Host": "kpfbeijing.m.chenzhongtech.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        "Origin": "https://kpfbeijing.m.chenzhongtech.com",
        "Referer": "https://kpfbeijing.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (releaser_id, random.randint(1000, 9800)),
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36",
    }
    proxies = get_proxy(proxies_num)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10, proxies=proxies)
            else:
                get_page = requests.post(
                    "https://kpfbeijing.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10)
        except Exception:
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(random.randint(3, 4))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        if not data_list:
            print("no more data at releaser: %s page: %s " % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
            if retry_time > 5:
                pcursor = "no_more"
            continue
        else:
            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id_ = info_dic.get("userEid")
                # the share_info query string carries the photoId
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id_, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    video_dic['duration'] = int(info_dic.get("ext_params").get("video") / 1000)
                except Exception:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % releaser_id_
                video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id_
                video_dic['video_img'] = info_dic.get("coverUrls")[0].get("url")
            except Exception as e:
                print(e)
                continue
            # trans_play_count returns False when a count can't be parsed
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            else:
                yield video_dic

def releaser_page(self, releaserUrl,
                  output_to_file=False, filepath=None,
                  releaser_page_num_max=30,
                  output_to_es_raw=False, es_index=None, doc_type=None,
                  output_to_es_register=False, push_to_redis=False,
                  proxies_num=None, **kwargs):
    """
    Get video info from the app api instead of the web page html.
    The api stops paginating after about 1000 pages.
    """
    result_list = []
    has_more = True
    count = 1
    count_false = 0
    releaser_id = self.find_releaser_id(releaserUrl)
    offset = "0"
    device_id = str(random.randint(66800000000, 66990000000))
    proxies = get_proxy(proxies_num)
    while has_more and count <= releaser_page_num_max:
        print("get %s video on page %s" % (releaser_id, count))
        time.sleep(random.randint(1, 2))
        # "aid" appears to be the key field for this endpoint to keep
        # returning data; if it stops working, try incrementing it by 1.
        url_dic = {
            "source": "0",
            "max_cursor": offset,
            "user_id": releaser_id,
            "count": "21",
            "os_api": "23",
            "device_type": "Huawei P20",
            "ssmix": "a",
            "manifest_version_code": "985",
            "dpi": "429",
            "app_name": "douyin",
            "version_name": "9.8.5",
            "ts": "1585532172",
            "app_type": "normal",
            "ac": "wifi",
            "update_version_code": "9852",
            "channel": "baidu",
            # Millisecond request ticket.
            "_rticket": str(int(datetime.datetime.now().timestamp() * 1e3)),
            "device_platform": "android",
            "version_code": "985",
            "device_id": device_id,
            "resolution": "1080*2244",
            "os_version": "9.0.1",
            "language": "zh",
            "device_brand": "Huawei",
            "aid": "2329",
            "mcc_mnc": "46005",
        }
        url = "https://{1}/aweme/v1/aweme/post/?{0}".format(
            urllib.parse.urlencode(url_dic), random.choice(self.api_list))
        try:
            if proxies_num:
                get_page = requests.get(url, headers=self.headers,
                                        proxies=proxies, timeout=10)
            else:
                get_page = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            proxies = get_proxy(1)
            print(e)
            continue
        data_list = None
        try:
            page_dic = get_page.json()
            data_list = page_dic.get('aweme_list')
            if not data_list:
                # Retry once without a proxy before treating the page as empty.
                get_page = requests.get(url, headers=self.headers, timeout=10)
                page_dic = get_page.json()
                data_list = page_dic.get('aweme_list')
                if not data_list:
                    raise ValueError
            has_more = page_dic.get('has_more')
            offset = str(page_dic.get("max_cursor"))
        except Exception:
            if not data_list:
                proxies = get_proxy(1)
                count_false += 1
                if count_false >= 5:
                    break
                continue
        if has_more is None:
            has_more = False
        if not data_list:
            print("no data in releaser %s page %s" % (releaser_id, count))
            proxies = get_proxy(1)
            count_false += 1
            if count_false >= 5:
                has_more = False
            continue
        count_false = 0
        count += 1
        for one_video in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = one_video.get('desc')
            video_dic['url'] = one_video.get('share_url')
            video_dic['releaser'] = one_video.get('author').get("nickname")
            video_dic['releaserUrl'] = releaserUrl
            # create_time is in seconds, duration in milliseconds.
            video_dic['release_time'] = int(one_video.get('create_time') * 1e3)
            video_dic['duration'] = int(one_video.get('duration') / 1000)
            video_dic['play_count'] = 0
            video_dic['repost_count'] = one_video.get('statistics').get('share_count')
            video_dic['comment_count'] = one_video.get('statistics').get('comment_count')
            video_dic['favorite_count'] = one_video.get('statistics').get('digg_count')
            video_dic['video_id'] = one_video.get('aweme_id')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "抖音_%s" % releaser_id
            video_dic['video_img'] = one_video.get('video').get('cover').get('url_list')[0]
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
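# ---------------------------------------------------------------------------
# Self-contained sketch of the buffered-flush pattern releaser_page uses:
# records accumulate in a list and are written out in batches of 100 so each
# bulk write stays small. `flush` stands in for output_result; all names here
# are illustrative, not part of the crawler.
def flush_in_batches(records, flush, batch_size=100):
    buf = []
    for rec in records:
        buf.append(rec)
        if len(buf) >= batch_size:
            # Pass a copy so clearing the buffer cannot affect the consumer.
            flush(list(buf))
            buf.clear()
    if buf:
        # Final partial batch.
        flush(list(buf))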
def releaser_page_pc(self, releaserUrl,
                     output_to_file=False, filepath=None,
                     releaser_page_num_max=10000,
                     output_to_es_raw=False, es_index=None, doc_type=None,
                     output_to_es_register=False, push_to_redis=False,
                     proxies_num=None):
    """
    Get video info from the graphql api instead of the web page html.
    The api stops paginating after about 1000 pages.
    """
    proxies = get_proxy(proxies_num)
    headers = {
        "accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Cookie": "client_key=65890b29; clientid=3; did=web_f6e24105905d4b0381d36220ad9ccda0; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1574822912; didv=1583802670821; userId=1535125321; kuaishou.live.bfb1s=477cb0011daca84b36b3a4676857e5a1; userId=1535125321; kuaishou.live.web_st=ChRrdWFpc2hvdS5saXZlLndlYi5zdBKgAfPHcR6LVRx3FRHYIe2X1-gdxEI8d1iJJnM7rTZaKtVo-54m5Bolw__9dpYJoPwvA5I2Qw_7Dgl3_8N_jicpbkpT__u6ZIxcSGC3hWmVXGufsv7zVvUALqMLknpSPVoGXlt8GFBIh4LVeEsST-ghGGWB5gpAEkU2nxVB2pXUREuQ6PEh9cc_bjoODqzcROsKFGyAYVg81qp9tnJesa1oODUaEk2hY_LIikBot7IUVtJ3ydB6KCIgUeaa89k7DGhBoXcPwlWtSUp4VbGECgvvOeIaTNFMoScoBTAB; kuaishou.live.web_ph=c41f68048b583530bfa89ab7150b24df445c",
        "Host": "live.kuaishou.com",
        "Origin": "https://live.kuaishou.com",
        "Referer": releaserUrl,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/73.0.3683.103 Safari/537.36"),
    }
    count = 1
    retry_time = 0
    releaser_id = self.get_releaser_id(releaserUrl)
    pcursor = ""
    self.video_data['releaserUrl'] = releaserUrl
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        time.sleep(random.randint(1, 2))
        url_dic = {
            "operationName": "publicFeedsQuery",
            "variables": {
                "principalId": releaser_id,
                "pcursor": pcursor,
                "count": 100
            },
            "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n  publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n    pcursor\n    live {\n      user {\n        id\n        avatar\n        name\n        __typename\n      }\n      watchingCount\n      poster\n      coverUrl\n      caption\n      id\n      playUrls {\n        quality\n        url\n        __typename\n      }\n      quality\n      gameInfo {\n        category\n        name\n        pubgSurvival\n        type\n        kingHero\n        __typename\n      }\n      hasRedPack\n      liveGuess\n      expTag\n      __typename\n    }\n    list {\n      id\n      thumbnailUrl\n      poster\n      workType\n      type\n      useVideoPlayer\n      imgUrls\n      imgSizes\n      magicFace\n      musicName\n      caption\n      location\n      liked\n      onlyFollowerCanComment\n      relativeHeight\n      timestamp\n      width\n      height\n      counts {\n        displayView\n        displayLike\n        displayComment\n        __typename\n      }\n      user {\n        id\n        eid\n        name\n        avatar\n        __typename\n      }\n      expTag\n      __typename\n    }\n    __typename\n  }\n}\n"
        }
        api_url = 'https://live.kuaishou.com/m_graphql'
        try:
            if proxies:
                get_page = requests.post(api_url, headers=headers,
                                         json=url_dic, timeout=5,
                                         proxies=proxies)
            else:
                get_page = requests.post(api_url, headers=headers,
                                         json=url_dic, timeout=5)
        except requests.RequestException:
            proxies = get_proxy(proxies_num)
            continue
        page_dic = get_page.json()
        data_list = page_dic.get("data").get("publicFeeds").get("list")
        if not data_list:
            print("no more data at releaser: %s page: %s" % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                pcursor = "no_more"
            continue
        pcursor = page_dic.get("data").get("publicFeeds").get("pcursor")
        print("get data at releaser: %s page: %s" % (releaser_id, count))
        count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('caption')
            # eid of the post's author; kept separate from the releaser_id
            # still used by the paging loop above.
            releaser_eid = info_dic.get('user').get("eid")
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (
                releaser_eid, info_dic.get('id'))
            video_dic['releaser'] = info_dic.get('user').get("name")
            video_dic['release_time'] = info_dic.get('timestamp')
            video_dic['play_count'] = trans_play_count(info_dic.get('counts').get("displayView"))
            video_dic['comment_count'] = trans_play_count(info_dic.get('counts').get("displayComment"))
            video_dic['favorite_count'] = trans_play_count(info_dic.get('counts').get("displayLike"))
            video_dic['video_id'] = info_dic.get('id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % releaser_eid
            video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_eid
            video_dic['video_img'] = self.get_video_image(info_dic)
            if (video_dic['play_count'] is False
                    or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            yield video_dic
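# ---------------------------------------------------------------------------
# Illustrative helper matching the count-sanity guard used in both Kuaishou
# methods above: trans_play_count is treated as returning False when it
# cannot parse a display string (e.g. "1.2万"), and such records are dropped.
# The function name is hypothetical.
def counts_ok(video_dic):
    return not any(video_dic[key] is False
                   for key in ("play_count", "comment_count", "favorite_count"))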
def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False, output_to_es_register=False,
                      push_to_redis=False, es_index=None, doc_type=None,
                      fetchFavoriteCommnt=True, proxies_num=None):
    pid = os.getpid()
    releaser_id = self.get_releaser_id(releaserUrl)
    print('releaser_id is %s' % releaser_id)
    result_lst = []
    page_num = 0
    has_more = True
    ctime = ""
    count_false = 0
    fans_num = 0
    proxies = get_proxy(proxies_num)
    while page_num <= releaser_page_num_max and has_more:
        post_url = ('https://haokan.baidu.com/haokan/wiseauthor'
                    '?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10'
                    '&video_type=media&sort_type=sort_by_time').format(
                        releaser_id, page_num, ctime)
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/76.0.3809.132 Safari/537.36'),
            "referer": "https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh,zh-CN;q=0.9",
            "content-type": "application/x-www-form-urlencoded"
        }
        try:
            if page_num == 0:
                # The first page is plain html; parse the embedded json (and
                # the follower count) out of it, retrying up to five times.
                for loop in range(5):
                    get_page = requests.get(releaserUrl, headers=headers,
                                            timeout=3, proxies=proxies)
                    page_dic, fans_num = self.web_first_pag(get_page.text)
                    if page_dic['apiData']['video']['results']:
                        page_num += 1
                        break
            else:
                get_page = requests.get(post_url, headers=headers, timeout=3)
                page_dic = get_page.json()
                page_num += 1
        except Exception:
            count_false += 1
            if count_false >= 5:
                break
            continue
        try:
            info_lst = page_dic['apiData']['video']['results']
        except Exception:
            info_lst = []
        try:
            ctime = page_dic['apiData']['video']['ctime']
            has_more = bool(page_dic['apiData']['video']['has_more'])
        except Exception:
            has_more = False
        if info_lst:
            count_false = 0
            print("Process %s is processing %s at page %s" % (pid, releaser_id, page_num))
            time.sleep(int(random.uniform(1, 2)))
            for line in info_lst:
                video_data = copy.deepcopy(self.video_data_template)
                video_data['title'] = line['content']['title']
                video_id = line['content']['vid']
                video_data['video_id'] = video_id
                video_data['url'] = line['content']["video_short_url"]
                video_data['play_count'] = line['content']['playcnt']
                video_data['favorite_count'] = int(line['content']['praiseNum'])
                try:
                    video_data['comment_count'] = int(line['content']['commentNum'])
                except Exception:
                    video_data['comment_count'] = 0
                video_data['releaser_followers_count'] = int(fans_num)
                try:
                    video_data['duration'] = trans_duration(line['content']['duration'])
                except Exception:
                    video_data['duration'] = 0
                video_data['releaser'] = line['content']['author']
                video_data['releaser_id_str'] = "haokan_%s" % line['content']['authorid']
                video_data['releaserUrl'] = ('https://haokan.baidu.com/haokan/wiseauthor?app_id='
                                             + line['content']['authorid'])
                video_data['fetch_time'] = int(time.time() * 1e3)
                releaser_time_str = line['content']['publish_time']
                video_data['release_time'] = trans_strtime_to_timestamp(
                    input_time=releaser_time_str)
                video_data['video_img'] = line['content']['cover_src']
                print(video_id, releaser_time_str,
                      datetime.datetime.fromtimestamp(video_data['release_time'] / 1000),
                      page_num)
                result_lst.append(video_data)
                if len(result_lst) >= 100:
                    output_result(result_Lst=result_lst,
                                  platform=self.platfrom,
                                  output_to_file=output_to_file,
                                  filepath=filepath,
                                  push_to_redis=push_to_redis,
                                  output_to_es_register=output_to_es_register,
                                  output_to_es_raw=output_to_es_raw,
                                  es_index=es_index,
                                  doc_type=doc_type)
                    result_lst.clear()
    if result_lst:
        output_result(result_Lst=result_lst,
                      platform=self.platfrom,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_register=output_to_es_register,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type)
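# ---------------------------------------------------------------------------
# Self-contained sketch of the ctime paging scheme used by the Haokan method
# above: page 0 comes from the profile html, and every later page is fetched
# by echoing back the ctime of the previous json response. The helper name is
# hypothetical; the url shape is copied from post_url above.
def build_haokan_page_url(app_id, skip, ctime):
    return ('https://haokan.baidu.com/haokan/wiseauthor'
            '?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10'
            '&video_type=media&sort_type=sort_by_time').format(app_id, skip, ctime)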
def list_page(self, releaserUrl=None, video_list_xpath=None,
              play_count_xpath=None, if_free=None, title=None,
              describe=None, project_tag="", provider=None, year=None,
              next_page_xpath=None, roll=None, style_tags=None,
              countries=""):
    # At least one of project_tag / countries must be given, otherwise no
    # search url is built and the request raises inside the try block.
    offset = 0
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Host": "movie.douban.com",
        "Referer": "https://movie.douban.com/tag/",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/76.0.3809.87 Safari/537.36"),
    }
    count_false = 0
    if args.max_page:
        offset = args.max_page
    while True:
        try:
            time.sleep(0.1)
            print("page ", offset)
            if project_tag:
                url = ("https://movie.douban.com/j/new_search_subjects"
                       "?sort=U&range=0,10&tags=电影&start={0}&genres={1}").format(
                           str(offset), project_tag)
            if countries:
                url = ("https://movie.douban.com/j/new_search_subjects"
                       "?sort=U&range=0,10&tags=电影&start={0}&countries={1}").format(
                           str(offset), countries)
            proxies = get_proxy(10)
            requests_res = requests.get(url, headers=headers, proxies=proxies,
                                        allow_redirects=False, timeout=5)
            res_json = requests_res.json()
            if not res_json.get("data"):
                # Retry once without a proxy.
                requests_res = requests.get(url, headers=headers,
                                            allow_redirects=False, timeout=5)
                res_json = requests_res.json()
            data_list = res_json.get("data")
            if not data_list:
                count_false += 1
                if count_false > 10:
                    break
                time.sleep(0.2)
                continue
            count_false = 0
            offset += 20
            for one_video in data_list:
                data_dic = {
                    "url": one_video["url"],
                    "title": one_video["title"],
                    "rate": one_video["rate"],
                    "casts": one_video["casts"],
                    "directors": one_video["directors"],
                }
                if style_tags:
                    data_dic["style_tags"] = style_tags
                if project_tag:
                    data_dic["project_tags"] = project_tag
                if year:
                    data_dic["year"] = year
                if provider:
                    data_dic["provider"] = provider
                project_name = "douban_%s" % one_video["id"]
                self.parse_data(data_dic, project_name, new_data=True)
        except Exception as e:
            print(e)
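# ---------------------------------------------------------------------------
# Illustrative sketch of how list_page addresses Douban search pages: the
# new_search_subjects endpoint pages by a `start` offset in steps of 20 (one
# page of results). The helper is hypothetical; the parameters mirror the
# urls built inside list_page, and it relies on the urllib.parse import this
# module already uses in releaser_page.
def douban_search_url(start, genres="", countries=""):
    params = {"sort": "U", "range": "0,10", "tags": "电影", "start": str(start)}
    if genres:
        params["genres"] = genres
    if countries:
        params["countries"] = countries
    return ("https://movie.douban.com/j/new_search_subjects?"
            + urllib.parse.urlencode(params))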