def update_tpwd(self, mode: int = 0, is_renew: bool = True, a_id: str = None):
    update_num = 0
    for article_id, jj in self.article_list.items():
        if a_id is not None and article_id != a_id:
            continue
        for o_tpwd, (num_iid, title, domain, tpwd, _, _, url) in jj.items():
            c = jj[o_tpwd]
            if (
                is_renew
                and self.URL_DOMAIN[1] not in url
                and self.URL_DOMAIN[2] not in url
                and self.URL_DOMAIN[10] not in url
            ):
                # NOTE: this membership test is the reverse of the domain checks above
                # (url in domain rather than domain in url); preserved as in the original.
                renew_type = 2 if url in self.URL_DOMAIN[5] else 1
                origin_tpwd = self.convert2tpwd(url, title)
                if origin_tpwd is None:
                    origin_tpwd = tpwd
            else:
                renew_type = 0
                origin_tpwd = tpwd
            if num_iid == "" or domain == 16:
                c = (
                    *c[:2],
                    16,
                    origin_tpwd,
                    1 if renew_type == 0 else 2,
                    *c[-2:],
                )
            else:
                c = self.generate_tpwd(
                    title, int(num_iid), origin_tpwd, renew_type, c, mode
                )
            self.article_list[article_id][o_tpwd] = c
            update_num += int(c[2] < 15 or (renew_type and not mode))
    echo(2, "Update {} Tpwd Info Success!!".format(update_num))
def update_title(self, article_id: str):
    self.tpwd_map[article_id] = {
        ii[3]: {"content": ii[1], "item_id": ii[0]}
        for ii in self.article_list[article_id].values()
    }
    no_title = [
        ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
    ]
    time = 0
    while len(no_title) and time < 5:
        title_list = [
            self.tpwd_exec.submit(self.get_item_title, article_id, ii)
            for ii in no_title
        ]
        echo(1, article_id, "need get title:", len(title_list))
        list(as_completed(title_list))
        time += 1
        no_title = [
            ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
        ]
    update_num = len(
        [
            1
            for ii, jj in self.tpwd_map[article_id].items()
            if "title" in jj and jj["content"] != jj["title"]
        ]
    )
    echo(2, "Update", article_id, update_num, "Title Success!!!")
    self.update_article2db(article_id)
def generate_tpwd(
    self,
    title: str,
    num_iid: int,
    renew_tpwd: str,
    renew_type: int,
    c: tuple,
    mode: int,
):
    goods = self.get_dg_material(title, num_iid)
    if goods is None or not len(goods):
        echo(0, "goods get", "error" if goods is None else "empty", ":", title, num_iid)
        return (*c[:2], 17, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
    goods = goods[0]
    if "ysyl_click_url" in goods and len(goods["ysyl_click_url"]):
        url = goods["ysyl_click_url"]
    elif "coupon_share_url" in goods and len(goods["coupon_share_url"]):
        url = goods["coupon_share_url"]
    else:
        url = goods["url"]
    url = "https:{}".format(url)
    commission_rate = int(goods["commission_rate"])
    commission_type = goods["commission_type"]
    tpwd = self.convert2tpwd(url, title)
    if tpwd is None:
        echo(0, "tpwd error:", tpwd)
        return (*c[:2], 18, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
    if mode:
        return (*c[:3], tpwd, commission_rate, commission_type, c[-1])
    if renew_type == 1:
        return (*c[:3], tpwd, 2, commission_type, c[-1])
    return (*c[:3], tpwd, commission_rate, commission_type, c[-1])
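# Inferred layout of the 7-tuple `c` handled by update_tpwd / generate_tpwd above (read from the
# unpacking in update_tpwd; treat it as a best guess, not a documented structure):
#   c[0] num_iid, c[1] title, c[2] domain -- reused as an error code (16/17/18) when lookup fails,
#   c[3] tpwd, c[4] commission_rate (or a 1/2 renew flag on failure), c[5] commission_type, c[6] url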
def load_article(self, article_id: str, mode: int = 0, is_load2db: bool = True):
    if mode:
        self.get_share_info(article_id)
        self.load_list2db()
        return
    if article_id not in self.tpwds:
        article = self.basic_youdao(article_id)
        tpwds = list({ii: 0 for ii in regex.findall(self.TPWD_REG, article)})
        self.tpwds[article_id] = tpwds
    else:
        tpwds = self.tpwds[article_id]
    if article_id not in self.tpwd_map:
        self.tpwd_map[article_id] = {}
    time = 0
    au_list = []
    no_type = [
        ii
        for ii, jj in self.tpwd_map[article_id].items()
        if "type" not in jj or jj["item_id"] is None
    ]
    while (
        len(self.tpwd_map[article_id]) < len(tpwds) or (len(no_type) and not time)
    ) and time < 5:
        thread_list = [ii for ii in tpwds if ii not in self.tpwd_map[article_id]]
        echo(1, article_id, "tpwds len:", len(tpwds), "need load", len(thread_list))
        thread_list = [
            self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii)
            for ii in thread_list
        ]
        list(as_completed(thread_list))
        no_type = [
            ii
            for ii, jj in self.tpwd_map[article_id].items()
            if "type" not in jj or jj["item_id"] is None
        ]
        au_list.extend(
            [
                self.tpwd_exec.submit(self.decoder_tpwd_url, article_id, ii)
                for ii in no_type
            ]
        )
        time += 1
    list(as_completed(au_list))
    no_title = [
        ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
    ]
    time = 0
    while len(no_title) and time < 5:
        title_list = [
            self.tpwd_exec.submit(self.get_item_title, article_id, ii)
            for ii in no_title
        ]
        echo(1, article_id, "need get title:", len(title_list))
        list(as_completed(title_list))
        time += 1
        no_title = [
            ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
        ]
    if is_load2db:
        self.load_article2db(article_id)
def load_gather(self):
    """
    load gather proxy pool text
    If it fails, you should reactivate the cookie.
    """
    headers = {
        "Host": "www.gatherproxy.com",
        "Origin": "http://www.gatherproxy.com",
        "Referer": "http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent",
        "Cookie": "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
        "Content-Type": get_content_type(),
        "Accept": get_accept("html"),
    }
    url = "http://www.gatherproxy.com/subscribe/infos"
    try:
        sid_url_req = requests.get(url, headers=headers, verify=False, timeout=10)
    except Exception:
        return
    sid_url_html = BeautifulSoup(sid_url_req.text, "html.parser")
    sid_url = sid_url_html.find_all("div", class_="wrapper")[1].find_all("a")[0]["href"]
    if len(sid_url.split("sid=")) < 2:
        echo("0|warning", "cookie error")
        self.get_cookie()
        self.load_gather()
        return
    sid = sid_url.split("sid=")[1]
    sid_url = "http://www.gatherproxy.com" + sid_url
    data = {"ID": sid, "C": "", "P": "", "T": "", "U": "0"}
    gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
    with codecs.open(data_dir + "gatherproxy", "w", encoding="utf-8") as f:
        f.write(gatherproxy.text)
def comment_check_schedule(self, av_id: int, comment: int):
    ''' schedule comment check thread '''
    for pn in range(1, (comment - 1) // 20 + 2):
        if not self.comment_next[av_id]:
            return
        echo(2, 'Comment check, av_id:', av_id, 'pn:', pn)
        self.check_comment_once(av_id, pn)
    comment = [self.comment[av_id][k] for k in sorted(self.comment[av_id].keys())]
    basic = [
        ','.join([str(jj) for jj in ii['basic']]) for ii in comment if 'basic' in ii
    ]
    replies = []
    for ii in comment:
        if 'replies' not in ii:
            continue
        parent_rpid = ii['basic'][0]
        replies_t = ii['replies']
        for jj in replies_t:
            jj[0] = '%s-%s' % (str(parent_rpid), str(jj[0]))
            replies.append(','.join([str(kk) for kk in jj]))
    with codecs.open('%s%d_comment.csv' % (comment_dir, av_id), 'w', encoding='utf-8') as f:
        f.write('\n'.join(basic) + '\n')
        f.write('\n'.join(replies) + '\n')
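# A minimal sketch (not part of the original code) of the page-count arithmetic used in
# comment_check_schedule above: with 20 comments per page, the exclusive range() bound
# (comment - 1) // 20 + 2 walks pages 1 .. ceil(comment / 20).
def _expected_pages(comment_total: int, page_size: int = 20) -> int:
    # floor-division form of math.ceil(comment_total / page_size)
    return (comment_total - 1) // page_size + 1

assert list(range(1, (41 - 1) // 20 + 2)) == [1, 2, 3]      # 41 comments -> pages 1..3
assert _expected_pages(41) == 3 and _expected_pages(40) == 2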
def check_comment_once(
    self, av_id: str, pn: int, sort: int, root: int = -1, ps: int = 10
):
    """ check comment once """
    comment = self.get_comment_info(av_id, pn, sort, root, ps)
    if comment is None:
        return
    if root != -1:
        echo(
            "2|debug",
            "Comment check, av_id:",
            av_id,
            "pn:",
            pn,
            "sort:",
            sort,
            "root:",
            root,
            "ps:",
            ps,
        )
    else:
        echo("2|debug", "Comment check, av_id:", av_id, "pn:", pn, "sort:", sort)
    hots = comment["hots"]
    replies = comment["replies"]
    if pn > 1 or root != -1:
        wait_check = replies
    else:
        wait_check = replies if hots is None else [*hots, *replies]
    if root == -1:
        wait_check = [{**jj, "idx": ii + 1} for ii, jj in enumerate(wait_check)]
    else:
        wait_check = [
            {**jj, "idx": "reply-{}".format(ii + 1)}
            for ii, jj in enumerate(wait_check)
        ]
    for ii in wait_check:
        info = {"basic": self.get_comment_detail(ii, av_id, pn, sort)}
        rpid = info["basic"][0]
        crep = ii["replies"]
        idx = ii["idx"]
        if crep is not None:
            info["replies"] = [
                self.get_comment_detail(
                    {**kk, "idx": "{}-{}".format(idx, ww + 1)}, av_id, pn, sort, rpid
                )
                for ww, kk in enumerate(crep)
            ]
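# A small illustration (hypothetical data, not original code) of the idx labels produced in
# check_comment_once above: top-level comments get 1-based integer indices, replies fetched
# through a root get "reply-<n>", and nested replies prefix their parent's idx as "<parent>-<n>".
wait_check_demo = [{"rpid": 10}, {"rpid": 11}]
labeled_demo = [{**jj, "idx": ii + 1} for ii, jj in enumerate(wait_check_demo)]
assert [jj["idx"] for jj in labeled_demo] == [1, 2]
nested_idx_demo = ["{}-{}".format(labeled_demo[0]["idx"], ww + 1) for ww in range(3)]
assert nested_idx_demo == ["1-1", "1-2", "1-3"]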
def js_compile_sn(self, prepare_map):
    ''' js compile sn '''
    wait_js = '<script>' + self.result_js + '</script>'
    sn = self.js_compile.call('analysis_js', wait_js, self.slat, prepare_map)
    echo(2, '_sn', sn)
    return sn
def public_monitor(self, bv_id: str):
    """ monitor follower count and video data again one day after the video is released """
    self.public["L"].append(bv_id)
    created, mid = self.public["T"][bv_id]
    self.get_star_num(mid)
    self.check_rank_v2(bv_id)
    time.sleep(5)
    follower = self.star["T"][mid] if mid in self.star["T"] else 0
    data1 = self.data_v2[bv_id] if bv_id in self.data_v2 else {}
    sleep_time = created + one_day - int(time_stamp())
    if sleep_time < 0:
        return
    echo("4|debug", "Monitor Begin %s" % (bv_id))
    time.sleep(sleep_time)
    self.get_star_num(mid)
    self.check_rank_v2(bv_id)
    time.sleep(5)
    follower_2 = self.star["T"][mid] if mid in self.star["T"] else 0
    data2 = self.data_v2[bv_id] if bv_id in self.data_v2 else {}
    data = [
        time_str(created),
        bv_id,
        follower,
        follower_2,
        *list(data1.values()),
        *list(data2.values()),
    ]
    with codecs.open(data_dir + "public.csv", "a", encoding="utf-8") as f:
        f.write(",".join([str(ii) for ii in data]) + "\n")
def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)

    ''' decode js '''
    decode_js = codecs.unicode_escape_decode(origin_js)[0]

    ''' params replace '''
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')
    echo(2, 'salt', self.slat)

    ''' load to local '''
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))

    ''' delete the function related to ajax '''
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + \
        decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')
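# A tiny sketch of the array-substitution deobfuscation performed in prepare_js above, run on a
# hypothetical obfuscated snippet (the real index.js, variable name, and packed array differ):
# every reference `<var>[<idx>]` is replaced by the literal stored at that index of the array.
decode_js_demo = 'var _0xab=["ajax","salt"];foo(_0xab[0]);bar(_0xab[1]);'
rp_demo, replace_list_demo = '_0xab', ['"ajax"', '"salt"']
for ii, jj in enumerate(replace_list_demo):
    decode_js_demo = decode_js_demo.replace('{}[{}]'.format(rp_demo, ii), jj)
assert 'foo("ajax")' in decode_js_demo and 'bar("salt")' in decode_js_demo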
def load_spot_once(self, pn=1, city_id=10186):
    ''' load spot once '''
    data = {
        'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
        'iMddid': city_id,
        'iTagId': 0,
        'iPage': pn,
    }
    data = self.load_sn(data)
    print(data)
    req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data)
    if req is None or 'data' not in req or 'list' not in req['data']:
        if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
            self.load_spot_once(pn, city_id)
        return
    spot_list = req['data']['list']
    spot_pn = req['data']['page']
    spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
    try:
        total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
    except Exception as e:
        total_pn = 1
        echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)
    if city_id not in self.spot_result:
        self.spot_result[city_id] = spot_tmp
    else:
        self.spot_result[city_id] += spot_tmp
    self.spot_pn[city_id] = total_pn
def req_ip66():
    ''' 66ip.cn js decoder '''
    header['Cookie'] = generate_cookie()
    req_text = basic_req(IP66_URL, 3, header=header)
    echo(2, req_text)
    return req_text
def initproxy(self):
    """ init proxy list """
    results = self.Db.select_db(self.select_list)
    self.proxylist = []
    self.proxylists = []
    self.proxylist_ss = []
    self.proxylists_ss = []
    if not results:
        echo(0, 'Please check db configure!!! The proxy pool cant use!!!>>>')
        return
    for index in results:
        if index[1] == 1:
            self.proxylists.append(index[0])
        elif index[1] == 2:
            self.proxylist.append(index[0])
            self.proxylist_ss.append(index[0])
        elif index[1] == 3:
            self.proxylists.append(index[0])
            self.proxylists_ss.append(index[0])
        else:
            self.proxylist.append(index[0])
    echo(2, len(self.proxylist), ' http proxy can use.')
    echo(2, len(self.proxylists), ' https proxy can use.')
    echo(2, len(self.proxylist_ss), ' ss http proxy can use.')
    echo(2, len(self.proxylists_ss), ' ss https proxy can use.')
def db_can_use_proxy(self):
    """ test whether the db already has this data """
    results = self.select_proxy([ii[0] for ii in self.can_use_ip.values()])
    ss_len = len([1 for ii in self.can_use_ip.values() if ii[1] > 1])
    echo("2|info", "SS proxies", ss_len)
    insert_list = []
    update_list = []
    ip_map = {}
    if results != False:
        for ip_info in results:
            ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]
        for ip_now in self.can_use_ip.values():
            http_type = ip_now[1]
            ip_now = ip_now[0]
            if ip_now in ip_map:
                if ip_map[ip_now][1]:
                    update_list.append((ip_map[ip_now][0], ip_now, http_type, 0))
            else:
                insert_list.append((ip_now, http_type))
        if len(insert_list):
            self.insert_proxy(insert_list)
        if len(update_list):
            self.update_proxy(update_list, 0)
    self.can_use_ip = {}
def update_article(self, article_id: str, article_body: str):
    p = self.share2article[article_id][-2].split("/")[-1]
    article_info = self.list_recent[p]
    data = {
        "fileId": p,
        "parentId": article_info["parentId"],
        "domain": article_info["domain"],
        "rootVersion": -1,
        "sessionId": "",
        "modifyTime": int(time_stamp()),
        "bodyString": article_body,
        "transactionId": p,
        "transactionTime": int(time_stamp()),
        "orgEditorType": article_info["orgEditorType"],
        "tags": article_info["tags"],
        "cstk": self.cstk,
    }
    url = self.SYNC_URL % ("push", self.cstk)
    req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
    if req is None or list(req.keys()) != [
        "entry",
        "meta",
        "effectedShareEntries",
        "forcePullVersion",
        "effected",
    ]:
        echo(
            "0|error",
            "Update article_id {} Error".format(article_id),
            req if req is not None else "",
        )
        return False
    echo("1|warning", "Update article_id {} Success!!!".format(article_id))
    return True
def get_check(self):
    ''' check comment '''
    self.load_av_lists()
    av_id_list = [
        [ii['aid'], ii['comment']]
        for ii in self.av_id_map.values()
        if not re.findall(self.ignore_list, str(ii['aid']))
    ]
    av_map = {ii['aid']: ii for ii in self.av_id_map.values()}
    self.comment_next = {ii: True for (ii, _) in av_id_list}
    if self.av_id_list and len(self.av_id_list) and len(self.av_id_list) != len(av_id_list):
        new_av_id = [
            ii for (ii, _) in av_id_list
            if ii not in self.av_id_list and ii not in self.del_map
        ]
        self.rank_map = {**self.rank_map, **{ii: [] for ii in new_av_id}}
        echo(1, new_av_id)
        for ii in new_av_id:
            shell_str = 'nohup ipython3 bilibili/bsocket.py {} %d >> log.txt 2>&1 &'.format(ii)
            echo(0, shell_str)
            os.system(shell_str % 1)
            os.system(shell_str % 2)
            email_str = '{} av:{} was released at {}!!! Please check the auto pipeline.'.format(
                av_map[ii]['title'], ii, time_str(av_map[ii]['created']))
            email_str2 = '{} {} was released at {}.\nPlease check the online & common program.\n\nBest wish for you\n--------\nSend from script by gunjianpan.'.format(
                av_map[ii]['title'], time_str(av_map[ii]['created']), self.BASIC_AV_URL % ii)
            send_email(email_str2, email_str)
            self.update_ini(ii)
            self.public[ii] = [av_map[ii]['created'], av_map[ii]['mid']]
    self.av_id_list = [ii for (ii, _) in av_id_list]
    now_hour = int(time_str(time_format='%H'))
    now_min = int(time_str(time_format='%M'))
    now_time = now_hour + now_min / 60
    if now_time > self.ignore_start and now_time < self.ignore_end:
        return
    if os.path.exists('{}comment.pkl'.format(comment_dir)):
        with codecs.open('{}comment.pkl'.format(comment_dir), 'rb') as f:
            self.comment = pickle.load(f)
    if self.assign_up_mid == -1:
        return
    threading_list = []
    for (ii, jj) in av_id_list:
        if ii not in self.comment:
            self.comment[ii] = {}
        work = threading.Thread(target=self.comment_check_schedule, args=(ii, jj,))
        threading_list.append(work)
    for work in threading_list:
        work.start()
    for work in threading_list:
        work.join()
    with codecs.open('{}comment.pkl'.format(comment_dir), 'wb') as f:
        pickle.dump(self.comment, f)
    return av_id_list
def get_search_list(self, q: str):
    if self.proxy_can_use:
        base_url = self.API_PROXY_URL if random.random() * 10 > 7 else self.API_BASIC_URL
    else:
        base_url = self.API_BASIC_URL
    url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
    search_json = proxy_req(url, 1)
    if search_json is None or 'subjects' not in search_json:
        if search_json and 'code' in search_json:
            if search_json['code'] == 112:
                self.proxy_can_use = False
        if can_retry(url, 6):
            time.sleep(random.random() * (3.14 + random.randint(4, 10)) + 3.14)
            self.get_search_list(q)
        else:
            self.again_list.append(q)
            echo(0, url, 'Failed')
        return
    # echo(2, url, 'loaded')
    id2name = {int(ii['id']): ii['title'] for ii in search_json['subjects']}
    self.movie_id2name = {**self.movie_id2name, **id2name}
    self.finish_list.append(q)
    if not len(self.finish_list) % 600:
        echo(2, len(self.finish_list), 'Finish...')
        dump_bigger(self.movie_id2name, '{}douban_movie_id.pkl'.format(data_dir))
def history_rank(self, time_gap: int, now_info: list, av_id: int):
    echo(0, 'send history rank')
    time_gap = round(time_gap / 10) * 10
    history_map = {
        ii: jj for ii, jj in self.history_map[time_gap].items() if jj[1]
    }
    other_views = [int(ii[1]) for ii in history_map.values()]
    other_views_len = len(other_views)
    other_views.append(now_info[1])
    ov_sort_idx = np.argsort(-np.array(other_views))
    av_ids = list(history_map.keys())
    now_sorted = [jj for jj, ii in enumerate(ov_sort_idx) if ii == other_views_len][0] + 1
    other_result = [
        (jj + 1, av_ids[ii]) for jj, ii in enumerate(ov_sort_idx[:4]) if ii != other_views_len
    ]
    time_tt = self.get_time_str(time_gap)
    email_title = 'av{}发布{}, 本年度排名No.{}/{}, 播放量: {}, 点赞: {}, 硬币: {}, 收藏: {}, 弹幕: {}'.format(
        av_id, time_tt, now_sorted, len(other_views), now_info[1], now_info[2],
        now_info[3], now_info[4], now_info[7])
    email_title += self.get_history_rank(now_info)
    context = '{}\n\n'.format(email_title)
    for no, av in other_result[:3]:
        data_info = history_map[av]
        context += '{}, av{}, 本年度No.{}, 播放量: {}, 点赞: {}, 硬币: {}, 收藏: {}, 弹幕: {}{}, 发布时间: {}\n'.format(
            self.av_id_map[av]['title'].split('|', 1)[0], av, no, data_info[1],
            data_info[2], data_info[3], data_info[4], data_info[7],
            self.get_history_rank(data_info), time_str(self.av_id_map[av]['created']))
    context += '\nBest wish for you\n--------\nSend from script by gunjianpan.'
    send_email(context, email_title)
    self.history_check_finish.append(round(time_gap / 10))
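# A standalone sketch (illustrative numbers, not original code) of the ranking trick used in
# history_rank above: append the new view count, argsort descending, and the 1-based rank of the
# new entry is its position in that ordering.
import numpy as np

other_views_demo = [1200, 800, 3000]        # historical views at the same time gap
other_views_len_demo = len(other_views_demo)
other_views_demo.append(1500)               # the current video's views
ov_sort_idx_demo = np.argsort(-np.array(other_views_demo))
now_sorted_demo = [jj for jj, ii in enumerate(ov_sort_idx_demo) if ii == other_views_len_demo][0] + 1
assert now_sorted_demo == 2                 # 3000 > 1500 > 1200 > 800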
def load_comment_v2(self, movie_id: int, start: int):
    ''' load comment by proxy '''
    url = self.COMMENT_PROXY_URL % (movie_id, start)
    self.generate_cookie()
    comment_json = basic_req(url, 1)
    if comment_json is None or 'comments' not in comment_json:
        if comment_json is not None and 'code' in comment_json:
            if comment_json['code'] == 5000:
                self.finish_list[(movie_id, start)] = 0
                self.checkpoint()
            else:
                if comment_json['code'] == 112:
                    self.proxy_can_use = False
                echo(2, url, 'Failed')
                self.again_list.append([movie_id, start])
        else:
            self.again_list.append([movie_id, start])
            echo(0, url, 'Failed')
        return
    comment_html = comment_json['comments']
    comment = {
        (movie_id, ii['author']['id']): [
            ii['author']['name'], ii['author']['id'], ii['created_at'],
            ii['content'], '', ii['rating']['value']
        ]
        for ii in comment_html
    }
    user_list = {ii['author']['id'] for ii in comment_html}
    self.user_info = {*self.user_info, *user_list}
    self.comment = {**self.comment, **comment}
    if len(user_list) == 100:
        self.more_user.append([movie_id, start + 100])
    self.finish_list[(movie_id, start)] = 0
    self.finish_list[(movie_id, start + 20)] = 0
    self.finish_list[(movie_id, start + 40)] = 0
    self.finish_list[(movie_id, start + 60)] = 0
    self.finish_list[(movie_id, start + 80)] = 0
    self.checkpoint()
def load_proxies_list(self, types: int = 2):
    """ load proxies """
    SITES = [
        "http://www.proxyserverlist24.top/",
        "http://www.live-socks.net/",
    ]
    spider_pool = []
    self.waitjudge = []
    for site in SITES:
        self.get_other_proxies(site)
    self.gatherproxy(3)
    waitjudge = list(set(self.waitjudge))
    waitjudge_http = ["http://" + ii for ii in waitjudge]
    waitjudge_https = ["https://" + ii for ii in waitjudge]
    if not types:
        self.waitjudge = waitjudge_http
    elif types == 1:
        self.waitjudge = waitjudge_https
    else:
        self.waitjudge = waitjudge_http + waitjudge_https
    echo(
        "1|info",
        "-_-_-_-_-_-_-",
        len(waitjudge),
        "Proxies wait to judge -_-_-_-_-_-_-",
    )
def load_index():
    ''' load index '''
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    if not len(text):
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)
    threading_list = [
        threading.Thread(target=load_other, args=(ii,)) for ii in movie_another
    ]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [
        threading.Thread(target=load_other, args=(ii,)) for ii in movie_again
    ]
    shuffle_batch_run_thread(threading_list, 100)

    # deduplicate the movie list
    movie_list = set(movie_list)

    # export the crawled movie list
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))

    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        url_num, movie_num, out_path, end_time(version, 0)))
def init_proxy(self):
    """ init proxy list """
    results = self.Db.select_db(self.select_list)
    self.proxylist = []
    self.proxylists = []
    self.proxylist_ss = []
    self.proxylists_ss = []
    if not results:
        echo("0|error", "Please check db configure!!! The proxy pool cant use!!!>>>")
        return
    for index in results:
        if index[1] == 1:
            self.proxylists.append(index[0])
        elif index[1] == 2:
            self.proxylist.append(index[0])
            self.proxylist_ss.append(index[0])
        elif index[1] == 3:
            self.proxylists.append(index[0])
            self.proxylists_ss.append(index[0])
        else:
            self.proxylist.append(index[0])
    echo("2|info", len(self.proxylist), " http proxy can use.")
    echo("2|info", len(self.proxylists), " https proxy can use.")
    echo("2|info", len(self.proxylist_ss), " ss http proxy can use.")
    echo("2|info", len(self.proxylists_ss), " ss https proxy can use.")
def generate_cookie():
    ''' eval the 66ip.cn anti-bot script to build a cookie, tested on 19.5.7 '''
    req = basic_req(IP66_URL, 2, header=header)
    basic_cookie = req.cookies.get_dict()

    ''' !important: \b in py -> \x80 '''
    req_text = r'{}'.format(req.text)

    ''' get the script that will be eval-ed '''
    script_text = re.findall('<script>(.*?)</script>', req_text)[0]
    script_text = script_text.replace('{eval(', '{aaa=').replace(');break', ';break')
    script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa'))
    echo(0, script_eval)
    try:
        ''' replace document & window '''
        params = re.findall(
            r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)',
            script_eval)
        wait_eval = params[0][1].replace(
            "document.createElement('div')", "{}").replace('\x80', '')
        wait_replace = re.findall(r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0]
        wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";')

        ''' eval & encode cookie '''
        other_param = js2py.eval_js(
            'function ddd() {window={};' + wait_eval + '}ddd()')
        cookie = '{}; {}{}'.format(
            encoder_cookie(basic_cookie), params[0][0], other_param)
        echo(1, 'cookie', cookie)
        return cookie
    except Exception:
        return generate_cookie()
def tostring(self, e: int):
    ''' render the big integer as a string in radix e (e is a power of two);
        self.E holds the limbs, self.t the limb count, DB is presumably the bit
        width of one limb, and g the digit alphabet '''
    if self.s < 0:
        echo('0|warning', '.s < 0', self.s)
        return '-'
    t = int(np.log2(e))
    r, o, i, a = (1 << t) - 1, False, '', self.t
    s = DB - a * DB % t
    if a > 0:
        if s < DB:
            n = self.E[a] >> s
            if n > 0:
                o = True
                i = g[n]
        a -= 1
        while a >= 0:
            if s < t:
                n = (self.E[a] & (1 << s) - 1) << t - s
                a -= 1
                s += DB - t
                n = n | (self.E[a] >> s)
            else:
                s -= t
                n = self.E[a] >> s & r
                if s <= 0:
                    s += DB
                    a -= 1
            if n > 0:
                o = True
            if o:
                i += g[n]
    return i if o else '0'
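# A minimal, self-contained sketch (illustrative only; a simpler equivalent of the idea, not the
# original bit-windowing loop) of what tostring above computes: a big integer stored as
# little-endian fixed-width limbs, rendered in radix e (a power of two) by taking
# t = log2(e) bits per output digit.
def limbs_to_string(limbs, db, e, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
    t = e.bit_length() - 1                    # bits per output digit
    value = 0
    for idx, limb in enumerate(limbs):        # assemble the full integer from db-bit limbs
        value |= limb << (idx * db)
    if value == 0:
        return '0'
    digits = ''
    while value:
        digits = alphabet[value & ((1 << t) - 1)] + digits
        value >>= t
    return digits

assert limbs_to_string([0x34, 0x12], 8, 16) == '1234'     # 0x1234 rendered in hex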
def get_s_click_url(self, s_click_url: str):
    """ decode the s.click real jump url; @validation time: 2019.10.23 """
    time.sleep(np.random.randint(0, 10))
    item_url = self.get_s_click_location(s_click_url)
    if item_url is None:
        echo(3, "s_click_url location Error..")
        return
    return self.get_item_detail(item_url)
def get_download(self, types: str):
    url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
    tt = basic_req(url, 1)
    if tt is None:
        return []
    tt_list = tt[0]["LISTA"]
    echo(1, "Get download", types, len(tt_list))
    return ["{}:{}".format(ii["IP"], ii["PORT"]) for ii in tt_list]
def _getroom_id(self, proxy: bool = True):
    """ get av room id """
    cid = self.get_cid(self._bv_id)
    assert (
        cid and len(cid) >= self._p
    ), "Actual Page len: {} <=> Need Pages Num: {}".format(len(cid), self._p)
    self._room_id = int(cid[self._p - 1])
    echo(3, "Room_id:", self._room_id)
def load_picture_pipeline(self, file_path: str):
    mkdir('picture')
    tpk_list = self.tpwds[file_path]
    picture_url = [
        (self.tpwd_map[file_path][tpk]['picUrl'], idx)
        for idx, tpk in enumerate(tpk_list)
        if tpk in self.tpwd_map[file_path]
    ]
    picture_url = [
        (ii, idx) for ii, idx in picture_url
        if not os.path.exists('picture/{}.jpg'.format(idx))
    ]
    echo(1, 'Load {} picture Begin'.format(len(picture_url)))
    pp = [
        self.tpwd_exec.submit(self.load_picture, ii, jj) for ii, jj in picture_url
    ]
    return pp
def checkpoint(self):
    checkpoint_num = 32 if self.proxy_can_use else 200
    if not len(self.finish_list.keys()) % checkpoint_num:
        echo(2, len(self.finish_list), 'Finish...')
        # dump_bigger(self.comment, '{}douban_comment.pkl'.format(data_dir))
        dump_bigger(self.user_info, '{}douban_user.pkl'.format(data_dir))
        dump_bigger(self.finish_list, '{}douban_cf.pkl'.format(data_dir))
        dump_bigger(self.more_user, '{}douban_more.pkl'.format(data_dir))
        dump_bigger(self.again_list, '{}douban_again.pkl'.format(data_dir))
def get_free_proxy(self, url: str):
    req = basic_req(url, 2)
    if req is None:
        return []
    tt = req.text
    t_list = re.findall(r"<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>", tt)
    echo(1, "Get Free proxy List", url, len(t_list))
    return ["{}:{}".format(ii, jj) for ii, jj in t_list]