def judgeurl(self, urls, index, times):
    """ judge whether a proxy is usable:
        use /api/playlist to test http proxies; use /discover/playlist for https
        1. the request must finish within timeout = 5
        2. response.result.tracks must have the expected size
    """
    http_type = urls[4] == 's'
    proxies = {type_map[http_type]: urls}
    test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
    ss_url = 'https://www.google.com/?gws_rd=ssl'
    try:
        data = basic_req(test_url, 1, proxies)
        result = data['result']
        tracks = result['tracks']
        if len(tracks) == 56:
            if times < 2:
                self.judgeurl(urls, index, times + 1)
            else:
                self.canuseip[index] = [urls, int(http_type)]
                data = basic_req(ss_url, 0)
                if len(str(data)) > 5000:
                    self.canuseip[index] = [urls, int(http_type) + 2]
        else:
            self.cannotuseip[index] = urls
    except:
        if not index in self.canuseip:
            self.cannotuseip[index] = urls

def judgeurl(self, urls, index, times, ss_test=False):
    """ judge whether a proxy is usable:
        use /api/playlist to test http proxies; use /discover/playlist for https
        1. the request must finish within timeout = 5
        2. response.result.tracks must have the expected size
    """
    http_type = urls[4] == 's'
    proxies = {type_map[http_type]: urls}
    test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
    ss_url = 'https://www.google.com/?gws_rd=ssl'
    try:
        data = basic_req(test_url, 1, proxies)
        result = data['result']
        tracks = result['tracks']
        if len(tracks) == 56:
            if times < 0:
                self.judgeurl(urls, index, times + 1)
            else:
                echo(1, urls, proxies, 'Proxies can use.')
                self.canuse_proxies.append(urls)
                self.canuseip[index] = [urls, int(http_type)]
                if ss_test:
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.canuseip[index] = [urls, int(http_type) + 2]
        else:
            echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
            self.cannotuseip[index] = urls
    except:
        echo(0, urls, proxies, 'return error [][][][][][]')
        if not index in self.canuseip:
            self.cannotuseip[index] = urls

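# A minimal standalone sketch of the same proxy check using `requests`
# directly (the repo's basic_req helper wraps something like this). The
# playlist id, the expected track count (56), and the 5s timeout mirror
# judgeurl above; everything else here is illustrative.
import requests

def probe_proxy(proxy_url, timeout=5):
    scheme = 'https' if proxy_url[4] == 's' else 'http'
    test_url = scheme + '://music.163.com/api/playlist/detail?id=432853362'
    try:
        resp = requests.get(test_url, proxies={scheme: proxy_url}, timeout=timeout)
        tracks = resp.json()['result']['tracks']
        return len(tracks) == 56
    except (requests.RequestException, KeyError, ValueError):
        return False
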
def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)
    ''' decode js '''
    decode_js = codecs.unicode_escape_decode(origin_js)[0]
    ''' params replace '''
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')
    echo(2, 'salt', self.slat)
    ''' write the decoded js to a local file '''
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))
    ''' delete the function about ajax '''
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')

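# Toy illustration of the "params replace" step above: the obfuscated
# js references a lookup array (`_0x1a2b[0]` style), and substituting
# each reference with its literal recovers readable code. The sample
# string here is fabricated for demonstration.
sample_js = 'var x;var y;var _0x1a2b=["md5","salt"];hash=_0x1a2b[0];key=_0x1a2b[1];'
replace_list_str = sample_js.split(';')[2]                          # 'var _0x1a2b=[...]'
begin = replace_list_str.index('=[') + 2
end = replace_list_str.index(']')
tokens = replace_list_str[begin:end].split(',')                     # ['"md5"', '"salt"']
rp = replace_list_str[replace_list_str.index(' ') + 1:begin - 2]    # '_0x1a2b'
decoded = sample_js
for ii, jj in enumerate(tokens):
    decoded = decoded.replace('{}[{}]'.format(rp, ii), jj)
assert decoded.endswith('hash="md5";key="salt";')
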
def test_change_youdaoyun(self, article_id, body, article_name):
    """ change a youdaoyun article (demo)
        @param article_id: index of the article to change
        @param body: new article body
        @param article_name: new article name
        requires the youdaoyun web cookie in 'buildmd/data/cookie'
    """
    url = 'https://note.youdao.com/yws/api/personal/sync?method=push&keyfrom=web&cstk=E3CF_lx8'
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
        'Origin': 'https://note.youdao.com',
        'Referer': 'https://note.youdao.com/web'
    }
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    headers['Cookie'] = cookie[:-1]
    headers['Host'] = url.split('/')[2]
    file_list_url = 'https://note.youdao.com/yws/api/personal/file?method=listRecent&offset=0&limit=30&keyfrom=web&cstk=E3CF_lx8'
    file_data = {'cstk': 'E3CF_lx8'}
    ca = basic_req(file_list_url, 11, data=file_data, header=headers)
    if not len(ca):
        print('List Error')
        return
    change_data_origin = ca[article_id]['fileEntry']
    body_string = [
        '<?xml version="1.0"?><note xmlns="http://note.youdao.com" schema-version="1.0.3" file-version="0"><head/><body><para><coId>12-1550424181958</coId><text>',
        body,
        '</text><inline-styles/><styles/></para></body></note>'
    ]
    change_data = {
        'name': article_name,
        'fileId': change_data_origin['id'],
        'parentId': change_data_origin['parentId'],
        'domain': change_data_origin['domain'],
        'rootVersion': -1,
        'sessionId': '',
        'modifyTime': int(round(time.time())),
        'bodyString': ''.join(body_string),
        'transactionId': change_data_origin['id'],
        'transactionTime': int(round(time.time())),
        'orgEditorType': change_data_origin['orgEditorType'],
        'tags': change_data_origin['tags'],
        'cstk': 'E3CF_lx8'
    }
    print(change_data)
    cb = basic_req(url, 12, data=change_data, header=headers)
    return cb

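# The note body youdao expects is a small XML envelope; a standalone
# helper for it, mirroring body_string above (the coId value is copied
# from the request above and may well be arbitrary):
def build_body_string(body):
    return ('<?xml version="1.0"?><note xmlns="http://note.youdao.com" '
            'schema-version="1.0.3" file-version="0"><head/><body><para>'
            '<coId>12-1550424181958</coId><text>' + body +
            '</text><inline-styles/><styles/></para></body></note>')
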
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace('\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs

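# The two-step cleanup above first strips newlines/tabs/nbsp/ellipses,
# then collapses runs of whitespace; in isolation (sample text invented):
raw = 'Dog\n walking\xa0 scams\t ...'
clean = ' '.join(raw.replace('\n', '').replace('\xa0', '').replace('\t', '').replace('...', '').strip().split())
assert clean == 'Dog walking scams'
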
def bulk_import_alimama_once(self, index, group_id):
    """ bulk import one batch (200 goods) into an alimama favorites group """
    url = 'http://pub.alimama.com/favorites/item/batchAdd.json'
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    goods_len = len(self.goods_candidate)
    begin_id = index * 200
    end_id = min(goods_len, (index + 1) * 200)
    goods_ids = self.goods_candidate[begin_id:end_id]
    update_data = {
        'groupId': group_id,
        'itemListStr': ','.join(goods_ids),
        't': str(int(round(time.time() * 1000))),
        '_tb_token_': cookie[1][:-1],
        'pvid': cookie[2][:-1]
    }
    print(update_data)
    cb = basic_req(url, 12, data=update_data, header=self.headers)
    if cb.status_code == 200 and cb.json()['info']['message'] != 'nologin':
        print(cb.json()['data'])

def get_cookie(self):
    """ make a login cookie
        PS: although the cookie's expiry time is more than one year,
        it breaks as soon as the connection closes,
        so you need to reactivate the cookie with this function.
    """
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Host': 'www.gatherproxy.com',
        'Origin': 'http://www.gatherproxy.com',
        'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
        'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    login_url = 'http://www.gatherproxy.com/subscribe/login'
    cookie_html = basic_req(login_url, 0, header=headers)
    verify_text = cookie_html.find_all('div', class_='label')[2].span.text
    verify_list = verify_text.replace('= ', '').strip().split()
    num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
               'Fine': 5, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8,
               'Nine': 9, 'Ten': 10}  # 'Fine' guards what is presumably the site's misspelling of 'Five'
    verify_num = [verify_list[0], verify_list[2]]
    for index, num in enumerate(verify_num):
        if num.isdigit():
            verify_num[index] = int(num)
        elif num in num_map:
            verify_num[index] = num_map[num]
        else:
            echo(0, 'Error', index)
    verify_code = 0
    error = True
    operation = verify_list[1]
    if operation in ('+', 'plus', 'add'):
        verify_code = verify_num[0] + verify_num[1]
        error = False
    if operation in ('-', 'minus'):
        verify_code = verify_num[0] - verify_num[1]
        error = False
    if operation in ('X', 'multiplication', 'multiplied'):  # 'multiplied' treated as multiplication
        verify_code = verify_num[0] * verify_num[1]
        error = False
    if error:
        echo(0, 'Error', operation)
    if not os.path.exists('%spassage' % data_dir):
        echo(0, 'gather passage not exist!!!')
        return
    with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
        passage = [index[:-1] for index in f.readlines()]
    data = {'Username': passage[0], 'Password': passage[1], 'Captcha': str(verify_code)}
    time.sleep(2.163)
    r = requests.session()
    r.cookies = cj.LWPCookieJar()
    login_req = r.post(login_url, headers=headers, data=data, verify=False)

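# Standalone sketch of the word-arithmetic captcha solved above; the
# operand words and operator branches follow get_cookie, while the
# sample input is invented.
NUM_MAP = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
           'Fine': 5, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8,
           'Nine': 9, 'Ten': 10}

def solve_captcha(text):
    left, op, right = text.replace('= ', '').strip().split()[:3]
    to_int = lambda w: int(w) if w.isdigit() else NUM_MAP[w]
    a, b = to_int(left), to_int(right)
    if op in ('+', 'plus', 'add'):
        return a + b
    if op in ('-', 'minus'):
        return a - b
    return a * b  # 'X' / 'multiplication' / 'multiplied'

assert solve_captcha('Three + 4 = ') == 7
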
def get_goods_id_first(self, origin_url, index):
    """ get goods id, first pass """
    origin_url = origin_url.replace('https', 'http')
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if not '"title":"' in wait:
        return
    title = re.findall('"title":".*","', wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)

def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = get_request_proxy(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    if not len(head) or len(head[0].find_all('script')) < 4 or not '{' in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)

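# The script-tag scrape above slices from the first '{' to the first
# ';' and feeds the result to json.loads; the same trick in isolation
# (the sample script text is fabricated):
import json
script_text = 'window.__INITIAL_STATE__={"videoData":{"cid":7,"pages":[{"cid":7}]}};(function(){}());'
begin, end = script_text.index('{'), script_text.index(';')
state = json.loads(script_text[begin:end])
assert state['videoData']['cid'] == 7
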
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            return self.get_request(url, types)
        return
    return result

def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if result is None or not result or not len(result.find_all('p', class_='content')):
        if can_retry(url):
            return self.get_request_v3(url, types)
        return
    return result

def basic_view(self, url: str, times: int, types: int):
    ''' press view count: no data input needed '''
    url = self.AV_URL
    if types == 1:
        html = get_request_proxy(url, 0)
    else:
        html = basic_req(url, 0)
    if html == False and times < 5:
        self.basic_view(url, times + 1, types)

def get_request_proxy(self, url: str, types: int, data=None, test_func=None, header=None):
    """ send requests through a proxy, and record proxies that can't be used
        @types S0XY:
            X: 0 -> get, 1 -> post
            Y: 0 -> html, 1 -> json, 2 -> basic
            S: 0 -> basic, 1 -> ss
        supports retry on failure && auto-records failing proxies
    """
    httptype = url[4] == 's'
    ss_type = types // 1000
    types %= 1000
    if ss_type:
        proxylist = self.proxylists_ss if httptype else self.proxylist_ss
    else:
        proxylist = self.proxylists if httptype else self.proxylist
    if not len(proxylist):
        if self.Db.db:
            echo(0, 'Proxy pool empty!!! Please check the db conn & db dataset!!!')
        proxies = {}
    else:
        index = random.randint(0, len(proxylist) - 1)
        proxies_url = proxylist[index]
        proxies = {type_map[httptype]: proxies_url}
    try:
        result = basic_req(url, types, proxies, data, header)
        if test_func is not None and not test_func(result):
            if self.check_retry(url):
                return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
            self.failuredtime[url] = 0
            return
        return result
    except:
        self.cannotuseip[random.randint(0, MAXN)] = proxies_url
        if proxies_url in proxylist:
            proxylist.remove(proxies_url)
        if not len(self.cannotuseip.keys()) % 10:
            self.cleancannotuse()
        if self.check_retry(url):
            return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
        return

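# The `types` flag packs three fields; a small decoder clarifying the
# S0XY encoding documented above (illustrative only):
def decode_types(types):
    ss = types // 1000                                   # S: 0 -> basic, 1 -> ss
    types %= 1000
    method = 'post' if types // 10 else 'get'            # X
    fmt = {0: 'html', 1: 'json', 2: 'basic'}[types % 10] # Y
    return ss, method, fmt

assert decode_types(1011) == (1, 'post', 'json')
assert decode_types(0) == (0, 'get', 'html')
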
def request_text(self, url):
    ''' request text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text

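# The basic_req + can_retry idiom above recurs throughout this repo; a
# generic standalone version of the same pattern (here with a plain
# bounded loop instead of the repo's per-url failure counter):
def fetch_with_retry(fetch, url, validate, retries=3, default=None):
    for _ in range(retries):
        result = fetch(url)
        if validate(result):
            return result
    return default
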
def get_goods_second(self, url, index):
    second_result = basic_req(url, 0, header=self.headers)
    if not second_result or not len(second_result.find_all('input')):
        if can_retry(url):
            self.get_goods_second(url, index)
        return
    goods_id = second_result.find_all('input')[6]['value']
    print(goods_id)
    self.goods_map[index] = goods_id

def basic_press(self, url, times, types):
    """ press: no data input needed """
    url = url + str(int(round(time.time() * 1000)))
    if types == 1:
        html = get_request_proxy(url, 1)
    else:
        html = basic_req(url, 1)
    if html == False and times < 5:
        self.basic_press(url, times + 1, types)

def load_city_list(self):
    ''' load city list '''
    text = basic_req(self.MDD_URL, 3)
    city_list = re.findall('/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)', text)
    id2map = {int(ii[0]): ii[1].strip() for ii in city_list if ii[0].isdigit()}
    self.city_list = id2map.keys()
    self.id2map = id2map

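# The city-list regex above, run against a fabricated html fragment to
# show what it captures (id, name, closing tag):
import re
sample = '<a href="/travel-scenic-spot/mafengwo/10065.html" target="_blank">北京</a>'
pairs = re.findall('/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)', sample)
assert pairs == [('10065', '北京', '</a>')]
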
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1], '&pvid=', cookie[2][:-1], '&t=',
        str(int(round(time.time() * 1000))), '&_t=',
        str(int(round(time.time() * 1000))), '&q=', goods_name
    ]
    headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': cookie[0][:-1],
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or not 'data' in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = ['||'.join([str(goods['auctionId']), goods_name, str(goods['zkPrice'])])
             for goods in page_list][0]
    self.goods_name[index] = title
    print(title)

def get_hotel_detail(self):
    ''' get hotel detail '''
    params = {
        **self.generate_other_params(),
        'callback': self.generate_callback(16),
        'eleven': self.generate_eleven(),
        '_': int(time.time() * 1000)
    }
    params_list = ['{}={}'.format(ii, jj if jj is not None else '') for ii, jj in params.items()]
    url = '{}?{}'.format(HOTEL_ROOMLIST_FOR_DETAIL_URL, '&'.join(params_list))
    echo(2, 'XHR url', url)
    text = basic_req(url, 1)
    return text

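# A note on the hand-rolled '&'.join above: urlencode would
# percent-escape the eleven token, which may not be what the endpoint
# expects. For comparison, the standard-library form (BASE_URL is a
# hypothetical stand-in for HOTEL_ROOMLIST_FOR_DETAIL_URL):
from urllib.parse import urlencode
BASE_URL = 'https://example.com/roomlist'
params = {'callback': 'cb0123456789abcd', 'eleven': 'token-value', '_': 1555555555000}
url = '{}?{}'.format(BASE_URL, urlencode(params))
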
def xiciproxy(self, page):
    """ xici proxy: http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but most of its proxies no longer work.
    """
    if not str(page).isdigit():
        print('Please input num!')
        return []
    version = begin_time()
    url = 'http://www.xicidaili.com/nn/%d'
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all('tr')
        for jj in range(1, len(tem)):
            tds = tem[jj].find_all('td')
            protocol = tds[5].text.lower()
            self.waitjudge.append(protocol + '://' + tds[1].text + ':' + tds[2].text)
    self.threadjude()
    end_time(version)

def get_playlist_id(self, classify, offset):
    """ get playlist ids for one classify """
    host = 'https://music.163.com'
    allclassify = classify == '全部风格'
    url = host + self.classifylist[classify] + ('?' if allclassify else '&') + 'order=hot&limit=35&offset=' + str(offset)
    html = basic_req(url, 0)
    if not html:
        if can_retry(url):
            self.get_playlist_id(classify, offset)
        return []
    alist = html.find_all('a', class_='icon-play')
    if not len(alist):
        if can_retry(url):
            self.get_playlist_id(classify, offset)
        return []
    for index in alist:
        self.playlists.append(index['data-res-id'])

def load_collect_once(self, index):
    """ load taobao collect """
    baseurl = 'https://shoucang.taobao.com/item_collect_n.htm?t='
    url = baseurl + str(int(round(time.time() * 1000)))
    if index:
        url += '&ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=' + str(30 * index)
    collect_html = basic_req(url, 0)
    collect_list = []
    if not isinstance(collect_html, bool):
        collect_list = collect_html.find_all('li', class_=[
            'J_FavListItem g-i-item fav-item ',
            'J_FavListItem g-i-item fav-item isinvalid',
            'J_FavListItem g-i-item fav-item istmall ',
            'J_FavListItem g-i-item fav-item istmall isinvalid'
        ])
        print(len(collect_list))
    if isinstance(collect_html, bool) or not len(collect_list):
        if can_retry(baseurl + str(index), index):
            self.load_collect_once(index)
        return
    text = []
    for collect in collect_list:
        data_id = collect['data-id']
        title = collect.find_all('a', class_='img-item-title-link')[0].text
        price = collect.find_all('div', class_='g_price')[0].strong.text if len(collect.find_all('div', class_='g_price')) else '0'
        text.append('||'.join([data_id, title, price]))
    self.collect[index] = text

def load_rank_index(self, index: int, day_index: int):
    ''' load rank '''
    changeHeaders({'Referer': self.RANKING_URL % (index, day_index)})
    url = self.RANKING_URL % (index, day_index)
    html = basic_req(url, 0)
    rank_list = html.find_all('li', class_='rank-item')
    now_av_id = []
    wait_check_public = []
    rank_map = {}
    for av in rank_list:
        av_href = av.find_all('a')[0]['href']
        av_id = int(re.findall('av.*', av_href)[0][2:-1])
        now_av_id.append(av_id)
        if not self.check_type(av_id):
            continue
        rank = int(av.find_all('div', class_='num')[0].text)
        score = int(av.find_all('div', class_='pts')[0].find_all('div')[0].text)
        name = av.find_all('span')[2].text
        if self.add_av(av_id, rank, score):
            rank_map[av_id] = [rank, score, name, index, day_index]
    ''' check assign av rank '''
    for ii in self.assign_ids:
        if not ii in self.public:
            wait_check_public.append(ii)
        if not ii in self.last_view and not ii in self.rank_map:
            self.rank_map[ii] = []
    have_assign = len([0 for ii in self.assign_ids if ii in now_av_id]) > 0
    ''' check tid type '''
    threading_public = []
    for ii in rank_map.keys():
        work = threading.Thread(target=self.check_type_req, args=(ii,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    for ii, jj in rank_map.items():
        if self.check_type(ii) != True:
            continue
        if not ii in self.public:
            wait_check_public.append(ii)
        self.last_check[ii] = int(time.time())
        self.rank_map[ii] = jj
    ''' load public basic data '''
    threading_public = []
    for ii in wait_check_public:
        work = threading.Thread(target=self.public_data, args=(ii, 0))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    ''' begin monitor '''
    threading_list = []
    for ii, jj in self.public.items():
        if not ii in self.public_list and jj[0] + one_day > int(time.time()):
            work = threading.Thread(target=self.public_monitor, args=(ii, 0))
            threading_list.append(work)
    for work in threading_list:
        work.start()
    return have_assign

def one_click_bilibili(self, url: str, times: int, types: int):
    ''' one click: view -> click now -> click web -> report heartbeat '''
    url = self.AV_URL
    if types == 1:
        html = get_request_proxy(url, 0)
    else:
        html = basic_req(url, 0)
    if html == False:
        if times < 5:
            self.basic_view(url, times + 1, types)
        return
    times = 0
    url_1 = self.CLICK_NOW_URL
    if types == 1:
        json_1 = get_request_proxy(url_1, 1)
    else:
        json_1 = basic_req(url_1, 1)
    if json_1 is not None:
        print(json_1)
    if not self.have_error(json_1, 1):
        if times < 2:
            self.one_click_bilibili(url, times + 1, types)
        return
    times = 0
    url = self.CLICK_WEB_URL
    data = {
        'aid': self.basic_av_id,
        'cid': '',
        'part': '1',
        'mid': str(random.randint(10000000, 19999999)),
        'lv': '2',
        'ftime': '',
        'stime': json_1['data']['now'],
        'jsonp': 'jsonp',
        'type': '3',
        'sub_type': '0'
    }
    if types == 1:
        json_req = get_request_proxy(url, 11, data)
    else:
        json_req = basic_req(url, 11, data=data)
    if json_req is not None:
        print(json_req)
    if not self.have_error(json_req):
        if times < 2:
            self.one_click_bilibili(url, times + 1, types)
        return
    times = 0
    url_3 = self.REPORT_HEARTBEAT_URL
    data_3 = {
        'aid': self.basic_av_id,
        'cid': '',
        'mid': data['mid'],
        'csrf': '',
        'played_time': '0',
        'realtime': '0',
        'start_ts': json_1['data']['now'],
        'type': '3',
        'dt': '2',
        'play_type': '1'
    }
    if types == 1:
        json_3 = get_request_proxy(url_3, 11, data_3)
    else:
        json_3 = basic_req(url_3, 11, data=data_3)
    if json_3 is not None:
        print(json_3)
    if not self.have_error(json_3) and times < 2:
        self.one_click_bilibili(url, times + 1, types)
        return
    print('finish.')
    self.finish += 1

def load_rank_index(self, index: int, day_index: int):
    ''' load rank '''
    changeHeaders({'Referer': self.AV_URL})
    url = self.RANKING_URL % (index, day_index)
    text = basic_req(url, 3)
    rank_str = re.findall('window.__INITIAL_STATE__=(.*?);', text)
    if not len(rank_str):
        if can_retry(url):
            return self.load_rank_index(index, day_index)
        return False
    rank_map = json.loads(rank_str[0])
    rank_list = rank_map['rankList']
    now_av_id = []
    wait_check_public = []
    rank_map = {}
    for ii, rank in enumerate(rank_list):
        av_id = int(rank['aid'])
        need_params = ['pts', 'author', 'mid', 'play', 'video_review', 'coins', 'duration', 'title']
        temp_rank_list = [ii, *[rank[key] for key in need_params], index, day_index]
        now_av_id.append(av_id)
        if not self.check_type(av_id):
            continue
        self.check_rank_rose(av_id, temp_rank_list)
        if self.add_av(av_id, ii, temp_rank_list[1]):
            rank_map[av_id] = temp_rank_list
    ''' check assign av rank '''
    for ii in self.assign_ids:
        if not ii in self.public:
            wait_check_public.append(ii)
        if not ii in self.last_view and not ii in self.rank_map:
            self.rank_map[ii] = []
    have_assign = len([0 for ii in self.assign_ids if ii in now_av_id]) > 0
    ''' check tid type '''
    threading_public = []
    for ii in rank_map.keys():
        work = threading.Thread(target=self.check_type_req, args=(ii,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    for ii, jj in rank_map.items():
        if self.check_type(ii) != True:
            continue
        if not ii in self.public:
            wait_check_public.append(ii)
        self.last_check[ii] = int(time.time())
        self.rank_map[ii] = jj
    ''' load public basic data '''
    threading_public = []
    for ii in wait_check_public:
        work = threading.Thread(target=self.public_data, args=(ii, 0))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    ''' begin monitor '''
    threading_list = []
    for ii, jj in self.public.items():
        if not ii in self.public_list and jj[0] + one_day > int(time.time()):
            work = threading.Thread(target=self.public_monitor, args=(ii, 0))
            threading_list.append(work)
    for work in threading_list:
        work.start()
    return have_assign

def generate_eleven(self):
    ################################################################
    #
    #   [generate eleven] version 19.4.21 (Test ✔️) written by gunjianpan
    #
    #   1. randomly generate a 15-char param `callback`;
    #   2. use callback to request OCEANBALL -> get the origin js;
    #   3. eval once -> (match the array, then chr() it) -> decoded js;
    #   4. replace document and window (you can also use execjs & jsdom);
    #   5. warning: you should replace `this` with some params,
    #      otherwise you will get `老板给小三买了包, 却没有给你钱买房`
    #      (a decoy string, roughly: 'the boss bought his mistress a
    #      bag, but gave you no money for a house');
    #   6. finish, return, and join the params;
    #
    ################################################################
    callback = self.generate_callback(15)
    now_time = int(time.time() * 1000)
    url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time)
    referer_url = HOTEL_DETAIL_URL % self.default_hotel_id
    changeHeaders({'Referer': referer_url})
    oceanball_js = basic_req(url, 3)
    array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',')
    array = [int(ii) for ii in array]
    offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0])
    ''' String.fromCharCode '''
    oe = ''.join([chr(ii - offset) for ii in array])
    ''' replace the window[callback] callback function '''
    replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0]
    eleven_params = re.findall(r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0]
    replaced_str = 'return {};'.format(eleven_params)
    oe = oe.replace(replace_str, replaced_str)
    oe = oe.replace('\'', '"').replace('\r', '')
    oe = oe.replace(';!', 'let aaa = ', 1)
    replace = '''function(){let href='https://hotels.ctrip.com/hotel/4889292.html';
        a={'documentElement': {'attributes':{}}};
        b={};
        function c(){};
        userAgent ='Chrome/73.0.3682.0';
        geolocation = 0;'''
    ''' replace document & window & navigator '''
    oe = oe.replace('document.body.innerHTML.length', '888').replace('document.body.innerHTML', '""')
    oe = oe.replace('document.createElement("div")', '{}')
    oe = oe.replace('window.HTMLSpanElement', 'c').replace('document.createElement("span")', '1')
    oe = oe.replace('window.location.href', 'href').replace('location.href', 'href')
    oe = oe.replace('navigator.', '')
    oe = oe.replace('new Image().', '')
    oe = oe.replace('document.all', '0').replace('document.referrer', '""')
    oe = oe.replace('this || ', '')
    oe = oe.replace('window["document"]', 'a')
    oe = oe.replace('document', 'a').replace('window', 'b')
    oe = oe.replace('function(){', replace, 1)
    ''' eval the script '''
    eleven = js2py.eval_js(oe)
    echo(1, 'eleven', eleven)
    return eleven

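# Step 3 above (String.fromCharCode with an offset) in isolation; the
# array/offset pair here is fabricated to spell a short string.
offset = 3
array = [ord(c) + offset for c in 'eleven=1;']
decoded = ''.join(chr(ii - offset) for ii in array)
assert decoded == 'eleven=1;'
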
def bulk_import_alimama(self):
    """ bulk import goods into alimama favorites """
    version = begin_time()
    if not os.path.exists('%scollect_wyy' % data_dir):
        print('Collect File not exist!!!')
        return
    with codecs.open('%scollect_wyy' % data_dir, 'r', encoding='utf-8') as f:
        goods = f.readlines()
    self.goods_candidate = [index.split('||')[0] for index in goods]
    goods_len = len(self.goods_candidate)
    self.headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
        'Origin': 'http://pub.alimama.com',
        'Referer': 'http://pub.alimama.com/promo/search/index.htm?q=%E7%AC%AC%E5%9B%9B%E5%8D%81%E4%B9%9D%E5%A4%A9%2019%E6%98%A5%E5%AD%A3&_t=1550891362391'
    }
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/favorites/group/newList.json?toPage=1&perPageSize=40&keyword=&t=',
        str(int(round(time.time() * 1000))),
        '&_tb_token_=', cookie[1][:-1],
        '&pvid=', cookie[2][:-1]
    ]
    url = ''.join(url_list)
    self.headers['Cookie'] = cookie[0][:-1]
    self.headers['Host'] = url.split('/')[2]
    group_list = basic_req(url, 2, header=self.headers)
    if group_list.status_code != 200 or group_list.json()['info']['message'] == 'nologin':
        print('group_list error')
        return
    group_list = group_list.json()['data']['result']
    group_list = [index['id'] for index in group_list]
    print(group_list)
    assert len(group_list) > (goods_len - 1) // 200
    threadings = []
    for index in range((goods_len - 1) // 200 + 1):
        work = threading.Thread(target=self.bulk_import_alimama_once, args=(index, group_list[index]))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)

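# The thread fan-out above hands each worker one 200-id slice of the
# goods list; the same chunking in isolation:
goods = [str(i) for i in range(450)]
batches = [goods[i * 200:(i + 1) * 200] for i in range((len(goods) - 1) // 200 + 1)]
assert [len(b) for b in batches] == [200, 200, 50]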