def getimgurl(html, url, page):
    # note: search/DOTALL come from re; eval/compile here evaluate JavaScript
    # (execjs-style), per the module-level imports
    if url not in cache:
        key = search(r'id="dm5_key".+?<script[^>]+?>\s*eval(.+?)</script>', html, DOTALL)
        if key:
            key = eval(key.group(1)).split(";")[1]
            key = search(r"=(.+)$", key).group(1)
            key = eval(key)
        else:
            key = ""
        length = search("DM5_IMAGE_COUNT=(\d+);", html).group(1)
        cid = search("DM5_CID=(\d+);", html).group(1)
        funs = []
        for p in range(1, int(length) + 1):
            # note: ">k=6" in the source is an HTML-unescape artifact of "&gtk=6"
            fun_url = urljoin(url, "chapterfun.ashx?cid={}&page={}&language=1&key={}&gtk=6".format(cid, p, key))
            funs.append(fun_url)
        cache[url] = funs
        # Grab cookies?
        grabhtml(funs[0], referer=url)
    if page - 1 >= len(cache[url]):
        del cache[url]
        raise LastPageError
    fun_url = cache[url][page - 1]
    text = grabhtml(fun_url, referer=url)
    d = compile(text).eval("(typeof (hd_c) != 'undefined' && hd_c.length > 0 && typeof (isrevtt) != 'undefined') ? hd_c : d")
    return d[0]
def login(self, v_code='', c_string=''):
    self.get_pre_login_info()
    url = 'https://passport.baidu.com/v2/api/?login'
    '''
    head = {
        "Origin": "https://passport.baidu.com",
        "Accept-Encoding": "gzip, deflate",
        "Host": "passport.baidu.com",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Cache-Control": "max-age=0",
        "Referer": "https://passport.baidu.com/v2/?login",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"
    }
    '''
    data = {
        "staticpage": "https://passport.baidu.com/static/passpc-account/html/v3Jump.html",
        "charset": "UTF-8",
        "token": self.token,
        "tpl": "pp",
        "subpro": "",
        "apiver": "v3",
        "tt": execjs.eval('new Date().getTime()'),
        "codestring": c_string,
        "safeflg": "0",
        "u": "https://passport.baidu.com/",
        "isPhone": "false",
        "detect": "1",
        "quick_user": "******",
        "logintype": "basicLogin",
        "logLoginType": "pc_loginBasic",
        "idc": "",
        "loginmerge": "true",
        "username": self.usr,
        "password": self.rsa_pwd,
        "verifycode": v_code,
        "mem_pass": "******",
        "rsakey": self.key,
        "crypttype": "12",
        "ppui_logintime": execjs.eval('20000 + 10000 * Math.random() % 10000'),
        "gid": self.create_gid(),
        "callback": "parent.bd__pcbs__dvpmkh"
    }
    data = urllib.urlencode(data)
    # print data
    req = urllib2.Request(url, data)
    # req.add_header(head)
    res = urllib2.urlopen(req).read()
    if 'err_no=257' in res:
        pass  # self.check_vcode(res)
    elif 'err_no=0' in res:
        return True
    else:
        pass
def login(account, password):
    js_path = os.path.join(os.path.dirname(__file__), 'login.long.js')
    with open(js_path) as f:
        js = f.read()
    execjs.eval(js)
def getimgurls(html, url):
    base, protocol, id = re.search(r"((https?://)[^/]+)/book/([^/]+)", url).groups()
    core = re.search(r'src="(/scripts/core[^"]+)"', html).group(1)
    cInfo = re.search(r'cInfo = ({[^;]+});', html).group(1)
    coreJs = grabhtml(base + core, referer=url)
    pageConfig = re.search(r'pageConfig=({[^;]+})', coreJs).group(1)
    images = execjs.eval(cInfo)["fs"]
    host = execjs.eval(pageConfig)["host"]
    return [protocol + host + image for image in images]
def testDecodeImgpath():
    html = spider.getSourceCode('https://ac.qq.com/ComicView/index/id/629632/cid/45')
    data = re.findall(r"var DATA\s*= '(.*)'", html)[0]
    nonce = re.findall(r'window\[".*=(.*);', html)[0]
    nonce = execjs.eval(nonce)
    print spider.decodeImgpath(data, nonce)
def parse_manhua_info(self, response):
    # print response.text
    """
    Parse a specific comic chapter page and extract the image links, which
    are computed by obfuscated JavaScript. The pyexecjs library is used to
    run the JS; after deobfuscation the script turns out to simply declare
    a ``var newImgs = []`` array. Reference: http://tool.chinaz.com/js.aspx
    :param response:
    :return:
    """
    js_code = response.css('body > script:nth-child(8)').xpath('.//text()').extract_first()
    # js_code = self.js_regex.sub(lambda m: m.group().replace(' ', ''), js_code)  # strip whitespace
    image_urls = execjs.eval(js_code + ', newImgs')
    item = ImageItem()
    item['url'] = response.url
    item['name'] = response.css("#title").xpath('text()').extract_first('').split(' ')[0]
    item['chapter'] = filter(
        lambda x: x.isdigit(),
        response.css("#title").xpath('text()').extract_first('').split(' ')[1])
    item['image_urls'] = image_urls
    # print image_urls
    yield item
def get_area_house(self, response):
    try:
        print(response.url)
        body = response.body.decode("utf8")
        groups = re.search(
            "\s*var search_result = \s*(.*);var search_result_list_num\s*=\s*\d",
            body)
        body = execjs.eval(groups[1])
        with open("a.html", 'w', encoding="utf-8") as f:
            f.write(body)
        body = Selector(text=body)
        for each in body.xpath("//li[@class='title']/h2"):
            url = each.xpath("a/@href").extract()[0]
            yield scrapy.Request(url, callback=self.parse_building)
        for each in body.xpath("//div[@id='search_result_page']/a[@onclick]"):
            if each.xpath("text()").extract()[0] == "下一页>":  # "next page" link
                search_result = re.search(
                    r".*\(.*,.*,.*,(\d*)\)",
                    each.xpath("@onclick").extract()[0])
                page_no = search_result.group(1)
                url = re.sub("page_no=\d*", "page_no=%s" % page_no, response.url, 1)
                yield scrapy.Request(url, callback=self.get_area_house)
    except Exception as e:
        self.log("!!!!!error %s" % e)
def handle_521(self, response, callback, **kwargs):
    n = response.meta.get('n', 0)
    if 'document.cookie' in response.text:
        js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
        result = execjs.eval(js_clearance).split(';')[0]
        k, v, *_ = result.split('=')
        yield Request(response.url, callback=callback, cookies={k: v},
                      meta={'n': n + 1}, dont_filter=True)
    else:
        params = get_params(response)
        chars = params['chars']
        bts = params['bts']
        ha = params['ha']
        ct = params['ct']
        hash_func = hash_d[ha]
        clearance = encrypt_cookies(chars, bts, ct, hash_func)
        yield Request(response.url, callback=callback,
                      cookies={'__jsl_clearance_s': clearance},
                      meta={'n': n + 1}, dont_filter=True)
def getCinemaShowtime(cinemaId, date):
    '''
    Fetch the screening schedule of a cinema for a given day, by cinema ID and date.
    :param cinemaId: cinema ID
    :param date: date in the form 20170404
    :return: a dict; the showtimes are under ['value']['showtimes']
    '''
    url = 'http://service.theater.mtime.com/Cinema.api?Ajax_CallBack=true' \
          '&Ajax_CallBackType=Mtime.Cinema.Services&Ajax_CallBackMethod=GetShowtimesJsonObjectByCinemaId&' \
          'Ajax_CallBackArgument0=' + str(cinemaId) + '&Ajax_CallBackArgument1=' + str(date)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    text = ''
    movieIDList = []
    # fetch the whole page
    try:
        print('Requesting url: ', url)
        text = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT).text
    except:
        print('Error when request url=', url)
        return None
    try:
        # extract the JavaScript value
        var = re.match(r'^var GetShowtimesJsonObjectByCinemaResult = (.+);', text).group(1)
        if var:
            var = execjs.eval(var)  # evaluate the JS value with the library
            return var
    except:
        print('error in var = re.match ')
        return None
def extract_fields(self, res):
    selector = etree.HTML(text=res.text)
    # r = requests.get(url='http://www.baidu.com/')
    trs = selector.xpath('//div[@class="table-responsive"]/table/tbody/tr')
    result = []
    attrib = selector.xpath('//div[@class="container-fluid"]/div[1]')[0].attrib
    key = filter(lambda x: 'data' in x, attrib)[0]
    compute_port = int(attrib[key])
    for x in trs:
        js_str = ''.join(x.xpath('.//td[1]//script/text()'))
        # recover the IP
        ip_1 = execjs.eval(js_str.split(';')[0].split('=')[1])
        string = js_str.split(';')[1].split('=')[1]
        base64_code = re.search('(?<=atob\().*?\.', string).group().strip('.')
        ip_2 = base64.b64decode(eval(base64_code))
        ip = ip_1 + ip_2
        # decode the obfuscated port number to get the real port
        port_str = js_str.split(';')[2].split('=')[1]
        string = re.search('\([0-9]{2,4}', port_str).group().strip('(')
        port = int(string) + compute_port
        if ip and port:
            result.append(dict(ip=ip, port=str(port), name=self.name))
    return result
def get_data(pageCount):
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Origin': 'https://sou.zhaopin.com',
        'Referer': 'https://sou.zhaopin.com/?p={}&jl=702&sf=0&st=0&kw=%E5%A4%A7%E6%95%B0%E6%8D%AE&kt=3'.format(pageCount),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    js = '''"f097795abafd429bb0b65846ac9944b7-" + (new Date()).valueOf() + "-" + parseInt(Math.random() * 1000000)'''
    url_id = execjs.eval(js)
    data_url = 'https://fe-api.zhaopin.com/c/i/sou?start={}&pageSize=90&cityId=702&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=大数据&kt=3&=0&_v=0.14145840&x-zp-page-request-id={}'.format(
        pageCount * 90 if pageCount > 0 else 0, url_id)
    response = requests.get(data_url, headers=headers).text
    # print(response)
    data = json.loads(response)
    data = data['data']['results']
    if len(data) > 2:
        return data
    else:
        return None
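# For reference — a sketch, not part of the original — the request id built
# by the JS snippet above is just "<fixed hex prefix>-<millisecond
# timestamp>-<random 0..999999>", so a JS runtime is not strictly needed:
import random
import time

def make_request_id(prefix="f097795abafd429bb0b65846ac9944b7"):
    return "{}-{}-{}".format(prefix, int(time.time() * 1000), random.randint(0, 999999))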
def action(self, parameters):
    value = self.value if hasattr(self, 'value') else None
    value = Node.substitute_placeholders(value, parameters)
    reference = Node.substitute_placeholders(self.reference, parameters)
    if self.referenceProcessing == 'evaluate':
        self.reference = execjs.eval(self.reference)
    try:
        res = self._plugin_manager.call_plugin(self.plugin, reference, value)
    except Exception as e:
        logging.error("""
            Exception in RequestNode.
            id: %s, plugin: %s, reference: %s, value: %s, parameters: %s, exception: %s""",
            self.id, self.plugin, str(self.reference),
            self.value if hasattr(self, 'value') else '[none]',
            str(parameters), str(e))
        if hasattr(self, 'retvar'):
            return {self.retvar: 'error'}
    if hasattr(self, 'retvar'):
        return {self.retvar: res}
def crawl_sina_shfe_day():
    """
    Sina Finance SHFE quotes | copper, aluminium, nickel, zinc, lead
    http://stock2.finance.sina.com.cn/futures/api/jsonp.php/var _AHD2017_8_28=/GlobalFuturesService.getGlobalFuturesDailyKLine?symbol=AHD
    """
    metal_lme_mapping = OrderedDict((
        ('Al', {'match': 'AL0', 'symbol': 'USE00159'}),
        ('Cu', {'match': 'CU0', 'symbol': 'USE00160'}),
        ('Ni', {'match': 'NI0', 'symbol': 'USE00161'}),
        ('Pb', {'match': 'PB0', 'symbol': 'USE00162'}),
        ('Zn', {'match': 'ZN0', 'symbol': 'USE00163'}),
    ))
    today = date.today().strftime('%Y_%m_%d')
    source = 'sina_shfe'
    exchange = 'SHFE'
    for metal, mapping in metal_lme_mapping.items():
        sina_code = mapping['match']
        url = f'http://stock2.finance.sina.com.cn/futures/api/jsonp.php/var%20_{sina_code}{today}=/InnerFuturesNewService.getDailyKLine?symbol={sina_code}'
        logger.info('start crawling %s, url: %s' % (source, url))
        response = requests.get(url, timeout=5)
        response = re.findall(r'\((.*)\)', response.text)[0]
        day_kline = execjs.eval(response)
        latest_day_kline = DataSinaDayKLine.objects.filter(
            symbol=mapping['symbol']).order_by('-date').first()
        latest_day = latest_day_kline.date if latest_day_kline else date(2000, 1, 1)
        for kline in filter(
                lambda kline: datetime.strptime(kline['d'], '%Y-%m-%d').date() >= latest_day,
                day_kline):
            DataSinaDayKLine.objects.update_or_create_all_envs(
                logger,
                varieties=metal,
                symbol=mapping['symbol'],
                exchange=exchange,
                date=datetime.strptime(kline['d'], '%Y-%m-%d'),
                defaults={
                    'price_low': kline['l'],
                    'price_high': kline['h'],
                    'price_open': kline['o'],
                    'price_close': kline['c'],
                    'volume': kline['v'],
                })
def getkey(run_eval):
    a = unzip(run_eval)
    str1, str2 = re.findall('\$hidescript=(.*?);.*?\((.*?)\)\(\)', a)[0]
    js_func = str2.replace('$hidescript', str1)
    aes_key = execjs.eval(js_func)
    keys = re.findall('com.str._KEY=\"(.*?)\";', aes_key)[0]
    return keys
def getimgurls(html, url):
    pages_js = re.search(r'page_url":(\[[^\]]+\])', html).group(1)
    pages = execjs.eval(pages_js)
    # thumbs.db?!
    # http://manhua.dmzj.com/zhuoyandexiana/3488-20.shtml
    return [page for page in pages
            if page and not page.lower().endswith("thumbs.db")]
def main():
    context = get_html_context()
    body = context[0]
    guid = context[1]
    strInit = 'var s,t,o,p,b,r,e,a,k,i,n,g,f, '
    strInitIndex = body.index(strInit)
    # get the object name
    objName = body[strInitIndex + len(strInit): body.find('=', strInitIndex)]
    # get the property name
    propName = body[body.find('{"', strInitIndex) + 2: body.find('":', strInitIndex)]
    varName = objName + "." + propName
    # start -> body.find(":", str1Index) + 1
    # end -> body.find("}", str1Index)
    initExpression = body[body.find(":", strInitIndex) + 1: body.find("}", strInitIndex)]
    # record the challenge value
    sum = execjs.eval(initExpression)
    # slice out the challenge expression string
    str1Start = "('challenge-form');"
    str1End = "a.value"
    otherExpression = body[body.index(str1Start) + len(str1Start) + 1: body.index(str1End)].strip()[1:]
    # sum+=!+[]+!![]+!![]+!![]+!![]+!![]+!![];
    # sum+=+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]));
    # sum+=!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![];
    # sum+=+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]));
    print otherExpression
def get_tkk():
    url = 'https://translate.google.cn/'
    res = requests.get(url, timeout=1)
    tkk_fn = find_tkk_fn(res.text)
    content = tkk_fn.group(1).encode('utf-8').decode('unicode_escape')
    tkk = execjs.eval(content)
    return tkk
def parse_api_data_from_page(self, html):
    r = re.search('<script>window.__NUXT__=(.*?);</script>', html, re.S)
    if not r:
        return
    js_str = r.group(1)
    r = execjs.eval(js_str)
    return r['data'][0]
def get_data_for_vine_id(self, vine_id, timeout=30):
    try:
        page = requests.get("https://vine.co/v/{}".format(vine_id), timeout=timeout)
    except requests.exceptions.RequestException as e:
        error_message = "Problem with communicating with vine page - {}".format(e)
        raise PresserRequestError(error_message)
    if page.ok:
        content = BeautifulSoup(page.content)
        all_script_tags = content.find_all("script")
        potential_script_tags = [script for script in all_script_tags if not script.has_attr("src")]
        script_lines = []
        for tag in potential_script_tags:
            for content in tag.contents:
                for line in content.split(";\n"):
                    if line.count("window.POST_DATA"):
                        script_lines.append(line.replace("window.POST_DATA = ", ""))
        if len(script_lines) > 1:
            raise PresserJavaScriptParseError("More POST_DATA extracted than expected")
        if not script_lines:
            raise PresserJavaScriptParseError("No POST_DATA extracted for id {}".format(vine_id))
        script_line = script_lines[0].replace("POST = ", "")
        try:
            data = execjs.eval(script_line)
            vine = data[vine_id]
            return vine
        except execjs.RuntimeError as e:
            error_message = "Problem with parsing, check parsing logic. {}".format(e)
            raise PresserJavaScriptParseError(error_message)
    elif page.status_code == 404:
        raise Presser404Error("{} could not be found".format(page.url))
    else:
        raise PresserURLError("{} could not be accessed {} - {}".format(page.url, page.status_code, page.content))
def get_stock_top_10(date: datetime.date):
    """
    http://www.hkex.com.hk/chi/csm/chinaconndstat_daily.htm
    url = "http://www.hkex.com.hk/chi/csm/DailyStat/data_tab_daily_20170217c.js"
    ["Rank", "Stock Code", "Stock Name", "Buy Turnover", "Sell Turnover", "Total Turnover"]
    :return:
    """
    if isinstance(date, str):
        date = parser.parse(date).date()
    url = "http://www.hkex.com.hk/chi/csm/DailyStat/data_tab_daily_{date}c.js".format(date=date.strftime("%Y%m%d"))
    response = requests.get(url)
    if response.status_code == 200:
        content = response.content.decode("utf-8")
        stock_data = execjs.eval(content)  # evaluate the JS payload
        sse_hk, szse_hk = None, None
        for item in stock_data:
            if item["market"] == "SSE Southbound":  # Shanghai-Hong Kong Stock Connect
                sse_hk = _parser_top_10(item["content"])
            elif item["market"] == "SZSE Southbound":  # Shenzhen-Hong Kong Stock Connect
                szse_hk = _parser_top_10(item["content"])
        return sse_hk, szse_hk
def get_js_object(js_code, key):
    text = js_code
    text = text[text.find("=") + 1:]
    text = text[:text.rfind(";")]
    text = "JSON.stringify(" + text + ")"
    text = execjs.eval(text)
    return json.loads(text)
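# A hypothetical call to get_js_object above (note the key argument is
# accepted but unused): the slice between the first "=" and the last ";"
# must be a JS object literal, which JSON.stringify turns into JSON that
# json.loads can parse.
example_cfg = get_js_object('var cfg = {host: "img.example.com", pages: [1, 2]};', key=None)
# example_cfg == {'host': 'img.example.com', 'pages': [1, 2]}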
def get_image_data_from_page(self, html):
    js = re.search(r">window.*(\(function\(p.*?)</script>", html).group(1)
    b64_str = re.search(r"[0-9],'([A-Za-z0-9+/=]+?)'", js).group(1)
    s = lzstring.LZString.decompressFromBase64(b64_str)
    new_js = re.sub(r"'[A-Za-z0-9+/=]*'\[.*\]\('\\x7c'\)",
                    "'" + s + "'.split('|')", js)
    res = execjs.eval(new_js)
    return json.loads(re.search(r"(\{.*\})", res).group(1))
def getMpsHeaderWithCookie(url):
    # keep the session state across requests
    res1 = requests.get(url, headers=headers)
    # print(res1.text)
    cookiejar = res1.cookies
    cookiedict = requests.utils.dict_from_cookiejar(cookiejar)
    # print(cookiejar)
    print(cookiedict)
    jsl_clearance_s = re.findall(r'cookie=(.*?);location', res1.text)[0]
    # execute the JS code
    jsl_clearance_s = str(execjs.eval(jsl_clearance_s)).split('=')[1].split(';')[0]
    # add the cookie via add_dict_to_cookiejar
    # add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})
    # cookiedict['__jsl_clearance_s'] = jsl_clearance_s
    # __jsl_clearance_s=1628565425.41|-1|RzuwzFWX8ZtPtb458AaFArcZRd0%3D
    # print(cookiedict)
    __jsluid_s = cookiedict['__jsluid_s']
    headers['cookie'] = '__jsl_clearance_s=' + jsl_clearance_s + ";__jsluid_s=" + __jsluid_s
    print(headers)
    res2 = requests.get(url, headers=headers)
    print(res2.text)
    # extract the arguments of the go() call
    data = json.loads(re.findall(r';go\((.*?)\)', res2.text)[0])
    jsl_clearance_s = getClearance(data)
    # update the cookie
    # add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})
    headers['cookie'] = '__jsl_clearance_s=' + jsl_clearance_s + ";__jsluid_s=" + __jsluid_s
    print(headers)
    return headers
def fetch_chapter(cls, chapter_url, chapter_dir=None):
    mangabz_cid, mangabz_mid, mangabz_viewsign_dt, mangabz_viewsign, page_total = cls.fetch_chapter_argv(chapter_url)
    page_total = int(page_total)
    images_info = []
    desc = '\rFetching {}: ({}/{})'
    for i in range(page_total):
        print(desc.format(chapter_url, i + 1, page_total), end='\r')
        i += 1
        # skip images that already exist
        if chapter_dir is not None and os.path.isdir(chapter_dir):
            if os.path.exists(os.path.join(chapter_dir, str(i + 1) + '.jpg')) or \
                    os.path.exists(os.path.join(chapter_dir, str(i + 1) + '.png')):
                continue
        js_str = cls.fetch_images_js(chapter_url, i, mangabz_cid, mangabz_mid,
                                     mangabz_viewsign_dt, mangabz_viewsign)
        imagesList = execjs.eval(js_str)
        img_url = imagesList[0]
        img_name = str(i + 1) + os.path.splitext(cls.url2fn(img_url))[-1]
        images_info.append({
            'fname': img_name,
            'url': img_url,
        })
    print(' ' * os.get_terminal_size().columns, end='\r')
    return images_info
def get_tkk():
    '''Fetch the TKK value from Google's server, needed to compute the tk value.'''

    def get_res(url):
        try:
            res = requests.get(url, timeout=1.5)
            res.raise_for_status()
            # res.encoding = 'utf-8'
            return res
        except Exception as ex:
            print('[-]ERROR: ' + str(ex))
            return res

    def find_tkk_fn(res):
        # locate the TKK computation function
        re_tkk = r"TKK=eval\('(\(\(function\(\)\{.+?\}\)\(\)\))'\);"
        tkk_fn = re.search(re_tkk, res)
        return tkk_fn

    url = 'https://translate.google.cn/'
    try:
        res = get_res(url)
        tkk_fn = find_tkk_fn(res.text)
        # print(tkk_fn.group(1))
        content = tkk_fn.group(1).encode('utf-8').decode('unicode_escape')
        # print(content)
        tkk = execjs.eval(content)
        # print('tkk:', tkk)
        return tkk
    except Exception as ex:
        print(ex)
def get_solutions(self, pid, sid, limit=10):
    url = self.url + '/submissions/detail/%s/' % sid
    js = r'var pageData =\s*(.*?);'
    resp = self.session.get(url)

    def diff(a, sl):
        for b in sl:
            r = difflib.SequenceMatcher(a=a.code, b=b.code).ratio()
            if r >= 0.9:
                return False
        return True

    solutions = []
    for s in re.findall(js, resp.text, re.DOTALL):
        v = execjs.eval(s)
        try:
            df = json.loads(v.get('runtimeDistributionFormatted'))
            if df.get('lang') == self.lang:
                for e in df.get('distribution')[:limit]:
                    t = int(e[0])
                    sln = self.get_solution(pid, t)
                    if diff(sln, solutions):
                        solutions.append(sln)
                break
        except ValueError:
            pass
    return solutions
def get_msg_signal(self):
    """Check for message signals."""
    call_back = {"retcode": "0", "selector": "0"}
    try:
        resp = self.get(
            API_synccheck,
            params={
                "r": Device.get_timestamp(),
                "skey": self.__auth_data["skey"],
                "sid": self.__auth_data["wxsid"],
                "uin": self.__auth_data["wxuin"],
                "deviceid": self.__device_id,
                "synckey": self.create_synckey(),
                "_": Device.get_timestamp(),
            },
            timeout=API_checktimeout,
        )
        if not resp.status_code == 200:
            raise AssertionError()
        call_back = execjs.eval(resp.text.replace("window.synccheck=", ""))
    except requests.exceptions.ReadTimeout:
        pass
    except requests.exceptions.Timeout:
        pass
    except Exception as e:
        error(e)
        time.sleep(1)
    return call_back
def get_shop_info(session, wm_latitude=22634767, wm_longitude=113834247):
    """
    :param wm_longitude: longitude of the location
    :param wm_latitude: latitude of the location
    :param session: session instance that keeps state
    :return: shop list
    """
    # header_cookie = ";".join([x + '=' + str(y) for x, y in cookie.items()])  # join the request-header Cookies
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://h5.waimai.meituan.com',
        'Referer': 'https://h5.waimai.meituan.com/waimai/mindex/home',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Mobile Safari/537.36',
        'Cookie': "lxsdk_cuid=16915455dbec8-08db788af7a2ff-1333063-1fa400-16915455dbec8; ci=30; _ga=GA1.3.2111621561.1550840081; _gid=GA1.3.426525714.1550840081; IJSESSIONID=1oa4mjr7l5r0c1fa32alvdcpbn; iuuid=C2540F1F12DE8DEB7EFFE84661C401CBEDED6125E30E08F9BF648F828AD42BDF; cityname=%E6%B7%B1%E5%9C%B3; _lxsdk=C2540F1F12DE8DEB7EFFE84661C401CBEDED6125E30E08F9BF648F828AD42BDF; webp=1; ci3=1; _hc.v=f8c27eb3-603c-e958-93d8-63f0bfa0a746.1550840270; __utmz=74597006.1550906260.3.3.utmcsr=meishi.meituan.com|utmccn=(referral)|utmcmd=referral|utmcct=/i/; latlng=22.636802,113.829362,1550906262992; i_extend=C_b1Gimthomepagecategory1394H__a; openh5_uuid=C2540F1F12DE8DEB7EFFE84661C401CBEDED6125E30E08F9BF648F828AD42BDF; showTopHeader=show; _lxsdk_s=1691935afe6-838-f5a-f2%7C%7C44; _lx_utm=utm_source%3D60030; __utma=74597006.2078366826.1550840256.1550901561.1550906260.3; __utmc=74597006; wm_order_channel=mtib; __mta=51223190.1550840082094.1550840082094.1550840111070.2"
    }
    start_index = 0
    data = {
        'startIndex': start_index,  # page index, starting from 0
        'sortId': 5,  # sort order: 0 = comprehensive, 5 = nearest first
        'multiFilterIds': '',
        'sliderSelectCode': '',
        'sliderSelectMin': '',
        'sliderSelectMax': '',
        'geoType': 2,
        'wm_latitude': wm_latitude,  # located coordinates
        'wm_longitude': wm_longitude,
        'wm_actual_latitude': 22634767,  # actual coordinates
        'wm_actual_longitude': 113834247,
        '_token': '',
    }
    url = "https://i.waimai.meituan.com/openh5/homepage/poilist?_={}".format(execjs.eval("Date.now()"))
    print(f"crawling coordinates ({data['wm_longitude']},{data['wm_latitude']})")
    try:
        # only the first four pages are crawled; later data requires login
        for index in range(4):
            res = session.post(url=url, headers=headers, data=data, timeout=5)
            # data returned successfully
            if res.status_code == 200:
                shop_list = json.loads(res.text).get("data").get("shopList")
                return shop_list
            else:
                print("get_shop_info: request failed, status_code is not 200")
            start_index += 1  # advance one page
            time.sleep(2)
    except exceptions.ConnectionError:
        print("get_shop_info: connection error")
    except exceptions.Timeout:
        print("get_shop_info: timed out")
def get_bars(self, symbol, period, callback, bar_is_completed=False, bar_freq=1, start_dt=None):
    """
    Return k-line data.
    symbol: contract including its type, e.g. btc:next_week:10
    period: bar interval: 1min,3min,5min,15min,30min,1day,3day,1hour,2hour,4hour,6hour,12hour
    """
    ret_bars = []
    if ':' not in symbol:
        self.strategy.writeCtaError(u'{} {} must include the contract type, e.g. btc:next_week:10'.format(datetime.now(), symbol))
        return False, ret_bars
    s = symbol.split(':')
    symbol_pair, contract_type = s[0], s[1]
    if not symbol_pair.endswith('_usd'):
        symbol_pair += '_usd'
    if symbol_pair not in symbol_list:
        self.strategy.writeCtaError(u'{} {} is not in the download list'.format(datetime.now(), symbol_pair))
        return False, ret_bars
    url = u'https://www.okex.com/api/v1/future_kline.do?symbol={}&type={}&contract_type={}'.format(symbol_pair, period, contract_type)
    self.strategy.writeCtaLog('{} start downloading: {} {} data. URL: {}'.format(datetime.now(), symbol, period, url))
    bars = []
    content = None
    try:
        content = self.session.get(url).content.decode('gbk')
        bars = execjs.eval(content)
    except Exception as ex:
        self.strategy.writeCtaError('exception in get:{},{},{}'.format(url, str(ex), traceback.format_exc()))
        return False, ret_bars
    for i, bar in enumerate(bars):
        if len(bar) < 5:
            self.strategy.writeCtaError('error when import bar:{}'.format(bar))
            return False, ret_bars
        add_bar = CtaBarData()
        try:
            add_bar.vtSymbol = symbol
            add_bar.symbol = symbol
            add_bar.datetime = datetime.fromtimestamp(bar[0] / 1000)
            add_bar.date = add_bar.datetime.strftime('%Y-%m-%d')
            add_bar.time = add_bar.datetime.strftime('%H:%M:%S')
            add_bar.tradingDay = add_bar.date
            add_bar.open = float(bar[1])
            add_bar.high = float(bar[2])
            add_bar.low = float(bar[3])
            add_bar.close = float(bar[4])
            add_bar.volume = float(bar[6])  # index 5 is volume in contracts, 6 is volume converted to BTC/LTC
        except Exception as ex:
            self.strategy.writeCtaError('error when convert bar:{},ex:{},t:{}'.format(bar, str(ex), traceback.format_exc()))
            return False, ret_bars
        # was "bar.datetime" in the source, but bar is a plain list
        if start_dt is not None and add_bar.datetime < start_dt:
            continue
        ret_bars.append(add_bar)
        if callback is not None:
            callback(add_bar, bar_is_completed, bar_freq)
    return True, ret_bars
def download_bars(self, symbol, period, size_=None, start_dt=None):
    """
    Return k-line data.
    symbol: contract
    period: bar interval: 1min,3min,5min,15min,30min,1day,3day,1hour,2hour,4hour,6hour,12hour
    """
    ret_bars = []
    if symbol not in symbol_list:
        msg = u'{} {} is not in the download list'.format(datetime.now(), symbol)
        if self.strategy:
            self.strategy.writeCtaError(msg)
        else:
            print(msg)
        return ret_bars
    url = u'https://www.okex.com/api/v1/kline.do?symbol={}&type={}'.format(symbol, period)
    if isinstance(size_, int):
        url = url + u'&size={}'.format(size_)
    if start_dt is not None and isinstance(start_dt, datetime):
        url = url + u'&since={}'.format(int(start_dt.timestamp() * 1000))
    self.writeLog('{} start downloading: {} {} data. URL: {}'.format(datetime.now(), symbol, period, url))
    content = None
    try:
        content = self.session.get(url).content.decode('gbk')
    except Exception as ex:
        self.writeError('exception in get:{},{},{}'.format(url, str(ex), traceback.format_exc()))
        return ret_bars
    bars = execjs.eval(content)
    if not isinstance(bars, list):
        self.writeError('response is not a list:{}'.format(content))
        return ret_bars
    for i, bar in enumerate(bars):
        if len(bar) < 5:
            self.writeError('error when get bar:{}'.format(bar))
            return ret_bars
        if i == 0:
            continue
        add_bar = {}
        try:
            bar_datetime = datetime.fromtimestamp(bar[0] / 1000)
            add_bar['datetime'] = bar_datetime.strftime('%Y-%m-%d %H:%M:%S')
            add_bar['date'] = bar_datetime.strftime('%Y-%m-%d')
            add_bar['time'] = bar_datetime.strftime('%H:%M:%S')
            add_bar['open'] = float(bar[1])
            add_bar['high'] = float(bar[2])
            add_bar['low'] = float(bar[3])
            add_bar['close'] = float(bar[4])
            add_bar['volume'] = float(bar[5])
        except Exception as ex:
            self.writeError('error when convert bar:{},ex:{},t:{}'.format(bar, str(ex), traceback.format_exc()))
        ret_bars.append(add_bar)
    return ret_bars
def getDayBars(self, symbol, callback, start_dt=None):
    """
    Load the latest daily bars from sina.
    :param symbol: full contract name (use ctaTemplate.getFullSymbol() first)
    :param callback: callback function
    :param start_dt: start time, defaults to None
    :return: success / failure
    """
    sinaBars = []
    try:
        url = u'http://stock.finance.sina.com.cn/futures/api/json.php/InnerFuturesService.getInnerFuturesDailyKLine?symbol={0}'.format(symbol)
        self.strategy.writeCtaLog(u'downloading daily K-line data for {0} from sina: {1}'.format(symbol, url))
        responses = execjs.eval(self.session.get(url).content.decode('gbk'))
        dayVolume = 0
        for item in responses:
            bar = CtaBarData()
            bar.vtSymbol = symbol
            bar.symbol = symbol
            # close time of the bar
            bar.datetime = datetime.strptime(item['date'], '%Y-%m-%d')
            if start_dt is not None:
                if bar.datetime < start_dt:
                    continue
            bar.date = bar.datetime.strftime('%Y%m%d')
            bar.tradingDay = bar.date
            bar.time = bar.datetime.strftime('%H:%M:00')
            bar.open = float(item['open'])
            bar.high = float(item['high'])
            bar.low = float(item['low'])
            bar.close = float(item['close'])
            bar.volume = int(item['volume'])
            bar.dayVolume = bar.volume
            sinaBars.append(bar)
        if len(sinaBars) > 0:
            self.strategy.writeCtaLog(u'read {0} daily bars from sina'.format(len(sinaBars)))
            # feed the sina bars into the callback
            for bar in sinaBars:
                callback(bar)
            # done, clear the list
            sinaBars = []
            return True
        else:
            self.strategy.writeCtaLog(u'failed to read daily K-line data from sina')
            return False
    except Exception as e:
        self.strategy.writeCtaLog(u'failed to load sina daily history data: ' + str(e))
        return False
def calc(response):
    query = response.match.group(3)
    params = dict(hl='en', q=query)
    response = requests.get('https://www.google.com/ig/calculator', params=params)
    mapping = execjs.eval(response.content)
    rhs, error = mapping['rhs'], mapping['error']
    return rhs or error
def _parse_cookie(js):
    cookie_string, anonymous_function = re.search(
        r"(__jsl_clearance=\d+\.?\d+\|0\|)'\+(\(function\(\).+)\+';Expires=", js).groups()
    result = execjs.eval(anonymous_function)
    key, value = f"{cookie_string}{result}".split("=")
    return {key: value}
def get_img(js_api):
    try:
        r = requests.get(js_api, headers=headers, timeout=2)
        imgs = execjs.eval(r.text)  # run the JS eval function in a runtime
        img = imgs[0]
        return img
    except Exception as e:
        print('API request failed:', e)
async def get_jsObj(response):
    bsObj = BeautifulSoup(response, 'lxml')
    mainBody = bsObj.find('div', id="mainBodySingle")
    scripts = bsObj.find_all('script', type="text/javascript")
    script = scripts[-1]
    jsObj = execjs.eval(script.text[0:-1])
    return jsObj, mainBody
def get_page_info(self, parent_link):
    inner_page_data = self.get_data(parent_link).decode('utf-8')
    inner_page_soup = BeautifulSoup(inner_page_data, 'html.parser')
    inner_script = inner_page_soup.find('script', {'type': 'text/javascript'})
    inner_script_refined = inner_script.text.split('\n')[3].strip().replace('eval(', '')[:-1]
    result = execjs.eval(inner_script_refined)
    self.info_dict = json.loads(result.replace('var pages=pages=\'', '').rstrip('\';'))
    return int(self.info_dict['sum_pages'])
def get_page_info(self, parent_link):
    inner_page_data = self.get_data("http://manhua.dmzj.com%s" % parent_link, is_destop=True).decode("utf-8")
    inner_page_soup = BeautifulSoup(inner_page_data, "html.parser")
    inner_script = inner_page_soup.find("script", {"type": "text/javascript"})
    inner_script_refined = inner_script.text.split("\n")[3].strip().replace("eval(", "")[:-1]
    result = execjs.eval(inner_script_refined)
    self.image_list = json.loads(result.replace("var pages=pages='", "").rstrip("';"))
    return len(self.image_list)
def random_string():
    """
    Generate a random string.
    :return:
    """
    generate_string = execjs.eval(
        '(((1 + Math.random()) * 0x10000) | 0).toString(16).substring(1)')
    return generate_string
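# For comparison — a sketch, not from the original — the same 4-hex-digit
# chunk without a JS runtime: (1 + random()) * 0x10000 lands in
# 0x10000..0x1ffff, so the hex form is always five digits starting with
# "1"; dropping that leading digit leaves four uniformly random hex digits.
import random

def random_string_py():
    return format(int((1 + random.random()) * 0x10000), 'x')[1:]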
def demo6():
    js = '''
    1+2
    '''
    response = execjs.eval(js)
    print(response)
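# demo6 above shows the simplest case; execjs.eval also maps JS arrays and
# objects back to Python lists and dicts, which is what most functions in
# this file rely on. (PyExecJS wraps the source in parentheses before
# evaluating, so bare object literals work too.)
import execjs

assert execjs.eval("1 + 2") == 3
assert execjs.eval("'red yellow blue'.split(' ')") == ['red', 'yellow', 'blue']
assert execjs.eval("{a: 1, b: [2, 3]}") == {'a': 1, 'b': [2, 3]}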
def action(self, parameters):
    try:
        condition = self.substitute_placeholders(self.expression, parameters, skipped_to_null=True)
        res = eval(condition)
        return False if res is None else res
    except Exception as e:
        # note: the % arguments must be grouped in one tuple; in the source
        # they were not, so the lone "% self.id" raised a TypeError and the
        # statement returned a tuple instead of a message
        return 'ConditionalNode exception: id: %s, expression: %s, parameters: %s, exception: %s' % (
            self.id, str(self.expression), str(self.parameters), str(e))
def getCodeArray(self, queryWords):
    codeArr = []
    for w in queryWords:
        arr = self.getCodeArrayFromWencai(
            execjs.eval("encodeURIComponent('" + w + "')"))
        for code in arr:
            codeArr.append(code)
    return codeArr
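# The JS round-trip above can be avoided — a sketch, not from the original:
# urllib.parse.quote with safe="!*'()" matches encodeURIComponent, which
# leaves only A-Z a-z 0-9 - _ . ! ~ * ' ( ) unescaped.
from urllib.parse import quote

def encode_uri_component(s):
    return quote(s, safe="!*'()")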
def get_images(html, url):
    key = re.search('root\.YUI_config\.flickr\.api\.site_key = "([^"]+)', html).group(1)
    model = re.search(r"Y\.ClientApp\.init\(([\s\S]+?)\)\s*\.then", html).group(1)
    data = execjs.eval(
        "auth = null, reqId = null, model = " + model + ", model.modelExport['photo-models'][0]")
    return query_video(data["id"], data["secret"], key)
def get_pre_login_info(self):
    url = 'https://passport.baidu.com/center'
    urllib2.urlopen(url)
    url = ('https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&'
           + str(int(time.time()) * 1000)
           + '&class=login&logintype=basicLogin&callback=bd__cbs__hqe0c')
    content = urllib2.urlopen(url).read()
    content = content[len('bd__cbs__hqe0c('):-1]
    content = execjs.eval(content)
    self.token = content['data']['token']
    url = ('https://passport.baidu.com/v2/getpublickey?token=' + self.token
           + '&tpl=pp&apiver=v3&tt=' + str(int(time.time()) * 1000)
           + '&callback=bd__cbs__zgtpei')
    content = urllib2.urlopen(url).read()
    content = content[len('bd__cbs__zgtpei('):-1]
    content = execjs.eval(content)
    pubkey_temp = content['pubkey']
    self.pubkey = pubkey_temp  # .replace('\n','\\n')
    self.key = content['key']
    # print self.token, self.pubkey, self.key
    self.encrypt_keys()
def getTicks2(self, symbol, callback, start_dt=None):
    """
    Load the latest 1-minute data from sina (for CFFEX contracts).
    :param symbol: contract code (full contract name, use ctaTemplate.getFullSymbol() first)
    :param callback: callback function
    :param start_dt: start time, defaults to None
    :return: success / failure
    """
    try:
        url = u'http://stock2.finance.sina.com.cn/futures/api/jsonp.php/var%20t1nf_{0}=/InnerFuturesNewService.getMinLine?symbol={0}'.format(symbol)
        self.strategy.writeCtaLog(u'downloading {0} tick data from sina: {1}'.format(symbol, url))
        response_data = self.session.get(url).content
        response_data = response_data.decode('gbk').split('=')[-1]
        response_data = response_data.replace('(', '')
        response_data = response_data.replace(');', '')
        responses = execjs.eval(response_data)
        datevalue = datetime.now().strftime('%Y-%m-%d')
        self.strategy.writeCtaLog(u'extracted {} intraday records in total'.format(len(responses)))
        for i, item in enumerate(responses):
            tick = CtaTickData()
            tick.vtSymbol = symbol
            tick.symbol = symbol
            # was "len(item) >= 6" in the source, which overruns on item[6]
            if len(item) > 6:
                datevalue = item[6]
            tick.date = datevalue
            tick.time = item[0] + u':00'
            tick.datetime = datetime.strptime(tick.date + ' ' + tick.time, '%Y-%m-%d %H:%M:%S')
            tick.tradingDay = tick.date
            if start_dt is not None:
                if tick.datetime < start_dt:
                    continue
            tick.lastPrice = float(item[1])
            tick.volume = int(item[3])
            if type(item[4]) == type(None):
                tick.openInterest = 0
            else:
                tick.openInterest = int(item[4])
            callback(tick)
        return True
    except Exception as e:
        self.strategy.writeCtaLog(u'failed to load sina history tick data: ' + str(e))
        return False
def get_image_link(self, parent_link, page):
    node_script = ''
    # was "is ''" in the source; identity comparison with a literal is unreliable
    while node_script == '':
        node_script = self.get_data(
            self.general_formula % (parent_link, parent_link[2:-1], page),
            'http://www.dm5.com%s' % parent_link).decode('utf-8')
        if node_script == '':
            webbrowser.open_new('http://www.dm5.com%s' % parent_link)
            time.sleep(3)
    link = execjs.eval(node_script)[0]
    link_safe = self.unicodeToURL(link)
    return link_safe
def get_bars(self, symbol, period, callback, bar_is_completed=False, bar_freq=1, start_dt=None):
    """
    Return k-line data.
    symbol: contract
    period: bar interval: 1min,3min,5min,15min,30min,1day,3day,1hour,2hour,4hour,6hour,12hour
    """
    if symbol not in symbol_list:
        self.strategy.writeCtaError(u'{} {} is not in the download list'.format(datetime.now(), symbol))
        return
    url = u'https://www.okex.com/api/v1/kline.do?symbol={}&type={}'.format(symbol, period)
    self.strategy.writeCtaLog('{} start downloading: {} {} data. URL: {}'.format(datetime.now(), symbol, period, url))
    content = None
    try:
        content = self.session.get(url).content.decode('gbk')
    except Exception as ex:
        self.strategy.writeCtaError('exception in get:{},{},{}'.format(url, str(ex), traceback.format_exc()))
        return
    bars = execjs.eval(content)
    for i, bar in enumerate(bars):
        if len(bar) < 5:
            self.strategy.writeCtaError('error when import bar:{}'.format(bar))
            return False
        if i == 0:
            continue
        add_bar = CtaBarData()
        try:
            add_bar.vtSymbol = symbol
            add_bar.symbol = symbol
            add_bar.datetime = datetime.fromtimestamp(bar[0] / 1000)
            add_bar.date = add_bar.datetime.strftime('%Y-%m-%d')
            add_bar.time = add_bar.datetime.strftime('%H:%M:%S')
            add_bar.tradingDay = add_bar.date
            add_bar.open = float(bar[1])
            add_bar.high = float(bar[2])
            add_bar.low = float(bar[3])
            add_bar.close = float(bar[4])
            add_bar.volume = float(bar[5])
        except Exception as ex:
            self.strategy.writeCtaError('error when convert bar:{},ex:{},t:{}'.format(bar, str(ex), traceback.format_exc()))
            return False
        # was "bar.datetime" in the source, but bar is a plain list
        if start_dt is not None and add_bar.datetime < start_dt:
            continue
        if callback is not None:
            callback(add_bar, bar_is_completed, bar_freq)
    return True
def getimgurls(html, url):
    # Set base url
    base = "http://images.dmzj.com/"
    # Get urls
    html = html.replace("\n", "")
    s = re.search(r"page = '';\s*(.+?);\s*var g_comic_name", html).group(1)
    pages = execjs.compile(s).eval("pages")
    pages = execjs.eval(pages)
    # thumbs.db?!
    # http://manhua.dmzj.com/zhuoyandexiana/3488-20.shtml
    return [base + page for page in pages
            if page and not page.lower().endswith("thumbs.db")]
def jseval(job=None, expression=None):
    if expression.startswith('{'):
        exp_tpl = '''function () { $job = %s; return function()%s();}() '''
    else:
        exp_tpl = '''function () { $job = %s; return %s;}() '''
    exp = exp_tpl % (json.dumps(job), expression)
    return execjs.eval(exp)
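# A hypothetical call to jseval above, assuming a JSON-serializable job:
# the template binds the job to $job inside an IIFE, so the expression can
# reference it directly.
result = jseval(job={'inputs': {'n': 3}}, expression='$job.inputs.n * 2')
# result == 6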
def grab_price_history(response):
    """Callback to parse out price history data"""
    cols = ''
    rows = ''
    # find the js code lines that carry the data
    for line in response.iter_lines():
        line = line.strip()
        if line.startswith('cols'):
            cols = ':'.join(line.split(':')[1:]).strip(',')
        elif line.startswith('rows'):
            rows = ':'.join(line.split(':')[1:]).strip()
    # eval the js to get the data
    rows = execjs.eval(rows)
    cols = execjs.eval(cols)
    # clean up cols and rows
    cols = [col.get('label') if col.get('label') else col.get('p').get('role') for col in cols]
    rows = [[f['v'] for f in row['c']] for row in rows]
    df = pd.DataFrame(rows)
    df.columns = cols
    return df
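# Hypothetical shape of the response body grab_price_history above expects,
# inferred from its parsing logic — Google-Charts-style column/row literals
# embedded in a script:
#
#   cols: [{label: 'Date'}, {label: 'Price'}],
#   rows: [{c: [{v: '2015-01-01'}, {v: 9.99}]}]
#
# which the function turns into a two-column DataFrame.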
def samair():
    """
    http://www.samair.ru/proxy
    """
    base_uri = 'http://www.samair.ru/proxy/'
    page = requests.get(base_uri)
    tree = lxml.html.fromstring(page.text)
    js_uri = urllib.basejoin(base_uri, tree.xpath('.//script[@type="text/javascript"]/@src')[0])
    js_vars = re.search('eval\((.*)\)', requests.get(js_uri).text.strip()).group(1)
    js_vars = execjs.eval(js_vars)
    uri_list = [base_uri] + [urllib.basejoin(base_uri, u)
                             for u in tree.xpath('.//a[@class="page"]/@href')]
    results = []
    for uri in uri_list:
        page = requests.get(uri)  # was requests.get(base_uri), which re-fetched the first page every time
        tree = lxml.html.fromstring(page.text)
        rows = tree.xpath('.//table[@id="proxylist"]/tr')[1:]
        for row in rows:
            td_list = row.xpath('.//td')
            if len(td_list) != 4:
                continue
            ip = td_list[0].text
            # Get the JavaScript that corresponds to the obfuscated port:
            port_js = td_list[0].xpath('.//script')[0].text
            port_js = re.search('document\.write\(\":\"\+(.+)\)', port_js).group(1)
            port_vars = port_js.split('+')
            p = '+'.join(['(%s).toString()' % v for v in port_vars])
            # Construct function to interpret to get the actual port value:
            f = 'function(){' + js_vars + 'return ' + p + '}()'
            port = str(execjs.eval(f))
            results.append('http://' + ip + ':' + port)
    return results
def getimgurls(html, page=0, url=""):
    header["Referer"] = url
    key = re.search(r'id="dm5_key".+?<script[^>]+?>\s*eval(.+?)</script>', html, re.S)
    if key:
        key = execjs.eval(key.group(1)).split(";")[1]
        key = re.search(r"=(.+)$", key).group(1)
        key = execjs.eval(key)
    else:
        key = ""
    base = re.search(r"(^.+)/[^/]*$", url).group(1)
    pages = re.search("DM5_IMAGE_COUNT=(\d+);", html).group(1)
    cid = re.search("DM5_CID=(\d+);", html).group(1)
    s = []
    for p in range(1, int(pages) + 1):
        currentUrl = "{}/chapterfun.ashx?cid={}&page={}&language=1&key={}".format(base, cid, p, key)
        ot = comiccrawler.grabhtml(currentUrl, hd=header)
        context = execjs.compile(ot)
        # window.ajaxloadimage
        d = context.eval("(typeof (hd_c) != 'undefined' && hd_c.length > 0 && typeof (isrevtt) != 'undefined') ? hd_c : d")
        s.append(d[0])
    return s
def unjs_email(script):
    """Takes a javascript email mangling script and returns the email address."""
    # Get hold of the lines of javascript which aren't fiddling with the DOM
    jslines = [x.strip() for x in
               re.search(r'<!--(.*)//-->', script, re.M | re.S).group(1).strip().splitlines()
               if not x.strip().startswith('document')]
    # The name of the variable containing the email address varies,
    # so find it by regex.
    varname = re.search(r'var (addy\d+)', script).group(1)
    jslines.append('return {}'.format(varname))
    js = '(function() {{{}}})()'.format(' '.join(jslines))
    return unescape(execjs.eval(js))
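# A hypothetical mangled-email snippet of the kind unjs_email above targets:
# the document.write lines are dropped, the addy variable is returned from
# an IIFE, and the HTML entities are unescaped afterwards.
example_script = """<script>
<!--
var addy51342 = 'user' + '&#64;' + 'example' + '.' + 'com';
document.write('<a href="mailto:' + addy51342 + '">');
document.write(addy51342);
document.write('</a>');
//-->
</script>"""
# unjs_email(example_script) -> 'user@example.com'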
def getimgurls(html, url):
    """getimgurls(html, url) -> url list

    Return a list of urls.
    """
    html = html.replace("\n", "")
    s = re.search("page = '';(.+?);var g_comic_name", html).group(1)
    ctx = execjs.compile(s)
    pages = execjs.eval(ctx.eval("pages"))
    base = "http://images.dmzj.com/"
    # thumbs.db?!
    # http://manhua.dmzj.com/zhuoyandexiana/3488-20.shtml
    return [base + page for page in pages
            if page and not page.lower().endswith("thumbs.db")]
def evaluate(self, expression=None, job=None, context=None, *args, **kwargs):
    if expression.startswith('{'):
        exp_tpl = '''function () { $job = %s; $self = %s; return function()%s();}() '''
    else:
        exp_tpl = '''function () { $job = %s; $self = %s; return %s;}() '''
    exp = exp_tpl % (json.dumps(job), json.dumps(context), expression)
    return execjs.eval(exp)
def getepisodelist(html, url):
    data_js = re.search("initIntroData(.+?);", html, re.DOTALL).group(1)
    data = execjs.eval(data_js)
    ep_data = []
    for category in data:
        ep_data += category["data"]
    ep_data = sorted(ep_data, key=lambda data: data["chapter_order"])
    episodes = []
    for data in ep_data:
        ep_url = "/view/{}/{}.html".format(data["comic_id"], data["id"])
        title = data["title"] + data["chapter_name"]
        episodes.append(Episode(title, urljoin(url, ep_url)))
    return episodes
def evaluate_rabix_js(expression, job, context=None, engine_config=None, outdir=None, tmpdir=None):
    # log.debug("expression: %s" % expression)
    if expression.startswith('{'):
        exp_tpl = '''function () { $job = %s; $self = %s; return function()%s();}() '''
    else:
        exp_tpl = '''function () { $job = %s; $self = %s; return %s;}() '''
    exp = exp_tpl % (json.dumps(job), json.dumps(context), expression)
    result = execjs.eval(exp)
    log.debug("Expression result: %s" % result)
    return result