def parse(expression, mode='eval'):
    """Parse a JavaScript expression string into slimit AST node(s).

    mode='eval' returns the single expression node; mode='exec' returns
    the list of top-level statement children.  Raises TypeError for any
    other mode.
    """
    if mode not in ('eval', 'exec'):
        raise TypeError("Only eval, exec modes allowed")
    nodes = Parser().parse(expression).children()
    return nodes[0].expr if mode == 'eval' else nodes
def case(self, case):
    """Parse *case* with two independent parsers and assert both yield
    equal trees (parser instances must not share mutable state)."""
    first_tree = Parser().parse(case)
    second_tree = Parser().parse(case)
    self.assertEqual(first_tree, second_tree)
def scan_js(crawler, url, content):
    '''
    Scan javascript for url assignments (like ajax calls).

    Walks the slimit AST of *content*; for every assignment whose left-hand
    name contains 'url', reports string values and feeds them to
    crawler.check_link.
    '''
    LOGGER.info('Scanning Javascript on %s' % url)
    parser = Parser()
    tree = parser.parse(content)
    for node in nodevisitor.visit(tree):
        if not isinstance(node, ast.Assign):  # <something>: <something>
            continue
        leftval = getattr(node.left, 'value', '')  # 'leftval': <something>
        if not leftval:
            continue
        if 'url' not in leftval:  # 'url': <something>
            continue
        # slimit keeps the surrounding quotes in String.value, so [1:-1]
        # strips them.  BUG FIX: check_link previously got [2:-1], which
        # dropped the first character of every reported URL.
        if isinstance(node.right, ast.String):  # 'url': 'somestring'
            LOGGER.info('Found interesting url in JS: %s'
                        % node.right.value[1:-1])
            crawler.check_link(url, node.right.value[1:-1])
        for item in node.right.__dict__.values():
            # string in <something>
            # <something> may be function_call() / variable + 'somestring'
            if isinstance(item, ast.String):
                LOGGER.info('Found interesting url in JS: %s'
                            % item.value[1:-1])
                crawler.check_link(url, item.value[1:-1])
def get_forecast(link):
    """Fetch a windguru-style forecast page and extract the JS data table.

    Parses the embedded <script> once, collects every assignment node, and
    builds a dict of all values, with array-valued assignments re-parsed via
    parse_array and overriding the scalar interpretation.
    """
    html_doc = urllib2.urlopen(link).read()
    soup = BeautifulSoup(html_doc, "html.parser")
    forecast_text = soup.find("div", id="div_wgfcst1").find("script").string
    parser = Parser()
    forecast_tree = parser.parse(forecast_text)
    # FIX: the original parsed forecast_text a second time just to walk the
    # same assignments again — collect them once and reuse the list.
    assigns = [node for node in nodevisitor.visit(forecast_tree)
               if isinstance(node, ast.Assign)]
    full_data = {parse_key(node.left): parse_value(node.right)
                 for node in assigns}
    forecast = {parse_key(node.left): parse_array(node.right)
                for node in assigns
                if isinstance(node.right, ast.Array)}
    full_data.update(forecast)
    return full_data
def extract_g_config(script_text):
    """Find the `var g_config = ...` declaration in *script_text* and
    return its initializer converted to a map.

    Returns None when no such declaration is present.
    """
    tree = Parser().parse(script_text)
    for node in nodevisitor.visit(tree):
        is_g_config = (isinstance(node, ast.VarDecl)
                       and node.identifier.value == 'g_config')
        if is_g_config:
            return extract_object_as_map(node.initializer)
def position_info_from_naver(url):
    """Fetch article detail data for a Naver Land listing URL.

    Accepts either an m.land.naver.com URL (article number is the last path
    segment) or a new.land.naver.com URL (article number is the `articleNo`
    query parameter).  Scans the article-info page's scripts for the
    `land.articleDetail.jsonPageData` payload, parses the surrounding JS
    with slimit, and returns the dict collected by NaverDataVisitor
    (empty dict when the payload is not found).
    """
    parsed_obj = urlparse(url)
    article_info_no = 1
    if parsed_obj.netloc == 'm.land.naver.com':
        article_info_no = parsed_obj.path.split('/')[-1]
    elif parsed_obj.netloc == 'new.land.naver.com':
        article_info_no = dict(parse_qsl(parsed_obj.query))['articleNo']
    else:
        raise Exception('unknown url : ' + parsed_obj.netloc )
    r = requests.get('https://m.land.naver.com/article/info/' + str(article_info_no))
    soup = BeautifulSoup(r.content, "html.parser")
    res = dict()
    for sc in soup.findAll("script"):
        pos = sc.text.find('land.articleDetail.jsonPageData')
        if pos > 0:
            bracket_start_pos = sc.text.find('{',pos)
            bracket_end_pos = sc.text.find(';', pos)
            #print(sc.text[bracket_start_pos:bracket_end_pos])
            #js_obj_txt = sc.text[bracket_start_pos:bracket_end_pos]
            # Deliberately slices from the marker itself (a full assignment
            # statement) rather than from the opening brace, so slimit sees
            # parseable JS — see the commented-out alternative above.
            js_obj_txt = sc.text[pos:bracket_end_pos]
            # print(js_obj_txt)
            parser = Parser()
            tree = parser.parse(js_obj_txt)
            visitor = NaverDataVisitor()
            visitor.visit(tree)
            res = visitor.res
    return res
def card_price_history(setname, cardname):
    '''
    Scrapes price history of card from MTGPrice.com, using javascript parser

    Input: Setname and cardname are strings, generally taken from Scryfall API.
    Output: A numpy array of price history, each 'row' in the form
    [timestamp, price]
    '''
    # Build the page URL from underscore-joined set and card names.
    link = 'https://www.mtgprice.com/sets/{}/{}'.format(
        '_'.join(setname.split()), '_'.join(cardname.split()))
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')

    marker = 'var results = ['
    history = []
    for script in soup.findAll('script', type='text/javascript'):
        if marker not in script.text:
            continue
        tree = Parser().parse(script.text)
        # Find the "data" assignment and collect its [timestamp, price] pairs.
        for node in nodevisitor.visit(tree):
            if not isinstance(node, ast.Assign):
                continue
            if getattr(node.left, 'value', '') != '"data"':
                continue
            for pair in node.right.items:
                history.append([pair.items[0].value, pair.items[1].value])
            break
    return np.array(history)
def test_func(self):
    """Mangle the (externally supplied) `input` source and compare its
    ECMA rendering against `expected`, ignoring outer whitespace."""
    tree = Parser().parse(input)
    mangle(tree, toplevel=True)
    actual = textwrap.dedent(tree.to_ecma()).strip()
    self.assertMultiLineEqual(actual, textwrap.dedent(expected).strip())
def parse_country(self, response):
    """Scrape a Worldometer country chart: extract the country name from the
    chart title, pull the dates (`categories`) and case counts (`data`) out
    of the chart's inline JS, and write them to data/<country>.csv.
    """
    charts = response.xpath('//*[@class="row graph_row"]')
    total_corona_chart = charts[0]
    script = total_corona_chart.xpath('div/script/text()').extract()[0]
    title = total_corona_chart.xpath('div/h3/text()').extract()[0]
    try:
        country_name = title[title.index(" in ") + 4:]
        if country_name[:4] == "the ":
            country_name = country_name[4:]
    except ValueError:
        # BUG FIX: was `except e:` — `e` is undefined, so any failure raised
        # NameError instead of this message.  str.index raises ValueError.
        raise ValueError("Worldometer changed their labels. "
                         "Hold your pain, Harold.")
    parser = Parser()
    tree = parser.parse(script)
    data = [None, None]  # dates and corresponding number of cases
    for node in nodevisitor.visit(tree):
        if isinstance(node, ast.Assign):
            if getattr(node.left, 'value', '') == 'categories' and not data[0]:
                print("\nparsing dates\n")
                # NOTE(review): eval() on scraped content is a code-execution
                # risk; it only unquotes JS string literals here — consider
                # ast.literal_eval or a plain strip of quotes instead.
                data[0] = [eval(getattr(s, 'value', ''))
                           for s in getattr(node.right, 'items', '')]
            elif getattr(node.left, 'value', '') == 'data' and not data[1]:
                print("\nparsing number of cases\n")
                data[1] = [int(getattr(n, 'value', ''))
                           for n in getattr(node.right, 'items', '')]
    assert data[0] and data[1] and len(data[0]) == len(data[1])
    with open("data/%s.csv" % country_name, 'w+') as f:
        for k in range(len(data[0])):
            f.write(data[0][k])
            f.write(',')
            f.write(str(data[1][k]))
            f.write('\n')
def analyzeJSCodesFinerBlock(script, display=False):
    """Parse *script* with slimit, split it into first-level blocks via
    MyVisitor, and report per-block AST timing.

    Returns (visitor.first_level_seq, visitor.scripts), or (None, None) on
    any parse/consistency failure (errors go to stderr).
    NOTE: Python 2 code (print statements).
    """
    try:
        t1 = time.time()
        parser = Parser()
        script = script.strip()
        # Strip HTML comment wrappers sometimes left around inline scripts.
        if script.startswith('<!--') and script.endswith('-->'):
            script = script[4:-3]
        tree = parser.parse(script)
        visitor = MyVisitor(display)
        visitor.visit(tree, 0)
        if len(visitor.first_level_seq) != len(visitor.scripts):
            print >> sys.stderr, "error parsing script: scripts and seqs length inconsistent " + script[: 100]
            return None, None
        t2 = time.time()
        total_time = t2 - t1
        total_len = float(len(script))
        try:
            # Attribute total parse time to each block proportionally to
            # its share of the script's length.
            portion = [len(x) / total_len for x in visitor.scripts]
            for i in range(len(portion)):
                t = total_time * portion[i]
                print "AST_TIME: %f %d" % (t, len(visitor.scripts[i]))
        except:
            # Best-effort timing report only — never fail the analysis.
            pass
        return visitor.first_level_seq, visitor.scripts
    except Exception as e:
        print >> sys.stderr, "error parsing script: " + str(
            e) + " || [START]" + script[:100] + "[END]"
        return None, None
def get_property_attributes(url):
    """Scrape a property ad page and return its JS-embedded attributes.

    Returns a dict of assignment name -> value (quoted strings as found in
    the page's 4th text/javascript block) plus a '"date sold"' entry, or
    None when the URL resolved to a generic search-results page.
    """
    response = requests.get(url)
    # html parser
    soup = BeautifulSoup(response.text, 'html.parser')
    script = soup.findAll('script', {'type': 'text/javascript'})[3]

    # A title containing 'Real Estate Properties' means the ad link fell
    # through to a search page — nothing to extract.
    if soup.title.string.find('Real Estate Properties') != -1:
        return None

    # Archived ads carry a dummy far-future date; live ads expose the real
    # date in the page title.
    if soup.find("span", "status-label label-archive") is not None:
        date = '31 Dec 9999'
    else:
        date = re.findall(r'\d{2}\s\w{3}\s\d{4}', soup.title.string)[0]

    # javascript parser
    tree = Parser().parse(script.text)
    fields = {
        getattr(node.left, 'value', ''): getattr(node.right, 'value', '')
        for node in nodevisitor.visit(tree)
        if isinstance(node, ast.Assign)
    }
    fields['"date sold"'] = '"' + date + '"'
    return fields
def chapter_url2image_urls(chapter_url):
    """Resolve a manga chapter URL to an ordered tuple of image URLs.

    For each matched script payload: parse it, walk nodes until the
    `image_list` identifier and keep the node visited immediately before it
    (`pre`), regex-extract the embedded literal from `pre`'s ECMA rendering,
    eval() it twice (the literal is doubly encoded), then base64-decode each
    entry's 'src' and sort by 'page'.  Returns on the first payload found.
    """
    g = get_info_from_url(chapter_url, chapter2images)
    p = Parser()
    for t2_or_t3, (slot_idx, pattern_idx, info_pattern) in g:
        tag, _, data = t2_or_t3
        #p = Parser()
        tree = p.parse(data)
        pre = None
        # `pre` trails the visitor by one node, ending on the node just
        # before the `image_list` identifier.
        for node in nodevisitor.visit(tree):
            if isinstance(node, ast.Identifier) and node.value == 'image_list':
                break
            pre = node
        assert pre != None
        m = _image_list_match_pattern.match(pre.to_ecma())
        assert m != None
        # NOTE(review): double eval() of scraped page data is a
        # code-execution risk; consider ast.literal_eval if the payload is
        # guaranteed to be a plain literal — TODO confirm payload shape.
        image_list = eval(m.group(1))
        image_list = eval(image_list)
        ls = []
        for info in image_list.values():
            src = base64.b64decode(info['src']).decode('ascii')
            page = info['page']
            ls.append((page, src))
        ls.sort()
        ls = tuple(src for _, src in ls)
        return ls
def assertUnusedObjects(self, source, expected):
    """Run the unused-objects pass over *source* and assert its ECMA
    rendering equals *expected*."""
    tree = Parser().parse(source)
    visitor = UnusedObjectsVisitor()
    visitor.do(tree)
    self.maxDiff = None
    self.assertSequenceEqual(tree.to_ecma(), expected)
def parse_global_js_for_access_id_action_url(global_js):
    """Extract the action URL from global.js by collecting the single
    assignment to each of 'protocol', 'roDomain', 'ro' and 'rt' and
    concatenating their values in that order.

    Raises AssertionError (message names the offending part) if the same
    value is assigned twice to one part, and IndexError if any part was
    never assigned.
    """
    parser = Parser()
    tree = parser.parse(global_js)
    parts = ['protocol', 'roDomain', 'ro', 'rt']
    UrlParts = namedtuple('UrlParts', parts)
    url_parts = UrlParts([], [], [], [])
    getvalue = operator.attrgetter('value')
    err = "Too many '{}' assignments in global.js."
    for node in nodevisitor.visit(tree):
        if isinstance(node, ast.Assign):
            try:
                left_value = getvalue(node.left).strip('\'"')
            except AttributeError:
                # Left side has no simple .value (e.g. member expression).
                continue
            if left_value in parts:
                right_value = getvalue(node.right).strip('\'"')
                # BUG FIX: the message previously said err.format('protocol')
                # regardless of which part was duplicated.
                assert right_value not in getattr(
                    url_parts, left_value), err.format(left_value)
                getattr(url_parts, left_value).append(right_value)
    return (url_parts.protocol[0] + url_parts.roDomain[0]
            + url_parts.ro[0] + url_parts.rt[0])
def minify(text, mangle=False):
    """Minify JavaScript source *text*; optionally mangle identifiers first."""
    syntax_tree = Parser().parse(text)
    if mangle:
        mangler.mangle(syntax_tree)
    return ECMAMinifier().visit(syntax_tree)
def inital_check_for_obfuscation_condtiion_sensitiveFunctions(js_text):
    """Scan *js_text* for If statements and for identifiers matching known
    obfuscation / profiling function names.

    Returns (if_condition, obfuscation, profiling, ob_list, pro_list):
    whether any `if` appears, whether any obfuscation / profiling keyword
    appears, and the matched keyword sets.
    """
    tree = Parser().parse(js_text)
    keywords = set()
    if_condition = False
    for node in nodevisitor.visit(tree):
        if isinstance(node, If):
            if_condition = True
            continue
        # Walk into DotAccessor chains so names like a.b.getStringfromChar
        # contribute their component identifiers.
        stack = [node]
        while stack:
            current = stack.pop()
            if isinstance(current, DotAccessor):
                try:
                    stack.extend(current.children())
                except Exception:
                    # Malformed node — skip its children (was a bare except).
                    pass
                continue
            if isinstance(current, Identifier):
                keywords.add(current.value)
    # FIX: removed dead `if if_condition: pass` and replaced the manual
    # membership loops with set intersections (same resulting sets).
    ob_list = keywords.intersection(obfuscation_function_names)
    pro_list = keywords.intersection(profiling_function_names)
    obfuscation = bool(ob_list)
    profiling = bool(pro_list)
    return if_condition, obfuscation, profiling, ob_list, pro_list
def assertFoldingObjects(self, source, expected):
    """Run the constant-folding pass over *source* and assert its ECMA
    rendering equals *expected*."""
    parser = Parser()
    tree = parser.parse(source)
    uvisit = foldingvisitor.FoldingVisitor()
    uvisit.do(tree)
    # FIX: removed a leftover debug `print(tree.to_ecma())` that polluted
    # test output on every run.
    self.maxDiff = None
    self.assertSequenceEqual(tree.to_ecma(), expected)
def invJSToZ3(inv, typeEnv):
    """Parse the JS invariant *inv* (a single expression statement) and
    translate it to a Z3 expression using *typeEnv*."""
    program = Parser().parse(inv)
    assert isinstance(program, jsast.Program)
    statements = program.children()
    assert len(statements) == 1
    assert isinstance(statements[0], jsast.ExprStatement)
    return jsToZ3Expr(statements[0].expr, typeEnv)
def get_m_decl(fil):
    """Return the ECMA source of the initializer of `var m = ...` in *fil*.

    The file is read as UTF-8.  Returns None when no such declaration
    exists; when several exist, the last one wins.
    """
    with codecs.open(fil, 'r', encoding='utf8') as handle:
        source = handle.read()
    tree = Parser().parse(source)
    initializer_src = None
    for node in nodevisitor.visit(tree):
        if isinstance(node, ast.VarDecl) and node.identifier.value == 'm':
            initializer_src = node.initializer.to_ecma()
    return initializer_src
def test_throw_statement(self):
    # The expression after `throw` is not optional: ASI at the lexer level
    # inserts ';' right after a bare `throw`, so splitting the expression
    # onto the next line must produce a SyntaxError.
    source = textwrap.dedent("""
        throw
        'exc';
        """)
    parser = Parser()
    self.assertRaises(SyntaxError, parser.parse, source)
def test_bug_no_semicolon_at_the_end_of_block_plus_newline_at_eof(self):
    # Regression test for https://github.com/rspivak/slimit/issues/3 —
    # a function body with no trailing semicolon followed by a newline at
    # EOF must still parse into a non-empty tree.
    source = textwrap.dedent("""
        function add(x, y) {
            return x + y;
        }
        """)
    tree = Parser().parse(source)
    self.assertTrue(bool(tree.children()))
def parse_script(data):
    """Parse Steam's JS response and return all variable declaration nodes.

    Works around a syntax bug in Steam's output where a call is emitted
    with a dangling comma/empty argument.
    """
    # Hack. Fix javascript syntax issue in steam's response
    patched = data.replace('BuildGameRow(game, )', 'BuildGameRow(game, 0)')
    tree = Parser().parse(patched)
    return [node for node in nodevisitor.visit(tree)
            if isinstance(node, ast.VarDecl)]
def get_embedded_json():
    """Collect the field/value pairs passed to `$rwidgets(...)` calls in the
    scripts under the page's <div id="JSDF">.

    Relies on enclosing-scope names: `soup` (page), `duplicates` (keys whose
    values accumulate into lists), and `silence_output` (context manager
    muting slimit's warnings).  Returns a dict of field -> value, where
    duplicate-listed keys map to lists and 'null' values are dropped for
    all other keys.
    """
    # Strip c from s, without exception
    def strip(s, c):
        if isinstance(s, str):
            return s.strip(c)
        return s

    div = soup.find('div', id='JSDF')
    scripts = div.find_all('script', src=None)
    # Look for $rwidgets
    script_texts = []
    for script in scripts:
        for s in script.contents:
            if '$rwidgets' in s:
                script_texts.append(s)
    # Bodge until we get rid of slimit
    with silence_output():
        parser = Parser()
        raw_values = {}
        for script_text in script_texts:
            tree = parser.parse(script_text)  # Parsing js
            for node in nodevisitor.visit(tree):
                if isinstance(node, ast.FunctionCall):
                    if isinstance(node.identifier, ast.Identifier):
                        if node.identifier.value == '$rwidgets':
                            # Deal with here: gather assignments inside this
                            # one call before merging into raw_values.
                            fields = {}
                            for n in nodevisitor.visit(node):
                                if isinstance(n, ast.Assign):
                                    k = getattr(n.left, 'value', '').strip('"')
                                    v = strip(
                                        getattr(n.right, 'value', ''), '"')
                                    if k in duplicates:
                                        try:
                                            fields[k].append(v)
                                        except KeyError:
                                            fields[k] = [v]
                                    else:
                                        fields[k] = v
                            # Merge fields and raw_values, resolving duplicates
                            for (k, v) in fields.items():
                                if k in duplicates:
                                    try:
                                        raw_values[k] += v
                                    except KeyError:
                                        raw_values[k] = v
                                elif v != 'null':
                                    raw_values[k] = v
    return raw_values
def _test_function_expression(self):
    # Disabled test (leading underscore): an anonymous function in
    # statement position inside a block.  Only checks that parsing does
    # not raise.
    source = """
    if (true) {
      function() {
        foo;
        location = 'http://anywhere.com';
      }
    }
    """
    Parser().parse(source)
def parse(text):
    """
    Turn a valid JavaScript source string and turn it into a source tree
    through the Parser provided by the slimit.parser module.

    A single module-level Parser instance is created lazily and reused
    across calls.
    """
    global _parser
    parser = _parser
    if parser is None:
        parser = _parser = Parser()
    return parser.parse(text)
def addAllIntEnv(inv, env=None):
    """Map every identifier appearing in JS expression *inv* to Int.

    Mutates and returns *env*; a fresh dict is created when None is passed
    (avoids the mutable-default pitfall).
    """
    # FIX: compare to None with `is`, not `==` (PEP 8; `==` can be hijacked
    # by __eq__ overloads).
    if env is None:
        env = {}
    p = Parser()
    t = p.parse(inv)
    for node in nodevisitor.visit(t):
        if isinstance(node, jsast.Identifier):
            env[node.value] = Int
    return env
def extract_strings_slimit(javascript: str) -> List[str]:
    """Return every string literal in *javascript*, in source (pre-order)
    order, parsed via slimit."""
    from slimit.parser import Parser
    from slimit import ast

    if SHOW_WARNINGS:
        parser = Parser()
    else:
        # File descriptor hackiness to silence warnings: point fd 2 at
        # /dev/null while slimit builds its parser tables, then restore.
        null_fd = os.open(os.devnull, os.O_RDWR)
        old_fd = os.dup(2)
        try:
            os.dup2(null_fd, 2)
            parser = Parser()
        finally:
            os.dup2(old_fd, 2)
            os.close(null_fd)
            os.close(old_fd)

    # Hack to work around https://github.com/rspivak/slimit/issues/52
    KEYWORDS = r"(?:catch|delete|return|throw)"
    javascript = re.sub(rf"(\.\s*{KEYWORDS})\b", r"\1_", javascript)
    javascript = re.sub(rf"\b({KEYWORDS})(\s*:)", r"'\1'\2", javascript)

    parsed = parser.parse(javascript)

    # Iterative pre-order walk (children pushed reversed so they pop in
    # left-to-right order, matching the recursive original).
    strings: List[str] = []
    stack = [parsed]
    while stack:
        item = stack.pop()
        if item is None:
            continue
        if not isinstance(item, (ast.Node, list, tuple)):
            raise TypeError("Unexpected item: {!r}".format(item))
        if isinstance(item, ast.String):
            strings.append(item.value[1:-1])
        children = item.children() if isinstance(item, ast.Node) else item
        stack.extend(reversed(list(children)))
    return strings
def analyzeJSCodes(script, display=False):
    """Parse *script* with slimit and walk it with MyVisitor.

    Returns visitor.node_order_list, or None when parsing fails (the error
    and the offending script are reported on stderr).
    NOTE: Python 2 code (print statement in the except branch).
    """
    try:
        parser = Parser()
        tree = parser.parse(script)
        visitor = MyVisitor(display)
        visitor.visit(tree, 0)
        #print "first_level_seq: %d" %len(visitor.first_level_seq)
        return visitor.node_order_list
    except Exception as e:
        print >> sys.stderr, "error parsing script: " + str(
            e) + " || " + script
        return None
def main():
    """CLI entry point: read the JS file named in the arguments, run the
    constant-reduction pass over its AST, and print the resulting source.
    NOTE: Python 2 code (print statement).
    """
    args = parse_args()
    with open(args.filename) as f:
        source = f.read()
    parser = Parser()
    tree = parser.parse(source)
    visitor = ConstantReductionVisitor(args.debug)
    tree = visitor.visit(tree)
    print tree.to_ecma()
def parse_JavaScript(js):
    """Scan GWT-compiled JS for client-interface RPC functions.

    A function qualifies when its first parameter is 'this$static', its last
    is 'callback', and its body calls createStreamWriter.  For each match,
    appends to the module-level `extracted` list a dict with the cleaned
    function name, argument count (excluding this/callback), argument names,
    and their GWT type data.
    """
    # FIX: removed the no-op `global functions` declaration — `functions`
    # was never assigned or read here.
    parser = Parser()
    tree = parser.parse(js)
    for node in nodevisitor.visit(tree):
        if not isinstance(node, ast.FuncDecl) or len(node.parameters) <= 1:
            continue
        first = node.parameters[0]
        last = node.parameters[-1]
        # First param must be the GWT 'this', last must be the callback.
        if first.value != "this$static" or last.value != "callback":
            continue
        source = node.to_ecma()  # rendered once and reused below
        # Only client-interface functions call createStreamWriter.
        if "createStreamWriter" not in source:
            continue
        params = []
        if len(node.parameters) > 2:
            # -2 for the 'this' and callback
            num_of_params = len(node.parameters) - 2
            for param in node.parameters:
                # Keep only the arguments the GWT request will need.
                if param.value != "this$static" and param.value != "callback":
                    params.append(param.value)
        else:
            num_of_params = 0
        # Strip the mangled suffixes to recover the function name
        # (FIX: raw string for the regex).
        function = node.identifier.value.replace("$", "")
        function = re.sub(r'_\d+', '', function)
        # Append to a list, since we may have functions of the same name
        # but different signatures.
        extracted.append({
            "function": function,
            "num_of_args": num_of_params,
            "args": params,
            "arg_type_data": get_param_types(function, source)
        })