def test_findall():
    """Run ``findall`` on every node matched by an XPath and compare the built values."""
    cases = [
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            '//array',
            [dict, list],
            [['a', 'b', 'c'], ['d', 'e', 'f']],
        ),
        (
            r""" var arr1 = {"a": "b", "c": "d"}; var arr2 = {"e": 1, "f": 2}; """,
            '//object',
            [dict, list],
            [{'a': 'b', 'c': 'd'}, {'e': 1, 'f': 2}],
        ),
    ]
    for js_source, query, wanted_types, expected in cases:
        tree = js2xml.parse(js_source)
        # Flatten the matches of every selected node into one list,
        # keeping document order.
        found = [
            match
            for node in tree.xpath(query)
            for match in findall(node, types=wanted_types)
        ]
        built = [make(match) for match in found]
        assert_list_equal(built, expected)
def test_getall_complex():
    """``jsonlike.getall`` on a real-world snippet returns the expected list."""
    # NOTE: each JavaScript ``//`` comment is terminated by a line break so
    # that it does not swallow the statements that follow it.
    needle_snippet = r""" var needleParam = needleParam || {}; needleParam.chatGroup = "test"; needleParam.productId = "6341292"; needleParam.productPrice = "EUR 138.53".replace("$","n_").replace(/,/g,""); //Begin Needle (fan-sourcing platform) snippet
 jQuery(document).ready(function(){ var e = document.createElement("script"); e.type = "text/javascript"; e.async = true; e.src = document.location.protocol + "//overstock.needle.com/needle_service.js?1"; document.body.appendChild(e); }); // End Needle snippet
 """
    cases = [
        (needle_snippet, [{}]),
    ]
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        extracted = js2xml.jsonlike.getall(tree)
        assert_list_equal(extracted, expected)
def test_findall():
    """``jsonlike.findall`` + ``jsonlike.make_dict`` rebuild arrays and objects."""
    cases = [
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            "//array",
            [["a", "b", "c"], ["d", "e", "f"]],
        ),
        (
            r""" var arr1 = {"a": "b", "c": "d"}; var arr2 = {"e": 1, "f": 2}; """,
            "//object",
            [{"a": "b", "c": "d"}, {"e": 1, "f": 2}],
        ),
    ]
    for js_source, query, expected in cases:
        tree = js2xml.parse(js_source)
        # Collect the matches found under each selected node, in order.
        found = [
            match
            for node in tree.xpath(query)
            for match in js2xml.jsonlike.findall(node)
        ]
        built = [js2xml.jsonlike.make_dict(match) for match in found]
        assert_list_equal(built, expected)
def test_parse_string():
    """Check the text of every ``<string>`` node produced for JS string literals.

    Each case pairs a JavaScript source with the list expected from the
    XPath ``//string/text()`` on the parsed tree.
    """
    jscode_snippets = [
        (
            r""" var h = 'test'; var i = "test"; var j = ""; var k = '""'; var l = '"'; var m = ''; var n = "''"; var o = "'"; """,
            ['test', 'test', '', '""', '"', '', "''", "'"]
        ),
        (
            r""" var i = 'test\'s output'; """,
            [r"test's output"]
        ),
        (
            r""" var i = ["\"", '\'']; var j = "test\'s output"; var k = "test\\'s output"; var l = "nested \"quotes\"."; """,
            ['"', "'", r"test's output", r"test\'s output", r'nested "quotes".']
        ),
        (
            r""" var i = 'https://www.blogger.com/navbar.g?targetBlogID\0754325487278375417853\46blogName\75spirello\46publishMode\75PUBLISH_MODE_BLOGSPOT\46navbarType\75LIGHT\46layoutType\75LAYOUTS\46searchRoot\75http://spirelloskrimskramserier.blogspot.com/search\46blogLocale\75no\46v\0752\46homepageUrl\75http://spirelloskrimskramserier.blogspot.com/\46vt\0751357383140196484672'; """,
            [r'https://www.blogger.com/navbar.g?targetBlogID=4325487278375417853&blogName=spirello&publishMode=PUBLISH_MODE_BLOGSPOT&navbarType=LIGHT&layoutType=LAYOUTS&searchRoot=http://spirelloskrimskramserier.blogspot.com/search&blogLocale=no&v=2&homepageUrl=http://spirelloskrimskramserier.blogspot.com/&vt=1357383140196484672']
        ),
        (
            r""" var i = "foo \ bar"; var j = "foo \ bar"; """,
            [r'foo bar', 'foo bar']
        ),
        (
            # Unicode escape written literally in the JavaScript source.
            r""" var x = "\u00A9 Netscape Communications"; """,
            # ``u'...'`` instead of the Python-2-only ``ur'...'`` prefix: the
            # value is the same and the literal also compiles on Python 3.
            [u'\u00a9 Netscape Communications']
        ),
        (
            # Actual Unicode character, passed as UTF-8 encoded bytes.
            u""" var x = "\u00A9 Netscape Communications"; """.encode("utf8"),
            [u'\u00a9 Netscape Communications']
        ),
    ]
    for snippet, expected in jscode_snippets:
        jsxml = js2xml.parse(snippet)
        result = jsxml.xpath("//string/text()")
        assert_list_equal(result, expected)
def getimgsrc(pin_id):
    """Print the image URL of every pin key found on a huaban.com pin page.

    The page for *pin_id* is downloaded, the inline script containing
    ``app.page = app.page`` is parsed with js2xml, and each ``key`` string
    found under a ``pins`` property is printed with the image host prefix.
    """
    url = 'http://huaban.com/pins/%s/' % pin_id
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    z = requests.get(url, headers={'User-Agent': user_agent})
    sel = Selector(text=z.text)
    jscode = sel.xpath("//script[contains(., 'app.page = app.page')]/text()").extract_first()
    parsed_js = js2xml.parse(jscode)
    for i in parsed_js.xpath('//property[@name="pins"]//property[@name="key"]/string/text()'):
        # Function-call form: valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print('http://img.hb.aicdn.com/' + i)
def main():
    """Parse each JavaScript file named on the command line and print its XML.

    A filename of ``-`` (the default when none is given) reads from
    standard input.
    """
    ap = ArgumentParser()
    ap.add_argument('--debug', action='store_true')
    ap.add_argument('filenames', nargs='*', default=['-'])
    args = ap.parse_args()
    for fn in args.filenames:
        if fn == '-':
            source = sys.stdin.read()
        else:
            # Plain 'r' instead of 'rU': the 'U' flag was removed in
            # Python 3.11 and text mode already normalises newlines there.
            # The ``with`` block closes the file, which the original never did.
            with open(fn, 'r') as fo:
                source = fo.read()
        parsed = js2xml.parse(source)
        print(js2xml.pretty_print(parsed))
def parse(self, response):
    """Yield one item per object in the page's ``var data = [...]`` script.

    After the items, a request for the next page is yielded when a
    ``li.next`` link is present.
    """
    js_source = response.xpath(
        '//script[contains(., "var data =")]/text()').extract_first()
    # Wrap the js2xml tree in a Selector so it can be queried like a page.
    js_selector = scrapy.Selector(_root=js2xml.parse(js_source))
    for entry in js_selector.xpath('//var[@name="data"]/array/object'):
        quote_text = entry.xpath(
            'string(./property[@name="text"])').extract_first()
        author_name = entry.xpath(
            'string(./property[@name="author"]//property[@name="name"])'
        ).extract_first()
        tag_names = entry.xpath(
            './property[@name="tags"]//string/text()').extract()
        yield {
            'texto': quote_text,
            'autor': author_name,
            'tags': tag_names,
        }

    next_href = response.css('li.next a::attr("href")').extract_first()
    if next_href:
        yield scrapy.Request(response.urljoin(next_href))
def test_parse_number():
    """Compare the ``value`` attribute of every ``<number>`` node with the expected strings."""
    cases = [
        (r""" var i = 3; """, [r'3']),
        (r""" var i = -3.14; """, [r"-3.14"]),
    ]
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        values = tree.xpath("//number/@value")
        assert_list_equal(values, expected)
def test_parse_url():
    """URL strings, with or without escaped slashes, yield the same ``<string>`` text."""
    cases = [
        (r""" var i = 'http://www.example.com'; """, [r'http://www.example.com']),
        (r""" var i = 'http:\/\/www.example.com'; """, [r"http://www.example.com"]),
    ]
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        strings = tree.xpath("//string/text()")
        assert_list_equal(strings, expected)
def test_parse():
    """Every snippet must parse to something other than ``None``."""
    sources = [
        r""" var i = 0; """,
        r""" document.write("\n"); """,
        r""" var t1 = "nested \"quote\"."; var t2 = 'nested \'quote\'.'; var t3 = 'nested \"quote\".'; var t2 = "nested \'quote\'."; """,
    ]
    for js_source in sources:
        tree = js2xml.parse(js_source)
        assert_is_not_none(tree)
def test_parse_encoding():
    """Parse text input and latin1-encoded bytes (with an explicit encoding) to the same string."""
    cases = [
        (
            u""" var test = "Daniel Gra\xf1a"; """,
            None,
            [u"Daniel Gra\xf1a"],
        ),
        (
            u""" var test = "Daniel Gra\xf1a"; """.encode("latin1"),
            "latin1",
            [u"Daniel Gra\xf1a"],
        ),
    ]
    for js_source, source_encoding, expected in cases:
        tree = js2xml.parse(js_source, encoding=source_encoding)
        strings = tree.xpath("//string/text()")
        assert_equal(strings, expected)
def test_parse_undefined():
    """Count the ``<undefined>`` children generated for array elisions."""
    cases = [
        (r""" myArray = [0,1,,,4,5]; """, 2),
        (r""" myArray = [,1,,,4,]; """, 3),  # and not 4
        (r""" myArray = [,1,,,4,,,]; """, 5),
    ]
    for js_source, expected_count in cases:
        tree = js2xml.parse(js_source)
        undefined_count = tree.xpath("count(//array/undefined)")
        assert_equal(undefined_count, expected_count)
def test_json():
    """Check ``jsonlike.findall`` + ``make_dict`` and ``jsonlike.getall`` on the same cases.

    The first pass converts each ``findall`` match with ``make_dict``;
    the second pass calls ``getall`` directly. Both must produce the
    expected list for every snippet.
    """
    cases = [
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            [['a', 'b', 'c'], ['d', 'e', 'f']],
        ),
        (
            r""" var arr1 = ["a", null, "c"]; var arr2 = [null, "e", null]; """,
            [['a', None, 'c'], [None, 'e', None]],
        ),
        (
            r""" var arr1 = ["a", undefined, "c"]; var arr2 = [undefined, "e", null]; """,
            [['a', 'undefined', 'c'], ['undefined', 'e', None]],
        ),
        (
            r""" var i = -3.14; """,
            [],
        ),
        (
            r""" money = { 'quarters': 20 }; """,
            [{"quarters": 20}],
        ),
        (
            r""" money = { 'quarters': 10, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [],
        ),
        (
            r""" var money = { 'quarters': 10, 'something': [1,2,3,4], 'somethingelse': {'nested': [5,6,7,8]}, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [[1, 2, 3, 4], {'nested': [5, 6, 7, 8]}],
        ),
        (
            r""" var store = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; """,
            [{'apples': 10, 'carrots': [1, 2, 3, 4], 'chicken': {'eggs': [5, 6, 7, 8]}}],
        ),
        (
            r""" var store1 = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; var store2 = { 'tomatoes': 20, 'potatoes': [9, false, 7, 6], 'spinach': {'cans': [true, 2]} }; """,
            [
                {'apples': 10, 'carrots': [1, 2, 3, 4], 'chicken': {'eggs': [5, 6, 7, 8]}},
                {'potatoes': [9, False, 7, 6], 'spinach': {'cans': [True, 2]}, 'tomatoes': 20},
            ],
        ),
    ]

    # Pass 1: findall() followed by make_dict() on each match.
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        matches = js2xml.jsonlike.findall(tree)
        built = [js2xml.jsonlike.make_dict(match) for match in matches]
        assert_list_equal(built, expected)

    # Pass 2: getall() in a single call.
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        extracted = js2xml.jsonlike.getall(tree)
        assert_list_equal(extracted, expected)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Parse each sample JavaScript file found next to this script with js2xml.
import js2xml
import os

TEST_DIR = os.path.dirname(__file__)

# Sample paths are resolved relative to this file's directory.
files = [
    os.path.join(TEST_DIR, relative_path)
    for relative_path in (
        'samples/fullcalendar.js',
        'samples/fullcalendar.min.js',
        'samples/jquery.min.js',
    )
]

for filename in files:
    with open(filename) as f:
        jscode = f.read()
    tree = js2xml.parse(jscode)
def market_value_historic_pull(base_url, player_id):
    """Build a DataFrame with a player's market-value history.

    The "marktwertverlauf" variant of *base_url* is fetched, the inline
    script matching ``Highcharts.Chart`` is parsed with js2xml, and the
    ``age``, ``verein``, ``mw`` and ``datum_mw`` properties found under
    ``//array//object//property`` are collected into columns.

    Args:
        base_url: profile URL containing ``"profil"``; that substring is
            replaced by ``"marktwertverlauf"``.
        player_id: value written to the ``player_id`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with columns ``club``, ``value``,
        ``data_date``, ``age`` and ``player_id``. When no matching script
        is found, a single row of ``None`` values (plus ``player_id``).
    """
    mv_soup = get_souped_page(base_url.replace("profil", "marktwertverlauf"))
    # Look the script up once instead of searching the page twice.
    chart_script = mv_soup.find("script", text=re.compile("Highcharts.Chart"))
    if chart_script is None:
        market_value_history = pd.DataFrame(
            {'club': [None], 'value': [None], 'data_date': [None], 'age': [None]})
        market_value_history['player_id'] = player_id
        return market_value_history

    parsed = js2xml.parse(chart_script.text)
    age_list = []
    club_list = []
    mv_list = []
    date_of_value_list = []

    # Evaluate the XPath once; the original re-ran it several times per
    # iteration, which made the loop quadratic in the number of properties.
    for prop in parsed.xpath('//array//object//property'):
        name = prop.get('name')
        if name == 'age':
            age = int(stringify_children(prop).split("number value=")[1].split("/")[0][1:][:-1])
            age_list.append(age)
        elif name == 'verein':
            club = stringify_children(prop).split("<string>")[1].split("</string>")[0].lower()
            club_list.append(club)
        elif name == 'mw':
            raw_value = stringify_children(prop).split("<string>")[1].split("</string>")[0].replace("€", "")
            if "m" in raw_value:
                # Millions, e.g. "1.5m".
                raw_value = int(float(raw_value.strip().replace("m", "")) * 1000000)
            elif "Th." in raw_value:
                # Thousands, e.g. "500Th.".
                raw_value = int(raw_value.strip().replace("Th.", "")) * 1000
            else:
                # The original spelled this branch ``elif "-":``, which is
                # always true: every other value (including "-") maps to 0.
                raw_value = 0
            mv_list.append(raw_value)
        elif name == 'datum_mw':
            # presumably formatted like "Jan 1, 2015" — TODO confirm
            date_raw = stringify_children(prop).split("<string>")[1].split("</string>")[0]
            year = int(date_raw[len(date_raw) - 4:])
            month = month_to_number(date_raw.split(" ")[0])
            day = int(date_raw.split(" ")[1].split(",")[0])
            date_of_value_list.append(datetime.date(year, month, day))

    market_value_history = pd.DataFrame(
        {'club': club_list, 'value': mv_list, 'data_date': date_of_value_list, 'age': age_list})
    market_value_history['player_id'] = player_id
    return market_value_history
def parse_product(self, response):
    """Build an ``LVRItem`` from a product page.

    Metadata comes from the page's ``og:`` meta tags and breadcrumb
    microdata; the remaining fields come from the ``itemResponse``
    object assigned in an inline script, converted with
    ``js2xml.jsonlike.make_dict``.
    """
    doc, tag, text = Doc().tagtext()
    product_item = LVRItem()
    image_url_prefix = 'http://images.luisaviaroma.com/Big'

    # Use the URL advertised by the page itself rather than the request
    # URL, in case the two differ.
    product_item['url'] = response.xpath(
        '/html/head/meta[@property="og:url"]/@content').extract_first()

    # The first (main) image also comes from the html metadata.
    main_photo = response.xpath(
        '/html/head/meta[@property="og:image"]/@content').extract_first()
    product_item['photos'] = [main_photo]
    assert product_item['photos'][0].startswith(image_url_prefix)

    # Breadcrumb names from the ordered-list microdata; joined later to
    # form the category.
    breadcrumbs = response.xpath(
        '//ol[@itemtype="http://schema.org/BreadcrumbList"]'
        '/li/a/span[@itemprop="name"]/text()').extract()

    # Locate the script holding the product json and keep only the
    # right-hand side of the ``itemResponse`` assignment.
    script = response.xpath(
        '//script[contains(., "itemResponse")]/text()').extract_first()
    script_element = js2xml.parse(script)
    item_elements = script_element.xpath(
        '//assign[left/identifier[@name="itemResponse"]]/right/*')
    item_dict = js2xml.jsonlike.make_dict(item_elements[0])
    assert item_dict['HasValidDefaultPrice']

    designer_name = item_dict['Designer']['Description']
    short_description = item_dict['ShortDescription']
    item_code = item_dict['ItemKey']['ItemCode']

    # Title is "<designer> - <short description>".
    product_item['title'] = u'{} - {}'.format(designer_name, short_description)

    # Description: item code, the '|'-separated long description and,
    # when present, the composition, rendered as an html list.
    desc_items = [u'ITEM CODE {}'.format(item_code)]
    desc_items.extend(item_dict['LongtDescription'].strip('|').split('|'))
    if item_dict['Composition']:
        desc_items.append(
            u'Composition: {}'.format(item_dict['Composition']))
    with tag('ul'):
        for desc_item in desc_items:
            with tag('li'):
                text(desc_item)
    product_item['description'] = doc.getvalue()

    first_price = item_dict['Pricing'][0]['Prices'][0]
    product_item['currency_code'] = first_price['CurrencyId']
    product_item['price'] = first_price['FinalPrice']

    # Keywords: breadcrumbs, designer, sku and the first five words of
    # the product name.
    name_words = short_description.split()[0:5]
    product_item['keywords'] = \
        breadcrumbs + [designer_name] + [item_code] + name_words

    # e.g. WOMEN, SHOES, SANDALS -> "WOMEN >> SHOES >> SANDALS"
    product_item['category'] = u' >> '.join(breadcrumbs)
    product_item['sku'] = item_code

    for photo in item_dict['ItemPhotos']:
        photo_url = u''.join([image_url_prefix, photo['Path']])
        # The main photo is repeated in this list; skip anything
        # already collected.
        if photo_url in product_item['photos']:
            continue
        product_item['photos'].append(photo_url)

    return product_item
# Select the inline <script type="text/javascript"> elements of the page
# body whose text matches "Chart".
chart_scripts = soep.body.find_all('script', type='text/javascript', text=re.compile("Chart"))
# One empty DataFrame per statistics source, with its columns taken from
# the corresponding *_Headers name.
Twitter_Stat = pd.DataFrame(columns=Twitter_Headers)
Telegram_Stat = pd.DataFrame(columns=Telegram_Headers)
Youtube_Stat = pd.DataFrame(columns=Youtube_Headers)
Reddit_Stat = pd.DataFrame(columns=Reddit_Headers)
Github_Stat = pd.DataFrame(columns=Github_Headers)
Facebook_Stat = pd.DataFrame(columns=Facebook_Headers)
BitcoinTalk_Stat = pd.DataFrame(columns=BitcoinTalk_Headers)
Alexa_Stat = pd.DataFrame(columns=Alexa_Headers)
for chart in chart_scripts:
    chart_data = chart.text
    # Turn the script's JavaScript into an XML tree that XPath can query.
    parsed = js2xml.parse(chart_data)
    # print(js2xml.pretty_print(parsed))
    # The first string argument found under a <var> identifies the chart.
    chart_name = parsed.xpath("//var//arguments//string/text()")[
        0]  # 'ic-twitter-stat'
    # print(chart_name)
    if chart_name == 'ic-twitter-stat':
        # The chart's labels array becomes the 'Date' column.
        for d in parsed.xpath(
                "//property[@name='data']//property[@name='labels']"):
            Twitter_Stat['Date'] = d.xpath(".//array/string/text()")
        # Each dataset object becomes one column, named after its label
        # and filled with the values of its numeric data array.
        for d in parsed.xpath(
                "//property[@name='datasets']//array//object"):
            variable = d.xpath(
                ".//property[@name='label']//string/text()")[0]
            Twitter_Stat[variable] = [
                d.xpath(".//property[@name='data']//array/number/@value")
            ][0]
def test_vars():
    """``get_vars`` maps each assigned name to its Python value.

    The snippet and the expected dict are passed as the assertion
    message so a failure shows which case broke.
    """
    cases = [
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            {'arr1': ['a', 'b', 'c'], 'arr2': ['d', 'e', 'f']},
        ),
        (
            r""" var arr1 = ["a", null, "c"]; var arr2 = [null, "e", null]; """,
            {'arr1': ['a', None, 'c'], 'arr2': [None, 'e', None]},
        ),
        (
            r""" var arr1 = ["a", undefined, "c"]; var arr2 = [undefined, "e", null]; """,
            {'arr1': ['a', 'undefined', 'c'], 'arr2': ['undefined', 'e', None]},
        ),
        (
            r""" var i = -3.14; """,
            {'i': -3.14},
        ),
        (
            r""" money = { 'quarters': 20 }; """,
            {'money': {"quarters": 20}},
        ),
        (
            r""" money = { quarters: 20 }; """,
            {'money': {"quarters": 20}},
        ),
        (
            r""" currency = 'USD'; money = { "value": 20, "currency": currency }; """,
            {'currency': 'USD', 'money': {'currency': 'currency', 'value': 20}},
        ),
        (
            r""" t = {a: "3", "b": 3, "3": 3.0}; """,
            {'t': {'3': 3.0, 'a': '3', 'b': 3}},
        ),
        (
            r""" money = { 'quarters': 10, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            {'money': {'quarters': 10, 'addQuarters': None}},
        ),
        (
            r""" var money = { 'quarters': 10, 'something': [1,2,3,4], 'somethingelse': {'nested': [5,6,7,8]}, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            {'money': {'quarters': 10,
                       'addQuarters': None,
                       'something': [1, 2, 3, 4],
                       'somethingelse': {'nested': [5, 6, 7, 8]}}},
        ),
        (
            r""" var store = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; """,
            {'store': {'apples': 10,
                       'carrots': [1, 2, 3, 4],
                       'chicken': {'eggs': [5, 6, 7, 8]}}},
        ),
        (
            r""" var store1 = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; var store2 = { 'tomatoes': 20, 'potatoes': [9, false, 7, 6], 'spinach': {'cans': [true, 2]} }; """,
            {'store1': {'apples': 10,
                        'carrots': [1, 2, 3, 4],
                        'chicken': {'eggs': [5, 6, 7, 8]}},
             'store2': {'potatoes': [9, False, 7, 6],
                        'spinach': {'cans': [True, 2]},
                        'tomatoes': 20}},
        ),
    ]
    for js_source, expected in cases:
        parsed_tree = parse(js_source)
        variables = get_vars(parsed_tree)
        assert_dict_equal(variables, expected, (js_source, expected))
# Walk 20000 consecutive video ids: download each page, convert one of its
# inline scripts to XML with js2xml and hand the result to select().
for i in range(20000):
    x += 1
    avid += 1
    url = "https://www.bilibili.com/video/av" + str(avid)
    try:
        html = requests.get(url, headers=headers)
    except Exception:
        # ``except Exception`` (not a bare ``except``) so Ctrl-C still
        # stops the run.
        print("html request error "+str(avid)+' '+str(datetime.datetime.now()))
        # No fresh response for this id: skip it rather than fall through
        # to an undefined ``html`` or the previous iteration's page.
        continue
    bs4Obj = BeautifulSoup(html.text, 'lxml')
    srcScript = bs4Obj.select("script")
    if len(srcScript) > 4:
        try:
            # The 4th <script> is the one converted — presumably where the
            # video data lives; verify against the page.
            srcElement = js2xml.parse(srcScript[3].string, encoding='utf-8', debug=False)
            src_tree = js2xml.pretty_print(srcElement)
            h1 = BeautifulSoup(src_tree, "lxml")
            select(h1, x)
            # Random pause of up to st1 seconds after a success.
            time.sleep(random.random()*st1)
            countErr = 0
            lastTime = str(datetime.datetime.now())
            print("success: av"+str(avid))
        except Exception:
            # Random pause of up to st2 seconds after a failure; after more
            # than 100 consecutive failures pause 5 minutes and reset.
            time.sleep(random.random()*st2)
            countErr += 1
            if countErr > 100:
                time.sleep(300)
                countErr = 0
            print('select error: '+str(avid)+" CountErr: "+str(countErr)+" now "+str(datetime.datetime.now())+' last time '+lastTime)
def test_syntax():
    """Every JavaScript construct listed below must parse to a non-``None`` tree.

    Each list entry is parsed on its own, so every entry must end with a
    comma: two adjacent string literals without one are silently
    concatenated by Python into a single snippet.
    """
    jscode_snippets = [
        # strings
        r""" "test"; """,
        r""" "test\ multiline"; """,

        # numbers
        "3.14;",
        "-12;",
        "3.45e2;",
        "0377;",
        "0xFF;",  # comma was missing: this used to merge with "[]" below

        # arrays
        "[]",
        "[1,2]",
        "[1,,2]",
        "[1,,2,,3,]",
        "['a', 'b','c']",
        "[a, 'b', c]",

        # objects
        "o = {};",
        "o = {a: 1};",
        "o = {a: 1, b: 2};",
        "o = {'c': 1, 'd': 2};",
        'o = {"c": 1, "d": 2};',
        'o = {"c": 1, d: "e"};',
        "e = {foo: 5, bar: 6, baz: ['Baz', 'Content']};",
        "e = {1: a, 2: b};",

        # other primitive data types
        "null;",
        "undefined;",
        "true;",
        "false;",

        # variables
        r""" var i; """,
        r""" var i,j,k; """,
        r""" var i = 0; """,
        r""" var i = "test"; """,
        r"""var z = 'foxes', r = 'birds';""",
        r""" var i, j, k = 0; """,
        r""" var i=1, j, k = 2; """,
        r""" var i = obj.prop; """,
        r"""var testObj = {};""",
        r"""var testObj = [];""",

        # assignments
        r""" i = b; """,
        r""" i.a = "b"; """,
        r""" i["a"] = "b"; """,
        r""" i[a] = "b"; """,

        # control structures
        r""" if (condition) { result = expression; };""",
        r""" if (condition) { result = expression; } else { result = alternative; };""",
        r""" if (exprA == exprB) { result = expression; } else if (expr2) { result = alternative1; } else { result = alternative2; };""",
        "result = condition ? expression : alternative;",

        # switch
        # (each JavaScript ``//`` comment must end at a line break)
        r"""
        switch (expr) {
            case SOMEVALUE:
                //statements;
                break;
            case ANOTHERVALUE:
                //statements;
                break;
            default:
                //statements;
                break;
        }
        """,  # comma was missing: this used to merge with the first for loop

        # for loop
        r""" for (var i = 0; i < 5; i++) { a = i; } """,
        r""" for (var i = 0; i < 5; i++) { a = i } """,
        r""" for (var key in array) { continue; } """,
        r""" for (;;) { break; } """,
        r""" for (; i < len; i++) { text += cars[i] + "<br>"; } """,
        r""" for (var i = 0, len = cars.length, text = ""; i < len; i++) { text += cars[i] + "<br>"; } """,
        """ for (; i < len; ) { text += cars[i] + "<br>"; i++; } """,

        # while loop
        """ while (a<b) { a+=1; } """,
        """ do { a+=1; } while (a<b); """,

        # with
        """ with (document) { var a = getElementById('a'); var b = getElementById('b'); var c = getElementById('c'); }; """,

        # label
        r"""
        loop1:
        for (var a = 0; a < 10; a++) {
            if (a == 4) {
                break loop1; // Stops after the 4th attempt
            }
            alert('a = ' + a);
            loop2:
            for (var b = 0; b < 10; ++b) {
                if (b == 3) {
                    continue loop2; // Number 3 is skipped
                }
                if (b == 6) {
                    continue loop1; // Continues the first loop, 'finished' is not shown
                }
                alert('b = ' + b);
            }
            alert('finished')
        }
        block1: {
            alert('hello'); // Displays 'hello'
            break block1;
            alert('world'); // Will never get here
        }
        """,

        # functions
        """ function foo(p) { p = "bar"; } """,
        """ function hello() { alert('world'); } """,
        """ var x = function(y) { return y * y; }; """,
        """ var math = { 'factorial': function factorial(n) { if (n <= 1) return 1; return n * factorial(n - 1); } }; """,
        """ var anon = function() { alert('I am anonymous'); }; """,
        """ anon(); """,
        """ setTimeout(function() { alert('hello'); }, 1000) """,
        """ (function() { alert('foo'); }()); """,

        # get/set
        """ var obj = { get latest () { return "latest"; } } """,
        """ delete obj.latest; """,
        """ var o = { set current (str) { return this.log[this.log.length] = str; }, log: [] } """,

        # new
        """var mycar = new car("Eagle", "Talon TSi", 1993);""",

        # try / catch
        """
        try {
            throw "myException"; // generates an exception
        }
        catch (e) {
            // statements to handle any exceptions
            logMyErrors(e); // pass exception object to error handler
        }
        """,
        """ try { addalert("bad call"); } catch(e) { document.write ("Error Message: " + e.message); document.write ("<br />"); document.write ("Error Code: "); document.write (e.number & 0xFFFF); document.write ("<br />"); document.write ("Error Name: " + e.name); } """,
        """ try { document.write("Outer try running...<br/>"); try { document.write("Nested try running...<br/>"); throw new Error(301, "an error"); } catch (e) { document.write ("Nested catch caught " + e.message + "<br/>"); throw e; } finally { document.write ("Nested finally is running...<br/>"); } } catch (e) { document.write ("Outer catch caught " + e.message + "<br/>"); } finally { document.write ("Outer finally running"); } """,
    ]
    for snippet in jscode_snippets:
        assert_is_not_none(js2xml.parse(snippet))
def test_json():
    """``getall`` restricted to dicts and lists returns the expected values for each snippet."""
    cases = [
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            [['a', 'b', 'c'], ['d', 'e', 'f']],
        ),
        (
            r""" var arr1 = ["a", null, "c"]; var arr2 = [null, "e", null]; """,
            [['a', None, 'c'], [None, 'e', None]],
        ),
        (
            r""" var arr1 = ["a", undefined, "c"]; var arr2 = [undefined, "e", null]; """,
            [['a', 'undefined', 'c'], ['undefined', 'e', None]],
        ),
        (
            r""" var i = -3.14; """,
            [],
        ),
        (
            r""" money = { 'quarters': 20 }; """,
            [{"quarters": 20}],
        ),
        (
            r""" money = { quarters: 20 }; """,
            [{"quarters": 20}],
        ),
        (
            r""" currency = 'USD', money = { "value": 20, "currency": currency }; """,
            [{'currency': 'currency', 'value': 20}],
        ),
        (
            r""" t = {a: "3", "b": 3, "3": 3.0}; """,
            [{'3': 3.0, 'a': '3', 'b': 3}],
        ),
        (
            r""" money = { 'quarters': 10, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [],
        ),
        (
            r""" var money = { 'quarters': 10, 'something': [1,2,3,4], 'somethingelse': {'nested': [5,6,7,8]}, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [[1, 2, 3, 4], {'nested': [5, 6, 7, 8]}],
        ),
        (
            r""" var store = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; """,
            [{'apples': 10, 'carrots': [1, 2, 3, 4], 'chicken': {'eggs': [5, 6, 7, 8]}}],
        ),
        (
            r""" var store1 = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; var store2 = { 'tomatoes': 20, 'potatoes': [9, false, 7, 6], 'spinach': {'cans': [true, 2]} }; """,
            [
                {'apples': 10, 'carrots': [1, 2, 3, 4], 'chicken': {'eggs': [5, 6, 7, 8]}},
                {'potatoes': [9, False, 7, 6], 'spinach': {'cans': [True, 2]}, 'tomatoes': 20},
            ],
        ),
    ]
    for js_source, expected in cases:
        tree = js2xml.parse(js_source)
        extracted = getall(tree, types=[dict, list])
        assert_list_equal(extracted, expected)
def parse(self, response):
    """Yield a dict for every object literal passed as a function-call argument.

    Only the first ``<script>`` text of the response is parsed.
    """
    first_script = response.xpath('//script/text()').extract_first()
    jstree = js2xml.parse(first_script)
    object_nodes = jstree.xpath('//functioncall/arguments/object')
    for node in object_nodes:
        item = js2xml.jsonlike.make_dict(node)
        yield item
def getst(url):
    """Fetch *url* and return the ``st`` value found in its inline JavaScript."""
    # Send the request with the module-level request headers.
    response = requests.get(url, headers=headers)

    # Take the first script mentioning 'weibo' and strip the HTML comment
    # marker below, so that its Chinese text does not cause an error.
    weibo_scripts = etree.HTML(response.content).xpath(
        "//script[contains(., 'weibo')]/text()")
    jscode = weibo_scripts[0].replace(u'<!--拆包页-->', '')

    # Convert the JavaScript code to XML with js2xml.
    parsed_js = js2xml.parse(jscode)

    # To inspect the XML:
    # print js2xml.pretty_print(parsed_js)
    #
    # Abridged sample of the printed tree:
    #
    #   <program>
    #     <var name="$config">
    #       <object>
    #         <property name="weibo"> <number value="0"/> </property>
    #         <property name="wechat"> <number value="0"/> </property>
    #         <property name="isLogin"> <number value="1"/> </property>
    #         <property name="st"> <string>dfd6e4</string> </property>
    #         <property name="ext"> <string>pay=1&unionPay=1</string> </property>
    #         <property name="cuid"> <number value="3485500247"/> </property>
    #         ...
    #       </object>
    #     </var>
    #     <if> ... </if>
    #     <var name="minVersion"> ... </var>
    #     <var name="scheme_protocol"> <string>sinaweibo://</string> </var>
    #     ...
    #   </program>
    #
    # The sample shows where ``st`` lives, so it can be addressed by XPath.
    st_values = parsed_js.xpath('//property[@name="st"]/string/text()')
    st = st_values[0]
    return st
def parse(self, response) -> Match:
    """Parse one odds feed response and chain to the next one.

    Bets for active bookmakers are appended to ``response.meta["bets"]``.
    While ``response.meta["bets_to_parse"]`` is non-empty, a request for
    the next feed is yielded with this method as callback; otherwise a
    ``Match`` built from ``response.meta['match']`` plus the collected
    bets is yielded.
    """
    tree = js2xml.parse(response.text)
    first_object = tree.xpath('//functioncall/arguments/object')[0]
    parsed_dict = js2xml.make_dict(first_object)

    betting_type_key = str(response.meta["betting_type_id"])
    bet_type = self.globals["betting_type_names"][betting_type_key]['name']
    scope_key = str(response.meta["scope_id"])
    bet_scope = self.globals["scope_names"][scope_key].replace(' ', ' ')

    bets = []
    for bet_l1 in parsed_dict.get('d', {}).get('oddsdata', []):
        # Bet Level 1: Back or Lay
        level_one = parsed_dict['d']['oddsdata'][bet_l1]
        for bet_l2 in level_one:
            # Bet Level 2: Odds, volume, movement and bet information
            bet_info = level_one[bet_l2]
            for bookmaker_id in bet_info['odds']:
                if not bet_info['act'][bookmaker_id]:
                    # Skip bookmakers flagged as inactive.
                    continue

                odds = bet_info['odds'][bookmaker_id]
                if type(odds) == dict:
                    if bet_type == "1X2":
                        odds = [
                            odds[odd_type] for odd_type in ('1', '2', 'X')
                            if odd_type in odds
                        ]
                    else:
                        odds = list(odds.values())

                if "handicapValue" in bet_info:
                    handicap = float(bet_info['handicapValue'])
                else:
                    handicap = None

                bookmaker = self.bookmakers_data[bookmaker_id]
                bet_dict = {
                    "bookmaker": bookmaker['WebUrl'],
                    "bookmaker_nice": bookmaker['WebName'],
                    "feed": self.name,
                    "date_extracted": datetime.utcnow(),
                    "bet_type": bet_type,
                    "bet_scope": bet_scope,
                    "odds": odds,
                    "url": response.url,
                    "is_back": bet_info['isBack'],
                    "handicap": handicap,
                }
                bets.append(Bet(**bet_dict))

    logging.info(
        f"Parsed {len(bets)} bets of type {bet_type} and scope {bet_scope}. Url: {response.url}. "
        f"Remaining bets of match to parse: {len(response.meta['bets_to_parse'])}"
    )
    response.meta["bets"].extend(bets)

    if not response.meta["bets_to_parse"]:
        # Nothing left to fetch: emit the finished match.
        match = dict(response.meta['match'])
        match['bets'] = response.meta["bets"]
        logging.info(f"Finished parsing {len(response.meta['bets'])} bets "
                     f"from match with URL {response.meta['match_url']}.")
        yield Match(**match)
    else:
        odds_url, betting_type_id, scope_id = \
            response.meta["bets_to_parse"].pop()
        response.meta['betting_type_id'] = betting_type_id
        response.meta["scope_id"] = scope_id
        request_headers = {
            'user-agent': self.user_agent,
            'referer': response.url,
        }
        yield scrapy.Request(url=urljoin(self.odds_main_url, odds_url),
                             callback=self.parse,
                             headers=request_headers,
                             meta=response.meta)
def parse_match(self, response):
    """Read a match page and request its default odds feed.

    The ``page`` object created with ``new PageEvent(...)`` in an inline
    script is converted to a dict, the match description is stored in
    ``response.meta``, and a request for the default betting type/scope
    feed is yielded with ``self.parse_first`` as callback.
    """
    # Get page info
    javascript = response.xpath(
        "//script[contains(text(),'new PageEvent')]/text()").get()
    tree = js2xml.parse(javascript)
    page_object = tree.xpath('//var[@name="page"]/new/arguments/object')[0]
    page_info = js2xml.make_dict(page_object)
    # Both hashes are stored percent-encoded in the page.
    for hash_key in ('xhash', 'xhashf'):
        page_info[hash_key] = urllib.parse.unquote(page_info[hash_key])
    response.meta['page_info'] = page_info

    # The commence time is taken from the digits after "t" in the class
    # attribute of the date paragraph.
    date_class = response.xpath(
        '//p[contains(@class,"date datet")]').attrib['class']
    commence_time = int(re.search(r't(\d*)-', date_class).group(1))
    response.meta['match'] = {
        'sport': response.meta['sport'],
        'tournament': response.meta['tournament'],
        'tournament_nice':
        self.tournament_urls[response.meta['tournament_url']],
        'teams': [page_info["home"], page_info["away"]],
        'country': response.meta["country"],
        'commence_time': commence_time,
        'url': response.url
    }

    # Get default betting type and scope ID
    cons = self.globals['cons']
    sport_id = str(page_info['sportId'])
    if cons['moneyLineSports'].get(page_info['sportId']):
        betting_type_id = '3'
    else:
        betting_type_id = '1'

    # Scope lookup, most specific first: sport + bet type, then bet
    # type, then sport; 2 when none of them provides a value.
    scope_id = 2
    sport_bet_scopes = cons['sportBetTypeScopeId'].get(sport_id)
    if sport_bet_scopes and cons['sportBetTypeScopeId'][sport_id].get(
            betting_type_id):
        scope_id = cons['sportBetTypeScopeId'][sport_id].get(betting_type_id)
    elif cons['betTypeScopeId'].get(betting_type_id):
        scope_id = cons['betTypeScopeId'].get(betting_type_id)
    elif cons['sportScopeId'].get(sport_id):
        scope_id = cons['sportScopeId'].get(sport_id)

    odds_url = (
        f'/feed/match/{page_info["versionId"]}-{sport_id}-{page_info["id"]}'
        f'-{betting_type_id}-{scope_id}-{page_info["xhash"]}.dat')

    response.meta['first'] = True
    response.meta['betting_type_id'] = betting_type_id
    response.meta["scope_id"] = scope_id
    response.meta["match_url"] = response.url

    request_headers = {
        'user-agent': self.user_agent,
        'referer': response.url
    }
    yield scrapy.Request(url=urljoin(self.odds_main_url, odds_url),
                         callback=self.parse_first,
                         headers=request_headers,
                         meta=response.meta)
# Table-driven test: each entry pairs a JavaScript snippet with the list of
# string values expected from the "//string/text()" XPath over the tree that
# js2xml.parse() returns for it; assert_list_equal compares the two lists.
# NOTE(review): the line breaks of this block appear to have been collapsed, so
# the inline "#" comments and the backslash line continuations inside the JS
# literals no longer sit on their own lines - restore the layout before running.
def test_parse_string(): jscode_snippets = [ ( r""" var h = 'test'; var i = "test"; var j = ""; var k = '""'; var l = '"'; var m = ''; var n = "''"; var o = "'"; """, ['test', 'test', '', '""', '"', '', "''", "'"] ), ( r""" var i = 'test\'s output'; """, [r"test's output"] ), ( r""" var i = 'test\ multiline'; """, [r"test multiline"] ), ( r""" var i = 'test\ long \ multiline'; """, [r"test long multiline"] ), ( r""" var i = ["\"", '\'']; var j = "test\'s output"; var k = "test\\'s output"; var l = "nested \"quotes\"."; """, ['"', "'", r"test's output", r"test\'s output", r'nested "quotes".'] ), ( r""" var i = 'https://www.blogger.com/navbar.g?targetBlogID\0754325487278375417853\46blogName\75spirello\46publishMode\75PUBLISH_MODE_BLOGSPOT\46navbarType\75LIGHT\46layoutType\75LAYOUTS\46searchRoot\75http://spirelloskrimskramserier.blogspot.com/search\46blogLocale\75no\46v\0752\46homepageUrl\75http://spirelloskrimskramserier.blogspot.com/\46vt\0751357383140196484672'; """, [r'https://www.blogger.com/navbar.g?targetBlogID=4325487278375417853&blogName=spirello&publishMode=PUBLISH_MODE_BLOGSPOT&navbarType=LIGHT&layoutType=LAYOUTS&searchRoot=http://spirelloskrimskramserier.blogspot.com/search&blogLocale=no&v=2&homepageUrl=http://spirelloskrimskramserier.blogspot.com/&vt=1357383140196484672'] ), ( r""" var i = "foo \ bar"; var j = "foo \ bar"; """, [r'foo bar', 'foo bar'] ), ( # testing Unicode literals b""" var x = "\\u00A9 Netscape Communications 1"; """, [u'\u00a9 Netscape Communications 1'] ), ( # testing Unicode characters u""" var x = "\u00A9 Netscape Communications 2"; """.encode("utf8"), [u'\u00a9 Netscape Communications 2'] ), # a real example ( r""" var needleParam = needleParam || {}; needleParam.chatGroup = "test"; needleParam.productId = "6341292"; needleParam.productPrice = "EUR 138.53".replace("$","n_").replace(/,/g,""); //Begin Needle (fan-sourcing platform) snippet jQuery(document).ready(function(){ var e = document.createElement("script"); e.type = 
"text/javascript"; e.async = true; e.src = document.location.protocol + "//overstock.needle.com/needle_service.js?1"; document.body.appendChild(e); }); // End Needle snippet """, ['test', '6341292', 'EUR 138.53', '$', 'n_', '', 'script', 'text/javascript', '//overstock.needle.com/needle_service.js?1'] ), # test replacing some control characters ( r""" var name = "\u13e9\u0352\u0362\u044f\u2778\u00b3\u1d43\u034e\u034e\u0442\u035b\u13b7\u0362\u033b\u1d51A\u0362\u13de\u0001\u0001\u277c00b"; """, [u'\u13e9\u0352\u0362\u044f\u2778\xb3\u1d43\u034e\u034e\u0442\u035b\u13b7\u0362\u033b\u1d51A\u0362\u13de\ufffd\ufffd\u277c00b'] ), # surrogate pairs (r'''var name = "\ud835\udebd"''', [u'\U0001d6bd']), ] for snippet, expected in jscode_snippets: jsxml = js2xml.parse(snippet) result = jsxml.xpath("//string/text()") assert_list_equal(result, expected)
def getst(url):
    """Fetch *url* and return the ``st`` value found in its inline script.

    The page is requested with ``headers`` (a name taken from the enclosing
    scope, not defined here), the first ``<script>`` whose content mentions
    ``weibo`` is converted to XML with js2xml, and the text of the ``st``
    property is returned.
    """
    # Send the request with the request headers attached.
    page = requests.get(url, headers=headers)

    # Take the first matching JavaScript block and strip the
    # "<!--拆包页-->" marker so the Chinese text does not cause an error.
    script_texts = etree.HTML(page.content).xpath(
        "//script[contains(., 'weibo')]/text()")
    jscode = script_texts[0].replace(u'<!--拆包页-->', '')

    # Use js2xml to turn the JavaScript code into an XML tree.
    parsed_js = js2xml.parse(jscode)

    # js2xml.pretty_print(parsed_js) shows where "st" lives; the relevant
    # part of the printed tree looks like this:
    #
    #   <program>
    #     <var name="$config">
    #       <object>
    #         ...
    #         <property name="st">
    #           <string>dfd6e4</string>
    #         </property>
    #         ...
    #       </object>
    #     </var>
    #     ...
    #   </program>
    #
    # which gives the XPath used below.
    st_values = parsed_js.xpath('//property[@name="st"]/string/text()')
    return st_values[0]
# Table-driven schema test: each entry pairs a JavaScript snippet with the XML
# text that js2xml.pretty_print(js2xml.parse(snippet)) is expected to produce;
# both sides are stripped before assert_equal compares them, and the snippet is
# printed first (Python 2 `print` statements).
# NOTE(review): the line breaks of this block appear to have been collapsed, and
# values such as operation="<" and <string><br></string> are presumably
# entity-escaped in the real expected output - confirm against the original file.
def test_schema(): jscode_snippets = [ # strings ( r""" "test"; """, """ <program> <string>test</string> </program> """ ), ( r""" "test\ multiline"; """, """ <program> <string>test multiline</string> </program> """ ), # numbers ( "3.14;", """ <program> <number value="3.14"/> </program> """ ), ( "-12;", """ <program> <number value="-12"/> </program> """ ), ( "3.45e2;", """ <program> <number value="3.45e2"/> </program> """ ), ( "0377;", """ <program> <number value="0377"/> </program> """ ), ( "0xFF;", """ <program> <number value="0xFF"/> </program> """ ), # arrays ( "[]", """ <program> <array/> </program> """ ), ( "[1,2]", """ <program> <array> <number value="1"/> <number value="2"/> </array> </program> """ ), ( "[1,,2]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> </array> </program> """ ), ( "[1,,2,,,3,]", """ <program> <array> <number value="1"/> <undefined/> <number value="2"/> <undefined/> <undefined/> <number value="3"/> </array> </program> """ ), ( "['a', 'b','c']", """ <program> <array> <string>a</string> <string>b</string> <string>c</string> </array> </program> """ ), ( "[a, 'b', c]", """ <program> <array> <identifier name="a"/> <string>b</string> <identifier name="c"/> </array> </program> """ ), # objects ( "o = {};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object/> </right> </assign> </program> """ ), ( "o = {a: 1};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> </object> </right> </assign> </program> """ ), ( "o = {a: 1, b: 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="a"> <number value="1"/> </property> <property name="b"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( "o = {'c': 1, 'd': 2};", """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> 
<property name="c"> <number value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( 'o = {"c": 1, "d": 2};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <number value="2"/> </property> </object> </right> </assign> </program> """ ), ( 'o = {"c": 1, d: "e"};', """ <program> <assign operator="="> <left> <identifier name="o"/> </left> <right> <object> <property name="c"> <number value="1"/> </property> <property name="d"> <string>e</string> </property> </object> </right> </assign> </program> """ ), ( "e = {foo: 5, bar: 6, baz: ['Baz', 'Content']};", """ <program> <assign operator="="> <left> <identifier name="e"/> </left> <right> <object> <property name="foo"> <number value="5"/> </property> <property name="bar"> <number value="6"/> </property> <property name="baz"> <array> <string>Baz</string> <string>Content</string> </array> </property> </object> </right> </assign> </program> """ ), # other primitive data types ( "null;", """ <program> <null/> </program> """ ), ( "undefined;", """ <program> <undefined/> </program> """ ), ( "true;", """ <program> <boolean>true</boolean> </program> """ ), ( "false;", """ <program> <boolean>false</boolean> </program> """ ), # variables ( r""" var i; """, """ <program> <var name="i"/> </program> """ ), ( r""" var i,j,k; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"/> </program> """ ), ( r""" var i = 0; """, """ <program> <var name="i"> <number value="0"/> </var> </program> """ ), ( r""" var i = "test"; """, """ <program> <var name="i"> <string>test</string> </var> </program> """ ), ( r"""var z = 'foxes', r = 'birds';""", """ <program> <var name="z"> <string>foxes</string> </var> <var name="r"> <string>birds</string> </var> </program> """ ), ( r""" var i, j, k = 0; """, """ <program> <var name="i"/> <var name="j"/> <var name="k"> 
<number value="0"/> </var> </program> """ ), ( r""" var i=1, j, k = 2; """, """ <program> <var name="i"> <number value="1"/> </var> <var name="j"/> <var name="k"> <number value="2"/> </var> </program> """ ), ( r""" var i = obj.prop; """, """ <program> <var name="i"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="prop"/> </property> </dotaccessor> </var> </program> """ ), ( r"""var testObj = {};""", """ <program> <var name="testObj"> <object/> </var> </program> """ ), ( r"""var testObj = [];""", """ <program> <var name="testObj"> <array/> </var> </program> """ ), # operations ( r""" 1 + 2; "foo" + false; 3 - 5 """, """ <program> <binaryoperation operation="+"> <left> <number value="1"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="+"> <left> <string>foo</string> </left> <right> <boolean>false</boolean> </right> </binaryoperation> <binaryoperation operation="-"> <left> <number value="3"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """ ), ( r""" 1.0 / 2.0; -2 * 2; 12 % 5; """, """ <program> <binaryoperation operation="/"> <left> <number value="1.0"/> </left> <right> <number value="2.0"/> </right> </binaryoperation> <binaryoperation operation="*"> <left> <number value="-2"/> </left> <right> <number value="2"/> </right> </binaryoperation> <binaryoperation operation="%"> <left> <number value="12"/> </left> <right> <number value="5"/> </right> </binaryoperation> </program> """ ), ( r""" // Postfix var x = 3; y = x++; // y = 3, x = 4 // Prefix var a = 2; b = ++a; // a = 3, b = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="++"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="++"> <identifier name="a"/> 
</unaryoperation> </right> </assign> </program> """ ), ( r""" // Postfix var x = 3; y = x--; // y = 3, x = 2 // Prefix var a = 2; b = --a; // a = 1, b = 1 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <postfix operation="--"> <identifier name="x"/> </postfix> </right> </assign> <var name="a"> <number value="2"/> </var> <assign operator="="> <left> <identifier name="b"/> </left> <right> <unaryoperation operation="--"> <identifier name="a"/> </unaryoperation> </right> </assign> </program> """ ), ( r""" var x = 3; y = -x; // y = -3, x = 3 """, """ <program> <var name="x"> <number value="3"/> </var> <assign operator="="> <left> <identifier name="y"/> </left> <right> <unaryoperation operation="-"> <identifier name="x"/> </unaryoperation> </right> </assign> </program> """ ), ( r""" +3; // 3 +"3"; // 3 +true; // 1 +false; // 0 +null; // 0 """, """ <program> <number value="+3"/> <unaryoperation operation="+"> <string>3</string> </unaryoperation> <unaryoperation operation="+"> <boolean>true</boolean> </unaryoperation> <unaryoperation operation="+"> <boolean>false</boolean> </unaryoperation> <unaryoperation operation="+"> <null/> </unaryoperation> </program> """ ), # assignements ( r""" i = b; """, """ <program> <assign operator="="> <left> <identifier name="i"/> </left> <right> <identifier name="b"/> </right> </assign> </program> """ ), ( r""" i.a = "b"; """, """ <program> <assign operator="="> <left> <dotaccessor> <object> <identifier name="i"/> </object> <property> <identifier name="a"/> </property> </dotaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), ( r""" i["a"] = "b"; """, """ <program> <assign operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <string>a</string> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), ( r""" i[a] = "b"; """, """ <program> <assign 
operator="="> <left> <bracketaccessor> <object> <identifier name="i"/> </object> <property> <identifier name="a"/> </property> </bracketaccessor> </left> <right> <string>b</string> </right> </assign> </program> """ ), # control structures ( r""" if (condition) { result = expression; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> </if> </program> """ ), ( r""" if (condition) { result = expression; } else { result = alternative; }""", """ <program> <if> <predicate> <identifier name="condition"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative"/> </right> </assign> </block> </else> </if> </program> """ ), ( r""" if (exprA == exprB) { result = expression; } else if (expr2) { result = alternative1; } else { result = alternative2; }""", """ <program> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="exprA"/> </left> <right> <identifier name="exprB"/> </right> </binaryoperation> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="expression"/> </right> </assign> </block> </then> <else> <if> <predicate> <identifier name="expr2"/> </predicate> <then> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative1"/> </right> </assign> </block> </then> <else> <block> <assign operator="="> <left> <identifier name="result"/> </left> <right> <identifier name="alternative2"/> </right> </assign> </block> </else> </if> </else> </if> </program> """ ), ( "result = condition ? 
expression : alternative;", """ <program> <assign operator="="> <left> <identifier name="result"/> </left> <right> <conditional> <condition> <identifier name="condition"/> </condition> <value1> <identifier name="expression"/> </value1> <value2> <identifier name="alternative"/> </value2> </conditional> </right> </assign> </program> """ ), # switch ( r""" switch (expr) { case SOMEVALUE: //statements; break; case ANOTHERVALUE: //statements; break; default: //statements; break; } """, """ <program> <switch> <expression> <identifier name="expr"/> </expression> <case> <expression> <identifier name="SOMEVALUE"/> </expression> <break/> </case> <case> <expression> <identifier name="ANOTHERVALUE"/> </expression> <break/> </case> <default> <break/> </default> </switch> </program> """ ), # for loop ( r""" for (var i = 0; i < 5; i++) { a = i; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var i = 0; i < 5; i++) { a = i } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <number value="5"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="a"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var key in array) { continue; } """, """ <program> <forin> <variable> <var name="key"/> </variable> <object> <identifier 
name="array"/> </object> <statement> <block> <continue/> </block> </statement> </forin> </program> """ ), ( r""" for (;;) { break; } """, """ <program> <for> <statement> <block> <break/> </block> </statement> </for> </program> """ ), ( r""" for (; i < len; i++) { j = i; } """, """ <program> <for> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="="> <left> <identifier name="j"/> </left> <right> <identifier name="i"/> </right> </assign> </block> </statement> </for> </program> """ ), ( r""" for (var i = 0, len = cars.length, text = ""; i < len; i++) { text += cars[i] + "<br>"; } """, """ <program> <for> <init> <var name="i"> <number value="0"/> </var> <var name="len"> <dotaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="length"/> </property> </dotaccessor> </var> <var name="text"> <string></string> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="i"/> </postfix> </post> <statement> <block> <assign operator="+="> <left> <identifier name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> </block> </statement> </for> </program> """ ), ( """ for (; i < len; ) { text += cars[i] + "<br>"; i++; } """, """ <program> <for> <condition> <binaryoperation operation="<"> <left> <identifier name="i"/> </left> <right> <identifier name="len"/> </right> </binaryoperation> </condition> <statement> <block> <assign operator="+="> <left> <identifier 
name="text"/> </left> <right> <binaryoperation operation="+"> <left> <bracketaccessor> <object> <identifier name="cars"/> </object> <property> <identifier name="i"/> </property> </bracketaccessor> </left> <right> <string><br></string> </right> </binaryoperation> </right> </assign> <postfix operation="++"> <identifier name="i"/> </postfix> </block> </statement> </for> </program> """ ), # while loop ( """ while (a<b) { a+=1; } """, """ <program> <while> <predicate> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </predicate> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> </while> </program> """ ), ( """ do { a+=1; } while (a<b); """, """ <program> <statement> <block> <assign operator="+="> <left> <identifier name="a"/> </left> <right> <number value="1"/> </right> </assign> </block> </statement> <while> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <identifier name="b"/> </right> </binaryoperation> </while> </program> """ ), # with ( """ with (document) { var a = getElementById('a'); var b = getElementById('b'); var c = getElementById('c'); var c = document.get('c'); }; """, """ <program> <with> <identifier name="document"/> <statement> <block> <var name="a"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>a</string> </arguments> </functioncall> </var> <var name="b"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>b</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> <identifier name="getElementById"/> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> <var name="c"> <functioncall> <function> <dotaccessor> <object> <identifier name="document"/> </object> <property> <identifier name="get"/> </property> 
</dotaccessor> </function> <arguments> <string>c</string> </arguments> </functioncall> </var> </block> </statement> </with> <empty>;</empty> </program> """ ), # label ( r""" loop1: for (var a = 0; a < 10; a++) { if (a == 4) { break loop1; // Stops after the 4th attempt } alert('a = ' + a); loop2: for (var b = 0; b < 10; ++b) { if (b == 3) { continue loop2; // Number 3 is skipped } if (b == 6) { continue loop1; // Continues the first loop, 'finished' is not shown } alert('b = ' + b); } alert('finished') } block1: { alert('hello'); // Displays 'hello' break block1; alert('world'); // Will never get here } """, """ <program> <label name="loop1"> <statement> <for> <init> <var name="a"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="a"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <postfix operation="++"> <identifier name="a"/> </postfix> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="a"/> </left> <right> <number value="4"/> </right> </binaryoperation> </predicate> <then> <block> <break> <identifier name="loop1"/> </break> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>a = </string> </left> <right> <identifier name="a"/> </right> </binaryoperation> </arguments> </functioncall> <label name="loop2"> <statement> <for> <init> <var name="b"> <number value="0"/> </var> </init> <condition> <binaryoperation operation="<"> <left> <identifier name="b"/> </left> <right> <number value="10"/> </right> </binaryoperation> </condition> <post> <unaryoperation operation="++"> <identifier name="b"/> </unaryoperation> </post> <statement> <block> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> <right> <number value="3"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier 
name="loop2"/> </continue> </block> </then> </if> <if> <predicate> <binaryoperation operation="=="> <left> <identifier name="b"/> </left> <right> <number value="6"/> </right> </binaryoperation> </predicate> <then> <block> <continue> <identifier name="loop1"/> </continue> </block> </then> </if> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <binaryoperation operation="+"> <left> <string>b = </string> </left> <right> <identifier name="b"/> </right> </binaryoperation> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>finished</string> </arguments> </functioncall> </block> </statement> </for> </statement> </label> <label name="block1"> <statement> <block> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> <break> <identifier name="block1"/> </break> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </block> </statement> </label> </program> """ ), # functions ( """ function foo(p) { p = "bar"; } """, """ <program> <funcdecl name="foo"> <parameters> <identifier name="p"/> </parameters> <body> <assign operator="="> <left> <identifier name="p"/> </left> <right> <string>bar</string> </right> </assign> </body> </funcdecl> </program> """ ), ( """ function hello() { alert('world'); } """, """ <program> <funcdecl name="hello"> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>world</string> </arguments> </functioncall> </body> </funcdecl> </program> """ ), ( """ var anon = function() { alert('I am anonymous'); }; """, """ <program> <var name="anon"> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>I am anonymous</string> </arguments> 
</functioncall> </body> </funcexpr> </var> </program> """ ), ( """ anon(); """, """ <program> <functioncall> <function> <identifier name="anon"/> </function> <arguments/> </functioncall> </program> """ ), ( """ setTimeout(function() { alert('hello'); }, 1000) """, """ <program> <functioncall> <function> <identifier name="setTimeout"/> </function> <arguments> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>hello</string> </arguments> </functioncall> </body> </funcexpr> <number value="1000"/> </arguments> </functioncall> </program> """ ), ( """ (function() { alert('foo'); }()); """, """ <program> <functioncall> <function> <funcexpr> <identifier/> <parameters/> <body> <functioncall> <function> <identifier name="alert"/> </function> <arguments> <string>foo</string> </arguments> </functioncall> </body> </funcexpr> </function> <arguments/> </functioncall> </program> """ ), # get/set ( """ var obj = { get latest () { return "latest"; } } """, """ <program> <var name="obj"> <object> <get> <property> <identifier name="latest"/> </property> <body> <return> <string>latest</string> </return> </body> </get> </object> </var> </program> """ ), ( """ delete obj.latest; """, """ <program> <unaryoperation operation="delete"> <dotaccessor> <object> <identifier name="obj"/> </object> <property> <identifier name="latest"/> </property> </dotaccessor> </unaryoperation> </program> """ ), ( """ var o = { set current (str) { return this.log[this.log.length] = str; }, log: [] } """, """ <program> <var name="o"> <object> <set> <body> <return> <assign operator="="> <left> <bracketaccessor> <object> <dotaccessor> <object> <identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> <dotaccessor> <object> <dotaccessor> <object> <identifier>this</identifier> </object> <property> <identifier name="log"/> </property> </dotaccessor> </object> <property> 
<identifier name="length"/> </property> </dotaccessor> </property> </bracketaccessor> </left> <right> <identifier name="str"/> </right> </assign> </return> </body> </set> <property name="log"> <array/> </property> </object> </var> </program> """ ), ] for snippet, expected in jscode_snippets: print "---------------------------------------------------------" print snippet js = js2xml.parse(snippet) output = js2xml.pretty_print(js).strip() assert_equal(output, expected.strip(), "got\n%s\nexpected:\n%s" % (output, expected))
# Crawl pages 1-459 of the cctvxinwen Weibo profile feed. For each page: fetch
# it with `headers`, keep the <script> texts containing the
# 'pl.content.homeFeed.index' FM.view call, parse the first one with js2xml,
# read the 'html' string property from the pretty-printed tree, and clean the
# text of every "WB_text W_f14" div (newlines and spaces removed; when both
# 【 and 】 are present only the part after the first 】 is kept; a leading
# ',' or '?' is dropped).
# NOTE(review): the `try:` below has no `except`/`finally` in this excerpt and
# the statements are collapsed onto one line - the snippet looks truncated, so
# confirm against the full script before relying on it.
for i in range(1,460): url = 'https://weibo.com/cctvxinwen?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page={}#feedtop'.format(i) r = requests.get(url, headers=headers) # WB_text W_f14 r.encoding = 'utf-8' response = etree.HTML(r.text) # print(r.text) script_list = response.xpath("//script/text()") filter_script = [script for script in script_list if script.find('FM.view({"ns":"pl.content.homeFeed.index","domid":"Pl_Official_MyProfileFeed') != -1] # print(filter_script) try: script_text = js2xml.parse(filter_script[0], encoding='utf-8', debug=False) # print(script_list[-1]) script_tree = js2xml.pretty_print(script_text) # print(script_tree) selector = etree.HTML(script_tree) div_selector = selector.xpath("//program//property[@name='html']/string/text()")[0] div_tree_se = etree.HTML(div_selector) text_selectors = div_tree_se.xpath("//div[@class='WB_text W_f14']") text_selectors_full = div_tree_se.xpath("//div[@node-type='feed_list_content_full']") print(text_selectors_full) for text_se in text_selectors: text = ''.join(text_se.xpath('./text()')).replace('\n', '').replace(' ', '') if (text.find('【') != -1 and text.find('】') != -1): text = text.split("】")[1] if (text[0] == ',' or text[0] == "?"): text = text[1:]
def test_json():
    """Check the JSON-like values ``js2xml.jsonlike.getall`` finds in JS code.

    Every case pairs a JavaScript source string with the list of Python
    values that ``getall`` must return for its parsed tree.
    """
    cases = [
        # plain string arrays
        (
            r""" var arr1 = ["a","b","c"]; var arr2 = ["d","e","f"]; """,
            [["a", "b", "c"], ["d", "e", "f"]],
        ),
        # null becomes None
        (
            r""" var arr1 = ["a", null, "c"]; var arr2 = [null, "e", null]; """,
            [["a", None, "c"], [None, "e", None]],
        ),
        # undefined is reported as the string "undefined"
        (
            r""" var arr1 = ["a", undefined, "c"]; var arr2 = [undefined, "e", null]; """,
            [["a", "undefined", "c"], ["undefined", "e", None]],
        ),
        # a bare number yields nothing
        (
            r""" var i = -3.14; """,
            [],
        ),
        # quoted and unquoted property names
        (
            r""" money = { 'quarters': 20 }; """,
            [{"quarters": 20}],
        ),
        (
            r""" money = { quarters: 20 }; """,
            [{"quarters": 20}],
        ),
        # an identifier used as a value is reported by name
        (
            r""" currency = 'USD', money = { "value": 20, "currency": currency }; """,
            [{"currency": "currency", "value": 20}],
        ),
        (
            r""" t = {a: "3", "b": 3, "3": 3.0}; """,
            [{"3": 3.0, "a": "3", "b": 3}],
        ),
        # objects holding a function: only their JSON-like members are reported
        (
            r""" money = { 'quarters': 10, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [],
        ),
        (
            r""" var money = { 'quarters': 10, 'something': [1,2,3,4], 'somethingelse': {'nested': [5,6,7,8]}, 'addQuarters': function(amount) { this.quarters += amount; } }; money.addQuarters(10); """,
            [[1, 2, 3, 4], {"nested": [5, 6, 7, 8]}],
        ),
        # nested containers
        (
            r""" var store = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; """,
            [{"apples": 10, "carrots": [1, 2, 3, 4], "chicken": {"eggs": [5, 6, 7, 8]}}],
        ),
        (
            r""" var store1 = { 'apples': 10, 'carrots': [1,2,3,4], 'chicken': {'eggs': [5,6,7,8]} }; var store2 = { 'tomatoes': 20, 'potatoes': [9, false, 7, 6], 'spinach': {'cans': [true, 2]} }; """,
            [
                {"apples": 10, "carrots": [1, 2, 3, 4], "chicken": {"eggs": [5, 6, 7, 8]}},
                {"potatoes": [9, False, 7, 6], "spinach": {"cans": [True, 2]}, "tomatoes": 20},
            ],
        ),
    ]

    for js_source, wanted in cases:
        tree = js2xml.parse(js_source)
        extracted = js2xml.jsonlike.getall(tree)
        assert_list_equal(extracted, wanted)
def parsee(self, response):
    """Scrape one venue page into a ``WeddingspotItem`` and append it to ``data.txt``.

    Text fields are read from ``response`` with CSS selectors and have single
    quotes removed. A field whose selector matches nothing is stored as an
    empty string, so a sparse page still produces an item instead of raising.

    Args:
        response: The page response; must provide ``css()`` and ``url``.

    Yields:
        The populated ``WeddingspotItem``.

    Side effects:
        Increments ``self.id`` and appends one formatted line to ``data.txt``.
    """

    def clean(value):
        # Values are written between single quotes in data.txt, so embedded
        # single quotes are dropped; a missing value (None) becomes "".
        return "" if value is None else value.replace("'", "")

    def at(values, index):
        # Positional lookup that tolerates pages with fewer entries.
        return values[index] if index < len(values) else None

    def bullet_text(sections, index):
        # Comma-join the <p> texts of one additional-detail section,
        # ignoring the '- ' placeholder entries.
        if index >= len(sections):
            return ""
        texts = sections[index].css(
            '.VenuePage--additional-detail p::text').extract()
        return clean(','.join(text for text in texts if text != '- '))

    item = WeddingspotItem()
    item['id'] = self.id
    self.id = self.id + 1

    item['image_urls'] = response.css(
        '.slick-slide img::attr(src)').extract()
    item['venue_title'] = clean(response.css(
        '.Panel--className .SecondaryCTA--venueName::text').extract_first())
    price = response.css(
        '.Panel--className .VenuePrimaryCTA--className h3::text').extract()
    item['price'] = clean(' '.join(price))

    # Style, capacity, services and location all come from the same list of
    # paragraphs, so the selector is evaluated once and indexed by position.
    details = response.css(
        '.VenuePage--main-details .VenuePage--detail-text-container p::text'
    ).extract()
    item['style'] = clean(at(details, 0))
    item['guest_capacity'] = clean(at(details, 1))
    item['services'] = clean(','.join(details[2:4]))
    item['location'] = clean(at(details, 4))

    item['zip_code'] = clean(response.css(
        '.VenuePage--main-details .VenuePage--detail-text-container span::text'
    ).extract_first())
    item['description'] = clean(response.css(
        '.VenuePage--description p::text').extract_first())
    item['venue_notes'] = clean(response.css(
        'p.VenuePage--additional-detail::text').extract_first())
    item['url'] = response.url

    # Section 0 is used for amenities and section 1 for restrictions.
    sections = response.css('.Amenities--row .VenuePage--additional-detail')
    item['amenities'] = bullet_text(sections, 0)
    item['restrictions'] = bullet_text(sections, 1)

    # The website URL is the first 'http...' run after the first 'website'
    # marker in the preloaded-state script, up to the next double quote.
    javascript = response.css(
        "script:contains('window.__PRELOADED_STATE__')::text").get()
    web_url = ""
    if javascript:
        start = javascript.find('website')
        if start != -1:
            start = javascript.find('http', start)
        if start != -1:
            end = javascript.find('"', start)
            if end != -1:
                web_url = javascript[start:end]
    item['web_url'] = web_url

    # The context manager closes the file even if the write fails.
    with open("data.txt", "a") as f:
        f.write(
            f"({item['id']},'{item['venue_title']}','{item['price']}','{item['style']}','{item['guest_capacity']}','{item['services']}','{item['location']}','{item['zip_code']}','{item['description']}','{item['venue_notes']}','{item['url']}','{item['amenities']}','{item['restrictions']}','{item['web_url']}'),\n"
        )

    yield item