def find_script_sources(self, tree):
    """Extract all CSP-relevant sources from a parsed <script> AST.

    Walks the AST and dispatches every node type that may reference an
    external source — function calls, variable declarations, assignments
    and ``new`` expressions — to its dedicated extractor, which records
    sources for the matching CSP directives on this sorter.

    :param tree: a script parsed into an AST (Abstract Syntax Tree)
    :return: None
    """
    # Ordered (node type, handler) pairs; first matching type wins,
    # mirroring the original if/elif chain. The leftover debug prints
    # ('FUNCTION CALL', 'VAR DECL', ...) were removed.
    handlers = (
        (FunctionCall, self.extract_function_call),
        (VarDecl, self.extract_var_declaration),
        (Assign, self.extract_assign),
        (NewExpr, self.extract_new_expr),
    )
    walker = Walker()
    for node in walker.filter(
            tree,
            lambda node: isinstance(
                node, (FunctionCall, VarDecl, Assign, NewExpr))):
        for node_type, handler in handlers:
            if isinstance(node, node_type):
                handler(node)
                break
def gen_js_script_nodes(self, node_type, scripts):
    """Collect every AST node of *node_type* found in the given scripts.

    Each script is parsed with the ES5 parser and walked; matching nodes
    from all scripts are returned in encounter order.

    :param node_type: calmjs AST class (or tuple of classes) to match
    :param scripts: iterable of JavaScript source strings
    :return: list of matching AST nodes
    """
    collected = []
    for source in scripts:
        tree = es5(source)
        collected.extend(
            Walker().filter(tree,
                            lambda candidate: isinstance(candidate,
                                                         node_type)))
    return collected
def test_extract_func_call_eval_instruction(self): """ Aims to test if eval instruction properly generate unsafe-eval directive and raise correct flags :return: """ # ------------------------------- # # ----------- ARRANGE ----------- # # ------------------------------- # self.res_sorter.report_generator = ReportGenerator() self.test_sorter.report_generator = ReportGenerator() script = """ eval('2*3;') var m = 3; var f = new Function('a', 'return a'); document.getElementsByTagName("body").style.cssText = "background-color:pink;font-size:55px;border:2px dashed green;color:white;" myStyle.insertRule('#blanc { color: white }', 0); """ # Getting the node from the script nodes = [] walker = Walker() for node in walker.filter( es5(script), lambda node: (isinstance(node, FunctionCall))): nodes.append(node) # Adding unsafe-eval directives for each relevant directive self.res_sorter.directives_sources['script-src'].add("'unsafe-eval'") self.res_sorter.directives_sources['style-src'].add("'unsafe-eval'") # Adding flag into test report generator flag_eval = Flag('eval_script', nodes[0]) flag_insert_rule = Flag('eval_style', nodes[2]) self.res_sorter.report_generator.flags.append(flag_eval) self.res_sorter.report_generator.flags.append(flag_insert_rule) # ------------------------------- # # ------------- ACT ------------- # # ------------------------------- # for node in nodes: instruction = self.test_sorter.get_node_instruction(node) self.test_sorter.extract_func_call_eval_instruction( node, instruction) # ------------------------------- # # ----------- ASSERT ------------ # # ------------------------------- # assert (self.res_sorter.directives_sources == self.test_sorter.directives_sources) assert (set(self.res_sorter.report_generator.flags) == set( self.test_sorter.report_generator.flags))
def parse_publication(soups: Soups):
    """Parse an Arc/Fusion publication page into the normalized record dict.

    Locates the inline <script> that assigns ``Fusion.globalContent``,
    extracts its ``content_elements`` JSON, and flattens text / raw_html /
    image elements into ``publication_text`` and ``image_urls``.

    :param soups: project wrapper exposing ``body`` (a BeautifulSoup tree)
        plus snapshot metadata — assumed from usage, confirm against caller.
    :return: dict with the site's standard publication schema.
    """
    stash = {}
    content = None
    target_script = None
    for script in soups.body.find_all("script"):
        # contents[0] may not exist (empty tag) or may not be a string
        # (nested tags) — skip those cases instead of swallowing every
        # exception with a bare except.
        try:
            if "Fusion.globalContent" in script.contents[0]:
                target_script = script.contents[0]
                break
        except (IndexError, TypeError):
            pass
    if target_script:
        for x in Walker().filter(
            es5(target_script), lambda node: isinstance(node, Assign)
        ):
            if str(x.left) == "Fusion.globalContent":
                content = json.loads(str(x.right))["content_elements"]
                break
    # If the script was missing or lacked the expected assignment, fall
    # back to an empty element list rather than crashing on None below.
    if content is None:
        content = []
    publication_text = [x["content"] for x in content if x.get("type") == "text"]
    publication_text_html = [
        x["content"] for x in content if x.get("type") == "raw_html"
    ]
    for html in publication_text_html:
        soup = BeautifulSoup(html, "html.parser")
        publication_text += list(soup.stripped_strings)
    stash["publication_text"] = "\n".join(publication_text)
    stash["image_urls"] = [x["url"] for x in content if x.get("type") == "image"]
    return {
        "version": soups.snapshot.snapshot_at,
        "site_id": soups.snapshot.site_id,
        "canonical_url": soups.snapshot.url,
        "published_at": P.parse_published_at(soups),
        "first_seen_at": soups.snapshot.first_seen_at,
        "last_updated_at": soups.snapshot.last_updated_at,
        "title": soups.body.find("title").text,
        "publication_text": stash.get("publication_text", ""),
        "author": None,
        "connect_from": None,
        "data": {
            "urls": P.parse_external_links(soups),
            "image_urls": stash.get("image_urls", []),
            "hashtags": [],
            "keywords": [],
            "tags": [],
            "metadata": {
                "metatags": soups.metatags,
                **soups.metadata,
                "ga-id": parse_ga_id(soups),
            },
            "comments": [],
        },
    }
def _request(self):
    """Fetch and parse the router's ``cgi_myNetwork.js`` device list.

    Downloads the script, locates the ``known_device_list`` variable
    declaration in its AST, and converts every object literal inside it
    into a dict of URL-unquoted attribute values.

    :returns: List or None
    """
    endpoint = "http://%s/cgi/cgi_myNetwork.js" % self.host
    try:
        response = requests.get(endpoint)
        _LOGGER.debug("Response %s", response.text)
        tree = es5(response.text)
        walker = Walker()
        # Scan every VarDecl; like the original, the LAST matching
        # declaration wins if the script declares the name twice.
        declaration = None
        for candidate in walker.filter(
                tree, lambda n: isinstance(n, VarDecl)):
            if candidate.identifier.value == 'known_device_list':
                declaration = candidate
        if declaration is None:
            raise IndexError('known_device_list variable not found.')
        devices = []
        for obj in walker.filter(
                declaration, lambda n: isinstance(n, Object)):
            entry = {}
            for assign in walker.filter(
                    obj, lambda n: isinstance(n, Assign)):
                key = getattr(assign.left, 'value', '')
                raw = getattr(assign.right, 'value', '')
                # Values arrive percent-encoded and quote-wrapped.
                entry[key] = urllib.parse.unquote(raw).replace('\'', '')
            devices.append(entry)
        return devices
    except requests.RequestException:
        _LOGGER.error("Status failed %s", endpoint, exc_info=1)
    except IndexError:
        _LOGGER.error("Parsing failed %s", endpoint, exc_info=1)
    return None
def parse_publication(soups):
    """Parse a publication page whose article body lives in ``articleInfo``.

    Reads the 10th-from-last <script> tag, finds the ``articleInfo``
    assignment, pulls the escaped HTML out of its ``content`` property
    and flattens it into plain text plus image URLs.

    :param soups: project wrapper exposing ``body`` (a BeautifulSoup tree)
        plus snapshot metadata — assumed from usage, confirm against caller.
    :return: dict with the site's standard publication schema.
    """
    stash = {}
    stash["title"] = soups.body.find("title").text
    # NOTE(review): the [-10] script index is site-specific and fragile;
    # verify against the live page layout before changing.
    declarations = es5(soups.body.find_all("script")[-10].contents[0])
    info_node = list(Walker().filter(
        declarations,
        lambda x: isinstance(x, Assign) and str(x.left) == "articleInfo",
    ))[0]
    content_node = list(
        filter(
            lambda p: isinstance(p, Assign) and str(p.left) == "content",
            info_node.right.properties,
        ))[0]
    # The content value looks like '<escaped html>'.slice(...): grab the
    # quoted portion, then JSON-decode after unescaping HTML entities.
    content_text = re.search(r"\'(.*)\'.slice", str(content_node.right)).group(1)
    content_soup = BeautifulSoup(json.loads(html.unescape(content_text)),
                                 "html.parser")
    stash["publication_text"] = "\n".join(content_soup.stripped_strings)
    stash["image_urls"] = [x["src"] for x in content_soup.find_all("img")]
    return {
        "version": soups.snapshot.snapshot_at,
        "site_id": soups.snapshot.site_id,
        "canonical_url": soups.snapshot.url,
        "published_at": P.parse_published_at(soups),
        "first_seen_at": soups.snapshot.first_seen_at,
        "last_updated_at": soups.snapshot.last_updated_at,
        "title": stash["title"],
        "publication_text": stash.get("publication_text", ""),
        "author": None,
        "connect_from": None,
        "data": {
            "urls": P.parse_external_links(soups),
            # Fixed: default was "" — a list is expected here, matching
            # the sibling Fusion parser's schema. The key is always set
            # above, so this only affects the (currently unreachable)
            # fallback type.
            "image_urls": stash.get("image_urls", []),
            "hashtags": [],
            "keywords": [],
            "tags": [],
            "metadata": {
                "metatags": soups.metatags,
                **soups.metadata,
                "ga-id": parse_ga_id(soups),
            },
            "comments": [],
        },
    }
from calmjs.parse import es5 from calmjs.parse.walkers import Walker from calmjs.parse.asttypes import VarDecl from calmjs.parse.exceptions import ECMASyntaxError from calmjs.parse.unparsers import extractor from typing import Tuple, Any, Callable, Optional, TypeVar, Generic, Union, List from functools import partial from urllib.parse import urlparse import re from pydantic import ValidationError # DEBUG- TODO delete import pdb from pprint import pprint walker = Walker() def extract_yt_initial_data(soup: BeautifulSoup) -> dict: """ Extract the object bound to variable ytInitialData in a script tag """ initial_data = {} for scripts in soup.find_all('script', src=None, type=None): for script in scripts.contents: try: program = es5(script) except ECMASyntaxError as e: # TODO: proper logging print('DEBUG: parsing failed, continuing') continue
def main(arg_dir, arg_file, arg_from_ep, arg_to_ep, arg_url, custom_stdout, arg_debug, arg_proxy=None): try: sys.stdout = custom_stdout # stderr can test with calmjs error: # Don't be confuse, outer es5 use internal es5, both files named es5.py: # this file -> CalmParser() -> calmjs.parse.parsers.es5 -> [calmjs\parse\parsers\es5.py] # -> self.lexer.build(optimize=lex_optimize, lextab=lextab) -> from calmjs.parse.lexers.es5 import Lexer # -> [calmjs\parse\lexers\es5.py] -> class Lexer(object): -> def build(self, **kwargs): -> ply.lex.lex(object=self, **kwargs) # -> [lex.py] -> def lex -> errorlog = PlyLogger(sys.stderr) -> class PlyLogger(object): -> def error(self, msg, *args, **kwargs): # -> self.f.write('ERROR: ' + (msg % args) + '\n') # f should means stderr here # [UPDATE] disable since useless now (other place change stderr is calmjs CP.parse(script.text) below) # Without stderr still able to shows ffmpeg not found traceback on gui log # sys.stderr = custom_stdout if not arg_url: print('main arg_url: ' + repr(arg_url)) #quit('[!] [e1] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.') return quit( '[!] 
[e1] 请用该格式 https://www.duboku.co/voddetail/300.html 的链接。' ) # Should accept these formats: # https://www.duboku.net/voddetail/300.html # https://www.fanstui.com/voddetail-300.html # Deprecated # https://www.fanstui.com/vodplay/300-1-1.html # Deprecated # https://www.fanstui.com/vp/529-1-1.html # Deprecated # https://tv.newsinportal.com/vodplay/1382-1-3.html #VODPLAY_PREFIX = 'https://www.fanstui.com/vodplay/' NEWS_VODPLAY_PREFIX = 'vodplay/' VODPLAY_PREFIX = 'vodplay/' VODDETAIL_PREFIX = 'voddetail/' #VP_PREFIX = 'https://www.fanstui.com/vp/' VP_PREFIX = 'vp/' ORIG_PREFIX = 'voddetail-' cinema_url_post = '.html' #cinema_url_pre = 'https://www.duboku.net/vodplay/' if '://' not in arg_url: arg_url = 'https://' + arg_url arg_path = '/'.join(arg_url.split('/')[-2:]) cinema_url_pre = '/'.join( arg_url.split('/')[:-2]) + '/' + VODPLAY_PREFIX arg_url_m = arg_path.strip( ) #.replace('https://www.duboku.net/', 'https://www.fanstui.com/') try: #if arg_url_m.startswith('https://www.fanstui.com/voddetail-'): if arg_url_m.startswith(ORIG_PREFIX): #cinema_id = int(arg_url_m.split('https://www.fanstui.com/voddetail-')[1].split('.html')[0]) cinema_id = int( arg_url_m.split(ORIG_PREFIX)[1].split('.html')[0]) cinema_id = str( cinema_id) #set back str after test int() ValueError cinema_url_middle = '-1-' elif arg_url_m.startswith(NEWS_VODPLAY_PREFIX): cinema_id = int( arg_url_m.split(NEWS_VODPLAY_PREFIX)[1].split('-')[0]) cinema_id = str(cinema_id) cinema_url_middle = '-' + arg_url_m.split( NEWS_VODPLAY_PREFIX)[1].split('-')[1] + '-' elif arg_url_m.startswith(VODPLAY_PREFIX): cinema_id = int( arg_url_m.split(VODPLAY_PREFIX)[1].split('-')[0]) cinema_id = str(cinema_id) cinema_url_middle = '-' + arg_url_m.split( VODPLAY_PREFIX)[1].split('-')[1] + '-' elif arg_url_m.startswith(VODDETAIL_PREFIX): cinema_id = int( arg_url_m.split(VODDETAIL_PREFIX)[1].split('.')[0]) cinema_id = str(cinema_id) cinema_url_middle = '-1-' elif arg_url_m.startswith(VP_PREFIX): cinema_id = 
int(arg_url_m.split(VP_PREFIX)[1].split('-')[0]) cinema_id = str(cinema_id) cinema_url_middle = '-' + arg_url_m.split(VP_PREFIX)[1].split( '-')[1] + '-' else: #return quit('[!] [e2] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.') return quit( '[!] [e2] 请用该格式 https://www.duboku.co/voddetail/300.html 的链接。' ) except ValueError as ve: print(ve) #return quit('[!] [e3] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.') return quit( '[!] [e3] 请用该格式 https://www.duboku.co/voddetail/300.html 的链接。' ) if arg_file: if arg_dir: return quit('[!] 不能同时使用 -d 和 -f 选项。') ep_ts_path = os.path.abspath(arg_file + '.ts') ep_mp4_path = os.path.abspath(arg_file + '.mp4') arg_to_ep = 2 else: if not arg_to_ep: return quit('[!] 请用 `--to-ep N` 选项决定从第 N 集停止下集。') if arg_from_ep > arg_to_ep: return quit('[!] 从第几集必须小于或等于到第几集。') arg_to_ep += 1 if not arg_dir: return quit('[!] 请用 `-d 目录名` 选项。') dir_path_m = os.path.abspath(arg_dir) if not os.path.isdir(dir_path_m): try: os.makedirs(dir_path_m) except OSError: return quit('[i] 无法创建目录。或许已有同名文件? ') # https://stackoverflow.com/questions/10606133/sending-user-agent-using-requests-library-in-python http_headers = { 'User-Agent': UA #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36' #, 'From': '*****@*****.**' # This is another valid field } def calm_assign(node): #print('$$$$$$$$$$$$$$$$ START') #print(type(node)) #can see class xxx(e.g. 
BinOp) at calmjs/parse/asttypes.py #print(node) #print('$$$$$$$$$$$$$$$$ M') #print(dir(node)) #print('$$$$$$$$$$$$$$$$ END') return isinstance(node, CalmAssign) def calm_id(node): #print(node) #print(type(node)) #print(isinstance(node, Identifier)) return isinstance(node, CalmIdentifier) def calm_str(node): return isinstance(node, CalmString) def calm_var(node): return isinstance(node, CalmVar) ''' //https://github.com/brix/crypto-js //import js in console: var imported = document.createElement('script'); //https://cdnjs.com/libraries/crypto-js imported.src = 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.9-1/crypto-js.js'; document.head.appendChild(imported); //rerun the code from webpage: var content = "U2FsdGVkX18/ZQ8zQuYIsjIgZkCTVTWoklPND/Bx5tdp3vphNNtxnlzBPeCW2h3OiGbgI17pH/14qF2e8ZsWLpcNeGegDzRonl8dnDwnZKYSOgkPXmSwxArjg1lBPufaSJs8IyTcATJINMrWme/TqSPxxe7CdezlsA35neSw+OjEzx5yUH3mhZY2Jnah+ko2wmIBucCkRUdGbwU8ufsmX4FL+fkKDAIPi+AVmITbzcquMnGHnk/CmibPG9CNOr5joKrdJ1GT2bodPn9vnruvY+j3tNC6D4sdRtLnHAlEUnxlLu0Sr6NczJsgVlrKhsn06ML2Jkcc+ZQ4+fuFeXhEl6isEGjlCAdnrlbSl6SvSxqyjnA2JwBGjUWGs3kIBnaNc+TCNi5Vmxiv3OsSgbQM4NX6SqD66+cqBlM6gqeUjOXDa+7O39GcIsKNo/95hbfTBruDZIIQM1UKVoA7ZfuFN+L9AmuoirrMb24AWxTiHQPGdCxWLzncwdn9Ri7GouiUVGDBuDaiRBKJvR1MgVIBUQ8n/D1HZUsFQJLpA4x2+49ZQ2loovIYU5gkoPZSPGnYHK1iZmMPYLFFxPyHob5QBu1w5wgo5ZtSYS14B3PnT6DY0NLHm5etSgeOM2dvkOY+i/U9q98XLYMd1GAORWt6AdpMFZm+1BVwIxF1JyodpLg57z9eTZSv/I0+FlGsQRGArXga5Xoq6Sj22l1tiGgt5ZDtHFaQeLBMhKWqIdVDyxhsqhtRpxx//EA9b9ZALquYo+6XeEm61RLbyoUqPnYE0ygi1W3Br6EpnimKAxYAoYqv7vIedF2WLOJ9t/mPB594EPkV8PGgha6IOyqLgn8QPqS+pFsuJeRAD9xUCAL9905v9igSC73Q22gXxcTb9m2CEqHDYWrVD528rr7uY/c8PypvvWX35dxNdNiJ3n4Kc6SuL27ncmPyHIyTXrNwdPyvvexIrzD7uJIUFirqoR1JCGGyjks5RLcw/iTTXurV2M9y3mGr3pBAM66bxlglfNugp/Pwg05gr8ik31mqvvxyWw=="; var bytes = CryptoJS.AES.decrypt(content, 'ppvod'); var originalText = bytes.toString(CryptoJS.enc.Utf8); //get: "var hosts = 
'www.duboku.net|tv.zdubo.com|v.zdubo.com|v.wedubo.com|v.duboku.net|www.fanstui.com|www.duboku.tv|localhost'; var playlist = '[{\"url\":\"/20190923/EEGkg4vm/hls/index.m3u8\"}]'; playlist = JSON.parse(playlist); var danmuenable = 0; var magnet = \"\" var redirecturl = \"https://v.zdubo.com\"; var videoid = \"1Rhgp5nzyWuK3P6k\"; var id = '1Rhgp5nzyWuK3P6k' var l = '' var r = '' var t = '15' var d = '' var u = '' var main = \"/ppvod/H2GPhFCJ\"; var playertype = 'dplayer'; // dplayer || ckplayer var mp4 = \"/20190923/EEGkg4vm/mp4/EEGkg4vm.mp4\"; var xml = \"\"; var pic = \"/20190923/EEGkg4vm/1.jpg\"; $(function () { var t = BrowserType(); if (t && t.indexOf(\"IE\") >= 0 ) playertype = \"ckplayer\" var order = 0; init(order); }) # https://u.tudu.site/vodplay/1554-1-38.html (some UA blocked including Chrome, but can use other UA OR use correct referer): var player_data={"flag":"play","encrypt":0,"trysee":0,"points":0,"link":"\/vodplay\/1554-1-1.html","link_next":"","link_pre":"\/vodplay\/1554-1-37.html","url":"https:\/\/tv.wedubo.com\/20200901\/69OYAim7\/index.m3u8","url_next":"","from":"videojs-tv.js","server":"no","note":"","id":"1554","sid":1,"nid":38} ''' CP = CalmParser() walker = CalmWalker() if arg_proxy: arg_proxy = arg_proxy.strip() if arg_proxy: if '://' not in arg_proxy: arg_proxy = 'https://' + arg_proxy proxies = { 'https': arg_proxy, } print('[...] 尝试代理: ' + proxies['https']) else: proxies = {} print('[...] 无代理。') for ep in range(arg_from_ep, arg_to_ep): url = ''.join([ cinema_url_pre, cinema_id, cinema_url_middle, str(ep), cinema_url_post ]) #don't override template cinema_url if arg_file: print('[...] 
尝试 URL: ' + url) else: print('[当前第{}集] 尝试 URL: {}'.format(ep, url)) try: if arg_debug: #logging.basicConfig(level=logging.DEBUG, format="%(message)s") http.client.HTTPConnection.debuglevel = 1 logging.basicConfig(filename='duboku_ep' + str(ep) + '.log') logging.getLogger().setLevel(logging.DEBUG) requests_log = logging.getLogger( "requests.packages.urllib3") requests_log.setLevel(logging.DEBUG) requests_log.propagate = True with open('duboku_ep' + str(ep) + '.log', 'w') as f: f.write('URL: ' + url + '\n\n') try: try: http_headers.pop('referer') except KeyError: pass r = requests.get(url, allow_redirects=True, headers=http_headers, timeout=30, proxies=proxies) except requests.exceptions.ConnectionError: print('\n[!] 你的网络出现问题,也可能是网站的服务器问题。\n', flush=True) continue if arg_debug: with open('duboku_ep' + str(ep) + '.log', 'a') as f: f.write(r.text) except requests.exceptions.ProxyError as pe: print( '[😞] 代理错误。请检查您的代理。确保有端口号(port number), 例如端口1234: http://127.0.0.1:1234\n' ) print(traceback.format_exc()) break soup = BeautifulSoup(r.text, 'html.parser') ct_b64 = '' #reset passwd = '' #reset http_headers.update({'referer': url}) printed_err = False got_ep_url = False for script in soup.find_all('script'): #print(script) try: #program = es5(script.text) #PyInstaller has issue to make `ply_dist = working_set.find(Requirement.parse('ply'))` in calmjs\parse\utils.py return non-None #... And causes self.parser.parse in \calmjs\parse\parsers\es5.py no parse method #... bcoz set with unknown pkg name by Parser() constructor/init 's tabmodule=yacctab arg #, so re-assign stderr here to ignore this common warning msg to send to gui log # [UPDATE] disable since useless now. 
#sys.stderr = sys.__stderr__ tree = CP.parse(script.text) #sys.stderr = custom_stdout #print(type(tree)) #<class 'calmjs.parse.factory.ES5Program'> #print(tree) #print('######## START') #print(tree) #text #type is <class 'calmjs.parse.factory.ES5Program' #print(walker.filter(tree, assignment)) #<generator object Walker.filter at 0x7f0b75664360> #print(walker.filter(tree, assignment)) #for w in walker.filter(tree, assignment): # print(w) ep_url = '' #reset is_vimeo = False vimeo_qd = {} if arg_dir: ep_mp4_path = None for w in walker.filter(tree, calm_id): if w.value == 'player_data': for wa in walker.filter(tree, calm_assign): if wa.left.value == '"url"': #'' included "" rv = wa.right.value ep_url = rv.replace( '\/', '/').strip('\"').strip('\'') #episode not exists if not ep_url.strip(): if not printed_err: print('[!] 不存在第{}集。'.format(ep)) printed_err = True continue try: if ep_url.split('/')[2].split( '.')[1].lower() == 'vimeo': # e.g. https://www.duboku.co/vodplay/1584-1-1.html # -> https://player.vimeo.com/video/452182074 is_vimeo = True if arg_debug: with open( 'duboku_ep' + str(ep) + '.log', 'a') as f: f.write( '\n\nEP URL of VIMEO: ' + ep_url + '\n\n') #print('呼叫 vimeo... 
' + repr(ep_url)) r_iframe = requests.get( ep_url, allow_redirects=True, headers=http_headers, timeout=30, proxies=proxies) if arg_debug: with open( 'duboku_ep' + str(ep) + '.log', 'a') as f: f.write(r_iframe.text) soup_iframe = BeautifulSoup( r_iframe.text, 'html.parser') for vimeo_script in soup_iframe.find_all( 'script'): tree = es5(vimeo_script.text) for w in walker.filter( tree, calm_var): if w.identifier.value == 'config': for config_wp in w.initializer.properties: try: for config_wp2 in config_wp.right.properties: for config_wp3 in config_wp2.right.properties: if 'progressive' != config_wp3.left.value.strip( '"' ).lower( ): continue try: for config_wp4 in config_wp3.right.children( ): next_width_k = '' next_url_v = '' for config_wp5 in config_wp4.properties: if config_wp5.left.value.strip( '"' ).lower( ) == 'width': next_width_k = config_wp5.right.value if next_url_v: vimeo_qd[int( next_width_k )] = next_url_v elif config_wp5.left.value.strip( '"' ).lower( ) == 'url': next_url_v = config_wp5.right.value.strip( '"' ) if next_width_k: vimeo_qd[int( next_width_k )] = next_url_v except ( TypeError, AttributeError ): pass #print(traceback.format_exc()) except (TypeError, AttributeError ): pass except IndexError: print('Split ep url failed: ' + repr(ep_url)) if is_vimeo: #print('vimeo 视频质量: ' + repr(vimeo_qd)) if not vimeo_qd: continue vimeo_qdk = list(vimeo_qd.keys()) vimeo_qdk.sort(key=int) ep_url = vimeo_qd[int(vimeo_qdk[-1])] elif rv.endswith('.m3u8"') or rv.endswith( ".m3u8'" ): #[todo:0] need check ' also ? pass else: #single video normally came here #print('NEW url type? 
' + repr(ep_url)) if arg_debug: with open( 'duboku_ep' + str(ep) + '.log', 'a') as f: f.write('\n\nEP URL: ' + ep_url + '\n\n') r_iframe = requests.get( ep_url, allow_redirects=True, headers=http_headers, timeout=30, proxies=proxies) if arg_debug: with open( 'duboku_ep' + str(ep) + '.log', 'a') as f: f.write(r_iframe.text) soup_iframe = BeautifulSoup( r_iframe.text, 'html.parser') decrypted_final_js = None for script_iframe in soup_iframe.find_all( 'script'): tree_iframe = CalmParser().parse( script_iframe.text.strip()) for decrypt_js in walker.filter( tree_iframe, calm_var): if decrypt_js.identifier.value == 'content': ct_b64 = decrypt_js.initializer.value elif decrypt_js.identifier.value == 'bytes': get_passwd = False for decrypt_i, decrypt_js_c in enumerate( decrypt_js. initializer. children()): if get_passwd: #(content, 'ppvod') for dci, dc in enumerate( decrypt_js_c .children( )): if dci == 1 and isinstance( dc. value, str): passwd = dc.value[ 1: -1] #exclude '' if decrypt_js_c.__str__( ) == 'CryptoJS.AES.decrypt': #CryptoJS.AES.decrypt get_passwd = True elif decrypt_js.identifier.value == 'playlist': decrypted_final_js = tree_iframe if ct_b64: print('ct b64 data: ' + repr(ct_b64)) print('passwd: ' + repr(passwd)) decrypted_final_content = crypto_py_aes_main( ct_b64, passwd) decrypted_final_js = CalmParser( ).parse(decrypted_final_content. 
decode()) #else: # No nid decrypt, direct use plain `decrypted_final_js = tree_iframe` above m3u8_path_incomplete = '' #reset m3u8_host_incomplete = '' for decrypted_final_var in walker.filter( decrypted_final_js, calm_var): if decrypted_final_var.identifier.value == 'playlist': decrypted_m3u8_path = decrypted_final_var.initializer.value[ 1:-1] # exclude '' if "'" in decrypted_m3u8_path: dot_type = "'" elif '"' in decrypted_m3u8_path: dot_type = '"' else: continue for path_part in decrypted_m3u8_path.split( dot_type): if path_part.endswith( '.m3u8'): m3u8_path_incomplete = path_part elif decrypted_final_var.identifier.value == 'redirecturl': m3u8_host_incomplete = decrypted_final_var.initializer.value[ 1:-1] #exclude "" if not m3u8_host_incomplete.endswith( '/' ) and not m3u8_path_incomplete.startswith( '/'): ep_url = m3u8_host_incomplete + '/' + m3u8_path_incomplete else: ep_url = m3u8_host_incomplete + m3u8_path_incomplete if arg_dir: ep_filename = os.path.basename(''.join( ['第', str(ep), '集'])) ep_ts_path = os.path.join( dir_path_m, ''.join([ os.path.basename(ep_filename) + '.ts' ])) ep_mp4_path = os.path.join( dir_path_m, ''.join([ os.path.basename(ep_filename), '.mp4' ])) if ep_url: break if ep_url: break if ep_url and ep_mp4_path: got_ep_url = True print('下载的 url: ' + ep_url) if not is_vimeo: print('下载的 ts 路径: ' + ep_ts_path) print('下载的 mp4 路径: ' + ep_mp4_path) if arg_debug: with open('duboku_ep' + str(ep) + '.log', 'a') as f: f.write('\n\n下载的 url: ' + ep_url) if not is_vimeo: f.write('\n下载的 ts 路径: ' + ep_ts_path) f.write('\n下载的 mp4 路径: ' + ep_mp4_path + '\n\n') if is_vimeo: r = requests.get(ep_url, allow_redirects=True, headers=http_headers, timeout=30, proxies=proxies, stream=True) chunk_size = 1024 # 1 MB file_size = int(r.headers['Content-Length']) num_bars = 0 #int(file_size / chunk_size) with open(ep_mp4_path, 'wb') as fp: for chunk in tqdm.tqdm( r.iter_content(chunk_size=chunk_size), total=num_bars, position=0, mininterval=5, unit='KB', 
desc=ep_mp4_path, leave=True, file=sys.stdout): fp.write(chunk) else: r = requests.get(ep_url, allow_redirects=True, headers=http_headers, timeout=30, proxies=proxies) if arg_debug: with open('duboku_ep' + str(ep) + '.log', 'a') as f: f.write('r: ' + r.text) # Disable `if` condition line below, if want to test convert .ts without re-download if m3u8_decryptopr_main(r.text, ep_ts_path, ep_url, http_headers, arg_debug, 'duboku_ep' + str(ep) + '.log', proxies=proxies): remux_ts_to_mp4(ep_ts_path, ep_mp4_path) #source_url = "https://tv2.xboku.com/20191126/wNiFeUIj/index.m3u8" #https://stackoverflow.com/questions/52736897/custom-user-agent-in-youtube-dl-python-script #youtube_dl.utils.std_headers['User-Agent'] = UA #try: # This one shouldn't pass .mp4 ep_path # youtube_dl.YoutubeDL(params={'-c': '', '-q': '', '--no-mtime': '', # 'outtmpl': ep_path + '.%(ext)s'}).download([ep_url]) #except youtube_dl.utils.DownloadError: # print(traceback.format_exc()) # print( # 'Possible reason is filename too long. Please retry with -s <maximum filename size>.') # sys.exit() break #print(walker.extract(tree, assignment)) #print('######## END') except calmjs.parse.exceptions.ECMASyntaxError as ee: pass #here is normal #print('ex') #print(traceback.format_exc()) except Exception: #Need to catch & print exception explicitly to pass to duboku_gui to show err log print(traceback.format_exc()) try: print('[😞]') except UnicodeEncodeError: print('[!] 失败。') if not got_ep_url: if not printed_err: if arg_file: print('[!] 不存在该部影片。') else: print('[!] 不存在第{}集。'.format(ep)) except Exception: try: print(traceback.format_exc()) except UnicodeEncodeError: print('[!] 出现错误。') try: print('[😄] 全部下载工作完毕。您已可以关闭窗口, 或下载别的视频。') except UnicodeEncodeError: print('[*] 全部下载工作完毕。您已可以关闭窗口, 或下载别的视频。') '''
class TestContentSorter(unittest.TestCase): def setUp(self): self.driver = WebDriver() self.test_sorter = ScriptSorter() self.res_sorter = ScriptSorter() def tearDown(self): self.driver.proxy.close() self.driver.driver.close() def getting_sorter_resource(self, url): """ Gets html and soup associated for a given url :param url: url of the desired resource :return: """ html = self.driver.parse_page(url)[0] soup = BeautifulSoup(html, 'html.parser') return html, soup def gen_js_style_nodes(self, node_type, styles): res = [] # Parsing the node for style in styles: parsed = tinycss2.parse_rule_list(style.text) for node in parsed: if isinstance(node, tinycss2.ast.AtRule ) and node.lower_at_keyword == node_type: res.append(node) return res def gen_js_script_nodes(self, node_type, scripts): res = [] for script in scripts: program = es5(script) walker = Walker() for node in walker.filter( program, lambda node: (isinstance(node, node_type))): res.append(node) return res def test_resolve_nested_sources(self): url = 'http://localhost:4000/nested_variable' resource = self.getting_sorter_resource(url) html = resource[0] self.test_sorter.report_generator = ReportGenerator() res_nested = {'n2': 'connect-src', 'n7': 'connect-src'} res_variable = { 'n1': 'http://localhost:4000/nest_connect_xmlhttp', 'n2': 'n1', 'n5': 'http://localhost:4000/nest_connect_socket', 'n6': 'n5', 'n7': 'n6', 'n8': 'n7' } self.test_sorter.sort_content(url=url, html=html) assert (res_nested == self.test_sorter.nested_source and res_variable == self.test_sorter.variable) def test_parse_sources(self): """ Only testing the flag for this method, rest is handled in following the following tests :return: """ script = u""" <style> h1 {color:red;} p {color:blue;} </style> <script> alert('test'); </script> """ soup = BeautifulSoup(script, 'html.parser') script_tag = soup.find_all('script') style_tag = soup.find_all('style') # Setting up res_sorter generators and falgs report_generator = ReportGenerator() 
report_generator.flags.append(Flag('inline_script', script_tag[0])) report_generator.flags.append(Flag('inline_style', style_tag[0])) self.res_sorter.report_generator = report_generator # Setting report generator for test sorter self.test_sorter.report_generator = ReportGenerator() self.test_sorter.parse_sources(script) assert (self.test_sorter.report_generator.flags == self.res_sorter.report_generator.flags) def test_get_node_instruction(self): case = [] script = u""" navigator.sendBeacon(dummyArg); navigator.serviceWorker.register(dummyArg); very.long.DotAcessor.dummy.serviceWorker.register(dummyArg); var test = funct_in_a_var(dummyArg); test = xhr.open(dummyArg); test = send(dummyArg); """ program = es5(script) walker = Walker() nodes = [] for node in walker.filter( program, lambda node: (isinstance(node, FunctionCall))): nodes.append(node) print(type(node)) case.append( ('sendBeacon', self.test_sorter.get_node_instruction(nodes[0]))) case.append( ('register', self.test_sorter.get_node_instruction(nodes[1]))) case.append( ('register', self.test_sorter.get_node_instruction(nodes[2]))) case.append(('funct_in_a_var', self.test_sorter.get_node_instruction(nodes[3]))) case.append(('open', self.test_sorter.get_node_instruction(nodes[4]))) case.append(('send', self.test_sorter.get_node_instruction(nodes[5]))) for test_case in case: with self.subTest(case=test_case):
def measure(user_dir, task_id, length, start, end, status_queue, process_index):
    """Find functions that are byte-identical (modulo whitespace/semicolons)
    between two recorded scripts, per category/rank/target.

    Reads ``<user_dir>_analysis/*-category2target2type2script2infos.json``
    files; for each 'funcs' info record it either compares the two inline
    function strings (long records, len >= 15) or re-parses the two source
    script files with the ES5 parser and compares the matching FuncDecl
    bodies. Matches are accumulated into a nested dict and written to
    ``<user_dir>-duplicate_cat2rank2target2infos.json``.

    Progress/status strings are pushed to ``status_queue`` for the parent
    process.  NOTE(review): this code calls ``unicode(...)`` and therefore
    targets Python 2 — confirm the interpreter before reuse.  The
    indentation below was reconstructed from a whitespace-mangled dump;
    suite boundaries chosen are semantically equivalent or best-guess —
    confirm against VCS.

    :param user_dir: base directory name of the analysed user/site
    :param task_id: zero-based index of this task (for status messages)
    :param length: total number of tasks (for status messages)
    :param start: unused here — presumably a task range bound, confirm
    :param end: unused here — presumably a task range bound, confirm
    :param status_queue: multiprocessing queue for [index, status] pairs
    :param process_index: worker index (for status messages)
    :return: None (output written to a JSON file when matches were found)
    """
    global processed_data_dir, conflicting_rank_set, rank2url, func_dir, raw_data_dir
    current_pid = os.getpid()
    current_dir = os.getcwd()
    cnt = 0  # number of duplicate-function records found
    try:
        status = 'Process %-4d task %d/%d PID [%d] starting ...' % (
            process_index, task_id + 1, length, current_pid)
        status_queue.put([process_index, status])
        #print(status)
        current_pid = os.getpid()
        current_dir = os.getcwd()
        input_dir = user_dir + '_analysis'
        files = os.listdir(input_dir)
        #files = [f for f in files if f.endswith('-category2target2type2script2infos.json')]
        files = [
            f for f in files
            if f.endswith('-category2target2type2script2infos.json')
        ]  # and not f.endswith('-used-category2type2target2infos.json')]
        cat2rank2target2infos = dict()
        #print(files)
        for f in files:
            try:
                # File stem (before the first '.') encodes the site rank.
                rank = f.split('.')[0]
                input_file = os.path.join(input_dir, f)
                with open(input_file, 'r') as input_f:
                    category2target2type2script2infos = json.loads(
                        input_f.read())
                for category, target2type2script2infos in category2target2type2script2infos.items():
                    for target, type2script2infos in target2type2script2infos.items():
                        for type_, script2infos in type2script2infos.items():
                            # Only function records are compared.
                            if type_ != 'funcs':
                                continue
                            for script, infos in script2infos.items():
                                for info in infos:
                                    if len(info) >= 15:
                                        # Long record: both function bodies are
                                        # inline at info[2] / info[12].
                                        func_str_one = info[2].strip()
                                        func_str_two = info[12].strip()
                                        # Normalize: strip all whitespace and
                                        # semicolons before comparing.
                                        clean_source_one = ''.join(
                                            func_str_one.strip().split()).replace(';', '')
                                        clean_source_two = ''.join(
                                            func_str_two.strip().split()).replace(';', '')
                                        if clean_source_one == clean_source_two:
                                            if category not in cat2rank2target2infos:
                                                cat2rank2target2infos[category] = dict()
                                            if rank not in cat2rank2target2infos[category]:
                                                cat2rank2target2infos[category][rank] = dict()
                                            if target not in cat2rank2target2infos[category][rank]:
                                                cat2rank2target2infos[category][rank][target] = list()
                                            cnt += 1
                                            cat2rank2target2infos[category][rank][target].append(info)
                                    else:
                                        # Short record: resolve the two source
                                        # script files and re-parse them.
                                        script_id_one = info[0]
                                        if '.func' in info[3]:
                                            source_file_one = info[3].replace(
                                                '.func',
                                                '.' + str(script_id_one) + '.script')
                                        else:
                                            source_file_one = info[3].replace(
                                                '-functions.json',
                                                '.' + str(script_id_one) + '.script')
                                        script_id_two = info[4]
                                        if '.func' in info[7]:
                                            source_file_two = info[7].replace(
                                                '.func',
                                                '.' + str(script_id_two) + '.script')
                                        else:
                                            source_file_two = info[7].replace(
                                                '-functions.json',
                                                '.' + str(script_id_two) + '.script')
                                        # NOTE(review): this truncates `rank`
                                        # in place, affecting later iterations
                                        # of the enclosing loops — confirm
                                        # this is intended.
                                        rank = rank.split('-')[0]
                                        # Scripts are sharded across
                                        # iso_<rank % num_instances>_logs dirs.
                                        script_dir = 'iso_' + str(
                                            int(rank) % num_instances) + '_logs'
                                        script_dir = os.path.join(
                                            raw_data_dir, script_dir)
                                        try:
                                            source_file_one = os.path.join(
                                                script_dir, source_file_one)
                                            with open(source_file_one, 'r') as input_f:
                                                source_one = input_f.read()
                                            source_file_two = os.path.join(
                                                script_dir, source_file_two)
                                            with open(source_file_two, 'r') as input_f:
                                                source_two = input_f.read()
                                        except IOError as e:
                                            #print(e)
                                            continue
                                        try:
                                            program_one = es5(
                                                unicode(source_one))
                                        except Exception as e:
                                            #print(e)
                                            continue
                                        # Find the target function's source in
                                        # the first script's AST.
                                        function_source_one = None
                                        walker = Walker()
                                        for node in walker.filter(
                                                program_one,
                                                lambda node: (isinstance(node, FuncDecl))):
                                            if str(node.identifier) == str(target):
                                                function_source_one = str(node)
                                                break
                                        try:
                                            program_two = es5(
                                                unicode(source_two))
                                        except Exception as e:
                                            #print(e)
                                            continue
                                        # Same lookup in the second script.
                                        function_source_two = None
                                        walker = Walker()
                                        for node in walker.filter(
                                                program_two,
                                                lambda node: (isinstance(node, FuncDecl))):
                                            if str(node.identifier) == str(target):
                                                function_source_two = str(node)
                                                break
                                        if function_source_one is not None and function_source_two is not None:
                                            #print(rank, program_one, program_two)
                                            #print(rank)
                                            clean_source_one = ''.join(
                                                function_source_one.strip().split()).replace(';', '')
                                            clean_source_two = ''.join(
                                                function_source_two.strip().split()).replace(';', '')
                                            if clean_source_one == clean_source_two:
                                                if category not in cat2rank2target2infos:
                                                    cat2rank2target2infos[category] = dict()
                                                if rank not in cat2rank2target2infos[category]:
                                                    cat2rank2target2infos[category][rank] = dict()
                                                if target not in cat2rank2target2infos[category][rank]:
                                                    cat2rank2target2infos[category][rank][target] = list()
                                                cnt += 1
                                                cat2rank2target2infos[category][rank][target].append(info)
            except Exception as e:
                # Per-file failures are logged and skipped.
                print(e)
                pass
    except OSError as e:
        # e.g. the _analysis directory does not exist; treated as "no work".
        pass
    except Exception as e:
        status = 'Process %-4d task %s/%s raised an exception %s.' % (
            process_index, task_id + 1, length, type(e))
        status_queue.put([process_index, status])
        string = '%s\t%s' % (getlocaltime(), status)
        try:
            print(string)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value,
                                               exc_traceback)
            print(''.join('!! ' + line for line in lines))
            sys.stdout.flush()
        except Exception:
            pass
    status = 'Process %-4d task %s/%s PID [%d] completed.' % (
        process_index, task_id + 1, length, current_pid)
    status_queue.put([process_index, status])
    if cnt > 0:
        output_file = '%s-duplicate_cat2rank2target2infos.json' % (user_dir)
        with open(output_file, 'w') as output_f:
            output_f.write(json.dumps(cat2rank2target2infos))
        print(output_file)