Example #1
0
    def find_script_sources(self, tree):
        """
        Route every relevant node of a parsed script to its extractor.

        The AST is walked once, keeping only function calls, variable
        declarations, assignments and ``new`` expressions. Each kept node is
        announced on stdout and then passed to the matching ``extract_*``
        method, which collects the sources relevant to CSP directives.

        :param tree: a script parsed into an AST (Abstract Syntax Tree)
        :return: None
        """
        # (node type, label printed on stdout, name of the handling method).
        # Order matters: the first entry whose type matches a node wins.
        handlers = (
            (FunctionCall, 'FUNCTION CALL', 'extract_function_call'),
            (VarDecl, 'VAR DECL', 'extract_var_declaration'),
            (Assign, 'ASSIGN', 'extract_assign'),
            (NewExpr, 'NEW EXPR', 'extract_new_expr'),
        )
        relevant_types = tuple(entry[0] for entry in handlers)

        def is_relevant(candidate):
            return isinstance(candidate, relevant_types)

        for found in Walker().filter(tree, is_relevant):
            for node_type, label, method_name in handlers:
                if isinstance(found, node_type):
                    print(label)
                    # Looked up lazily so only the handler that is actually
                    # needed has to exist on the instance.
                    getattr(self, method_name)(found)
                    break
Example #2
0
 def gen_js_script_nodes(self, node_type, scripts):
     """
     Collect every AST node of ``node_type`` found in the given scripts.

     Each entry of ``scripts`` is parsed with ``es5`` and walked with a fresh
     ``Walker``; matching nodes are gathered in script order, then walk order.

     :param node_type: type (or tuple of types) accepted by ``isinstance``
     :param scripts: iterable of script sources to parse
     :return: list of the matching nodes
     """
     def is_wanted(candidate):
         return isinstance(candidate, node_type)

     return [
         match
         for source in scripts
         for match in Walker().filter(es5(source), is_wanted)
     ]
Example #3
0
 def test_extract_func_call_eval_instruction(self):
     """
     Check that eval-like calls yield the 'unsafe-eval' source and its flags.

     The expected state is built by hand on ``self.res_sorter``; the code
     under test runs on ``self.test_sorter``; both are then compared.
     """
     # ---------- ARRANGE ---------- #
     self.res_sorter.report_generator = ReportGenerator()
     self.test_sorter.report_generator = ReportGenerator()
     script = """
         eval('2*3;')
         
         var m = 3;
         var f = new Function('a', 'return a');
         
         document.getElementsByTagName("body").style.cssText = "background-color:pink;font-size:55px;border:2px dashed green;color:white;"
         myStyle.insertRule('#blanc { color: white }', 0);
     """
     # Every function-call node of the script, in walk order
     call_nodes = list(
         Walker().filter(
             es5(script),
             lambda candidate: isinstance(candidate, FunctionCall)))

     # Expected directives: both script-src and style-src gain 'unsafe-eval'
     for directive in ('script-src', 'style-src'):
         self.res_sorter.directives_sources[directive].add("'unsafe-eval'")

     # Expected flags: call_nodes[0] and call_nodes[2] are presumably the
     # eval(...) and insertRule(...) calls of the script above
     expected_flags = (
         Flag('eval_script', call_nodes[0]),
         Flag('eval_style', call_nodes[2]),
     )
     for expected_flag in expected_flags:
         self.res_sorter.report_generator.flags.append(expected_flag)

     # ------------ ACT ------------ #
     for call_node in call_nodes:
         node_instruction = self.test_sorter.get_node_instruction(call_node)
         self.test_sorter.extract_func_call_eval_instruction(
             call_node, node_instruction)

     # ---------- ASSERT ----------- #
     assert (self.res_sorter.directives_sources ==
             self.test_sorter.directives_sources)
     assert (set(self.res_sorter.report_generator.flags) == set(
         self.test_sorter.report_generator.flags))
Example #4
0
def parse_publication(soups: Soups):
    """
    Build the publication record for a page whose article content is stored
    in a ``Fusion.globalContent = {...}`` assignment inside an inline script.

    The first <script> whose first child contains ``Fusion.globalContent`` is
    parsed as JavaScript; the right-hand side of that assignment is decoded
    as JSON and its ``content_elements`` list supplies the text, raw HTML and
    image elements. When no such script or assignment exists, the record is
    still returned with an empty text and no image URLs.

    :param soups: parsed snapshot; this function reads ``soups.body``,
        ``soups.snapshot``, ``soups.metatags`` and ``soups.metadata``
    :return: dict describing the publication (version, urls, title, text,
        images, metadata, ...)
    :raises KeyError: if the decoded object has no ``content_elements`` key
    """
    target_script = None
    for script in soups.body.find_all("script"):
        # A <script> without children (e.g. one that only has a src
        # attribute) has nothing to inspect; skip it explicitly rather than
        # swallowing every exception with a bare ``except``.
        children = script.contents
        if children and "Fusion.globalContent" in children[0]:
            target_script = children[0]
            break

    # Stays empty when the script or the assignment is missing, so the
    # comprehensions below never iterate over None.
    content = []
    if target_script:
        for x in Walker().filter(
            es5(target_script), lambda node: isinstance(node, Assign)
        ):
            if str(x.left) == "Fusion.globalContent":
                content = json.loads(str(x.right))["content_elements"]
                break

    # Plain text elements first, then the visible strings of raw HTML ones
    publication_text = [x["content"] for x in content if x.get("type") == "text"]
    raw_html_fragments = [
        x["content"] for x in content if x.get("type") == "raw_html"
    ]
    for fragment in raw_html_fragments:
        fragment_soup = BeautifulSoup(fragment, "html.parser")
        publication_text += list(fragment_soup.stripped_strings)
    image_urls = [x["url"] for x in content if x.get("type") == "image"]

    return {
        "version": soups.snapshot.snapshot_at,
        "site_id": soups.snapshot.site_id,
        "canonical_url": soups.snapshot.url,
        "published_at": P.parse_published_at(soups),
        "first_seen_at": soups.snapshot.first_seen_at,
        "last_updated_at": soups.snapshot.last_updated_at,
        "title": soups.body.find("title").text,
        "publication_text": "\n".join(publication_text),
        "author": None,
        "connect_from": None,
        "data": {
            "urls": P.parse_external_links(soups),
            "image_urls": image_urls,
            "hashtags": [],
            "keywords": [],
            "tags": [],
            "metadata": {
                "metatags": soups.metatags,
                **soups.metadata,
                "ga-id": parse_ga_id(soups),
            },
            "comments": [],
        },
    }
Example #5
0
    def _request(self):
        """
        Fetch ``cgi_myNetwork.js`` from ``self.host`` and read its device list.

        The response body is parsed as JavaScript. Every object literal found
        under the ``known_device_list`` variable declaration becomes one dict
        mapping each assigned property name to its value, URL-unquoted and
        with single quotes removed.

        :returns: list of dicts, or None when the HTTP request fails or the
            ``known_device_list`` variable cannot be found
        """

        endpoint = "http://%s/cgi/cgi_myNetwork.js" % self.host

        try:
            # Without a timeout, requests waits indefinitely on a host that
            # stops answering. requests.Timeout is a RequestException, so the
            # handler below covers it.
            response = requests.get(endpoint, timeout=10)
            _LOGGER.debug("Response %s", response.text)

            tree = es5(response.text)
            known_device_list = []

            # find known_device_list variable (the last declaration wins)
            var_known_device_list = None

            walker = Walker()
            for node in walker.filter(tree, lambda node: isinstance(node, VarDecl)):
                if node.identifier.value == 'known_device_list':
                    var_known_device_list = node

            if var_known_device_list is None:
                # Reported through the IndexError handler below
                raise IndexError('known_device_list variable not found.')

            for object_node in walker.filter(var_known_device_list, lambda node: isinstance(node, Object)):
                known_device_list.append({
                    getattr(prop.left, 'value', ''): urllib.parse.unquote(getattr(prop.right, 'value', '')).replace(
                        '\'', '')
                    for prop in walker.filter(object_node, lambda node: isinstance(node, Assign))
                })

            return known_device_list
        except requests.RequestException:
            _LOGGER.error("Status failed %s", endpoint, exc_info=1)
        except IndexError:
            _LOGGER.error("Parsing failed %s", endpoint, exc_info=1)

        return None
Example #6
0
def parse_publication(soups):
    """
    Build the publication record for a page whose article body is embedded
    in the ``content`` property of a JavaScript ``articleInfo`` assignment.

    The tenth <script> from the end of the body is parsed as JavaScript. The
    quoted string preceding ``.slice`` in the ``content`` value is
    HTML-unescaped, decoded as JSON and parsed as HTML to obtain the text and
    the image sources.

    :param soups: parsed snapshot; this function reads ``soups.body``,
        ``soups.snapshot``, ``soups.metatags`` and ``soups.metadata``
    :return: dict describing the publication
    """
    title = soups.body.find("title").text

    def assigns_to(name):
        """Return a predicate matching assignments whose target is ``name``."""
        def predicate(candidate):
            return isinstance(candidate, Assign) and str(candidate.left) == name
        return predicate

    script_source = soups.body.find_all("script")[-10].contents[0]
    program = es5(script_source)

    info_assignments = [
        found for found in Walker().filter(program, assigns_to("articleInfo"))
    ]
    info_node = info_assignments[0]

    is_content = assigns_to("content")
    content_assignments = [
        prop for prop in info_node.right.properties if is_content(prop)
    ]
    content_node = content_assignments[0]

    quoted = re.search(r"\'(.*)\'.slice", str(content_node.right))
    content_text = quoted.group(1)

    markup = json.loads(html.unescape(content_text))
    content_soup = BeautifulSoup(markup, "html.parser")
    publication_text = "\n".join(content_soup.stripped_strings)
    image_urls = [img["src"] for img in content_soup.find_all("img")]

    snapshot_metadata = {
        "metatags": soups.metatags,
        **soups.metadata,
        "ga-id": parse_ga_id(soups),
    } if False else None  # placeholder removed below; see actual dict literal

    return {
        "version": soups.snapshot.snapshot_at,
        "site_id": soups.snapshot.site_id,
        "canonical_url": soups.snapshot.url,
        "published_at": P.parse_published_at(soups),
        "first_seen_at": soups.snapshot.first_seen_at,
        "last_updated_at": soups.snapshot.last_updated_at,
        "title": title,
        "publication_text": publication_text,
        "author": None,
        "connect_from": None,
        "data": {
            "urls": P.parse_external_links(soups),
            "image_urls": image_urls,
            "hashtags": [],
            "keywords": [],
            "tags": [],
            "metadata": {
                "metatags": soups.metatags,
                **soups.metadata,
                "ga-id": parse_ga_id(soups),
            },
            "comments": [],
        },
    }
Example #7
0
from calmjs.parse import es5
from calmjs.parse.walkers import Walker
from calmjs.parse.asttypes import VarDecl
from calmjs.parse.exceptions import ECMASyntaxError
from calmjs.parse.unparsers import extractor
from typing import Tuple, Any, Callable, Optional, TypeVar, Generic, Union, List
from functools import partial
from urllib.parse import urlparse
import re
from pydantic import ValidationError

# DEBUG- TODO delete
import pdb
from pprint import pprint

walker = Walker()


def extract_yt_initial_data(soup: BeautifulSoup) -> dict:
    """
    Extract the object bound to variable ytInitialData in a script tag

    Only <script> tags with neither a ``src`` nor a ``type`` attribute are
    inspected; each of their children is parsed with ``es5``, and children
    that fail to parse are skipped.

    NOTE(review): the visible part of this function stops after the parse
    step -- ``program`` is never used and nothing is returned, so the
    ``-> dict`` annotation is not honoured here. Presumably the remainder of
    the body is missing; confirm against the full source.
    """
    initial_data = {}
    # src=None / type=None restrict the search to tags lacking those attributes
    for scripts in soup.find_all('script', src=None, type=None):
        for script in scripts.contents:
            try:
                program = es5(script)
            except ECMASyntaxError as e:
                # Not parseable by es5: report on stdout and move on
                # TODO: proper logging
                print('DEBUG: parsing failed, continuing')
                continue
Example #8
0
def main(arg_dir,
         arg_file,
         arg_from_ep,
         arg_to_ep,
         arg_url,
         custom_stdout,
         arg_debug,
         arg_proxy=None):

    try:
        sys.stdout = custom_stdout
        # stderr can test with calmjs error:
        # Don't be confuse, outer es5 use internal es5, both files named es5.py:
        # this file -> CalmParser() -> calmjs.parse.parsers.es5 -> [calmjs\parse\parsers\es5.py]
        # -> self.lexer.build(optimize=lex_optimize, lextab=lextab) -> from calmjs.parse.lexers.es5 import Lexer
        # -> [calmjs\parse\lexers\es5.py] -> class Lexer(object): -> def build(self, **kwargs):  -> ply.lex.lex(object=self, **kwargs)
        # -> [lex.py] -> def lex -> errorlog = PlyLogger(sys.stderr) -> class PlyLogger(object): -> def error(self, msg, *args, **kwargs):
        # -> self.f.write('ERROR: ' + (msg % args) + '\n') # f should means stderr here
        # [UPDATE] disable since useless now (other place change stderr is calmjs CP.parse(script.text) below)
        # Without stderr still able to shows ffmpeg not found traceback on gui log
        # sys.stderr = custom_stdout

        if not arg_url:
            print('main arg_url: ' + repr(arg_url))
            #quit('[!] [e1] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.')
            return quit(
                '[!] [e1] 请用该格式  https://www.duboku.co/voddetail/300.html 的链接。'
            )

        # Should accept these formats:
        # https://www.duboku.net/voddetail/300.html
        # https://www.fanstui.com/voddetail-300.html # Deprecated
        # https://www.fanstui.com/vodplay/300-1-1.html # Deprecated
        # https://www.fanstui.com/vp/529-1-1.html # Deprecated
        # https://tv.newsinportal.com/vodplay/1382-1-3.html
        #VODPLAY_PREFIX = 'https://www.fanstui.com/vodplay/'
        NEWS_VODPLAY_PREFIX = 'vodplay/'
        VODPLAY_PREFIX = 'vodplay/'
        VODDETAIL_PREFIX = 'voddetail/'
        #VP_PREFIX = 'https://www.fanstui.com/vp/'
        VP_PREFIX = 'vp/'
        ORIG_PREFIX = 'voddetail-'

        cinema_url_post = '.html'
        #cinema_url_pre = 'https://www.duboku.net/vodplay/'

        if '://' not in arg_url:
            arg_url = 'https://' + arg_url
        arg_path = '/'.join(arg_url.split('/')[-2:])
        cinema_url_pre = '/'.join(
            arg_url.split('/')[:-2]) + '/' + VODPLAY_PREFIX

        arg_url_m = arg_path.strip(
        )  #.replace('https://www.duboku.net/', 'https://www.fanstui.com/')
        try:
            #if arg_url_m.startswith('https://www.fanstui.com/voddetail-'):
            if arg_url_m.startswith(ORIG_PREFIX):
                #cinema_id = int(arg_url_m.split('https://www.fanstui.com/voddetail-')[1].split('.html')[0])
                cinema_id = int(
                    arg_url_m.split(ORIG_PREFIX)[1].split('.html')[0])
                cinema_id = str(
                    cinema_id)  #set back str after test int() ValueError
                cinema_url_middle = '-1-'
            elif arg_url_m.startswith(NEWS_VODPLAY_PREFIX):
                cinema_id = int(
                    arg_url_m.split(NEWS_VODPLAY_PREFIX)[1].split('-')[0])
                cinema_id = str(cinema_id)
                cinema_url_middle = '-' + arg_url_m.split(
                    NEWS_VODPLAY_PREFIX)[1].split('-')[1] + '-'
            elif arg_url_m.startswith(VODPLAY_PREFIX):
                cinema_id = int(
                    arg_url_m.split(VODPLAY_PREFIX)[1].split('-')[0])
                cinema_id = str(cinema_id)
                cinema_url_middle = '-' + arg_url_m.split(
                    VODPLAY_PREFIX)[1].split('-')[1] + '-'
            elif arg_url_m.startswith(VODDETAIL_PREFIX):
                cinema_id = int(
                    arg_url_m.split(VODDETAIL_PREFIX)[1].split('.')[0])
                cinema_id = str(cinema_id)
                cinema_url_middle = '-1-'
            elif arg_url_m.startswith(VP_PREFIX):
                cinema_id = int(arg_url_m.split(VP_PREFIX)[1].split('-')[0])
                cinema_id = str(cinema_id)
                cinema_url_middle = '-' + arg_url_m.split(VP_PREFIX)[1].split(
                    '-')[1] + '-'
            else:
                #return quit('[!] [e2] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.')
                return quit(
                    '[!] [e2] 请用该格式 https://www.duboku.co/voddetail/300.html 的链接。'
                )
        except ValueError as ve:
            print(ve)
            #return quit('[!] [e3] Please specify cinema url in https://www.fanstui.com/voddetail-300.html. Abort.')
            return quit(
                '[!] [e3] 请用该格式  https://www.duboku.co/voddetail/300.html 的链接。'
            )

        if arg_file:
            if arg_dir:
                return quit('[!] 不能同时使用 -d 和 -f 选项。')

            ep_ts_path = os.path.abspath(arg_file + '.ts')
            ep_mp4_path = os.path.abspath(arg_file + '.mp4')
            arg_to_ep = 2
        else:
            if not arg_to_ep:
                return quit('[!] 请用 `--to-ep N` 选项决定从第 N 集停止下集。')
            if arg_from_ep > arg_to_ep:
                return quit('[!] 从第几集必须小于或等于到第几集。')
            arg_to_ep += 1

            if not arg_dir:
                return quit('[!] 请用 `-d 目录名` 选项。')

            dir_path_m = os.path.abspath(arg_dir)
            if not os.path.isdir(dir_path_m):
                try:
                    os.makedirs(dir_path_m)
                except OSError:
                    return quit('[i] 无法创建目录。或许已有同名文件? ')

        # https://stackoverflow.com/questions/10606133/sending-user-agent-using-requests-library-in-python
        http_headers = {
            'User-Agent': UA
            #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
            #, 'From': '*****@*****.**'  # This is another valid field
        }

        def calm_assign(node):
            #print('$$$$$$$$$$$$$$$$ START')
            #print(type(node)) #can see class xxx(e.g. BinOp) at calmjs/parse/asttypes.py
            #print(node)
            #print('$$$$$$$$$$$$$$$$ M')
            #print(dir(node))
            #print('$$$$$$$$$$$$$$$$ END')
            return isinstance(node, CalmAssign)

        def calm_id(node):
            #print(node)
            #print(type(node))
            #print(isinstance(node, Identifier))
            return isinstance(node, CalmIdentifier)

        def calm_str(node):
            return isinstance(node, CalmString)

        def calm_var(node):
            return isinstance(node, CalmVar)

        '''
        //https://github.com/brix/crypto-js
        //import js in console:
        var imported = document.createElement('script');
        //https://cdnjs.com/libraries/crypto-js
        imported.src = 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.9-1/crypto-js.js';
        document.head.appendChild(imported);

        //rerun the code from webpage:
        var content = "U2FsdGVkX18/ZQ8zQuYIsjIgZkCTVTWoklPND/Bx5tdp3vphNNtxnlzBPeCW2h3OiGbgI17pH/14qF2e8ZsWLpcNeGegDzRonl8dnDwnZKYSOgkPXmSwxArjg1lBPufaSJs8IyTcATJINMrWme/TqSPxxe7CdezlsA35neSw+OjEzx5yUH3mhZY2Jnah+ko2wmIBucCkRUdGbwU8ufsmX4FL+fkKDAIPi+AVmITbzcquMnGHnk/CmibPG9CNOr5joKrdJ1GT2bodPn9vnruvY+j3tNC6D4sdRtLnHAlEUnxlLu0Sr6NczJsgVlrKhsn06ML2Jkcc+ZQ4+fuFeXhEl6isEGjlCAdnrlbSl6SvSxqyjnA2JwBGjUWGs3kIBnaNc+TCNi5Vmxiv3OsSgbQM4NX6SqD66+cqBlM6gqeUjOXDa+7O39GcIsKNo/95hbfTBruDZIIQM1UKVoA7ZfuFN+L9AmuoirrMb24AWxTiHQPGdCxWLzncwdn9Ri7GouiUVGDBuDaiRBKJvR1MgVIBUQ8n/D1HZUsFQJLpA4x2+49ZQ2loovIYU5gkoPZSPGnYHK1iZmMPYLFFxPyHob5QBu1w5wgo5ZtSYS14B3PnT6DY0NLHm5etSgeOM2dvkOY+i/U9q98XLYMd1GAORWt6AdpMFZm+1BVwIxF1JyodpLg57z9eTZSv/I0+FlGsQRGArXga5Xoq6Sj22l1tiGgt5ZDtHFaQeLBMhKWqIdVDyxhsqhtRpxx//EA9b9ZALquYo+6XeEm61RLbyoUqPnYE0ygi1W3Br6EpnimKAxYAoYqv7vIedF2WLOJ9t/mPB594EPkV8PGgha6IOyqLgn8QPqS+pFsuJeRAD9xUCAL9905v9igSC73Q22gXxcTb9m2CEqHDYWrVD528rr7uY/c8PypvvWX35dxNdNiJ3n4Kc6SuL27ncmPyHIyTXrNwdPyvvexIrzD7uJIUFirqoR1JCGGyjks5RLcw/iTTXurV2M9y3mGr3pBAM66bxlglfNugp/Pwg05gr8ik31mqvvxyWw==";
        var bytes =  CryptoJS.AES.decrypt(content, 'ppvod');
        var originalText = bytes.toString(CryptoJS.enc.Utf8);

        //get:
        "var hosts = 'www.duboku.net|tv.zdubo.com|v.zdubo.com|v.wedubo.com|v.duboku.net|www.fanstui.com|www.duboku.tv|localhost';
        var playlist = '[{\"url\":\"/20190923/EEGkg4vm/hls/index.m3u8\"}]';
        playlist = JSON.parse(playlist);
            var danmuenable = 0;
            var magnet = \"\"
            var redirecturl = \"https://v.zdubo.com\";
            var videoid = \"1Rhgp5nzyWuK3P6k\";
            var id = '1Rhgp5nzyWuK3P6k'
            var l = ''
            var r = ''
            var t = '15'
            var d = ''
            var u = ''
            var main = \"/ppvod/H2GPhFCJ\";
            var playertype = 'dplayer'; // dplayer || ckplayer
            
            var mp4 = \"/20190923/EEGkg4vm/mp4/EEGkg4vm.mp4\";
            
            var xml = \"\";		
            
            var pic = \"/20190923/EEGkg4vm/1.jpg\";
                

        $(function () {
            var t = BrowserType();		
            if (t && t.indexOf(\"IE\") >= 0  )
                playertype = \"ckplayer\"
            var order = 0;
                
            init(order);
        })

        # https://u.tudu.site/vodplay/1554-1-38.html (some UA blocked including Chrome, but can use other UA OR use correct referer):
        var player_data={"flag":"play","encrypt":0,"trysee":0,"points":0,"link":"\/vodplay\/1554-1-1.html","link_next":"","link_pre":"\/vodplay\/1554-1-37.html","url":"https:\/\/tv.wedubo.com\/20200901\/69OYAim7\/index.m3u8","url_next":"","from":"videojs-tv.js","server":"no","note":"","id":"1554","sid":1,"nid":38}
        '''

        CP = CalmParser()
        walker = CalmWalker()
        if arg_proxy:
            arg_proxy = arg_proxy.strip()
        if arg_proxy:
            if '://' not in arg_proxy:
                arg_proxy = 'https://' + arg_proxy
            proxies = {
                'https': arg_proxy,
            }
            print('[...] 尝试代理: ' + proxies['https'])
        else:
            proxies = {}
            print('[...] 无代理。')

        for ep in range(arg_from_ep, arg_to_ep):
            url = ''.join([
                cinema_url_pre, cinema_id, cinema_url_middle,
                str(ep), cinema_url_post
            ])  #don't override template cinema_url
            if arg_file:
                print('[...] 尝试 URL: ' + url)
            else:
                print('[当前第{}集] 尝试 URL: {}'.format(ep, url))
            try:

                if arg_debug:
                    #logging.basicConfig(level=logging.DEBUG, format="%(message)s")
                    http.client.HTTPConnection.debuglevel = 1
                    logging.basicConfig(filename='duboku_ep' + str(ep) +
                                        '.log')
                    logging.getLogger().setLevel(logging.DEBUG)
                    requests_log = logging.getLogger(
                        "requests.packages.urllib3")
                    requests_log.setLevel(logging.DEBUG)
                    requests_log.propagate = True

                    with open('duboku_ep' + str(ep) + '.log', 'w') as f:
                        f.write('URL: ' + url + '\n\n')

                try:
                    try:
                        http_headers.pop('referer')
                    except KeyError:
                        pass
                    r = requests.get(url,
                                     allow_redirects=True,
                                     headers=http_headers,
                                     timeout=30,
                                     proxies=proxies)
                except requests.exceptions.ConnectionError:
                    print('\n[!] 你的网络出现问题,也可能是网站的服务器问题。\n', flush=True)
                    continue

                if arg_debug:
                    with open('duboku_ep' + str(ep) + '.log', 'a') as f:
                        f.write(r.text)

            except requests.exceptions.ProxyError as pe:
                print(
                    '[😞] 代理错误。请检查您的代理。确保有端口号(port number), 例如端口1234: http://127.0.0.1:1234\n'
                )
                print(traceback.format_exc())
                break
            soup = BeautifulSoup(r.text, 'html.parser')

            ct_b64 = ''  #reset
            passwd = ''  #reset
            http_headers.update({'referer': url})

            printed_err = False
            got_ep_url = False
            for script in soup.find_all('script'):
                #print(script)
                try:
                    #program = es5(script.text)

                    #PyInstaller has issue to make `ply_dist = working_set.find(Requirement.parse('ply'))` in calmjs\parse\utils.py return non-None
                    #... And causes self.parser.parse in \calmjs\parse\parsers\es5.py no parse method
                    #... bcoz set with unknown pkg name by Parser() constructor/init 's tabmodule=yacctab arg
                    #, so re-assign stderr here to ignore this common warning msg to send to gui log
                    # [UPDATE] disable since useless now.
                    #sys.stderr = sys.__stderr__
                    tree = CP.parse(script.text)
                    #sys.stderr = custom_stdout

                    #print(type(tree)) #<class 'calmjs.parse.factory.ES5Program'>
                    #print(tree)

                    #print('######## START')
                    #print(tree) #text #type is <class 'calmjs.parse.factory.ES5Program'
                    #print(walker.filter(tree, assignment)) #<generator object Walker.filter at 0x7f0b75664360>
                    #print(walker.filter(tree, assignment))

                    #for w in walker.filter(tree, assignment):
                    #    print(w)
                    ep_url = ''  #reset
                    is_vimeo = False
                    vimeo_qd = {}
                    if arg_dir:
                        ep_mp4_path = None
                    for w in walker.filter(tree, calm_id):
                        if w.value == 'player_data':
                            for wa in walker.filter(tree, calm_assign):
                                if wa.left.value == '"url"':  #'' included ""
                                    rv = wa.right.value
                                    ep_url = rv.replace(
                                        '\/', '/').strip('\"').strip('\'')

                                    #episode not exists
                                    if not ep_url.strip():

                                        if not printed_err:
                                            print('[!] 不存在第{}集。'.format(ep))
                                        printed_err = True

                                        continue

                                    try:
                                        if ep_url.split('/')[2].split(
                                                '.')[1].lower() == 'vimeo':
                                            # e.g. https://www.duboku.co/vodplay/1584-1-1.html
                                            # -> https://player.vimeo.com/video/452182074
                                            is_vimeo = True

                                            if arg_debug:
                                                with open(
                                                        'duboku_ep' + str(ep) +
                                                        '.log', 'a') as f:
                                                    f.write(
                                                        '\n\nEP URL of VIMEO: '
                                                        + ep_url + '\n\n')
                                            #print('呼叫 vimeo... ' + repr(ep_url))
                                            r_iframe = requests.get(
                                                ep_url,
                                                allow_redirects=True,
                                                headers=http_headers,
                                                timeout=30,
                                                proxies=proxies)

                                            if arg_debug:
                                                with open(
                                                        'duboku_ep' + str(ep) +
                                                        '.log', 'a') as f:
                                                    f.write(r_iframe.text)
                                            soup_iframe = BeautifulSoup(
                                                r_iframe.text, 'html.parser')
                                            for vimeo_script in soup_iframe.find_all(
                                                    'script'):
                                                tree = es5(vimeo_script.text)
                                                for w in walker.filter(
                                                        tree, calm_var):
                                                    if w.identifier.value == 'config':
                                                        for config_wp in w.initializer.properties:
                                                            try:
                                                                for config_wp2 in config_wp.right.properties:
                                                                    for config_wp3 in config_wp2.right.properties:
                                                                        if 'progressive' != config_wp3.left.value.strip(
                                                                                '"'
                                                                        ).lower(
                                                                        ):
                                                                            continue
                                                                        try:
                                                                            for config_wp4 in config_wp3.right.children(
                                                                            ):
                                                                                next_width_k = ''
                                                                                next_url_v = ''
                                                                                for config_wp5 in config_wp4.properties:
                                                                                    if config_wp5.left.value.strip(
                                                                                            '"'
                                                                                    ).lower(
                                                                                    ) == 'width':
                                                                                        next_width_k = config_wp5.right.value
                                                                                        if next_url_v:
                                                                                            vimeo_qd[int(
                                                                                                next_width_k
                                                                                            )] = next_url_v
                                                                                    elif config_wp5.left.value.strip(
                                                                                            '"'
                                                                                    ).lower(
                                                                                    ) == 'url':
                                                                                        next_url_v = config_wp5.right.value.strip(
                                                                                            '"'
                                                                                        )
                                                                                        if next_width_k:
                                                                                            vimeo_qd[int(
                                                                                                next_width_k
                                                                                            )] = next_url_v
                                                                        except (
                                                                                TypeError,
                                                                                AttributeError
                                                                        ):
                                                                            pass  #print(traceback.format_exc())
                                                            except (TypeError,
                                                                    AttributeError
                                                                    ):
                                                                pass
                                    except IndexError:
                                        print('Split ep url failed: ' +
                                              repr(ep_url))

                                    if is_vimeo:
                                        #print('vimeo 视频质量: ' + repr(vimeo_qd))
                                        if not vimeo_qd:
                                            continue
                                        vimeo_qdk = list(vimeo_qd.keys())
                                        vimeo_qdk.sort(key=int)
                                        ep_url = vimeo_qd[int(vimeo_qdk[-1])]

                                    elif rv.endswith('.m3u8"') or rv.endswith(
                                            ".m3u8'"
                                    ):  #[todo:0] need check ' also ?
                                        pass

                                    else:  #single video normally came here
                                        #print('NEW url type? ' + repr(ep_url))

                                        if arg_debug:
                                            with open(
                                                    'duboku_ep' + str(ep) +
                                                    '.log', 'a') as f:
                                                f.write('\n\nEP URL: ' +
                                                        ep_url + '\n\n')

                                        r_iframe = requests.get(
                                            ep_url,
                                            allow_redirects=True,
                                            headers=http_headers,
                                            timeout=30,
                                            proxies=proxies)

                                        if arg_debug:
                                            with open(
                                                    'duboku_ep' + str(ep) +
                                                    '.log', 'a') as f:
                                                f.write(r_iframe.text)

                                        soup_iframe = BeautifulSoup(
                                            r_iframe.text, 'html.parser')
                                        decrypted_final_js = None
                                        for script_iframe in soup_iframe.find_all(
                                                'script'):
                                            tree_iframe = CalmParser().parse(
                                                script_iframe.text.strip())
                                            for decrypt_js in walker.filter(
                                                    tree_iframe, calm_var):
                                                if decrypt_js.identifier.value == 'content':
                                                    ct_b64 = decrypt_js.initializer.value
                                                elif decrypt_js.identifier.value == 'bytes':
                                                    get_passwd = False
                                                    for decrypt_i, decrypt_js_c in enumerate(
                                                            decrypt_js.
                                                            initializer.
                                                            children()):
                                                        if get_passwd:
                                                            #(content, 'ppvod')
                                                            for dci, dc in enumerate(
                                                                    decrypt_js_c
                                                                    .children(
                                                                    )):
                                                                if dci == 1 and isinstance(
                                                                        dc.
                                                                        value,
                                                                        str):
                                                                    passwd = dc.value[
                                                                        1:
                                                                        -1]  #exclude ''
                                                        if decrypt_js_c.__str__(
                                                        ) == 'CryptoJS.AES.decrypt':
                                                            #CryptoJS.AES.decrypt
                                                            get_passwd = True
                                                elif decrypt_js.identifier.value == 'playlist':
                                                    decrypted_final_js = tree_iframe

                                        if ct_b64:
                                            print('ct b64 data: ' +
                                                  repr(ct_b64))
                                            print('passwd: ' + repr(passwd))
                                            decrypted_final_content = crypto_py_aes_main(
                                                ct_b64, passwd)
                                            decrypted_final_js = CalmParser(
                                            ).parse(decrypted_final_content.
                                                    decode())
                                        #else: # No nid decrypt, direct use plain `decrypted_final_js = tree_iframe` above
                                        m3u8_path_incomplete = ''  #reset
                                        m3u8_host_incomplete = ''

                                        for decrypted_final_var in walker.filter(
                                                decrypted_final_js, calm_var):
                                            if decrypted_final_var.identifier.value == 'playlist':
                                                decrypted_m3u8_path = decrypted_final_var.initializer.value[
                                                    1:-1]  # exclude ''
                                                if "'" in decrypted_m3u8_path:
                                                    dot_type = "'"
                                                elif '"' in decrypted_m3u8_path:
                                                    dot_type = '"'
                                                else:
                                                    continue
                                                for path_part in decrypted_m3u8_path.split(
                                                        dot_type):
                                                    if path_part.endswith(
                                                            '.m3u8'):
                                                        m3u8_path_incomplete = path_part

                                            elif decrypted_final_var.identifier.value == 'redirecturl':
                                                m3u8_host_incomplete = decrypted_final_var.initializer.value[
                                                    1:-1]  #exclude ""

                                        if not m3u8_host_incomplete.endswith(
                                                '/'
                                        ) and not m3u8_path_incomplete.startswith(
                                                '/'):
                                            ep_url = m3u8_host_incomplete + '/' + m3u8_path_incomplete
                                        else:
                                            ep_url = m3u8_host_incomplete + m3u8_path_incomplete

                                    if arg_dir:
                                        ep_filename = os.path.basename(''.join(
                                            ['第', str(ep), '集']))
                                        ep_ts_path = os.path.join(
                                            dir_path_m, ''.join([
                                                os.path.basename(ep_filename) +
                                                '.ts'
                                            ]))
                                        ep_mp4_path = os.path.join(
                                            dir_path_m, ''.join([
                                                os.path.basename(ep_filename),
                                                '.mp4'
                                            ]))

                                if ep_url:
                                    break
                        if ep_url:
                            break

                    if ep_url and ep_mp4_path:
                        got_ep_url = True
                        print('下载的 url: ' + ep_url)
                        if not is_vimeo:
                            print('下载的 ts 路径: ' + ep_ts_path)
                        print('下载的 mp4 路径: ' + ep_mp4_path)

                        if arg_debug:
                            with open('duboku_ep' + str(ep) + '.log',
                                      'a') as f:
                                f.write('\n\n下载的 url: ' + ep_url)
                                if not is_vimeo:
                                    f.write('\n下载的 ts 路径: ' + ep_ts_path)
                                f.write('\n下载的 mp4 路径: ' + ep_mp4_path +
                                        '\n\n')

                        if is_vimeo:
                            r = requests.get(ep_url,
                                             allow_redirects=True,
                                             headers=http_headers,
                                             timeout=30,
                                             proxies=proxies,
                                             stream=True)
                            chunk_size = 1024  # 1 MB
                            file_size = int(r.headers['Content-Length'])
                            num_bars = 0  #int(file_size / chunk_size)
                            with open(ep_mp4_path, 'wb') as fp:
                                for chunk in tqdm.tqdm(
                                        r.iter_content(chunk_size=chunk_size),
                                        total=num_bars,
                                        position=0,
                                        mininterval=5,
                                        unit='KB',
                                        desc=ep_mp4_path,
                                        leave=True,
                                        file=sys.stdout):
                                    fp.write(chunk)
                        else:

                            r = requests.get(ep_url,
                                             allow_redirects=True,
                                             headers=http_headers,
                                             timeout=30,
                                             proxies=proxies)

                            if arg_debug:
                                with open('duboku_ep' + str(ep) + '.log',
                                          'a') as f:
                                    f.write('r: ' + r.text)

                            # Disable `if` condition line below, if want to test convert .ts without re-download
                            if m3u8_decryptopr_main(r.text,
                                                    ep_ts_path,
                                                    ep_url,
                                                    http_headers,
                                                    arg_debug,
                                                    'duboku_ep' + str(ep) +
                                                    '.log',
                                                    proxies=proxies):
                                remux_ts_to_mp4(ep_ts_path, ep_mp4_path)

                        #source_url = "https://tv2.xboku.com/20191126/wNiFeUIj/index.m3u8"
                        #https://stackoverflow.com/questions/52736897/custom-user-agent-in-youtube-dl-python-script
                        #youtube_dl.utils.std_headers['User-Agent'] = UA
                        #try: # This one shouldn't pass .mp4 ep_path
                        #    youtube_dl.YoutubeDL(params={'-c': '', '-q': '', '--no-mtime': '',
                        #                                 'outtmpl': ep_path + '.%(ext)s'}).download([ep_url])
                        #except youtube_dl.utils.DownloadError:
                        #    print(traceback.format_exc())
                        #    print(
                        #        'Possible reason is filename too long. Please retry with -s <maximum filename size>.')
                        #    sys.exit()

                        break
                    #print(walker.extract(tree, assignment))

                    #print('######## END')
                except calmjs.parse.exceptions.ECMASyntaxError as ee:
                    pass  #here is normal
                    #print('ex')
                    #print(traceback.format_exc())
                except Exception:
                    #Need to catch & print exception explicitly to pass to duboku_gui to show err log
                    print(traceback.format_exc())
                    try:
                        print('[😞]')
                    except UnicodeEncodeError:
                        print('[!] 失败。')

            if not got_ep_url:
                if not printed_err:
                    if arg_file:
                        print('[!] 不存在该部影片。')
                    else:
                        print('[!] 不存在第{}集。'.format(ep))

    except Exception:
        try:
            print(traceback.format_exc())
        except UnicodeEncodeError:
            print('[!] 出现错误。')

    try:
        print('[😄] 全部下载工作完毕。您已可以关闭窗口, 或下载别的视频。')
    except UnicodeEncodeError:
        print('[*] 全部下载工作完毕。您已可以关闭窗口, 或下载别的视频。')
    '''
Example #9
0
class TestContentSorter(unittest.TestCase):
    def setUp(self):
        """
        Build the fixtures shared by every test.

        Creates the ``WebDriver`` first, then two independent ``ScriptSorter``
        instances: ``test_sorter`` (the one exercised by the tests) and
        ``res_sorter`` (used by the tests to hold the expected state).
        """
        self.driver = WebDriver()
        self.test_sorter, self.res_sorter = ScriptSorter(), ScriptSorter()

    def tearDown(self):
        """
        Close the proxy and the driver held by ``self.driver``.

        The proxy is closed first, then the driver. The driver is closed even
        when closing the proxy raises, so a failing proxy shutdown cannot leave
        the driver open; the original exception still propagates afterwards.
        """
        try:
            self.driver.proxy.close()
        finally:
            # Always runs, whether or not the proxy shutdown succeeded.
            self.driver.driver.close()

    def getting_sorter_resource(self, url):
        """
        Fetch a page through the driver and parse it with BeautifulSoup.

        :param url: url of the desired resource
        :return: tuple ``(html, soup)``; ``html`` is the first element of
            ``self.driver.parse_page(url)`` and ``soup`` is that html parsed
            with the ``html.parser`` backend
        """
        page = self.driver.parse_page(url)
        html = page[0]
        return html, BeautifulSoup(html, 'html.parser')

    def gen_js_style_nodes(self, node_type, styles):
        """
        Collect the CSS at-rules of a given kind from a list of style tags.

        :param node_type: lower-cased at-keyword to keep (compared against
            ``lower_at_keyword`` of each at-rule)
        :param styles: iterable of objects exposing the stylesheet source as
            ``.text``
        :return: list of matching ``tinycss2.ast.AtRule`` nodes, in input order
        """
        matches = []
        for sheet in styles:
            # Parse each stylesheet and keep only the requested at-rules.
            rules = tinycss2.parse_rule_list(sheet.text)
            matches.extend(
                rule for rule in rules
                if isinstance(rule, tinycss2.ast.AtRule)
                and rule.lower_at_keyword == node_type)
        return matches

    def gen_js_script_nodes(self, node_type, scripts):
        """
        Collect every AST node of ``node_type`` found in the given scripts.

        :param node_type: class (or tuple of classes) handed to ``isinstance``
        :param scripts: iterable of JavaScript sources, each parsed with ``es5``
        :return: list of matching nodes, script by script, in the order
            ``Walker.filter`` yields them
        """
        def is_wanted(candidate):
            return isinstance(candidate, node_type)

        matches = []
        for source in scripts:
            parsed = es5(source)
            # A fresh Walker per script, as in the original implementation.
            matches.extend(Walker().filter(parsed, is_wanted))
        return matches

    def test_resolve_nested_sources(self):
        """
        Sort the ``nested_variable`` page and check the resolved mappings.

        Compares the sorter's ``nested_source`` and ``variable`` attributes
        with the expected dictionaries after ``sort_content`` has run.
        :return:
        """
        url = 'http://localhost:4000/nested_variable'
        resource = self.getting_sorter_resource(url)
        html = resource[0]
        self.test_sorter.report_generator = ReportGenerator()

        res_nested = {'n2': 'connect-src', 'n7': 'connect-src'}
        res_variable = {
            'n1': 'http://localhost:4000/nest_connect_xmlhttp',
            'n2': 'n1',
            'n5': 'http://localhost:4000/nest_connect_socket',
            'n6': 'n5',
            'n7': 'n6',
            'n8': 'n7'
        }

        self.test_sorter.sort_content(url=url, html=html)

        # Two unittest assertions instead of one bare `assert a and b`: they
        # are not stripped under `python -O`, they report a diff on failure,
        # and they show which of the two mappings is wrong.
        self.assertEqual(res_nested, self.test_sorter.nested_source)
        self.assertEqual(res_variable, self.test_sorter.variable)

    def test_parse_sources(self):
        """
        Only the flags raised by ``parse_sources`` are tested here; the rest
        of its behaviour is covered by the following tests.

        An inline ``<style>`` and an inline ``<script>`` are expected to raise
        one ``inline_style`` and one ``inline_script`` flag respectively.
        :return:
        """
        script = u"""
        <style>
            h1 {color:red;}
            p {color:blue;}
        </style>
        
        <script>
            alert('test'); 
        </script>
        """
        soup = BeautifulSoup(script, 'html.parser')
        script_tag = soup.find_all('script')
        style_tag = soup.find_all('style')

        # Setting up res_sorter generators and flags (the expected state)
        report_generator = ReportGenerator()
        report_generator.flags.append(Flag('inline_script', script_tag[0]))
        report_generator.flags.append(Flag('inline_style', style_tag[0]))
        self.res_sorter.report_generator = report_generator
        # Setting report generator for test sorter
        self.test_sorter.report_generator = ReportGenerator()

        self.test_sorter.parse_sources(script)

        # unittest assertion instead of a bare `assert`: it is not stripped
        # under `python -O` and prints both flag lists when they differ.
        self.assertEqual(self.test_sorter.report_generator.flags,
                         self.res_sorter.report_generator.flags)

    def test_get_node_instruction(self):
        case = []
        script = u"""
            navigator.sendBeacon(dummyArg);
            navigator.serviceWorker.register(dummyArg);
            very.long.DotAcessor.dummy.serviceWorker.register(dummyArg);
            var test = funct_in_a_var(dummyArg);
            test = xhr.open(dummyArg);
            test = send(dummyArg);
        """
        program = es5(script)
        walker = Walker()
        nodes = []
        for node in walker.filter(
                program, lambda node: (isinstance(node, FunctionCall))):
            nodes.append(node)
            print(type(node))

        case.append(
            ('sendBeacon', self.test_sorter.get_node_instruction(nodes[0])))
        case.append(
            ('register', self.test_sorter.get_node_instruction(nodes[1])))
        case.append(
            ('register', self.test_sorter.get_node_instruction(nodes[2])))
        case.append(('funct_in_a_var',
                     self.test_sorter.get_node_instruction(nodes[3])))
        case.append(('open', self.test_sorter.get_node_instruction(nodes[4])))
        case.append(('send', self.test_sorter.get_node_instruction(nodes[5])))

        for test_case in case:
            with self.subTest(case=test_case):
Example #10
0
def measure(user_dir, task_id, length, start, end, status_queue,
            process_index):
    """
    Collect the 'funcs' infos whose two function sources are textually equal.

    Reads every file in ``<user_dir>_analysis`` whose name ends with
    ``-category2target2type2script2infos.json`` and walks the nested mapping
    category -> target -> type -> script -> infos, keeping only type
    ``'funcs'``. For each info:

    * if it has at least 15 entries, ``info[2]`` and ``info[12]`` are compared
      directly;
    * otherwise the two script files named by ``info[3]``/``info[7]`` and the
      script ids ``info[0]``/``info[4]`` are read from
      ``raw_data_dir/iso_<rank % num_instances>_logs``, parsed with ``es5``,
      and the ``FuncDecl`` whose identifier equals ``target`` is taken from
      each.

    Two sources count as equal when they match after all whitespace and all
    ``;`` characters are removed. Matching infos are stored in
    ``cat2rank2target2infos[category][rank][target]``. When at least one match
    was found the mapping is written as JSON to
    ``<user_dir>-duplicate_cat2rank2target2infos.json`` and that path is
    printed.

    :param user_dir: path prefix used for both the input directory and the
        output file name
    :param task_id: zero-based task number, used only in status messages
    :param length: total number of tasks, used only in status messages
    :param start: not used in this function body
    :param end: not used in this function body
    :param status_queue: object with a ``put`` method; receives
        ``[process_index, status]`` lists
    :param process_index: index of this worker, used in status messages
    :return: None
    """
    global processed_data_dir, conflicting_rank_set, rank2url, func_dir, raw_data_dir

    current_pid = os.getpid()
    current_dir = os.getcwd()
    # Number of infos recorded as duplicates; gates the output file below.
    cnt = 0
    try:
        status = 'Process %-4d task %d/%d PID [%d] starting ...' % (
            process_index, task_id + 1, length, current_pid)
        status_queue.put([process_index, status])
        #print(status)

        # NOTE(review): same two values were already computed just above.
        current_pid = os.getpid()
        current_dir = os.getcwd()

        input_dir = user_dir + '_analysis'
        files = os.listdir(input_dir)
        #files = [f for f in files if f.endswith('-category2target2type2script2infos.json')]
        files = [
            f for f in files
            if f.endswith('-category2target2type2script2infos.json')
        ]  # and not f.endswith('-used-category2type2target2infos.json')]
        cat2rank2target2infos = dict()
        #print(files)
        for f in files:
            try:
                # Everything before the first '.' of the file name.
                rank = f.split('.')[0]
                input_file = os.path.join(input_dir, f)
                with open(input_file, 'r') as input_f:
                    category2target2type2script2infos = json.loads(
                        input_f.read())
                    for category, target2type2script2infos in category2target2type2script2infos.items(
                    ):
                        for target, type2script2infos in target2type2script2infos.items(
                        ):
                            for type_, script2infos in type2script2infos.items(
                            ):
                                # Only function entries are of interest here.
                                if type_ != 'funcs':
                                    continue
                                for script, infos in script2infos.items():
                                    for info in infos:
                                        if len(info) >= 15:
                                            # Long infos carry both function
                                            # sources inline at 2 and 12.
                                            func_str_one = info[2].strip()
                                            func_str_two = info[12].strip()
                                            # Normalise: drop all whitespace
                                            # and every ';' before comparing.
                                            clean_source_one = ''.join(
                                                func_str_one.strip().split(
                                                )).replace(';', '')
                                            clean_source_two = ''.join(
                                                func_str_two.strip().split(
                                                )).replace(';', '')
                                            if clean_source_one == clean_source_two:
                                                if category not in cat2rank2target2infos:
                                                    cat2rank2target2infos[
                                                        category] = dict()
                                                if rank not in cat2rank2target2infos[
                                                        category]:
                                                    cat2rank2target2infos[
                                                        category][rank] = dict(
                                                        )
                                                if target not in cat2rank2target2infos[
                                                        category][rank]:
                                                    cat2rank2target2infos[
                                                        category][rank][
                                                            target] = list()
                                                cnt += 1
                                                cat2rank2target2infos[
                                                    category][rank][
                                                        target].append(info)

                                        else:
                                            # Short infos only reference the
                                            # scripts: rebuild each script file
                                            # name from the recorded file name
                                            # and the script id.
                                            script_id_one = info[0]
                                            if '.func' in info[3]:
                                                source_file_one = info[
                                                    3].replace(
                                                        '.func', '.' +
                                                        str(script_id_one) +
                                                        '.script')
                                            else:
                                                source_file_one = info[
                                                    3].replace(
                                                        '-functions.json',
                                                        '.' +
                                                        str(script_id_one) +
                                                        '.script')

                                            script_id_two = info[4]
                                            if '.func' in info[7]:
                                                source_file_two = info[
                                                    7].replace(
                                                        '.func', '.' +
                                                        str(script_id_two) +
                                                        '.script')
                                            else:
                                                source_file_two = info[
                                                    7].replace(
                                                        '-functions.json',
                                                        '.' +
                                                        str(script_id_two) +
                                                        '.script')

                                            # NOTE(review): this rebinds `rank`
                                            # for the rest of the file, so
                                            # later infos (including the
                                            # len(info) >= 15 branch) use the
                                            # shortened key - confirm intended.
                                            rank = rank.split('-')[0]
                                            # `num_instances` is not assigned
                                            # in this function; presumably a
                                            # module-level value - verify.
                                            script_dir = 'iso_' + str(
                                                int(rank) %
                                                num_instances) + '_logs'
                                            script_dir = os.path.join(
                                                raw_data_dir, script_dir)

                                            try:
                                                source_file_one = os.path.join(
                                                    script_dir,
                                                    source_file_one)
                                                with open(
                                                        source_file_one,
                                                        'r') as input_f:
                                                    source_one = input_f.read()

                                                source_file_two = os.path.join(
                                                    script_dir,
                                                    source_file_two)
                                                with open(
                                                        source_file_two,
                                                        'r') as input_f:
                                                    source_two = input_f.read()
                                            except IOError as e:
                                                # Unreadable script file: skip
                                                # this info silently.
                                                #print(e)
                                                continue
                                            # NOTE(review): `unicode` is a
                                            # Python 2 builtin; unless it is
                                            # aliased elsewhere, Python 3
                                            # raises NameError here and the
                                            # broad except below skips the
                                            # info silently.
                                            try:
                                                program_one = es5(
                                                    unicode(source_one))
                                            except Exception as e:
                                                # Any parse failure skips this
                                                # info.
                                                #print(e)
                                                continue
                                            function_source_one = None
                                            walker = Walker()
                                            # First function declaration named
                                            # like `target` in the first script.
                                            for node in walker.filter(
                                                    program_one, lambda node:
                                                (isinstance(node, FuncDecl))):
                                                if str(node.identifier) == str(
                                                        target):
                                                    function_source_one = str(
                                                        node)
                                                    break

                                            try:
                                                program_two = es5(
                                                    unicode(source_two))
                                            except Exception as e:
                                                #print(e)
                                                continue
                                            function_source_two = None
                                            walker = Walker()
                                            # Same lookup in the second script.
                                            for node in walker.filter(
                                                    program_two, lambda node:
                                                (isinstance(node, FuncDecl))):
                                                if str(node.identifier) == str(
                                                        target):
                                                    function_source_two = str(
                                                        node)
                                                    break

                                            # Compare only when the function
                                            # was found in both scripts.
                                            if function_source_one is not None and function_source_two is not None:
                                                #print(rank, program_one, program_two)
                                                #print(rank)
                                                clean_source_one = ''.join(
                                                    function_source_one.strip(
                                                    ).split()).replace(
                                                        ';', '')
                                                clean_source_two = ''.join(
                                                    function_source_two.strip(
                                                    ).split()).replace(
                                                        ';', '')
                                                if clean_source_one == clean_source_two:
                                                    if category not in cat2rank2target2infos:
                                                        cat2rank2target2infos[
                                                            category] = dict()
                                                    if rank not in cat2rank2target2infos[
                                                            category]:
                                                        cat2rank2target2infos[
                                                            category][
                                                                rank] = dict()
                                                    if target not in cat2rank2target2infos[
                                                            category][rank]:
                                                        cat2rank2target2infos[
                                                            category][rank][
                                                                target] = list(
                                                                )
                                                    cnt += 1
                                                    cat2rank2target2infos[
                                                        category][rank][
                                                            target].append(
                                                                info)

            except Exception as e:
                # A failure in one input file is printed and the loop moves on
                # to the next file.
                print(e)
                pass

    except OSError as e:
        # OSError raised outside the per-file handler (e.g. by os.listdir) is
        # ignored silently; no status message is sent for it.
        pass
    except Exception as e:
        status = 'Process %-4d task %s/%s raised an exception %s.' % (
            process_index, task_id + 1, length, type(e))
        status_queue.put([process_index, status])
        string = '%s\t%s' % (getlocaltime(), status)
        try:
            print(string)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value,
                                               exc_traceback)
            print(''.join('!! ' + line for line in lines))
            sys.stdout.flush()
        except Exception:
            pass

    # Sent on every path, including after an exception was handled above.
    status = 'Process %-4d task %s/%s PID [%d] completed.' % (
        process_index, task_id + 1, length, current_pid)
    status_queue.put([process_index, status])

    # cnt > 0 implies cat2rank2target2infos was created inside the try block.
    if cnt > 0:
        output_file = '%s-duplicate_cat2rank2target2infos.json' % (user_dir)
        with open(output_file, 'w') as output_f:
            output_f.write(json.dumps(cat2rank2target2infos))
        print(output_file)