Ejemplo n.º 1
0
    def test_gen_find_files(self):
        """gen_find_files should recurse into subdirectories and honor globs."""
        expected_regex = set(['files/regexp2.txt',
                              'files/regexp.txt',
                              'files/sub/regexp.txt'])
        self.assertSetEqual(set(fu.gen_find_files('regexp*.txt', 'files')),
                            expected_regex)

        expected_logs = set(['files/hertrxcrawl.log'])
        self.assertSetEqual(set(fu.gen_find_files('*.log', 'files')),
                            expected_logs)

        # should match .as and .flr files
        matches = list(fu.gen_find_files(swfu.AS_SOURCE_FILE_PATTERN, 'files'))
        if not matches or 'TweenPlugin.as' not in matches[0]:
            self.fail("Couldn't match .as file in directory")
Ejemplo n.º 2
0
def generate_index_file(path):
    """Build an index.html under `path` ranking domains by font loads.

    Reads every *.json domain-info dump found under `path`, keeps domains
    that loaded more than FONT_LOAD_THRESHOLD fonts or were flagged as
    fingerprinting (fp_detected), and writes an HTML table sorted by font
    count (descending) to `path`/index.html.
    """
    fonts_dict = {}
    for i, json_file in enumerate(fu.gen_find_files("*.json", path), start=1):
        wl_log.info("%s - %s" % (i, json_file))
        domain_info = load_domainfo_from_json_file(json_file)
        if domain_info.num_font_loads > FONT_LOAD_THRESHOLD or domain_info.fp_detected:
            fonts_dict[domain_info.log_filename] = domain_info.num_font_loads

    # most font-hungry domains first; items() (not iteritems) also works on Py3
    sorted_font_dict = sorted(fonts_dict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)

    # collect rows in a list and join once instead of quadratic string +=
    rows = ['<table><th>Rank</th><th>Domain</th><th># fonts requested</th>']
    for filename, num_font_loaded in sorted_font_dict:
        rank, domain = get_rank_domain_from_filename(filename)
        # detail page written elsewhere: same basename, .html extension
        output_filename = os.path.basename(filename)[:-4] + ".html"
        rows.append('<tr><td>' + rank + '</td><td><a href="' + output_filename
                    + '">' + domain + '</a></td><td>' + str(num_font_loaded)
                    + '</td></tr>')
    rows.append('</table>')
    table_str = ''.join(rows)

    # FIX: the charset <meta> tag was emitted twice (copy-paste); once is enough
    html_str = ("<html><head><meta http-equiv='Content-Type' "
                "content='text/html; charset=utf-8' /> </head><body>"
                + table_str + "</body></html>")
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
Ejemplo n.º 3
0
 def test_gen_find_files(self):
     """Recursive pattern matching over the 'files' test tree."""
     found = set(fu.gen_find_files('regexp*.txt', 'files'))
     self.assertSetEqual(found, set(['files/regexp2.txt',
                                     'files/regexp.txt',
                                     'files/sub/regexp.txt']))

     found = set(fu.gen_find_files('*.log', 'files'))
     self.assertSetEqual(found, set(['files/hertrxcrawl.log']))

     # .as and .flr sources should match the SWF source pattern
     matched = list(fu.gen_find_files(swfu.AS_SOURCE_FILE_PATTERN, 'files'))
     if not matched or 'TweenPlugin.as' not in matched[0]:
         self.fail("Couldn't match .as file in directory")
Ejemplo n.º 4
0
def generate_index_file(path):
    """Write an index.html under `path` listing font-heavy domains.

    Scans the *.json dumps under `path`, keeps domains over the
    FONT_LOAD_THRESHOLD (or flagged fp_detected) and renders an HTML
    table sorted by descending font-load count.
    """
    fonts_dict = {}
    for count, json_path in enumerate(fu.gen_find_files("*.json", path), 1):
        wl_log.info("%s - %s" % (count, json_path))
        info = load_domainfo_from_json_file(json_path)
        if info.num_font_loads > FONT_LOAD_THRESHOLD or info.fp_detected:
            fonts_dict[info.log_filename] = info.num_font_loads

    # highest font-load counts first
    ranked = sorted(fonts_dict.iteritems(),
                    key=operator.itemgetter(1),
                    reverse=True)

    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    for filename, num_font_loaded in ranked:
        rank, domain = get_rank_domain_from_filename(filename)
        # per-domain detail page: same basename with .html extension
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += ('<tr><td>%s</td><td><a href="%s">%s</a></td><td>%s</td></tr>'
                      % (rank, output_filename, domain, num_font_loaded))
    table_str += '</table>'

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\
            <meta http-equiv='Content-Type' content='text/html; charset=utf-8' /> </head><body>" + table_str + "</body></html>"
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
Ejemplo n.º 5
0
    def test_gen_cat(self):
        """gen_cat should flatten a sequence of iterables in order."""
        self.assertListEqual(list(fu.gen_cat(([1, 2], [3, 4]))),
                             [1, 2, 3, 4])

        # heterogeneous element types pass through untouched
        self.assertListEqual(list(fu.gen_cat((['1', '2'], [3]))),
                             ['1', '2', 3])

        # concat'ing files
        self.failUnless(len(list(fu.gen_find_files('*.as', 'files/fp'))) > 1)
Ejemplo n.º 6
0
 def test_gen_cat(self):
     """Concatenating iterables with gen_cat preserves order and types."""
     result = list(fu.gen_cat(([1, 2], [3, 4])))
     self.assertListEqual(result, [1, 2, 3, 4])

     # mixed input: strings and ints stay as-is
     mixed = list(fu.gen_cat((['1', '2'], [3])))
     self.assertListEqual(mixed, ['1', '2', 3])

     # concat'ing files
     as_sources = fu.gen_find_files('*.as', 'files/fp')
     self.failUnless(len(list(as_sources)) > 1)
Ejemplo n.º 7
0
def crawl_site_for_font_probing(url, agent_config, job_config=None):
    """Visit a site for font probing and return results."""
    agent = HeadlessAgent()
    job = CrawlJob(agent)
    agent.setOptions(agent_config)

    # fall back to a single-URL default job when no config (or a falsy
    # one) was supplied
    if not job_config:
        job_config = {'desc': 'Crawl for font probing detection',
                      'max_parallel_procs': MAX_PARALLEL_PROCESSES,
                      'urls': (url,),
                      'num_crawl_urls': 1}
    job.setOptions(job_config)
    run_crawl(job)

    # the first domain-info dump found under the job dir is the result
    for json_path in fu.gen_find_files("*.json", job.job_dir):
        return lp.load_domainfo_from_json_file(json_path)

    return None  # no json file can be found
Ejemplo n.º 8
0
def crawl_site_for_font_probing(url, agent_config, job_config=None):
    """Visit a site for font probing and return results."""
    headless = HeadlessAgent()
    crawl_job = CrawlJob(headless)
    headless.setOptions(agent_config)

    default_cfg = {
        'desc': 'Crawl for font probing detection',
        'max_parallel_procs': MAX_PARALLEL_PROCESSES,
        'urls': (url, ),
        'num_crawl_urls': 1
    }
    crawl_job.setOptions(job_config or default_cfg)
    run_crawl(crawl_job)

    # return the result parsed from the first json dump in the job dir
    for result_file in fu.gen_find_files("*.json", crawl_job.job_dir):
        return lp.load_domainfo_from_json_file(result_file)

    return None  # no json file can be found
Ejemplo n.º 9
0
def gen_decompile_swf(swf_path, out_dir=''):
    """Decompile a Flash file with the available decompilers.

    Yields the paths of the ActionScript source files extracted from
    `swf_path`. If `out_dir` is empty, a random '<rand>-src' directory is
    created next to the SWF file. Yields nothing when `swf_path` does not
    exist.
    """
    import pipes  # stdlib; shell-quoting for untrusted crawled paths

    if not os.path.isfile(swf_path):
        return

    if not out_dir:
        base_dir = os.path.dirname(swf_path)
        out_dir = os.path.join(base_dir, ut.rand_str() + '-src')

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # FIX: quote the paths properly; the previous hand-rolled double quotes
    # broke (and allowed shell injection) on filenames containing quotes or
    # other shell metacharacters - these paths come from crawled content
    cmd_ffdec = ('java -jar ' + FFDEC_PATH + ' -export as '
                 + pipes.quote(out_dir) + ' ' + pipes.quote(swf_path))
    timeout_prefix = 'timeout -k 5 30 '  # linux specific

    ffdec_status, ffdec_output = cmds.getstatusoutput(timeout_prefix + cmd_ffdec)
    write_decomp_log(swf_path, ffdec_status, ffdec_output, '', '')

    for flash_src_file in fu.gen_find_files(AS_SOURCE_FILE_PATTERN, out_dir):
        yield flash_src_file
Ejemplo n.º 10
0
def gen_decompile_swf(swf_path, out_dir=''):
    """Decompile a Flash file with the available decompilers."""
    if not os.path.isfile(swf_path):
        return

    if not out_dir:
        # default: a random "-src" folder next to the swf file
        out_dir = os.path.join(os.path.dirname(swf_path),
                               ut.rand_str() + '-src')

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    decompile_cmd = ('java -jar ' + FFDEC_PATH + ' -export as "' + out_dir
                     + '" "' + swf_path + '"')
    kill_after = 'timeout -k 5 30 '  # linux specific

    status, output = cmds.getstatusoutput(kill_after + decompile_cmd)
    write_decomp_log(swf_path, status, output, '', '')

    # yield whatever ActionScript sources the decompiler produced
    for src_file in fu.gen_find_files(AS_SOURCE_FILE_PATTERN, out_dir):
        yield src_file
Ejemplo n.º 11
0
def gen_find_swf_files(top_dir):
    """Find Flash files under a given directory."""
    # match every file, then check content with the Linux `file` command,
    # since the extension can't be relied on
    for candidate in fu.gen_find_files('*', top_dir):
        if is_swf_file(candidate):
            yield candidate
Ejemplo n.º 12
0
def parse_crawl_logs(path, no_of_procs=16):
    """Parse every *.txt crawl log under `path` with parallel workers."""
    worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    log_files = fu.gen_find_files("*.txt", path)
    parallelize.run_in_parallel(log_files, worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")
Ejemplo n.º 13
0
def gen_find_swf_files(top_dir):
    """Find Flash files under a given directory."""
    every_file = fu.gen_find_files('*', top_dir)  # don't rely on file extension
    for name in every_file:
        if is_swf_file(name):  # test with Linux file command
            yield name
Ejemplo n.º 14
0
def parse_crawl_logs(path, no_of_procs=16):
    """Fan the *.txt crawl logs under `path` out to parallel parser workers."""
    txt_files = fu.gen_find_files("*.txt", path)
    parallelize.run_in_parallel(
        txt_files,
        partial(parse_crawl_log, dump_fun=dump_json_and_html),
        no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")