def test_gen_find_files(self):
    """gen_find_files should match patterns recursively under a directory."""
    # A wildcard pattern must pick up matches at every depth.
    found = set(fu.gen_find_files('regexp*.txt', 'files'))
    expected_regex = set(['files/regexp2.txt',
                          'files/regexp.txt',
                          'files/sub/regexp.txt'])
    self.assertSetEqual(found, expected_regex)

    found = set(fu.gen_find_files('*.log', 'files'))
    self.assertSetEqual(found, set(['files/hertrxcrawl.log']))

    # should match .as and .flr files
    sources = list(fu.gen_find_files(swfu.AS_SOURCE_FILE_PATTERN, 'files'))
    if not sources or 'TweenPlugin.as' not in sources[0]:
        self.fail("Couldn't match .as file in directory")
def generate_index_file(path):
    """Write an index.html under *path* listing suspicious crawled domains.

    Scans *path* for per-domain JSON result files, keeps the domains that
    loaded more fonts than FONT_LOAD_THRESHOLD or where fingerprinting was
    detected, and writes an HTML table sorted by font-load count
    (descending) linking to each per-domain report page.
    """
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i += 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        # Keep only suspicious domains: heavy font loaders or detected FPs.
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads
    # FIX: dict.iteritems() is Python 2 only; items() works on 2 and 3.
    sorted_font_dict = sorted(fonts_dict.items(),
                              key=operator.itemgetter(1), reverse=True)
    for filename, num_font_loaded in sorted_font_dict:
        rank, domain = get_rank_domain_from_filename(filename)
        # Per-domain report pages are written as <logname>.html next to index.
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += ('<tr><td>' + rank + '</td><td><a href="' +
                      output_filename + '">' + domain + '</a></td><td>' +
                      str(num_font_loaded) + '</td></tr>')
    table_str += '</table>'
    # FIX: the original emitted the charset <meta> tag twice; once suffices.
    html_str = ("<html><head><meta http-equiv='Content-Type' "
                "content='text/html; charset=utf-8' /></head><body>" +
                table_str + "</body></html>")
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
def test_gen_find_files(self):
    """Verify recursive pattern matching of gen_find_files."""
    # Matching files in subdirectories should be returned as well.
    txt_matches = set(fu.gen_find_files('regexp*.txt', 'files'))
    self.assertSetEqual(
        txt_matches,
        set(['files/regexp2.txt', 'files/regexp.txt',
             'files/sub/regexp.txt']))

    log_matches = set(fu.gen_find_files('*.log', 'files'))
    self.assertSetEqual(log_matches, set(['files/hertrxcrawl.log']))

    # should match .as and .flr files
    as_matches = list(fu.gen_find_files(swfu.AS_SOURCE_FILE_PATTERN, 'files'))
    if len(as_matches) == 0 or 'TweenPlugin.as' not in as_matches[0]:
        self.fail("Couldn't match .as file in directory")
def generate_index_file(path):
    """Write an index.html under *path* listing suspicious crawled domains.

    Scans *path* for per-domain JSON result files, keeps the domains that
    loaded more fonts than FONT_LOAD_THRESHOLD or where fingerprinting was
    detected, and writes an HTML table sorted by font-load count
    (descending) linking to each per-domain report page.
    """
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i += 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        # Keep only suspicious domains: heavy font loaders or detected FPs.
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads
    # FIX: dict.iteritems() is Python 2 only; items() works on 2 and 3.
    sorted_font_dict = sorted(fonts_dict.items(),
                              key=operator.itemgetter(1), reverse=True)
    for filename, num_font_loaded in sorted_font_dict:
        rank, domain = get_rank_domain_from_filename(filename)
        # Per-domain report pages are written as <logname>.html next to index.
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += ('<tr><td>' + rank + '</td><td><a href="' +
                      output_filename + '">' + domain + '</a></td><td>' +
                      str(num_font_loaded) + '</td></tr>')
    table_str += '</table>'
    # FIX: the original emitted the charset <meta> tag twice; once suffices.
    html_str = ("<html><head><meta http-equiv='Content-Type' "
                "content='text/html; charset=utf-8' /></head><body>" +
                table_str + "</body></html>")
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
def test_gen_cat(self):
    """gen_cat should lazily concatenate the given iterables in order."""
    somelist = [1, 2, 3, 4]
    result = list(fu.gen_cat(([1, 2], [3, 4])))
    self.assertListEqual(result, somelist)
    # concat'ing mixed input: element types are preserved as-is
    result = list(fu.gen_cat((['1', '2'], [3])))
    mixedlist = ['1', '2', 3]
    self.assertListEqual(result, mixedlist)
    # concat'ing files
    as_file_names = fu.gen_find_files('*.as', 'files/fp')
    # FIX: failUnless is a deprecated alias of assertTrue (since Python 2.7).
    self.assertTrue(len(list(as_file_names)) > 1)
def test_gen_cat(self):
    """Check gen_cat over plain lists, mixed types, and found files."""
    self.assertListEqual(list(fu.gen_cat(([1, 2], [3, 4]))), [1, 2, 3, 4])
    # concat'ing mixed input keeps each element's original type
    self.assertListEqual(list(fu.gen_cat((['1', '2'], [3]))), ['1', '2', 3])
    # concat'ing files
    as_file_names = fu.gen_find_files('*.as', 'files/fp')
    # FIX: failUnless is a deprecated alias of assertTrue (since Python 2.7).
    self.assertTrue(len(list(as_file_names)) > 1)
def crawl_site_for_font_probing(url, agent_config, job_config=None):
    """Visit a site for font probing and return results."""
    agent = HeadlessAgent()
    crawl_job = CrawlJob(agent)
    agent.setOptions(agent_config)
    # Fall back to a single-URL default job when no (truthy) config is given.
    default_cfg = {
        'desc': 'Crawl for font probing detection',
        'max_parallel_procs': MAX_PARALLEL_PROCESSES,
        'urls': (url,),
        'num_crawl_urls': 1,
    }
    crawl_job.setOptions(job_config or default_cfg)
    run_crawl(crawl_job)
    # The first JSON dump found in the job directory holds the results.
    for json_file in fu.gen_find_files("*.json", crawl_job.job_dir):
        return lp.load_domainfo_from_json_file(json_file)
    return None  # no json file can be found
def crawl_site_for_font_probing(url, agent_config, job_config=None):
    """Visit a site for font probing and return results."""
    headless = HeadlessAgent()
    job = CrawlJob(headless)
    headless.setOptions(agent_config)
    fallback_cfg = {
        'desc': 'Crawl for font probing detection',
        'max_parallel_procs': MAX_PARALLEL_PROCESSES,
        'urls': (url,),
        'num_crawl_urls': 1,
    }
    # An explicit (truthy) job_config overrides the single-URL default.
    job.setOptions(job_config if job_config else fallback_cfg)
    run_crawl(job)
    for json_file in fu.gen_find_files("*.json", job.job_dir):
        # Return the data from the first result file found.
        return lp.load_domainfo_from_json_file(json_file)
    return None  # no json file can be found
def gen_decompile_swf(swf_path, out_dir=''):
    """Decompile a Flash file with the available decompilers.

    Yields the paths of the ActionScript source files that the
    decompiler exported into *out_dir*.
    """
    if not os.path.isfile(swf_path):
        return
    if not out_dir:
        # Default output: a random '<rand>-src' folder next to the SWF.
        base_dir = os.path.dirname(swf_path)
        out_dir = os.path.join(base_dir, ut.rand_str() + '-src')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    # NOTE(review): the command is built by string concatenation; paths
    # containing double quotes would break or inject into the shell —
    # consider subprocess.run with an argument list.
    cmd_ffdec = ('java -jar ' + FFDEC_PATH + ' -export as "' + out_dir +
                 '" "' + swf_path + '"')
    timeout_prefx = 'timeout -k 5 30 '  # linux specific
    ffdec_status, ffdec_output = cmds.getstatusoutput(timeout_prefx +
                                                      cmd_ffdec)
    write_decomp_log(swf_path, ffdec_status, ffdec_output, '', '')
    for flash_src_file in fu.gen_find_files(AS_SOURCE_FILE_PATTERN, out_dir):
        yield flash_src_file
def gen_find_swf_files(top_dir):
    """Find Flash files under a given directory."""
    # Don't rely on file extensions; probe each candidate's content
    # (Linux `file` command) via is_swf_file instead.
    for candidate in fu.gen_find_files('*', top_dir):
        if is_swf_file(candidate):
            yield candidate
def parse_crawl_logs(path, no_of_procs=16):
    """Parse all crawl logs (*.txt) under *path* using worker processes."""
    log_files = fu.gen_find_files("*.txt", path)
    # Bind the dump function once so each worker only receives a log file.
    worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    parallelize.run_in_parallel(log_files, worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")