def test_add_symlink(self):
    test_link = self.new_temp_file('test_link')  # symbolic link
    src_file = self.new_temp_file('linktest.txt')
    fu.write_to_file(src_file, "link test")
    fu.add_symlink(test_link, src_file)
    self.assertTrue(os.path.lexists(test_link))
def generate_index_file(path):
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i += 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads

    # sort domains by the number of fonts they loaded, most fonts first
    sorted_font_dict = sorted(fonts_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    for filename, num_font_loaded in sorted_font_dict:
        # if num_font_loaded > FONT_LOAD_THRESHOLD:
        rank, domain = get_rank_domain_from_filename(filename)
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += '<tr><td>' + rank + '</td><td><a href="' + output_filename + '">' + domain + \
            '</a></td><td>' + str(num_font_loaded) + '</td></tr>'
    table_str += '</table>'

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /></head><body>" + \
        table_str + "</body></html>"
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
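# Usage sketch (hypothetical helper, not part of the original module): rebuild the
# ranked index for a finished crawl job. `job_dir` is assumed to be a folder that
# holds the per-domain *.json files written by the crawler; the default below is
# only an example path.
def _example_rebuild_index(job_dir='/home/user/fpbase/run/jobs/20130420-010404'):
    generate_index_file(job_dir)  # writes <job_dir>/index.html with domains ranked by font loads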
def close_index_html(index_file):
    # wl_log.info('Will close %s' % index_file)
    # TODO: add a check so we don't close a file twice
    if not os.path.isfile(index_file):
        fu.write_to_file(index_file, '')  # create an empty file
    index_src = fu.read_file(index_file)
    if index_src.startswith('<html'):
        wl_log.info('Index file %s already closed' % index_file)
        return

    scripts_src = """<script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery-1.9.1.min.js"></script>
    <style type="text/css" title="currentStyle">
        @import "../../js/css/demo_page.css";
        @import "../../js/css/demo_table.css";
    </style>
    <script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery.dataTables.min.js"></script>
    <script type="text/javascript" charset="utf-8">
        $(document).ready(function() {
            $('#results').dataTable({"aaSorting": [[2, "desc"]]});
        });
    </script>"""

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" + scripts_src + \
        "</head>\n<body><table id='results'>\n<thead><tr><th>Rank</th><th>Domain</th><th>Fonts</th><th>OffsetWidth</th><th>OffsetHeight</th><th>FP found</th></tr></thead>" + \
        index_src + '</table></body></html>'
    fu.write_to_file(index_file, html_str)
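# Usage sketch (hypothetical, mirroring test_close_index_html below): bare
# <tr>...</tr> rows are assumed to be written to the index file earlier in the
# pipeline; close_index_html() then wraps them into a sortable DataTables page.
def _example_finalize_index(index_file='/tmp/index.html'):
    row = '<tr><td>1</td><td><a href="1-example-com.html">http://example.com/</a></td><td>10</td><td>1</td></tr>'
    fu.write_to_file(index_file, row)
    close_index_html(index_file)  # idempotent: returns early if the file already starts with <html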
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide, but the decompiler will discard non-SWF content
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referrer: %s" % (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("An SWF with the same hash exists in the DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 for SWFs we have already seen
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-SWF %s %s" % (msg.request.path, msg.response.content[:100]))
        else:
            pass
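# Usage sketch: store_swfs() only relies on msg.request.get_url(), msg.request.path,
# msg.request.headers['Referer'] and msg.response.content, so a minimal stand-in for
# the proxy message can illustrate the expected shape. Everything below is
# hypothetical; 'CWS' is one of the common SWF magic numbers, but the actual
# SWF_MAGIC_NUMBERS constant is defined elsewhere in the module.
class _StubRequest(object):
    path = '/static/fp.swf'
    headers = {'Referer': ['http://example.com/']}

    def get_url(self):
        return 'http://example.com/static/fp.swf'


class _StubResponse(object):
    content = 'CWS' + '\x00' * 64


class _StubMessage(object):
    request = _StubRequest()
    response = _StubResponse()

# store_swfs(_StubMessage(), crawl_id=1, dir_path='/tmp', prefix='/tmp/1-example.com')
# (left commented out: the real call needs the MySQL database and the SWF decompiler)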
def test_job_folder_should_be_writable(self):
    out_dir = self.create_job_folder()
    self.assertTrue(os.path.isdir(out_dir), 'Cannot create job folder')
    out_file = os.path.join(out_dir, 'some.log')
    file_content = '123456789'
    fu.write_to_file(out_file, file_content)
    self.assertTrue(os.path.isfile(out_file), 'Cannot create file in job folder')
    self.assertEqual(file_content, fu.read_file(out_file), 'File content does not match what was written')
def test_hash_file(self):
    filename = self.new_temp_file('hash_test.txt')
    random_str = ut.rand_str(1000)
    fu.write_to_file(filename, random_str)
    self.assertEqual(fu.hash_file(filename, 'sha1'), ut.hash_text(random_str, 'sha1'),
                     "SHA1 hashes don't match")
    self.assertEqual(fu.hash_file(filename), ut.hash_text(random_str),
                     "Hashes with the default algorithm don't match")
def generate_results_page(domaInfo):
    """Generate the results page for the given domain information."""
    back_link = '<div><a href="index.html">Index</a></div>'
    rank_str = "<h2>%s - %s - <a href='%s' target='_blank'>%s</a></h2>" % (
        str(domaInfo.rank), domaInfo.url, domaInfo.url, EXT_LINK_IMG)
    fonts_list = ' • '.join('<span style="font-family:%s">%s</span>' % (font_name, font_name)
                            for font_name in domaInfo.fonts_loaded)
    font_div = "<div class='fonts'><p><b>%s</b> fonts loaded, <b>%s</b> num_offsetWidth_calls, <b>%s</b> num_offsetHeight_calls</p>\n<div class='font_list'>%s</div></div>" % (
        len(domaInfo.fonts_loaded), domaInfo.num_offsetWidth_calls,
        domaInfo.num_offsetHeight_calls, fonts_list)

    unique_urls = set(domaInfo.responses + domaInfo.requests)
    # filter out data: URLs, keep only http(s) URLs
    unique_http_urls = [url for url in unique_urls if re.match(r"https?:\/\/[^.]+\.[^.]", url)]
    unique_domains = set(pub_suffix.get_public_suffix(url)
                         for url in unique_http_urls if url.startswith('http'))
    unique_domains = [mark_if_fp(address) for address in sorted(unique_domains)]
    domain_list = "<ul class='domains'>\n<li>%s</li></ul>" % ("</li>\n<li>".join(unique_domains))
    unique_urls = ["<a href='%s' target='_blank'>%s</a> - %s" %
                   (address, EXT_LINK_IMG, mark_if_fp(address))
                   for address in sorted(unique_http_urls)]
    url_list = "<ul class='urls'>\n<li>%s</li></ul>" % ("</li>\n<li>".join(unique_urls))
    domains_div = "<div class='domains'><p> Number of different domains loaded: <b>" \
        + str(len(unique_domains)) + "</b></p><div class='domains_list'> " + domain_list + "</div></div>"
    urls_div = "<div class='urls'><p> Number of different URLs loaded: <b>" \
        + str(len(unique_urls)) + "</b></p>\n<div class='urls_list'> " + url_list + "</div></div>"

    font_orig_str = "<p>Fonts per origin</p><ul>"
    for orig, fonts in domaInfo.fonts_by_origins.iteritems():
        font_orig_str += "<li>%s: %s %s</li>" % (json_field_name_to_origin(orig), len(fonts), fonts)
    font_orig_str += "</ul>"

    html_str = "<html>\n<head>\n<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\n<style>span.red{color:red; font-weight:bold;}</style>\n</head>\n<body>" \
        + back_link + rank_str + font_div + font_orig_str + domains_div + urls_div \
        + "\n</body>\n</html>"
    output_filename = domaInfo.log_filename[:-4] + ".html"
    fu.write_to_file(output_filename, html_str)
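# Usage sketch (hypothetical stand-in object): generate_results_page() reads the
# attributes below from the domain-info object returned by
# load_domainfo_from_json_file(); the output .html is written next to log_filename.
class _StubDomainInfo(object):
    rank = 1
    url = 'http://example.com/'
    fonts_loaded = ['Arial', 'Verdana']
    num_offsetWidth_calls = 10
    num_offsetHeight_calls = 12
    responses = ['http://example.com/font.woff']
    requests = ['http://example.com/']
    fonts_by_origins = {}
    log_filename = '/tmp/1-example-com.txt'

# generate_results_page(_StubDomainInfo())  # would write /tmp/1-example-com.html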
def test_close_index_html(self):
    index_filename = 'files/html/results/index.html'
    index_filename = self.abs_test_file_name(index_filename)
    # self.new_temp_file(index_filename)  # to remove it after the test finishes
    table_rows = """<tr><td>1</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/1-google-com.html">http://google.com/</a></td><td>10</td><td>1</td></tr>
<tr><td>118</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/118-google-com-ar.html">http://google.com.ar/</a></td><td>3</td><td>51</td></tr>
<tr><td>27</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/27-google-co-uk.html">http://google.co.uk/</a></td><td>1</td><td>11</td></tr>"""
    fu.write_to_file(index_filename, table_rows)
    lp.close_index_html(index_filename)
    index_src = fu.read_file(index_filename)
    self.assertTrue('<table' in index_src, 'No table in index.html')
    self.assertTrue('<thead' in index_src, 'No thead in index.html')
    self.assertTrue('</html>' in index_src, 'No closing html tag in index.html')
def test_write_to_file(self):
    filename = self.new_temp_file('write_test.txt')
    random_str = ut.rand_str(100)
    fu.write_to_file(filename, random_str)
    self.assertEqual(random_str, fu.read_file(filename))