def test_init_headless_agent(self):
    """Crawl a small list of top sites with a configured HeadlessAgent
    and register the resulting job directory for cleanup in tearDown.
    """
    crawl_agent_cfg = {
        'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
        'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
        'timeout': 20,
        'screenshot': True,
        'post_visit_func': lp.parse_log_dump_results,
    }
    # BUGFIX: the original created two HeadlessAgent instances and passed
    # the first, unconfigured one to CrawlJob before discarding it.
    # Configure a single agent up front, then build the job from it.
    ha = ag.HeadlessAgent()
    ha.setOptions(crawl_agent_cfg)
    cr_job = ag.CrawlJob(ha)
    limit = 3
    cr_job_cfg = {
        'desc': 'Visit top %s sites and use fontconfig\'s debugging facilities to collect data.' % limit,
        'max_parallel_procs': 20,
        'crawl_agent': ha,
        'urls': wu.gen_url_list(limit),
    }
    cr_job.setOptions(cr_job_cfg)
    ag.run_crawl(cr_job)
    # Remember the job dir so tearDown can remove it.
    self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
def setUp(self):
    """Build a fully-populated DomainInfo fixture, a test DB connection
    and a CrawlJob for the test methods to exercise.
    """
    self.dirs_to_remove = []
    self.db_conn = dbu.mysql_init_db('fp_detective_test')
    self.domainInfo = lp.DomainInfo()  # create a new DomainInfo obj for tests
    self.domainInfo.rank = 1
    self.domainInfo.log_filename = '/var/log/syslog'
    self.domainInfo.url = 'http://google.com'
    self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
    self.domainInfo.fonts_by_origins = {'http://google.com': ['arial', 'Tahoma'],
                                        'http://yahoo.com': ['Georgia']}
    self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
    self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
    self.domainInfo.num_font_loads = 50
    self.domainInfo.num_offsetWidth_calls = 15
    self.domainInfo.num_offsetHeight_calls = 15
    # NOTE(review): this wraps the two-item slice in an outer list,
    # producing a list containing one list of (name, regex) pairs —
    # presumably intentional fixture shape; confirm against consumers.
    self.domainInfo.fp_detected = [fpr.FINGERPRINTER_REGEX.items()[:2]]
    self.domainInfo.crawl_id = 64654
    self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
    self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia',
                                         'someotherfont', '微软雅黑']
    self.domainInfo.log_complete = 1
    ha = ag.HeadlessAgent()
    self.crawl_job = ag.CrawlJob(ha)
    self.dirs_to_remove.append(self.crawl_job.job_dir)
    self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
    # BUGFIX: removed trailing `self.crawl_job.desc` — a bare attribute
    # access with no assignment, i.e. a statement with no effect.
def should_crawl_and_log(self, agent_cfg, urls, expected_strs, unexpected_strs=None):
    """Run a crawl with the given agent config over `urls` (a list, or a
    single URL string) and assert that each URL produced a log file
    containing every pattern in `expected_strs` and none of the patterns
    in `unexpected_strs`.
    """
    # TODO: add support for normal browsers
    # BUGFIX: mutable default argument ([]) replaced with None sentinel.
    if unexpected_strs is None:
        unexpected_strs = []
    # `has_key` is deprecated; use the `in` operator instead.
    if "type" in agent_cfg and 'chrome' in agent_cfg['type']:
        br = ag.ChromeAgent()
    else:
        br = ag.HeadlessAgent()
    if "timeout" not in agent_cfg:
        agent_cfg["timeout"] = DEFAULT_TEST_CRAWL_TIMEOUT
    br.setOptions(agent_cfg)
    cr_job = ag.CrawlJob(br)
    # Normalize a bare URL string to a one-element list.
    url_list = [urls] if isinstance(urls, basestring) else urls
    cr_job.urls = url_list
    # BUGFIX: enumerate the normalized url_list, not the raw `urls`
    # argument — a single URL string would previously have been zipped
    # character-by-character into url_tuples.
    cr_job.url_tuples = zip(xrange(1, len(url_list) + 1), url_list)
    ag.run_crawl(cr_job)
    self.assertTrue(os.path.isdir(cr_job.job_dir), 'No job folder created!')
    for idx, url in enumerate(cr_job.urls):
        outfile = os.path.join(
            cr_job.job_dir,
            fu.get_out_filename_from_url(url, str(idx + 1)))
        self.assertTrue(os.path.isfile(outfile),
                        'Cannot find log file %s' % outfile)
        self.assert_all_patterns_in_file(outfile, expected_strs)
        self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
def test_modphantomjs_should_log_access_to_navigator_props(self):
    """Configure a modified-PhantomJS home-page agent and check that
    navigator property accesses are logged.
    """
    agent = ag.HeadlessAgent()
    # Apply the base config, then override the timeout for tests.
    agent.setOptions(ag.AGENT_CFG_PHANTOM_MOD_HOME_PAGE)
    agent.setOptions({'timeout': TEST_TIMEOUT})
    self.should_log_access_to_navigator_props(agent.__dict__)