def test_init_headless_agent(self):
    # Configure a headless (PhantomJS/CasperJS) agent and crawl a small
    # list of sites, using fontconfig's debug output to collect font data.
    crawl_agent_cfg = {
        'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
        'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
        'timeout': 20,
        'screenshot': True,
        'post_visit_func': lp.parse_log_dump_results,
    }
    ha = ag.HeadlessAgent()  # instantiate once (the original created and discarded a second agent)
    ha.setOptions(crawl_agent_cfg)

    limit = 3
    cr_job_cfg = {
        'desc': "Visit top %s sites and use fontconfig's debugging"
                " facilities to collect data." % limit,
        'max_parallel_procs': 20,
        'crawl_agent': ha,
        'urls': wu.gen_url_list(limit),
    }
    cr_job = ag.CrawlJob(ha)
    cr_job.setOptions(cr_job_cfg)

    ag.run_crawl(cr_job)
    # Register the job output directory so it gets cleaned up after the test.
    self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
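# The test above appends cr_job.job_dir to self.dirs_to_remove, which implies
# a setUp/tearDown pair along these lines. This is a minimal sketch of the
# assumed cleanup logic, not code from the original file.
def setUp(self):
    self.dirs_to_remove = []

def tearDown(self):
    import shutil  # in the real module this import would live at the top of the file
    for dir_path in self.dirs_to_remove:
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)  # delete crawl output created by the test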
def should_crawl_and_log(self, agent_cfg, urls, expected_strs,
                         unexpected_strs=None):
    """Crawl the given URLs and assert on the contents of the log files."""
    # TODO: add support for normal (non-headless) browsers
    unexpected_strs = unexpected_strs or []  # avoid a mutable default argument
    if 'chrome' in agent_cfg.get('type', ''):
        br = ag.ChromeAgent()
    else:
        br = ag.HeadlessAgent()
    agent_cfg.setdefault('timeout', DEFAULT_TEST_CRAWL_TIMEOUT)
    br.setOptions(agent_cfg)

    cr_job = ag.CrawlJob(br)
    # Accept either a single URL string or a list of URLs.
    cr_job.urls = [urls] if isinstance(urls, basestring) else urls
    # Build (index, url) tuples from the normalized list; the original zipped
    # the raw argument, which produced per-character tuples for a bare string.
    cr_job.url_tuples = zip(xrange(1, len(cr_job.urls) + 1), cr_job.urls)
    ag.run_crawl(cr_job)

    self.assertTrue(os.path.isdir(cr_job.job_dir), 'No job folder created!')
    for idx, url in enumerate(cr_job.urls):
        outfile = os.path.join(cr_job.job_dir,
                               fu.get_out_filename_from_url(url, str(idx + 1)))
        self.assertTrue(os.path.isfile(outfile),
                        'Cannot find log file %s' % outfile)
        self.assert_all_patterns_in_file(outfile, expected_strs)
        self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
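# Illustrative call of the helper above -- a sketch, not a test from the
# original suite. The URL and the expected/unexpected strings are hypothetical
# placeholders; a real test would assert on strings the crawl actually logs.
def test_crawl_logs_example(self):
    agent_cfg = {'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
                 'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
                 'post_visit_func': lp.parse_log_dump_results}
    self.should_crawl_and_log(agent_cfg,
                              'http://example.com',  # exercises the single-string path
                              expected_strs=['example.com'],
                              unexpected_strs=['Traceback'])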