Example #1
    def test_init_headless_agent(self):
        crawl_agent_cfg = {
            'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
            'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
            'timeout': 20,
            'screenshot': True,
            'post_visit_func': lp.parse_log_dump_results
        }

        ha = ag.HeadlessAgent()
        ha.setOptions(crawl_agent_cfg)

        limit = 3
        cr_job_cfg = {
            'desc': "Visit top %s sites and use fontconfig's "
                    "debugging facilities to collect data." % limit,
            'max_parallel_procs': 20,
            'crawl_agent': ha,
            'urls': wu.gen_url_list(limit)
        }

        # Create the job only after the agent is configured; the original
        # instantiated HeadlessAgent twice and built the job from the
        # unconfigured instance.
        cr_job = ag.CrawlJob(ha)
        cr_job.setOptions(cr_job_cfg)

        ag.run_crawl(cr_job)
        self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
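The post_visit_func entry wires lp.parse_log_dump_results in as a per-visit callback. Its exact signature is defined by the crawler; a hook of this kind could look like the sketch below, where both the parameter and the body are assumptions for illustration, not FPDetective's actual function:

    def post_visit_log_summary(visit_log_path):
        # Hypothetical post-visit hook: assumed to be called once per
        # visited URL with the path of that visit's log dump.
        with open(visit_log_path) as log_file:
            print('%s: %d log lines' % (visit_log_path,
                                        len(log_file.readlines())))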
Example #2
    def setUp(self):
        self.dirs_to_remove = []
        self.db_conn = dbu.mysql_init_db('fp_detective_test')
        self.domainInfo = lp.DomainInfo()  # create a new DomainInfo obj for tests

        # Populate the DomainInfo fixture with representative values.
        self.domainInfo.rank = 1
        self.domainInfo.log_filename = '/var/log/syslog'
        self.domainInfo.url = 'http://google.com'
        self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
        self.domainInfo.fonts_by_origins = {'http://google.com': ['arial', 'Tahoma'],
                                            'http://yahoo.com': ['Georgia']}
        self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
        self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
        self.domainInfo.num_font_loads = 50
        self.domainInfo.num_offsetWidth_calls = 15
        self.domainInfo.num_offsetHeight_calls = 15
        self.domainInfo.fp_detected = [fpr.FINGERPRINTER_REGEX.items()[:2]]
        self.domainInfo.crawl_id = 64654
        self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
        self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia',
                                             'someotherfont', '微软雅黑']
        self.domainInfo.log_complete = 1

        ha = ag.HeadlessAgent()
        self.crawl_job = ag.CrawlJob(ha)
        self.dirs_to_remove.append(self.crawl_job.job_dir)
        self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
        self.crawl_job.desc = 'test crawl'  # assumed value; the original line was a truncated bare attribute access
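This setUp collects directories in self.dirs_to_remove and opens a test database connection, which implies a matching cleanup method. A minimal sketch of one, assuming the connection object exposes close() and using shutil.rmtree for the job directories (this is an assumed counterpart, not the project's actual tearDown):

    import shutil

    def tearDown(self):
        # Assumed cleanup pairing with setUp: close the test DB connection
        # and delete any job directories the tests created.
        self.db_conn.close()
        for dir_path in self.dirs_to_remove:
            if os.path.isdir(dir_path):
                shutil.rmtree(dir_path)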
Example #3
    def should_crawl_and_log(self,
                             agent_cfg,
                             urls,
                             expected_strs,
                             unexpected_strs=None):
        # TODO: add support for normal browsers
        if unexpected_strs is None:  # avoid a mutable default argument
            unexpected_strs = []

        if 'type' in agent_cfg and 'chrome' in agent_cfg['type']:
            br = ag.ChromeAgent()
        else:
            br = ag.HeadlessAgent()

        if 'timeout' not in agent_cfg:
            agent_cfg['timeout'] = DEFAULT_TEST_CRAWL_TIMEOUT

        br.setOptions(agent_cfg)
        cr_job = ag.CrawlJob(br)
        cr_job.urls = [urls] if isinstance(urls, basestring) else urls
        # Enumerate the normalized list, not the raw argument: the original
        # zipped over `urls`, which iterates characters when a bare string
        # is passed in.
        cr_job.url_tuples = zip(xrange(1, len(cr_job.urls) + 1), cr_job.urls)

        ag.run_crawl(cr_job)

        self.assertTrue(os.path.isdir(cr_job.job_dir),
                        'No job folder created!')
        for idx, url in enumerate(cr_job.urls):
            outfile = os.path.join(
                cr_job.job_dir,
                fu.get_out_filename_from_url(url, str(idx + 1)))
            self.assertTrue(os.path.isfile(outfile),
                            'Cannot find log file %s' % outfile)
            self.assert_all_patterns_in_file(outfile, expected_strs)
            self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
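A hypothetical call showing the shapes this helper expects; the config values, URL, and patterns below are illustrative only, not the project's fixtures:

    agent_cfg = {'type': 'chrome_lazy', 'timeout': 30}  # made-up config values
    self.should_crawl_and_log(agent_cfg,
                              'http://example.com',  # a bare string is wrapped into a list
                              expected_strs=['loaded'],
                              unexpected_strs=['Traceback'])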
Example #4
    def test_modphantomjs_should_log_access_to_navigator_props(self):
        ph = ag.HeadlessAgent().setOptions(
            ag.AGENT_CFG_PHANTOM_MOD_HOME_PAGE).setOptions(
                {'timeout': TEST_TIMEOUT})
        self.should_log_access_to_navigator_props(ph.__dict__)
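The chained calls above only work if setOptions returns the agent itself, and passing ph.__dict__ along suggests options are stored as instance attributes. A minimal sketch of such a fluent setter, assuming exactly that (an illustration of the pattern, not FPDetective's actual implementation):

    class HeadlessAgent(object):
        def setOptions(self, options):
            # Copy each option onto the agent and return self so calls can
            # be chained; the attributes then show up in ph.__dict__.
            for key, value in options.items():
                setattr(self, key, value)
            return self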