def test_init_headless_agent(self):
    # Configure a headless (PhantomJS/CasperJS) agent and crawl a small
    # list of sites, using fontconfig's debug output to collect font data.
    crawl_agent_cfg = {
        'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
        'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
        'timeout': 20,
        'screenshot': True,
        'post_visit_func': lp.parse_log_dump_results,
    }
    ha = ag.HeadlessAgent()  # instantiate once (the original created and discarded a second agent)
    ha.setOptions(crawl_agent_cfg)

    limit = 3
    cr_job_cfg = {
        'desc': "Visit top %s sites and use fontconfig's debugging"
                " facilities to collect data." % limit,
        'max_parallel_procs': 20,
        'crawl_agent': ha,
        'urls': wu.gen_url_list(limit),
    }
    cr_job = ag.CrawlJob(ha)
    cr_job.setOptions(cr_job_cfg)

    ag.run_crawl(cr_job)
    # Register the job output directory so it gets cleaned up after the test.
    self.dirs_to_remove.append(os.path.realpath(cr_job.job_dir))
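# The test above appends cr_job.job_dir to self.dirs_to_remove, which implies
# a setUp/tearDown pair along these lines. This is a minimal sketch of the
# assumed cleanup logic, not code from the original file.
def setUp(self):
    self.dirs_to_remove = []

def tearDown(self):
    import shutil  # in the real module this import would live at the top of the file
    for dir_path in self.dirs_to_remove:
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)  # delete crawl output created by the test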
def should_crawl_and_log(self, agent_cfg, urls, expected_strs,
                         unexpected_strs=None):
    """Crawl the given URLs and assert on the contents of the log files."""
    # TODO: add support for normal (non-headless) browsers
    unexpected_strs = unexpected_strs or []  # avoid a mutable default argument
    if 'chrome' in agent_cfg.get('type', ''):
        br = ag.ChromeAgent()
    else:
        br = ag.HeadlessAgent()
    agent_cfg.setdefault('timeout', DEFAULT_TEST_CRAWL_TIMEOUT)
    br.setOptions(agent_cfg)

    cr_job = ag.CrawlJob(br)
    # Accept either a single URL string or a list of URLs.
    cr_job.urls = [urls] if isinstance(urls, basestring) else urls
    # Build (index, url) tuples from the normalized list; the original zipped
    # the raw argument, which produced per-character tuples for a bare string.
    cr_job.url_tuples = zip(xrange(1, len(cr_job.urls) + 1), cr_job.urls)
    ag.run_crawl(cr_job)

    self.assertTrue(os.path.isdir(cr_job.job_dir), 'No job folder created!')
    for idx, url in enumerate(cr_job.urls):
        outfile = os.path.join(cr_job.job_dir,
                               fu.get_out_filename_from_url(url, str(idx + 1)))
        self.assertTrue(os.path.isfile(outfile),
                        'Cannot find log file %s' % outfile)
        self.assert_all_patterns_in_file(outfile, expected_strs)
        self.assert_all_patterns_not_in_file(outfile, unexpected_strs)
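# Illustrative call of the helper above -- a sketch, not a test from the
# original suite. The URL and the expected/unexpected strings are hypothetical
# placeholders; a real test would assert on strings the crawl actually logs.
def test_crawl_logs_example(self):
    agent_cfg = {'main_js': cm.CASPER_JS_LAZY_HOMEPAGER,
                 'cmd_line_options': ag.PHANTOM_COMMON_OPTIONS,
                 'post_visit_func': lp.parse_log_dump_results}
    self.should_crawl_and_log(agent_cfg,
                              'http://example.com',  # exercises the single-string path
                              expected_strs=['example.com'],
                              unexpected_strs=['Traceback'])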