Beispiel #1
0
def run_crawl(cr_job):
    """Run the crawl described by ``cr_job`` and close its index HTML log.

    A plain dict holding a fixed subset of the crawl agent's instance
    attributes is bound to ``crawl_worker`` with ``partial``; the resulting
    callable is passed to ``parallelize.run_in_parallel`` together with
    ``cr_job.url_tuples`` and ``cr_job.max_parallel_procs``. Afterwards
    ``lp.close_index_html`` is called on ``cr_job.index_html_log``.
    """
    agent_attrs = cr_job.crawl_agent.__dict__
    url_tuples = cr_job.url_tuples

    # Only the settings the agent will use are copied, rather than the agent
    # object itself: parallelization requires picklable variables.
    # Attributes the agent does not define are silently left out.
    wanted_names = (
        'fc_fontdebug', 'post_visit_func', 'timeout', 'binary_path',
        'use_mitm_proxy', 'mitm_proxy_logs', 'cmd_line_options', 'main_js',
        'casper_client_js', 'screenshot', 'job_dir', 'index_html_log',
        'type', 'crawl_id',
    )
    cfg_dict = {name: agent_attrs[name]
                for name in wanted_names if name in agent_attrs}

    parallelize.run_in_parallel(url_tuples,
                                partial(crawl_worker, cfg_dict),
                                cr_job.max_parallel_procs)

    lp.close_index_html(cr_job.index_html_log)
Beispiel #2
0
def run_crawl(cr_job):
    """Crawl the URLs of ``cr_job`` in parallel, then close the index log.

    The worker passed to ``parallelize.run_in_parallel`` is ``crawl_worker``
    with a configuration dict pre-bound as its first argument. The dict
    holds the listed instance attributes of ``cr_job.crawl_agent`` that are
    actually present on it. Once the parallel run returns,
    ``lp.close_index_html`` is called with ``cr_job.index_html_log``.
    """
    cr_agent = cr_job.crawl_agent
    url_tuples = cr_job.url_tuples

    # Hand the workers a plain dict of settings instead of the agent object:
    # parallelization requires picklable variables, so only the variables
    # the agent will use are copied.
    cfg_dict = {}
    for attr_name in ('fc_fontdebug', 'post_visit_func', 'timeout',
                      'binary_path', 'use_mitm_proxy', 'mitm_proxy_logs',
                      'cmd_line_options', 'main_js', 'casper_client_js',
                      'screenshot', 'job_dir', 'index_html_log', 'type',
                      'crawl_id'):
        # Skip settings this particular agent instance does not carry.
        if attr_name in cr_agent.__dict__:
            cfg_dict[attr_name] = cr_agent.__dict__[attr_name]

    worker = partial(crawl_worker, cfg_dict)
    max_procs = cr_job.max_parallel_procs
    parallelize.run_in_parallel(url_tuples, worker, max_procs)

    lp.close_index_html(cr_job.index_html_log)
Beispiel #3
0
 def test_close_index_html(self):
     """lp.close_index_html must leave table, thead and </html> in the file.

     Writes three bare table rows to the index file, runs
     ``lp.close_index_html`` on it and checks the resulting file content
     for the expected markup fragments.
     """
     index_path = self.abs_test_file_name('files/html/results/index.html')

     # NOTE: the index file is not registered for removal after the test;
     # the original self.new_temp_file(index_path) call is disabled.
     rows_markup = """<tr><td>1</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/1-google-com.html">http://google.com/</a></td><td>10</td><td>1</td></tr>
     <tr><td>118</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/118-google-com-ar.html">http://google.com.ar/</a></td><td>3</td><td>51</td></tr>
     <tr><td>27</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/27-google-co-uk.html">http://google.co.uk/</a></td><td>1</td><td>11</td></tr>"""
     fu.write_to_file(index_path, rows_markup)

     lp.close_index_html(index_path)
     closed_src = fu.read_file(index_path)

     # Checked in order; the first missing fragment fails the test.
     expectations = (
         ('<table', 'No table in index.html'),
         ('<thead', 'No thead in index.html'),
         ('</html>', 'No closing html tag index.html'),
     )
     for fragment, failure_msg in expectations:
         self.assertTrue(fragment in closed_src, failure_msg)
Beispiel #4
0
    def test_close_index_html(self):
        """Check the markup present after closing an index file of bare rows.

        The index file is filled with three ``<tr>`` rows only, passed to
        ``lp.close_index_html`` and read back; the content must then contain
        ``<table``, ``<thead`` and ``</html>``.
        """
        relative_name = 'files/html/results/index.html'
        index_path = self.abs_test_file_name(relative_name)

        # NOTE: cleanup of the index file is not scheduled here; the original
        # self.new_temp_file(index_path) call is disabled.
        fu.write_to_file(index_path, """<tr><td>1</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/1-google-com.html">http://google.com/</a></td><td>10</td><td>1</td></tr>
        <tr><td>118</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/118-google-com-ar.html">http://google.com.ar/</a></td><td>3</td><td>51</td></tr>
        <tr><td>27</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/27-google-co-uk.html">http://google.co.uk/</a></td><td>1</td><td>11</td></tr>""")

        lp.close_index_html(index_path)
        closed_src = fu.read_file(index_path)

        # Each fragment is paired with the message reported when it is
        # missing; the checks run in this order and stop at the first failure.
        for fragment, failure_msg in [
                ('<table', 'No table in index.html'),
                ('<thead', 'No thead in index.html'),
                ('</html>', 'No closing html tag index.html')]:
            found = fragment in closed_src
            self.assertTrue(found, failure_msg)