from functools import partial


def run_crawl(cr_job):
    cr_agent = cr_job.crawl_agent
    url_tuples = cr_job.url_tuples
    # Only copy the variables that'll be used by the agent:
    # parallelization requires picklable variables.
    cfg_dict = {i: cr_agent.__dict__[i] for i in
                ['fc_fontdebug', 'post_visit_func', 'timeout', 'binary_path',
                 'use_mitm_proxy', 'mitm_proxy_logs', 'cmd_line_options',
                 'main_js', 'casper_client_js', 'screenshot', 'job_dir',
                 'index_html_log', 'type', 'crawl_id']
                if i in cr_agent.__dict__}
    worker = partial(crawl_worker, cfg_dict)
    parallelize.run_in_parallel(url_tuples, worker, cr_job.max_parallel_procs)
    lp.close_index_html(cr_job.index_html_log)
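
# A minimal sketch of what parallelize.run_in_parallel might look like,
# assuming it fans the worker out over the URL tuples with a standard
# multiprocessing pool; the real parallelize module is not shown here,
# so the body below is illustrative, not the project's actual
# implementation. It also shows why cfg_dict must be picklable: every
# argument crosses the process boundary through pickle (and on Python 3,
# a functools.partial of a module-level function such as crawl_worker is
# itself picklable).
from multiprocessing import Pool


def run_in_parallel(items, worker, max_procs):
    pool = Pool(processes=max_procs)
    try:
        pool.map(worker, items)  # blocks until every item is processed
    finally:
        pool.close()
        pool.join()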
def test_close_index_html(self):
    index_filename = 'files/html/results/index.html'
    index_filename = self.abs_test_file_name(index_filename)
    # self.new_temp_file(index_filename)  # to remove it after the test finishes
    table_rows = """<tr><td>1</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/1-google-com.html">http://google.com/</a></td><td>10</td><td>1</td></tr>
    <tr><td>118</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/118-google-com-ar.html">http://google.com.ar/</a></td><td>3</td><td>51</td></tr>
    <tr><td>27</td><td><a href="/home/user/fpbase/run/jobs/20130420-010404/27-google-co-uk.html">http://google.co.uk/</a></td><td>1</td><td>11</td></tr>"""
    fu.write_to_file(index_filename, table_rows)
    lp.close_index_html(index_filename)
    index_src = fu.read_file(index_filename)
    self.assertIn('<table', index_src, 'No table in index.html')
    self.assertIn('<thead', index_src, 'No thead in index.html')
    self.assertIn('</html>', index_src, 'No closing html tag in index.html')
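
# A minimal sketch of the close_index_html behavior the test above
# exercises, assuming it wraps the accumulated <tr> rows in the table
# markup and closing tags that the assertions look for; the real lp
# module is not shown here, and the column labels below are hypothetical.
def close_index_html(index_filename):
    rows = fu.read_file(index_filename)
    header = ('<html><body><table>'
              '<thead><tr><th>Rank</th><th>URL</th>'
              '<th>Fonts</th><th>Requests</th></tr></thead><tbody>')
    footer = '</tbody></table></body></html>'
    fu.write_to_file(index_filename, header + rows + footer)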