def crawl(crawl_info):
    """Run a crawl over the given URLs and package the results."""
    if isfile(crawl_info.urls):
        # a file of ranked URLs was passed; take the requested rank range
        url_tuples = wu.gen_url_list(crawl_info.max_rank, crawl_info.min_rank,
                                     True, crawl_info.urls)
    else:
        url_tuples = [(0, crawl_info.urls), ]  # a single URL has been passed

    machine_id = read_machine_id()
    suffix = "_%s_FL%s_CO%s_%s_%s" % (machine_id,
                                      crawl_info.flash_support,
                                      crawl_info.cookie_support,
                                      crawl_info.min_rank,
                                      crawl_info.max_rank)
    out_dir, crawl_name = create_job_folder(suffix)
    # copy_mitm_certs()
    db_file = join(out_dir, cm.DB_FILENAME)
    report_file = join(out_dir, "%s.html" % crawl_name)
    print "Crawl name:", crawl_name
    dbu.create_db_from_schema(db_file)

    # modified get function with specific browser settings bound in
    custom_get = partial(run_cmd, out_dir=out_dir,
                         flash_support=crawl_info.flash_support,
                         cookie_support=crawl_info.cookie_support)
    parallelize.run_in_parallel(url_tuples, custom_get,
                                crawl_info.max_parallel_procs)

    gr.gen_crawl_report(db_file, report_file)
    # clean_tmp_files(out_dir)
    zipped = pack_data(out_dir)
    if crawl_info.upload_data:
        ssh.scp_put_to_server(zipped)
        ssh.scp_put_to_server(report_file)
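crawl() only reads a handful of attributes from crawl_info, so a lightweight container is enough to drive it. Below is a minimal invocation sketch, assuming a CrawlInfo namedtuple with exactly the fields used above; the actual project may build this object differently (e.g. from command-line options), so treat the names here as illustrative.

    from collections import namedtuple

    # Hypothetical container for the crawl options; fields mirror what crawl() reads.
    CrawlInfo = namedtuple("CrawlInfo", ["urls", "min_rank", "max_rank",
                                         "flash_support", "cookie_support",
                                         "max_parallel_procs", "upload_data"])

    if __name__ == "__main__":
        info = CrawlInfo(urls="alexa_top_1m.csv",  # ranked URL file, or a single URL string
                         min_rank=1, max_rank=100,  # crawl ranks 1..100
                         flash_support=True,
                         cookie_support=True,
                         max_parallel_procs=8,      # worker processes for run_in_parallel
                         upload_data=False)         # skip the scp upload step
        crawl(info)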
    def test_get_top_alexa_list_start_stop(self):
        # rank bounds are inclusive on both ends, so 50..100 yields 51 URLs
        top_50_100 = list(wu.gen_url_list(100, 50))
        self.assertEqual(len(top_50_100), 51)

        top_5_10 = list(wu.gen_url_list(10, 5))
        self.assertEqual(len(top_5_10), 6)
    def test_gen_url_list(self):
        self.assert_is_file(cm.ALEXA_TOP_1M)  # the Alexa top-1M list must be present
        self.assertEqual(list(wu.gen_url_list(0)), [])
        self.assertEqual(len(list(wu.gen_url_list(10))), 10,
                         "Unexpected number of URLs")
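For orientation, here is a minimal sketch of what an implementation consistent with these tests could look like: an inclusive rank range over the Alexa top-1M CSV, yielded as (rank, url) tuples. This is an illustration only; the real wu.gen_url_list also accepts further arguments (a flag and an alternative URL file, as used in crawl() above) that are not modeled here, and the name gen_url_list_sketch is made up for this example.

    import csv

    def gen_url_list_sketch(max_rank, min_rank=1, csv_file=cm.ALEXA_TOP_1M):
        """Yield (rank, url) tuples for ranks min_rank..max_rank (inclusive)."""
        if max_rank < min_rank:
            return  # e.g. gen_url_list(0) with the default min_rank of 1
        with open(csv_file) as f:
            for row in csv.reader(f):      # each row: rank, domain
                rank, url = int(row[0]), row[1]
                if rank < min_rank:
                    continue
                if rank > max_rank:
                    break                  # the file is ordered by rank
                yield rank, url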