Example no. 1
# Standard-library imports this snippet needs; the project helpers referenced
# below (wu, cm, dbu, gr, ssh, parallelize, run_cmd, read_machine_id,
# create_job_folder, pack_data) are assumed to come from the crawler package.
from functools import partial
from os.path import isfile, join


def crawl(crawl_info):
    # build the (rank, url) work list: from a file of URLs if one was given,
    # otherwise from the single URL that was passed
    if isfile(crawl_info.urls):
        url_tuples = wu.gen_url_list(crawl_info.max_rank,
                                     crawl_info.min_rank, True,
                                     crawl_info.urls)
    else:
        url_tuples = [(0, crawl_info.urls), ]  # a single url has been passed

    machine_id = read_machine_id()
    suffix = "_%s_FL%s_CO%s_%s_%s" % (machine_id, crawl_info.flash_support,
                                      crawl_info.cookie_support,
                                      crawl_info.min_rank,
                                      crawl_info.max_rank)
    out_dir, crawl_name = create_job_folder(suffix)
    # copy_mitm_certs()
    db_file = join(out_dir, cm.DB_FILENAME)

    report_file = join(out_dir, "%s.html" % crawl_name)
    print("Crawl name: %s" % crawl_name)
    dbu.create_db_from_schema(db_file)
    # "get" command customized with this crawl's output dir and browser
    # settings (Flash and cookie support)
    custom_get = partial(run_cmd, out_dir=out_dir,
                         flash_support=crawl_info.flash_support,
                         cookie_support=crawl_info.cookie_support)
    parallelize.run_in_parallel(url_tuples, custom_get,
                                crawl_info.max_parallel_procs)
    gr.gen_crawl_report(db_file, report_file)
    # clean_tmp_files(out_dir)
    zipped = pack_data(out_dir)
    if crawl_info.upload_data:
        ssh.scp_put_to_server(zipped)
        ssh.scp_put_to_server(report_file)
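
For reference, a minimal driver sketch: crawl() only needs an object carrying the attributes it reads (urls, min_rank, max_rank, flash_support, cookie_support, max_parallel_procs, upload_data). The file name and values below are illustrative; the real tool builds crawl_info from its command-line options.

# Illustrative only: a plain namespace stands in for the tool's CLI options.
from argparse import Namespace

crawl_info = Namespace(urls="top_urls.csv",        # URL file, or a single URL
                       min_rank=1, max_rank=100,   # Alexa rank range to visit
                       flash_support=1, cookie_support=1,
                       max_parallel_procs=8,
                       upload_data=False)          # skip the scp upload here
crawl(crawl_info)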

    # Unit tests for wu.gen_url_list; these are methods of the project's
    # unittest.TestCase-based test class (assert_is_file is a custom
    # assertion helper defined there).
    def test_get_top_alexa_list_start_stop(self):
        top_50_100 = list(wu.gen_url_list(100, 50))
        self.assertEqual(len(top_50_100), 51)

        top_5_10 = list(wu.gen_url_list(10, 5))
        self.assertEqual(len(top_5_10), 6)

    def test_gen_url_list(self):
        self.assert_is_file(cm.ALEXA_TOP_1M)
        self.assertEqual(list(wu.gen_url_list(0)), [])
        self.assertEqual(len(list(wu.gen_url_list(10))), 10,
                         "Unexpected no of URLs")