def testHandleURLWorks(self):
    mock_download_queue = mock.Mock(Queue.Queue)
    crawler_thread = CrawlerThread(None, mock_download_queue, None)
    crawler_thread.HandleHtmlResource = mock.Mock()
    with testfixtures.Replacer() as r:
        # HTML resource.
        html_resource = CreateFakeURLResource('text/html')
        r.replace('urllib2.urlopen', mock.Mock(return_value=html_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        crawler_thread.HandleHtmlResource.assert_called_with(html_resource)
        # Zip resource.
        zip_resource = CreateFakeURLResource('application/zip')
        r.replace('urllib2.urlopen', mock.Mock(return_value=zip_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        mock_download_queue.put.assert_called_with(zip_resource)
        # Plain text resource.
        text_resource = CreateFakeURLResource('text/plain')
        r.replace('urllib2.urlopen', mock.Mock(return_value=text_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        mock_download_queue.put.assert_called_with(text_resource)
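
# A minimal sketch of the CreateFakeURLResource helper the test above relies
# on; the real helper is not shown here and may differ. It assumes HandleURL
# inspects the Content-Type of the object returned by urllib2.urlopen, so the
# fake only needs to expose that header through info().
import mock


def CreateFakeURLResource(content_type):
    resource = mock.Mock()
    headers = mock.Mock()
    headers.gettype.return_value = content_type  # mimetools.Message-style accessor
    headers.get.return_value = content_type      # dict-style lookup, in case HandleURL uses it
    resource.info.return_value = headers
    return resource
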
def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    probe_cmd = self.get_phantomjs_cmd()
    if not probe_cmd:
        print "Error: unable to find phantomjs executable"
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    display_progress = True
    check_starturl = True
    http_auth = None
    get_robots_txt = True
    save_html = False

    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:O:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            if v == "tor":
                v = "socks5:127.0.0.1:9150"
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print "only http and socks5 proxies are supported"
                sys.exit(1)
            Shared.options['proxy'] = {"proto": proxy[0], "host": proxy[1], "port": proxy[2]}
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com to ((.*\.)|)domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            check_starturl = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-F":
            Shared.options['crawl_forms'] = False

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = self.parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # don't fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # don't trigger events

    if Shared.options['proxy']:
        probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
        probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    probe_cmd.append(self.base_dir + 'probe/analyze.js')

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    stdoutw("Initializing . ")

    start_requests = self.init_crawl(start_req, check_starturl, get_robots_txt)

    database = None
    fname = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(fname, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent']
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database, display_progress)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (
        Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)
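
# A hypothetical sketch of parse_cookie_string, which main() above expects to
# turn the -c / -C value into a list of cookie dicts; the real htcap
# implementation is not shown here and may differ. It assumes the input is
# either a JSON array of cookie objects or a "name=value; name2=value2"
# header-style string.
import json


def parse_cookie_string(self, string):
    try:
        return json.loads(string)
    except ValueError:
        cookies = []
        for pair in string.split(";"):
            if "=" not in pair:
                continue
            name, _, value = pair.strip().partition("=")
            cookies.append({"name": name, "value": value})
        return cookies
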
def test_crawl(self):
    """Exercise CrawlerThread.crawl() for a normal page, an exceeded depth, and throttling."""
    url1 = "http://localhost:8081"
    q = Queue()
    q.put(url1)
    thread1 = CrawlerThread(q)
    thread1.crawl((url1, 0))
    self.assertEqual(CrawlerThreadPool.total_links, 5)

    # Depth-exceeded test: crawling at depth 101 should leave the link count unchanged.
    url2 = 'localhost:8081/mirror/page1.html'
    thread2 = CrawlerThread(q)
    thread2.crawl((url2, 101))
    self.assertEqual(CrawlerThreadPool.total_links, 5)

    # Throttling test: once the per-interval link count exceeds the configured
    # maximum, crawl() should pause for roughly the configured crawl interval.
    url3 = 'http://www.baidu.com?query=10000'
    thread3 = CrawlerThread(q)
    CrawlerThreadPool.interval_links_cnt = \
        ConfReader.instance().get_max_links_count() + 1
    t1 = time.time()
    thread3.crawl((url3, 0))
    t2 = time.time()
    self.assertAlmostEqual(t2 - t1,
                           ConfReader.instance().get_crawl_interval(), 0)
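
# A sketch of the throttling behaviour the last assertion above exercises,
# assuming CrawlerThread.crawl() sleeps for the configured interval once
# interval_links_cnt exceeds the configured maximum. The helper name and the
# counter reset are illustrative, not taken from the real implementation.
import time


def _throttle_if_needed():
    conf = ConfReader.instance()
    if CrawlerThreadPool.interval_links_cnt > conf.get_max_links_count():
        # Back off before fetching more pages, then start a new counting window.
        time.sleep(conf.get_crawl_interval())
        CrawlerThreadPool.interval_links_cnt = 0
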
parser = argparse.ArgumentParser(description="Crawls the web looking for 3D object models.")
parser.add_argument("--config", action="store", type=str)
parser.add_argument("--instances", action="store", type=int, default=10)

if __name__ == "__main__":
    # TODO(brunonery): verify arguments and fail gracefully if necessary.
    args = parser.parse_args()
    config = CrawlerConfig(open(args.config))
    # Prepare database and locks.
    database_handler = DatabaseHandler(config.database_address())
    database_handler.Init()
    url_lock = threading.Lock()
    # Prepare download queue.
    download_queue = Queue.Queue()
    # Start all crawler threads.
    crawler_thread_list = []
    for i in range(args.instances):
        current_thread = CrawlerThread(database_handler, download_queue, url_lock)
        crawler_thread_list.append(current_thread)
        current_thread.start()
    downloader_thread_list = []
    # TODO(brunonery): have different number of crawler and downloader threads.
    for i in range(args.instances):
        current_thread = DownloaderThread(download_queue,
                                          config.download_folder(),
                                          config.zip_size_limit())
        current_thread.daemon = True
        downloader_thread_list.append(current_thread)
        current_thread.start()
    # Wait for all crawler threads to finish.
    for thread in crawler_thread_list:
        thread.join()
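
# One possible way to address the first TODO above: check the arguments right
# after parser.parse_args() and exit with a clear message instead of a
# traceback. The helper name is illustrative and not part of the existing code.
import os
import sys


def VerifyArgumentsOrDie(args):
    if not args.config or not os.path.isfile(args.config):
        sys.exit("Error: --config must point to an existing configuration file.")
    if args.instances < 1:
        sys.exit("Error: --instances must be a positive integer.")
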