def testHandleURLWorks(self):
    mock_download_queue = mock.Mock(Queue.Queue)
    crawler_thread = CrawlerThread(None, mock_download_queue, None)
    crawler_thread.HandleHtmlResource = mock.Mock()
    with testfixtures.Replacer() as r:
        # HTML resource.
        html_resource = CreateFakeURLResource('text/html')
        r.replace('urllib2.urlopen', mock.Mock(return_value=html_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        crawler_thread.HandleHtmlResource.assert_called_with(html_resource)
        # Zip resource.
        zip_resource = CreateFakeURLResource('application/zip')
        r.replace('urllib2.urlopen', mock.Mock(return_value=zip_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        mock_download_queue.put.assert_called_with(zip_resource)
        # Plain text resource.
        text_resource = CreateFakeURLResource('text/plain')
        r.replace('urllib2.urlopen', mock.Mock(return_value=text_resource))
        crawler_thread.HandleURL('http://www.fake.com/')
        mock_download_queue.put.assert_called_with(text_resource)
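
The helper CreateFakeURLResource used above is not shown in the snippet. A minimal sketch, assuming HandleURL decides how to dispatch a resource by reading the Content-Type reported by the object urllib2.urlopen returns:

import mock

def CreateFakeURLResource(content_type):
    # Mimics the object returned by urllib2.urlopen: info() exposes the
    # Content-Type header.  Which accessor HandleURL actually uses
    # (gettype() or getheader()) is an assumption, so both are stubbed.
    resource = mock.Mock()
    resource.info.return_value.gettype.return_value = content_type
    resource.info.return_value.getheader.return_value = content_type
    return resource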
Example #2
	def main(self, argv):
		Shared.options = self.defaults		
		Shared.th_condition = threading.Condition()
		Shared.main_condition = threading.Condition()


		probe_cmd = self.get_phantomjs_cmd()
		if not probe_cmd:
			print "Error: unable to find phantomjs executable"
			sys.exit(1)

		start_cookies = []
		start_referer = None
		
		probe_options = ["-R", self.randstr(20)]
		threads = []
		num_threads = self.defaults['num_threads']			
	
		out_file = ""
		out_file_overwrite = self.defaults['out_file_overwrite']	
		cookie_string = None
		display_progress = True
		check_starturl = True
		http_auth = None
		get_robots_txt = True
		save_html = False

		try:
			opts, args = getopt.getopt(argv, 'hc:t:jn:x:O:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:')
		except getopt.GetoptError as err:
			print str(err)	
			sys.exit(1)
		
		
		if len(args) < 2:
			self.usage()
			sys.exit(1)
		

		for o, v in opts:
			if o == '-h':
				self.usage()
				sys.exit(0)
			elif o == '-c':
				cookie_string = v
			elif o == '-C':
				try:
					with open(v) as cf:
						cookie_string = cf.read()
				except Exception as e:				
					print "error reading cookie file"		
					sys.exit(1)			
			elif o == '-r':
				start_referer = v
			elif o == '-n':
				num_threads = int(v)
			elif o == '-t':
				Shared.options['process_timeout'] = int(v)
			elif o == '-q':
				display_progress = False				
			elif o == '-A':
				http_auth = v
			elif o == '-p':			
				if v == "tor": v = "socks5:127.0.0.1:9150"
				proxy =  v.split(":")
				if proxy[0] not in ("http", "https"): 
					print "only http and socks5 proxies are supported"
					sys.exit(1)
				Shared.options['proxy'] = {"proto":proxy[0], "host":proxy[1], "port":proxy[2]}
			elif o == '-d':			
				for ad in v.split(","):
					# turn the leading *. wildcard into a regex matching the bare domain and any subdomain
					pattern = re.escape(ad).replace("\\*\\.","((.*\\.)|)")
					Shared.allowed_domains.add(pattern)
			elif o == '-x':	
				for eu in v.split(","):					
					Shared.excluded_urls.add(eu)								
			elif o == "-G":
				Shared.options['group_qs'] = True			
			elif o == "-w":
				out_file_overwrite = True			
			elif o == "-R":
				Shared.options['max_redirects'] = int(v)
			elif o == "-U":
				Shared.options['useragent'] = v			
			elif o == "-s":
				if v not in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
					self.usage()
					print "* ERROR: wrong scope set '%s'" % v
					sys.exit(1)
				Shared.options['scope'] = v
			elif o == "-m":
				if v not in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
					self.usage()
					print "* ERROR: wrong mode set '%s'" % v					
					sys.exit(1)
				Shared.options['mode'] = v
			elif o == "-S":
				check_starturl = False
			elif o == "-I":
				get_robots_txt = False	
			elif o == "-H":
				save_html = True
			elif o == "-D":
				Shared.options['max_depth'] = int(v)
			elif o == "-P":
				Shared.options['max_post_depth'] = int(v)				
			elif o == "-F":
				Shared.options['crawl_forms'] = False


		if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
			print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

		if cookie_string:
			try:
				start_cookies = self.parse_cookie_string(cookie_string)
			except Exception as e:				
				print "error decoding cookie string"		
				sys.exit(1)			

		

		if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
			probe_options.append("-f") # don't fill values
		if Shared.options['mode'] == CRAWLMODE_PASSIVE:
			probe_options.append("-t") # don't trigger events

		if Shared.options['proxy']:
			probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
			probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
		
		if len(Shared.excluded_urls) > 0:			
			probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

		if save_html:
			probe_options.append("-H")

		probe_options.extend(("-x", str(Shared.options['process_timeout']) ))
		probe_options.extend(("-A", Shared.options['useragent']))

		probe_cmd.append(self.base_dir + 'probe/analyze.js')
		
		Shared.probe_cmd = probe_cmd + probe_options 


		Shared.starturl = normalize_url(args[0])
		out_file = args[1]
		
		purl = urlsplit(Shared.starturl)
		Shared.allowed_domains.add(purl.hostname)


		for sc in start_cookies:		
			Shared.start_cookies.append(Cookie(sc, Shared.starturl))
		

		start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
		
		stdoutw("Initializing . ")

		start_requests = self.init_crawl(start_req, check_starturl, get_robots_txt)
				
		database = None
		fname = self.generate_filename(out_file, out_file_overwrite)
		try:
			database = self.init_db(fname, out_file)
		except Exception as e:
			print str(e)
			sys.exit(1)

		database.save_crawl_info(
			htcap_version = get_program_infos()['version'],
			target = Shared.starturl,
			start_date = self.crawl_start_time,
			commandline = cmd_to_str(argv),
			user_agent = Shared.options['useragent']
		)

		database.connect()
		database.begin()		
		for req in start_requests:
			database.save_request(req)
		database.commit()
		database.close()

		print "done"
		print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

		for n in range(0, num_threads):	
			thread = CrawlerThread()			
			threads.append(thread)		
			thread.start()
		

		self.main_loop(threads, start_requests, database, display_progress)		
		
		self.kill_threads(threads)

		self.crawl_end_time = int(time.time())

		print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

		database.save_crawl_info(end_date=self.crawl_end_time)
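
A minimal invocation sketch for the method above. The class name Crawler is a placeholder; the argument layout (getopt options followed by the start URL and the output file, read as args[0] and args[1]) comes from the code itself.

import sys

if __name__ == "__main__":
	# e.g. python crawler.py -n 10 http://target.local target.db
	# `Crawler` is an assumed name for the class that defines main() above.
	Crawler().main(sys.argv[1:])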
Example #3
    def test_crawl(self):
        """
        :return:
        """
        url1 = "http://localhost:8081"
        q = Queue()
        q.put(url1)
        thread1 = CrawlerThread(q)
        thread1.crawl((url1, 0))
        self.assertEqual(CrawlerThreadPool.total_links, 5)

        # depth exceeded: the link count should not increase
        url2 = 'localhost:8081/mirror/page1.html'
        thread2 = CrawlerThread(q)
        thread2.crawl((url2, 101))
        self.assertEqual(CrawlerThreadPool.total_links, 5)

        url3 = 'http://www.baidu.com?query=10000'
        thread3 = CrawlerThread(q)
        CrawlerThreadPool.interval_links_cnt = \
            ConfReader.instance().get_max_links_count() + 1
        t1 = time.time()
        thread3.crawl((url3, 0))
        t2 = time.time()
        self.assertAlmostEqual(t2 - t1,
                               ConfReader.instance().get_crawl_interval(), 0)
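
This test expects a fixture site on http://localhost:8081 whose start page exposes exactly five links, plus a populated ConfReader configuration. A minimal sketch of serving such a fixture with the Python 2 standard library; the directory layout is an assumption and the real test data may be generated differently.

import os
import SimpleHTTPServer
import SocketServer

def serve_fixture(root=".", port=8081):
    # Serve the directory that contains the mirror/ fixture pages so that
    # http://localhost:8081/ and /mirror/page1.html both resolve.
    os.chdir(root)
    httpd = SocketServer.TCPServer(("", port), SimpleHTTPServer.SimpleHTTPRequestHandler)
    httpd.serve_forever()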
Example #4
import argparse
import threading
import Queue  # Python 2 standard-library queue

# CrawlerConfig, DatabaseHandler, CrawlerThread and DownloaderThread are
# project-specific classes imported elsewhere in the original module.
parser = argparse.ArgumentParser(description="Crawls the web looking for 3D object models.")
parser.add_argument("--config", action="store", type=str)
parser.add_argument("--instances", action="store", type=int, default=10)

if __name__ == "__main__":
    # TODO(brunonery): verify arguments and fail gracefully if necessary.
    args = parser.parse_args()
    config = CrawlerConfig(open(args.config))
    # Prepare database and locks.
    database_handler = DatabaseHandler(config.database_address())
    database_handler.Init()
    url_lock = threading.Lock()
    # Prepare download queue.
    download_queue = Queue.Queue()
    # Start all threads.
    crawler_thread_list = []
    for i in range(args.instances):
        current_thread = CrawlerThread(database_handler, download_queue, url_lock)
        crawler_thread_list.append(current_thread)
        current_thread.start()
    downloader_thread_list = []
    # TODO(brunonery): have different number of crawler and downloader threads.
    for i in range(args.instances):
        current_thread = DownloaderThread(download_queue, config.download_folder(), config.zip_size_limit())
        current_thread.daemon = True
        downloader_thread_list.append(current_thread)
        current_thread.start()
    # Wait for all crawler threads to finish.
    for thread in crawler_thread_list:
        thread.join()
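
The downloader threads are daemons, so anything still sitting in download_queue when the crawler threads finish is dropped at interpreter exit. A hedged addition, assuming DownloaderThread calls task_done() for every item it takes off the queue:

    # Let pending downloads drain before the process exits.  join() blocks
    # until every put() has been matched by a task_done(), so this assumes
    # DownloaderThread reports completion that way.
    download_queue.join()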