def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    deps_errors = check_dependences(self.base_dir)
    if len(deps_errors) > 0:
        print "Dependency errors:"
        for err in deps_errors:
            print "  %s" % err
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False

    try:
        opts, args = getopt.getopt(
            argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            self.display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            try:
                Shared.options['proxy'] = parse_proxy_string(v)
            except Exception as e:
                print e
                sys.exit(1)
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com into a regex matching the domain and any subdomain
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            initial_checks = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            self.verbose = True
        elif o == "-e":
            Shared.options['deduplicate_pages'] = False
        elif o == "-l":
            Shared.options['headless_chrome'] = False
        elif o == "-E":
            if not Shared.options['extra_headers']:
                Shared.options['extra_headers'] = {}
            (hn, hv) = v.split("=", 1)
            Shared.options['extra_headers'][hn] = hv

    probe_cmd = get_node_cmd()
    if not probe_cmd:  # maybe useless
        print "Error: unable to find node executable"
        sys.exit(1)

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # don't fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # don't trigger events

    if Shared.options['proxy']:
        probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'],
                                                  Shared.options['proxy']['host'],
                                                  Shared.options['proxy']['port'])])
    if not Shared.options['headless_chrome']:
        probe_options.append("-l")

    probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")

    if Shared.options['extra_headers']:
        probe_options.extend(["-E", json.dumps(Shared.options['extra_headers'])])

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider upgrading to >= 2.7.9 in case of SSL errors"

    stdoutw("Initializing . ")

    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)

    database = None
    self.db_file = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(self.db_file, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent'],
        proxy=json.dumps(Shared.options['proxy']),
        extra_headers=json.dumps(Shared.options['extra_headers']),
        cookies=json.dumps(start_cookies)
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)" % (
        self.db_file, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (
        Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)
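# The node/headless-chrome variant above delegates -p parsing to a helper named
# parse_proxy_string(), whose real implementation lives elsewhere in the repo.
# The sketch below is a hypothetical reconstruction, inferred from the inline
# parsing done by the phantomjs variants further down ("tor" as a shortcut for
# a local socks5 proxy, otherwise a proto:host:port string).
def parse_proxy_string(string):
    if string == "tor":
        string = "socks5:127.0.0.1:9150"
    proxy = string.split(":")
    if len(proxy) != 3:
        raise Exception("bad proxy format, expected proto:host:port")
    if proxy[0] not in ("http", "socks5"):
        raise Exception("only http and socks5 proxies are supported")
    return {"proto": proxy[0], "host": proxy[1], "port": proxy[2]}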
def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    probe_cmd = get_phantomjs_cmd()
    if not probe_cmd:
        print "Error: unable to find phantomjs executable"
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    display_progress = True
    verbose = False
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False
    user_script = None

    try:
        opts, args = getopt.getopt(
            argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            if v == "tor":
                v = "socks5:127.0.0.1:9150"
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print "only http and socks5 proxies are supported"
                sys.exit(1)
            Shared.options['proxy'] = {
                "proto": proxy[0],
                "host": proxy[1],
                "port": proxy[2]
            }
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com into a regex matching the domain and any subdomain
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            initial_checks = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            verbose = True
        elif o == "-u":
            if os.path.isfile(v):
                user_script = os.path.abspath(v)
            else:
                print "error: unable to open USER_SCRIPT"
                sys.exit(1)

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = self.parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # don't fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # don't trigger events

    if Shared.options['proxy']:
        probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
        probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'],
                                            Shared.options['proxy']['port']))

    probe_cmd.append(self.base_dir + 'probe/analyze.js')

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    if user_script:
        probe_options.extend(("-u", user_script))

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider upgrading to >= 2.7.9 in case of SSL errors"

    stdoutw("Initializing . ")

    if user_script and initial_checks:
        self.check_user_script_syntax(probe_cmd, user_script)

    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)

    database = None
    fname = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(fname, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent']
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database, display_progress, verbose)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (
        Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)
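# Both phantomjs variants call self.parse_cookie_string() on the value of
# -c / -C. The method is defined elsewhere in the class; a minimal sketch,
# assuming it accepts either a JSON array of cookie objects or a plain
# "name=value; name2=value2" header string, could look like the following
# (the JSON-first fallback behaviour is an assumption, not confirmed here).
def parse_cookie_string(self, string):
    try:
        # a JSON array of {"name": ..., "value": ...} objects
        return json.loads(string)
    except ValueError:
        pass
    cookies = []
    for tok in string.split(";"):
        tok = tok.strip()
        if not tok:
            continue
        (name, _, value) = tok.partition("=")
        cookies.append({"name": name.strip(), "value": value.strip()})
    return cookies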
def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    probe_cmd = self.get_phantomjs_cmd()
    if not probe_cmd:
        print "Error: unable to find phantomjs executable"
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    display_progress = True
    check_starturl = True
    http_auth = None
    get_robots_txt = True
    save_html = False

    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:O:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            if v == "tor":
                v = "socks5:127.0.0.1:9150"
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print "only http and socks5 proxies are supported"
                sys.exit(1)
            Shared.options['proxy'] = {"proto": proxy[0], "host": proxy[1], "port": proxy[2]}
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com into a regex matching the domain and any subdomain
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            check_starturl = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-F":
            Shared.options['crawl_forms'] = False

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = self.parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # don't fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # don't trigger events

    if Shared.options['proxy']:
        probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
        probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'],
                                            Shared.options['proxy']['port']))

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    probe_cmd.append(self.base_dir + 'probe/analyze.js')

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    stdoutw("Initializing . ")

    start_requests = self.init_crawl(start_req, check_starturl, get_robots_txt)

    database = None
    fname = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(fname, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent']
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database, display_progress)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (
        Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)
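# Quick illustration of the -d wildcard handling used by every variant above:
# "*.example.com" is escaped and rewritten into a regex that matches the bare
# domain as well as any subdomain (example.com is just a placeholder here).
import re

pattern = re.escape("*.example.com").replace("\\*\\.", "((.*\\.)|)")
print pattern                                                    # ((.*\.)|)example\.com
print re.match(pattern + "$", "www.example.com") is not None     # True
print re.match(pattern + "$", "example.com") is not None         # True
print re.match(pattern + "$", "notexample.com") is not None      # False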
parser = argparse.ArgumentParser(description="Crawls the web looking for 3D object models.")
parser.add_argument("--config", action="store", type=str)
parser.add_argument("--instances", action="store", type=int, default=10)

if __name__ == "__main__":
    # TODO(brunonery): verify arguments and fail gracefully if necessary.
    args = parser.parse_args()
    config = CrawlerConfig(open(args.config))
    # Prepare database and locks.
    database_handler = DatabaseHandler(config.database_address())
    database_handler.Init()
    url_lock = threading.Lock()
    # Prepare download queue.
    download_queue = Queue.Queue()
    # Start all threads.
    crawler_thread_list = []
    for i in range(args.instances):
        current_thread = CrawlerThread(database_handler, download_queue, url_lock)
        crawler_thread_list.append(current_thread)
        current_thread.start()
    downloader_thread_list = []
    # TODO(brunonery): have different number of crawler and downloader threads.
    for i in range(args.instances):
        current_thread = DownloaderThread(download_queue, config.download_folder(),
                                          config.zip_size_limit())
        current_thread.daemon = True
        downloader_thread_list.append(current_thread)
        current_thread.start()
    # Wait for all crawler threads to finish.
    for thread in crawler_thread_list:
        thread.join()
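# Note on the shutdown behaviour above: the downloader threads are daemons, so
# the process exits as soon as the crawler threads have been joined, even if
# items are still waiting in download_queue. If a graceful drain were preferred
# instead, one hypothetical option (assuming DownloaderThread calls
# download_queue.task_done() after handling each item) would be to block on the
# queue before returning:
#
#     for thread in crawler_thread_list:
#         thread.join()
#     download_queue.join()  # wait until every queued download has been handled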