def get_requests_from_robots(self, request):
    purl = urlsplit(request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)
    getreq = Request(REQTYPE_LINK, "GET", url, extra_headers=Shared.options['extra_headers'])
    try:
        # HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib.error.HTTPError:
        return []
    except Exception:
        return []

    requests = []
    for line in lines:
        directive = ""
        url = None
        try:
            directive, url = re.sub(r"#.*", "", line).split(":", 1)
        except ValueError:
            continue  # ignore errors
        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
            requests.append(req)

    return adjust_requests(requests) if requests else []

def test___eq__with_post(self, remove_tokens_mock):
    a = Request("type1", "POST", "url1", data="dataXXXX")
    b = Request("type1", "POST", "url1", data="dataYYYY")

    self.assertTrue(a == b)
    self.assertEqual(remove_tokens_mock.call_args_list,
                     [call("dataXXXX"), call("dataYYYY")])

def __init__(self, data, parent):
    self.status = "ok"
    self.requests = []
    self.cookies = []
    self.redirect = None
    # if True the probe returned no error BUT the json is not closed properly
    self.partialcontent = False
    self.html = None
    self.user_output = []
    self.page_hash = 0

    status = data.pop()

    if status['status'] == "error":
        self.status = "error"
        self.errcode = status['code']

    if "partialcontent" in status:
        self.partialcontent = status['partialcontent']

    # grab cookies before creating requests
    for key, val in data:
        if key == "cookies":
            for cookie in val:
                self.cookies.append(Cookie(cookie, parent.url))

    if "redirect" in status:
        self.redirect = status['redirect']
        r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent,
                    set_cookie=self.cookies, parent_db_id=parent.db_id)
        self.requests.append(r)

    for key, val in data:
        if key == "request":
            trigger = val['trigger'] if 'trigger' in val else None
            r = Request(val['type'], val['method'], val['url'], parent=parent,
                        set_cookie=self.cookies, data=val['data'], trigger=trigger,
                        parent_db_id=parent.db_id)
            self.requests.append(r)
        elif key == "html":
            self.html = val
        elif key == "page_hash":
            page_hash = TextHash(val).hash
            self.page_hash = page_hash if page_hash else 0
        elif key == "user":
            self.user_output.append(val)

def get_requests(self, types="xhr"):
    """
    return a list of requests matching the given types

    connect, retrieve the requests list, then close the connection
    :param types: string of types (comma separated)
    :return: list of matching requests
    """
    types = types.split(",")
    ret = []
    qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s)" % ",".join("?" * len(types))

    self.connect()
    cur = self.conn.cursor()
    cur.execute(qry, types)
    for r in cur.fetchall():
        # !! parent must be null (or unset)
        req = Request(r['type'], r['method'], r['url'], referer=r['referer'],
                      data=r['data'], json_cookies=r['cookies'],
                      db_id=r['id'], parent_db_id=r['id_parent'])
        ret.append(req)
    self.close()
    return ret

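# A quick, hypothetical illustration of how the parameterized IN (...) clause
# above is assembled: one "?" placeholder per requested type (the type names
# below are made-up example values, not taken from a real database).
types = "xhr,link,form".split(",")
qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s)" % ",".join("?" * len(types))
print(qry)    # SELECT * FROM request WHERE out_of_scope=0 AND type IN (?,?,?)
print(types)  # ['xhr', 'link', 'form'] -> bound to the three placeholders
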
def get_request(self, id):
    req = None
    qry = "SELECT * FROM request WHERE out_of_scope=0 AND id=?"
    try:
        self.connect()
        cur = self.conn.cursor()
        cur.execute(qry, (str(id), ))
        r = cur.fetchone()
        # !! parent must be null (or unset)
        if r:
            req = Request(r['type'], r['method'], r['url'], referer=r['referer'],
                          data=r['data'], json_cookies=r['cookies'],
                          db_id=r['id'], parent_db_id=r['id_parent'],
                          extra_headers=json.loads(r['extra_headers']))
        self.close()
    except Exception:
        raise
    return req

def get_requests(self, types="xhr"):
    types = types.split(",")
    ret = []
    qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (" + ",".join(["?" for _ in range(0, len(types))]) + ")"
    try:
        self.connect()
        cur = self.conn.cursor()
        cur.execute(qry, types)
        for r in cur.fetchall():
            # !! parent must be null (or unset)
            req = Request(r['type'], r['method'], r['url'], referer=r['referer'],
                          data=r['data'], json_cookies=r['cookies'],
                          db_id=r['id'], parent_db_id=r['id_parent'])
            ret.append(req)
        self.close()
    except Exception as e:
        print str(e)
    return ret

def get_requests(self, types="xhr", where=None):
    types = types.split(",")
    ret = []
    qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s) AND %s ORDER BY id DESC" % (
        ",".join("?" * len(types)), "1" if where is None else where)
    try:
        self.connect()
        cur = self.conn.cursor()
        cur.execute(qry, types)
        for r in cur.fetchall():
            # !! parent must be null (or unset)
            req = Request(r['type'], r['method'], r['url'], referer=r['referer'],
                          data=r['data'], json_cookies=r['cookies'],
                          db_id=r['id'], parent_db_id=r['id_parent'],
                          extra_headers=json.loads(r['extra_headers']))
            ret.append(req)
        self.close()
    except Exception as e:
        print("114 %s" % str(e))
    return ret

def __init__(self, data, parent):
    self.status = "ok"
    self.requests = []
    self.cookies = []
    self.redirect = []
    self.errmessage = ""
    # if True the probe returned no error BUT the json is not closed properly
    self.partialcontent = False
    self.html = None
    self.user_output = []
    self.page_hash = 0

    status = data["status"]

    if status == "error":
        self.status = "error"
        self.errmessage = data["errors"]

    # grab cookies before creating requests
    for cookie in data["cookies"]:
        self.cookies.append(Cookie(cookie, parent.url))

    for redirect in data['redirect']:
        r = Request(REQTYPE_REDIRECT, "GET", redirect, parent=parent,
                    set_cookie=self.cookies, parent_db_id=parent.db_id)
        self.redirect.append(r)

    requests = data["requests"]
    for request in requests:
        request = json.loads(request)
        r = Request(request['type'], request['method'], request['url'],
                    parent=parent, parent_db_id=parent.db_id,
                    set_cookie=self.cookies, data=request['data'],
                    trigger=request.get("trigger", None),
                    extra_headers=request.get("extra_headers", None))
        self.requests.append(r)

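# Illustrative only: a minimal probe-result dict of the shape the constructor
# above expects, inferred from the keys it reads (the URL and values are
# made-up examples, not real probe output).
import json

example_probe_data = {
    "status": "ok",
    "errors": "",
    "cookies": [],    # raw cookie dicts, wrapped in Cookie(...) above
    "redirect": [],   # redirect URLs, turned into REQTYPE_REDIRECT requests above
    "requests": [     # each entry is a JSON-encoded request description
        '{"type": "xhr", "method": "GET", "url": "http://example.com/api", "data": null}'
    ],
}
first = json.loads(example_probe_data["requests"][0])
print(first["type"], first["method"], first["url"])  # -> xhr GET http://example.com/api
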
def test_set_params_for_probe(self):
    req = Request("type1", "POST", "http://example.com",
                  data="example data", http_auth="auth1")
    Shared.options['set_referer'] = None
    thread = CrawlerThread()
    params = thread._set_probe_params(req)
    self.assertIn("http://example.com/", params)

def _get_requests_from_robots(start_request):
    """
    read the robots.txt file (if any) and create a list of requests based on its content
    :return: list of request
    """
    purl = urlsplit(start_request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)
    getreq = Request(REQTYPE_LINK, "GET", url)
    try:
        # HttpGet(request, timeout, retries=None, user_agent=None, proxy=None)
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib2.HTTPError:
        return []
    except:
        raise

    requests = []
    for line in lines:
        directive = ""
        url = None
        try:
            directive, url = re.sub(r"#.*", "", line).split(":", 1)
        except Exception as e:
            print(str(e))
            continue  # ignore errors
        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=start_request)
            if request_is_crawlable(req):
                requests.append(req)

    return adjust_requests(requests) if requests else []

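# A minimal standalone sketch of the robots.txt line parsing used above:
# comments are stripped, the line is split on the first ":", and only
# Allow/Disallow directives are kept (the sample line is made up).
import re

line = "Disallow: /private/  # not for crawlers"
directive, path = re.sub(r"#.*", "", line).split(":", 1)
if re.match("(dis)?allow", directive.strip(), re.I):
    print(directive.strip(), path.strip())  # -> Disallow /private/
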
def get_requests(self):
    requests = []
    try:
        headers = {
            "user-agent": self.useragent,
        }
        headers.update(self.extra_headers)
        res = reqlib.request(method=self.request.method,
                             url=self.request.url,
                             headers=headers,
                             verify=False,
                             timeout=self.timeout,
                             cookies=toReqCok(self.request.cookies),
                             proxies=self.proxy)
    except Exception as e:
        raise e

    log.debug("HttpGet get_requests ===> %s,%d,%d" % (self.request.url, res.status_code, len(res.text)))

    content_type = res.headers.get("content-type")
    if content_type is not None and content_type.lower().split(";")[0] != "text/html":
        raise NotHtmlException(ERROR_CONTENTTYPE)

    if res.content is None:
        raise NotHtmlException

    try:
        urls = get_urls(res.text)
        for url in urls:
            # @TODO handle FORMS
            requests.append(Request(REQTYPE_LINK, "GET", url,
                                    parent=self.request,
                                    set_cookie=res.headers.get("set-cookie"),
                                    parent_db_id=self.request.db_id))
    except Exception as e:
        raise e

    return requests

def rawsend(self, url, method=None, data=None, cookies=None, user_agent=None,
            proxy=None, extra_headers=None, req_timeout=5, ignore_errors=False):
    if not method:
        method = METHOD_GET
    req = Request(REQTYPE_LINK, method, url)
    http = HttpGet(req, req_timeout, proxy=proxy, useragent=user_agent,
                   extra_headers=extra_headers)
    return http.send_request(method=method, url=url, data=data,
                             cookies=cookies, ignore_errors=ignore_errors)

def get_not_crawled_request(self):
    """
    connect, retrieve existing never crawled requests then close the connection
    :return: list of request
    """
    requests = []
    query = "SELECT * FROM request WHERE crawled=0 AND out_of_scope=0"

    self.connect()
    cur = self.conn.cursor()
    cur.execute(query)
    for request in cur.fetchall():
        req = Request(request['type'], request['method'], request['url'],
                      referer=request['referer'], data=request['data'],
                      json_cookies=request['cookies'], db_id=request['id'],
                      parent_db_id=request['id_parent'])
        requests.append(req)
    self.close()
    return requests

def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    deps_errors = check_dependences(self.base_dir)
    if len(deps_errors) > 0:
        print("Dependencies errors: ")
        for err in deps_errors:
            print(" %s" % err)
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False

    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:L:Mg:')
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print("error reading cookie file")
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            self.display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            try:
                Shared.options['proxy'] = parse_proxy_string(v)
            except Exception as e:
                print(e)
                sys.exit(1)
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com to ((.*\.)|)domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                try:
                    re.match(eu, "")
                except:
                    print("* ERROR: regex failed: %s" % eu)
                    sys.exit(1)
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print("* ERROR: wrong scope set '%s'" % v)
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print("* ERROR: wrong mode set '%s'" % v)
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            initial_checks = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            self.verbose = True
        elif o == "-e":
            Shared.options['deduplicate_pages'] = False
        elif o == "-l":
            Shared.options['headless_chrome'] = False
        elif o == "-M":
            Shared.options['simulate_real_events'] = False
        elif o == "-E":
            if not Shared.options['extra_headers']:
                Shared.options['extra_headers'] = {}
            (hn, hv) = v.split("=", 1)
            Shared.options['extra_headers'][hn] = hv
        elif o == "-L":
            try:
                with open(v) as cf:
                    Shared.options['login_sequence'] = json.loads(cf.read())
                    Shared.options['login_sequence']["__file__"] = os.path.abspath(v)
            except ValueError as e:
                print("* ERROR: decoding login sequence")
                sys.exit(1)
            except Exception as e:
                print("* ERROR: login sequence file not found")
                sys.exit(1)
        elif o == "-g":
            if not Shared.options['local_storage']:
                Shared.options['local_storage'] = {}
            (hn, hv) = v.split("=", 1)
            ktks = hn.split(":", 1)
            if len(ktks) != 2 or ktks[0] not in ("L", "S"):
                print("Error: the -g option must be in the form '[L|S]:key=value', use 'L' to set localStorage and 'S' to set sessionStorage")
                sys.exit(1)
            Shared.options['local_storage'][ktks[1]] = {"type": ktks[0], "value": hv}

    probe_cmd = get_node_cmd()
    if not probe_cmd:  # maybe useless
        print("Error: unable to find node executable")
        sys.exit(1)

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print("* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN)

    if cookie_string:
        try:
            start_cookies = parse_cookie_string(cookie_string)
        except Exception as e:
            print("error decoding cookie string")
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events

    if Shared.options['proxy']:
        probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'],
                                                  Shared.options['proxy']['host'],
                                                  Shared.options['proxy']['port'])])
    if not Shared.options['headless_chrome']:
        probe_options.append("-l")
    probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")

    if Shared.options['extra_headers']:
        probe_options.extend(["-E", json.dumps(Shared.options['extra_headers'])])

    if Shared.options['local_storage']:
        probe_options.extend(["-g", json.dumps(Shared.options['local_storage'])])

    if not Shared.options['simulate_real_events']:
        probe_options.append("-M")

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    if Shared.options['login_sequence'] and Shared.options['login_sequence']['type'] == LOGSEQTYPE_SHARED:
        login_req = Request(REQTYPE_LINK, "GET", Shared.options['login_sequence']['url'],
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer,
                            extra_headers=Shared.options['extra_headers'])
        stdoutw("Logging in . . . ")
        try:
            pe = ProbeExecutor(login_req, Shared.probe_cmd + ["-z"],
                               login_sequence=Shared.options['login_sequence'])
            probe = pe.execute()
            if not probe:
                print("\n* ERROR: login sequence failed to execute probe")
                sys.exit(1)
            if probe.status == "ok":
                for c in probe.cookies:
                    if not Shared.options['login_sequence']['cookies'] or c.name in Shared.options['login_sequence']['cookies']:
                        Shared.start_cookies.append(c)
            else:
                print("\n* ERROR: login sequence failed:\n %s" % probe.errmessage)
                sys.exit(1)
        except KeyboardInterrupt:
            pe.terminate()
            print("\nAborted")
            sys.exit(0)
        print("done")

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer,
                        extra_headers=Shared.options['extra_headers'])

    if not hasattr(ssl, "SSLContext"):
        print("* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors")

    stdoutw("Initializing . ")

    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)

    database = None
    self.db_file = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(self.db_file, out_file)
    except Exception as e:
        print(str(e))
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent'],
        proxy=json.dumps(Shared.options['proxy']),
        extra_headers=json.dumps(Shared.options['extra_headers']),
        cookies=json.dumps([x.get_dict() for x in Shared.start_cookies])
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print("done")
    print("Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)" % (self.db_file, num_threads))

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print("Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) // 60))

    database.save_crawl_info(end_date=self.crawl_end_time)

def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    deps_errors = check_dependences(self.base_dir)
    if len(deps_errors) > 0:
        print "Dependencies errors: "
        for err in deps_errors:
            print " %s" % err
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False

    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OveLlE:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            self.display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            try:
                Shared.options['proxy'] = parse_proxy_string(v)
            except Exception as e:
                print e
                sys.exit(1)
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com to ((.*\.)|)domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            initial_checks = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            self.verbose = True
        elif o == "-e":
            Shared.options['deduplicate_pages'] = False
        elif o == "-L":
            Shared.options['use_legacy_browser'] = True
        elif o == "-l":
            Shared.options['headless_chrome'] = False
        elif o == "-E":
            if not Shared.options['extra_headers']:
                Shared.options['extra_headers'] = {}
            (hn, hv) = v.split("=", 1)
            Shared.options['extra_headers'][hn] = hv

    probe_cmd = get_phantomjs_cmd() if Shared.options['use_legacy_browser'] else get_node_cmd()
    if not probe_cmd:  # maybe useless
        print "Error: unable to find node (or phantomjs) executable"
        sys.exit(1)

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events

    if Shared.options['use_legacy_browser']:
        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'],
                                                Shared.options['proxy']['port']))
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))
    else:
        if Shared.options['proxy']:
            probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'],
                                                      Shared.options['proxy']['host'],
                                                      Shared.options['proxy']['port'])])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'chrome-probe', 'analyze.js'))

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")

    if Shared.options['extra_headers']:
        probe_options.extend(["-E", json.dumps(Shared.options['extra_headers'])])

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

    stdoutw("Initializing . ")

    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)

    database = None
    self.db_file = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(self.db_file, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent'],
        proxy=json.dumps(Shared.options['proxy']),
        extra_headers=json.dumps(Shared.options['extra_headers']),
        cookies=json.dumps(start_cookies)
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads" % (self.db_file, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)

def test___eq__(self, remove_tokens_mock):
    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    self.assertTrue(a == b)

    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type2", "method1", "url1", data="data1", http_auth="auth1")
    self.assertFalse(a == b)

    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type1", "method2", "url1", data="data1", http_auth="auth1")
    self.assertFalse(a == b)

    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type1", "method1", "url2", data="data1", http_auth="auth1")
    self.assertFalse(a == b)

    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type1", "method1", "url1", data="data2", http_auth="auth1")
    self.assertFalse(a == b)

    a = Request("type1", "method1", "url1", data="data1", http_auth="auth1")
    b = Request("type1", "method1", "url1", data="data1", http_auth="auth2")
    self.assertFalse(a == b)

    a = Request("type1", "method1", "url1")
    b = None
    self.assertFalse(a == b)

    self.assertEqual(remove_tokens_mock.call_count, 0)

def run(self):
    # get database
    try:
        database = self._get_database(self._outfile_name, self._output_mode)
        crawl_id = database.save_crawl_info(
            htcap_version=get_program_infos()['version'],
            target=Shared.start_url,
            start_date=self.crawl_start_date,
            commandline=cmd_to_str(self.arg),
            user_agent=Shared.options['user_agent'],
            start_cookies=Shared.start_cookies)

        # if the current crawl is not the first one
        if crawl_id > 1:
            # retrieving options from the last crawl
            random_seed, cookies = database.retrieve_crawl_info(crawl_id - 1)

            # if the db had a seed and none was provided before
            if random_seed and not Shared.options.get("random_seed"):
                Shared.options["random_seed"] = random_seed

            # if no cookie was provided and some exist from the last crawl
            if len(Shared.start_cookies) <= 0 and cookies != "[]" and cookies is not None:
                for cookie_string in self._parse_cookie_string(cookies):
                    Shared.start_cookies.append(Cookie(cookie_string))

        # if no seed has been set yet
        if not Shared.options.get("random_seed"):
            Shared.options["random_seed"] = self._generate_random_string(20)
    except Exception as e:
        print(str(e))
        sys.exit(1)

    # set probe arguments
    self._set_probe()
    Shared.probe_cmd = self._probe["cmd"] + self._probe["options"]

    start_requests = []

    # create the start request object from provided arguments
    start_request_from_args = Request(REQTYPE_LINK, "GET", Shared.start_url,
                                      set_cookie=Shared.start_cookies,
                                      http_auth=self._http_auth,
                                      referer=self._start_referer)

    def _is_not_in_past_requests(request):
        """
        check that the given request is NOT already present in Shared.requests or start_requests
        """
        is_in_request = True
        for r in Shared.requests + start_requests:
            if r == request:
                is_in_request = False
        return is_in_request

    # check starting url
    if self._initial_checks:
        try:
            self._check_request(start_request_from_args)
            stdoutw(". ")
        except KeyboardInterrupt:
            print("\nAborted")
            sys.exit(0)

    if self._output_mode in (CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
        try:
            # make the start url given in arguments crawlable again
            database.connect()
            database.save_request(start_request_from_args)
            database.make_request_crawlable(start_request_from_args)
            database.commit()
            database.close()

            # feeding the "done" request list from the db
            Shared.requests.extend(database.get_crawled_request())
            Shared.requests_index = len(Shared.requests)

            # if resume, add requests from db
            if self._output_mode == CRAWLOUTPUT_RESUME:
                start_requests.extend(database.get_not_crawled_request())

            # if the request from args is in neither past nor future requests
            if _is_not_in_past_requests(start_request_from_args):
                start_requests.append(start_request_from_args)
        except Exception as e:
            print(str(e))
            sys.exit(1)
    else:
        start_requests.append(start_request_from_args)

    # retrieving robots.txt content
    if self._get_robots_txt:
        try:
            start_requests.extend(
                filter(_is_not_in_past_requests,
                       self._get_requests_from_robots(start_request_from_args)))
        except KeyboardInterrupt:
            print("\nAborted")
            sys.exit(0)

    # save starting requests to db
    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print("\nDone: {} starting url(s) and {} url(s) already crawled".format(
        len(start_requests), len(Shared.requests)))

    # starting crawling threads
    print("Database %s initialized, crawl starting with %d threads" % (database, self._num_threads))

    for n in range(0, self._num_threads):
        thread = CrawlerThread()
        self._threads.append(thread)
        thread.start()

    # running crawl loop
    self._main_loop(self._threads, start_requests, database,
                    self._display_progress, self._verbose)

    self._kill_threads(self._threads)

    self.crawl_end_date = int(time.time())

    print("Crawl finished, %d pages analyzed in %d minutes" %
          (Shared.requests_index, (self.crawl_end_date - self.crawl_start_date) / 60))

    # update end date in db
    database.update_crawl_info(crawl_id, self.crawl_end_date,
                               Shared.options["random_seed"], Shared.end_cookies)

def get_requests(self):  # Shared.options['process_timeout']
    if self.request.method == "POST":
        raise Exception("POST method with urllib is not supported yet")

    self.retries_interval = 0.5
    jar_response = cookielib.LWPCookieJar()
    jar_request = cookielib.LWPCookieJar()
    html = ""
    set_cookie = []
    requests = []

    while True:
        try:
            for cookie in self.request.cookies:
                jar_request.set_cookie(cookie.get_cookielib_cookie())

            opener = self.urllib2_opener(self.request, jar_response)
            req = urllib2.Request(url=self.request.url)
            jar_request.add_cookie_header(req)
            res = opener.open(req, None, self.timeout)

            for cookie in jar_response:
                set_cookie.append(Cookie(cookie.__dict__, self.request.url))

            ctype = res.info()['Content-Type']  # @TODO !! WRONG!! (check if wrong...not sure)
            if ctype is not None:
                if ctype.lower().split(";")[0] != "text/html":
                    opener.close()
                    raise NotHtmlException(ERROR_CONTENTTYPE)

            html = res.read()
            opener.close()

            if html:
                html = decode_bytes(html)

            finder = UrlFinder(html)
            try:
                urls = finder.get_urls()
            except Exception as e:
                raise

            for url in urls:
                # @TODO handle FORMS
                requests.append(Request(REQTYPE_LINK, "GET", url,
                                        parent=self.request,
                                        set_cookie=set_cookie,
                                        parent_db_id=self.request.db_id))
            break
        except RedirectException as e:
            set_cookie = []
            for cookie in jar_response:
                set_cookie.append(Cookie(cookie.__dict__, self.request.url))
            r = Request(REQTYPE_REDIRECT, "GET", str(e), parent=self.request,
                        set_cookie=set_cookie, parent_db_id=self.request.db_id)
            requests.append(r)
            break
        except NotHtmlException:
            raise
        except Exception as e:
            self.retries -= 1
            if self.retries == 0:
                raise
            time.sleep(self.retries_interval)

    return requests

def main(self, argv):
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    probe_cmd = get_phantomjs_cmd()
    if not probe_cmd:
        print "Error: unable to find phantomjs executable"
        sys.exit(1)

    start_cookies = []
    start_referer = None

    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']

    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    display_progress = True
    verbose = False
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False
    user_script = None

    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)

    if len(args) < 2:
        self.usage()
        sys.exit(1)

    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':
            cookie_string = v
        elif o == '-C':
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':
            start_referer = v
        elif o == '-n':
            num_threads = int(v)
        elif o == '-t':
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':
            display_progress = False
        elif o == '-A':
            http_auth = v
        elif o == '-p':
            if v == "tor":
                v = "socks5:127.0.0.1:9150"
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print "only http and socks5 proxies are supported"
                sys.exit(1)
            Shared.options['proxy'] = {"proto": proxy[0], "host": proxy[1], "port": proxy[2]}
        elif o == '-d':
            for ad in v.split(","):
                # convert *.domain.com to ((.*\.)|)domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":
            initial_checks = False
        elif o == "-I":
            get_robots_txt = False
        elif o == "-H":
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            verbose = True
        elif o == "-u":
            if os.path.isfile(v):
                user_script = os.path.abspath(v)
            else:
                print "error: unable to open USER_SCRIPT"
                sys.exit(1)

    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

    if cookie_string:
        try:
            start_cookies = self.parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)

    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events

    if Shared.options['proxy']:
        probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
        probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'],
                                            Shared.options['proxy']['port']))

    probe_cmd.append(self.base_dir + 'probe/analyze.js')

    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

    if save_html:
        probe_options.append("-H")

    if user_script:
        probe_options.extend(("-u", user_script))

    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))

    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")

    Shared.probe_cmd = probe_cmd + probe_options

    Shared.starturl = normalize_url(args[0])
    out_file = args[1]

    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)

    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))

    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl,
                        set_cookie=Shared.start_cookies,
                        http_auth=http_auth,
                        referer=start_referer)

    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

    stdoutw("Initializing . ")

    if user_script and initial_checks:
        self.check_user_script_syntax(probe_cmd, user_script)

    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)

    database = None
    fname = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(fname, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)

    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent']
    )

    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()

    print "done"
    print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)

    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()

    self.main_loop(threads, start_requests, database, display_progress, verbose)

    self.kill_threads(threads)

    self.crawl_end_time = int(time.time())

    print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)

    database.save_crawl_info(end_date=self.crawl_end_time)
