# spider a given webpage for all available URLs
elif opt.spiderWebSite:
    problem_identifiers = ["http://", "https://"]
    if not URL_REGEX.match(opt.spiderWebSite):
        err_msg = "URL did not match a true URL{}"
        if not any(m in opt.spiderWebSite for m in problem_identifiers):
            err_msg = err_msg.format(
                " issue seems to be that http:// "
                "or https:// is not present in the URL"
            )
        else:
            err_msg = err_msg.format("")
        raise InvalidInputProvided(err_msg)
    else:
        if URL_QUERY_REGEX.match(opt.spiderWebSite):
            question_msg = (
                "it is recommended to not use a URL that has a GET(query) parameter in it, "
                "would you like to continue"
            )
            if not opt.runInBatch:
                is_sure = prompt(question_msg, opts="yN")
            else:
                is_sure = prompt(question_msg, opts="yN", default="y")
            if is_sure.lower().startswith("y"):
                pass
            else:
                # user declined to continue with a query URL (assumed behavior: stop here)
                shutdown()
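# --- illustrative sketch only ---
# URL_REGEX and URL_QUERY_REGEX above are imported from the project's settings;
# the patterns below are assumptions written to show the kind of check each one
# performs (a well-formed http(s) URL vs. a URL carrying a GET(query) parameter),
# not the project's actual expressions.
import re

EXAMPLE_URL_REGEX = re.compile(r"https?://[^\s<>]+", re.IGNORECASE)
EXAMPLE_URL_QUERY_REGEX = re.compile(r"https?://[^\s<>]+\?[^\s<>]+=[^\s<>]*", re.IGNORECASE)

# a URL with a query parameter matches the query pattern, one without does not
assert EXAMPLE_URL_QUERY_REGEX.match("http://example.com/item.php?id=1") is not None
assert EXAMPLE_URL_QUERY_REGEX.match("http://example.com/item.php") is None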
def parse_search_results(query, url_to_search, verbose=False, **kwargs):
    """ Parse a webpage from Google for URLs with a GET(query) parameter """
    splitter = "&"
    retval = set()
    query_url = None
    proxy_string, user_agent = kwargs.get("proxy", None), kwargs.get("agent", None)
    if verbose:
        logger.debug(set_color(
            "checking for user-agent and proxy configuration...", level=10
        ))
    user_agent_info = "adjusting user-agent header to {}..."
    if user_agent != DEFAULT_USER_AGENT:
        user_agent_info = user_agent_info.format(user_agent.strip())
    else:
        user_agent_info = user_agent_info.format("default user agent '{}'".format(DEFAULT_USER_AGENT))
    proxy_string_info = "setting proxy to {}..."
    if proxy_string is not None:
        proxy_string_info = proxy_string_info.format(
            ''.join(proxy_string.keys()) + "://" + ''.join(proxy_string.values()))
    else:
        proxy_string_info = "no proxy configuration detected..."
    headers = {
        "Connection": "close",
        "user-agent": user_agent
    }
    logger.info(set_color(
        "attempting to gather query URL..."
    ))
    try:
        query_url = get_urls(query, url_to_search, verbose=verbose, user_agent=user_agent, proxy=proxy_string)
    except Exception as e:
        if "WebDriverException" in str(e):
            logger.exception(set_color(
                "it seems that you exited the browser, please allow the browser "
                "to complete its run so that Zeus can bypass captchas and API "
                "calls", level=50
            ))
        elif "'/usr/lib/firefoxdriver/webdriver.xpi'" in str(e):
            logger.fatal(set_color(
                "firefox was not found in the default location on your system, "
                "check your installation and make sure it is in /usr/lib, if you "
                "find it there, restart your system and try again...", level=50
            ))
        else:
            logger.exception(set_color(
                "{} failed to gather the URL from search engine, caught exception '{}' "
                "exception has been logged to current log file...".format(
                    os.path.basename(__file__), str(e).strip()), level=50)
            )
            request_issue_creation()
        shutdown()
    logger.info(set_color(
        "URL successfully gathered, searching for GET parameters..."
    ))
    logger.info(set_color(proxy_string_info))
    # send the custom headers with the request itself (updating the headers of the
    # response object afterwards has no effect on the outgoing request)
    req = requests.get(query_url, proxies=proxy_string, headers=headers)
    logger.info(set_color(user_agent_info))
    found_urls = URL_REGEX.findall(req.text)
    url_skip_schema = ("maps.google", "play.google", "youtube")
    for urls in list(found_urls):
        for url in list(urls):
            url = unquote(url)
            if not any(u in url for u in url_skip_schema):
                if URL_QUERY_REGEX.match(url) and not any(l in url for l in URL_EXCLUDES):
                    if isinstance(url, unicode):
                        url = str(url).encode("utf-8")
                    if "webcache" in url:
                        logger.info(set_color(
                            "received webcache URL, extracting URL from webcache..."
                        ))
                        webcache_url = url
                        url = extract_webcache_url(webcache_url)
                        if url is None:
                            logger.warning(set_color(
                                "unable to extract url from given webcache URL '{}'...".format(
                                    webcache_url
                                ), level=30
                            ))
                    if verbose:
                        try:
                            logger.debug(set_color(
                                "found '{}'...".format(url.split(splitter)[0]), level=10
                            ))
                        except TypeError:
                            logger.debug(set_color(
                                "found '{}'...".format(str(url).split(splitter)[0]), level=10
                            ))
                        except AttributeError:
                            logger.debug(set_color(
                                "found '{}'...".format(str(url)), level=10
                            ))
                    if url is not None:
                        retval.add(url.split(splitter)[0])
    logger.info(set_color(
        "found a total of {} URLs with a GET parameter...".format(len(retval))
    ))
    if len(retval) != 0:
        write_to_log_file(retval, URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.critical(set_color(
            "did not find any usable URLs with the given query '{}' "
            "using search engine '{}'...".format(query, url_to_search), level=50
        ))
        shutdown()
    return list(retval) if len(retval) != 0 else None
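# --- hypothetical usage sketch, not part of the module ---
# demonstrates how parse_search_results above could be driven; the query string,
# search URL, and proxy value are example placeholders (DEFAULT_USER_AGENT is the
# project's own constant).
def _example_parse_run():
    example_urls = parse_search_results(
        'inurl:"php?id="',                   # dork-style search query (example)
        "https://www.google.com/search?q=",  # search engine URL (assumed format)
        verbose=True,
        agent=DEFAULT_USER_AGENT,
        proxy=None                           # or e.g. {"socks5": "127.0.0.1:9050"}
    )
    if example_urls is not None:
        logger.info(set_color("gathered {} candidate URLs".format(len(example_urls))))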
def search_multiple_pages(query, link_amount, verbose=False, **kwargs):

    def __config_proxy(proxy_string):
        proxy_type_schema = {
            "http": httplib2.socks.PROXY_TYPE_HTTP,
            "socks4": httplib2.socks.PROXY_TYPE_SOCKS4,
            "socks5": httplib2.socks.PROXY_TYPE_SOCKS5
        }
        proxy_type = get_proxy_type(proxy_string)[0]
        proxy_dict = proxy_string_to_dict(proxy_string)
        proxy_config = httplib2.ProxyInfo(
            proxy_type=proxy_type_schema[proxy_type],
            proxy_host="".join(proxy_dict.keys()),
            proxy_port="".join(proxy_dict.values())
        )
        return proxy_config

    proxy, agent = kwargs.get("proxy", None), kwargs.get("agent", None)
    if proxy is not None:
        if verbose:
            logger.debug(set_color(
                "configuring to use proxy '{}'...".format(proxy), level=10
            ))
        __config_proxy(proxy)
    if agent is not None:
        if verbose:
            logger.debug(set_color(
                "setting user-agent to '{}'...".format(agent), level=10
            ))
    logger.warning(set_color(
        "multiple pages will be searched using Google's API client, searches may be blocked after a certain "
        "amount of time...", level=30
    ))
    results, limit, found, index = set(), link_amount, 0, google_api.search(query, user_agent=agent, safe="on")
    try:
        # pull one result at a time until the requested amount is reached
        while limit > 0:
            results.add(next(index))
            limit -= 1
            found += 1
    except Exception as e:
        if "Error 503" in str(e):
            logger.fatal(set_color(
                "Google is blocking the current IP address, dumping already found URLs...",
                level=50
            ))
            # keep whatever has been collected so far and fall through
            pass
    retval = set()
    for url in results:
        if URL_REGEX.match(url) and URL_QUERY_REGEX.match(url):
            if verbose:
                logger.debug(set_color(
                    "found '{}'...".format(url), level=10
                ))
            retval.add(url)
    if len(retval) != 0:
        logger.info(set_color(
            "a total of {} links found out of requested {}...".format(
                len(retval), link_amount
            )
        ))
        write_to_log_file(list(retval), URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.error(set_color(
            "unable to extract URLs from results...", level=40
        ))
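# --- hypothetical usage sketch, not part of the module ---
# search_multiple_pages above walks Google's API client one result at a time;
# the query, link count, and proxy below are example placeholders.
def _example_multi_page_run():
    search_multiple_pages(
        'inurl:"php?id="',   # dork-style search query (example)
        50,                  # number of links to attempt to collect
        verbose=True,
        agent=None,          # fall back to the client's default user agent
        proxy=None           # or e.g. "socks5://127.0.0.1:9050"
    )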
def parse_search_results(query, url_to_search, verbose=False, **kwargs):
    """ Parse a webpage from Google for URLs with a GET(query) parameter """
    possible_leftovers = URLParser(None).possible_leftovers
    splitter = "&"
    retval = set()
    query_url = None
    parse_webcache, pull_all = kwargs.get("parse_webcache", False), kwargs.get("pull_all", False)
    proxy_string, user_agent = kwargs.get("proxy", None), kwargs.get("agent", None)
    forward_for = kwargs.get("forward_for", False)
    tor = kwargs.get("tor", False)
    batch = kwargs.get("batch", False)
    show_success = kwargs.get("show_success", False)
    if verbose:
        logger.debug(set_color("parsing blacklist", level=10))
    parse_blacklist(query, BLACKLIST_FILE_PATH, batch=batch)
    if verbose:
        logger.debug(set_color("checking for user-agent and proxy configuration", level=10))
    if not parse_webcache and "google" in url_to_search:
        logger.warning(set_color(
            "will not parse webcache URLs (to parse webcache pass -W)", level=30
        ))
    if not pull_all:
        logger.warning(set_color(
            "only pulling URLs with GET(query) parameters (to pull all URLs pass -E)", level=30
        ))
    user_agent_info = "adjusting user-agent header to {}"
    if user_agent != DEFAULT_USER_AGENT:
        user_agent_info = user_agent_info.format(user_agent.strip())
    else:
        user_agent_info = user_agent_info.format("default user agent '{}'".format(DEFAULT_USER_AGENT))
    proxy_string_info = "setting proxy to {}"
    if proxy_string is not None:
        proxy_string = proxy_string_to_dict(proxy_string)
        proxy_string_info = proxy_string_info.format(
            ''.join(proxy_string.keys()) + "://" + ''.join(proxy_string.values()))
    elif tor:
        proxy_string = proxy_string_to_dict("socks5://127.0.0.1:9050")
        proxy_string_info = proxy_string_info.format("tor proxy settings")
    else:
        proxy_string_info = "no proxy configuration detected"
    # build the request headers, optionally spoofing X-Forwarded-For with random IPs
    if forward_for:
        ip_to_use = (create_random_ip(), create_random_ip(), create_random_ip())
        if verbose:
            logger.debug(set_color(
                "random IP addresses generated for headers '{}'".format(ip_to_use), level=10
            ))
        headers = {
            HTTP_HEADER.CONNECTION: "close",
            HTTP_HEADER.USER_AGENT: user_agent,
            HTTP_HEADER.X_FORWARDED_FOR: "{}, {}, {}".format(ip_to_use[0], ip_to_use[1], ip_to_use[2])
        }
    else:
        headers = {
            HTTP_HEADER.CONNECTION: "close",
            HTTP_HEADER.USER_AGENT: user_agent
        }
    logger.info(set_color("attempting to gather query URL"))
    try:
        query_url = get_urls(
            query, url_to_search, verbose=verbose, user_agent=user_agent,
            proxy=proxy_string, tor=tor, batch=batch, xforward=forward_for
        )
    except Exception as e:
        if "'/usr/lib/firefoxdriver/webdriver.xpi'" in str(e):
            logger.fatal(set_color(
                "firefox was not found in the default location on your system, "
                "check your installation and make sure it is in /usr/lib, if you "
                "find it there, restart your system and try again", level=50
            ))
        elif "connection refused" in str(e).lower():
            logger.fatal(set_color(
                "there are too many open sessions of firefox and selenium cannot "
                "create a new one", level=50
            ))
            run_fix(
                "would you like to attempt to auto clean the open sessions",
                "sudo sh {}".format(CLEANUP_TOOL_PATH),
                "kill off the open sessions of firefox and re-run Zeus",
                exit_process=True
            )
        elif "Program install error!" in str(e):
            logger.error(set_color(
                "seems the program is having some trouble installing, would you like "
                "to try and automatically fix this issue", level=40
            ))
            run_fix(
                "would you like to attempt to fix this issue automatically",
                "sudo sh {}".format(FIX_PROGRAM_INSTALL_PATH),
                "you can manually try and re-install Xvfb to fix the problem",
                exit_process=True
            )
        elif "Message: Reached error page:" in str(e):
            logger.fatal(set_color(
                "geckodriver has hit an error that usually means it needs to be reinstalled",
                level=50
            ))
            question = prompt(
                "would you like to attempt a reinstallation of the geckodriver", opts="yN"
            )
            if question.lower().startswith("y"):
                logger.warning(set_color(
                    "rewriting all executed information, path information, and removing geckodriver",
                    level=30
                ))
                rewrite_all_paths()
                logger.info(set_color(
                    "all paths rewritten, you will be forced to re-install everything next run of Zeus"
                ))
            else:
                logger.fatal(set_color(
                    "you will need to remove the geckodriver from /usr/bin and reinstall it",
                    level=50
                ))
                shutdown()
        elif "Unable to find a matching set of capabilities" in str(e):
            logger.fatal(set_color(
                "it appears that firefox, selenium, and geckodriver are not playing nice with one another",
                level=50
            ))
            run_fix(
                "would you like to attempt to resolve this issue automatically",
                "sudo sh {}".format(REINSTALL_TOOL),
                ("you will need to reinstall firefox to a later version, update selenium, and reinstall the "
                 "geckodriver to continue using Zeus"),
                exit_process=True
            )
        else:
            logger.exception(set_color(
                "{} failed to gather the URL from search engine, caught exception '{}' "
                "exception has been logged to current log file".format(
                    os.path.basename(__file__), str(e).strip()), level=50
            ))
            request_issue_creation()
        shutdown()
    logger.info(set_color("URL successfully gathered, searching for GET parameters"))
    logger.info(set_color(proxy_string_info))
    try:
        # send the custom headers with the request itself (they were previously passed
        # as query parameters, which has no effect on the outgoing headers)
        req = requests.get(query_url, proxies=proxy_string, headers=headers)
    except ConnectionError:
        logger.warning(set_color(
            "target machine refused connection, delaying and trying again", level=30
        ))
        time.sleep(3)
        req = requests.get(query_url, proxies=proxy_string, headers=headers)
    logger.info(set_color(user_agent_info))
    found_urls = URL_REGEX.findall(req.text)
    # first pass: collect candidate URLs, stripping query leftovers after the splitter
    for urls in list(found_urls):
        for url in list(urls):
            url = unquote(url)
            if not any(u in url for u in URL_EXCLUDES):
                if not url == "http://" and not url == "https://":
                    if URL_REGEX.match(url):
                        if isinstance(url, unicode):
                            url = str(url).encode("utf-8")
                        if pull_all:
                            retval.add(url.split(splitter)[0])
                        else:
                            if URL_QUERY_REGEX.match(url.split(splitter)[0]):
                                retval.add(url.split(splitter)[0])
                        if verbose:
                            try:
                                logger.debug(set_color(
                                    "found '{}'".format(url.split(splitter)[0]), level=10
                                ))
                            except TypeError:
                                logger.debug(set_color(
                                    "found '{}'".format(str(url).split(splitter)[0]), level=10
                                ))
                            except AttributeError:
                                logger.debug(set_color(
                                    "found '{}'".format(str(url)), level=10
                                ))
                        if url is not None:
                            retval.add(url.split(splitter)[0])
    # second pass: strip URL leftovers and, if requested, extract webcache targets
    true_retval = set()
    for url in list(retval):
        if any(l in url for l in possible_leftovers):
            url = URLParser(url).strip_url_leftovers()
        if parse_webcache:
            if "webcache" in url:
                logger.info(set_color("found a webcache URL, extracting"))
                url = URLParser(url).extract_webcache_url()
                if verbose:
                    logger.debug(set_color("found '{}'".format(url), level=15))
                true_retval.add(url)
            else:
                true_retval.add(url)
        else:
            true_retval.add(url)
    if len(true_retval) != 0:
        file_path = write_to_log_file(true_retval, URL_LOG_PATH, URL_FILENAME)
        if show_success:
            amount_of_urls = len(open(file_path).readlines())
            success_rate = calculate_success(amount_of_urls)
            logger.info(set_color(
                "provided query has a {} success rate".format(success_rate)
            ))
    else:
        logger.fatal(set_color(
            "did not find any URLs with given query '{}', writing query to blacklist".format(query),
            level=50
        ))
        write_to_log_file(query, BLACKLIST_FILE_PATH, BLACKLIST_FILENAME, blacklist=True)
        shutdown()
    logger.info(set_color(
        "found a total of {} URLs with given query '{}'".format(len(true_retval), query)
    ))
    return list(true_retval) if len(true_retval) != 0 else None
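# --- hypothetical usage sketch, not part of the module ---
# shows the extended keyword options the newer parse_search_results reads
# (parse_webcache, pull_all, tor, batch, forward_for, show_success); every value
# below is an example placeholder, not a recommended default.
def _example_extended_parse_run():
    gathered = parse_search_results(
        'inurl:"php?id="',                   # dork-style search query (example)
        "https://www.google.com/search?q=",  # search engine URL (assumed format)
        verbose=True,
        agent=DEFAULT_USER_AGENT,
        proxy=None,                          # e.g. "socks5://127.0.0.1:9050"
        parse_webcache=True,                 # also extract URLs from webcache links (-W)
        pull_all=False,                      # keep only URLs with GET(query) parameters
        tor=False,
        batch=True,                          # answer prompts with their defaults
        forward_for=True,                    # spoof X-Forwarded-For with random IPs
        show_success=True
    )
    return gathered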