Code example #1
File: probe.py  Project: zdoop/htcap
    def __init__(self, data, parent):
        self.status = "ok"
        self.requests = []
        self.cookies = []
        self.redirect = None
        # if True the probe returned no error BUT the json is not closed properly
        self.partialcontent = False
        self.html = None
        self.user_output = []
        self.page_hash = 0

        status = data.pop()

        if status['status'] == "error":
            self.status = "error"
            self.errcode = status['code']

        if "partialcontent" in status:
            self.partialcontent = status['partialcontent']

        # grab cookies before creating requests
        for key, val in data:
            if key == "cookies":
                for cookie in val:
                    self.cookies.append(Cookie(cookie, parent.url))

        if "redirect" in status:
            self.redirect = status['redirect']
            r = Request(REQTYPE_REDIRECT,
                        "GET",
                        self.redirect,
                        parent=parent,
                        set_cookie=self.cookies,
                        parent_db_id=parent.db_id)
            self.requests.append(r)

        for key, val in data:
            if key == "request":
                trigger = val['trigger'] if 'trigger' in val else None
                #try:
                r = Request(val['type'],
                            val['method'],
                            val['url'],
                            parent=parent,
                            set_cookie=self.cookies,
                            data=val['data'],
                            trigger=trigger,
                            parent_db_id=parent.db_id)
                self.requests.append(r)
                #except Exception as e:
                #	pass
            elif key == "html":
                self.html = val
            elif key == "page_hash":
                page_hash = TextHash(val).hash
                self.page_hash = page_hash if page_hash else 0
            elif key == "user":
                self.user_output.append(val)
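
A minimal sketch of the input this constructor appears to expect, inferred purely from the parsing logic above (field names and values are illustrative, not taken from the htcap documentation): the probe output is a list of [key, value] pairs with a trailing status object, which data.pop() removes first.

# Hypothetical probe output for the list-based parser above.
probe_output = [
    ["cookies", [{"name": "sid", "value": "abc", "domain": "example.com"}]],
    ["request", {"type": "link", "method": "GET",
                 "url": "http://example.com/page", "data": None,
                 "trigger": "click"}],
    ["html", "<html>...</html>"],
    ["user", "custom probe output"],
    {"status": "ok", "partialcontent": False},  # popped first as `status`
]
# probe = Probe(probe_output, parent_request)  # assuming the enclosing class is Probe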
Code example #2
    def __init__(self, data, parent):
        self.status = "ok"
        self.requests = []
        self.cookies = []
        self.redirect = []
        self.errmessage = ""
        # if True the probe returned no error BUT the json is not closed properly
        self.partialcontent = False
        self.html = None
        self.user_output = []
        self.page_hash = 0

        status = data["status"]

        if status == "error":
            self.status = "error"
            self.errmessage = data["errors"]

        # grab cookies before creating requests
        for cookie in data["cookies"]:
            self.cookies.append(Cookie(cookie, parent.url))

        for redirect in data['redirect']:
            r = Request(REQTYPE_REDIRECT,
                        "GET",
                        redirect,
                        parent=parent,
                        set_cookie=self.cookies,
                        parent_db_id=parent.db_id)
            self.redirect.append(r)

        requests = data["requests"]
        for request in requests:
            request = json.loads(request)
            r = Request(request['type'],
                        request['method'],
                        request['url'],
                        parent=parent,
                        parent_db_id=parent.db_id,
                        set_cookie=self.cookies,
                        data=request['data'],
                        trigger=request.get("trigger", None),
                        extra_headers=request.get("extra_headers", None))
            self.requests.append(r)
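
This variant reads a dict instead of a list of pairs, and each entry of "requests" is itself a JSON-encoded string (hence the json.loads() per item). A hedged sketch of that shape, again inferred from the code rather than from htcap's documentation:

probe_output = {
    "status": "ok",
    "errors": "",
    "cookies": [{"name": "sid", "value": "abc"}],
    "redirect": [],
    "requests": [
        '{"type": "link", "method": "GET", "url": "http://example.com/page", "data": null}'
    ],
}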
Code example #3
File: http_get.py  Project: slooppe/htcap
    def send_request(
            self,
            method=None,
            url=None,
            data=None,
            cookies=None,
            ignore_errors=False,
            follow_redirect=False):  # Shared.options['process_timeout']

        if not method:
            method = self.request.method

        if not url:
            url = self.request.url

        if method in ("POST", "PUT"):
            if not data:
                data = self.request.data if self.request.data else ""

        if not cookies:
            cookies = []

        jar_request = cookielib.LWPCookieJar()

        ret = {
            "code": None,
            "url": None,
            "headers": None,
            "body": None,
            "time": None
        }

        while True:
            try:
                existing_cookies = []
                for cookie in self.request.cookies:
                    clc = cookie.get_cookielib_cookie()
                    for c in cookies:
                        if c['name'] == cookie.name:
                            clc.value = c['value']
                            existing_cookies.append(c)
                    jar_request.set_cookie(clc)

                for cookie in [
                        x for x in cookies if x not in existing_cookies
                ]:
                    c = Cookie(cookie)  # check what to do with cookie.setter
                    jar_request.set_cookie(c.get_cookielib_cookie())

                opener = self.urllib2_opener(self.request, None,
                                             follow_redirect)
                req = urllib2.Request(url=url,
                                      data=data.encode() if data else None)
                req.get_method = lambda: method
                jar_request.add_cookie_header(req)
                # headers = self.request.extra_headers
                # if self.extra_headers:
                # 	for h in self.extra_headers:
                # 		headers[h] = self.extra_headers[h]
                # for hn in headers:
                # 	req.add_header(hn, headers[hn])

                if data and not 'Content-type' in req.headers:
                    req.add_header("Content-type", detect_content_type(data))
                now = time.time()
                try:
                    res = opener.open(req, None, self.timeout)
                except urllib2.HTTPError as e:
                    if not ignore_errors:
                        raise
                    res = e
                opener.close()

                ret['code'] = res.getcode()
                ret['url'] = res.geturl()
                #ret['headers'] = [x.strip() for x in res.info().headers]
                ret['headers'] = ["%s: %s" % x for x in res.info().items()]
                ret['body'] = res.read()
                ret['time'] = time.time() - now

                break

            except Exception as e:
                self.retries -= 1
                if self.retries == 0: raise
                time.sleep(self.retries_interval)

        return ret
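
A hedged usage sketch: assuming getter is the already-constructed object whose send_request() is shown above (it wraps a Request in self.request), the cookies argument is a list of plain dicts with at least "name" and "value" keys, and the returned dict carries code, url, headers, body and time.

ret = getter.send_request(method="GET",
                          url="http://example.com/page",
                          cookies=[{"name": "session", "value": "abc123"}],
                          ignore_errors=True)  # HTTP error responses are returned, not raised
if ret["code"] == 200:
    body = ret["body"]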
Code example #4
File: http_get.py  Project: slooppe/htcap
    def get_requests(self):  # Shared.options['process_timeout']

        if self.request.method == "POST":
            raise Exception("POST method with urllib is not supported yet")

        #parent = self.request.parent.url if self.request.parent else ""

        self.retries_interval = 0.5

        jar_response = cookielib.LWPCookieJar()
        jar_request = cookielib.LWPCookieJar()

        html = ""
        set_cookie = []

        requests = []

        while True:
            try:
                #Shared.th_lock.acquire()

                for cookie in self.request.cookies:
                    jar_request.set_cookie(cookie.get_cookielib_cookie())

                #Shared.th_lock.release()

                opener = self.urllib2_opener(self.request, jar_response)
                req = urllib2.Request(url=self.request.url)
                jar_request.add_cookie_header(req)

                res = opener.open(req, None, self.timeout)

                for cookie in jar_response:
                    set_cookie.append(Cookie(cookie.__dict__,
                                             self.request.url))

                ctype = res.info()['Content-Type']  # @TODO !! WRONG!! (check if wrong...not sure)
                if ctype is not None:
                    if ctype.lower().split(";")[0] != "text/html":
                        opener.close()
                        raise NotHtmlException(ERROR_CONTENTTYPE)

                html = res.read()
                opener.close()

                urls = []  # ensure `urls` is defined even when the response body is empty
                if html:
                    html = decode_bytes(html)
                    finder = UrlFinder(html)
                    try:
                        urls = finder.get_urls()
                    except Exception as e:
                        raise

                for url in urls:
                    # @TODO handle FORMS
                    requests.append(
                        Request(REQTYPE_LINK,
                                "GET",
                                url,
                                parent=self.request,
                                set_cookie=set_cookie,
                                parent_db_id=self.request.db_id))

                break

            except RedirectException as e:
                set_cookie = []
                for cookie in jar_response:
                    set_cookie.append(Cookie(cookie.__dict__,
                                             self.request.url))

                r = Request(REQTYPE_REDIRECT,
                            "GET",
                            str(e),
                            parent=self.request,
                            set_cookie=set_cookie,
                            parent_db_id=self.request.db_id)
                requests.append(r)
                break
            except NotHtmlException:
                raise
            except Exception as e:
                self.retries -= 1
                if self.retries == 0: raise
                time.sleep(self.retries_interval)

        return requests
Code example #5
	def cookies_from_json(self, cookies):
		#return [Cookie(c, self.parent.url) for c in json.loads(cookies)]

		# create Cookie without "setter" because cookies loaded from db are always valid (no domain restrictions)
		# see Cookie.py 
		return [Cookie(c) for c in json.loads(cookies)]
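
A short usage sketch, assuming self is an htcap Request loaded from the database: the argument is a JSON string holding a list of cookie dicts.

json_cookies = '[{"name": "sid", "value": "abc"}, {"name": "lang", "value": "en"}]'
# cookies = request.cookies_from_json(json_cookies)  # -> list of Cookie objects, no setter/domain checks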
Code example #6
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print("Dependences errors: ")
            for err in deps_errors:
                print("  %s" % err)
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:L:Mg:')
        except getopt.GetoptError as err:
            print(str(err))
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print("error reading cookie file")
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print(e)
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com to *.\.domain\.com
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    try:
                        re.match(eu, "")
                    except:
                        print("* ERROR: regex failed: %s" % eu)
                        sys.exit(1)
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print("* ERROR: wrong scope set '%s'" % v)
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print("* ERROR: wrong mode set '%s'" % v)
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-M":
                Shared.options['simulate_real_events'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv
            elif o == "-L":
                try:
                    with open(v) as cf:
                        Shared.options['login_sequence'] = json.loads(
                            cf.read())
                        Shared.options['login_sequence'][
                            "__file__"] = os.path.abspath(v)
                except ValueError as e:
                    print("* ERROR: decoding login sequence")
                    sys.exit(1)
                except Exception as e:
                    print("* ERROR: login sequence file not found")
                    sys.exit(1)
            elif o == "-g":
                if not Shared.options['local_storage']:
                    Shared.options['local_storage'] = {}
                (hn, hv) = v.split("=", 1)
                ktks = hn.split(":", 1)
                if len(ktks) != 2 or ktks[0] not in ("L", "S"):
                    print(
                        "Error: the -g option must be in the form '[L|S]:key=value', use 'L' to set locaStorage and 'S' to set sessionStorage"
                    )
                    sys.exit(1)
                Shared.options['local_storage'][ktks[1]] = {
                    "type": ktks[0],
                    "value": hv
                }

        probe_cmd = get_node_cmd()
        if not probe_cmd:  # maybe useless
            print("Error: unable to find node executable")
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print("* Warinig: option -d is valid only if scope is %s" %
                  CRAWLSCOPE_DOMAIN)

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print("error decoding cookie string")
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_options.extend([
                "-y",
                "%s:%s:%s" % (Shared.options['proxy']['proto'],
                              Shared.options['proxy']['host'],
                              Shared.options['proxy']['port'])
            ])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        if Shared.options['local_storage']:
            probe_options.extend(
                ["-g", json.dumps(Shared.options['local_storage'])])

        if not Shared.options['simulate_real_events']:
            probe_options.append("-M")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        if Shared.options['login_sequence'] and Shared.options[
                'login_sequence']['type'] == LOGSEQTYPE_SHARED:
            login_req = Request(REQTYPE_LINK,
                                "GET",
                                Shared.options['login_sequence']['url'],
                                set_cookie=Shared.start_cookies,
                                http_auth=http_auth,
                                referer=start_referer,
                                extra_headers=Shared.options['extra_headers'])
            stdoutw("Logging in . . . ")
            try:
                pe = ProbeExecutor(
                    login_req,
                    Shared.probe_cmd + ["-z"],
                    login_sequence=Shared.options['login_sequence'])
                probe = pe.execute()
                if not probe:
                    print("\n* ERROR: login sequence failed to execute probe")
                    sys.exit(1)
                if probe.status == "ok":
                    for c in probe.cookies:
                        if not Shared.options['login_sequence'][
                                'cookies'] or c.name in Shared.options[
                                    'login_sequence']['cookies']:
                            Shared.start_cookies.append(c)
                else:
                    print("\n* ERROR: login sequence failed:\n   %s" %
                          probe.errmessage)
                    sys.exit(1)
            except KeyboardInterrupt:
                pe.terminate()
                print("\nAborted")
                sys.exit(0)
            print("done")

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer,
                            extra_headers=Shared.options['extra_headers'])

        if not hasattr(ssl, "SSLContext"):
            print(
                "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
            )

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print(str(e))
            sys.exit(1)

        database.save_crawl_info(
            htcap_version=get_program_infos()['version'],
            target=Shared.starturl,
            start_date=self.crawl_start_time,
            commandline=cmd_to_str(argv),
            user_agent=Shared.options['useragent'],
            proxy=json.dumps(Shared.options['proxy']),
            extra_headers=json.dumps(Shared.options['extra_headers']),
            cookies=json.dumps([x.get_dict() for x in Shared.start_cookies]))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print("done")
        print(
            "Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)"
            % (self.db_file, num_threads))

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print("Crawl finished, %d pages analyzed in %d minutes" %
              (Shared.requests_index,
               (self.crawl_end_time - self.crawl_start_time) // 60))

        database.save_crawl_info(end_date=self.crawl_end_time)
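
The -d handler above turns a shell-style wildcard into a regular expression via re.escape(ad).replace(...). A standalone sketch of that conversion (behaviour inferred from the code, not from htcap's documentation):

import re

def domain_pattern(allowed_domain):
    # "*.example.com" matches "example.com" and any of its subdomains
    return re.escape(allowed_domain).replace("\\*\\.", "((.*\\.)|)")

pattern = domain_pattern("*.example.com")
print(bool(re.match(pattern, "example.com")))      # True
print(bool(re.match(pattern, "api.example.com")))  # True
print(bool(re.match(pattern, "example.org")))      # False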
Code example #7
    def main(self, args, opts):
        passw = None
        format = None
        out_cookies = True
        out_logouts = True
        for o, v in opts:
            if o == "-h":
                print self.usage()
                sys.exit(0)
            elif o == "-p":
                passw = v
            elif o == "-c":
                out_cookies = False
            elif o == "-l":
                out_logouts = False
            elif o in ("-H", "-J", "-A"):
                format = o

        if not passw:
            print "The password is hidden here BUT it will be passed to phantomjs via commandline ..."
            try:
                passw = getpass.getpass()
            except KeyboardInterrupt:
                print "\nAbort..."
                sys.exit(0)

        jspath = "%s%s%s%s" % (getrealdir(__file__), "login", os.sep,
                               "login.js")
        cmd = get_phantomjs_cmd() + [jspath, args[0], args[1], passw]
        if len(args) > 2: cmd.append(args[2])
        #print cmd_to_str(cmd)
        exe = CommandExecutor(cmd, True)
        out, err = exe.execute(20)
        if err:
            print "Unable to login"
            sys.exit(1)

        try:
            ret = json.loads(out)
        except ValueError as e:
            print e
            sys.exit(1)
        allcookies, logouts = ret
        cookies = []
        if out_cookies:
            for c in reversed(allcookies):
                cookie = Cookie(c)
                if not cookie in cookies: cookies.append(cookie)
        if not out_logouts:
            logouts = []

        if not format:
            print "Cookies:"
            for c in cookies:
                print " %s=%s" % (c.name, c.value)
            print "Logout urls:"
            for u in logouts:
                print " %s" % u
        elif format == "-A":
            for c in cookies:
                print cmd_to_str([c.name, c.value])
            for u in logouts:
                print cmd_to_str([u])
        elif format == "-H":
            args = []
            if len(cookies) > 0:
                args = [
                    "-c",
                    ";".join(["%s=%s" % (c.name, c.value) for c in cookies])
                ]
            if len(logouts) > 0:
                args.extend(["-x", ",".join(logouts)])
            if len(args) > 0:
                print cmd_to_str(args)
        elif format == "-J":
            cd = []
            for c in cookies:
                cd.append(c.get_dict())
            if out_cookies:
                print json.dumps(cd)
Code example #8
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print "Dependences errors: "
            for err in deps_errors:
                print "  %s" % err
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OveLlE:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print e
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com to *.\.domain\.com
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-L":
                Shared.options['use_legacy_browser'] = True
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv

        probe_cmd = get_phantomjs_cmd() if Shared.options['use_legacy_browser'] else get_node_cmd()
        if not probe_cmd:  # maybe useless
            print "Error: unable to find node (or phantomjs) executable"
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['use_legacy_browser']:
            if Shared.options['proxy']:
                probe_cmd.append("--proxy-type=%s" %
                                 Shared.options['proxy']['proto'])
                probe_cmd.append("--proxy=%s:%s" %
                                 (Shared.options['proxy']['host'],
                                  Shared.options['proxy']['port']))
            probe_cmd.append(os.path.join(self.base_dir, 'probe',
                                          'analyze.js'))
        else:
            if Shared.options['proxy']:
                probe_options.extend([
                    "-y",
                    "%s:%s:%s" % (Shared.options['proxy']['proto'],
                                  Shared.options['proxy']['host'],
                                  Shared.options['proxy']['port'])
                ])
            if not Shared.options['headless_chrome']:
                probe_options.append("-l")
            probe_cmd.append(
                os.path.join(self.base_dir, 'probe', 'chrome-probe',
                             'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'],
                                 proxy=json.dumps(Shared.options['proxy']),
                                 extra_headers=json.dumps(
                                     Shared.options['extra_headers']),
                                 cookies=json.dumps(start_cookies))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            self.db_file, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
Code example #9
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        probe_cmd = get_phantomjs_cmd()
        if not probe_cmd:
            print "Error: unable to find phantomjs executable"
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        display_progress = True
        verbose = False
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False
        user_script = None

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                if v == "tor": v = "socks5:127.0.0.1:9150"
                proxy = v.split(":")
                if proxy[0] not in ("http", "socks5"):
                    print "only http and socks5 proxies are supported"
                    sys.exit(1)
                Shared.options['proxy'] = {
                    "proto": proxy[0],
                    "host": proxy[1],
                    "port": proxy[2]
                }
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com to *.\.domain\.com
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                verbose = True
            elif o == "-u":
                if os.path.isfile(v):
                    user_script = os.path.abspath(v)
                else:
                    print "error: unable to open USER_SCRIPT"
                    sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = self.parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" %
                             Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" %
                             (Shared.options['proxy']['host'],
                              Shared.options['proxy']['port']))

        probe_cmd.append(self.base_dir + 'probe/analyze.js')

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        if user_script:
            probe_options.extend(("-u", user_script))

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        if user_script and initial_checks:
            self.check_user_script_syntax(probe_cmd, user_script)

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        fname = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(fname, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'])

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            fname, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database, display_progress,
                       verbose)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
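
Both -p handlers above expect the proxy in proto:host:port form, with "tor" accepted as a shortcut for socks5:127.0.0.1:9150 in this older version. A small standalone sketch of that parsing, under those assumptions:

def parse_proxy(value):
    if value == "tor":  # shortcut accepted by the handler above
        value = "socks5:127.0.0.1:9150"
    proto, host, port = value.split(":")
    if proto not in ("http", "socks5"):
        raise ValueError("only http and socks5 proxies are supported")
    return {"proto": proto, "host": host, "port": port}

print(parse_proxy("socks5:127.0.0.1:9050"))  # {'proto': 'socks5', 'host': '127.0.0.1', 'port': '9050'}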
Code example #10
    def send_request(self,
                     method=None,
                     url=None,
                     data=None,
                     cookies=None,
                     ignore_errors=False):  # Shared.options['process_timeout']

        if not method:
            method = self.request.method

        if not url:
            url = self.request.url

        if method == "POST":
            if not data:
                data = self.request.data if self.request.data else ""

        if not cookies:
            cookies = []

        jar_request = cookielib.LWPCookieJar()

        ret = {
            "code": None,
            "url": None,
            "headers": None,
            "body": None,
            "time": None
        }

        while True:
            try:
                existing_cookies = []
                for cookie in self.request.cookies:
                    clc = cookie.get_cookielib_cookie()
                    for c in cookies:
                        if c['name'] == cookie.name:
                            clc.value = c['value']
                            existing_cookies.append(c)
                    jar_request.set_cookie(clc)

                for cookie in [
                        x for x in cookies if x not in existing_cookies
                ]:
                    c = Cookie(cookie)  # check what to do with cookie.setter
                    jar_request.set_cookie(c.get_cookielib_cookie())

                opener = self.urllib2_opener(self.request, None, True)
                req = urllib2.Request(url=url, data=data)
                jar_request.add_cookie_header(req)
                if self.extra_headers:
                    for hn in self.extra_headers:
                        req.add_header(hn, self.extra_headers[hn])
                now = time.time()
                try:
                    res = opener.open(req, None, self.timeout)
                except urllib2.HTTPError as e:
                    if not ignore_errors:
                        raise
                    res = e
                opener.close()

                ret['code'] = res.getcode()
                ret['url'] = res.geturl()
                ret['headers'] = [x.strip() for x in res.info().headers]
                ret['body'] = res.read()
                ret['time'] = time.time() - now

                break

            except Exception as e:
                self.retries -= 1
                if self.retries == 0: raise
                time.sleep(self.retries_interval)

        return ret
Code example #11
    def run(self):

        # get database
        try:
            database = self._get_database(self._outfile_name,
                                          self._output_mode)

            crawl_id = database.save_crawl_info(
                htcap_version=get_program_infos()['version'],
                target=Shared.start_url,
                start_date=self.crawl_start_date,
                commandline=cmd_to_str(self.arg),
                user_agent=Shared.options['user_agent'],
                start_cookies=Shared.start_cookies)

            # if the current crawl is not the first one
            if crawl_id > 1:

                # retrieving options from the last crawl
                random_seed, cookies = database.retrieve_crawl_info(crawl_id -
                                                                    1)

                # if the db had a seed and none were provided before
                if random_seed and not Shared.options.get("random_seed"):
                    Shared.options["random_seed"] = random_seed

                # if no cookie was provided and some exist from the last crawl
                if len(Shared.start_cookies
                       ) <= 0 and cookies != "[]" and cookies is not None:
                    for cookie_string in self._parse_cookie_string(cookies):
                        Shared.start_cookies.append(Cookie(cookie_string))

            # if no seed has been set yet
            if not Shared.options.get("random_seed"):
                Shared.options["random_seed"] = self._generate_random_string(
                    20)

        except Exception as e:
            print(str(e))
            sys.exit(1)

        # set probe arguments
        self._set_probe()

        Shared.probe_cmd = self._probe["cmd"] + self._probe["options"]

        start_requests = []

        # create the start request object from provided arguments
        start_request_from_args = Request(REQTYPE_LINK,
                                          "GET",
                                          Shared.start_url,
                                          set_cookie=Shared.start_cookies,
                                          http_auth=self._http_auth,
                                          referer=self._start_referer)

        def _is_not_in_past_requests(request):
            """
            check if the given request is present in Shared.requests or start_requests
            """
            is_in_request = True
            for r in Shared.requests + start_requests:
                if r == request:
                    is_in_request = False
            return is_in_request

        # check starting url
        if self._initial_checks:
            try:
                self._check_request(start_request_from_args)
                stdoutw(". ")
            except KeyboardInterrupt:
                print("\nAborted")
                sys.exit(0)

        if self._output_mode in (CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
            try:
                # make the start url given in arguments crawlable again
                database.connect()
                database.save_request(start_request_from_args)
                database.make_request_crawlable(start_request_from_args)
                database.commit()
                database.close()

                # feeding the "done" request list from the db
                Shared.requests.extend(database.get_crawled_request())
                Shared.requests_index = len(Shared.requests)

                # if resume, add requests from db
                if self._output_mode == CRAWLOUTPUT_RESUME:
                    start_requests.extend(database.get_not_crawled_request())

                # if request from args is neither in past or future requests
                if _is_not_in_past_requests(start_request_from_args):
                    start_requests.append(start_request_from_args)
            except Exception as e:
                print(str(e))
                sys.exit(1)
        else:
            start_requests.append(start_request_from_args)

        # retrieving robots.txt content
        if self._get_robots_txt:
            try:
                start_requests.extend(
                    filter(
                        _is_not_in_past_requests,
                        self._get_requests_from_robots(
                            start_request_from_args)))
            except KeyboardInterrupt:
                print("\nAborted")
                sys.exit(0)

        # save starting request to db
        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print(
            "\nDone: {} starting url(s) and {} url(s) already crawled".format(
                len(start_requests), len(Shared.requests)))

        # starting crawling threads
        print("Database %s initialized, crawl starting with %d threads" %
              (database, self._num_threads))

        for n in range(0, self._num_threads):
            thread = CrawlerThread()
            self._threads.append(thread)
            thread.start()

        # running crawl loop
        self._main_loop(self._threads, start_requests, database,
                        self._display_progress, self._verbose)

        self._kill_threads(self._threads)

        self.crawl_end_date = int(time.time())

        print("Crawl finished, %d pages analyzed in %d minutes" %
              (Shared.requests_index,
               (self.crawl_end_date - self.crawl_start_date) / 60))

        # update end date in db
        database.update_crawl_info(crawl_id, self.crawl_end_date,
                                   Shared.options["random_seed"],
                                   Shared.end_cookies)
Code example #12
    def _setup_shared(self):
        """
        instantiate crawler, probe and start the crawling loop

        :param argv:
        """
        Shared.options = self._defaults  # initialize shared options

        # initialize threads conditions
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        # validate probe presence
        if not self._probe["cmd"]:
            print("Error: unable to find probe")
            sys.exit(1)

        # retrieving user arguments
        try:
            opts, args = getopt.getopt(
                self.arg, 'ho:qvm:s:D:P:Fd:c:C:r:x:p:n:A:U:t:SGNR:IOKe:')
        except getopt.GetoptError as err:
            print(str(err))
            self._usage()
            sys.exit(1)

        if len(args) < 2:  # if no start url and file name
            self._usage()
            print('* Error: missing url and/or outfile')
            sys.exit(1)

        for o, v in opts:
            if o == '-h':  # help
                self._usage()
                sys.exit(0)
            elif o == '-c':  # cookie string
                self._cookie_string = v
            elif o == '-C':  # cookie file
                try:
                    with open(v) as cf:
                        self._cookie_string = cf.read()
                except Exception as e:
                    print("* Error reading cookie file: {}".format(str(e)))
                    sys.exit(1)
            elif o == '-r':  # start referrer
                self._start_referer = v
            elif o == '-n':  # number of threads
                self._num_threads = int(v)
            elif o == '-t':  # time out
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':  # quiet
                self._display_progress = False
            elif o == '-A':  # authentication
                self._http_auth = v
            elif o == '-p':  # proxy
                proxy = v.split(":")
                if proxy[0] not in ("http", "socks5"):
                    print(
                        "* Error: only http and socks5 proxies are supported")
                    sys.exit(1)
                Shared.options['proxy'] = {
                    "proto": proxy[0],
                    "host": proxy[1],
                    "port": proxy[2]
                }
            elif o == '-d':  # allowed domains
                for ad in v.split(","):
                    # convert *.domain.com to *.\.domain\.com
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':  # excluded urls
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-o":  # output file mode
                if v not in (CRAWLOUTPUT_OVERWRITE, CRAWLOUTPUT_RENAME,
                             CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
                    self._usage()
                    print("* Error: wrong output mode set '%s'\n" % v)
                    sys.exit(1)
                self._output_mode = v
            elif o == "-R":  # redirects limit
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":  # user agent
                Shared.options['user_agent'] = v
            elif o == "-s":  # crawl scope
                if v not in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self._usage()
                    print("* ERROR: wrong scope set '%s'" % v)
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":  # crawl mode
                if v not in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self._usage()
                    print("* ERROR: wrong mode set '%s'" % v)
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":  # skip initial checks
                self._initial_checks = False
            elif o == "-I":  # ignore robots.txt
                self._get_robots_txt = False
            elif o == "-D":  # crawling depth
                Shared.options['max_depth'] = int(v)
            elif o == "-P":  # crawling depth for forms
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":  # do not override javascript timeout
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":  # do not crawl forms
                Shared.options['crawl_forms'] = False
            elif o == "-v":  # verbose
                self._verbose = True
            elif o == "-e":  # seed for random value
                Shared.options["random_seed"] = v

        # warn about -d option in domain scope mode
        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print("* Warning: option -d is valid only if scope is %s" %
                  CRAWLSCOPE_DOMAIN)

        # initialize cookies
        if self._cookie_string:
            try:

                start_cookies = self._parse_cookie_string(self._cookie_string)
                for cookie in start_cookies:
                    Shared.start_cookies.append(
                        Cookie(cookie, Shared.start_url))

            except Exception as e:
                print("error decoding cookie string: {}".format(str(e)))
                sys.exit(1)

        # retrieve start url and output file arguments
        Shared.start_url = normalize_url(args[0])
        self._outfile_name = args[1]

        # add start url domain to allowed domains
        purl = urlsplit(Shared.start_url)
        Shared.allowed_domains.add(purl.hostname)

        # warn about ssl context in python 2
        if not hasattr(ssl, "SSLContext"):
            print(
                "* WARNING: SSLContext is not supported with this version of python,"
                " consider to upgrade to >= 2.7.9 in case of SSL errors")