Example #1
    def test_crawl(self):
        """
        :return:
        """
        url1 = "http://localhost:8081"
        q = Queue()
        q.put(url1)
        thread1 = CrawlerThread(q)
        thread1.crawl((url1, 0))
        self.assertEqual(CrawlerThreadPool.total_links, 5)

        # depth-exceeded test: crawling at depth 101 must not add new links
        url2 = 'localhost:8081/mirror/page1.html'
        thread2 = CrawlerThread(q)
        thread2.crawl((url2, 101))
        self.assertEqual(CrawlerThreadPool.total_links, 5)

        # interval test: once the per-interval link budget is exhausted,
        # crawl() should wait for the configured crawl interval
        url3 = 'http://www.baidu.com?query=10000'
        thread3 = CrawlerThread(q)
        CrawlerThreadPool.interval_links_cnt = \
            ConfReader.instance().get_max_links_count() + 1
        t1 = time.time()
        thread3.crawl((url3, 0))
        t2 = time.time()
        self.assertAlmostEqual(t2 - t1,
                               ConfReader.instance().get_crawl_interval(), 0)
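
The test above assumes a local test site on localhost:8081 that exposes five links. A minimal sketch of a unittest harness it could live in is shown below; the class name and the module import paths are assumptions, not taken from the original project.

import time
import unittest
from Queue import Queue  # needed by test_crawl(); "queue" on Python 3

# Hypothetical module paths -- adjust to the actual project layout.
from crawler_thread import CrawlerThread, CrawlerThreadPool
from conf_reader import ConfReader


class CrawlerThreadTest(unittest.TestCase):

    # the test_crawl() method shown above would be defined here
    pass


if __name__ == '__main__':
    unittest.main()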
Example #2
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print "Dependences errors: "
            for err in deps_errors:
                print "  %s" % err
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print e
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # turn *.domain.com into a regex matching domain.com and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if v not in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if v not in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv

        probe_cmd = get_node_cmd()
        if not probe_cmd:  # maybe useless
            print "Error: unable to find node executable"
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_options.extend([
                "-y",
                "%s:%s:%s" % (Shared.options['proxy']['proto'],
                              Shared.options['proxy']['host'],
                              Shared.options['proxy']['port'])
            ])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'],
                                 proxy=json.dumps(Shared.options['proxy']),
                                 extra_headers=json.dumps(
                                     Shared.options['extra_headers']),
                                 cookies=json.dumps(start_cookies))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)" % (
            self.db_file, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
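
main() above reads its options via getopt and then expects two positional arguments: the start URL and the output database file. A minimal sketch of a launcher for it, assuming the class is importable as Crawler (the import path is hypothetical), might look like this:

#!/usr/bin/env python2
import sys

from crawler import Crawler  # hypothetical import path


if __name__ == "__main__":
    # everything after the script name goes straight to getopt, so the
    # last two arguments are the start URL and the output file, e.g.
    #   ./crawl.py -n 10 http://target.local out.db
    Crawler().main(sys.argv[1:])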
Example #3
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        probe_cmd = get_phantomjs_cmd()
        if not probe_cmd:
            print "Error: unable to find phantomjs executable"
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        display_progress = True
        verbose = False
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False
        user_script = None

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                if v == "tor": v = "socks5:127.0.0.1:9150"
                proxy = v.split(":")
                if proxy[0] not in ("http", "socks5"):
                    print "only http and socks5 proxies are supported"
                    sys.exit(1)
                Shared.options['proxy'] = {
                    "proto": proxy[0],
                    "host": proxy[1],
                    "port": proxy[2]
                }
            elif o == '-d':
                for ad in v.split(","):
                    # turn *.domain.com into a regex matching domain.com and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if v not in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if v not in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                verbose = True
            elif o == "-u":
                if os.path.isfile(v):
                    user_script = os.path.abspath(v)
                else:
                    print "error: unable to open USER_SCRIPT"
                    sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = self.parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" %
                             Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" %
                             (Shared.options['proxy']['host'],
                              Shared.options['proxy']['port']))

        probe_cmd.append(self.base_dir + 'probe/analyze.js')

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        if user_script:
            probe_options.extend(("-u", user_script))

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        if user_script and initial_checks:
            self.check_user_script_syntax(probe_cmd, user_script)

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        fname = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(fname, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'])

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            fname, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database, display_progress,
                       verbose)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
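
This variant drives PhantomJS (get_phantomjs_cmd, --proxy-type/--proxy) instead of the node-based probe of Example #2, and it parses the -p proxy option inline rather than through parse_proxy_string(). The inline logic, factored into a standalone helper for clarity, is sketched below; the function name is illustrative and not part of the project.

def parse_proxy_option(value):
    # "tor" is shorthand for the default Tor Browser SOCKS listener
    if value == "tor":
        value = "socks5:127.0.0.1:9150"
    proto, host, port = value.split(":")
    if proto not in ("http", "socks5"):
        raise ValueError("only http and socks5 proxies are supported")
    return {"proto": proto, "host": host, "port": port}


# parse_proxy_option("tor") -> {'proto': 'socks5', 'host': '127.0.0.1', 'port': '9150'}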