Ejemplo n.º 1
0
    def crawl(self, root_url):
        """Crawl outward from ``root_url`` until the queue drains or a limit hits.

        Requests are taken from a work queue as ``(request, depth)`` pairs.
        Each request is sent with ``wcurl._send_req``; the requests extracted
        from its response by ``self._get_reqs_from_resp`` and filtered by
        ``self._do_with_reqs`` are queued at ``depth + 1``.

        The loop stops when the queue is empty, or as soon as a dequeued item
        exceeds ``self.depth_limit``, the value of
        ``self.get_discovery_time()`` exceeds ``self.time_limit``, or
        ``self.num_reqs`` exceeds ``self.req_limit``.

        Side effects: sets ``self._target_domain`` and ``self._start_time``,
        appends the root URL to ``self._url_list``, adds every processed
        request to ``self._already_seen_reqs`` and keeps ``self.num_reqs``
        equal to the size of that set.

        :param root_url: start address, either a ``URL`` instance or a value
            accepted by the ``URL`` constructor.
        :return: None.
        """
        root_url_obj = root_url if isinstance(root_url, URL) else URL(root_url)
        self._target_domain = root_url_obj.get_host()
        self._url_list.append(root_url_obj)

        q = Queue()
        q.put((Request(root_url_obj), 0))
        self._start_time = time.time()

        while not q.empty():
            # The original line read `this.req, depth - q.get()`, which never
            # bound the two names used by the rest of the loop.
            this_req, depth = q.get()

            # Blocked extensions are skipped before any limit is evaluated.
            if this_req.get_url().get_ext() in self._block_ext:
                continue

            # NOTE(review): breaking (rather than skipping) on depth assumes
            # the queue is FIFO, so every remaining item is at least as deep.
            if depth > self.depth_limit:
                print("depth limit break")
                break
            if self.get_discovery_time() > self.time_limit:
                print("time limit break")
                break
            if self.num_reqs > self.req_limit:
                print("reqs num limit break")
                break

            # Requests are recorded in `_already_seen_reqs` below, so the
            # duplicate check has to consult that same set.
            # NOTE(review): the original tested `self._already_seen_urls`;
            # confirm no separate URL-level set was intended here.
            if this_req in self._already_seen_reqs:
                continue

            try:
                self._already_seen_reqs.add(this_req)
                om.info("%s:%s" %
                        (this_req.get_method(), this_req.get_url().url_string))

                response = None
                try:
                    response = wcurl._send_req(this_req)
                except Exception as e:
                    # Best effort: report the failure and move on to the
                    # next queued request.
                    print(str(e))

                # Check for a missing response first so that `is_404` is
                # never handed None.
                if response is None:
                    continue
                if is_404(response):
                    continue

                new_reqs = self._get_reqs_from_resp(response)
                filter_reqs = self._do_with_reqs(new_reqs)

                # Children sit one level below the request that produced them.
                depth = depth + 1
                for req in filter_reqs:
                    q.put((req, depth))

                self.num_reqs = len(self._already_seen_reqs)
                om.info("Already Send Reqs!:" + str(self.num_reqs) +
                        "Left Reqs:" + str(q.qsize()))
            except Exception as e:
                # print_exc() writes the traceback itself and returns None,
                # so its result must not be printed as well.
                traceback.print_exc()
                om.info("ERROR:Can't process url'%s'(%s)" %
                        (this_req.get_url(), e))
Ejemplo n.º 2
0
def TestURL():
    """Smoke-test the ``URL`` accessors against a few sample addresses.

    Asserts host, port, path, filename, extension and fragment for one full
    address, and prints the port of that address plus the filename reported
    for a path written without and with a trailing slash.
    """
    parsed = URL("http://www.anquanbao.com/book/index.php?id=1#top")
    assert parsed.get_host() == "www.anquanbao.com"
    print(parsed.get_port())

    # Remaining accessors and the value each must return, checked in order.
    expectations = (
        ("get_port", 80),
        ("get_path", "/book/index.php"),
        ("get_filename", "index.php"),
        ("get_ext", "php"),
        ("get_fragment", "top"),
    )
    for accessor, wanted in expectations:
        assert getattr(parsed, accessor)() == wanted

    # No expected value is asserted for these; the filename is only printed.
    for address in ("http://www.anquanbao.com/book",
                    "http://www.anquanbao.com/book/"):
        print(URL(address).get_filename())