# Module-level imports needed by crawl().
import time
import traceback
from Queue import Queue


def crawl(self, root_url):
    # Accept either a URL object or a raw string.
    if not isinstance(root_url, URL):
        root_url_obj = URL(root_url)
    else:
        root_url_obj = root_url

    self._target_domain = root_url_obj.get_host()
    self._url_list.append(root_url_obj)

    root_req = Request(root_url_obj)
    q = Queue()
    q.put((root_req, 0))
    self._start_time = time.time()

    # Breadth-first crawl: pop a request, send it, then enqueue the
    # requests extracted from the response one depth level deeper.
    while True:
        if q.empty():
            break
        this_req, depth = q.get()

        # Skip blocked extensions outright.
        if this_req.get_url().get_ext() in self._block_ext:
            continue

        # Hard stop conditions: depth, elapsed time, request budget.
        if depth > self.depth_limit:
            print "depth limit break"
            break
        if self.get_discovery_time() > self.time_limit:
            print "time limit break"
            break
        if self.num_reqs > self.req_limit:
            print "reqs num limit break"
            break

        # Skip requests we have already sent.
        if this_req in self._already_seen_reqs:
            continue

        try:
            self._already_seen_reqs.add(this_req)
            om.info("%s:%s" % (this_req.get_method(),
                               this_req.get_url().url_string))

            response = None
            try:
                response = wcurl._send_req(this_req)
            except Exception, e:
                print str(e)

            # A failed send or a soft-404 yields nothing to crawl.
            if response is None:
                continue
            if is_404(response):
                continue

            # Extract new requests from the response, filter them, and
            # enqueue the survivors at the next depth level.
            new_reqs = self._get_reqs_from_resp(response)
            filter_reqs = self._do_with_reqs(new_reqs)
            for req in filter_reqs:
                q.put((req, depth + 1))

            self.num_reqs = len(self._already_seen_reqs)
            om.info("Already sent reqs: " + str(self.num_reqs) +
                    " Left reqs: " + str(q.qsize()))
        except Exception, e:
            traceback.print_exc()
            om.info("ERROR: Can't process url '%s' (%s)" %
                    (this_req.get_url(), e))
            continue
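# Minimal usage sketch. "Crawler" and its no-argument constructor are
# assumptions for illustration -- only crawl() appears in this file; the
# limit attributes are the ones crawl() reads above.
if __name__ == "__main__":
    c = Crawler()
    c.depth_limit = 3       # stop the BFS beyond this depth
    c.time_limit = 60       # seconds allowed for discovery
    c.req_limit = 500       # maximum number of requests to send
    c.crawl("http://www.anquanbao.com/")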
def TestURL():
    url = URL("http://www.anquanbao.com/book/index.php?id=1#top")
    assert url.get_host() == "www.anquanbao.com"
    print url.get_port()
    assert url.get_port() == 80
    assert url.get_path() == "/book/index.php"
    assert url.get_filename() == "index.php"
    assert url.get_ext() == "php"
    assert url.get_fragment() == "top"

    # Edge cases: a path without a trailing slash and one with it.
    url = URL("http://www.anquanbao.com/book")
    print url.get_filename()
    url = URL("http://www.anquanbao.com/book/")
    print url.get_filename()
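# Simple runner so the checks can be exercised from the command line;
# the __main__ guard is an addition, not part of the original file.
# Any failed assert raises AssertionError and stops the run.
if __name__ == "__main__":
    TestURL()
    print "all URL assertions passed"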