def test_any_to_uri(self):
    """Check ``any_to_uri`` on a native filesystem path and on existing URIs."""
    # Filesystem paths are platform-specific, so only the form native to the
    # running OS is exercised.
    if os.name == 'nt':
        # Raw string: the previous literal relied on the invalid escape
        # sequence "\c"; the resulting value is the same.
        self.assertEqual(any_to_uri(r"C:\windows\clock.avi"),
                         "file:///C:/windows/clock.avi")
    else:
        self.assertEqual(any_to_uri("/some/path.txt"),
                         "file:///some/path.txt")
    # Inputs that already carry a scheme must come back unchanged.
    self.assertEqual(any_to_uri("file:///some/path.txt"),
                     "file:///some/path.txt")
    self.assertEqual(any_to_uri("http://www.example.com/some/path.txt"),
                     "http://www.example.com/some/path.txt")
def fetch(self, request_or_url, spider=None):
    """Schedule a request and hand its result to the shell when it arrives.

    ``request_or_url`` is either a ``Request``, which is scheduled as given,
    or any other value, which is converted with ``any_to_uri`` and wrapped in
    a new ``Request`` created with ``dont_filter=True`` and
    ``meta['handle_httpstatus_all'] = True``.

    The call does not wait for the result: ``callback`` and ``errback`` are
    attached to the object returned by ``self._schedule(request, spider)``.
    Nothing is returned.
    """
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        request = Request(any_to_uri(request_or_url), dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

    # TODO: the original author flagged this callback-based approach as a
    # bad solution that does not work.
    def callback(x):
        # ``x`` is unpacked as a (response, spider) pair.
        parent = self.current_ipython_shell.get_parent()
        self.current_ipython_shell.kernel._publish_status('busy', parent)
        response, spider = x
        self.populate_vars(response, request, spider)
        self.current_ipython_shell.kernel._publish_status('idle', parent)

    def errback(err):
        # Report the failure between the same busy/idle status messages.
        parent = self.current_ipython_shell.get_parent()
        self.current_ipython_shell.kernel._publish_status('busy', parent)
        err.printTraceback()
        self.current_ipython_shell.kernel._publish_status('idle', parent)

    d = self._schedule(request, spider)
    d.addCallback(callback)
    d.addErrback(errback)
def fetch(self, request_or_url, meta, spider=None):
    """Queue ``request_or_url`` on the crawler selected by ``meta``.

    ``meta['crawl_site_id']`` is the key used to look up both the spider in
    ``self.spiders`` and the crawler in ``self.crawler_instances``.
    ``meta['request']`` is stored on the new request as ``meta['source']``.

    The ``spider`` argument is accepted but always replaced by the spider
    looked up for the site id.
    """
    crawl_site = meta['crawl_site_id']
    spider = self.spiders.get(crawl_site)

    target_uri = any_to_uri(request_or_url)
    # Indexing (not .get) here: an unknown site id raises KeyError.
    item_callback = self.spiders[crawl_site].parse_item
    request = Request(target_uri, dont_filter=True, callback=item_callback)
    request.meta['source'] = meta['request']

    engine = self.crawler_instances[crawl_site].engine
    engine.crawl(request, spider)
def fetch(self, request_or_url, spider=None):
    """Schedule a request and hand its result to the shell when it arrives.

    ``request_or_url`` is either a ``Request``, which is scheduled as given,
    or any other value, which is converted with ``any_to_uri`` and wrapped in
    a new ``Request`` created with ``dont_filter=True`` and
    ``meta["handle_httpstatus_all"] = True``.

    The call does not wait for the result: ``callback`` and ``errback`` are
    attached to the object returned by ``self._schedule(request, spider)``.
    Nothing is returned.
    """
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        request = Request(any_to_uri(request_or_url), dont_filter=True)
        request.meta["handle_httpstatus_all"] = True

    # TODO: the original author flagged this callback-based approach as a
    # bad solution that does not work.
    def callback(x):
        # ``x`` is unpacked as a (response, spider) pair.
        parent = self.current_ipython_shell.get_parent()
        self.current_ipython_shell.kernel._publish_status("busy", parent)
        response, spider = x
        self.populate_vars(response, request, spider)
        self.current_ipython_shell.kernel._publish_status("idle", parent)

    def errback(err):
        # Report the failure between the same busy/idle status messages.
        parent = self.current_ipython_shell.get_parent()
        self.current_ipython_shell.kernel._publish_status("busy", parent)
        err.printTraceback()
        self.current_ipython_shell.kernel._publish_status("idle", parent)

    d = self._schedule(request, spider)
    d.addCallback(callback)
    d.addErrback(errback)
def fetch(self, request_or_url, spider=None):
    """Fetch a request (or URL) and pass the result to ``populate_vars``.

    ``request_or_url`` is either a ``Request``, which is used as given, or
    any other value, which is converted with ``any_to_uri`` and wrapped in a
    new ``Request`` created with ``dont_filter=True``.

    Any exception raised by the scheduled call propagates to the caller;
    ``populate_vars`` is then not called.
    """
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        request = Request(any_to_uri(request_or_url), dont_filter=True)

    # Run self._schedule through the reactor and wait here for its
    # (response, spider) result.
    response, spider = threads.blockingCallFromThread(
        reactor, self._schedule, request, spider)
    self.populate_vars(response, request, spider)
def fetch(self, request_or_url, spider=None):
    """Fetch a request (or URL) and pass the result to ``populate_vars``.

    ``request_or_url`` is either a ``Request``, which is used as given, or
    any other value, which is converted with ``any_to_uri`` and wrapped in a
    new ``Request`` created with ``dont_filter=True`` and
    ``meta['handle_httpstatus_all'] = True``.

    If the scheduled call raises ``IgnoreRequest``, ``populate_vars`` is
    still called, with ``response`` set to ``None`` and the ``spider`` that
    was passed in.
    """
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        request = Request(any_to_uri(request_or_url), dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

    response = None
    try:
        # Run self._schedule through the reactor and wait here for its
        # (response, spider) result.
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        # Deliberate: an ignored request is not an error here; the variables
        # are populated with response=None below.
        pass
    self.populate_vars(response, request, spider)
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    """Fetch a request (or URL) and pass the result to ``populate_vars``.

    When ``request_or_url`` is not a ``Request``, it is converted with
    ``any_to_uri`` and wrapped in ``Request(url, dont_filter=True, **kwargs)``.
    The new request's meta is then set according to ``redirect``:

    * truthy: ``handle_httpstatus_list`` = ``SequenceExclude(range(300, 400))``
    * falsy:  ``handle_httpstatus_all`` = ``True``

    When a ``Request`` is passed in, it is used as given and both
    ``redirect`` and ``kwargs`` are ignored.

    If the scheduled call raises ``IgnoreRequest``, ``populate_vars`` is
    still called with ``response`` set to ``None``.
    """
    if not isinstance(request_or_url, Request):
        target = any_to_uri(request_or_url)
        request = Request(target, dont_filter=True, **kwargs)
        if not redirect:
            request.meta['handle_httpstatus_all'] = True
        else:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
    else:
        request = request_or_url

    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        # Ignored requests produce no response; spider stays as passed in.
        response = None

    self.populate_vars(response, request, spider)
def guess_scheme(url):
    """Return *url* with a scheme, choosing one when none is present.

    A URL that already parses with a scheme is returned unchanged.
    Otherwise, input whose path looks like a file path (an optional leading
    ".", ".." or ".name", then "/" followed by at least one character) is
    passed to ``any_to_uri``; everything else is passed to
    ``add_http_if_no_scheme``.
    """
    parsed = urlparse(url)
    if parsed.scheme:
        return url

    # Note: this does not match Windows filepath
    looks_like_path = re.match(
        r'''^                       # start with...
        (
            \.                      # ...a single dot,
            (
                \. | [^/\.]+        # optionally followed by
            )?                      # either a second dot or some characters
        )?                          # optional match of ".", ".." or ".blabla"
        /                           # at least one "/" for a file path,
        .                           # and something after the "/"
        ''',
        parsed.path,
        flags=re.VERBOSE,
    )
    resolve = any_to_uri if looks_like_path else add_http_if_no_scheme
    return resolve(url)
def validate_url(url):
    """Validate *url* and return it with a scheme, or ``False`` if rejected.

    * A URL that already parses with a scheme is returned unchanged.
    * Input with neither a scheme nor a network location returns ``False``.
    * Otherwise (network location but no scheme), input whose path looks like
      a file path (an optional leading ".", ".." or ".name", then "/"
      followed by at least one character) is passed to ``any_to_uri``;
      anything else is passed to ``add_http_if_no_scheme``.

    The function no longer prints the parsed URL to stdout; that was
    leftover debugging output.
    """
    parsed = urlparse(url)
    if parsed.scheme:
        return url
    if not parsed.netloc:
        return False

    if re.match(
        r'''^                       # start with...
        (
            \.                      # ...a single dot,
            (
                \. | [^/\.]+        # optionally followed by
            )?                      # either a second dot or some characters
        )?                          # optional match of ".", ".." or ".blabla"
        /                           # at least one "/" for a file path,
        .                           # and something after the "/"
        ''',
        parsed.path,
        flags=re.VERBOSE,
    ):
        return any_to_uri(url)
    return add_http_if_no_scheme(url)