Ejemplo n.º 1
0
 def test_any_to_uri(self):
     """Check that any_to_uri maps local paths to file:// URIs and keeps URIs as they are."""
     # Only the path flavour native to the running platform is exercised.
     if os.name == 'nt':
         # Raw string: the Windows backslashes need no escaping and no
         # invalid escape sequence (such as "\c") can sneak in.
         self.assertEqual(any_to_uri(r"C:\windows\clock.avi"),
                          "file:///C:/windows/clock.avi")
     else:
         self.assertEqual(any_to_uri("/some/path.txt"),
                          "file:///some/path.txt")
     # Inputs that already carry a scheme are expected back unchanged.
     self.assertEqual(any_to_uri("file:///some/path.txt"),
                      "file:///some/path.txt")
     self.assertEqual(any_to_uri("http://www.example.com/some/path.txt"),
                      "http://www.example.com/some/path.txt")
Ejemplo n.º 2
0
 def test_any_to_uri(self):
     """Check that any_to_uri maps local paths to file:// URIs and keeps URIs as they are."""
     # Only the path flavour native to the running platform is exercised.
     if os.name == 'nt':
         # Raw string: the Windows backslashes need no escaping and no
         # invalid escape sequence (such as "\c") can sneak in.
         self.assertEqual(any_to_uri(r"C:\windows\clock.avi"),
                          "file:///C:/windows/clock.avi")
     else:
         self.assertEqual(any_to_uri("/some/path.txt"),
                          "file:///some/path.txt")
     # Inputs that already carry a scheme are expected back unchanged.
     self.assertEqual(any_to_uri("file:///some/path.txt"),
                      "file:///some/path.txt")
     self.assertEqual(any_to_uri("http://www.example.com/some/path.txt"),
                      "http://www.example.com/some/path.txt")
Ejemplo n.º 3
0
    def fetch(self, request_or_url, spider=None):
        """Schedule a fetch and populate the shell variables when it completes.

        ``request_or_url`` is either a ``Request`` (used as is) or a value
        that is converted with ``any_to_uri`` and wrapped in a new ``Request``
        created with ``dont_filter=True`` and ``handle_httpstatus_all`` set
        in its meta.  ``spider`` is passed through to ``self._schedule``.

        Nothing is returned; the outcome is handled by the callback and
        errback attached to the object returned by ``self._schedule``.
        """
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            request = Request(any_to_uri(request_or_url), dont_filter=True)
            request.meta['handle_httpstatus_all'] = True

        # TODO: known-bad solution - reported as not working.
        def callback(x):
            # Flag the kernel as busy while the shell variables are refreshed.
            parent = self.current_ipython_shell.get_parent()
            self.current_ipython_shell.kernel._publish_status('busy', parent)
            # The scheduled result is a (response, spider) pair.
            response, result_spider = x
            self.populate_vars(response, request, result_spider)
            self.current_ipython_shell.kernel._publish_status('idle', parent)

        def errback(err):
            # Same busy/idle bracketing, but only the traceback is printed.
            parent = self.current_ipython_shell.get_parent()
            self.current_ipython_shell.kernel._publish_status('busy', parent)
            err.printTraceback()
            self.current_ipython_shell.kernel._publish_status('idle', parent)

        d = self._schedule(request, spider)
        d.addCallback(callback)
        d.addErrback(errback)
Ejemplo n.º 4
0
 def fetch(self, request_or_url, meta, spider=None):
     """Queue a crawl of ``request_or_url`` on the crawler registered for a site.

     ``meta`` must provide ``'crawl_site_id'`` (the key used for both
     ``self.spiders`` and ``self.crawler_instances``) and ``'request'``
     (stored under ``request.meta['source']``).

     NOTE: the ``spider`` argument is ignored; the spider is always taken
     from ``self.spiders``.  An unknown site id raises ``KeyError``.
     """
     site_id = meta['crawl_site_id']
     # Look the spider up once, by subscript, so an unknown site id fails
     # immediately instead of after a silent ``None`` from ``.get``.
     spider = self.spiders[site_id]
     url = any_to_uri(request_or_url)
     request = Request(url, dont_filter=True, callback=spider.parse_item)
     request.meta['source'] = meta['request']
     self.crawler_instances[site_id].engine.crawl(request, spider)
Ejemplo n.º 5
0
    def fetch(self, request_or_url, spider=None):
        """Schedule a fetch and populate the shell variables when it completes.

        ``request_or_url`` is either a ``Request`` (used as is) or a value
        that is converted with ``any_to_uri`` and wrapped in a new ``Request``
        created with ``dont_filter=True`` and ``handle_httpstatus_all`` set
        in its meta.  ``spider`` is passed through to ``self._schedule``.

        Nothing is returned; the outcome is handled by the callback and
        errback attached to the object returned by ``self._schedule``.
        """
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            request = Request(any_to_uri(request_or_url), dont_filter=True)
            request.meta["handle_httpstatus_all"] = True

        # TODO: known-bad solution - reported as not working.
        def callback(x):
            # Flag the kernel as busy while the shell variables are refreshed.
            parent = self.current_ipython_shell.get_parent()
            self.current_ipython_shell.kernel._publish_status("busy", parent)
            # The scheduled result is a (response, spider) pair.
            response, result_spider = x
            self.populate_vars(response, request, result_spider)
            self.current_ipython_shell.kernel._publish_status("idle", parent)

        def errback(err):
            # Same busy/idle bracketing, but only the traceback is printed.
            parent = self.current_ipython_shell.get_parent()
            self.current_ipython_shell.kernel._publish_status("busy", parent)
            err.printTraceback()
            self.current_ipython_shell.kernel._publish_status("idle", parent)

        d = self._schedule(request, spider)
        d.addCallback(callback)
        d.addErrback(errback)
Ejemplo n.º 6
0
 def fetch(self, request_or_url, spider=None):
     """Fetch a request or URL and refresh the shell variables with the result.

     A ``Request`` instance is used unchanged; any other value is converted
     with ``any_to_uri`` and wrapped in a new ``Request`` created with
     ``dont_filter=True``.  ``spider`` is passed through to
     ``self._schedule``, whose result supplies the response and the spider
     handed to ``self.populate_vars``.
     """
     if isinstance(request_or_url, Request):
         request = request_or_url
     else:
         request = Request(any_to_uri(request_or_url), dont_filter=True)
     # ``self._schedule`` is invoked through the reactor; the call yields a
     # (response, spider) pair.
     response, spider = threads.blockingCallFromThread(
         reactor, self._schedule, request, spider)
     self.populate_vars(response, request, spider)
Ejemplo n.º 7
0
 def fetch(self, request_or_url, spider=None):
     """Fetch a request or URL and refresh the shell variables with the result.

     A ``Request`` instance is used unchanged; any other value is converted
     with ``any_to_uri`` and wrapped in a new ``Request`` created with
     ``dont_filter=True``.  ``spider`` is passed through to
     ``self._schedule``, whose result supplies the response and the spider
     handed to ``self.populate_vars``.
     """
     if isinstance(request_or_url, Request):
         request = request_or_url
     else:
         request = Request(any_to_uri(request_or_url), dont_filter=True)
     # ``self._schedule`` is invoked through the reactor; the call yields a
     # (response, spider) pair.
     response, spider = threads.blockingCallFromThread(
         reactor, self._schedule, request, spider)
     self.populate_vars(response, request, spider)
Ejemplo n.º 8
0
 def fetch(self, request_or_url, spider=None):
     """Fetch a request or URL and refresh the shell variables with the result.

     A ``Request`` instance is used unchanged; any other value is converted
     with ``any_to_uri`` and wrapped in a new ``Request`` created with
     ``dont_filter=True`` and ``handle_httpstatus_all`` set in its meta.

     If the scheduled call raises ``IgnoreRequest``, the shell variables
     are populated with a ``None`` response and the ``spider`` argument
     as given.
     """
     if isinstance(request_or_url, Request):
         request = request_or_url
     else:
         request = Request(any_to_uri(request_or_url), dont_filter=True)
         request.meta['handle_httpstatus_all'] = True
     # Stays None when the request is ignored below.
     response = None
     try:
         response, spider = threads.blockingCallFromThread(
             reactor, self._schedule, request, spider)
     except IgnoreRequest:
         # Deliberately tolerated: populate_vars still runs, with no response.
         pass
     self.populate_vars(response, request, spider)
Ejemplo n.º 9
0
 def fetch(self, request_or_url, spider=None):
     """Fetch a request or URL and refresh the shell variables with the result.

     A ``Request`` instance is used unchanged; any other value is converted
     with ``any_to_uri`` and wrapped in a new ``Request`` created with
     ``dont_filter=True`` and ``handle_httpstatus_all`` set in its meta.

     If the scheduled call raises ``IgnoreRequest``, the shell variables
     are populated with a ``None`` response and the ``spider`` argument
     as given.
     """
     if isinstance(request_or_url, Request):
         request = request_or_url
     else:
         request = Request(any_to_uri(request_or_url), dont_filter=True)
         request.meta['handle_httpstatus_all'] = True
     # Stays None when the request is ignored below.
     response = None
     try:
         response, spider = threads.blockingCallFromThread(
             reactor, self._schedule, request, spider)
     except IgnoreRequest:
         # Deliberately tolerated: populate_vars still runs, with no response.
         pass
     self.populate_vars(response, request, spider)
Ejemplo n.º 10
0
 def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
     """Fetch a request or URL, then refresh the shell variables.

     A ``Request`` instance is used unchanged (``redirect`` and ``kwargs``
     are not applied to it).  Any other value is converted with
     ``any_to_uri`` and wrapped in a new ``Request`` built with
     ``dont_filter=True`` plus ``**kwargs``.  Its meta then receives
     ``handle_httpstatus_list`` (a ``SequenceExclude`` over 300-399) when
     ``redirect`` is truthy, or ``handle_httpstatus_all = True`` otherwise.

     If the scheduled call raises ``IgnoreRequest``, ``populate_vars`` is
     still invoked, with ``None`` as the response.
     """
     request = request_or_url
     if not isinstance(request, Request):
         request = Request(any_to_uri(request_or_url), dont_filter=True, **kwargs)
         if not redirect:
             request.meta['handle_httpstatus_all'] = True
         else:
             request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
     fetched = None
     try:
         outcome = threads.blockingCallFromThread(
             reactor, self._schedule, request, spider)
         fetched, spider = outcome
     except IgnoreRequest:
         # Ignored requests leave the response as None.
         pass
     self.populate_vars(fetched, request, spider)
Ejemplo n.º 11
0
 def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
     """Fetch a request or URL, then refresh the shell variables.

     A ``Request`` instance is used unchanged (``redirect`` and ``kwargs``
     are not applied to it).  Any other value is converted with
     ``any_to_uri`` and wrapped in a new ``Request`` built with
     ``dont_filter=True`` plus ``**kwargs``.  Its meta then receives
     ``handle_httpstatus_list`` (a ``SequenceExclude`` over 300-399) when
     ``redirect`` is truthy, or ``handle_httpstatus_all = True`` otherwise.

     If the scheduled call raises ``IgnoreRequest``, ``populate_vars`` is
     still invoked, with ``None`` as the response.
     """
     request = request_or_url
     if not isinstance(request, Request):
         request = Request(any_to_uri(request_or_url), dont_filter=True, **kwargs)
         if not redirect:
             request.meta['handle_httpstatus_all'] = True
         else:
             request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
     fetched = None
     try:
         outcome = threads.blockingCallFromThread(
             reactor, self._schedule, request, spider)
         fetched, spider = outcome
     except IgnoreRequest:
         # Ignored requests leave the response as None.
         pass
     self.populate_vars(fetched, request, spider)
Ejemplo n.º 12
0
def guess_scheme(url):
    """Return *url* with a scheme, adding one only when it is missing.

    A URL that already parses with a scheme is returned untouched.
    Otherwise, input whose path looks like a file path goes through
    ``any_to_uri``; everything else goes through ``add_http_if_no_scheme``.
    """
    parsed = urlparse(url)
    if parsed.scheme:
        return url
    # Note: this does not match Windows filepath
    looks_like_filepath = re.match(r'''^                   # start with...
                    (
                        \.              # ...a single dot,
                        (
                            \. | [^/\.]+  # optionally followed by
                        )?                # either a second dot or some characters
                    )?      # optional match of ".", ".." or ".blabla"
                    /       # at least one "/" for a file path,
                    .       # and something after the "/"
                    ''', parsed.path, flags=re.VERBOSE)
    if looks_like_filepath:
        return any_to_uri(url)
    return add_http_if_no_scheme(url)
Ejemplo n.º 13
0
def validate_url(url):
    """Validate *url* and normalise it when it has no scheme.

    Returns:
        * ``url`` unchanged when it parses with a scheme;
        * ``False`` when it has neither a scheme nor a network location;
        * otherwise (no scheme, but a network location) the result of
          ``any_to_uri`` when the parsed path looks like a file path, or of
          ``add_http_if_no_scheme`` when it does not.

    Note that the return type is mixed (``False`` or a URL string), so
    callers should test the result with ``is False`` / truthiness.
    """
    _url = urlparse(url)
    if not _url.scheme:
        if not _url.netloc:
            return False
        elif re.match(r'''^                   # start with...
                            (
                                \.              # ...a single dot,
                                (
                                    \. | [^/\.]+  # optionally followed by
                                )?                # either a second dot or some characters
                            )?      # optional match of ".", ".." or ".blabla"
                            /       # at least one "/" for a file path,
                            .       # and something after the "/"
                            ''', _url.path, flags=re.VERBOSE):
            return any_to_uri(url)
        else:
            return add_http_if_no_scheme(url)
    return url