Example #1
def httpRequest(url, payload, headers, method='POST', timeout=DEFAULT_TIMEOUT, ctx_factory=None):
    # copied from twisted.web.client in order to get access to the
    # factory (which contains response codes, headers, etc)

    if type(url) is not str:
        e = HTTPRequestError('URL must be string, not %s' % type(url))
        return defer.fail(e)

    if not url.startswith('http'):
        e = HTTPRequestError('URL does not start with http (URL %s)' % (url))
        return defer.fail(e)

    log.msg(" -- Sending Payload to %s --\n%s\n -- END. Sending Payload --" % (url, payload), system=LOG_SYSTEM, payload=True)

    scheme, netloc, _, _, _, _ = twhttp.urlparse(url)
    if ':' not in netloc:
        host = netloc
        port = 80 if scheme == 'http' else 443
    else:
        host, s_port = netloc.split(':', 1)
        port = int(s_port)

    factory = twclient.HTTPClientFactory(url, method, postdata=payload, timeout=timeout)
    factory.noisy = False  # stop spewing about factory start/stop
    factory.protocol.handleStatus_204 = lambda _: None  # 204 is an ok reply, needed by NCS VPN backend

    # fix missing port in header (bug in twisted.web.client)
    factory.headers['host'] = host + ':' + str(port)
    factory.headers['User-Agent'] = 'OpenNSA/Twisted'

    for header, value in headers.items():
        factory.headers[header] = value

    if scheme == 'https':
        if ctx_factory is None:
            return defer.fail(HTTPRequestError('Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory)
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        if isinstance(err.value, ConnectionClosed): # note: this also includes ConnectionDone and ConnectionLost
            pass # these are pretty common when the remote shuts down
        elif isinstance(err.value, WebError):
            data = err.value.response
            log.msg(' -- Received Reply (fault) --\n%s\n -- END. Received Reply (fault) --' % data, system=LOG_SYSTEM, payload=True)
            return err
        elif isinstance(err.value, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' % (host, port, url), system=LOG_SYSTEM)
            return err
        else:
            return err

    def logReply(data):
        log.msg(" -- Received Reply --\n%s\n -- END. Received Reply --" % data, system=LOG_SYSTEM, payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)

    return factory.deferred
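
A minimal usage sketch (hypothetical URL, payload, and headers; assumes this module's own imports are in scope). The returned deferred fires with the response body, and invocationError passes WebError and ConnectionRefusedError failures through to the caller:

d = httpRequest('http://example.org/service', '<payload/>',
                {'Content-Type': 'text/xml'})
d.addCallbacks(lambda body: body, lambda failure: failure)
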
Example #2
        def cbFail(fail):

            if hasattr(fail.value, 'response'):
                if hasattr(fail.value.response, 'code'):
                    if fail.value.response.code == 307:
                        loc = fail.value.response.headers.getRawHeaders(
                            'location')
                        new = urlparse(loc[0])
                        newhost = '://'.join((new.scheme, new.netloc))
                        if newhost == self.host:
                            self.loc = None
                        else:
                            self.loc = newhost
                        self.log.debug('redirect: %s' % self.loc)
                        data = FileBodyProducer(StringIO(json.dumps(body)))
                        d = agent.request(
                            method, loc[0], Headers(headers), data)
                        d.addCallbacks(cbRequest, cbFail)
                        return d
                    elif fail.value.response.code == 404 and self.loc:
                        self.loc = None
                        host = '/'.join((self.host, path))
                        if self.token:
                            host += '?auth=' + self.token
                        d = self.request(method, host, Headers(headers), body)
                        d.addCallbacks(cbRequest, cbFail)
                        return d
                else:
                    print(dir(fail.value))
                    print(fail.value.message)
                    print(fail.value.args)

            self.log.error('unhandled failure: %s -- %s' % (
                fail.value.message, fail.value))
Example #3
def _parse(url, defaultPort=None):
	from urlparse import urlunparse
	url = url.strip()
	parsed = http.urlparse(url)
	scheme = parsed[0]
	path = urlunparse(('', '') + parsed[2:])

	if defaultPort is None:
		if scheme == 'https':
			defaultPort = 443
		else:
			defaultPort = 80

	host, port = parsed[1], defaultPort
	if ':' in host:
		host, port = host.split(':')
		try:
			port = int(port)
		except ValueError:
			port = defaultPort

	if path == '':
		path = '/'

	return scheme, '', host, port, path
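
For reference, a sketch of what this variant returns; the values below are inferred from the code above, and the empty string is the fixed placeholder element:

# _parse('http://example.com:8080/foo?q=1')
#   -> ('http', '', 'example.com', 8080, '/foo?q=1')
# _parse('https://example.com')
#   -> ('https', '', 'example.com', 443, '/')
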
Example #4
def _parse(url, defaultPort=None):
    from urlparse import urlunparse
    url = url.strip()
    parsed = http.urlparse(url)
    scheme = parsed[0]
    path = urlunparse(('', '') + parsed[2:])

    if defaultPort is None:
        if scheme == 'https':
            defaultPort = 443
        else:
            defaultPort = 80

    host, port = parsed[1], defaultPort
    if ':' in host:
        host, port = host.split(':')
        try:
            port = int(port)
        except ValueError:
            port = defaultPort

    if path == '':
        path = '/'

    return scheme, '', host, port, path
Example #5
def dummyRequest(method, path, headers, body=b""):
    """
    Construct a new dummy L{IRequest} provider.

    @param method: The HTTP method of the request.  For example, C{b"GET"}.
    @type method: L{bytes}

    @param path: The encoded path part of the URI of the request.  For example,
        C{b"/foo"}.
    @type path: L{bytes}

    @param headers: The headers of the request.
    @type headers: L{Headers}

    @param body: The bytes that make up the request body.
    @type body: L{bytes}

    @return: A L{IRequest} which can be used to render an L{IResource} using
        only in-memory data structures.
    """
    scheme, location, path, params, query, fragment = urlparse(path)
    if query:
        # Oops, dropped params.  Good thing no one cares.
        path = path + "?" + query
    return _DummyRequest(
        next(_dummyRequestCounter),
        method, path, headers, body)
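
The "dropped params" comment refers to semicolon-delimited path parameters, which urlparse splits out separately and this function never reassembles; a quick stdlib illustration with a hypothetical path:

# urlparse("/foo;these-are-params?q=1")
#   -> ('', '', '/foo', 'these-are-params', 'q=1', '')
# so the rebuilt path is "/foo?q=1" and the params segment is lost
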
Example #6
def dummyRequest(method, path, headers, body=b""):
    """
    Construct a new dummy L{IRequest} provider.

    @param method: The HTTP method of the request.  For example, C{b"GET"}.
    @type method: L{bytes}

    @param path: The encoded path part of the URI of the request.  For example,
        C{b"/foo"}.
    @type path: L{bytes}

    @param headers: The headers of the request.
    @type headers: L{Headers}

    @param body: The bytes that make up the request body.
    @type body: L{bytes}

    @return: A L{IRequest} which can be used to render an L{IResource} using
        only in-memory data structures.
    """
    parsed = urlparse(path)
    if parsed.query:
        # Oops, dropped params.  Good thing no one cares.
        new_path = parsed.path + "?" + parsed.query
    else:
        new_path = parsed.path
    return _DummyRequest(next(_dummyRequestCounter), method, new_path, headers,
                         body)
Example #7
def _parse(url, defaultPort=None):
    """
    Split the given URL into the scheme, host, port, and path.

    @type url: C{str}
    @param url: A URL to parse.

    @type defaultPort: C{int} or C{None}
    @param defaultPort: An alternate value to use as the port if the URL does
    not include one.

    @return: A four-tuple of the scheme, host, port, and path of the URL.  All
    of these are C{str} instances except for port, which is an C{int}.
    """
    url = url.strip()
    parsed = http.urlparse(url)
    scheme = parsed[0]
    path = urlunparse(('', '') + parsed[2:])
    if defaultPort is None:
        if scheme == 'https':
            defaultPort = 443
        else:
            defaultPort = 80
    host, port = parsed[1], defaultPort
    if ':' in host:
        host, port = host.split(':')
        port = int(port)
    if path == "":
        path = "/"
    return scheme, host, port, path
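
Unlike the hardened variants above, this one does not wrap int(port) in try/except, so a malformed port is fatal, and a bracketed IPv6 netloc breaks the two-way split as well (see the patched fromBytes in Example #17). With a hypothetical URL:

# _parse('http://example.com:notaport/')
#   -> ValueError: invalid literal for int() with base 10: 'notaport'
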
Example #8
 def __dispatch_url(self):
     self.__log.info('Stream connection to {url}',
                     url=self.transport.location)
     _scheme, _netloc, path_bytes, _params, query_bytes, _fragment = urlparse(
         bytes_or_ascii(self.transport.location))
     # py2/3: unquote returns str in either version but we want Unicode
     path = [
         six.text_type(urllib.parse.unquote(x))
         for x in path_bytes.split(b'/')
     ]
     assert path[0] == ''
     path[0:1] = []
     cap_string = path[0]
     if cap_string in self._caps:
         root_object = self._caps[cap_string]
         path[0:1] = []
     else:
         raise Exception('Unknown cap')  # TODO better error reporting
     if path == [AUDIO_STREAM_PATH_ELEMENT]:
         options = parse_audio_stream_options(parse_qs(query_bytes, 1))
         self.inner = AudioStreamInner(the_reactor, self.__send,
                                       root_object, options.sample_rate)
     elif len(path) >= 1 and path[0] == CAP_OBJECT_PATH_ELEMENT:
         # note _lookup_block may throw. TODO: Better error reporting
         root_object = _lookup_block(root_object, path[1:])
         self.inner = StateStreamInner(
             self.__send, root_object, path_bytes.decode('utf-8'),
             self.__subscription_context
         )  # note reuse of WS path as HTTP path; probably will regret this
     else:
         raise Exception('Unknown path: %r' % (path, ))
Example #9
        def cbFail(fail):

            if hasattr(fail.value, 'response'):
                if hasattr(fail.value.response, 'code'):
                    if fail.value.response.code == 307:
                        loc = fail.value.response.headers.getRawHeaders(
                            'location')
                        new = urlparse(loc[0])
                        newhost = '://'.join((new.scheme, new.netloc))
                        if newhost == self.host:
                            self.loc = None
                        else:
                            self.loc = newhost
                        self.log.debug('redirect: %s' % self.loc)
                        data = FileBodyProducer(StringIO(json.dumps(body)))
                        d = agent.request(method, loc[0], Headers(headers),
                                          data)
                        d.addCallbacks(cbRequest, cbFail)
                        return d
                    elif fail.value.response.code == 404 and self.loc:
                        self.loc = None
                        host = '/'.join((self.host, path))
                        if self.token:
                            host += '?auth=' + self.token
                        d = self.request(method, host, Headers(headers), body)
                        d.addCallbacks(cbRequest, cbFail)
                        return d
                else:
                    print(dir(fail.value))
                    print(fail.value.message)
                    print(fail.value.args)

            self.log.error('unhandled failure: %s -- %s' %
                           (fail.value.message, fail.value))
Example #10
File: client.py Project: ii0/treq
def _combine_query_params(url, params):
    parsed_url = urlparse(url.encode('ascii'))

    qs = []

    if parsed_url.query:
        qs.extend([parsed_url.query, b'&'])

    qs.append(urlencode(params, doseq=True))

    return urlunparse((parsed_url[0], parsed_url[1], parsed_url[2],
                       parsed_url[3], b''.join(qs), parsed_url[5]))
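
A sketch of the intended behavior under Python 2, where urlencode() returns a byte string that joins cleanly with the parsed components (hypothetical URL and params):

# _combine_query_params('http://example.com/path?a=1', {'b': '2'})
#   -> 'http://example.com/path?a=1&b=2'
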
Example #11
def _combine_query_params(url, params):
    parsed_url = urlparse(url.encode('ascii'))

    qs = []

    if parsed_url.query:
        qs.extend([parsed_url.query, b'&'])

    qs.append(urlencode(params, doseq=True))

    return urlunparse((parsed_url[0], parsed_url[1],
                       parsed_url[2], parsed_url[3],
                       b''.join(qs), parsed_url[5]))
Example #12
 def assertSameParsing(url, decode):
     """
     Verify that C{url} is parsed into the same objects by both
     L{http.urlparse} and L{urlparse}.
     """
     urlToStandardImplementation = url
     if decode:
         urlToStandardImplementation = url.decode('ascii')
     standardResult = urlparse(urlToStandardImplementation)
     scheme, netloc, path, params, query, fragment = http.urlparse(url)
     self.assertEqual((scheme, netloc, path, params, query, fragment),
                      standardResult)
     self.assertTrue(isinstance(scheme, str))
     self.assertTrue(isinstance(netloc, str))
     self.assertTrue(isinstance(path, str))
     self.assertTrue(isinstance(params, str))
     self.assertTrue(isinstance(query, str))
     self.assertTrue(isinstance(fragment, str))
Example #13
    def add_download(self, d):
        max_queued = int(self.get_setting('max_queued', 0))
        if max_queued and (len(self.get_downloads()) >= max_queued):
            raise Exception('Too many downloads queued (see "max_queued" config var)')

        if d.url:
            if not d.description:
                d.description = d.url
            if not d.filename:
                d.filename = unicode(urllib.unquote(
                    os.path.basename(http.urlparse(str(d.url))[2])
                    ))

        # Rather than guess mimetypes, just let default downloaders grab
        # the file and pass it off to secondary handlers if needed
        #if d.filename and not d.mime_type:
            #d.mime_type = unicode(mimetypes.guess_type(d.filename)[0])

        if not d.feed_id and not d.media_type:
            mt = d.mime_type
            if not mt and d.filename:
                mt = mimetypes.guess_type(d.filename)[0]
            if mt:
                if mt.startswith('video'):
                    d.media_type = u'video/other'
                elif mt.startswith('audio'):
                    d.media_type = u'audio/other'

        d.added = time()
        d.deleted = False
        d.active = False
        d.progress = 0
        d.status = Status.QUEUED
        d.downloaded = 0

        self.store.add(d)
        self.get_downloads().append(d)
        self.store.commit()
        logging.info(u'Added new download ' + d.description)
        self.application.fire_event('download_added', d)

        self.application.auto_queue()

        return d.id
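
The filename derivation above takes the path component (index 2) of the parsed URL, then strips directories and percent-encoding; a quick sketch with a hypothetical URL:

# http.urlparse(str(u'http://example.com/media/My%20Show.mp3'))[2]
#   -> '/media/My%20Show.mp3'
# os.path.basename(...) -> 'My%20Show.mp3'
# urllib.unquote(...)   -> 'My Show.mp3'
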
Example #14
    def add_download(self, d):
        max_queued = int(self.get_setting('max_queued', 0))
        if max_queued and (len(self.get_downloads()) >= max_queued):
            raise Exception('Too many downloads queued (see "max_queued" config var)')

        if d.url:
            if not d.description:
                d.description = d.url
            if not d.filename:
                d.filename = unicode(urllib.unquote(
                    os.path.basename(http.urlparse(str(d.url))[2])
                    ))

        # Rather than guess mimetypes, just let default downloaders grab
        # the file and pass it off to secondary handlers if needed
        #if d.filename and not d.mime_type:
            #d.mime_type = unicode(mimetypes.guess_type(d.filename)[0])

        if not d.feed_id and not d.media_type:
            mt = d.mime_type
            if not mt and d.filename:
                mt = mimetypes.guess_type(d.filename)[0]
            if mt:
                if mt.startswith('video'):
                    d.media_type = u'video/other'
                elif mt.startswith('audio'):
                    d.media_type = u'audio/other'

        d.added = time()
        d.deleted = False
        d.active = False
        d.progress = 0
        d.status = Status.QUEUED
        d.downloaded = 0

        self.store.add(d)
        self.get_downloads().append(d)
        self.store.commit()
        logging.debug(u'Added new download ' + d.description)
        self.application.fire_event('download_added', d)

        self.application.auto_queue()

        return d.id
Example #15
 def assertSameParsing(url, decode):
     """
     Verify that C{url} is parsed into the same objects by both
     L{http.urlparse} and L{urlparse}.
     """
     urlToStandardImplementation = url
     if decode:
         urlToStandardImplementation = url.decode('ascii')
     standardResult = urlparse(urlToStandardImplementation)
     scheme, netloc, path, params, query, fragment = http.urlparse(url)
     self.assertEqual(
         (scheme, netloc, path, params, query, fragment),
         standardResult)
     self.assertTrue(isinstance(scheme, str))
     self.assertTrue(isinstance(netloc, str))
     self.assertTrue(isinstance(path, str))
     self.assertTrue(isinstance(params, str))
     self.assertTrue(isinstance(query, str))
     self.assertTrue(isinstance(fragment, str))
Example #16
    def __init__(self, method, uri, headers, code=http.OK, content=None, client=None):
        self.method = method
        self.uri = uri
        self.requestHeaders = headers.copy()
        self.responseHeaders = Headers()
        self.code = code
        self._client = client
        self.prepath = []
        location = http.urlparse(self.uri)
        self.postpath = location.path[1:].split(b"/")
        self._finishDeferreds = []
        self.written = []
        self.finished = 0
        self.args = parse_qs(location.query, True)
        self.content = content

        contentType = self.requestHeaders.getRawHeaders(b"Content-Type", [None])[0]
        if method == b"POST" and contentType is not None:
            contentType = parse_header(contentType)[0]
            if contentType == b"application/x-www-form-urlencoded":
                self.args.update(parse_qs(self.content.read(), True))
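
The postpath/args split mirrors twisted.web routing; with a hypothetical request URI and the same parse_qs used above:

# location = http.urlparse(b'/foo/bar?x=1')
# location.path[1:].split(b'/') -> [b'foo', b'bar']
# parse_qs(location.query, True) -> {b'x': [b'1']}
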
Example #17
        def fromBytes(cls, uri, defaultPort=None):
            """Patched replacement for `twisted.web.client._URI.fromBytes`.

            The Twisted version of this function breaks when you give it a URL
            whose netloc is based on an IPv6 address.
            """
            uri = uri.strip()
            scheme, netloc, path, params, query, fragment = http.urlparse(uri)

            if defaultPort is None:
                scheme_ports = {b"https": 443, b"http": 80}
                defaultPort = scheme_ports.get(scheme, 80)

            if b"[" in netloc:
                # IPv6 address.  This is complicated.
                parsed_netloc = re.match(
                    b"\\[(?P<host>[0-9A-Fa-f:.]+)\\]([:](?P<port>[0-9]+))?$",
                    netloc,
                )
                host, port = parsed_netloc.group("host", "port")
            elif b":" in netloc:
                # IPv4 address or hostname, with port spec.  This is easy.
                host, port = netloc.split(b":")
            else:
                # IPv4 address or hostname, without port spec.
                # This is trivial.
                host = netloc
                port = None

            if port is None:
                port = defaultPort
            try:
                port = int(port)
            except ValueError:
                port = defaultPort

            return cls(
                scheme, netloc, host, port, path, params, query, fragment
            )
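
A sketch of the netloc cases this patch distinguishes (hypothetical addresses):

# netloc b'[2001:db8::1]:8080' -> host b'2001:db8::1', port b'8080'
# netloc b'[::1]'              -> host b'::1', port None (then defaultPort)
# netloc b'example.com:8080'   -> host b'example.com', port b'8080'
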
Example #18
def _parse(url, defaultPort=None):
    """
    Split the given URL into the scheme, host, port, and path.

    @type url: C{bytes}
    @param url: A URL to parse.

    @type defaultPort: C{int} or C{None}
    @param defaultPort: An alternate value to use as the port if the URL does
    not include one.

    @return: A four-tuple of the scheme, host, port, and path of the URL.  All
    of these are C{bytes} instances except for port, which is an C{int}.
    """
    print "X"*100
    url = url.strip()
    parsed = http.urlparse(url)
    scheme = parsed[0]
    path = urlunparse((b'', b'') + parsed[2:])

    if defaultPort is None:
        if scheme == b'https':
            defaultPort = 443
        else:
            defaultPort = 80

    host, port = parsed[1], defaultPort
    if b':' in host:
        host, port = host.split(b':')
        try:
            port = int(port)
        except ValueError:
            port = defaultPort

    if path == b'':
        path = b'/'

    return (scheme, host, port, path)
Example #19
def _coerce_and_validate_base_url(url_value, label, allowed_schemes, allow_path=False):
    """Convert url_value to string or None and validate it is a suitable base URL."""
    if url_value is not None:
        url_value = str(url_value)
        
        scheme, _netloc, path_bytes, _params, _query_bytes, _fragment = urlparse(bytes_or_ascii(url_value))
        
        # Ensure that the protocol is compatible.
        if scheme.lower() not in allowed_schemes:
            raise ConfigException('config.serve_web: {} must be a {} URL but was {}'.format(label, ' or '.join(repr_no_string_tag(s + ':') for s in allowed_schemes), repr_no_string_tag(url_value)))
        
        # Ensure that there are no path components. There are two reasons for this:
        # 1. The client makes use of host-relative URLs.
        # 2. Because ShinySDR makes heavy use of localStorage, and may in the
        #    future use other origin-scoped features, it is not safe to run
        #    ShinySDR on the same origin as another web application as they
        #    might collide with each other. Trying to reverse-proxy with an
        #    added path component does not _necessarily_ indicate an attempt
        #    to do this, but it'd be more work to support it so let's not
        #    bother.
        # However, neither reason applies to WebSocket addresses, so those are
        # allowed to have directory paths.
        if allow_path:
            if not path_bytes.endswith(b'/'):
                raise ConfigException('config.serve_web: {}\'s path must end in a slash, but had {}'.format(label, repr_no_string_tag(path_bytes)))
        else:
            if path_bytes != b'/':
                raise ConfigException('config.serve_web: {} must not have any path components, but had {}'.format(label, repr_no_string_tag(path_bytes)))
    
    return url_value
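
A usage sketch, assuming the bytes-oriented parse means allowed_schemes holds byte strings (label and URLs are hypothetical):

# _coerce_and_validate_base_url('https://radio.example/', 'root_url', (b'https',))
#   -> 'https://radio.example/'
# 'https://radio.example/app/' raises ConfigException under the default
# allow_path=False; with allow_path=True it passes because the path ends in '/'
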
Example #20
    def __init__(self, method, uri, headers, code=http.OK, content=None,
                 client=None):
        self.method = method
        self.uri = uri
        self.requestHeaders = headers.copy()
        self.responseHeaders = Headers()
        self.code = code
        self._client = client
        self.prepath = []
        location = http.urlparse(self.uri)
        self.postpath = location.path[1:].split(b'/')
        self._finishDeferreds = []
        self.written = []
        self.finished = 0
        self.args = parse_qs(location.query, True)
        self.content = content

        contentType = self.requestHeaders.getRawHeaders(
            b'Content-Type', [None])[0]
        if method == b'POST' and contentType is not None:
            contentType = parse_header(contentType)[0]
            if contentType == b'application/x-www-form-urlencoded':
                self.args.update(parse_qs(self.content.read(), True))
Example #21
        def _parse(url, defaultPort=None):
            url = url.strip()
            parsed = http.urlparse(url)
            scheme = parsed[0]
            path = urlunparse(("", "") + parsed[2:])

            if defaultPort is None:
                if scheme == "https":
                    defaultPort = 443
                else:
                    defaultPort = 80

            host, port = parsed[1], defaultPort
            if ":" in host:
                host, port = host.split(":")
                try:
                    port = int(port)
                except ValueError:
                    port = defaultPort

            if path == "":
                path = "/"

            return scheme, host, port, path
Example #22
 def parseConfigUrl(configUrl):
     parsed = urlparse(configUrl)
     # k8s API uses an authorization token, so no password on the URL, only username
     return parsed.hostname, parsed.port or 443, parsed.username
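
These accessors are plain stdlib ParseResult attributes; with a hypothetical config URL:

# urlparse('https://admin@k8s.example:6443/api')
#   .hostname -> 'k8s.example'   .port -> 6443   .username -> 'admin'
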
Example #23
 def parseConfigUrl(self, configUrl):
     parsed = urlparse(configUrl)
     return parsed.hostname, parsed.port or 2379, parsed.path
Example #24
    def parse(self, response):
        """
        Parse a crawled response: emit its content, then follow extracted links.
        """
        # @FIXME check crawl_depth == 0 early in parsing, elif >0 then parse as normal

        crawl_depth = response.meta.get('crawl_depth', self._default_crawl_depth)
        content_type = response.meta.get('content_type', 'UNKNOWN')
        if crawl_depth == self._default_crawl_depth:
            content_type = 'HTML'   # 1st URL most likely is HTML

        log = structlog.get_logger().bind(
            event = 'CRAWL',
            source_url = response.url,
            content_type = content_type)

        if content_type == 'HTML':
            body = response.xpath('//body')[0]
            yield {
                'source_url': response.url,
                'content_type': content_type,
                'title': response.xpath('//head//title/text()').extract_first(),
                'content': body.extract()}

            #
            links = body.xpath('.//a/@href')
            for element in links:
                raw_link = element.extract()
                parsed_url = http.urlparse(raw_link.encode('utf8')).decode()
                base_link = ''.join([
                    parsed_url.hostname or '',
                    parsed_url.path or '/']
                    ).rstrip('/')
                content_type = self.determine_type(base_link)
                if crawl_depth >= 0:
                    if base_link not in self._url_black_list:
                        if content_type in self._processable_ext:
                            log.info(
                                content_type = content_type,
                                action = 'FOLLOW_HREF',
                                href = raw_link,
                                crawl_depth = crawl_depth)

                            # @TODO handle other formats besides HTML
                            if content_type == 'HTML':
                                yield response.follow(
                                    raw_link,
                                    callback = self.parse,
                                    errback = self.errback,
                                    meta = dict(
                                        crawl_depth = crawl_depth - 1,
                                        content_type = content_type,
                                        splash = {
                                            'args': {
                                                'wait': 1,
                                                'html': 1,
                                            }
                                        }
                                    ),
                                )
                        else:
                            log.info(error = 'CANNOT_PROCESS_CONTENT_TYPE')
                    else:
                        log.info(error = 'BLACKLISTED_URL')
                elif crawl_depth == 0:
                    # do not crawl past this domain layer
                    # @TODO pass visited domains in metadata
                    pass  # @FIXME

        elif content_type in self._processable_ext:
            # @TODO
            yield {
                'source_url': response.url,
                'content_type': content_type,
                'title': response.url,
                'content': ''}
        elif content_type in ['IMAGE']:
            log.info(event = 'SKIP_CONTENT_TYPE')
        else:
            log.warn(
                response_code = response.status,
                error = 'UNABLE_TO_PARSE')
Example #25
def httpRequest(url,
                payload,
                headers,
                method='POST',
                timeout=DEFAULT_TIMEOUT,
                ctx_factory=None):
    # copied from twisted.web.client in order to get access to the
    # factory (which contains response codes, headers, etc)

    if type(url) is not str:
        e = HTTPRequestError('URL must be string, not %s' % type(url))
        return defer.fail(e)

    if not url.startswith('http'):
        e = HTTPRequestError('URL does not start with http (URL %s)' % (url))
        return defer.fail(e)

    log.msg(" -- Sending Payload to %s --\n%s\n -- END. Sending Payload --" %
            (url, payload),
            system=LOG_SYSTEM,
            payload=True)

    scheme, netloc, _, _, _, _ = twhttp.urlparse(url)
    if ':' not in netloc:
        host = netloc
        port = 80 if scheme == 'http' else 443
    else:
        host, s_port = netloc.split(':', 1)
        port = int(s_port)

    factory = twclient.HTTPClientFactory(url,
                                         method,
                                         postdata=payload,
                                         timeout=timeout)
    factory.noisy = False  # stop spewing about factory start/stop
    factory.protocol.handleStatus_204 = lambda _: None  # 204 is an ok reply, needed by NCS VPN backend

    # fix missing port in header (bug in twisted.web.client)
    factory.headers['host'] = host + ':' + str(port)
    factory.headers['User-Agent'] = 'OpenNSA/Twisted'

    for header, value in headers.items():
        factory.headers[header] = value

    if scheme == 'https':
        if ctx_factory is None:
            return defer.fail(
                HTTPRequestError(
                    'Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory)
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        if isinstance(
                err.value, ConnectionClosed
        ):  # note: this also includes ConnectionDone and ConnectionLost
            pass  # these are pretty common when the remote shuts down
        elif isinstance(err.value, WebError):
            data = err.value.response
            log.msg(
                ' -- Received Reply (fault) --\n%s\n -- END. Received Reply (fault) --'
                % data,
                system=LOG_SYSTEM,
                payload=True)
            return err
        elif isinstance(err.value, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' %
                    (host, port, url),
                    system=LOG_SYSTEM)
            return err
        else:
            return err

    def logReply(data):
        log.msg(" -- Received Reply --\n%s\n -- END. Received Reply --" % data,
                system=LOG_SYSTEM,
                payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)

    return factory.deferred
Example #26
 def parseConfigUrl(self, configUrl):
     parsed = urlparse(configUrl)
     return parsed.hostname, parsed.port or 2379, parsed.path
Example #27
    def render(self, request: server.Request) -> bytes:
        # Deny by default.
        request.setResponseCode(401)

        # Get session cookie value if any.
        sessionid = request.getCookie(self.cookie)
        if sessionid is not None:
            if sessionid in self.sessions:
                request.setResponseCode(200)
                self.log.info("Session: Validation succeeded")
                return b""
            else:
                self.log.info("Session: Invalid session id")

        # Token is passed as a query parameter in the original URL.
        origurl = http.urlparse(request.getHeader(self.header))
        query = http.parse_qs(origurl.query)
        args = query.get(self.param, [])
        if len(args) != 1:
            self.log.error("Request: Token {param} missing", param=self.param)
            return b""

        try:
            token = jwt.JWT(key=self.key, jwt=args[0].decode())
        except (jwt.JWTExpired, jwt.JWTNotYetValid, jwt.JWTMissingClaim,
                jwt.JWTInvalidClaimValue, jwt.JWTInvalidClaimFormat,
                jwt.JWTMissingKeyID, jwt.JWTMissingKey) as error:
            self.log.error("JWT token: {error}", error=error)
            return b""
        except Exception:
            self.log.failure("JWT token: Unknown exception")
            return b""

        try:
            claims = json.loads(token.claims)
        except json.JSONDecodeError as error:
            self.log.failure("JWT token: Claims {error}", error=error)
            return b""

        # Collect session parameters from claims.
        sessparams = claims.get("session", {})
        kwargs = {
            "expires": sessparams.get("expires", None),
            "domain": sessparams.get("domain", None),
            "path": sessparams.get("path", None),
            "secure": sessparams.get("secure", None),
            "httpOnly": sessparams.get("httpOnly", None),
            "sameSite": sessparams.get("sameSite", None),
        }

        # Use maxAge for session ttl if it is present, convert it into a str
        # type as required by the addCookie call.
        if "maxAge" in sessparams:
            kwargs["max_age"] = str(sessparams["maxAge"])
            sessttl = int(sessparams["maxAge"])
        else:
            sessttl = self.sessttl

        # Generate a new session id and remember it. Also clean it up after
        # ttl seconds.
        sessionid = secrets.token_urlsafe(nbytes=16).encode()
        self.sessions.add(sessionid)
        reactor.callLater(sessttl, self._session_remove, sessionid)
        self.log.info("Session: Created, num sessions: {sessions}",
                      sessions=len(self.sessions))

        # Set cookie in the browser.
        request.addCookie(self.cookie, sessionid, **kwargs)

        request.setResponseCode(200)
        self.log.info("JWT token: Validation succeeded")
        return b""
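
A sketch of the token-extraction step above, with a hypothetical original URL (http is twisted.web.http, as in the example):

# origurl = http.urlparse(b'https://app.example/login?token=abc')
# http.parse_qs(origurl.query).get(self.param, [])
#   -> [b'abc']   (assuming self.param == b'token')
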
Example #28
    def handle_html(self, response, html_selector):
        """
        Parse HTML and extract links

        :type response: scrapy.http.Response
        :type html_selector: scrapy.selector.Selector
        :yields: dict, scrapy.Request
        """
        # @TODO handles for different parts of the HTML. eg. body, head, frameset
        log = structlog.get_logger().bind(
            event = 'PARSE_HTML',
            module = __file__,
            source_url = response.url,
            content_type = 'HTML')

        crawl_depth = response.meta.get('crawl_depth', self._crawl_depth)
        title = response.data.get('title', response.url)

        try:
            body = html_selector.xpath('//body')[0]
        except IndexError:
            body = selector.Selector(text='')

        yield dict(
            source_url = response.url,
            crawl_timestamp = self._crawl_start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ'),
            title = title,
            content_type = 'HTML',
            content = body.extract())

        # add domain to set of traversed domains
        parsed_resp_url = http.urlparse(response.url.encode('utf8')).decode()
        self._traversed_domains.add(parsed_resp_url.netloc)

        # extract links
        linkextractor = LxmlLinkExtractor(
            allow = self._patterns_url_whitelist,
            deny = self._patterns_url_blacklist,
            allow_domains = self._patterns_domain_whitelist,
            deny_domains = self._patterns_domain_blacklist)
        href_list = linkextractor.extract_links(response)
        for link in href_list:
            # get the URL in string format
            href = link.url

            # separate meaningful pieces of URL
            try:
                parsed_href = http.urlparse(href.encode('utf8')).decode()
            except Exception:
                # typically href URL is invalid
                log.error(error = "INVALID_URL", href=href)
                continue

            # only parse HTTP links
            if parsed_href.scheme.upper() in ['HTTP', 'HTTPS']:
                # split the query string from the href, do not follow _href!
                _href = ''.join([
                    parsed_href.netloc,
                    parsed_href.path])

                # determine file type from the URL
                content_type = self.identify_type_from_url(_href)

                # make routing decision based on content type
                route = None
                if content_type in ['HTML']:
                    route = response.follow(
                        href,
                        callback = self.parse,
                        errback = self.errback,
                        meta = dict(
                            crawl_depth = crawl_depth - 1,
                            splash = {
                                'endpoint': 'render.json',
                                'args': {
                                    'html': 1,
                                    'iframes': 1,
                                    'timeout': 10,
                                }
                            }
                        )
                    )
                elif content_type in self._processable_ext:
                    log.info('@TODO')     # @TODO

                # is crawl at 0 depth?
                conditions = any([
                    crawl_depth > 0,
                    all([
                        crawl_depth <= 0,
                        parsed_href.netloc in self._traversed_domains
                        ]),
                    ])
                if conditions and route is not None:
                    yield route
Example #29
def httpRequest(url, payload, headers, method=b'POST', timeout=DEFAULT_TIMEOUT, ctx_factory=None):
    # copied from twisted.web.client in order to get access to the
    # factory (which contains response codes, headers, etc)

    # Make request work with both str and bytes url
    if type(url) is str:
        url = url.encode()

    if type(url) is not bytes:
        e = HTTPRequestError('URL must be bytes, not %s' % type(url))
        return defer.fail(e)

    if not url.startswith(b'http'):
        e = HTTPRequestError('URL does not start with http (URL %s)' % (url))
        return defer.fail(e)

    log.msg(" -- Sending Payload to {} --".format(url), system=LOG_SYSTEM, payload=True)
    log.msg(payload, system=LOG_SYSTEM, payload=True)
    log.msg(' -- END --', system=LOG_SYSTEM, payload=True)

    scheme, netloc, _, _, _, _ = twhttp.urlparse(url)
    if b':' not in netloc:
        host = netloc
        port = 80 if scheme == b'http' else 443
    else:
        host, s_port = netloc.split(b':', 1)
        port = int(s_port)

    factory = twclient.HTTPClientFactory(url, method, postdata=payload, timeout=timeout)
    factory.noisy = False # stop spewing about factory start/stop
    factory.protocol.handleStatus_204 = lambda _: None # 204 is an ok reply, needed by NCS VPN backend

    # fix missing port in header (possible bug in twisted.web.client, or just low-level library)
    factory.headers[b'host'] = netloc
    factory.headers[b'User-Agent'] = b'OpenNSA/Twisted'

    for header, value in headers.items():
        factory.headers[header.encode('utf-8')] = value.encode('utf-8')

    if scheme == b'https':
        if ctx_factory is None:
            return defer.fail(HTTPRequestError('Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory.getClientTLSOptions(host.decode()))
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        if isinstance(err.value, ConnectionClosed): # note: this also includes ConnectionDone and ConnectionLost
            # It might be better to just raise an error here, but at least we log it
            log.msg('ConnectionClosed failure: {} (this is usually okay during shutdown)'.format(err.value), system=LOG_SYSTEM)
            pass # these are pretty common when the remote shuts down
        elif isinstance(err.value, WebError):
            data = err.value.response
            log.msg(' -- Received Reply (fault) --', system=LOG_SYSTEM, payload=True)
            log.msg(data, system=LOG_SYSTEM, payload=True)
            log.msg(' -- END --', system=LOG_SYSTEM, payload=True)
            return err
        elif isinstance(err.value, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' % (host, port, url), system=LOG_SYSTEM)
            return err
        else:
            return err

    def logReply(data):
        log.msg(' -- Received Reply --', system=LOG_SYSTEM, payload=True)
        log.msg(data, system=LOG_SYSTEM, payload=True)
        log.msg('-- END --', system=LOG_SYSTEM, payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)

    return factory.deferred
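
Since this variant coerces str URLs to bytes up front, both call styles below should behave identically (hypothetical URL and payload):

# httpRequest('http://example.org/x', b'<payload/>', {})   # str URL, encoded internally
# httpRequest(b'http://example.org/x', b'<payload/>', {})  # bytes URL, used as-is
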