def httpRequest(url, payload, headers, method='POST', timeout=DEFAULT_TIMEOUT, ctx_factory=None):
    """
    Send C{payload} to C{url} through a C{twclient.HTTPClientFactory} and
    return the factory's deferred.

    The factory is driven directly (rather than through a convenience
    helper) so that response codes, headers, etc. stay reachable on it.

    @param url: Target URL; must be a C{str} starting with C{'http'}.
    @param payload: Request body, passed to the factory as C{postdata}.
    @param headers: Mapping of extra request headers copied onto the factory.
    @param method: HTTP method handed to the factory.
    @param timeout: Timeout handed to the factory.
    @param ctx_factory: Context factory passed to C{reactor.connectSSL};
        required when the URL scheme is C{'https'}.

    @return: The factory's deferred, or an already-failed deferred carrying
        an C{HTTPRequestError} when the arguments are rejected.
    """
    if type(url) is not str:
        return defer.fail(HTTPRequestError('URL must be string, not %s' % type(url)))

    if not url.startswith('http'):
        return defer.fail(HTTPRequestError('URL does not start with http (URL %s)' % (url)))

    log.msg(" -- Sending Payload to %s --\n%s\n -- END. Sending Payload --" % (url, payload),
            system=LOG_SYSTEM, payload=True)

    url_parts = twhttp.urlparse(url)
    scheme, netloc = url_parts[0], url_parts[1]

    if ':' in netloc:
        host, _colon, port_text = netloc.partition(':')
        port = int(port_text)
    else:
        host = netloc
        port = 80 if scheme == 'http' else 443

    factory = twclient.HTTPClientFactory(url, method, postdata=payload, timeout=timeout)
    # stop spewing about factory start/stop
    factory.noisy = False
    # 204 is an ok reply, needed by NCS VPN backend
    factory.protocol.handleStatus_204 = lambda _: None

    # fix missing port in header (bug in twisted.web.client)
    factory.headers['host'] = host + ':' + str(port)
    factory.headers['User-Agent'] = 'OpenNSA/Twisted'
    for name, value in headers.items():
        factory.headers[name] = value

    if scheme == 'https':
        if ctx_factory is None:
            return defer.fail(HTTPRequestError('Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory)
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        reason = err.value
        if isinstance(reason, ConnectionClosed):
            # note: this also includes ConnectionDone and ConnectionLost.
            # These are pretty common when the remote shuts down, so the
            # failure is dropped here.
            return None
        if isinstance(reason, WebError):
            log.msg(' -- Received Reply (fault) --\n%s\n -- END. Received Reply (fault) --' % reason.response,
                    system=LOG_SYSTEM, payload=True)
        elif isinstance(reason, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' % (host, port, url), system=LOG_SYSTEM)
        return err

    def logReply(data):
        log.msg(" -- Received Reply --\n%s\n -- END. Received Reply --" % data,
                system=LOG_SYSTEM, payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)
    return factory.deferred
def cbFail(fail):
    """
    Errback for a request: follow 307 redirects, retry once on a stale
    redirect target, and log every other failure.

    Relies on names from the enclosing scope (C{self}, C{agent}, C{method},
    C{headers}, C{body}, C{path}, C{cbRequest}).

    @param fail: The failure delivered to the errback; its C{value} may
        carry a C{response} object with C{code} and C{headers}.

    @return: A new deferred when the request is re-issued, otherwise
        C{None} (the failure is logged and not propagated).
    """
    response = getattr(fail.value, 'response', None)
    code = getattr(response, 'code', None)

    if code == 307:
        loc = response.headers.getRawHeaders('location')
        new = urlparse(loc[0])
        newhost = '://'.join((new.scheme, new.netloc))
        # Remember the redirect target only when it differs from our own host.
        self.loc = None if newhost == self.host else newhost
        self.log.debug('redirect: %s' % self.loc)
        data = FileBodyProducer(StringIO(json.dumps(body)))
        d = agent.request(method, loc[0], Headers(headers), data)
        d.addCallbacks(cbRequest, cbFail)
        return d

    if code == 404 and self.loc:
        # The remembered redirect target no longer answers: forget it and
        # retry against the configured host.
        self.loc = None
        host = '/'.join((self.host, path))
        if self.token:
            host += '?auth=' + self.token
        d = self.request(method, host, Headers(headers), body)
        d.addCallbacks(cbRequest, cbFail)
        return d

    # Anything else is reported through the logger. getErrorMessage() is
    # used because exception objects have no ``message`` attribute on
    # Python 3.
    self.log.error('unhandled failure: %s -- %s' % (
        fail.getErrorMessage(), fail.value))
def _parse(url, defaultPort=None):
    """
    Split C{url} into scheme, host, port and path.

    @param url: The URL to split; surrounding whitespace is ignored.
    @param defaultPort: Port to use when the URL carries none (or an
        unparsable one).  When C{None}, 443 is used for C{'https'} and 80
        for everything else.

    @return: A five-tuple C{(scheme, '', host, port, path)}; the second
        element is always the empty string.
    """
    from urlparse import urlunparse

    pieces = http.urlparse(url.strip())
    scheme = pieces[0]
    # Everything after the network location, re-assembled; never empty.
    path = urlunparse(('', '') + pieces[2:]) or '/'

    if defaultPort is None:
        defaultPort = 443 if scheme == 'https' else 80

    host, port = pieces[1], defaultPort
    if ':' in host:
        host, port = host.split(':')
        try:
            port = int(port)
        except ValueError:
            port = defaultPort

    return scheme, '', host, port, path
def dummyRequest(method, path, headers, body=b""):
    """
    Construct a new dummy L{IRequest} provider.

    @param method: The HTTP method of the request.  For example,
        C{b"GET"}.
    @type method: L{bytes}

    @param path: The encoded path part of the URI of the request.  For
        example, C{b"/foo"}.
    @type path: L{bytes}

    @param headers: The headers of the request.
    @type headers: L{Headers}

    @param body: The bytes that make up the request body.
    @type body: L{bytes}

    @return: A L{IRequest} which can be used to render an L{IResource} using
        only in-memory data structures.
    """
    scheme, location, path, params, query, fragment = urlparse(path)
    if query:
        # Oops, dropped params.  Good thing no one cares.
        # The separator must have the same type as the parsed path:
        # joining bytes with a text "?" raises TypeError on Python 3.
        separator = b"?" if isinstance(path, bytes) else "?"
        path = path + separator + query
    return _DummyRequest(
        next(_dummyRequestCounter), method, path, headers, body)
def dummyRequest(method, path, headers, body=b""):
    """
    Construct a new dummy L{IRequest} provider.

    @param method: The HTTP method of the request.  For example,
        C{b"GET"}.
    @type method: L{bytes}

    @param path: The encoded path part of the URI of the request.  For
        example, C{b"/foo"}.
    @type path: L{bytes}

    @param headers: The headers of the request.
    @type headers: L{Headers}

    @param body: The bytes that make up the request body.
    @type body: L{bytes}

    @return: A L{IRequest} which can be used to render an L{IResource} using
        only in-memory data structures.
    """
    parsed = urlparse(path)
    new_path = parsed.path
    if parsed.query:
        # Oops, dropped params.  Good thing no one cares.
        # The separator must have the same type as the parsed path:
        # joining bytes with a text "?" raises TypeError on Python 3.
        separator = b"?" if isinstance(new_path, bytes) else "?"
        new_path = new_path + separator + parsed.query
    return _DummyRequest(next(_dummyRequestCounter), method, new_path,
                         headers, body)
def _parse(url, defaultPort=None): """ Split the given URL into the scheme, host, port, and path. @type url: C{str} @param url: An URL to parse. @type defaultPort: C{int} or C{None} @param defaultPort: An alternate value to use as the port if the URL does not include one. @return: A four-tuple of the scheme, host, port, and path of the URL. All of these are C{str} instances except for port, which is an C{int}. """ url = url.strip() parsed = http.urlparse(url) scheme = parsed[0] path = urlunparse(('','')+parsed[2:]) if defaultPort is None: if scheme == 'https': defaultPort = 443 else: defaultPort = 80 host, port = parsed[1], defaultPort if ':' in host: host, port = host.split(':') port = int(port) if path == "": path = "/" return scheme, host, port, path
def __dispatch_url(self):
    """
    Choose the stream handler for this connection from the transport's
    location and store it in C{self.inner}.

    The first path segment selects a root object from C{self._caps}; the
    rest of the path selects either the audio stream or a state stream on a
    block looked up beneath that root.

    Raises C{Exception} for an unknown capability or an unrecognised path.
    """
    location = self.transport.location
    self.__log.info('Stream connection to {url}', url=location)

    parsed = urlparse(bytes_or_ascii(location))
    path_bytes, query_bytes = parsed[2], parsed[4]

    # py2/3: unquote returns str in either version but we want Unicode
    segments = []
    for raw_segment in path_bytes.split(b'/'):
        segments.append(six.text_type(urllib.parse.unquote(raw_segment)))

    # The path is expected to start with '/', giving an empty first segment.
    assert segments[0] == ''
    cap_string = segments[1]
    if cap_string not in self._caps:
        raise Exception('Unknown cap')  # TODO better error reporting
    root_object = self._caps[cap_string]
    path = segments[2:]

    if path == [AUDIO_STREAM_PATH_ELEMENT]:
        options = parse_audio_stream_options(parse_qs(query_bytes, 1))
        self.inner = AudioStreamInner(
            the_reactor, self.__send, root_object, options.sample_rate)
    elif path and path[0] == CAP_OBJECT_PATH_ELEMENT:
        # note _lookup_block may throw. TODO: Better error reporting
        root_object = _lookup_block(root_object, path[1:])
        # note reuse of WS path as HTTP path; probably will regret this
        self.inner = StateStreamInner(
            self.__send,
            root_object,
            path_bytes.decode('utf-8'),
            self.__subscription_context)
    else:
        raise Exception('Unknown path: %r' % (path, ))
def cbFail(fail):
    """
    Errback for a request: follow 307 redirects, retry once on a stale
    redirect target, and log every other failure.

    Relies on names from the enclosing scope (C{self}, C{agent}, C{method},
    C{headers}, C{body}, C{path}, C{cbRequest}).

    @param fail: The failure delivered to the errback; its C{value} may
        carry a C{response} object with C{code} and C{headers}.

    @return: A new deferred when the request is re-issued, otherwise
        C{None} (the failure is logged and not propagated).
    """
    response = getattr(fail.value, 'response', None)
    code = getattr(response, 'code', None)

    if code == 307:
        loc = response.headers.getRawHeaders('location')
        new = urlparse(loc[0])
        newhost = '://'.join((new.scheme, new.netloc))
        # Remember the redirect target only when it differs from our own host.
        self.loc = None if newhost == self.host else newhost
        self.log.debug('redirect: %s' % self.loc)
        data = FileBodyProducer(StringIO(json.dumps(body)))
        d = agent.request(method, loc[0], Headers(headers), data)
        d.addCallbacks(cbRequest, cbFail)
        return d

    if code == 404 and self.loc:
        # The remembered redirect target no longer answers: forget it and
        # retry against the configured host.
        self.loc = None
        host = '/'.join((self.host, path))
        if self.token:
            host += '?auth=' + self.token
        d = self.request(method, host, Headers(headers), body)
        d.addCallbacks(cbRequest, cbFail)
        return d

    # Anything else is reported through the logger. getErrorMessage() is
    # used because exception objects have no ``message`` attribute on
    # Python 3.
    self.log.error('unhandled failure: %s -- %s' % (
        fail.getErrorMessage(), fail.value))
def _combine_query_params(url, params):
    """
    Return C{url} with C{params} appended to its query string.

    The URL is ASCII-encoded before parsing; an existing query string is
    kept and the newly encoded parameters follow it after a C{&}.
    """
    scheme, netloc, path, path_params, query, fragment = urlparse(
        url.encode('ascii'))

    pieces = [query, b'&'] if query else []
    pieces.append(urlencode(params, doseq=True))

    return urlunparse(
        (scheme, netloc, path, path_params, b''.join(pieces), fragment))
def assertSameParsing(url, decode):
    """
    Verify that C{url} is parsed into the same objects by both
    L{http.urlparse} and L{urlparse}.

    @param url: The URL handed to L{http.urlparse}.
    @param decode: If true, C{url} is decoded as ASCII before being handed
        to L{urlparse}.
    """
    if decode:
        standardInput = url.decode('ascii')
    else:
        standardInput = url
    standardResult = urlparse(standardInput)

    scheme, netloc, path, params, query, fragment = http.urlparse(url)
    components = (scheme, netloc, path, params, query, fragment)

    self.assertEqual(components, standardResult)
    # Every component must be a native string.
    for component in components:
        self.assertTrue(isinstance(component, str))
def add_download(self, d):
    """
    Queue the download ``d``: fill in missing fields, persist it, and
    notify the application.

    Raises ``Exception`` when the ``max_queued`` setting is non-zero and the
    number of existing downloads has already reached it.

    Returns ``d.id``.
    """
    # A max_queued of 0 disables the limit.
    max_queued = int(self.get_setting('max_queued', 0))
    if max_queued and (len(self.get_downloads()) >= max_queued):
        raise Exception('Too many downloads queued (see "max_queued" config var)')
    if d.url:
        # Fall back to the URL itself as the description.
        if not d.description:
            d.description = d.url
        # Derive the filename from the last segment of the URL path.
        if not d.filename:
            d.filename = unicode(urllib.unquote(
                os.path.basename(http.urlparse(str(d.url))[2])
                ))
    # Rather than guess mimetypes, just let default downloaders grab
    # the file and pass it off to secondary handlers if needed
    #if d.filename and not d.mime_type:
        #d.mime_type = unicode(mimetypes.guess_type(d.filename)[0])
    # NOTE(review): the nesting of this block relative to ``if d.url:`` was
    # reconstructed from whitespace-stripped source -- confirm.
    if not d.feed_id and not d.media_type:
        # Use the declared mime type, or guess one from the filename, to
        # pick a coarse media type.
        mt = d.mime_type
        if not mt and d.filename:
            mt = mimetypes.guess_type(d.filename)[0]
        if mt:
            if mt.startswith('video'):
                d.media_type = u'video/other'
            elif mt.startswith('audio'):
                d.media_type = u'audio/other'
    # Reset the bookkeeping fields to the "freshly queued" state.
    d.added = time()
    d.deleted = False
    d.active = False
    d.progress = 0
    d.status = Status.QUEUED
    d.downloaded = 0
    self.store.add(d)
    self.get_downloads().append(d)
    self.store.commit()
    logging.info(u'Added new download ' + d.description)
    self.application.fire_event('download_added', d)
    self.application.auto_queue()
    return d.id
def add_download(self, d):
    """
    Queue the download ``d``: fill in missing fields, persist it, and
    notify the application.

    Raises ``Exception`` when the ``max_queued`` setting is non-zero and the
    number of existing downloads has already reached it.

    Returns ``d.id``.
    """
    # A max_queued of 0 disables the limit.
    max_queued = int(self.get_setting('max_queued', 0))
    if max_queued and (len(self.get_downloads()) >= max_queued):
        raise Exception('Too many downloads queued (see "max_queued" config var)')
    if d.url:
        # Fall back to the URL itself as the description.
        if not d.description:
            d.description = d.url
        # Derive the filename from the last segment of the URL path.
        if not d.filename:
            d.filename = unicode(urllib.unquote(
                os.path.basename(http.urlparse(str(d.url))[2])
                ))
    # Rather than guess mimetypes, just let default downloaders grab
    # the file and pass it off to secondary handlers if needed
    #if d.filename and not d.mime_type:
        #d.mime_type = unicode(mimetypes.guess_type(d.filename)[0])
    # NOTE(review): the nesting of this block relative to ``if d.url:`` was
    # reconstructed from whitespace-stripped source -- confirm.
    if not d.feed_id and not d.media_type:
        # Use the declared mime type, or guess one from the filename, to
        # pick a coarse media type.
        mt = d.mime_type
        if not mt and d.filename:
            mt = mimetypes.guess_type(d.filename)[0]
        if mt:
            if mt.startswith('video'):
                d.media_type = u'video/other'
            elif mt.startswith('audio'):
                d.media_type = u'audio/other'
    # Reset the bookkeeping fields to the "freshly queued" state.
    d.added = time()
    d.deleted = False
    d.active = False
    d.progress = 0
    d.status = Status.QUEUED
    d.downloaded = 0
    self.store.add(d)
    self.get_downloads().append(d)
    self.store.commit()
    logging.debug(u'Added new download ' + d.description)
    self.application.fire_event('download_added', d)
    self.application.auto_queue()
    return d.id
def assertSameParsing(url, decode):
    """
    Verify that C{url} is parsed into the same objects by both
    L{http.urlparse} and L{urlparse}.

    @param url: The URL handed to L{http.urlparse}.
    @param decode: If true, C{url} is decoded as ASCII before being handed
        to L{urlparse}.
    """
    if decode:
        standardInput = url.decode('ascii')
    else:
        standardInput = url
    standardResult = urlparse(standardInput)

    scheme, netloc, path, params, query, fragment = http.urlparse(url)
    components = (scheme, netloc, path, params, query, fragment)

    self.assertEqual(components, standardResult)
    # Every component must be a native string.
    for component in components:
        self.assertTrue(isinstance(component, str))
def __init__(self, method, uri, headers, code=http.OK, content=None, client=None):
    """
    Set up the request state from a method, URI and headers.

    @param method: The request method, e.g. C{b"POST"}.
    @param uri: The request URI; its path and query string are parsed to
        fill C{postpath} and C{args}.
    @param headers: Request headers; a copy is stored.
    @param code: Initial response code.
    @param content: File-like request body; read when the request is a
        form-encoded POST.
    @param client: Stored as C{_client}.
    """
    self.method = method
    self.uri = uri
    self.requestHeaders = headers.copy()
    self.responseHeaders = Headers()
    self.code = code
    self._client = client

    self.prepath = []
    parsedUri = http.urlparse(self.uri)
    # Drop the leading slash before splitting into segments.
    self.postpath = parsedUri.path[1:].split(b"/")

    self._finishDeferreds = []
    self.written = []
    self.finished = 0

    self.args = parse_qs(parsedUri.query, True)
    self.content = content

    rawContentType = self.requestHeaders.getRawHeaders(
        b"Content-Type", [None])[0]
    if method != b"POST" or rawContentType is None:
        return
    # Form-encoded POST bodies contribute additional arguments.
    mediaType = parse_header(rawContentType)[0]
    if mediaType == b"application/x-www-form-urlencoded":
        self.args.update(parse_qs(self.content.read(), True))
def fromBytes(cls, uri, defaultPort=None):
    """Patched replacement for `twisted.web.client._URI.fromBytes`.

    The Twisted version of this function breaks when you give it a URL
    whose netloc is based on an IPv6 address.

    :param uri: The URI to parse, as bytes; surrounding whitespace is
        ignored.
    :param defaultPort: Port to use when the URI has none or an unparsable
        one. When None, 443 is used for ``https`` and 80 otherwise.
    :return: ``cls(scheme, netloc, host, port, path, params, query,
        fragment)``.
    """
    uri = uri.strip()
    scheme, netloc, path, params, query, fragment = http.urlparse(uri)

    if defaultPort is None:
        scheme_ports = {b"https": 443, b"http": 80}
        defaultPort = scheme_ports.get(scheme, 80)

    # Fallback: the whole netloc is the host and the port is the default.
    host, port = netloc, None
    if b"[" in netloc:
        # IPv6 address.  This is complicated.
        parsed_netloc = re.match(
            b"\\[(?P<host>[0-9A-Fa-f:.]+)\\]([:](?P<port>[0-9]+))?$",
            netloc,
        )
        # A bracketed netloc that does not fit the pattern (for example a
        # non-numeric port) keeps the fallback rather than crashing on
        # ``None.group``.
        if parsed_netloc is not None:
            host, port = parsed_netloc.group("host", "port")
    elif b":" in netloc:
        # IPv4 address or hostname, with port spec.  Split on the last
        # colon so additional colons cannot break the unpacking.
        host, _colon, port = netloc.rpartition(b":")

    if port is None:
        port = defaultPort
    try:
        port = int(port)
    except ValueError:
        port = defaultPort

    return cls(
        scheme, netloc, host, port, path, params, query, fragment
    )
def _parse(url, defaultPort=None): """ Split the given URL into the scheme, host, port, and path. @type url: C{bytes} @param url: An URL to parse. @type defaultPort: C{int} or C{None} @param defaultPort: An alternate value to use as the port if the URL does not include one. @return: A four-tuple of the scheme, host, port, and path of the URL. All of these are C{bytes} instances except for port, which is an C{int}. """ print "X"*100 url = url.strip() parsed = http.urlparse(url) scheme = parsed[0] path = urlunparse((b'', b'') + parsed[2:]) if defaultPort is None: if scheme == b'https': defaultPort = 443 else: defaultPort = 80 host, port = parsed[1], defaultPort if b':' in host: host, port = host.split(b':') try: port = int(port) except ValueError: port = defaultPort if path == b'': path = b'/' return (scheme, host, port, path)
def _coerce_and_validate_base_url(url_value, label, allowed_schemes, allow_path=False):
    """Coerce url_value to a string (None passes through) and check it is a suitable base URL.

    label names the setting in error messages. allowed_schemes lists the
    acceptable URL schemes. allow_path permits a directory path (one ending
    in a slash) instead of requiring the bare root path.

    Raises ConfigException when the scheme or path is unacceptable.
    """
    if url_value is None:
        return None

    url_value = str(url_value)
    parsed = urlparse(bytes_or_ascii(url_value))
    scheme, path_bytes = parsed[0], parsed[2]

    # Ensure that the protocol is compatible.
    if scheme.lower() not in allowed_schemes:
        raise ConfigException('config.serve_web: {} must be a {} URL but was {}'.format(
            label,
            ' or '.join(repr_no_string_tag(s + ':') for s in allowed_schemes),
            repr_no_string_tag(url_value)))

    # Ensure that there are no path components. There are two reasons for this:
    # 1. The client makes use of host-relative URLs.
    # 2. Because ShinySDR makes heavy use of localStorage, and may in the
    #    future use other origin-scoped features, it is not safe to run
    #    ShinySDR on the same origin as another web application as they might
    #    collide with each other. Trying to reverse-proxy with an added path
    #    component does not _necessarily_ indicate an attempt to do this, but
    #    it'd be more work to support it so let's not bother.
    # However, neither reason applies to WebSocket addresses, so those are
    # allowed to have directory paths.
    if allow_path:
        path_ok = path_bytes.endswith(b'/')
        template = "config.serve_web: {}'s path must end in a slash, but had {}"
    else:
        path_ok = path_bytes == b'/'
        template = 'config.serve_web: {} must not have any path components, but had {}'
    if not path_ok:
        raise ConfigException(template.format(label, repr_no_string_tag(path_bytes)))

    return url_value
def __init__(self, method, uri, headers, code=http.OK, content=None, client=None):
    """
    Set up the request state from a method, URI and headers.

    @param method: The request method, e.g. C{b'POST'}.
    @param uri: The request URI; its path and query string are parsed to
        fill C{postpath} and C{args}.
    @param headers: Request headers; a copy is stored.
    @param code: Initial response code.
    @param content: File-like request body; read when the request is a
        form-encoded POST.
    @param client: Stored as C{_client}.
    """
    self.method = method
    self.uri = uri
    self.requestHeaders = headers.copy()
    self.responseHeaders = Headers()
    self.code = code
    self._client = client

    self.prepath = []
    parsedUri = http.urlparse(self.uri)
    # Drop the leading slash before splitting into segments.
    self.postpath = parsedUri.path[1:].split(b'/')

    self._finishDeferreds = []
    self.written = []
    self.finished = 0

    self.args = parse_qs(parsedUri.query, True)
    self.content = content

    rawContentType = self.requestHeaders.getRawHeaders(
        b'Content-Type', [None])[0]
    if method != b'POST' or rawContentType is None:
        return
    # Form-encoded POST bodies contribute additional arguments.
    mediaType = parse_header(rawContentType)[0]
    if mediaType == b'application/x-www-form-urlencoded':
        self.args.update(parse_qs(self.content.read(), True))
def _parse(url, defaultPort=None): url = url.strip() parsed = http.urlparse(url) scheme = parsed[0] path = urlunparse(("", "") + parsed[2:]) if defaultPort is None: if scheme == "https": defaultPort = 443 else: defaultPort = 80 host, port = parsed[1], defaultPort if ":" in host: host, port = host.split(":") try: port = int(port) except ValueError: port = defaultPort if path == "": path = "/" return scheme, host, port, path
def parseConfigUrl(configUrl):
    """
    Split a configuration URL into ``(hostname, port, username)``.

    The port falls back to 443 when the URL does not specify one.
    """
    parts = urlparse(configUrl)
    port = parts.port
    if not port:
        port = 443
    # k8s API uses an authorization token, so no password on the URL, only username
    return (parts.hostname, port, parts.username)
def parseConfigUrl(self, configUrl):
    """
    Split a configuration URL into ``(hostname, port, path)``.

    The port falls back to 2379 when the URL does not specify one.
    """
    parts = urlparse(configUrl)
    port = parts.port
    if not port:
        port = 2379
    return (parts.hostname, port, parts.path)
def parse(self, response): """ """ # @FIXME check crawl_depth == 0 early in parsing, elif >0 then parse as normal crawl_depth = response.meta.get('crawl_depth', self._default_crawl_depth) content_type = response.meta.get('content_type', 'UNKNOWN') if crawl_depth == self._default_crawl_depth: content_type = 'HTML' # 1st URL most likly is HTML log = structlog.get_logger().bind( event = 'CRAWL', source_url = response.url, content_type = content_type) if content_type == 'HTML': body = response.xpath('//body')[0] yield { 'source_url': response.url, 'content_type': content_type, 'title': response.xpath('//head//title/text()').extract_first(), 'content': body.extract()} # links = body.xpath('.//a/@href') for element in links: raw_link = element.extract() parsed_url = http.urlparse(raw_link.encode('utf8')).decode() base_link = ''.join([ parsed_url.hostname or '', parsed_url.path or '/'] ).rstrip('/') content_type = self.determine_type(base_link) proceed_to_crawl = all([ crawl_depth >= 0, base_link not in self._url_black_list, content_type in self._processable_ext]) if crawl_depth >= 0: if base_link not in self._url_black_list: if content_type in self._processable_ext: log.info( content_type = content_type, action = 'FOLLOW_HREF', href = raw_link, crawl_depth = crawl_depth) # @TODO handle other formats besides HTML if content_type == 'HTML': yield response.follow( raw_link, callback = self.parse, errback = self.errback, meta = dict( crawl_depth = crawl_depth - 1, content_type = content_type, splash = { 'args': { 'wait': 1, 'html': 1, } } ), ) else: log.info(error = 'CANNOT_PROCESS_CONTENT_TYPE') else: log.info(error = 'BLACKLISTED_URL') elif crawl_depth == 0: # do not crawl past this domain layer # @TODO pass visited domains in metadata '@FIXME' elif content_type in self._processable_ext: # @TODO yield { 'source_url': response.url, 'content_type': content_type, 'title': response.url, 'content': ''} elif content_type in ['IMAGE']: log.info(event = 'SKIP_CONTENT_TYPE') else: 
log.warn( response_code = response.status, error = 'UNABLE_TO_PARSE')
def httpRequest(url, payload, headers, method='POST', timeout=DEFAULT_TIMEOUT, ctx_factory=None):
    """
    Send C{payload} to C{url} through a C{twclient.HTTPClientFactory} and
    return the factory's deferred.

    The factory is driven directly (rather than through a convenience
    helper) so that response codes, headers, etc. stay reachable on it.

    @param url: Target URL; must be a C{str} starting with C{'http'}.
    @param payload: Request body, passed to the factory as C{postdata}.
    @param headers: Mapping of extra request headers copied onto the factory.
    @param method: HTTP method handed to the factory.
    @param timeout: Timeout handed to the factory.
    @param ctx_factory: Context factory passed to C{reactor.connectSSL};
        required when the URL scheme is C{'https'}.

    @return: The factory's deferred, or an already-failed deferred carrying
        an C{HTTPRequestError} when the arguments are rejected.
    """
    if type(url) is not str:
        return defer.fail(HTTPRequestError('URL must be string, not %s' % type(url)))

    if not url.startswith('http'):
        return defer.fail(HTTPRequestError('URL does not start with http (URL %s)' % (url)))

    log.msg(" -- Sending Payload to %s --\n%s\n -- END. Sending Payload --" % (url, payload),
            system=LOG_SYSTEM, payload=True)

    url_parts = twhttp.urlparse(url)
    scheme, netloc = url_parts[0], url_parts[1]

    if ':' in netloc:
        host, _colon, port_text = netloc.partition(':')
        port = int(port_text)
    else:
        host = netloc
        port = 80 if scheme == 'http' else 443

    factory = twclient.HTTPClientFactory(url, method, postdata=payload, timeout=timeout)
    # stop spewing about factory start/stop
    factory.noisy = False
    # 204 is an ok reply, needed by NCS VPN backend
    factory.protocol.handleStatus_204 = lambda _: None

    # fix missing port in header (bug in twisted.web.client)
    factory.headers['host'] = host + ':' + str(port)
    factory.headers['User-Agent'] = 'OpenNSA/Twisted'
    for name, value in headers.items():
        factory.headers[name] = value

    if scheme == 'https':
        if ctx_factory is None:
            return defer.fail(HTTPRequestError('Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory)
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        reason = err.value
        if isinstance(reason, ConnectionClosed):
            # note: this also includes ConnectionDone and ConnectionLost.
            # These are pretty common when the remote shuts down, so the
            # failure is dropped here.
            return None
        if isinstance(reason, WebError):
            log.msg(' -- Received Reply (fault) --\n%s\n -- END. Received Reply (fault) --' % reason.response,
                    system=LOG_SYSTEM, payload=True)
        elif isinstance(reason, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' % (host, port, url), system=LOG_SYSTEM)
        return err

    def logReply(data):
        log.msg(" -- Received Reply --\n%s\n -- END. Received Reply --" % data,
                system=LOG_SYSTEM, payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)
    return factory.deferred
def render(self, request: server.Request) -> bytes:
    """
    Authorise a request by session cookie or by a JWT passed in the
    original URL.

    The response code starts at 401 and is only raised to 200 when the
    session cookie is known or the token validates; in the latter case a
    new session id is issued as a cookie. The body is always empty.

    :param request: The request being authorised.
    :return: ``b""``; the outcome is carried by the response code.
    """
    # Deny by default.
    request.setResponseCode(401)

    # Get session cookie value if any.
    sessionid = request.getCookie(self.cookie)
    if sessionid is not None:
        if sessionid in self.sessions:
            request.setResponseCode(200)
            self.log.info("Session: Validation succeeded")
            return b""
        else:
            self.log.info("Session: Invalid session id")

    # Token is passed as a query parameter in the original URL. Without
    # the header there is nothing to parse, so deny instead of letting
    # urlparse fail on None.
    origheader = request.getHeader(self.header)
    if origheader is None:
        self.log.error("Request: Header {header} missing",
                       header=self.header)
        return b""
    origurl = http.urlparse(origheader)
    query = http.parse_qs(origurl.query)
    args = query.get(self.param, [])
    if len(args) != 1:
        self.log.error("Request: Token {param} missing", param=self.param)
        return b""

    try:
        token = jwt.JWT(key=self.key, jwt=args[0].decode())
    except (jwt.JWTExpired, jwt.JWTNotYetValid, jwt.JWTMissingClaim,
            jwt.JWTInvalidClaimValue, jwt.JWTInvalidClaimFormat,
            jwt.JWTMissingKeyID, jwt.JWTMissingKey) as error:
        self.log.error("JWT token: {error}", error=error)
        return b""
    except Exception:
        self.log.failure("JWT token: Unknown exception")
        return b""

    try:
        claims = json.loads(token.claims)
    except json.JSONDecodeError as error:
        self.log.failure("JWT token: Claims {error}", error=error)
        return b""

    # Collect session parameters from claims.
    sessparams = claims.get("session", {})
    kwargs = {
        "expires": sessparams.get("expires", None),
        "domain": sessparams.get("domain", None),
        "path": sessparams.get("path", None),
        "secure": sessparams.get("secure", None),
        "httpOnly": sessparams.get("httpOnly", None),
        "sameSite": sessparams.get("sameSite", None),
    }

    # Use maxAge for session ttl if it is present, convert it into a str
    # type as required by the addCookie call.
    if "maxAge" in sessparams:
        kwargs["max_age"] = str(sessparams["maxAge"])
        sessttl = int(sessparams["maxAge"])
    else:
        sessttl = self.sessttl

    # Generate a new session id and remember it. Also clean it up after
    # ttl seconds.
    sessionid = secrets.token_urlsafe(nbytes=16).encode()
    self.sessions.add(sessionid)
    reactor.callLater(sessttl, self._session_remove, sessionid)
    self.log.info("Session: Created, num sessions: {sessions}",
                  sessions=len(self.sessions))

    # Set cookie in the browser.
    request.addCookie(self.cookie, sessionid, **kwargs)
    request.setResponseCode(200)
    self.log.info("JWT token: Validation succeeded")
    return b""
def handle_html(self, response, html_selector):
    """
    Parse HTML and extract links

    Yields one item describing the page, then a follow-up request for
    each extracted HTTP(S) link classified as HTML, subject to the crawl
    depth / traversed-domain rule below.

    :type response: scrapy.http.Response
    :type html_selector: scrapy.selector.Selector
    :yields: dict, scrapy.Request
    """
    # @TODO handles for different parts of the HTML. eg. body, head, frameset
    log = structlog.get_logger().bind(
        event = 'PARSE_HTML',
        module = __file__,
        source_url = response.url,
        content_type = 'HTML')

    crawl_depth = response.meta.get('crawl_depth', self._crawl_depth)
    title = response.data.get('title', response.url)

    try:
        body = html_selector.xpath('//body')[0]
    except IndexError:
        # No <body> element: emit an empty document instead.
        body = selector.Selector(text='')

    yield dict(
        source_url = response.url,
        crawl_timestamp = self._crawl_start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ'),
        title = title,
        content_type = 'HTML',
        content = body.extract())

    # add domain to set of traversed domains
    parsed_resp_url = http.urlparse(response.url.encode('utf')).decode()
    self._traversed_domains.add(parsed_resp_url.netloc)

    # extract links
    linkextractor = LxmlLinkExtractor(
        allow = self._patterns_url_whitelist,
        deny = self._patterns_url_blacklist,
        allow_domains = self._patterns_domain_whitelist,
        deny_domains = self._patterns_domain_blacklist)
    href_list = linkextractor.extract_links(response)

    for link in href_list:
        # get the URL in string format
        href = link.url

        # separate meaningful pieces of URL
        try:
            parsed_href = http.urlparse(href.encode('utf8')).decode()
        except Exception:
            # typically href URL is invalid.  Only ordinary errors are
            # caught so KeyboardInterrupt/SystemExit still propagate.
            log.error(error = "INVALID_URL", href=href)
            continue

        # only parse HTTP links
        if parsed_href.scheme.upper() not in ['HTTP', 'HTTPS']:
            continue

        # split the query string from the href, do not follow _href!
        _href = ''.join([
            parsed_href.netloc,
            parsed_href.path])

        # determine file type from the URL
        content_type = self.identify_type_from_url(_href)

        # make routing decision based on content type
        route = None
        if content_type in ['HTML']:
            route = response.follow(
                href,
                callback = self.parse,
                errback = self.errback,
                meta = dict(
                    crawl_depth = crawl_depth - 1,
                    splash = {
                        'endpoint': 'render.json',
                        'args': {
                            'html': 1,
                            'iframes': 1,
                            'timeout': 10,
                        }
                    }
                )
            )
        elif content_type in self._processable_ext:
            log.info('@TODO') # @TODO

        # is crawl at 0 depth?  Once the depth budget is used up, only
        # domains that were already traversed may still be followed.
        conditions = crawl_depth > 0 or (
            crawl_depth <= 0
            and parsed_href.netloc in self._traversed_domains)
        if conditions and route is not None:
            yield route
def httpRequest(url, payload, headers, method=b'POST', timeout=DEFAULT_TIMEOUT, ctx_factory=None):
    """
    Send C{payload} to C{url} through a C{twclient.HTTPClientFactory} and
    return the factory's deferred.

    The factory is driven directly (rather than through a convenience
    helper) so that response codes, headers, etc. stay reachable on it.

    @param url: Target URL as C{bytes} or C{str} (a C{str} is encoded
        first); must start with C{http}.
    @param payload: Request body, passed to the factory as C{postdata}.
    @param headers: Mapping of extra request headers; names and values are
        UTF-8 encoded before being copied onto the factory.
    @param method: HTTP method handed to the factory.
    @param timeout: Timeout handed to the factory.
    @param ctx_factory: Object whose C{getClientTLSOptions(hostname)} result
        is passed to C{reactor.connectSSL}; required for C{https} URLs.

    @return: The factory's deferred, or an already-failed deferred carrying
        an C{HTTPRequestError} when the arguments are rejected.
    """
    # Make request work with both str and bytes url
    if isinstance(url, str):
        url = url.encode()

    if not isinstance(url, bytes):
        e = HTTPRequestError('URL must be bytes, not %s' % type(url))
        return defer.fail(e)

    if not url.startswith(b'http'):
        e = HTTPRequestError('URL does not start with http (URL %s)' % (url))
        return defer.fail(e)

    log.msg(" -- Sending Payload to {} --".format(url), system=LOG_SYSTEM, payload=True)
    log.msg(payload, system=LOG_SYSTEM, payload=True)
    log.msg(' -- END --', system=LOG_SYSTEM, payload=True)

    scheme, netloc, _, _, _, _ = twhttp.urlparse(url)
    if b':' not in netloc:
        host = netloc
        # The scheme is bytes here; comparing it with the text literal
        # 'http' never matches on Python 3 and sent plain-http requests to
        # port 443.
        port = 80 if scheme == b'http' else 443
    else:
        host, s_port = netloc.split(b':', 1)
        port = int(s_port)

    factory = twclient.HTTPClientFactory(url, method, postdata=payload, timeout=timeout)
    factory.noisy = False  # stop spewing about factory start/stop
    factory.protocol.handleStatus_204 = lambda _: None  # 204 is an ok reply, needed by NCS VPN backend

    # fix missing port in header (possible bug in twisted.web.client, or just low-level library)
    factory.headers[b'host'] = netloc
    factory.headers[b'User-Agent'] = b'OpenNSA/Twisted'

    for header, value in headers.items():
        factory.headers[header.encode('utf-8')] = value.encode('utf-8')

    if scheme == b'https':
        if ctx_factory is None:
            return defer.fail(HTTPRequestError('Cannot perform https request without context factory'))
        reactor.connectSSL(host, port, factory, ctx_factory.getClientTLSOptions(host.decode()))
    else:
        reactor.connectTCP(host, port, factory)

    def invocationError(err):
        if isinstance(err.value, ConnectionClosed):
            # note: this also includes ConnectionDone and ConnectionLost
            # It might be better to just raise an error here, but at least we log it.
            # These are pretty common when the remote shuts down, so the
            # failure is not propagated.
            log.msg('ConnectionClosed failure: {} (this is usually okay during shutdown)'.format(err.value),
                    system=LOG_SYSTEM)
            return None
        if isinstance(err.value, WebError):
            data = err.value.response
            log.msg(' -- Received Reply (fault) --', system=LOG_SYSTEM, payload=True)
            log.msg(data, system=LOG_SYSTEM, payload=True)
            log.msg(' -- END --', system=LOG_SYSTEM, payload=True)
        elif isinstance(err.value, ConnectionRefusedError):
            log.msg('Connection refused for %s:%i. Request URL: %s' % (host, port, url), system=LOG_SYSTEM)
        return err

    def logReply(data):
        log.msg(' -- Received Reply --', system=LOG_SYSTEM, payload=True)
        log.msg(data, system=LOG_SYSTEM, payload=True)
        log.msg('-- END --', system=LOG_SYSTEM, payload=True)
        return data

    factory.deferred.addCallbacks(logReply, invocationError)
    return factory.deferred