def __call__(self, data: str):
    """Validate that *data* is a well-formed Redis connection URL.

    Raises ValidationError when the URL cannot be parsed, when the ``db``
    query parameter is not numeric, or when scheme-specific constraints
    are violated (unix sockets take a path but no host/port; redis and
    redis+tls require a host).
    """
    try:
        url = URL(data)
    except ValueError:
        raise ValidationError(_("URL cannot be parsed"), code="parse_error")
    if url.has_query_param('db'):
        # BUG FIX: this branch validates the 'db' query parameter (the
        # database number), but the original raised "Invalid port
        # specified" / code="invalid_port" — a copy-paste error.
        if not url.query_param('db').isdigit():
            raise ValidationError(_("Invalid database number specified"),
                                  code="invalid_database")
    if url.scheme() == "unix":
        # Unix domain sockets are addressed by filesystem path only.
        if url.host():
            raise ValidationError(
                _("Hostname not supported for unix domain sockets"),
                code="unix_domain_socket_hostname")
        if url.port():
            raise ValidationError(
                _("Port not supported for unix domain sockets"),
                code="unix_domain_socket_port")
        if not url.path():
            raise ValidationError(
                _("No path specified for unix domain socket"),
                code="unix_domain_socket_path")
    if url.scheme() in ("redis", "redis+tls"):
        if not url.host():
            raise ValidationError(_("No host specified"), code="host_missing")
def url(self, path='/', **query):
    """Build a URL for *path* on this server.

    Missing pieces are filled in: the host defaults to ``self.host`` and
    the scheme defaults to ``'http'``.  Each keyword argument becomes a
    query parameter on the resulting URL.
    """
    built = URL(path)
    built = built if built.host() else built.host(self.host)
    built = built if built.scheme() else built.scheme('http')
    for name, value in query.items():
        built = built.query_param(name, value)
    return built
def deal_domain(response):
    """Annotate *response* with the domain, scheme and page prefix parsed
    from its own URL."""
    parsed = URL(response.url)
    response.page_domain = parsed.domain()
    response.scheme = parsed.scheme()
    # e.g. 'http' + '://' + 'example.com' + '/'
    response.page_prefix = response.scheme + '://' + response.page_domain + '/'
def __init__(self, url, save_dir='tmp'):
    """
    Site-saving helper.

    @url: full url of a site
    @save_dir: dir to save site; a subdirectory named after the current
        timestamp (to the minute) is created beneath it for this run
    """
    # Per-run logger writing to sitelog.log under save_dir.
    self.logger = logger('file', 'sitelog.log', save_dir)
    self.logger.info('-' * 20)
    self.logger.info('start')
    self.logger.info('start func: __init__')
    self.logger.info('url: %s' % url)
    # Timestamp names this run's output directory, e.g. '201801021530'.
    save_time = datetime.strftime(datetime.now(), '%Y%m%d%H%M')
    self.save_time = save_time
    self.save_dir = os.path.abspath(os.path.join(save_dir, save_time))
    # create dir if not exist
    if not os.path.isdir(self.save_dir):
        os.makedirs(self.save_dir)
    self.url = url
    u = URL(url)
    # get host like: http://m.sohu.xom
    self.host = u.scheme() + '://' + u.host()
    # BUG FIX: was a Python-2-only print statement; the parenthesized
    # single-argument form prints identically under Python 2 and 3.
    print('%s: saving %s' % (save_time, self.url))
    self.logger.info('end func: __init__')
def extract_url(self, response):
    """Harvest links from *response* and queue unseen, non-banned ones in Redis.

    Builds two batches of GET lookups on ``response.pipe`` — one per
    candidate URL ('been_url:' seen-markers) followed by one per domain
    ('ban_host:' ban-markers) — executes them in a single round trip, then
    LPUSHes every URL that is neither already seen nor on a banned host.
    Returns True on success, None when there was nothing to do or the
    pipeline reply count did not match the number of queued lookups.
    """
    if len(response.all_url) > 0:
        get_domain_list = []
        get_url_list = []
        for url in response.all_url:
            if not url:
                continue
            # Skip links to static assets, matched on the last 4 characters.
            end_fix = url[-4:len(url)]
            if '.jpg.png.gif.rar.zip.doc.pdf.css'.find(end_fix) != -1:
                continue
            opt = URL(url)
            url_domain = opt.domain()
            if not url_domain:
                # Relative link: resolve against the page it came from.
                url = response.page_prefix + '/' + url
                url_domain = response.page_domain
            elif not opt.scheme():
                # Absolute but scheme-less link: assume plain http.
                url = 'http://' + url
            # Only follow links within the eastmoney domain family.
            if url_domain.find('eastmoney') == -1:
                continue
            # Queue a "have we seen this URL?" lookup in the pipeline.
            response.pipe.get(response.spider_name + 'been_url:' + url)
            get_domain_list.append(url_domain)
            get_url_list.append(url)
        # Queue the "is this host banned?" lookups after all URL lookups,
        # so the replies come back as [seen-flags..., ban-flags...].
        for url_domain in get_domain_list:
            response.pipe.get(response.spider_name + 'ban_host:' + url_domain)
        get_urlex_dmexp_list = response.pipe.execute()
        adv_len = len(get_url_list)
        # Bail out unless the reply count matches the queued lookups exactly.
        if len(get_urlex_dmexp_list) == 0 or len(
                get_urlex_dmexp_list) != adv_len + len(get_domain_list):
            return
        for index in range(len(get_url_list)):
            url = get_url_list[index]
            exist_flag = get_urlex_dmexp_list[index]
            if exist_flag:
                continue
            # Ban flags occupy the second half of the reply list.
            is_ban_host = get_urlex_dmexp_list[index + adv_len]
            if is_ban_host:
                continue
            response.pipe.lpush(self.redis_key, url)
        response.pipe.execute()
        return True
class Segments(object):
    """
    URL segment handler, not intended for direct use.

    The URL is constructed by joining base, path and segments.
    """

    def __init__(self, base, path, segments, defaults):
        # Keep the base URL (with its path) around for later joins.
        self.base = PURL(base, path=path)
        # Ordered mapping of segment name -> current value (defaults first).
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        """Return a PURL whose path is the base path plus the managed segments."""
        combined = self.base.path_segments() + tuple(self.segments.values())
        return self.base.path_segments(combined)

    def full_path(self):
        """Return the built URL as a string with scheme and host stripped."""
        rendered = self.build().as_string()
        for piece in (self.base.host(), self.base.scheme()):
            rendered = rendered.replace(piece, '')
        # Skip the residual ':///' left behind after stripping scheme/host.
        return rendered[4:]

    def __str__(self):
        return self.build().as_string()

    def _get_segment(self, segment):
        return self.segments[segment]

    def _set_segment(self, segment, value):
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """

        def getter(instance):
            return cls._get_segment(instance, segment)

        def setter(instance, value):
            cls._set_segment(instance, segment, value)

        return property(fget=getter, fset=setter)
def maybe_external_link(text, **kw):
    """Render *text* as an external link when it is an http(s) URL with a
    host; otherwise return it unchanged."""
    parsed = URL(text)
    is_web_url = bool(parsed.host()) and parsed.scheme() in ['http', 'https']
    return external_link(text, **kw) if is_web_url else text
# Do some sanity checks on the config requiredAttribs = [ 'serviceName', 'package', 'components', 'configurations' ] for attrib in requiredAttribs: if not attrib in service_config: log.error("Invalid configuration. Missing required attribute '%s'", attrib) sys.exit(3) log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host) ambari_host_uri = URL(args.ambari_host) ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps') # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses ambari_client.client.request_params['hooks'] = dict( response=shared_lib.Fixup(ambari_host_uri).fixup) # Assume we only have 1 cluster managed by this Ambari installation cluster = ambari_client.clusters.next() log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href) # Pull in any extra dynamic configuration if args.extra_config: try: extra_config = json.loads(args.extra_config) log.debug(
service_config = config_request.json() log.debug('Service config: %s', service_config) except: log.error("Invalid configuration URI", exc_info=True) sys.exit(2) # Do some sanity checks on the config requiredAttribs = ['serviceName', 'package', 'components', 'configurations'] for attrib in requiredAttribs: if not attrib in service_config: log.error("Invalid configuration. Missing required attribute '%s'", attrib) sys.exit(3) log.info('Installing service: %s on ambari host: %s', service_config['serviceName'], args.ambari_host) ambari_host_uri = URL(args.ambari_host) ambari_client = Ambari(ambari_host_uri.host(), port=ambari_host_uri.port(), protocol=ambari_host_uri.scheme(), username=args.username, password=args.password, identifier='hdiapps') # If this is being invoked from outside the cluster, we must fixup the href references contained within the responses ambari_client.client.request_params['hooks'] = dict(response=shared_lib.Fixup(ambari_host_uri).fixup) # Assume we only have 1 cluster managed by this Ambari installation cluster = ambari_client.clusters.next() log.debug('Cluster: %s, href: %s', cluster.cluster_name, cluster._href) # Pull in any extra dynamic configuration if args.extra_config: try: extra_config = json.loads(args.extra_config) log.debug('Applying dynamic service configuration specified on command-line: %s', extra_config) except: log.warning('Extra configuration specified by the -x argument could not be parsed as JSON. The value was \'%s\'. Details: ', args.extra_config, exc_info=True) extra_config = {} else:
def wikipedia_url(s):  # pragma: no cover
    """Return *s* unchanged when it is an http(s) Wikipedia URL; else None."""
    parsed = URL(s)
    if parsed.scheme() not in ('http', 'https'):
        return None
    if 'wikipedia.' in parsed.host():
        return s
# Demonstration of the purl URL API.

# A URL built by parsing a full string.
str_url = URL('https://www.google.com/search?q=google')
print(str_url)
print(str_url.as_string())

# The same URL assembled from keyword arguments.
argument_url = URL(scheme='https', host='www.google.com',
                   path='/search', query='q=google')
print(argument_url)
print(argument_url.as_string())

# And once more via the fluent (immutable) builder interface.
inline_url = (URL()
              .scheme('https')
              .domain('www.google.com')
              .path('search')
              .query_param('q', 'google'))
print(inline_url)
print(inline_url.as_string())

# Accessors on a parsed URL carrying credentials, port, path and query.
u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
for accessor in (u.scheme, u.host, u.domain, u.username, u.password,
                 u.netloc, u.port, u.path, u.query, u.path_segments):
    print(accessor())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())