def __init__(self, delay=5):
    """Load the keyword list and queue one search URL per CSV row.

    Reads ``keywords.csv`` from the current working directory. Columns 0, 1
    and 2 of every row are used as the ``kw``, ``city`` and ``state`` query
    parameters of a ``https://www.manta.com/api/v1/location`` URL.

    Each generated URL is appended to ``self.crawl_urls`` and mapped back to
    its originating CSV row in ``self.queries``.

    :param delay: Passed through unchanged to the parent constructor.
    """
    super(SearchUrlCrawler, self).__init__(delay=delay)
    keywords_path = os.path.realpath(
        os.path.join(os.getcwd(), u'keywords.csv'))
    with codecs.open(keywords_path, u'rb', u'utf-8') as keywords_file:
        records = csv.reader(keywords_file, delimiter=',', quotechar='"')
        self.queries = OrderedDict()
        for record in records:
            # Insertion order fixes the parameter order in the query string.
            search_terms = OrderedDict()
            search_terms[u'city'] = record[1]
            search_terms[u'state'] = record[2]
            search_terms[u'kw'] = record[0]
            search_url = ParseResult(
                scheme=u'https',
                netloc=u'www.manta.com',
                path=u'/api/v1/location',
                params=u'',
                query=urlencode(search_terms),
                fragment=u'',
            ).geturl()
            self.queries[search_url] = record
            self.crawl_urls.append(search_url)
def _url_and_destination(self, base_url, unit):
    """
    Work out where a unit is downloaded from and where it is stored.

    Units carrying a tarball path are fetched from that path joined onto
    *base_url* and stored next to the unit's storage path under the
    tarball's file name. All other units are fetched from the content
    endpoint of the same server and stored at the unit's storage path.

    :param base_url: The base URL.
    :type base_url: str
    :param unit: A content unit.
    :type unit: dict
    :return: (url, destination)
    :rtype: tuple(2)
    """
    storage_path = unit[constants.STORAGE_PATH]
    tar_path = unit.get(constants.TARBALL_PATH)
    if tar_path:
        tarball_url = pathlib.url_join(base_url, pathlib.quote(tar_path))
        tarball_destination = pathlib.join(
            os.path.dirname(storage_path), os.path.basename(tar_path))
        return tarball_url, tarball_destination

    # The pulp/nodes/content endpoint provides all content.
    # This replaced the publishing of individual links for each unit.
    parsed = urlparse(base_url)
    relative_path = unit[constants.RELATIVE_PATH]
    content_path = pathlib.join(
        constants.CONTENT_PATH, pathlib.quote(relative_path))
    # Only the path changes; every other URL component is carried over.
    content_url = parsed._replace(path=content_path)
    return content_url.geturl(), storage_path
def build_url(host, scheme=None, port=None):
    """
    Build a valid URL. IPv6 addresses specified in host will be enclosed in
    brackets automatically.

    >>> build_url('example.com', 'https', 443)
    'https://example.com:443'
    >>> build_url(host='example.com', port=443)
    '//example.com:443'
    >>> build_url('fce:9af7:a667:7286:4917:b8d3:34df:8373', port=80, scheme='http')
    'http://[fce:9af7:a667:7286:4917:b8d3:34df:8373]:80'

    :param scheme: The scheme, e.g. http, https or ftp. ``None`` (the
        default) yields a scheme-relative URL starting with ``//``.
    :type scheme: str
    :param host: Consisting of either a registered name (including but not
        limited to a hostname) or an IP address.
    :type host: str
    :param port: Optional port; omitted from the URL when falsy.
    :type port: int
    :rtype: str
    """
    netloc = '[{}]'.format(host) if is_valid_ipv6_address(host) else host
    if port:
        netloc += ':{}'.format(port)
    # On Python 3, geturl() raises TypeError when a None component is mixed
    # with str components, so a missing scheme is normalised to ''. The
    # resulting URL is the same scheme-relative form on Python 2.
    pr = ParseResult(scheme=scheme or '', netloc=netloc, path='',
                     params='', query='', fragment='')
    return pr.geturl()
def url_for_first_service_with_name(self, name, scheme=None):
    """Return a connection URL for the first service called *name*.

    The credentials are looked up through ``self.credentials_locator``. When
    they already contain a ``uri`` entry it is returned as-is; otherwise the
    URL is assembled from the ``username``, ``password``, ``hostname`` and
    ``port`` entries.

    :param name: Service name handed to the credentials locator.
    :param scheme: URL scheme to use; a falsy value yields a
        scheme-relative URL (``//host``).
    :return: The URL string, or ``None`` when no credentials are found.
    """
    locator = self.credentials_locator
    credentials = locator.find_credentials_for_first_service_with_name(name)
    if not credentials:
        return None
    if 'uri' in credentials:
        return credentials['uri']

    scheme = scheme or ''
    username = credentials.get('username')
    hostname = credentials.get('hostname')
    password = credentials.get('password')

    userinfo = ''
    if username or password:
        # The format string must contain two placeholders: the previous
        # literal had none, so the % operator raised TypeError whenever
        # credentials carried a username or password.
        userinfo = '%s:%s@' % (username or '', password or '')

    port = credentials.get('port')
    port_suffix = ':%s' % port if port else ''
    netloc = '%s%s%s' % (userinfo, hostname, port_suffix)

    parse_result = ParseResult(scheme=scheme, netloc=netloc, path='',
                               params='', query='', fragment='')
    return parse_result.geturl()
def url_build(scheme='http', netloc='', path='', params='', query='',
              fragment=''):
    """Assemble a URL whose host is the application's ``SERVER_NAME``.

    Note that the ``netloc`` argument is accepted but always replaced by
    ``app.config['SERVER_NAME']``; all other components are used as given.

    :return: The assembled URL string.
    """
    with app.app_context():
        server_name = app.config['SERVER_NAME']
    return ParseResult(
        scheme=scheme,
        netloc=server_name,
        path=path,
        params=params,
        query=query,
        fragment=fragment,
    ).geturl()
def pre_resolve_request(self, request):
    """Rewrite ``request.url`` to target a pre-resolved IP address.

    The hostname is resolved through ``CachingResolver``; the original
    hostname is preserved in the ``Host`` header so the server still sees
    the name the client asked for.

    :param request: Object exposing ``url`` and a mutable ``headers``
        mapping; it is modified in place.
    :return: The same *request* object.
    """
    uri = urlparse(request.url)
    # Pick the default port from the scheme *before* resolving, so that an
    # explicit port in an https URL (e.g. https://host:8443/) is honoured
    # instead of being forced to 443 afterwards.
    default_port = 443 if uri.scheme == "https" else 80
    ip, port = CachingResolver.get(uri.hostname, uri.port or default_port)
    request.headers["Host"] = uri.hostname
    pr = ParseResult(uri.scheme, "%s:%s" % (ip, port), uri.path, uri.params,
                     uri.query, uri.fragment)
    request.url = pr.geturl()
    return request
def __init__(self, delay=5):
    """Load the listing URLs and queue one URL per result page.

    Reads ``url_list_pagenum.csv`` from the current working directory.
    Column 0 of each row is a listing URL and column 1 its page count; for
    every page ``1..count`` a copy of the URL with ``pg`` set to that page is
    appended to ``self.crawl_urls`` and mapped to the row in
    ``self.queries``.

    :param delay: Passed through unchanged to the parent constructor.
    """
    super(DetailListCrawler, self).__init__(delay=delay)
    list_path = os.path.realpath(
        os.path.join(os.getcwd(), u'url_list_pagenum.csv'))
    with codecs.open(list_path, u'rb', u'utf-8') as list_file:
        records = csv.reader(list_file, delimiter=',', quotechar='"')
        self.queries = OrderedDict()
        for record in records:
            listing = urlparse(record[0])
            page_query = OrderedDict(parse_qsl(listing.query))
            page_count = int(record[1])
            for page_number in xrange(1, page_count + 1):
                page_query[u'pg'] = page_number
                # Params and fragment of the listing URL are dropped.
                page_url = ParseResult(
                    scheme=listing.scheme,
                    netloc=listing.netloc,
                    path=listing.path,
                    params=u'',
                    query=urlencode(page_query),
                    fragment=u'',
                ).geturl()
                self.queries[page_url] = record
                self.crawl_urls.append(page_url)
def download_key_http(self, address, port):
    """Fetch ``/`` from the given address and port over plain HTTP.

    :param address: Host address inserted into the URL's network location.
    :param port: TCP port (formatted with ``%d``).
    :return: The response body as text.
    :raises requests.exceptions.Timeout: when the server does not answer
        within five seconds.
    """
    url = ParseResult(
        scheme='http',
        # This seems to work well enough with both IPv6 and IPv4
        netloc="[[%s]]:%d" % (address, port),
        path='/',
        params='',
        query='',
        fragment='')
    # Without a timeout requests waits indefinitely on an unresponsive
    # peer; bound the wait so the caller cannot hang forever.
    return requests.get(url.geturl(), timeout=5).text
def download_key_http(self, address, port):
    """Download the document served at ``/`` on *address*:*port* via HTTP.

    :param address: Host address inserted into the URL's network location.
    :param port: TCP port (formatted with ``%d``).
    :return: The response body as text.
    :raises requests.exceptions.Timeout: when the server does not answer
        within five seconds.
    """
    url = ParseResult(
        scheme='http',
        # This seems to work well enough with both IPv6 and IPv4
        netloc="[[%s]]:%d" % (address, port),
        path='/',
        params='',
        query='',
        fragment='')
    # A request without a timeout can block forever if the peer never
    # replies, so the wait is capped.
    response = requests.get(url.geturl(), timeout=5)
    return response.text
def uriWithoutSuffix(self):
    """Return the parsed URI as a string without its params and fragment.

    Scheme, network location, path and query are kept. When
    ``self._doUnquote`` is set, the components are passed through
    ``quoteParseResults`` before the URL is assembled.
    """
    from urlparse import ParseResult
    source = self._parsedUri
    trimmed = ParseResult(scheme=source.scheme,
                          netloc=source.netloc,
                          path=source.path,
                          params='',
                          query=source.query,
                          fragment='')
    if self._doUnquote:
        trimmed = quoteParseResults(trimmed)
    return trimmed.geturl()
def authenticate(self, request):
    """
    Check the request for authenticated user. If user is not authenticated
    then redirect user to login view.

    The redirect target is taken from the ``next`` GET parameter, falling
    back to the HTTP referer and then to the service's default URL. The
    target's query string is extended with either a signed ticket (user is
    authenticated) or an ``ack`` marker (user is not).

    :param request: The incoming HTTP request.
    :return: ``HttpResponseForbidden`` when the service cannot be
        determined, otherwise an ``HttpResponseRedirect`` to the target.
    """
    next_url = request.GET.get("next", None)

    # Get the service name from request
    service = self._get_service(request)
    if not service:
        return HttpResponseForbidden("Invalid service")

    validator = DefaultValidation(service.key)

    try:
        next_url = urlparse(urllib.unquote(next_url).decode("utf8"))
    except AttributeError:
        # No usable "next" parameter was supplied: fall back to the referer.
        if "HTTP_REFERER" in request.META:
            # Read the same key that was just tested; the previous lookup
            # of "REFERER" raised KeyError on this branch.
            next_url = urlparse(request.META["HTTP_REFERER"])
        else:
            next_url = urlparse(service.default_url)

    # Retrieve the referer GET parameters and make a new one
    params = dict(parse_qsl(next_url[4]))

    # Was the user authenticated before?
    if request.user.is_authenticated():
        logger.debug("User is authenticated.")
        # If user is authenticated in Daarmaan then a ticket
        # (user session ID) will send back to service
        # IMPORTANT: is using session id of daarmaan as ticket ok?
        ticket = request.session.session_key
        logger.debug("[TICKET]: %s" % ticket)
        params.update({'ticket': ticket,
                       "hash": validator.sign(ticket)})
    else:
        # If user is not authenticated simple ack answer will return
        logger.debug("User is NOT authenticated.")
        params.update({"ack": " "})

    next_url = ParseResult(next_url[0], next_url[1], next_url[2],
                           next_url[3], urllib.urlencode(params),
                           next_url[5])
    next_url = next_url.geturl()
    return HttpResponseRedirect(next_url)
def __call__(self, request):
    """Append ``self.token`` to the path of the request URL.

    Every other URL component is left untouched. The request is modified in
    place and returned.
    """
    parsed = urlparse(request.url)
    path_with_token = join(parsed.path, self.token)
    request.url = parsed._replace(path=path_with_token).geturl()
    return request
def should_follow(self, response, spider):
    """Tell whether the response URL is absent from the spider's deny list.

    The comparison ignores the query string and fragment: only scheme,
    network location, path and path parameters are matched against
    ``spider.disallow_urls``.

    :return: ``False`` when the stripped URL is disallowed, else ``True``.
    """
    parts = urlparse(response.url)
    stripped_url = ParseResult(
        scheme=parts.scheme,
        netloc=parts.netloc,
        path=parts.path,
        params=parts.params,
        query=None,
        fragment=None,
    ).geturl()
    if stripped_url in spider.disallow_urls:
        return False
    return True
def download_key_http(self, address, port):
    """Fetch ``/`` from *address*:*port* over HTTP and return the raw body.

    The request uses a five second timeout; start and completion are logged
    at debug level.

    :return: The response content (bytes as delivered by ``requests``).
    """
    # This seems to work well enough with both IPv6 and IPv4
    netloc = "[[%s]]:%d" % (address, port)
    key_url = ParseResult(scheme='http', netloc=netloc, path='/',
                          params='', query='', fragment='')
    self.log.debug("Starting HTTP request")
    response = requests.get(key_url.geturl(), timeout=5)
    data = response.content
    self.log.debug("finished downloading %d bytes", len(data))
    return data
def download_key_http(self, address, port):
    """Download the raw document served at ``/`` on *address*:*port*.

    A five second timeout applies to the request. Debug log entries mark
    the start of the request and the number of bytes received.

    :return: The response content (bytes as delivered by ``requests``).
    """
    target = ParseResult(
        'http',
        # This seems to work well enough with both IPv6 and IPv4
        "[[%s]]:%d" % (address, port),
        '/',
        '',
        '',
        '',
    )
    self.log.debug("Starting HTTP request")
    payload = requests.get(target.geturl(), timeout=5).content
    self.log.debug("finished downloading %d bytes", len(payload))
    return payload
def handle_authcode(request, client, redirection_uri, state=None):
    """Issue an authorization code and redirect back to the client.

    A new ``Oauth2Code`` is stored for the authenticated user, then its
    ``authcode`` (and *state*, when truthy) is merged into the query string
    of the redirection URI. Any fragment on the redirection URI is dropped.

    :return: An ``HTTPFound`` response pointing at the extended URI.
    """
    target = urlparse(redirection_uri.uri)
    query = dict(parse_qsl(target.query))

    user_id = authenticated_userid(request)
    auth_code = Oauth2Code(client, user_id, redirection_uri)
    db.add(auth_code)
    db.flush()  # flushed before the code value is read back

    query["code"] = auth_code.authcode
    if state:
        query["state"] = state

    location = target._replace(query=urlencode(query), fragment="").geturl()
    return HTTPFound(location=location)
def url_add_query(url, **kw):
    """
    Return *url* with the keyword arguments added to its query string.

    The new parameters are URL-encoded and placed in front of any query
    string the URL already has; all other URL components are preserved.
    When no keyword arguments are given the URL's query is left as it is.

    :param url: The URL to extend.
    :param kw: Query parameters to add.
    :return: The resulting URL string.
    """
    u = urlparse(url)
    added_query = urllib.urlencode(kw)
    # Join only the non-empty parts so that an empty addition does not
    # leave a stray leading '&' (previously '?x=1' became '?&x=1').
    query = '&'.join(part for part in (added_query, u.query) if part)
    p = ParseResult(u.scheme, u.netloc, u.path, u.params, query, u.fragment)
    return p.geturl()
def get_token_url(code):
    """Build the WeChat URL that exchanges *code* for an access token.

    The query carries the configured app id and secret, the given code and
    ``grant_type=authorization_code``.

    :param code: Authorization code received from the OAuth redirect.
    :return: The ``https://api.weixin.qq.com/sns/oauth2/access_token`` URL.
    """
    # The former deep copy of the request arguments was never used and only
    # made this helper fail outside a request context; it has been removed.
    qs = urlencode({
        'appid': config.WECHAT_APP_ID,
        'secret': config.WECHAT_APP_SECRET,
        'code': code,
        'grant_type': 'authorization_code',
    })
    # NOTE(review): the '#wechat_redirect' fragment looks carried over from
    # the authorize URL; it is kept so the returned string is unchanged.
    o = ParseResult(scheme='https',
                    netloc='api.weixin.qq.com',
                    path='/sns/oauth2/access_token',
                    params='',
                    query=qs,
                    fragment='wechat_redirect')
    return o.geturl()
def handle_authcode(request, client, redirection_uri, state=None):
    """Create an authorization code and redirect to the client's URI.

    Stores a new ``Oauth2Code`` for the authenticated user and adds its
    ``authcode`` (plus *state*, when truthy) to the query parameters of the
    redirection URI. The fragment of the redirection URI is discarded.

    :return: An ``HTTPFound`` response pointing at the extended URI.
    """
    redirect = urlparse(redirection_uri.uri)
    query_params = dict(parse_qsl(redirect.query))

    auth_code = Oauth2Code(client, authenticated_userid(request))
    db.add(auth_code)
    db.flush()  # flushed before the code value is read back

    query_params['code'] = auth_code.authcode
    if state:
        query_params['state'] = state

    redirect = ParseResult(
        scheme=redirect.scheme,
        netloc=redirect.netloc,
        path=redirect.path,
        params=redirect.params,
        query=urlencode(query_params),
        fragment='',
    )
    return HTTPFound(location=redirect.geturl())
def handle_implicit(request, client, redirection_uri, state=None):
    """Issue an access token (implicit grant) and redirect to the client.

    A new ``Oauth2Token`` is stored for the authenticated user and returned
    to the client in the *fragment* of the redirection URI together with
    ``token_type=bearer`` and ``expires_in``. The query string of the
    redirection URI is dropped.

    :param state: Opaque client value; echoed back only when truthy.
    :return: An ``HTTPFound`` response pointing at the redirection URI.
    """
    parts = urlparse(redirection_uri.uri)
    # Start from an empty mapping: seeding it with state=None made
    # urlencode() emit the literal text "state=None" whenever the client
    # had not supplied a state value.
    fparams = {}
    user_id = authenticated_userid(request)
    token = Oauth2Token(client, user_id)
    db.add(token)
    db.flush()
    fparams["access_token"] = token.access_token
    fparams["token_type"] = "bearer"
    fparams["expires_in"] = token.expires_in
    if state:
        fparams["state"] = state
    parts = ParseResult(parts.scheme, parts.netloc, parts.path, parts.params,
                        "", urlencode(fparams))
    return HTTPFound(location=parts.geturl())
def get_oauth_url(endpoint, state):
    """Build the WeChat OAuth authorize URL that redirects to *endpoint*.

    The redirect target is the external URL of *endpoint*, generated with
    the current request's query arguments updated by its view arguments.

    :param endpoint: Endpoint name passed to ``url_for``.
    :param state: Value sent as the ``state`` query parameter.
    :return: The ``https://open.weixin.qq.com/connect/oauth2/authorize`` URL
        ending in ``#wechat_redirect``.
    """
    view_kwargs = deepcopy(request.args.to_dict())
    view_kwargs.update(request.view_args)
    redirect_uri = url_for(endpoint, _external=True, **view_kwargs)
    query = urlencode({
        'appid': config.WECHAT_APP_ID,
        'redirect_uri': redirect_uri,
        'scope': 'snsapi_userinfo',
        'state': state,
    })
    return ParseResult(
        scheme='https',
        netloc='open.weixin.qq.com',
        path='/connect/oauth2/authorize',
        params='',
        query=query,
        fragment='wechat_redirect',
    ).geturl()
def geturl(self, include_params=True):
    """Return this object's URL as a string.

    :param include_params: When false, the path parameters, query string and
        fragment are all left out of the result.
    """
    tail = (self.params, self.query, self.fragment)
    if not include_params:
        tail = ("", "", "")
    return ParseResult(self.scheme, self.netloc, self.path, *tail).geturl()
def rel_to_abs(start_path, relative_url):
    """Append a relative URL to the path of an absolute URL.

    The path of *start_path* is split on ``/``, empty segments are dropped
    and *relative_url* is appended as the final segment. Scheme, network
    location, params, query and fragment are copied from *start_path*.

    params:
        start_path - the absolute path from which the relative url is
                     being accessed
        relative_url - the relative url on the page
    returns:
        the combined URL as a string
    """
    parsed_start_url = urlparse(start_path)
    # A list comprehension (rather than filter()) keeps this a real list on
    # Python 3, where filter() returns an iterator that cannot be extended.
    path_items = [item for item in parsed_start_url.path.split('/') if item]
    path_items.append(relative_url)
    new_path = '/'.join(path_items)
    parsed_abs_url = ParseResult(scheme=parsed_start_url.scheme,
                                 netloc=parsed_start_url.netloc,
                                 path=new_path,
                                 params=parsed_start_url.params,
                                 query=parsed_start_url.query,
                                 fragment=parsed_start_url.fragment)
    return parsed_abs_url.geturl()
def replace_netloc(self, netloc):
    u"""
    Swap the network location (host and optional port) of ``self.url``.

    All other URL components are kept; ``self.url`` is updated in place.

    **Example usage**

    >>> import copy
    >>> from .utils_test import CALLBACK_TEST
    >>> callback = copy.copy(CALLBACK_TEST)
    >>> callback.is_valid(True)
    True
    >>> print(callback.url)
    http://127.0.0.1/media
    >>> callback.replace_netloc(u'129.194.185.47:5003')
    >>> print(callback.url)
    http://129.194.185.47:5003/media
    """
    current = urlparse(self.url)
    self.url = current._replace(netloc=netloc).geturl()
def _populate_files(self, file_dict, f_index, shared_hashed_id):
    """
    From the dictionary file_dict, generate a list of transfers for a job

    Every source in ``file_dict['sources']`` is paired with every
    destination in ``file_dict['destinations']``; one dict per pair is
    appended to ``self.files``.

    :param file_dict: Mapping with ``sources`` and ``destinations`` lists
        and optional ``activity``, ``filesize``, ``selection_strategy``,
        ``checksum`` and ``metadata`` entries.
    :param f_index: Value stored as ``file_index`` on every generated entry.
    :param shared_hashed_id: Hashed id to reuse for all entries, or ``None``.
    """
    # Extract matching pairs
    pairs = []
    for source in file_dict['sources']:
        source_url = urlparse(source.strip())
        _validate_url(source_url)
        for destination in file_dict['destinations']:
            dest_url = urlparse(destination.strip())
            _validate_url(dest_url)
            pairs.append((source_url, dest_url))

    # Create one File entry per matching pair
    if self.is_bringonline:
        initial_file_state = 'STAGING'
    else:
        initial_file_state = 'SUBMITTED'

    # Multiple replica job or multihop? Then, the initial state is NOT_USED
    if len(file_dict['sources']) > 1 or self.params['multihop']:
        #if self.is_bringonline:
            #set the first as STAGING and the rest as 'NOT_USED'
            #staging_and_multihop = True
            #raise HTTPBadRequest('Staging with multiple replicas is not allowed')
        # On multiple replica job, we mark all files initially with NOT_USED
        initial_file_state = 'NOT_USED'
        # Multiple replicas, all must share the hashed-id
        if shared_hashed_id is None:
            shared_hashed_id = _generate_hashed_id()
    # Only the first VO of the user is consulted for the UUID setting.
    vo_name = self.user.vos[0]

    for source, destination in pairs:
        # A destination UUID is only derived for single-source entries and
        # only when the helper reports it enabled for this VO.
        if len(file_dict['sources']) > 1 or not _is_dest_surl_uuid_enabled(
                vo_name):
            dest_uuid = None
        else:
            dest_uuid = str(
                uuid.uuid5(BASE_ID, destination.geturl().encode('utf-8')))
        if self.is_bringonline:
            # add the new query parameter only for root -> EOS-CTA for now
            if source.scheme == "root":
                # Append the activity to the existing query parameters and
                # rebuild the source URL with the new query string.
                query_p = parse_qsl(source.query)
                query_p.append(
                    ('activity', file_dict.get('activity', 'default')))
                query_str = urlencode(query_p)
                source = ParseResult(scheme=source.scheme,
                                     netloc=source.netloc,
                                     path=source.path,
                                     params=source.params,
                                     query=query_str,
                                     fragment=source.fragment)

        f = dict(job_id=self.job_id,
                 file_index=f_index,
                 dest_surl_uuid=dest_uuid,
                 file_state=initial_file_state,
                 source_surl=source.geturl(),
                 dest_surl=destination.geturl(),
                 source_se=get_storage_element(source),
                 dest_se=get_storage_element(destination),
                 # NOTE(review): left as None here even though vo_name is
                 # known above; presumably filled in later - confirm.
                 vo_name=None,
                 priority=self.job['priority'],
                 user_filesize=_safe_filesize(file_dict.get('filesize', 0)),
                 selection_strategy=file_dict.get('selection_strategy', 'auto'),
                 checksum=file_dict.get('checksum', None),
                 file_metadata=file_dict.get('metadata', None),
                 activity=file_dict.get('activity', 'default'),
                 # Without a shared id every entry gets its own hashed id.
                 hashed_id=shared_hashed_id if shared_hashed_id else _generate_hashed_id())
        self.files.append(f)
class RedirectURI(Validatable):
    """
    Wraps the ``redirect_uri`` parameter of an authorization request.

    Holds both the raw string and its parsed form, can validate the URI and
    can merge additional query parameters into it.
    """

    def __init__(self, uri, settings):
        """Parse *uri* and pick the redirect-URI error messages from *settings*."""
        self.raw_uri = uri
        self.parsed_uri = urlparse(uri)
        self.error_message = None
        self.settings = settings
        self.error_responses = settings['error_responses']['redirect_uri']

    # used in super class is_valid
    def validate(self):
        """Set ``error_message`` to the configured text for the first error, or ``None``."""
        error_key = self.determine_errors()
        self.error_message = self.error_responses.get(error_key, None)

    def determine_errors(self):
        """Return ``'invalid'``, ``'not_absolute'`` or ``None`` when the URI is fine."""
        if not self.is_permitted_site():
            return 'invalid'
        if self.is_absolute():
            return None
        return 'not_absolute'

    def add_params(self, params_dict):
        """Merge *params_dict* into the URI's query string.

        Parameters already present in the URI win over entries of
        *params_dict* with the same name.
        """
        merged = dict(params_dict, **self.parsed_query())
        # Swap in the new query string, keeping every other component.
        self.parsed_uri = self.parsed_uri._replace(query=urlencode(merged))

    # NOTE this will truncate query params that are used more than once
    # generally not a good idea
    def parsed_query(self):
        """Return the URI's query parameters as a dict of comma-joined values."""
        parsed = parse_qs(self.parsed_uri.query)
        for key, values in parsed.iteritems():
            if len(values) >= 1:
                parsed[key] = join(values, ",")
        return parsed

    def is_absolute(self):
        """Tell whether the URI has both a scheme and a network location."""
        uri = self.parsed_uri
        return not (uri.scheme == "" or uri.netloc == "")

    def is_permitted_site(self):
        """Always ``True``: the redirect site is currently not restricted."""
        # the oauth 2 spec recommends validating the redirect uri against
        # a pre defined uri to prevent an open redirect; for the sake of
        # simplicity we've chosen to forgo that validation
        #
        # return self.raw_uri.startswith(self.settings['redirect_site'])
        return True

    def get_url(self):
        """Return the (possibly extended) URI as a string."""
        return self.parsed_uri.geturl()
# main(argv): rewrite resource URLs so that their scheme, server address and
# path prefix match the download server given in ``argv.new_dls``; when any
# URL of a mediapackage changed, the mediapackage is uploaded again through
# ``curl(...)``. The pycurl handle and the StringIO buffer are closed in the
# ``finally`` clause.
# NOTE(review): parts of this definition appear to have been masked (the
# "******" runs), so the statements between the credential prompts and the
# URL-rewriting loop are not visible here and the text below is not valid
# Python as it stands - restore the original from version control before
# changing any logic.
def main(argv=None): c = pycurl.Curl() b = StringIO() # Filters out the server address dls = urlparse(argv.new_dls) if not argv.user: setattr(argv, "user", raw_input("Enter the digest authentication user: "******"password", getpass.getpass("Enter the digest authentication password: "******"Excluding URL {}\n".format(url.text) continue # Check whether the URL protocol or server address are correct (i.e. match the download server)... if parsed.scheme != dls.scheme or parsed.netloc != dls.netloc: # Change the fields corresponding the server name and the protocol aux[0] = dls.scheme aux[1] = dls.netloc # Check whether the download path is correct if path.commonprefix([parsed.path, dls.path]) != dls.path: # Extract the download server "mountpoint" # Matterhorn resource URLs in distributed mediapackage take the form: # distribution-channel/mediapackage-id/element-id/filename.extension # , therefore, anything that is beyond these four levels in the hierarchy # is a part of the download server "mountpoint" for i in range(4): aux[2] = path.dirname(aux[2]) # Remove the "mountpoint" from the resource's path and add the final "mountpoint" from the server aux[2] = path.join(dls.path, path.relpath(parsed.path, aux[2])) new_url = ParseResult(*aux) if new_url != parsed: url.text = new_url.geturl() print "In: {}\nOut: {}\n".format( parsed.geturl(), url.text) if not modified: modified = True else: print "URL {} NOT modified".format(url.text) # Overwrite the mediapackage in the index if modified: # Upload the mediapackage back to the search index (overwriting the old version) curl(etree.tostring(mp, encoding="UTF-8"), argv.search_url, argv.add_endpoint, argv.user, argv.password) else: pass else: print b.getvalue() except pycurl.error as err: raise RuntimeError(c.errstr()) except Exception as exc: print type(exc), exc raise finally: c.close() b.close()
def parts_to_url(scheme, netloc, path, params, query, fragment):
    """Assemble the six standard URL components into a URL string."""
    components = (scheme, netloc, path, params, query, fragment)
    return ParseResult(*components).geturl()