def extract_links_from_a_href(self, url, href):
    """
    scheme should be in [http, https]
    Links can be:
    1. full urls (e.g. http://google.com), or
    2. references to local items (e.g. /docs), which should translate to
       http://python.org/docs, or
    3. permalinks to items on the same page (e.g. #downloads), which are
       left alone here.
    """
    valid_schemes = ['http', 'https']
    url = url.strip('/')
    href = href.strip('/')
    parsed_href = urlparse.urlsplit(href)
    skip_base = False
    parsed_base_url = urlparse.urlsplit(url)
    temp_url = ''
    valid_urls = []
    if parsed_href.scheme:
        temp_url = href if (parsed_href.scheme in valid_schemes) else ''
        if parsed_href.netloc == parsed_base_url.netloc:
            # no need to include the base url in the results, it is already in the repo
            skip_base = True
    else:
        if parsed_href.path or parsed_href.query:
            temp_url = urlparse.urljoin(url, href)
            skip_base = True
    # at this point, we have something like
    # 'http://python.org/assets/docs/about.html#somediv'
    # temp_url is one valid url. now lets get some more
    if temp_url:
        valid_urls = self._find_isolated_resources(temp_url, skip_base)
    return valid_urls
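# Illustrative sketch (not part of the helper above): how urlsplit/urljoin
# distinguish the three link categories named in the docstring. Python 2 style
# to match the snippets; the example URLs are made up.
import urlparse

base = 'http://python.org'
assert urlparse.urlsplit('http://google.com').scheme == 'http'      # 1. full url
assert urlparse.urljoin(base, '/docs') == 'http://python.org/docs'  # 2. local reference
assert urlparse.urlsplit('#downloads').path == ''                   # 3. same-page permalink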
def find_pingback_urls(self, urls):
    """Find the pingback url of each url"""
    pingback_urls = {}

    for url in urls:
        try:
            page = urlopen(url)
            headers = page.info()

            if 'text/' not in headers.get('Content-Type', '').lower():
                continue

            server_url = headers.get('X-Pingback')
            if not server_url:
                server_url = self.find_pingback_href(page.read())

            if server_url:
                server_url_splitted = urlsplit(server_url)
                if not server_url_splitted.netloc:
                    url_splitted = urlsplit(url)
                    server_url = '%s://%s%s' % (url_splitted.scheme,
                                                url_splitted.netloc,
                                                server_url)
                pingback_urls[url] = server_url
        except IOError:
            pass
    return pingback_urls
def _update_link_prefix(self, orig_url, prefix):
    if not prefix:
        return orig_url
    url_parts = list(urlparse.urlsplit(orig_url))
    prefix_parts = list(urlparse.urlsplit(prefix))
    url_parts[0:2] = prefix_parts[0:2]
    return urlparse.urlunsplit(url_parts)
def get_suggestions(self, keywords, keyword_confidence):
    stackoverflow_query = keywords + " error stackoverflow"
    askubuntu_query = keywords + " error askubuntu"
    suggestions = []
    question_ids = []

    for url in search(stackoverflow_query, tld='es', lang='en', stop=5):
        hostname = urlparse.urlparse(url).hostname
        if hostname == "stackoverflow.com":
            path = urlparse.urlsplit(url).path
            pathx = str(path).split('/')
            question_ids.append(pathx[2])
    if len(question_ids) != 0:
        print "#DRAK : Fetched Stackoverflow Questions\n#DRAK : Fetching answers"
        suggestions.extend(self.so.get_suggestions(question_ids))
        print "#DRAK : Answers fetched successfully"

    question_ids = []
    for url in search(askubuntu_query, tld='es', lang='en', stop=5):
        hostname = urlparse.urlparse(url).hostname
        if hostname == "askubuntu.com":
            path = urlparse.urlsplit(url).path
            pathx = str(path).split('/')
            question_ids.append(pathx[2])
    if len(question_ids) != 0:
        print "#DRAK : Fetched AskUbuntu Questions\n#DRAK : Fetching answers"
        suggestions.extend(self.au.get_suggestions(question_ids))
        print "#DRAK : Answers fetched successfully"

    for suggestion in suggestions:
        suggestion.keyword_confidence = keyword_confidence
    return suggestions
def test_password_reset(self):
    """
    Tests the forgotten/reset password workflow.
    """
    c = Client()
    resp = c.get(reverse('password_reset'))
    self.assertEqual(resp.status_code, 200)

    resp = c.post(reverse('password_reset'), data={'email': '*****@*****.**'})
    self.assertEqual(resp.status_code, 302)
    self.assertEqual(len(mail.outbox), 1)

    token = resp.context[0]['token']
    uid = resp.context[0]['uid']

    # Grab the token and uidb64 so that we can hit the reset url
    resp = c.get(reverse('password_reset_confirm',
                         kwargs={'token': token, 'uidb64': uid}))
    self.assertEqual(resp.status_code, 200)
    self.assertTrue(resp.template_name.endswith('password_reset_confirm.html'))

    resp = c.post(reverse('password_reset_confirm',
                          kwargs={'token': token, 'uidb64': uid}),
                  {'new_password1': 'mynewpassword',
                   'new_password2': 'mynewpassword'})
    self.assertEqual(resp.status_code, 302)
    self.assertEqual(resolve(urlsplit(resp.url).path).url_name,
                     'password_reset_complete')

    resp = c.post(reverse('login'), {'username': '******', 'password': '******'})
    # User is returned to the login page on error vs redirected by default
    self.assertEqual(resp.status_code, 302)
    self.assertNotEqual(resolve(urlsplit(resp.url).path).url_name, 'login')
def prepare_files(self):
    if urlparse.urlsplit(self.inurl)[0] == 'file':
        self.infname = urllib.url2pathname(urlparse.urlsplit(self.inurl)[2])
        self.infd = open(self.infname)
    else:
        # not a file url. download it.
        source = urllib.urlopen(self.inurl)
        self.infd, self.infname = tempfile.mkstemp(prefix="transcode-in-",
                                                   suffix="." + self.inext)
        self._files_to_clean_up_on_success.append((self.infd, self.infname))
        self._files_to_clean_up_on_error.append((self.infd, self.infname))
        while True:
            chunk = source.read(1024 * 64)
            if not chunk:
                break
            os.write(self.infd, chunk)
        os.lseek(self.infd, 0, 0)

    self.outfd, self.outfname = tempfile.mkstemp(prefix="transcode-out-",
                                                 suffix="." + self.tofmt)
    self._files_to_clean_up_on_error.append((self.outfd, self.outfname))
    self.errfh, self.errfname = tempfile.mkstemp(prefix="transcode-",
                                                 suffix=".log")
    self.outurl = urlparse.urlunsplit(
        ["file", None, self.outfname, None, None])
    self._files_to_clean_up_on_success.append((self.errfh, self.errfname))

    log.debug("Reading from " + self.infname + " (" + self.inurl + ")")
    log.debug("Outputting to " + self.outfname + " (" + self.outurl + ")")
    log.debug("Errors to " + self.errfname)
def __init__(self, enc_password=""):
    if enc_password == "":
        print "MtGoxHMAC: Enter your API key file encryption password."
        enc_password = getpass.getpass()  # raw_input()
    try:
        f = open("./config/salt.txt", "r")
        salt = f.read()
        f.close()
        hash_pass = hashlib.sha256(enc_password + salt).digest()
        f = open("./config/api_key.txt")
        ciphertext = f.read()
        f.close()
        decryptor = AES.new(hash_pass, AES.MODE_CBC, ciphertext[:AES.block_size])
        plaintext = decryptor.decrypt(ciphertext[AES.block_size:])
        d = json.loads(plaintext)
        self.key = d["key"]
        self.secret = d["secret"]
    except:
        print "\n\n\nError: you may have entered an invalid password or the encrypted api key file doesn't exist"
        print "If you haven't yet generated the encrypted key file, run the encrypt_api_key.py script."
        while 1:
            pass

    self.buff = ""
    self.timeout = 15
    self.__url_parts = urlparse.urlsplit("https://mtgox.com/api/0/")
    self.__url_parts_1 = urlparse.urlsplit("https://mtgox.com/api/1/")
    self.clock_window = time.time()
    self.clock = time.time()
    self.query_count = 0
    self.query_limit_per_time_slice = 5
    self.query_time_slice = 10
def validate_next(self, field):
    if field.data:
        url_next = urlsplit(field.data)
        url_base = urlsplit(request.host_url)
        if url_next.netloc and url_next.netloc != url_base.netloc:
            field.data = ''
            raise ValidationError(get_message('INVALID_REDIRECT')[0])
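# Illustrative sketch (not from the original code base): the open-redirect check
# above boils down to comparing netlocs; relative paths have an empty netloc and
# therefore always pass. The hosts below are made up.
from urlparse import urlsplit

def is_safe_next(next_url, host_url='http://example.com/'):
    next_netloc = urlsplit(next_url).netloc
    return not next_netloc or next_netloc == urlsplit(host_url).netloc

assert is_safe_next('/profile')                      # relative path: allowed
assert is_safe_next('http://example.com/profile')    # same host: allowed
assert not is_safe_next('http://evil.example.org/')  # foreign host: rejected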
def logout_client():
    """
    Client-initiated logout
    """
    client = Client.query.filter_by(key=request.args['client_id']).first()
    if client is None:
        # No such client. Possible CSRF. Don't logout and don't send them back
        flash(logout_errormsg, 'error')
        return redirect(url_for('index'))
    if client.trusted:
        # This is a trusted client. Does the referring domain match?
        clienthost = urlparse.urlsplit(client.redirect_uri).hostname
        if request.referrer:
            if clienthost != urlparse.urlsplit(request.referrer).hostname:
                # Doesn't. Don't logout and don't send back
                flash(logout_errormsg, 'error')
                return redirect(url_for('index'))
        # else: no referrer? Either stripped out by browser or a proxy, or this is a direct link.
        # We can't do anything about that, so assume it's a legit case.
        #
        # If there is a next destination, is it in the same domain?
        if 'next' in request.args:
            if clienthost != urlparse.urlsplit(request.args['next']).hostname:
                # Doesn't. Assume CSRF and redirect to index without logout
                flash(logout_errormsg, 'error')
                return redirect(url_for('index'))
        # All good. Log them out and send them back
        logout_internal()
        return redirect(get_next_url(external=True))
    else:
        # We know this client, but it's not trusted. Send back without logout.
        return redirect(get_next_url(external=True))
def extract_credentials(self, url):
    """
    Extracts user/password from a url.

    Returns a tuple:
        (url-without-auth, username, password)
    """
    if isinstance(url, urllib2.Request):
        result = urlparse.urlsplit(url.get_full_url())
    else:
        result = urlparse.urlsplit(url)
    scheme, netloc, path, query, frag = result

    username, password = self.parse_credentials(netloc)
    if username is None:
        return url, None, None
    elif password is None and self.prompting:
        # remove the auth credentials from the url part
        netloc = netloc.replace('%s@' % username, '', 1)
        # prompt for the password
        prompt = 'Password for %s@%s: ' % (username, netloc)
        password = urllib.quote(getpass.getpass(prompt))
    else:
        # remove the auth credentials from the url part
        netloc = netloc.replace('%s:%s@' % (username, password), '', 1)

    target_url = urlparse.urlunsplit((scheme, netloc, path, query, frag))
    return target_url, username, password
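# Illustrative sketch (parse_credentials itself is not shown above): urlsplit
# already exposes any user:password embedded in the netloc, which is what a
# helper like the one above ultimately strips out. The URL below is made up.
import urlparse

parts = urlparse.urlsplit('https://alice:s3cret@repo.example.com/simple/')
assert (parts.username, parts.password) == ('alice', 's3cret')
bare_netloc = parts.netloc.rsplit('@', 1)[-1]
clean = urlparse.urlunsplit((parts.scheme, bare_netloc, parts.path,
                             parts.query, parts.fragment))
assert clean == 'https://repo.example.com/simple/'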
def create_yaml_profile(json_data):
    data = []
    filename = None
    if 'image_data' in json_data:
        for k, v in json_data['image_data'].items():
            filename = os.path.basename(urlparse.urlsplit(v['uri'])[2])
            abs_path = os.path.join(json_data['output'], filename)
            stdout = execute('gunzip', '-ql', abs_path)[0]
            try:
                size = int(stdout.split()[1])
            except (ValueError, KeyError) as e:
                size = None
            stdout = execute('gunzip', '-qc', abs_path, '|', 'md5sum')[0]
            try:
                md5 = stdout.split()[0]
            except (ValueError, KeyError) as e:
                md5 = None
            if not md5 or not size:
                raise Exception("Either md5 or size of %s couldn't be "
                                "calculated" % abs_path)
            data.append({k: {
                'md5': md5,
                'size': size,
                'filename': filename,
                'container': v['container'],
                'format': v['format']}})
        data.append({'repos': json_data['repos']})
    else:
        raise Exception("Couldn't find any information about images")

    filename = os.path.basename(
        urlparse.urlsplit(json_data['image_data']['/']['uri'])[2]).split('.')[0]
    with open(os.path.join(json_data['output'], filename + '.yaml'), 'w') as f:
        f.write(yaml.dump(data))
def downloadImage(url, imgTag, downloads):
    imgSrc = imgTag["src"]
    netloc = urlsplit(url).netloc
    scheme = urlsplit(url).scheme
    path = urlsplit(url).path
    print("[+] Downloading image {0}...".format(imgSrc))
    try:
        imgContent = urllib2.urlopen(imgSrc).read()
    except:
        if "html" in urllib2.os.path.split(path)[-1]:
            root_path = urllib2.os.path.split(path)[0].lstrip("/")
        else:
            root_path = path.lstrip("/")
        imgUrl = urllib2.os.path.join("{0}://{1}".format(scheme, netloc),
                                      root_path, imgSrc.lstrip("/"))
        imgContent = urllib2.urlopen(imgUrl).read()
    finally:
        imgFileName = basename(urlsplit(imgSrc)[2])
        imgFile = open(os.path.join(downloads, netloc, imgFileName), "wb")
        imgFile.write(imgContent)
        imgFile.close()
        return imgFileName
def __init__(self, uri=None, path=None):
    if uri is None:
        uri = get_request().site
    parts = list(urlparse.urlsplit(uri))
    if path is not None:
        parts[2] = urlparse.urlsplit(path)[2]
    self.uri = urlparse.urlunsplit(parts)
def __call__(self, uri):
    (scheme, netloc, path, query, frag) = urlparse.urlsplit(uri)
    # urlparse doesnt understand file URLs and stuffs everything into path
    (scheme, netloc, path, query, frag) = urlparse.urlsplit("http:" + path)
    path = os.path.normpath(path)
    schema_xml = self.schema_xml_template
    schema = ZConfig.loadSchemaFile(StringIO(schema_xml))
    config, handler = ZConfig.loadConfig(schema, path)
    for config_item in config.databases + config.storages:
        if not frag:
            # use the first defined in the file
            break
        elif frag == config_item.name:
            # match found
            break
    else:
        raise KeyError("No storage or database named %s found" % frag)
    if isinstance(config_item, ZODBDatabase):
        config = config_item.config
        factory = config.storage
        dbkw = {"connection_cache_size": config.cache_size,
                "connection_pool_size": config.pool_size}
        if config.database_name:
            dbkw["database_name"] = config.database_name
    else:
        factory = config_item
        dbkw = dict(cgi.parse_qsl(query))
    return factory.open, dbkw
def get_download_links(self):
    self.get_links()
    # for state, link in self.links.iteritems():
    #     if state in self.Meta.states:
    #         self.download_links[state] = link
    for state in self.states:
        self.download_links[state] = {}
        try:
            self.download_links[state]['link'] = self.links[state]
        except:
            print "failed to set link for state %s" % (state)
            self.download_links[state]['link'] = ['NA']
        try:
            self.download_links[state]['file_name'] = os.path.basename(
                urlsplit(self.links[state]).path)
        except:
            print "failed to set file_name for %s " % (state)
            self.download_links[state]['file_name'] = []
        try:
            self.download_links[state]['file_type'] = os.path.splitext(
                os.path.basename(urlsplit(self.links[state]).path)
            )[1]
        except:
            print "couldnt find a type for file"
    return self.download_links
def getSource(self, url, form_data="", referer="", xml=False, mobile=False):
    url = self.fixurl(url)
    if len(referer) < 1 or referer is None:
        referer = 'http://' + urlparse.urlsplit(url).hostname
        if 'arenavision.in' in urlparse.urlsplit(url).netloc:
            self.s.headers.update({'Cookie': 'beget=begetok'})
        if 'pushpublish' in urlparse.urlsplit(url).netloc:
            del self.s.headers['Accept-Encoding']
        if not referer:
            referer = url
    else:
        referer = self.fixurl(referer)
    headers = {'Referer': referer}
    if mobile:
        self.s.headers.update({'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'})
    if xml:
        headers['X-Requested-With'] = 'XMLHttpRequest'
    if form_data:
        r = self.s.post(url, headers=headers, data=form_data, timeout=20)
        response = r.text
    else:
        try:
            r = self.s.get(url, headers=headers, timeout=20)
            response = r.text
        except (requests.exceptions.MissingSchema):
            response = 'pass'
    if len(response) > 10:
        if self.cookie_file:
            self.save_cookies_lwp(self.s.cookies, self.cookie_file)
    return HTMLParser().unescape(response)
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            cache_key = self.cache_key(name)
            hashed_name = self.cache.get(cache_key)
            if hashed_name is None:
                hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                # set the cache if there was a miss
                # (e.g. if cache server goes down)
                self.cache.set(cache_key, hashed_name)

    final_url = super(CachedFilesMixin, self).url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
def normalize_url(url, domain_canonical=None):
    """
    Ensure we have a valid url - raise exception if not.

    If given, we convert the domain to a domain_canonical
    """
    url = url.strip()
    rgURL = list(urlparse.urlsplit(url))
    if rgURL[split.scheme] == '':
        url = r"http://%s" % url
        rgURL = list(urlparse.urlsplit(url))

    # Invalid protocol
    if rgURL[split.scheme] != "http" and rgURL[split.scheme] != "https":
        raise reqfilter.Error("Invalid protocol: %s" % rgURL[split.scheme])

    if domain_canonical is not None:
        rgURL[split.domain] = domain_canonical
    if rgURL[split.domain]:
        rgURL[split.domain] = rgURL[split.domain].lower()
    if not rgURL[split.domain] or not regDomain.search(rgURL[split.domain]) or \
            len(rgURL[split.domain]) > 255:
        raise reqfilter.Error("Invalid URL: %s" % urlparse.urlunsplit(rgURL))

    # Always end naked domains with a trailing slash as canonical
    if rgURL[split.path] == '':
        rgURL[split.path] = '/'

    return urlparse.urlunsplit(rgURL)
def urljoin(self, base_url, url):
    if not url:
        return base_url
    urlsp = urlparse.urlsplit(url)
    if urlsp.scheme or not base_url:
        return url
    basesp = urlparse.urlsplit(base_url)

    if basesp.scheme in ("keep", "arvwf"):
        if not basesp.path:
            raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)
        baseparts = basesp.path.split("/")
        urlparts = urlsp.path.split("/") if urlsp.path else []

        pdh = baseparts.pop(0)

        if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
            raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

        if urlsp.path.startswith("/"):
            baseparts = []
            urlparts.pop(0)

        if baseparts and urlsp.path:
            baseparts.pop()

        path = "/".join([pdh] + baseparts + urlparts)
        return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))

    return super(CollectionFetcher, self).urljoin(base_url, url)
def generate_temp_url(self, url=None, path=None, duration=300, method='GET'):
    settings = getUtility(IRegistry).forInterface(ISwiftSettings)
    # storage_url contains version and account
    storage_url = getattr(settings, 'storage_url',
                          os.environ.get('OS_STORAGE_URL'))
    storage_url = urlsplit(storage_url)
    if url:
        url = urlsplit(url)
        if storage_url.netloc.rsplit(':', 1)[0] != url.netloc.rsplit(':', 1)[0]:
            # not our swift store
            return url.geturl()
    elif path:
        # build storage url; path contains container name and starts with /
        url = '{0}://{1}{2}{3}'.format(
            storage_url.scheme,
            storage_url.netloc,
            storage_url.path,
            path
        )
        url = urlsplit(url)
    else:
        raise Exception('Need either path or url')

    # build temp_url
    key = getattr(settings, 'temp_url_key', os.environ.get('OS_TEMP_URL_KEY'))
    if not key:
        return url.geturl()
    expires = int(time() + duration)
    hmac_body = u"\n".join((method.upper().encode(), str(expires), url.path))
    sig = hmac.new(key.encode('utf-8'), hmac_body.encode('utf-8'),
                   hashlib.sha1).hexdigest()
    temp_url = u"{url}?temp_url_sig={sig}&temp_url_expires={expires}".format(
        url=url.geturl(), sig=sig, expires=expires)
    return temp_url
def enumerate_links(self, input_url):
    self.__browser.open(input_url)
    chapter_name = self.__browser.title().split(' - ', 1)[0]
    chapter_dir = os.path.join(DEST_PATH, chapter_name)
    try:
        os.mkdir(chapter_dir)
    except OSError:
        pass
    soup = BeautifulSoup.BeautifulSoup(self.__browser.response().read())
    o = urlparse.urlsplit(input_url)
    href_re = re.compile('/'.join(o.path.split('/')[2:-1]))
    links = soup('a', href=href_re)

    o = urlparse.urlsplit(links[1]['href'])
    start_dir_path, start_page = o.path.rsplit('/', 1)
    start_page = int(start_page)

    o = urlparse.urlsplit(links[-3]['href'])
    end_dir_path, end_page = o.path.rsplit('/', 1)
    end_page = int(end_page)

    assert start_dir_path == end_dir_path, 'Start and end dir paths differ'
    dir_path = start_dir_path

    tasks = []
    for page_idx in xrange(start_page, end_page + 1):
        page_url = urlparse.urlunsplit(
            (o[0], o[1], '/'.join((dir_path, str(page_idx))), o[3], o[4]))
        tasks.append((page_url, chapter_dir))
    self.zipit(chapter_name, chapter_dir, tasks)
    print '[*] Finished'
def assertRedirects(self, response, expected_url, status_code=302,
                    target_status_code=200, host=None):
    """Asserts that a response redirected to a specific URL, and that the
    redirect URL can be loaded.

    Note that assertRedirects won't work for external links since it uses
    TestClient to do a request.
    """
    self.assertEqual(
        response.status_code, status_code,
        ("Response didn't redirect as expected: Response code was %d"
         " (expected %d)" % (response.status_code, status_code)),
    )
    url = response["Location"]
    scheme, netloc, path, query, fragment = urlsplit(url)
    e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
    if not (e_scheme or e_netloc):
        expected_url = urlunsplit(("http", host or "testserver",
                                   e_path, e_query, e_fragment))
    self.assertEqual(
        url, expected_url,
        "Response redirected to '%s', expected '%s'" % (url, expected_url))

    # Get the redirection page, using the same client that was used
    # to obtain the original response.
    redirect_response = response.client.get(path, QueryDict(query))
    self.assertEqual(
        redirect_response.status_code, target_status_code,
        ("Couldn't retrieve redirection page '%s': response code was %d"
         " (expected %d)") % (path, redirect_response.status_code, target_status_code),
    )
def getsession(url, user, password, domain="default"):
    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    if not scheme and not location and not query:
        # bare host given - urlsplit() returns empty strings, not None
        url = "http://" + url
    if url[-1] != "/":
        url += "/"
    url += "xmlrpc.php"

    sp = xmlrpclib.ServerProxy(url)
    res = sp.system.login({"username": user, "password": password, "domain": domain})
    if "sessionid" not in res or "kp3" not in res:
        raise Exception("Invalid username or password")

    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    if location.find("@") >= 0:
        location = location[location.find("@") + 1:]
    newurl = urlparse.urlunsplit(
        (scheme,
         "%s:%s@%s" % (res["sessionid"], res["kp3"], location),
         path, query, fragment))
    return Session(xmlrpclib.ServerProxy(newurl), res)
def get_from_form(str_page, response, opener=None):
    page = lxml.html.document_fromstring(str_page)
    if len(page.forms) == 1:
        form = page.forms[0]
    else:
        form = page.forms[1]  # for Yandex at the rights-confirmation step :(

    # Collect the form parameters
    key_value = {}
    for inpt in form.inputs:
        value = inpt.value
        name = inpt.name
        if None not in [name, value]:
            key_value[name] = value.encode('utf-8')
    #if key_value.has_key(None): del key_value[None]  # the button usually has no name

    # Extract the form submission address
    action_url = form.action
    if action_url == None:
        action_url = response.geturl()
    parts = urlparse.urlsplit(action_url)
    # if it is a relative address...
    if parts.scheme == '' and parts.netloc == '':
        # relative to the server
        if action_url[0] == '/':
            netloc = urlparse.urlsplit(response.geturl()).netloc
            action_url = 'https://' + netloc + action_url
        # relative to the current page address
        else:
            action_url = response.geturl() + '/' + action_url
    #print 'action url after parse: ', action_url

    # check for a captcha (for vk.com only)
    if key_value.has_key('captcha_key'):
        img = form.cssselect('img.captcha_img')[0]
        captcha_url = img.attrib['src']
        captcha_img = opener.open(captcha_url).read()
        dataMngt.write('oauth/logs/captcha.jpg', captcha_img, 'wb')
        captcha_key = raw_input('Input the captcha number:')
        key_value['captcha_key'] = captcha_key

    return key_value, action_url
def clean(self):
    from django.core.validators import URLValidator, validate_ipv46_address

    port_re = "(:[0-9]{1,5}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])"
    cleaned_data = super(AddEndpointForm, self).clean()

    if 'endpoint' in cleaned_data and 'product' in cleaned_data:
        endpoint = cleaned_data['endpoint']
        product = cleaned_data['product']
        if isinstance(product, Product):
            self.product = product
        else:
            self.product = Product.objects.get(id=int(product))
    else:
        raise forms.ValidationError('Please enter a valid URL or IP address.',
                                    code='invalid')

    endpoints = endpoint.split()
    count = 0
    error = False
    for endpoint in endpoints:
        try:
            url_validator = URLValidator()
            url_validator(endpoint)
            protocol, host, path, query, fragment = urlsplit(endpoint)
            self.endpoints_to_process.append([protocol, host, path, query, fragment])
        except forms.ValidationError:
            try:
                # do we have a port number?
                host = endpoint
                regex = re.compile(port_re)
                if regex.findall(endpoint):
                    for g in regex.findall(endpoint):
                        host = re.sub(port_re, '', host)
                validate_ipv46_address(host)
                protocol, host, path, query, fragment = ("", endpoint, "", "", "")
                self.endpoints_to_process.append([protocol, host, path, query, fragment])
            except forms.ValidationError:
                try:
                    regex = re.compile(
                        r'^(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'  # domain...
                        r'localhost|'  # localhost...
                        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
                        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
                        r'(?::\d+)?'  # optional port
                        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
                    validate_hostname = RegexValidator(regex=regex)
                    validate_hostname(host)
                    protocol, host, path, query, fragment = (None, host, None, None, None)
                    if "/" in host or "?" in host or "#" in host:
                        # add a fake protocol just to join, wont use in update to database
                        host_with_protocol = "http://" + host
                        p, host, path, query, fragment = urlsplit(host_with_protocol)
                    self.endpoints_to_process.append([protocol, host, path, query, fragment])
                except forms.ValidationError:
                    raise forms.ValidationError(
                        'Please check items entered, one or more do not appear to be a valid URL or IP address.',
                        code='invalid')

    return cleaned_data
def spider(client, url, domain_whitelist=None, pool=None, threadpool=None, tested=None):
    client.send_status('Spidering {url}...'.format(url=url))

    domain_whitelist = domain_whitelist or (urlsplit(url).netloc,)
    threadpool = threadpool or ThreadPool(4)  # for lxml - 4 workers
    pool = pool or Pool()  # maximum number of concurrent HTTP requests
    tested = tested or set([url])

    with timer() as timed:
        response = requests.get(url)

    result = dict(
        status_code=response.status_code,
        length=len(response.text),
        headers=response.headers,
        url=url,
        duration=timed.result(),
    )
    client.send_result(result)

    html = threadpool.apply(fromstring, [response.text])
    for link in html.cssselect('a'):
        href = link.attrib.get('href').split('#')[0].strip()
        if not href:
            continue
        url = urljoin(response.url, href)
        parts = urlsplit(url)
        if parts.netloc not in domain_whitelist:
            continue
        if url in tested:
            continue
        tested.add(url)
        pool.spawn(spider, client, url, domain_whitelist, pool, threadpool, tested)
    return pool
def _resolveLocation(self, requestURI, location):
    from twisted.web.client import _urljoin
    from urlparse import urlparse, urlsplit

    old_url = urlsplit(requestURI)[1].split(":")
    go_to = urlsplit(location)[1].split(":")
    if self._onRedirect == "sticky":
        location = location.replace(go_to[0], old_url[0])
    elif self._onRedirect == "stickyport":
        def _preparePort(url):
            urlsplited = urlsplit(url)[1].split(":")
            scheme = urlsplit(url).scheme \
                if urlsplit(url).scheme else "http"
            if scheme == "http":
                url = url.replace(urlsplited[0], urlsplited[0] + ":80")
            elif scheme == "https":
                url = url.replace(urlsplited[0], urlsplited[0] + ":443")
            return url

        if len(old_url) != 2:
            requestURI = _preparePort(requestURI)
            old_url = urlsplit(requestURI)[1].split(":")
        if len(go_to) != 2:
            location = _preparePort(location)
            go_to = urlsplit(location)[1].split(":")
        if not self._proxy:
            location = location.replace(go_to[1], str(self._port))
        else:
            location = location.replace(go_to[1], old_url[1])

    location = _urljoin(requestURI, location)
    log.debug("Locating to URL: %s" % location)
    return location
def get_correctedFiles(path, save, url, img):
    if not os.path.exists(save):
        os.makedirs(save)
    for f in os.listdir(path):
        print "correcting file %s" % f
        infile = open(os.path.join(path, f)).read()
        soup = BeautifulSoup(infile, "html5lib")
        for tag in soup.find_all(lambda t: 'href' in t.attrs or 'src' in t.attrs):
            if 'href' in tag.attrs:
                url_parts = urlparse.urlsplit(tag.attrs["href"])
                full_path = tag.attrs["href"]
                hrefpath = url_parts.path
                if full_path[0:4] != "http" or full_path[0:5] != " http":
                    # for wiki conversion (moin moin wikis)
                    # hrefpath = hrefpath.replace("/", "|")
                    if hrefpath[0:6] == "|wiki|":
                        hrefpath = hrefpath[6:]
                    tag.attrs["href"] = urlparse.urljoin(url, hrefpath)
            else:
                url_parts = urlparse.urlsplit(tag.attrs["src"])
                srcpath = url_parts.path
                srcparts = srcpath.split("/")
                srcpath = srcparts[len(srcparts) - 1]
                tag.attrs["src"] = urlparse.urljoin(img, srcpath)
        outfile = open(os.path.join(save, f), "w")
        outfile.write(soup.encode("ascii", "xmlcharrefreplace"))
        outfile.close()
def filename(self, pdf_url0):
    pdf_url = str(pdf_url0)
    CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
    if re.findall('/', pdf_url):
        self.suffix = os.path.splitext(pdf_url)[1]
        self.file_name_decode = urllib2.unquote(pdf_url).decode('utf8').split('/')[-1]
        self.filename = urlparse.urlsplit(pdf_url).path.split('/')[-1]
        if self.filename.endswith('.jsp'):
            self.filename = (self.suffix).split('arnumber=')[1] + '.pdf'
            # self.filename=(pdf_url).split('id=')[1].split('&')[0]+'.pdf'
        # self.pdf_Folder_filename = CurrentDir + "/"+self.PDF_Files_Dir+"/" + self.filename
        # self.W_pdf_Folder_filename = CurrentDir + "/"+self.Watermarked_PDF_Dir+"/" + self.filename
        self.pdf_Folder_filename = self.PDF_Files_Dir + "/" + self.filename
        self.W_pdf_Folder_filename = self.Watermarked_PDF_Dir + "/" + self.filename
        self.chdir = CurrentDir
    else:
        self.filename = urlparse.urlsplit(pdf_url).path.split('\\')[-1]
        self.chdir = CurrentDir
        # self.pdf_Folder_filename = CurrentDir+ "/"+self.PDF_Files_Dir+"/" + self.filename
        # self.W_pdf_Folder_filename = CurrentDir + "/"+self.Watermarked_PDF_Dir+"/" + self.filename
        self.pdf_Folder_filename = self.PDF_Files_Dir + "/" + self.filename
        self.W_pdf_Folder_filename = self.Watermarked_PDF_Dir + "/" + self.filename
    return self
def build_url(self):
    """
    Construct self.url and self.urlparts out of the given base url
    information self.base_url, self.parent_url and self.base_ref.
    """
    # norm base url - can raise UnicodeError from url.idna_encode()
    base_url, is_idn = url_norm(self.base_url, self.encoding)
    # make url absolute
    if self.base_ref:
        # use base reference as parent url
        if ":" not in self.base_ref:
            # some websites have a relative base reference
            self.base_ref = urljoin(self.parent_url, self.base_ref)
        self.url = urljoin(self.base_ref, base_url)
    elif self.parent_url:
        # strip the parent url query and anchor
        urlparts = list(urlparse.urlsplit(self.parent_url))
        urlparts[4] = ""
        parent_url = urlutil.urlunsplit(urlparts)
        self.url = urljoin(parent_url, base_url)
    else:
        self.url = base_url
    # urljoin can unnorm the url path, so norm it again
    urlparts = list(urlparse.urlsplit(self.url))
    if urlparts[2]:
        urlparts[2] = urlutil.collapse_segments(urlparts[2])
    self.url = urlutil.urlunsplit(urlparts)
    # split into (modifiable) list
    self.urlparts = strformat.url_unicode_split(self.url)
    # and unsplit again
    self.url = urlutil.urlunsplit(self.urlparts)
    self.build_url_parts()
def process_url(raw_url):
    if ' ' not in raw_url[-1]:
        raw_url = raw_url.replace(' ', '%20')
        return raw_url
    elif ' ' in raw_url[-1]:
        raw_url = raw_url[:-1]
        raw_url = raw_url.replace(' ', '%20')
        return raw_url

url = ''  ## give the url here
parse_object = urlparse(url)
dirname = basename(parse_object.path)

if not os.path.exists('images'):
    os.mkdir("images")
os.mkdir("images/" + dirname)
os.chdir("images/" + dirname)

urlcontent = urllib2.urlopen(url).read()
imgurls = re.findall('img .*?src="(.*?)"', urlcontent)

for imgurl in imgurls:
    try:
        imgurl = process_url(imgurl)
        imgdata = urllib2.urlopen(imgurl).read()
        filename = basename(urlsplit(imgurl)[2])
        output = open(filename, 'wb')
        output.write(imgdata)
        output.close()
    except:
        pass
# when proxy needed, make urllib2 follow it
proxy = repo.proxy
proxy_username = repo.proxy_username
proxy_password = repo.proxy_password
if not proxy:
    proxy = get_proxy_for(repo.baseurl[0])

handlers = []
auth_handler = u2.HTTPBasicAuthHandler(
    u2.HTTPPasswordMgrWithDefaultRealm())
u2opener = None
if proxy:
    if proxy_username:
        proxy_netloc = urlparse.urlsplit(proxy).netloc
        if proxy_password:
            proxy_url = 'http://%s:%s@%s' % (proxy_username,
                                             proxy_password,
                                             proxy_netloc)
        else:
            proxy_url = 'http://%s@%s' % (proxy_username, proxy_netloc)
    else:
        proxy_url = proxy

    proxy_support = u2.ProxyHandler({
        'http': proxy_url,
        'https': proxy_url,
        'ftp': proxy_url
    })
    handlers.append(proxy_support)
status = 302
redirectCount = 0

url = service
if gnu:
    url = url + '?out=gnu'
else:
    url = url + '?out=text'
if errorsOnly:
    url = url + '&level=error'

while (status == 302 or status == 301 or status == 307) and redirectCount < 10:
    if redirectCount > 0:
        url = response.getheader('Location')
    parsed = urlparse.urlsplit(url)
    if parsed[0] != 'http':
        sys.stderr.write('URI scheme %s not supported.\n' % parsed[0])
        sys.exit(7)
    if redirectCount > 0:
        connection.close()  # previous connection
        print 'Redirecting to %s' % url
        print 'Please press enter to continue or type "stop" followed by enter to stop.'
        if raw_input() != "":
            sys.exit(0)
    connection = httplib.HTTPConnection(parsed[1])
    connection.connect()
    connection.putrequest("POST", "%s?%s" % (parsed[2], parsed[3]),
                          skip_accept_encoding=1)
    connection.putheader("Accept-Encoding", 'gzip')
def get_prefix_source(cls):
    """Return the prefix source, by default derived from site."""
    if hasattr(cls, '_prefix_source'):
        return cls._prefix_source
    else:
        return urlparse.urlsplit(cls.site)[2]
try:
    attempt_index = int(attempt_index)
    attempt = job.job_attempts['jobAttempt'][attempt_index]
    log_link = attempt['logsLink']
except (KeyError, RestException), e:
    raise KeyError(
        _("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)

link = '/%s/' % name
params = {}
if offset and int(offset) >= 0:
    params['start'] = offset

root = Resource(get_log_client(log_link),
                urlparse.urlsplit(log_link)[2],
                urlencode=False)

try:
    response = root.get(link, params=params)
    log = html.fromstring(response).xpath(
        '/html/body/table/tbody/tr/td[2]')[0].text_content()
except Exception, e:
    log = _('Failed to retrieve log: %s') % e

response = {'log': log}
return HttpResponse(json.dumps(response), mimetype="application/json")


@check_job_permission
def update_scm_url(scm_type, url, username=True, password=True, check_special_cases=True, scp_format=False): ''' Update the given SCM URL to add/replace/remove the username/password. When username/password is True, preserve existing username/password, when False (None, '', etc.), remove any existing username/password, otherwise replace username/password. Also validates the given URL. ''' # Handle all of the URL formats supported by the SCM systems: # git: https://www.kernel.org/pub/software/scm/git/docs/git-clone.html#URLS # hg: http://www.selenic.com/mercurial/hg.1.html#url-paths # svn: http://svnbook.red-bean.com/en/1.7/svn-book.html#svn.advanced.reposurls if scm_type not in ('git', 'hg', 'svn', 'insights'): raise ValueError(_('Unsupported SCM type "%s"') % str(scm_type)) if not url.strip(): return '' parts = urlparse.urlsplit(url) try: parts.port except ValueError: raise ValueError(_('Invalid %s URL') % scm_type) if parts.scheme == 'git+ssh' and not scp_format: raise ValueError(_('Unsupported %s URL') % scm_type) if '://' not in url: # Handle SCP-style URLs for git (e.g. [user@]host.xz:path/to/repo.git/). if scm_type == 'git' and ':' in url: if '@' in url: userpass, hostpath = url.split('@', 1) else: userpass, hostpath = '', url if hostpath.count(':') > 1: raise ValueError(_('Invalid %s URL') % scm_type) host, path = hostpath.split(':', 1) #if not path.startswith('/') and not path.startswith('~/'): # path = '~/%s' % path #if path.startswith('/'): # path = path.lstrip('/') hostpath = '/'.join([host, path]) modified_url = '@'.join(filter(None, [userpass, hostpath])) # git+ssh scheme identifies URLs that should be converted back to # SCP style before passed to git module. parts = urlparse.urlsplit('git+ssh://%s' % modified_url) # Handle local paths specified without file scheme (e.g. /path/to/foo). # Only supported by git and hg. elif scm_type in ('git', 'hg'): if not url.startswith('/'): parts = urlparse.urlsplit('file:///%s' % url) else: parts = urlparse.urlsplit('file://%s' % url) else: raise ValueError(_('Invalid %s URL') % scm_type) # Validate that scheme is valid for given scm_type. scm_type_schemes = { 'git': ('ssh', 'git', 'git+ssh', 'http', 'https', 'ftp', 'ftps', 'file'), 'hg': ('http', 'https', 'ssh', 'file'), 'svn': ('http', 'https', 'svn', 'svn+ssh', 'file'), 'insights': ('http', 'https') } if parts.scheme not in scm_type_schemes.get(scm_type, ()): raise ValueError(_('Unsupported %s URL') % scm_type) if parts.scheme == 'file' and parts.netloc not in ('', 'localhost'): raise ValueError( _('Unsupported host "%s" for file:// URL') % (parts.netloc)) elif parts.scheme != 'file' and not parts.netloc: raise ValueError(_('Host is required for %s URL') % parts.scheme) if username is True: netloc_username = parts.username or '' elif username: netloc_username = username else: netloc_username = '' if password is True: netloc_password = parts.password or '' elif password: netloc_password = password else: netloc_password = '' # Special handling for github/bitbucket SSH URLs. if check_special_cases: special_git_hosts = ('github.com', 'bitbucket.org', 'altssh.bitbucket.org') if scm_type == 'git' and parts.scheme.endswith( 'ssh' ) and parts.hostname in special_git_hosts and netloc_username != 'git': raise ValueError( _('Username must be "git" for SSH access to %s.') % parts.hostname) if scm_type == 'git' and parts.scheme.endswith( 'ssh' ) and parts.hostname in special_git_hosts and netloc_password: #raise ValueError('Password not allowed for SSH access to %s.' 
% parts.hostname) netloc_password = '' special_hg_hosts = ('bitbucket.org', 'altssh.bitbucket.org') if scm_type == 'hg' and parts.scheme == 'ssh' and parts.hostname in special_hg_hosts and netloc_username != 'hg': raise ValueError( _('Username must be "hg" for SSH access to %s.') % parts.hostname) if scm_type == 'hg' and parts.scheme == 'ssh' and netloc_password: #raise ValueError('Password not supported for SSH with Mercurial.') netloc_password = '' if netloc_username and parts.scheme != 'file' and scm_type != "insights": netloc = u':'.join([ urllib.quote(x, safe='') for x in (netloc_username, netloc_password) if x ]) else: netloc = u'' netloc = u'@'.join(filter(None, [netloc, parts.hostname])) if parts.port: netloc = u':'.join([netloc, unicode(parts.port)]) new_url = urlparse.urlunsplit( [parts.scheme, netloc, parts.path, parts.query, parts.fragment]) if scp_format and parts.scheme == 'git+ssh': new_url = new_url.replace('git+ssh://', '', 1).replace('/', ':', 1) return new_url
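# Illustrative sketch (not the full validation in update_scm_url above): an
# SCP-style git URL "user@host:path" has no scheme, so it is rewritten to a
# git+ssh:// URL before urlsplit can pick it apart. The repo name is made up.
import urlparse

scp_url = 'git@github.com:example-org/example-repo.git'
userpass, hostpath = scp_url.split('@', 1)
host, path = hostpath.split(':', 1)
parts = urlparse.urlsplit('git+ssh://%s@%s/%s' % (userpass, host, path))
assert parts.hostname == 'github.com'
assert parts.username == 'git'
assert parts.path == '/example-org/example-repo.git'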
def __fetch_url(self, params): # Skip existing file if exists and matches checksum if not self.parent.force: if self.__is_file_done(local_path=params['target_file'], checksum_type=params['checksum_type'], checksum=params['checksum']): return True opts = URLGrabberOptions(ssl_ca_cert=params['ssl_ca_cert'], ssl_cert=params['ssl_client_cert'], ssl_key=params['ssl_client_key'], range=params['bytes_range'], proxy=params['proxy'], username=params['proxy_username'], password=params['proxy_password'], proxies=params['proxies'], http_headers=tuple( params['http_headers'].items())) mirrors = len(params['urls']) for retry in range(max(self.parent.retries, mirrors)): fo = None url = urlparse.urljoin(params['urls'][self.mirror], params['relative_path']) ## BEWARE: This hack is introduced in order to support SUSE SCC channels ## This also needs a patched urlgrabber AFAIK if 'authtoken' in params and params['authtoken']: (scheme, netloc, path, query, _) = urlparse.urlsplit(params['urls'][self.mirror]) url = "%s://%s%s/%s?%s" % (scheme, netloc, path, params['relative_path'], query.rstrip('/')) try: try: fo = PyCurlFileObjectThread(url, params['target_file'], opts, self.curl, self.parent) # Check target file if not self.__is_file_done( file_obj=fo, checksum_type=params['checksum_type'], checksum=params['checksum']): raise FailedDownloadError( "Target file isn't valid. Checksum should be %s (%s)." % (params['checksum'], params['checksum_type'])) break except (FailedDownloadError, URLGrabError): e = sys.exc_info()[1] # urlgrabber-3.10.1-9 trows URLGrabError for both # 'HTTP Error 404 - Not Found' and 'No space left on device', so # workaround for this is check error message: if 'No space left on device' in str(e): self.parent.fail_download(e) return False if not self.__can_retry(retry, mirrors, opts, url, e): return False self.__next_mirror(mirrors) # RHEL 6 urlgrabber raises KeyboardInterrupt for example when there is no space left # but handle also other fatal exceptions except (KeyboardInterrupt, Exception): # pylint: disable=W0703 e = sys.exc_info()[1] self.parent.fail_download(e) return False finally: if fo: fo.close() # Delete failed download file elif os.path.isfile(params['target_file']): os.unlink(params['target_file']) return True
def load_tool(argsworkflow, updateonly, strict, makeTool, debug, print_pre=False, print_rdf=False, print_dot=False, print_deps=False, relative_deps=False, rdf_serializer=None, stdout=sys.stdout, urifrag=None): # type: (Union[str,unicode,dict[unicode,Any]], bool, bool, Callable[...,Process], bool, bool, bool, bool, bool, bool, Any, Any, Any) -> Any (document_loader, avsc_names, schema_metadata) = process.get_schema() if isinstance(avsc_names, Exception): raise avsc_names jobobj = None uri = None # type: str workflowobj = None # type: Dict[unicode, Any] if isinstance(argsworkflow, (basestring)): split = urlparse.urlsplit(cast(str, argsworkflow)) if split.scheme: uri = cast(str, argsworkflow) else: uri = "file://" + os.path.abspath(cast(str, argsworkflow)) fileuri, urifrag = urlparse.urldefrag(uri) workflowobj = document_loader.fetch(fileuri) elif isinstance(argsworkflow, dict): workflowobj = argsworkflow uri = urifrag fileuri = "#" else: raise schema_salad.validate.ValidationException("Must be URI or dict") if "cwl:tool" in workflowobj: jobobj = workflowobj uri = urlparse.urljoin(uri, jobobj["cwl:tool"]) fileuri, urifrag = urlparse.urldefrag(uri) workflowobj = document_loader.fetch(fileuri) del jobobj["cwl:tool"] if isinstance(workflowobj, list): # bare list without a version must be treated as draft-2 workflowobj = { "cwlVersion": "https://w3id.org/cwl/cwl#draft-2", "id": fileuri, "@graph": workflowobj } workflowobj = update.update(workflowobj, document_loader, fileuri) document_loader.idx.clear() if updateonly: stdout.write(json.dumps(workflowobj, indent=4)) return 0 if print_deps: printdeps(workflowobj, document_loader, stdout, relative_deps) return 0 try: processobj, metadata = schema_salad.schema.load_and_validate( document_loader, avsc_names, workflowobj, strict) except (schema_salad.validate.ValidationException, RuntimeError) as e: _logger.error(u"Tool definition failed validation:\n%s", e, exc_info=(e if debug else False)) return 1 if print_pre: stdout.write(json.dumps(processobj, indent=4)) return 0 if print_rdf: printrdf(str(argsworkflow), processobj, document_loader.ctx, rdf_serializer, stdout) return 0 if print_dot: printdot(str(argsworkflow), processobj, document_loader.ctx, stdout) return 0 if urifrag: processobj, _ = document_loader.resolve_ref(uri) elif isinstance(processobj, list): if 1 == len(processobj): processobj = processobj[0] else: _logger.error( u"Tool file contains graph of multiple objects, must specify one of #%s", ", #".join( urlparse.urldefrag(i["id"])[1] for i in processobj if "id" in i)) return 1 try: t = makeTool(processobj, strict=strict, makeTool=makeTool, loader=document_loader, avsc_names=avsc_names) except (schema_salad.validate.ValidationException) as e: _logger.error(u"Tool definition failed validation:\n%s", e, exc_info=(e if debug else False)) return 1 except (RuntimeError, workflow.WorkflowException) as e: _logger.error(u"Tool definition failed initialization:\n%s", e, exc_info=(e if debug else False)) return 1 if jobobj: for inp in t.tool["inputs"]: if shortname(inp["id"]) in jobobj: inp["default"] = jobobj[shortname(inp["id"])] if metadata: t.metadata = metadata else: t.metadata = { "$namespaces": t.tool.get("$namespaces", {}), "$schemas": t.tool.get("$schemas", []) } return t
def check_file(self, fn):  # type: (unicode) -> bool
    if fn.startswith("file://"):
        u = urlparse.urlsplit(fn)
        return os.path.exists(u.path)
    else:
        return False
def _curl_setup_request(curl, request, buffer, headers): curl.setopt(pycurl.URL, str(request.url)) # xxx: cannot send in unicode strings? # libcurl's magic "Expect: 100-continue" behavior causes delays # with servers that don't support it (which include, among others, # Google's OpenID endpoint). Additionally, this behavior has # a bug in conjunction with the curl_multi_socket_action API # (https://sourceforge.net/tracker/?func=detail&atid=100976&aid=3039744&group_id=976), # which increases the delays. It's more trouble than it's worth, # so just turn off the feature (yes, setting Expect: to an empty # value is the official way to disable this) if "Expect" not in request.headers: request.headers["Expect"] = "" # libcurl adds Pragma: no-cache by default; disable that too if "Pragma" not in request.headers: request.headers["Pragma"] = "" # Request headers may be either a regular dict or HTTPHeaders object if isinstance(request.headers, httputil.HTTPHeaders): curl.setopt(pycurl.HTTPHEADER, [utf8("%s: %s" % i) for i in request.headers.get_all()]) else: curl.setopt(pycurl.HTTPHEADER, [utf8("%s: %s" % i) for i in request.headers.iteritems()]) if request.header_callback: curl.setopt(pycurl.HEADERFUNCTION, request.header_callback) else: curl.setopt(pycurl.HEADERFUNCTION, lambda line: _curl_header_callback(headers, line)) if request.streaming_callback: curl.setopt(pycurl.WRITEFUNCTION, request.streaming_callback) else: curl.setopt(pycurl.WRITEFUNCTION, buffer.write) if urlparse.urlsplit(_unicode(request.url)).scheme == 'https' and not request.validate_cert: curl.setopt(pycurl.SSL_VERIFYPEER, 0) curl.setopt(pycurl.SSL_VERIFYHOST, 0) curl.setopt(pycurl.FOLLOWLOCATION, request.follow_redirects) curl.setopt(pycurl.MAXREDIRS, request.max_redirects) curl.setopt(pycurl.CONNECTTIMEOUT, int(request.connect_timeout)) curl.setopt(pycurl.TIMEOUT, int(request.request_timeout)) if request.user_agent: curl.setopt(pycurl.USERAGENT, utf8(request.user_agent)) else: curl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)") if request.network_interface: curl.setopt(pycurl.INTERFACE, request.network_interface) if request.use_gzip: curl.setopt(pycurl.ENCODING, "gzip,deflate") else: curl.setopt(pycurl.ENCODING, "none") if request.proxy_host and request.proxy_port: curl.setopt(pycurl.PROXY, request.proxy_host) curl.setopt(pycurl.PROXYPORT, request.proxy_port) if request.proxy_username: credentials = '%s:%s' % (request.proxy_username, request.proxy_password) curl.setopt(pycurl.PROXYUSERPWD, credentials) else: curl.setopt(pycurl.PROXY, '') if request.validate_cert: curl.setopt(pycurl.SSL_VERIFYPEER, 1) curl.setopt(pycurl.SSL_VERIFYHOST, 2) else: curl.setopt(pycurl.SSL_VERIFYPEER, 0) curl.setopt(pycurl.SSL_VERIFYHOST, 0) if request.ca_certs is not None: curl.setopt(pycurl.CAINFO, request.ca_certs) else: # There is no way to restore pycurl.CAINFO to its default value # (Using unsetopt makes it reject all certificates). # I don't see any way to read the default value from python so it # can be restored later. We'll have to just leave CAINFO untouched # if no ca_certs file was specified, and require that if any # request uses a custom ca_certs file, they all must. pass if request.allow_ipv6 is False: # Curl behaves reasonably when DNS resolution gives an ipv6 address # that we can't reach, so allow ipv6 unless the user asks to disable. 
# (but see version check in _process_queue above) curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) # Set the request method through curl's retarded interface which makes # up names for almost every single method curl_options = { "GET": pycurl.HTTPGET, "POST": pycurl.POST, "PUT": pycurl.UPLOAD, "HEAD": pycurl.NOBODY, } custom_methods = set(["DELETE"]) for o in curl_options.values(): curl.setopt(o, False) if request.method in curl_options: curl.unsetopt(pycurl.CUSTOMREQUEST) curl.setopt(curl_options[request.method], True) elif request.allow_nonstandard_methods or request.method in custom_methods: curl.setopt(pycurl.CUSTOMREQUEST, request.method) else: raise KeyError('unknown method ' + request.method) # Handle curl's cryptic options for every individual HTTP method if request.method in ("POST", "PUT"): request_buffer = cStringIO.StringIO(utf8(request.body)) curl.setopt(pycurl.READFUNCTION, request_buffer.read) if request.method == "POST": def ioctl(cmd): if cmd == curl.IOCMD_RESTARTREAD: request_buffer.seek(0) curl.setopt(pycurl.IOCTLFUNCTION, ioctl) curl.setopt(pycurl.POSTFIELDSIZE, len(request.body)) else: curl.setopt(pycurl.INFILESIZE, len(request.body)) logmethod = 'info' if request.log_request else 'debug' if request.auth_username and request.auth_password: userpwd = "%s:%s" % (request.auth_username, request.auth_password) curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC) curl.setopt(pycurl.USERPWD, userpwd) getattr(logging,logmethod)("%s %s (username: %r)", request.method, request.url, request.auth_username) else: curl.unsetopt(pycurl.USERPWD) getattr(logging,logmethod)("%s %s", request.method, request.url) if request.client_key is not None or request.client_cert is not None: raise ValueError("Client certificate not supported with curl_httpclient") if threading.activeCount() > 1: # libcurl/pycurl is not thread-safe by default. When multiple threads # are used, signals should be disabled. This has the side effect # of disabling DNS timeouts in some environments (when libcurl is # not linked against ares), so we don't do it when there is only one # thread. Applications that use many short-lived threads may need # to set NOSIGNAL manually in a prepare_curl_callback since # there may not be any other threads running at the time we call # threading.activeCount. curl.setopt(pycurl.NOSIGNAL, 1) if request.prepare_curl_callback is not None: request.prepare_curl_callback(curl)
def url(self, url):
    if urlparse.urlsplit(url).netloc is None:
        return self.url(url)
    body = {"url": url}
    return self.send_command("POST", "url", body)
def getCookies(cookieName, url):
    domain = urlparse.urlsplit(url).netloc
    return common.getCookies(cookieName, domain)
def decode(self, content, url):
    result = content
    if not ALWAYS_CHAR_DETECT and self.encoding:
        # first try to decode with the encoding used last time
        try:
            result = content.decode(self.encoding)
        except UnicodeDecodeError:
            # decode error, fall back to automatic encoding detection
            encoding = chardet.detect(content)['encoding']
            try:
                result = content.decode(encoding)
            except UnicodeDecodeError:
                # still failing, return the content unconverted
                self.encoding = None
                result = content
            else:
                # remember it for next time to save time
                self.encoding = encoding
                # also save it to the datastore
                netloc = urlparse.urlsplit(url)[1]
                urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
                if urlenc:
                    enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
                    if enc != encoding:
                        if self.isfeed:
                            urlenc.feedenc = encoding
                        else:
                            urlenc.pageenc = encoding
                        urlenc.put()
                elif self.isfeed:
                    UrlEncoding(netloc=netloc, feedenc=encoding).put()
                else:
                    UrlEncoding(netloc=netloc, pageenc=encoding).put()
    else:
        # no previous encoding information yet
        netloc = urlparse.urlsplit(url)[1]
        urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
        if urlenc:  # check the datastore first
            enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
            if enc:
                try:
                    result = content.decode(enc)
                except UnicodeDecodeError:
                    # failed, re-detect the encoding
                    self.encoding = chardet.detect(content)['encoding']
                else:
                    self.encoding = enc
                    return result
            else:
                # no data in the datastore yet
                self.encoding = chardet.detect(content)['encoding']
        else:
            self.encoding = chardet.detect(content)['encoding']

        # decode using the detected encoding
        try:
            result = content.decode(self.encoding)
        except UnicodeDecodeError:
            # on error, return the content unconverted
            result = content
        else:
            # save to the datastore
            newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
            if self.isfeed:
                newurlenc.feedenc = self.encoding
            else:
                newurlenc.pageenc = self.encoding
            newurlenc.put()
    return result
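# Illustrative sketch (independent of the datastore model above): the core
# pattern is "try the cached encoding first, fall back to chardet, remember the
# winner". The function and cache names here are made up.
import chardet

def decode_with_cache(content, cache, netloc):
    encoding = cache.get(netloc)
    if encoding:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            pass  # cached guess went stale, re-detect below
    encoding = chardet.detect(content)['encoding'] or 'utf-8'
    cache[netloc] = encoding
    return content.decode(encoding, 'replace')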
def finish_conversation(self): if not (check_duplicate_url(self.host, self.uri)): if check_duplicate_uri(self.uri): self.uri = create_next_uri(self.uri) obj_num = len(conversations) conversations.append(namedtuple('Conv', ['id','server_ip', 'uri','req_head','res_body','res_head','res_num','res_type','host','referer', \ 'filename','method','redirect_to','req_microsec', 'res_len','magic_name', 'magic_ext'])) host_tuple = (self.host, str(self.remote_host[0]) + ":" + str(self.remote_host[1])) # hosts list if (hosts.has_key(host_tuple)): hosts[host_tuple].append(self.uri + " [" + str(obj_num) + "]") else: hosts[host_tuple] = [self.uri + " [" + str(obj_num) + "]"] # convs list conversations[obj_num].id = obj_num conversations[obj_num].server_ip = str( self.remote_host[0]) + ":" + str(self.remote_host[1]) conversations[obj_num].uri = self.uri conversations[obj_num].redirect_to = self.redirect_to conversations[obj_num].short_uri = getShortURI(self.uri) conversations[obj_num].req_head = self.req_head conversations[obj_num].res_body = self.res_body add_object("body", self.res_body) try: # FindMagic mgc_name = "" mgc_ext = "" mgc_name, mgc_ext = WhatypeMagic.identify_buffer(self.res_body) except: pass conversations[obj_num].magic_name = mgc_name.rstrip() conversations[obj_num].magic_ext = mgc_ext.rstrip() conversations[obj_num].orig_chunked_resp = self.orig_chunked_resp conversations[obj_num].orig_resp = self.orig_resp conversations[obj_num].res_head = self.res_head conversations[obj_num].res_num = self.res_num if ";" in self.res_type: conversations[obj_num].res_type = self.res_type[:self.res_type. find(";")] else: conversations[obj_num].res_type = self.res_type conversations[obj_num].host = self.host conversations[obj_num].referer = self.referer conversations[obj_num].filename = self.filename conversations[obj_num].method = self.method conversations[obj_num].req_microsec = str(self.time)[:10] # In case no filename was given from the server, split by URI if (conversations[obj_num].filename == ""): uri_name = urlparse.urlsplit(str(conversations[obj_num].uri)).path conversations[obj_num].filename = uri_name.split('/')[-1] if (str(conversations[obj_num].filename).find('?') > 0): conversations[obj_num].filename = \ conversations[obj_num].filename[:str(conversations[obj_num].filename).find('?')] if (str(conversations[obj_num].filename).find('&') > 0): conversations[obj_num].filename = \ conversations[obj_num].filename[:str(conversations[obj_num].filename).find('&')] # In case the URI was '/' then this is still empty if (conversations[obj_num].filename == ""): conversations[obj_num].filename = str(obj_num) + ".html" objects[obj_num].name = conversations[obj_num].filename conversations[obj_num].res_len = self.res_len
def get_domain(task_id):
    task = Task.objects.get(id=task_id)
    _ = urlparse.urlsplit(task.start_url)
    domain = "%s://%s%s" % (_.scheme, _.netloc, task.base)
    return domain
def geoserver_proxy(request, proxy_path, downstream_path, workspace=None, layername=None): """ WARNING: Decorators are applied in the order they appear in the source. """ # AF: No need to authenticate first. We will check if "access_token" is present # or not on session # @dismissed # if not request.user.is_authenticated(): # return HttpResponse( # "You must be logged in to access GeoServer", # content_type="text/plain", # status=401) def strip_prefix(path, prefix): assert prefix in path prefix_idx = path.index(prefix) _prefix = path[:prefix_idx] + prefix full_prefix = "%s/%s/%s" % ( _prefix, layername, downstream_path) if layername else _prefix return path[len(full_prefix):] path = strip_prefix(request.get_full_path(), proxy_path) raw_url = str( "".join([ogc_server_settings.LOCATION, downstream_path, path])) if settings.DEFAULT_WORKSPACE or workspace: ws = (workspace or settings.DEFAULT_WORKSPACE) if ws and ws in path: # Strip out WS from PATH try: path = "/%s" % strip_prefix(path, "/%s:" % (ws)) except BaseException: pass if proxy_path == '/gs/%s' % settings.DEFAULT_WORKSPACE and layername: import posixpath raw_url = urljoin(ogc_server_settings.LOCATION, posixpath.join(workspace, layername, downstream_path, path)) if downstream_path in ('rest/styles') and len(request.body) > 0: if ws: # Lets try # http://localhost:8080/geoserver/rest/workspaces/<ws>/styles/<style>.xml _url = str("".join([ogc_server_settings.LOCATION, 'rest/workspaces/', ws, '/styles', path])) else: _url = str("".join([ogc_server_settings.LOCATION, 'rest/styles', path])) raw_url = _url if downstream_path in 'ows' and ( 'rest' in path or re.match(r'/(w.*s).*$', path, re.IGNORECASE) or re.match(r'/(ows).*$', path, re.IGNORECASE)): _url = str("".join([ogc_server_settings.LOCATION, '', path[1:]])) raw_url = _url url = urlsplit(raw_url) affected_layers = None if '%s/layers' % ws in path: downstream_path = 'rest/layers' elif '%s/styles' % ws in path: downstream_path = 'rest/styles' if request.method in ("POST", "PUT", "DELETE"): if downstream_path in ('rest/styles', 'rest/layers', 'rest/workspaces'): if not style_change_check(request, downstream_path): return HttpResponse( _( "You don't have permissions to change style for this layer"), content_type="text/plain", status=401) elif downstream_path == 'rest/styles': logger.info( "[geoserver_proxy] Updating Style ---> url %s" % url.geturl()) affected_layers = style_update(request, raw_url) elif downstream_path == 'rest/layers': logger.debug( "[geoserver_proxy] Updating Layer ---> url %s" % url.geturl()) try: _layer_name = os.path.splitext(os.path.basename(request.path))[0] _layer = Layer.objects.get(name__icontains=_layer_name) affected_layers = [_layer] except BaseException: logger.warn("Could not find any Layer %s on DB" % os.path.basename(request.path)) kwargs = {'affected_layers': affected_layers} import urllib raw_url = urllib.unquote(raw_url).decode('utf8') timeout = getattr(ogc_server_settings, 'TIMEOUT') or 10 allowed_hosts = [urlsplit(ogc_server_settings.public_url).hostname, ] return proxy(request, url=raw_url, response_callback=_response_callback, timeout=timeout, allowed_hosts=allowed_hosts, **kwargs)
def getFilenameFromURL(url):
    '''Gets the filename from a URL'''
    (unused_scheme, unused_netloc, path,
     unused_query, unused_fragment) = urlparse.urlsplit(url)
    return os.path.basename(path)
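# Quick check of getFilenameFromURL with a made-up URL; only the path
# component feeds os.path.basename, so the query string is ignored.
print getFilenameFromURL("http://example.com/downloads/setup.exe?mirror=2")  # setup.exe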
def separate_url(url):
    s = urlparse.urlsplit(url)
    if '@' not in s.netloc:
        parser.error('merged url netloc must contain an "@"')
    userpass, new_netloc = s.netloc.rsplit('@', 1)
    return urlparse.urlunsplit(s._replace(netloc=new_netloc)), userpass
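# Illustration of separate_url on a fabricated URL with embedded credentials.
# Only the happy path is shown, since the parser.error() branch depends on a
# CLI argument parser that is not part of this collection.
clean_url, userpass = separate_url('https://alice:s3cret@example.com/repo')
print clean_url  # https://example.com/repo
print userpass   # alice:s3cret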
def subscribe(service, action='subscribe'):
    """ send a subscribe/renewal/unsubscribe request to a service
        return the device response
    """
    log_category = "event_protocol"
    log.info(log_category, "event.subscribe, action: %r", action)

    _, host_port, path, _, _ = urlsplit(service.get_base_url())
    if host_port.find(':') != -1:
        host, port = tuple(host_port.split(':'))
        port = int(port)
    else:
        host = host_port
        port = 80

    def send_request(p, action):
        log.info(log_category, "event.subscribe.send_request %r, action: %r %r",
                 p, action, service.get_event_sub_url())
        _, _, event_path, _, _ = urlsplit(service.get_event_sub_url())
        if action == 'subscribe':
            timeout = service.timeout
            if timeout == 0:
                timeout = 1800
            request = ["SUBSCRIBE %s HTTP/1.1" % event_path,
                       "HOST: %s:%d" % (host, port),
                       "TIMEOUT: Second-%d" % timeout,
                       ]
            service.event_connection = p
        else:
            request = ["UNSUBSCRIBE %s HTTP/1.1" % event_path,
                       "HOST: %s:%d" % (host, port),
                       ]

        if service.get_sid():
            request.append("SID: %s" % service.get_sid())
        else:
            # XXX use address and port set in the coherence instance
            # ip_address = p.transport.getHost().host
            global hostname, web_server_port
            # print hostname, web_server_port
            url = 'http://%s:%d/events' % (hostname, web_server_port)
            request.append("CALLBACK: <%s>" % url)
            request.append("NT: upnp:event")

        request.append('Date: %s' % datetimeToString())
        request.append("Content-Length: 0")
        request.append("")
        request.append("")
        request = '\r\n'.join(request)
        log.debug(log_category, "event.subscribe.send_request %r %r", request, p)
        try:
            p.transport.writeSomeData(request)
        except AttributeError:
            log.info(log_category, "transport for event %r already gone", action)
        # print "event.subscribe.send_request", d
        # return d

    def got_error(failure, action):
        log.info(log_category, "error on %s request with %s" %
                 (action, service.get_base_url()))
        log.debug(log_category, failure)

    def teardown_connection(c, d):
        log.info(log_category, "event.subscribe.teardown_connection")
        del d
        del c

    def prepare_connection(service, action):
        log.info(log_category, "event.subscribe.prepare_connection action: %r %r",
                 action, service.event_connection)
        if service.event_connection == None:
            c = ClientCreator(reactor, EventProtocol, service=service, action=action)
            log.info(log_category, "event.subscribe.prepare_connection: %r %r",
                     host, port)
            d = c.connectTCP(host, port)
            d.addCallback(send_request, action=action)
            d.addErrback(got_error, action)
            # reactor.callLater(3, teardown_connection, c, d)
        else:
            d = defer.Deferred()
            d.addCallback(send_request, action=action)
            d.callback(service.event_connection)
            # send_request(service.event_connection, action)
        return d

    """ FIXME: we need to find a way to be sure that our unsubscribe calls
        get through on shutdown
        reactor.addSystemEventTrigger('before', 'shutdown',
                                      prepare_connection, service, action)
    """

    return prepare_connection(service, action)
def post_url(url, fields, files=[]):
    # urlsplit index 1 is the host (netloc), index 2 is the path.
    urlparts = urlparse.urlsplit(url)
    return post_multipart(urlparts[1], urlparts[2], fields, files)
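# For reference, urlsplit returns a 5-tuple (scheme, netloc, path, query,
# fragment); post_url above hands index 1 (host) and index 2 (path) to
# post_multipart. The URL here is an example only.
import urlparse
parts = urlparse.urlsplit('http://upload.example.com/api/files?draft=1')
print parts[1], parts[2]  # upload.example.com /api/files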
def __init__(self, params, status, calib_data, service_name):
    """
    :param params: pywws configuration.
    :type params: :class:`pywws.DataStore.params`

    :param status: pywws status store.
    :type status: :class:`pywws.DataStore.status`

    :param calib_data: 'calibrated' data.
    :type calib_data: :class:`pywws.DataStore.calib_store`

    :param service_name: name of service to upload to.
    :type service_name: string
    """
    self.logger = logging.getLogger('pywws.ToService(%s)' % service_name)
    self.params = params
    self.status = status
    self.data = calib_data
    self.service_name = service_name
    # 'derived' services such as 'underground_rf' share their
    # parent's config and templates
    config_section = self.service_name.split('_')[0]
    if config_section == self.service_name:
        self.parent = None
    else:
        self.parent = config_section
    self.old_response = None
    self.old_ex = None
    # set default socket timeout, so urlopen calls don't hang forever
    socket.setdefaulttimeout(30)
    # open params file
    service_params = SafeConfigParser()
    service_params.optionxform = str
    service_params.readfp(pkg_resources.resource_stream(
        'pywws', 'services/%s.ini' % (self.service_name)))
    # get URL
    self.server = service_params.get('config', 'url')
    parsed_url = urlparse.urlsplit(self.server)
    if parsed_url.scheme == 'aprs':
        self.send_data = self.aprs_send_data
        server, port = parsed_url.netloc.split(':')
        self.server = (server, int(port))
    elif parsed_url.scheme == 'mqtt':
        self.send_data = self.mqtt_send_data
    else:
        self.send_data = self.http_send_data
        self.use_get = eval(service_params.get('config', 'use get'))
    # get fixed part of upload data
    self.fixed_data = dict()
    for name, value in service_params.items('fixed'):
        if value[0] == '*':
            value = self.params.get(config_section, value[1:], 'unknown')
        self.fixed_data[name] = value
    # create templater
    self.templater = Template.Template(
        self.params, self.status, self.data, self.data, None, None,
        use_locale=False)
    template_name = 'services/%s_template_%s.txt' % (
        config_section, self.params.get('config', 'ws type'))
    if not pkg_resources.resource_exists('pywws', template_name):
        template_name = 'services/%s_template_1080.txt' % (config_section)
    self.template_file = pkg_resources.resource_stream('pywws', template_name)
    # get other parameters
    self.auth_type = service_params.get('config', 'auth_type')
    if self.auth_type == 'basic':
        user = self.params.get(config_section, 'user', 'unknown')
        password = self.params.get(config_section, 'password', 'unknown')
        self.auth = 'Basic %s' % base64.b64encode('%s:%s' % (user, password))
    self.catchup = eval(service_params.get('config', 'catchup'))
    self.expected_result = eval(service_params.get('config', 'result'))
    self.interval = eval(service_params.get('config', 'interval'))
    self.interval = max(self.interval, 40)
    self.interval = timedelta(seconds=self.interval)
    # move 'last update' from params to status
    last_update = self.params.get_datetime(self.service_name, 'last update')
    if last_update:
        self.params.unset(self.service_name, 'last update')
        self.status.set('last update', self.service_name,
                        last_update.isoformat(' '))
    # set timestamp of first data to upload
    self.next_update = datetime.utcnow() - max(
        timedelta(days=self.catchup), self.interval)
def delete(uri, filepath, depth="infinity"): """ Perform a X{DELETE} operation on the given URI, which is backed by the given filepath. @param filepath: the L{FilePath} to delete. @param depth: the recursion X{Depth} for the X{DELETE} operation, which must be "infinity". @raise HTTPError: (containing a response with a status code of L{responsecode.BAD_REQUEST}) if C{depth} is not "infinity". @raise HTTPError: (containing an appropriate response) if the delete operation fails. If C{filepath} is a directory, the response will be a L{MultiStatusResponse}. @return: a deferred response with a status code of L{responsecode.NO_CONTENT} if the X{DELETE} operation succeeds. """ # # Remove the file(s) # # FIXME: defer if filepath.isdir(): # # RFC 2518, section 8.6 says that we must act as if the Depth header is # set to infinity, and that the client must omit the Depth header or set # it to infinity, meaning that for collections, we will delete all # members. # # This seems somewhat at odds with the notion that a bad request should # be rejected outright; if the client sends a bad depth header, the # client is broken, and RFC 2518, section 8 suggests that a bad request # should be rejected... # # Let's play it safe for now and ignore broken clients. # if depth != "infinity": msg = ("Client sent illegal depth header value for DELETE: %s" % (depth, )) log.error(msg) raise HTTPError(StatusResponse(responsecode.BAD_REQUEST, msg)) # # Recursive delete # # RFC 2518, section 8.6 says that if we get an error deleting a resource # other than the collection in the request-URI, that we must respond # with a multi-status response containing error statuses for each # resource that we fail to delete. It also says we should not return # no-content (success) status, which means that we should continue after # errors, rather than aborting right away. This is interesting in that # it's different from how most operating system tools act (eg. rm) when # recursive filsystem deletes fail. # uri_path = urllib.unquote(urlsplit(uri)[2]) if uri_path[-1] == "/": uri_path = uri_path[:-1] log.info("Deleting directory %s" % (filepath.path, )) # NOTE: len(uri_path) is wrong if os.sep is not one byte long... meh. request_basename = filepath.path[:-len(uri_path)] errors = ResponseQueue(request_basename, "DELETE", responsecode.NO_CONTENT) # FIXME: defer this for dir, subdirs, files in os.walk(filepath.path, topdown=False): for filename in files: path = os.path.join(dir, filename) try: os.remove(path) except: errors.add(path, Failure()) for subdir in subdirs: path = os.path.join(dir, subdir) if os.path.islink(path): try: os.remove(path) except: errors.add(path, Failure()) else: try: os.rmdir(path) except: errors.add(path, Failure()) try: os.rmdir(filepath.path) except: raise HTTPError( statusForFailure(Failure(), "deleting directory: %s" % (filepath.path, ))) response = errors.response() else: # # Delete a file; much simpler, eh? # log.info("Deleting file %s" % (filepath.path, )) try: os.remove(filepath.path) except: raise HTTPError( statusForFailure(Failure(), "deleting file: %s" % (filepath.path, ))) response = responsecode.NO_CONTENT # Remove stat info for filepath since we deleted the backing file filepath.changed() return succeed(response)
def parse_detail_wiki(url, title, desc):
    """
    Parse an API detail wiki page.
    :param url: wiki URL
    :param desc: API description
    :return: void
    """
    print "parsing detail api wiki page...."
    print url
    content = requests.get(url).text
    soup = BeautifulSoup(content, "html.parser")
    api_dict = dict()

    # find basic information
    api_node = soup.find('a', class_='external free')
    if not api_node:
        print url + " does not contain an API path"
        return
    api_url = api_node["href"]
    api_path = urlparse.urlparse(api_url).path
    api_name = api_url.rsplit('/', 1)[1]
    api_category = api_path.rsplit('/')[2]
    api_category = api_category.replace('.', '').title()
    api_signature = api_name.rsplit('.', 1)[0].title().replace('_', "")

    api_dict["url"] = api_url
    api_dict["description"] = desc
    api_dict["method"] = "GET"
    api_dict["signature"] = api_signature
    api_dict["category"] = api_category
    api_dict["path"] = api_path
    api_dict["title"] = title

    console = soup.find('a', text='API测试工具')
    if console:
        console_url = console["href"]
        api_dict["console"] = console_url
        params = dict(urlparse.parse_qsl(urlparse.urlsplit(console_url).query))
        http_method = params.get('httpmethod')
        # fall back to GET when the console URL carries no usable method
        if http_method is None or http_method == 'None':
            api_dict["method"] = 'GET'
        else:
            api_dict["method"] = http_method

    # parse parameters and responses
    tables = soup.find_all('table', class_='parameters')
    if len(tables) == 2:
        parameters, needs_auth = parse_request_parameters(tables[0])
        api_dict["parameters"] = parameters
        api_dict["authorize"] = needs_auth
        response = parse_response_model(tables[1])
        api_dict["response"] = response
    elif len(tables) == 1:
        parameters, needs_auth = parse_request_parameters(tables[0])
        api_dict["parameters"] = parameters
        api_dict["authorize"] = needs_auth
        api_dict["response"] = []
    else:
        print "!!!!!!!!!! detail page error"
        return

    filename = api_url.rsplit('/', 1)[1]
    if not filename.endswith('json'):
        return
    file_path = builds_dir + "/" + api_category + "_" + filename
    with open(file_path, mode='w') as f:
        f.write(json.dumps(api_dict, encoding='UTF-8', ensure_ascii=False,
                           indent=4))
    print "parsing detail page done ====== "
def __cleanup_requests(self):
    """Cleanup handles that have finished their request.
    Return the handles to the freelist.  Generate any
    relevant error information."""

    count, good, bad = self.__mhandle.info_read()
    failures = self.__failures
    done_handles = []
    ex_to_raise = None

    for h, en, em in bad:
        # Get statistics for each handle.
        repostats = self.__xport.stats[h.repourl]
        repostats.record_tx()
        bytes = h.getinfo(pycurl.SIZE_DOWNLOAD)
        seconds = h.getinfo(pycurl.TOTAL_TIME)
        repostats.record_progress(bytes, seconds)

        httpcode = h.getinfo(pycurl.RESPONSE_CODE)
        url = h.url
        urlstem = h.repourl
        proto = urlparse.urlsplit(url)[0]

        # All of these are errors
        repostats.record_error()

        # If we were cancelled, raise an API error.
        # Otherwise fall through to transport's exception
        # generation.
        if en == pycurl.E_ABORTED_BY_CALLBACK:
            ex = None
            ex_to_raise = api_errors.CanceledException
        elif en == pycurl.E_HTTP_RETURNED_ERROR:
            ex = tx.TransportProtoError(proto, httpcode, url,
                                        repourl=urlstem)
        else:
            ex = tx.TransportFrameworkError(en, url, em,
                                            repourl=urlstem)

        if ex and ex.retryable:
            failures.append(ex)
        elif ex and not ex_to_raise:
            ex_to_raise = ex

        done_handles.append(h)

    for h in good:
        # Get statistics for each handle.
        repostats = self.__xport.stats[h.repourl]
        repostats.record_tx()
        bytes = h.getinfo(pycurl.SIZE_DOWNLOAD)
        seconds = h.getinfo(pycurl.TOTAL_TIME)
        h.filetime = h.getinfo(pycurl.INFO_FILETIME)
        repostats.record_progress(bytes, seconds)

        httpcode = h.getinfo(pycurl.RESPONSE_CODE)
        url = h.url
        urlstem = h.repourl
        proto = urlparse.urlsplit(url)[0]

        if httpcode == httplib.OK:
            h.success = True
        else:
            ex = tx.TransportProtoError(proto, httpcode, url,
                                        repourl=urlstem)

            # If code >= 400, record this as an error.
            # Handlers above the engine get to decide
            # for 200/300 codes that aren't OK.
            if httpcode >= 400:
                repostats.record_error()
            # If code == 0, libcurl failed to read
            # any HTTP status.  Response is almost
            # certainly corrupted.
            elif httpcode == 0:
                reason = "Invalid HTTP status code from server"
                ex = tx.TransportProtoError(proto, url=url, reason=reason,
                                            repourl=urlstem)
                ex.retryable = True

            # Stash retryable failures, arrange
            # to raise first fatal error after
            # cleanup.
            if ex.retryable:
                failures.append(ex)
            elif not ex_to_raise:
                ex_to_raise = ex

        done_handles.append(h)

    # Call to remove_handle must be separate from info_read()
    for h in done_handles:
        self.__mhandle.remove_handle(h)
        self.__teardown_handle(h)
        self.__freehandles.append(h)

    self.__failures = failures
    if ex_to_raise:
        raise ex_to_raise
def render_template(self, template_name, output_name, context):
    data = self.template_system.render_template(
        template_name, None, context, self.GLOBAL_CONTEXT)

    assert output_name.startswith(self.config["OUTPUT_FOLDER"])
    url_part = output_name[len(self.config["OUTPUT_FOLDER"]) + 1:]

    # This is to support windows paths
    url_part = "/".join(url_part.split(os.sep))

    src = urlparse.urljoin(self.config["BLOG_URL"], url_part)

    parsed_src = urlparse.urlsplit(src)
    src_elems = parsed_src.path.split('/')[1:]

    def replacer(dst):
        # Refuse to replace links that are full URLs.
        dst_url = urlparse.urlparse(dst)
        if dst_url.netloc:
            if dst_url.scheme == 'link':  # Magic link
                dst = self.link(dst_url.netloc, dst_url.path.lstrip('/'),
                                context['lang'])
            else:
                return dst

        # Normalize
        dst = urlparse.urljoin(src, dst)

        # Avoid empty links.
        if src == dst:
            return "#"

        # Check that link can be made relative, otherwise return dest
        parsed_dst = urlparse.urlsplit(dst)
        if parsed_src[:2] != parsed_dst[:2]:
            return dst

        # Now both paths are on the same site and absolute
        dst_elems = parsed_dst.path.split('/')[1:]

        i = 0
        for (i, s), d in zip(enumerate(src_elems), dst_elems):
            if s != d:
                break
        # Now i is the longest common prefix
        result = '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])

        if not result:
            result = "."

        # Don't forget the fragment (anchor) part of the link
        if parsed_dst.fragment:
            result += "#" + parsed_dst.fragment

        assert result, (src, dst, i, src_elems, dst_elems)

        return result

    try:
        os.makedirs(os.path.dirname(output_name))
    except:
        pass
    doc = lxml.html.document_fromstring(data)
    doc.rewrite_links(replacer)
    data = '<!DOCTYPE html>' + lxml.html.tostring(doc, encoding='utf8')
    with open(output_name, "w+") as post_file:
        post_file.write(data)
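# Standalone sketch of the relativizing step inside replacer() above, using
# made-up source/destination URLs on the same site; it walks the common path
# prefix and backs out of the remaining source directories with '..'.
import urlparse

src = "http://blog.example.com/posts/2013/hello.html"
dst = "http://blog.example.com/images/logo.png"
src_elems = urlparse.urlsplit(src).path.split('/')[1:]
dst_elems = urlparse.urlsplit(dst).path.split('/')[1:]
i = 0
for (i, s), d in zip(enumerate(src_elems), dst_elems):
    if s != d:
        break
print '/'.join(['..'] * (len(src_elems) - i - 1) + dst_elems[i:])
# ../../images/logo.png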
def thumbvid(self):
    vid = urlparse.parse_qs(urlparse.urlsplit(self.url).query)['v'][0]
    # note the trailing spaces, so the concatenated attributes stay separated
    embed = '<iframe width="120" height="90" '
    embed += 'src="http://www.youtube.com/embed/%s" ' % (vid, )
    embed += 'frameborder="0" allowfullscreen></iframe>'
    return embed
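# The video id extraction above, shown on a throwaway YouTube-style URL:
# parse_qs maps each query key to a list of values, hence the ['v'][0].
import urlparse
url = 'http://www.youtube.com/watch?v=dQw4w9WgXcQ&feature=share'
print urlparse.parse_qs(urlparse.urlsplit(url).query)['v'][0]  # dQw4w9WgXcQ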
def make_url_protocol_relative(url):
    if not url or url.startswith("//"):
        return url
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    # an empty string (not None) is the proper "no scheme" component
    return urlparse.urlunsplit(("", netloc, path, query, fragment))
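# Example of the helper above on a fabricated URL: the scheme is dropped,
# everything else is kept.
print make_url_protocol_relative("https://cdn.example.com/app.js?v=3")
# //cdn.example.com/app.js?v=3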
def __call__(self, env):
    is_https = (env['REQUEST_METHOD'] == 'CONNECT')

    # for non-https requests, check non-proxy urls
    if not is_https:
        url = env['REL_REQUEST_URI']

        if not url.startswith(('http://', 'https://')):
            return None

        env['pywb.proxy_scheme'] = 'http'

    route = None
    coll = None
    matcher = None
    response = None
    ts = None

    # check resolver, for pre connect resolve
    if self.resolver.pre_connect:
        route, coll, matcher, ts, response = self.resolver.resolve(env)
        if response:
            return response

    # do connect, then get updated url
    if is_https:
        response = self.handle_connect(env)
        if response:
            return response

        url = env['REL_REQUEST_URI']
    else:
        parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
        hostport = parts.netloc.split(':', 1)
        env['pywb.proxy_host'] = hostport[0]
        env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
        env['pywb.proxy_req_uri'] = parts.path
        if parts.query:
            env['pywb.proxy_req_uri'] += '?' + parts.query

    env['pywb_proxy_magic'] = self.magic_name

    # route (static) and other resources to archival replay
    if env['pywb.proxy_host'] == self.magic_name:
        env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']

        # special case for proxy install
        response = self.handle_cert_install(env)
        if response:
            return response

        return None

    # check resolver, post connect
    if not self.resolver.pre_connect:
        route, coll, matcher, ts, response = self.resolver.resolve(env)
        if response:
            return response

    host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
    rel_prefix = ''

    # special case for proxy calendar
    if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
        url = env['pywb.proxy_req_uri'][1:]
        rel_prefix = '/'

    if ts is not None:
        url = ts + '/' + url

    wbrequest = route.request_class(env,
                                    request_uri=url,
                                    wb_url_str=url,
                                    coll=coll,
                                    host_prefix=host_prefix,
                                    rel_prefix=rel_prefix,
                                    wburl_class=route.handler.get_wburl_type(),
                                    urlrewriter_class=HttpsUrlRewriter,
                                    use_abs_prefix=False,
                                    is_proxy=True)

    if matcher:
        route.apply_filters(wbrequest, matcher)

    if self.unaltered:
        wbrequest.wb_url.mod = 'id_'
    elif is_https:
        wbrequest.wb_url.mod = 'bn_'

    response = route.handler(wbrequest)

    if wbrequest.wb_url and wbrequest.wb_url.is_replay():
        response.status_headers.replace_headers(self.extra_headers)

    return response
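# Minimal illustration of the host/port split used above; the proxied URI is
# invented, and the netloc may or may not carry an explicit port.
import urlparse
parts = urlparse.urlsplit('http://proxy.example.net:8080/record?id=7')
hostport = parts.netloc.split(':', 1)
print hostport[0], (hostport[1] if len(hostport) == 2 else '')
# proxy.example.net 8080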
def getOrigin(url):
    origin = urlparse.urlsplit(url)
    # only scheme and netloc are kept, so mixing urlsplit/urlunparse is safe
    origin = urlparse.urlunparse(
        (origin.scheme, origin.netloc, '', '', '', ''))
    return origin
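# Origin extraction demo with a fabricated URL: path, query and fragment are
# all discarded, leaving scheme://host[:port].
print getOrigin("https://api.example.com:8443/v1/users?id=42#top")
# https://api.example.com:8443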
def make_url_https(url):
    if not url or url.startswith("https://"):
        return url
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    return urlparse.urlunsplit(("https", netloc, path, query, fragment))
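# Upgrade demo on example URLs; a protocol-relative input also gains the
# https scheme because urlsplit still recognises its netloc.
print make_url_https("http://static.example.org/img/a.png")  # https://static.example.org/img/a.png
print make_url_https("//static.example.org/img/a.png")       # https://static.example.org/img/a.png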