import urlparse  # Python 2 module; on Python 3 this is urllib.parse

def getLockUri(ouri):
    '''
    Gets the lockfile uri from the given ouri
    @param ouri: osaka-uri to wrap with the lock
    '''
    # INTERLOCK_NAME_TEMPLATE is a module-level format string, e.g. "{}.lock"
    parsed = urlparse.urlparse(ouri)
    parsed = parsed._replace(path=INTERLOCK_NAME_TEMPLATE.format(parsed.path.rstrip("/")))
    return parsed.geturl()
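# Usage sketch for getLockUri (hypothetical values; assumes the module defines
# something like INTERLOCK_NAME_TEMPLATE = "{}.lock"):
#
#   getLockUri("s3://bucket/products/item/")
#   # -> "s3://bucket/products/item.lock"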
# Method of the EdgeGrid auth class; expects urlparse/urlunparse
# (urlparse module on Python 2, urllib.parse on Python 3) and a module-level logger.
def make_data_to_sign(self, r, auth_header):
    if self.testurl:
        # Replace the scheme and host of the real request URL with those of the test URL
        testparts = urlparse(self.testurl)
        requestparts = urlparse(r.url)
        url = urlunparse(testparts[0:2] + requestparts[2:])
    else:
        url = r.url
    parsed_url = urlparse(url)
    data_to_sign = '\t'.join([
        r.method,
        parsed_url.scheme,
        parsed_url.netloc,
        # Note: relative URL constraints are handled by requests when it sets up 'r'
        parsed_url.path + ('?' + parsed_url.query if parsed_url.query else ""),
        self.canonicalize_headers(r),
        self.make_content_hash(r),
        auth_header
    ])
    logger.debug('data to sign: %s', '\\t'.join(data_to_sign.split('\t')))
    return data_to_sign
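# For reference, the value being signed is a single tab-joined line. A hedged
# sketch of its shape (field values are illustrative, not from a real request):
#
#   GET \t https \t example.akamaiapis.net \t /path?query \t <canonical-headers> \t <content-hash> \t <auth-header>
#
# The debug call replaces real tabs with the literal '\t' so the logged line
# stays readable on one line.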
# Method of a unittest.TestCase; expects urlparse (urllib.parse on Python 3).
def online_testcase(self):
    """This test is intended not to run with every project test run.
    If you want to run it, invoke this method specifically.
    """
    google_search = GoogleSearch('essanpupil')
    google_search.start_search()
    for item in google_search.search_result:
        url = urlparse(item)
        # urlparse reports a missing scheme as '', never None, so test truthiness
        if url.scheme:
            self.assertIn('http', url.scheme)
        else:
            self.fail('Parsing failed!')
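# Background for the scheme check above: urlparse returns the empty string,
# not None, when a URL has no scheme, so an 'is not None' test never fails:
#
#   urlparse('http://example.org').scheme  # -> 'http'
#   urlparse('example.org').scheme         # -> ''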
from urlparse import urlparse, parse_qs  # Python 2; urllib.parse on Python 3

def _guess_atta(fb_atta, base_text):
    text = ''
    # Video share: the real target URL is wrapped in the 'u' query parameter
    if 'type' in fb_atta and fb_atta['type'].startswith('video'):
        parsed_url = urlparse(fb_atta['url'])
        parsed_qs = parse_qs(parsed_url.query)
        if parsed_qs['u'][0] not in base_text:
            text += '\n\n' + parsed_qs['u'][0]
    # Album/multi-photo post: one image per subattachment
    elif 'subattachments' in fb_atta:
        for fb_atta_ in fb_atta['subattachments']['data']:
            image_url = fb_atta_['media']['image']['src']
            text += "\n\n![](%s)" % image_url
    # Single image attachment
    elif 'media' in fb_atta:
        if 'image' in fb_atta['media']:
            image_url = fb_atta['media']['image']['src']
            text += "\n![](%s)" % image_url
    return text
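# A minimal sketch of the attachment shapes _guess_atta handles. The field
# names mirror the Facebook Graph API attachment structure assumed above;
# the values are made up:

video_atta = {'type': 'video_share',
              'url': 'https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.org%2Fclip'}
image_atta = {'media': {'image': {'src': 'https://example.org/pic.jpg'}}}
# _guess_atta(video_atta, '')  -> '\n\nhttps://example.org/clip'
# _guess_atta(image_atta, '')  -> '\n![](https://example.org/pic.jpg)'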
import re
import urlparse      # Python 2; urllib.parse on Python 3
import robotparser   # Python 2; urllib.robotparser on Python 3

# CONFIG, ROBOTS_TXT_BLACKLIST_DOMAINS and rpcache are module-level globals.
# Fail-closed variant: if robots.txt cannot be read, the URL is denied.
def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None
        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
# Same as the variant above except fail-open: if robots.txt cannot be read,
# the URL is allowed rather than denied.
def urlallowed(url):
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None
        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else True
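# A minimal standalone sketch of the robots.txt check that both variants wrap
# (Python 2 module names, matching the code above; the URL is illustrative):

import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://example.org/robots.txt')
rp.read()  # fetches and parses robots.txt over the network
allowed = rp.can_fetch('mybot/1.0', 'http://example.org/some/page')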
# Method of a crawler class; expects urlparse (urllib.parse on Python 3),
# robotparser (urllib.robotparser on Python 3), and the urlencode() helper below.
def is_fetch_allowed_by_robots_txt(self, url):
    scheme, netloc, path, _, _, _ = urlparse(url)
    if path == '/robots.txt':
        return True
    robots_txt_url = '%s://%s/robots.txt' % (scheme, netloc)
    checker = self.robotcheckers.get(robots_txt_url)
    if checker is None:
        doc = self.fetch(robots_txt_url)
        robots_txt = doc.content if doc.status == 200 else ''
        checker = robotparser.RobotFileParser()
        checker.set_url(robots_txt_url)
        # RobotFileParser.parse() expects an iterable of lines, not one string
        checker.parse(robots_txt.splitlines())
        self.robotcheckers[robots_txt_url] = checker

    # Work around a RobotFileParser bug which makes it crash when
    # a URL contains non-ASCII characters, even when they are perfectly
    # escaped. (The library seems to make a hard-coded assumption that
    # URLs are encoded in ISO 8859-1 instead of UTF-8 before being escaped;
    # this had been true in the very early days of the web, but not
    # anymore.) To work around this bug, we double-encode the URL
    # for the purpose of robots checking; this prevents the crash.
    return checker.can_fetch(useragent=self.useragent_for_robots_txt,
                             url=urlencode(url))
import urllib  # Python 2; urllib.parse.quote on Python 3
from urlparse import urlparse, urlunparse

def urlencode(url):
    # Re-encode a (possibly unicode) URL into plain ASCII bytes: IDNA for the
    # hostname, percent-escaped UTF-8 for everything after it. Already-escaped
    # characters get escaped again; that double-encoding is exactly what the
    # robots.txt workaround above relies on.
    p = list(urlparse(url))
    p[1] = p[1].encode('idna')
    for i in range(2, len(p)):
        p[i] = urllib.quote(p[i].encode('utf-8'))
    return urlunparse(p).encode('ascii')
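# Illustration of what this produces for the robots.txt workaround above
# (Python 2; the URL is illustrative):
#
#   urlencode(u'http://b\xfccher.example/caf\xe9')
#   # -> 'http://xn--bcher-kva.example/caf%C3%A9'
#
# The host goes through IDNA; path, query and fragment are UTF-8 encoded and
# percent-escaped, so the result is guaranteed to be plain ASCII bytes.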
from urlparse import urlparse  # Python 2; urllib.parse on Python 3

def urlpath(url):
    "'http://example.org/foo/bar.html?baz#qux' --> '/foo/bar.html'"
    return urlparse(url)[2]
import urlparse  # Python 2; urllib.parse on Python 3
from urllib import urlencode

def set_url_arg(url, arg, val):
    # Rebuild the URL with the query-string parameter 'arg' set to 'val',
    # preserving any other existing parameters.
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update({arg: str(val)})
    url_parts[4] = urlencode(query)
    return urlparse.urlunparse(url_parts)
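# Usage sketch (illustrative URL). Existing parameters are preserved and the
# named one overwritten; note that parameter order in the rebuilt query is not
# guaranteed, since the pairs round-trip through a dict:

set_url_arg('http://example.org/search?q=cats&page=1', 'page', 2)
# -> 'http://example.org/search?page=2&q=cats' (order may vary)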