def preprocess_link(self, referrer, url):
    """Normalize a discovered link before crawling.

    Resolves *url* against *referrer*, strips a trailing slash and any
    fragment, and returns the absolute URL — or None when the link is
    empty, uses a non-http(s) scheme, or either its http or https
    variant is already present in self.url_set.
    """
    if not url:
        return None
    # Resolve relative links and take the URL apart.
    parts = urlsplit(urljoin(referrer, url))
    # Drop exactly one trailing "/" so /a/ and /a dedupe together.
    path = parts.path[:-1] if parts.path.endswith('/') else parts.path
    # Discard in-page targets (#fragment).
    normalized = SplitResult(parts.scheme, parts.netloc, path, parts.query, '')
    if normalized.scheme not in ('http', 'https'):
        return None
    newurl = normalized.geturl()
    if normalized.scheme == 'http':
        variants = (newurl, newurl.replace('http:', 'https:', 1))
    else:
        variants = (newurl.replace('https:', 'http:', 1), newurl)
    # Treat http/https as the same page for de-duplication.
    if any(v in self.url_set for v in variants):
        return None
    return newurl
def build_url(url):
    """Assemble a URL string from a dictionary of URL components.

    *url* maps UrlParser keys (SCHEME, HOSTNAME, PORT, USERNAME, PASSWORD,
    NETLOC, PATH, FILENAME, QUERY) to values. SCHEME is mandatory; QUERY
    and FRAGMENT default to empty.

    Raises:
        Exception: when the dictionary is empty or has no scheme.
    """
    url_result = {UrlParser.QUERY: "", UrlParser.FRAGMENT: ""}
    if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
        raise Exception("UrlParser:build_url", "Url dictionary is empty or missing key values")
    url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]
    # Reuse a pre-built netloc only when it already embeds the username.
    if UrlParser.NETLOC in url and url[UrlParser.NETLOC]:
        if (
            UrlParser.USERNAME in url
            and url[UrlParser.USERNAME]
            and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]
        ):
            url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
    if UrlParser.NETLOC not in url_result:
        url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
        if UrlParser.PORT in url and url[UrlParser.PORT]:
            # FIX: the port must be separated from the host with ":";
            # appending it bare produced netlocs like "host8080".
            url_result[UrlParser.NETLOC] += ":" + str(url[UrlParser.PORT])
        if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
            credentials = "{}@".format(url[UrlParser.USERNAME])
            if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                credentials = "{}:{}@".format(url[UrlParser.USERNAME], url[UrlParser.PASSWORD])
            url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]
    url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
    if UrlParser.PATH in url and url[UrlParser.PATH]:
        url_result[UrlParser.PATH] = url[UrlParser.PATH] + "/" + url_result[UrlParser.PATH]
        # Collapse duplicate slashes introduced by the join above.
        url_result[UrlParser.PATH] = re.sub("//+", "/", url_result[UrlParser.PATH])
    if UrlParser.QUERY in url and url[UrlParser.QUERY]:
        url_result[UrlParser.QUERY] = url[UrlParser.QUERY]
    result = SplitResult(**url_result)
    return result.geturl()
def build_url(url):
    """Assemble a URL string from a dictionary of URL components.

    *url* maps UrlParser keys (SCHEME, HOSTNAME, PORT, USERNAME, PASSWORD,
    NETLOC, PATH, FILENAME, QUERY) to values. SCHEME is mandatory; QUERY
    and FRAGMENT default to empty.

    Raises:
        Exception: when the dictionary is empty or has no scheme.
    """
    url_result = {UrlParser.QUERY: '', UrlParser.FRAGMENT: ''}
    if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
        raise Exception('UrlParser:build_url', 'Url dictionary is empty or missing key values')
    url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]
    # Reuse a pre-built netloc only when it already embeds the username.
    if UrlParser.USERNAME in url \
            and url[UrlParser.USERNAME] \
            and UrlParser.NETLOC in url and url[UrlParser.NETLOC] \
            and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]:
        url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
    if UrlParser.NETLOC not in url_result:
        url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
        if UrlParser.PORT in url and url[UrlParser.PORT]:
            # FIX: the port must be separated from the host with ":";
            # appending it bare produced netlocs like "host8080".
            url_result[UrlParser.NETLOC] += ':' + str(url[UrlParser.PORT])
        if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
            credentials = '{}@'.format(url[UrlParser.USERNAME])
            if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                credentials = '{}:{}@'.format(url[UrlParser.USERNAME], url[UrlParser.PASSWORD])
            url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]
    url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
    if UrlParser.PATH in url and url[UrlParser.PATH]:
        url_result[UrlParser.PATH] = url[UrlParser.PATH] + '/' + url_result[UrlParser.PATH]
        # Collapse duplicate slashes introduced by the join above.
        url_result[UrlParser.PATH] = re.sub('//+', '/', url_result[UrlParser.PATH])
    if UrlParser.QUERY in url and url[UrlParser.QUERY]:
        url_result[UrlParser.QUERY] = url[UrlParser.QUERY]
    result = SplitResult(**url_result)
    return result.geturl()
def clean_link(url):
    """Return *url* stripped of its fragment and of tracking (utm) query
    parameters, or None when the scheme is not allowed."""
    parts = urlsplit(url)
    if parts.scheme.lower() not in ALLOWED_URL_SCHEMES:
        return None
    query = parts.query
    # Re-run the substitution until the matcher finds nothing more;
    # a single sub() pass may leave new matches behind.
    while query and __utm_matcher.search(query):
        query = __utm_matcher.sub('', query)
    return SplitResult(parts.scheme, parts.netloc, parts.path, query, '').geturl()
def _parseurl(self, url):
    """Split credentials out of *url*.

    Stores the username and password on the instance and rebuilds
    self.url with only "hostname[:port]" as the netloc, i.e. with the
    "user:password@" prefix removed.
    """
    ret = urlsplit(url)
    self.username = ret.username
    self.password = ret.password
    # FIX: replaced the Python-2-only "<>" operator (a syntax error on
    # Python 3) with "is not None", and dropped the __str__() call.
    if ret.port is not None:
        netloc = "{0}:{1}".format(ret.hostname, ret.port)
    else:
        netloc = ret.hostname
    n = SplitResult(ret.scheme, netloc, ret.path, ret.query, ret.fragment)
    self.url = n.geturl()
def with_port(url_str):
    """Return *url_str* with settings.PORT appended to its netloc.

    The port is left out when settings has no PORT, when it is 80 (the
    HTTP default), when the URL already carries an explicit port, or
    when the URL has no netloc at all.
    """
    port = getattr(settings, 'PORT', None)
    if port == 80:
        # The default HTTP port needs no explicit mention.
        port = None
    parts = urlsplit(url_str)
    if port and parts.netloc and not parts.port:
        parts = SplitResult(parts.scheme, "%s:%s" % (parts.netloc, port),
                            parts.path, parts.query, parts.fragment)
    return parts.geturl()
def find_next_indexes(soup):
    '''Return the URLs of every page (p=1..max) for an album index or an
    album page, derived from the "pix-navi-page" pagination links.
    '''
    urls = []
    indexes = soup.findAll('a', 'pix-navi-page')
    if not indexes:
        return urls
    # FIX: tag.string is None for anchors with nested markup, which made
    # .isdigit() raise AttributeError; skip those and non-numeric labels.
    page_numbers = [int(tag.string) for tag in indexes
                    if tag.string and tag.string.isdigit()]
    if not page_numbers:
        # FIX: max() on an empty sequence raises ValueError.
        return urls
    max_p = max(page_numbers)
    result = urlsplit(httplib.html_unescape(indexes[0]['href']))
    # i don't want patch urllib.unquote. bug description:
    # http://bugs.python.org/issue1712522 — quick fix is convert to ascii.
    query_dict = parse_qs(result.query.encode('ascii'))
    for p in range(1, max_p + 1):
        query_dict['p'] = p
        result = SplitResult(result.scheme, result.netloc, result.path,
                             urlencode(query_dict, doseq=True), result.fragment)
        urls.append(result.geturl())
    return urls
def __init__(self, url):
    """Parse *url* into scheme, path and filename parts for a MIB source.

    Raises:
        ValueError: when the URL has no path component.
    """
    parsed = urlsplit(url)
    if not parsed.path:
        raise ValueError("Invalid argument for MIB source: %s" % url)
    # Default the scheme: host-only URLs are http, bare paths are files.
    scheme = parsed.scheme or ("http" if parsed.netloc else "file")
    path = parsed.path
    if scheme == "file" and not path.startswith("/"):
        # Anchor relative file paths to the current working directory.
        path = os.path.abspath("./" + path)
    normalized = SplitResult(scheme, parsed.netloc, path, parsed.query, parsed.fragment)
    self._url = normalized.geturl()
    self._scheme = scheme
    self._path = normalized.path
    self._filename = os.path.split(normalized.path)[-1]