def get_url_type(self, url):
    """Classify *url* as ``"domain"``, ``"subdomain"``, or ``"others"``.

    A url accepted by ``is_domain_url()`` counts as a plain "domain"
    when it has no subdomain part (or only the conventional "www");
    any other subdomain makes it a "subdomain". Everything rejected by
    ``is_domain_url()`` is "others".
    """
    subdomain, _, _ = tldextract.extract(url)
    if not self.is_domain_url(url):
        return "others"
    # "www" is treated the same as the bare registered domain.
    return "domain" if subdomain in ("", "www") else "subdomain"
def normalize_url(self, url, base_url = None):
    """Normalize *url* into a canonical unicode absolute URL, or return None.

    Pipeline (Python 2 string semantics): decode utf-8 -> fix scheme ->
    resolve against *base_url* -> urlnorm -> validate scheme/domain ->
    repair netloc -> reassemble. Any validation failure is logged and
    yields None.

    :param url: url to normalize; ``str`` (assumed utf-8) or ``unicode``.
    :param base_url: optional base used to resolve a relative *url*.
    :returns: the normalized unicode url, or None when invalid.
    """
    # Reject empty input up front.
    if url is None or len(url) == 0:
        return None
    original_url = url
    # NOTE: assumes all non-unicode urls are encoded as utf-8.
    if isinstance(url, str):
        url = url.decode("utf-8")
    # NOTE(review): `logging` here is called with keyword context
    # (url=..., base_url=...) — presumably a structured-logging wrapper,
    # not the stdlib logging module; confirm against the file's imports.
    if not isinstance(url, unicode):
        logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
        return None
    # Turn percent-encoded spaces back into spaces and trim whitespace.
    url = url.replace('%20', ' ').strip()
    # Fix http scheme (e.g. a missing or malformed "http://" prefix —
    # exact repair is delegated to the helper).
    url = self._fix_http_scheme(url)
    # Handle relative url: resolve against the base when one is given.
    if base_url is not None:
        url = urlparse.urljoin(base_url, url)
    # Common normalization via the third-party urlnorm library.
    try:
        url = urlnorm.norm(url)
    except Exception as e:
        logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
        return None
    try:
        parse_result = urlparse.urlparse(url)
    except Exception as e:
        logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
        return None
    # Only schemes whitelisted in the crawl policy settings are accepted.
    if not parse_result.scheme.lower() in self._settings["general_crawl_policies"]["supported_schemes"]:
        logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
        return None
    netloc = parse_result.netloc
    # Strip any ":port" suffix to get the bare host.
    host = parse_result.netloc.split(':')[0]
    # Host is NOT an IP literal: validate and repair the hostname.
    if ip_regex.match(host) is None:
        # Check that a registered domain and a public suffix both exist.
        subdomain, domain, tld = tldextract.extract(host)
        if len(domain) == 0 or len(tld) == 0:
            logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
            return None
        # Replace Chinese full-width punctuation in the netloc with the
        # ASCII equivalents. Assumes chinese_punctuation_map is a pair of
        # equal-length parallel sequences (src chars, dst chars) — TODO
        # confirm against its definition elsewhere in the module.
        for i in range(len(chinese_punctuation_map[0])):
            src = chinese_punctuation_map[0][i]
            dst = chinese_punctuation_map[1][i]
            netloc = netloc.replace(src, dst)
        # Prepend "www." when the host has no subdomain at all.
        if len(subdomain) == 0:
            netloc = "www." + netloc
    fragment = parse_result.fragment
    # Keep only "#!" fragments (Google's AJAX-crawling recommendation);
    # drop every other fragment.
    if not fragment.startswith("!"):
        fragment = ""
    if len(parse_result.scheme) == 0 or len(netloc) == 0:
        logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
        return None
    # Reassemble the url from the (possibly repaired) components.
    url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))
    # Canonicalize url.
    # NOTE: it's too strong, and sometimes changes the url semantics,
    # so it stays disabled.
    #url = ccrawler.utils.url.canonicalize_url(url)
    url = url.strip()
    # Final length checks against the configured policy limit.
    if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
        logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
        return None
    elif len(url) == 0:
        logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
        return None
    else:
        return url
def get_url_domain_info(url):
    """Extract domain information from *url*.

    Returns a ``(domain, full_domain, host)`` tuple, e.g. for
    "http://a.example.com/x": ``("example", "example.com",
    "a.example.com")``. Sections are combined with ``section_join``,
    which presumably skips empty parts — confirm against its definition.
    """
    sub, dom, tld = tldextract.extract(url)
    registered = section_join(dom, tld)
    return dom, registered, section_join(sub, registered)