from urllib.parse import urljoin, urlsplit

def normalize_link(link):
    """Normalize the link for crawling, or return None if it should be ignored.

    `url` (the page the link was found on), `local`, `external`, and the
    `common` helper module come from the enclosing scope.
    """
    if urlsplit(link).scheme in ('http', 'https', ''):
        if '#' in link:
            # strip the fragment to avoid crawling duplicates
            link = link[:link.index('#')]
        if url:
            # resolve relative links against the current page
            link = urljoin(url, link)
            if not local and common.same_domain(url, link):
                link = None  # local links not included
            elif not external and not common.same_domain(url, link):
                # elif avoids passing None to same_domain after the local check
                link = None  # external links not included
    else:
        link = None  # ignore mailto, etc
    return link
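For context, here is a minimal, self-contained sketch of how such a normalizer might be driven over the raw hrefs extracted from a page. The `get_links` wrapper, its signature, and the `same_domain` stand-in are illustrative assumptions, not part of the excerpt above:

from urllib.parse import urljoin, urlsplit

def same_domain(url1, url2):
    # stand-in for common.same_domain: compare hostnames only
    return urlsplit(url1).netloc.lower() == urlsplit(url2).netloc.lower()

def get_links(url, links, local=True, external=False):
    # hypothetical driver: normalize each raw href found on the page at `url`
    normalized = []
    for link in links:
        if urlsplit(link).scheme in ('http', 'https', ''):
            if '#' in link:
                link = link[:link.index('#')]  # strip fragment
            link = urljoin(url, link)  # resolve relative hrefs
            if not local and same_domain(url, link):
                continue  # skip same-domain links
            if not external and not same_domain(url, link):
                continue  # skip off-site links
            normalized.append(link)
    return normalized

print(get_links('http://example.com/a.html',
                ['/b.html#top', 'mailto:me@example.com', 'http://other.com/']))
# ['http://example.com/b.html']

With the default flags, relative links are resolved and kept, fragments are dropped, and non-HTTP schemes such as mailto are discarded.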
def valid(link):
    """Check whether this link should be crawled.

    `domain`, the `common` and `settings` modules, the download object `D`,
    and `self` come from the enclosing scope.
    """
    # check if a media file
    if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
        # check if a proper HTTP link
        if link.lower().startswith('http'):
            # only crawl within website
            if common.same_domain(domain, link):
                # passes regex
                if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                    # not blocked by robots.txt
                    if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                        # allowed to recrawl
                        if self.crawl_existing or (D.cache and link not in D.cache):
                            return True
    return False
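The robots.txt check assumes `self.robots` exposes the same interface as the standard library's `urllib.robotparser.RobotFileParser`. A minimal sketch of preparing such a parser follows; the robots.txt URL, the user-agent string, and the test URL are placeholders, and `read()` fetches the file over the network:

from urllib import robotparser

robots = robotparser.RobotFileParser()
robots.set_url('http://example.com/robots.txt')
robots.read()

# same call shape as self.robots.can_fetch(settings.user_agent, link)
print(robots.can_fetch('MyCrawler', 'http://example.com/private/page.html'))

If no robots.txt is available, `valid` treats the link as fetchable via the `not self.robots` short-circuit, so the parser is optional.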