def fetch_domains(self):
    '''Rebuild self.domains from self.links and return it.

    Each entry of self.links is wrapped in a Link together with self.url;
    the ``domain`` attribute of every Link that reports itself valid is
    collected, in the order the links appear.
    '''
    self.domains = []
    for raw_link in self.links:
        candidate = Link(raw_link, self.url)
        if not candidate.is_valid():
            continue
        self.domains.append(candidate.domain)
    return self.domains
def fetch_domains_id(self):
    '''Rebuild self.domain_ids from self.links and return it.

    Each entry of self.links is wrapped in a Link together with self.url;
    the ``netloc`` attribute of every Link that reports itself valid is
    collected, in the order the links appear.
    '''
    self.domain_ids = []
    for raw_link in self.links:
        candidate = Link(raw_link, self.url)
        if not candidate.is_valid():
            continue
        self.domain_ids.append(candidate.netloc)
    return self.domain_ids
def is_valid(self):
    '''Check this object's own url by delegating to Link.

    Returns True when Link(self.url, self.source_url, self.debug) reports
    itself valid. Otherwise copies the link's ``msg`` and ``code`` onto
    this object, sets ``step`` to "Validating page" and ``status`` to
    False, and returns False.
    '''
    link = Link(self.url, self.source_url, self.debug)
    if not link.is_valid():
        # Keep the failure details from the Link on this object.
        self.msg = link.msg
        self.code = link.code
        self.step = "Validating page"
        self.status = False
        return False
    return True
def fetch_links(self):
    '''Extract outgoing links, their domains and their domain ids.

    Reads the ``href`` of every ``a`` element found in self.doc
    (presumably a BeautifulSoup document -- confirm against the caller),
    then drops hrefs that are missing, empty, exactly "/", or that start
    with "#". ``mailto`` and ``javascript`` hrefs are skipped as well.

    Every remaining href is wrapped in a Link; when the Link is valid,
    ``clean_url(href, self.url)`` supplies a (url, domain, domain_id)
    triple whose parts are appended to self.links, self.domains and
    self.domain_ids respectively.

    Returns:
        The tuple (self.links, self.domains, self.domain_ids).
    '''
    self.domains = []
    self.links = []
    self.domain_ids = []

    hrefs = [anchor.get('href') for anchor in self.doc.find_all("a")]
    for href in hrefs:
        if not href or href == "/" or href.startswith("#"):
            continue
        # The earlier `if mailto: pass` followed by a separate if/else let
        # mailto hrefs fall through and be processed; skip both schemes.
        if href.startswith(('mailto', 'javascript')):
            continue
        link = Link(href)
        if not link.is_valid():
            continue
        cleaned_url, domain, domain_id = link.clean_url(href, self.url)
        self.domains.append(domain)
        self.links.append(cleaned_url)
        self.domain_ids.append(domain_id)
    return (self.links, self.domains, self.domain_ids)
def fetch_links(self):
    '''Extract outgoing links, their domains and their domain ids.

    Reads the ``href`` of every ``a`` element found in self.doc
    (presumably a BeautifulSoup document -- confirm against the caller),
    then drops hrefs that are missing, empty, exactly "/", or that start
    with "#". ``mailto`` and ``javascript`` hrefs are skipped as well.

    Every remaining href is wrapped in a Link; when the Link is valid,
    ``clean_url(href, self.url)`` supplies a (url, domain, domain_id)
    triple whose parts are appended to self.links, self.domains and
    self.domain_ids respectively.

    Returns:
        The tuple (self.links, self.domains, self.domain_ids).
    '''
    self.domains = []
    self.links = []
    self.domain_ids = []

    hrefs = [anchor.get('href') for anchor in self.doc.find_all("a")]
    for href in hrefs:
        if not href or href == "/" or href.startswith("#"):
            continue
        # The earlier `if mailto: pass` followed by a separate if/else let
        # mailto hrefs fall through and be processed; skip both schemes.
        if href.startswith(('mailto', 'javascript')):
            continue
        link = Link(href)
        if not link.is_valid():
            continue
        cleaned_url, domain, domain_id = link.clean_url(href, self.url)
        self.domains.append(domain)
        self.links.append(cleaned_url)
        self.domain_ids.append(domain_id)
    return (self.links, self.domains, self.domain_ids)
def check_link(self, url, source_url):
    '''Return the url held by Link(url, source_url), or None if invalid.

    The value returned on success is the Link's own ``url`` attribute,
    not the ``url`` argument that was passed in.
    '''
    link = Link(url, source_url)
    return link.url if link.is_valid() else None