def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1):
    """Crawl this website and return all emails found.

    website: the URL of the website to crawl
    max_depth: how many links deep to follow before stopping the crawl
    max_urls: how many URLs to download before stopping the crawl.
        If None then download without limit.
    max_emails: the maximum number of emails to extract before stopping
        the crawl. If None then extract all emails found in the crawl.
    """
    def score(link):
        """Return how valuable this link is for ordering crawling.

        The lower the score the better.
        """
        link = link.lower()
        total = 0
        if 'contact' in link:
            pass  # this page is top priority
        elif 'about' in link:
            total += 10
        elif 'help' in link:
            total += 20
        else:
            # generic page
            total += 100
        # bias towards shorter links
        total += len(link)
        return total

    domain = urlparse.urlparse(website).netloc
    scraped = adt.HashDict()  # URLs that have already been downloaded
    # NOTE(review): track queued URLs separately from scraped ones.
    # The original only checked links against `scraped` when appending,
    # so a URL linked from several pages was queued (and later downloaded)
    # multiple times before its first download marked it as scraped.
    queued = adt.HashDict()
    queued[website] = True
    c = CrawlerCallback(max_depth=max_depth)
    outstanding = [(0, website)]  # list of (score, URL) pairs to crawl
    emails = []
    while outstanding and (max_urls is None or len(scraped) < max_urls) \
            and (max_emails is None or len(emails) < max_emails):
        _, url = outstanding.pop(0)
        if url in scraped:
            continue  # defensive: never download the same URL twice
        scraped[url] = True
        html = self.get(url)
        if html:
            for email in alg.extract_emails(html):
                if email not in emails:
                    emails.append(email)
                    if len(emails) == max_emails:
                        break
            # queue the linked URLs from the same domain
            for link in c.crawl(self, url, html):
                if urlparse.urlparse(link).netloc == domain:
                    if link not in queued:
                        queued[link] = True
                        outstanding.append((score(link), link))
            # sort based on score to crawl most promising first
            outstanding.sort()
    return list(emails)
def get_emails(self, website, max_depth=1, max_urls=None, max_emails=None):
    """Crawl this website breadth-first and return all emails found.

    website: the URL of the website to crawl
    max_depth: how many links deep to follow before stopping the crawl
    max_urls: how many URLs to download before stopping the crawl.
        If None then download without limit.
    max_emails: the maximum number of emails to extract before stopping
        the crawl. If None then extract all emails found in the crawl.
    """
    scraped = adt.HashDict()  # URLs that have already been downloaded
    c = CrawlerCallback(max_depth=max_depth)
    outstanding = collections.deque([website])  # FIFO queue of URLs to crawl
    emails = []
    while outstanding and (max_urls is None or len(scraped) < max_urls) \
            and (max_emails is None or len(emails) < max_emails):
        url = outstanding.popleft()
        if url in scraped:
            continue  # already downloaded via another link path
        scraped[url] = True
        html = self.get(url, delay=1)
        if html:
            for email in alg.extract_emails(html):
                if email not in emails:
                    emails.append(email)
                    if len(emails) == max_emails:
                        break
            # queue linked URLs, skipping those already downloaded.
            # The original extended the queue unconditionally, so URLs
            # linked from many pages were downloaded repeatedly.
            outstanding.extend(link for link in c.crawl(self, url, html)
                               if link not in scraped)
    return list(emails)