Example #1
    def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1):
        """Crawl this website and return all emails found

        website:
            the URL of website to crawl
        max_depth:
            how many links deep to follow before stop crawl
        max_urls:
            how many URL's to download before stop crawl
        max_emails:
            The maximum number of emails to extract before stop crawl.
            If None then extract all emails found in crawl.
        """
        def score(link):
            """Return how valuable this link is for ordering crawling
            The lower the better"""
            link = link.lower()
            total = 0
            if 'contact' in link:
                pass # this page is top priority
            elif 'about' in link:
                total += 10
            elif 'help' in link:
                total += 20
            else:
                # generic page
                total += 100
            # bias towards shorter links
            total += len(link)
            return total

        domain = urlparse.urlparse(website).netloc
        scraped = adt.HashDict()
        c = CrawlerCallback(max_depth=max_depth)
        outstanding = [(0, website)] # queue of (score, URL) tuples to crawl
        emails = []
        while outstanding and (max_urls is None or len(scraped) < max_urls) \
                          and (max_emails is None or len(emails) < max_emails):
            _, url = outstanding.pop(0)
            scraped[url] = True
            html = self.get(url)
            if html:
                for email in alg.extract_emails(html):
                    if email not in emails:
                        emails.append(email)
                        if len(emails) == max_emails:
                            break
                # queue the links found on this page for later crawling
                for link in c.crawl(self, url, html):
                    if urlparse.urlparse(link).netloc == domain:
                        if link not in scraped:
                            outstanding.append((score(link), link))
                # sort based on score to crawl most promising first
                outstanding.sort()
        return list(emails)
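
A minimal usage sketch for the method above, assuming it belongs to the webscraping library's download.Download class (which the self.get, adt.HashDict, alg.extract_emails and CrawlerCallback references suggest); the URL and limits are illustrative.

    from webscraping import download

    D = download.Download()
    # crawl at most 20 same-domain pages, one link deep,
    # and stop as soon as 5 emails have been collected
    emails = D.get_emails('http://example.com', max_depth=1,
                          max_urls=20, max_emails=5)
    print(emails)

Re-sorting the outstanding list after every page keeps the code simple but costs O(n log n) per page; a heapq-based priority queue would give the same lowest-score-first ordering with O(log n) pushes and pops.
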
Example #2
 def get_emails(self, website, max_depth=1, max_urls=None, max_emails=None):
     """Crawl this website and return all emails found
     """
     scraped = adt.HashDict()
     c = CrawlerCallback(max_depth=max_depth)
     outstanding = collections.deque([website])
     emails = []
     while outstanding and (max_urls is None or len(scraped) < max_urls) \
                       and (max_emails is None or len(emails) < max_emails):
         url = outstanding.popleft()
         scraped[url] = True
         html = self.get(url, delay=1)
         if html:
             for email in alg.extract_emails(html):
                 if email not in emails:
                     emails.append(email)
                     if len(emails) == max_emails:
                         break
             outstanding.extend(c.crawl(self, url, html))
     return list(emails)
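
The same assumptions apply to this simpler variant, which visits links in first-in-first-out order, does not restrict the crawl to the starting domain, and rate-limits requests via delay=1. A usage sketch; note that max_urls and max_emails default to None here, so only max_depth bounds the crawl unless limits are passed explicitly.

    from webscraping import download

    D = download.Download()
    # bound the crawl explicitly, since max_urls and max_emails default to None
    emails = D.get_emails('http://example.com', max_depth=2,
                          max_urls=50, max_emails=10)
    print(emails)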