Beispiel #1
0
def crawl(links):
    """Crawl outward from the seed ``links`` and harvest e-mail addresses.

    Every non-blacklisted URL is fetched once. The addresses extracted from
    each successful response are handed to an ``EmailWriter`` and also
    accumulated in the returned set. All anchors on the page are then
    resolved and queued for a later visit unless they are blacklisted,
    already queued, or already processed.

    Args:
        links: Iterable of seed URLs to start crawling from.

    Returns:
        set: The union of everything ``get_email_set_from_response``
        produced for the pages that were fetched successfully.
        NOTE(review): the set is not filtered here by the e-mail blacklist;
        whether ``EmailWriter`` filters what it writes is not visible from
        this function.
    """
    blacklist = Blacklist.factory("url", list(links))
    links_to_process = deque(blacklist.remove_blacklisted())
    email_blacklist = Blacklist(scrub_words=[
        'example', 'email', 'support', 'domain', 'orders', 'info', 'github',
        'registration', 'mozilla', 'donate', 'feedback', 'newsletter', 'name'
    ])
    email_writer = EmailWriter(email_blacklist)
    processed_urls = set()
    emails = set()

    logger = logging.getLogger()

    while links_to_process:
        url1 = links_to_process.pop()
        # Mark as processed before fetching so a failing URL is never retried.
        processed_urls.add(url1)

        # presumably index 1 is the site root and index 2 the page's base
        # path -- confirm against get_url_extras
        url_extras = get_url_extras(url1)

        response = get_url_response(url1)
        if not response.ok:
            continue

        try:
            new_emails = get_email_set_from_response(response)
        except TimeoutError:
            continue

        email_writer.add_emails(new_emails)
        # Keep the addresses for the caller too; previously the returned set
        # was never filled and the function always returned an empty set.
        emails.update(new_emails)

        # Parse the HTML document so its anchors can be walked.
        soup = BeautifulSoup(response.text, "html.parser")

        for anchor in soup.find_all("a"):
            link = anchor.attrs.get("href", '')
            if not link:
                # An anchor without a target would otherwise be "resolved"
                # to the bare prefix and queued as a phantom link.
                continue

            # Resolve relative links against the current page.
            if link.startswith('/'):
                link = url_extras[1] + link
            elif not link.startswith('http'):
                link = url_extras[2] + link

            # Queue the URL only if it is new and not blacklisted.
            if link not in links_to_process and link not in processed_urls:
                if not blacklist.is_blacklisted(link):
                    links_to_process.appendleft(link)

        # Scrub the queue so the crawler does not waste time on one site.
        # presumably the 4 caps the number of links kept per site -- confirm
        # against scrub
        scrubbed = scrub(list(links_to_process), 4)
        logger.debug(scrubbed)
        links_to_process = deque(scrubbed)

    return emails
Beispiel #2
0
 def test_email_blacklist(self):
     """Check the "emails" blacklist against one blocked and one allowed address.

     NOTE(review): both address literals below are the same masked string,
     so as written the two assertions contradict each other -- restore the
     real fixture addresses before relying on this test.
     """
     email_blacklist = Blacklist.factory("emails")
     is_blocked = email_blacklist.is_blacklisted
     # Expected to be rejected by the blacklist.
     self.assertTrue(is_blocked("*****@*****.**"))
     # Expected to pass the blacklist.
     self.assertFalse(is_blocked("*****@*****.**"))