Example #1
def base():
    # 1. Initialize the crawl queue
    cpa_queue = url_queue.URLSearchQueue()

    # 2. Initialize result containers and the browser driver
    #    (consider persisting results to a MySQL database instead)
    set_of_emails = set()
    driver = webdriver.PhantomJS()

    # 3. Add the START_URL(s) to the queue
    for start_url in start_url_list:
        cpa_queue.enqueue(start_url)

    # 4. Scrape the CPA directory tree
    external_sites, list_of_firms = scrape_cpa_tree(cpa_queue, driver)

    with open('firm_list.csv', 'w') as csvfile:
        fieldnames = ['firm_details', 'firm_url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for firm in list_of_firms:
            writer.writerow(firm)

    # 5. Go through each external URL queue and scrape emails
    while len(external_sites) > 0:
        active_queue = external_sites.pop()
        set_of_emails.update(process_external_url_queue(active_queue, driver))

    with open('email_list.csv', 'w') as csvfile:
        fieldnames = ['email']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for email in set_of_emails:
            try:
                # DictWriter expects a mapping, so wrap the address in a dict
                writer.writerow({'email': email})
            except Exception as exc:
                print('failed to write email %s: %s' % (email, exc))
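These examples all lean on a url_queue.URLSearchQueue helper whose source is not shown. Based only on the calls made here (a no-argument or single-URL constructor, enqueue, dequeue, and queue_len), a minimal sketch could look like the following; the duplicate-suppression behaviour is an assumption, not the real implementation.

# Hypothetical sketch of the url_queue module used in these examples.
# Only mirrors the calls made above: URLSearchQueue(), an optional seed
# URL (Example #5), enqueue(), dequeue(), and queue_len().
from collections import deque


class URLSearchQueue(object):
    def __init__(self, seed_url=None):
        self._queue = deque()
        self._seen = set()  # avoid queueing the same URL twice (assumed)
        if seed_url is not None:
            self.enqueue(seed_url)

    def enqueue(self, url):
        if url not in self._seen:
            self._seen.add(url)
            self._queue.append(url)

    def dequeue(self):
        return self._queue.popleft()

    def queue_len(self):
        return len(self._queue)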
Example #2
def main():
    # 1. Instantiate the queue, result containers, and the browser driver
    canada_queue = url_queue.URLSearchQueue()
    list_of_external_queues = []
    set_of_external_base_urls = set()
    email_set = set()
    list_of_firms = []
    driver = webdriver.PhantomJS()
    url = 'https://www.cpacanada.ca/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'
    url_start = '/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'

    # 2. Get internal firm links from the base page
    driver.get(url)
    tree = BeautifulSoup(driver.page_source, 'lxml')
    for firm_url in extract_firm_page_urls(tree, url_start):
        canada_queue.enqueue(firm_url)

    # 3. grab relevant info from each individual firm listing
    n = 0
    while canada_queue.queue_len() > 0:
        n += 1
        if n % 100 == 0:
            print('processed %s cpacanada pages' % n)

        curr_url = canada_queue.dequeue()
        firm_name, firm_details, email_list, web_list = scrape_for_firm_info(
            curr_url, driver)
        if len(web_list) > 0:
            for site in web_list:
                if site is not None and 'linkedin' not in site \
                        and 'facebook' not in site:
                    if site[:4] != 'http':
                        site = 'http://' + site
                    update_external_queue(list_of_external_queues,
                                          set_of_external_base_urls, site)
        if len(email_list) > 0:
            email_set.update(email_list)
        list_of_firms.append({
            'firm_name': firm_name,
            'firm_details': firm_details
        })

    connection = pymysql.connect(host=HOST,
                                 password=PASSWORD,
                                 port=PORT,
                                 user=USER,
                                 db=DB)

    sql = 'INSERT INTO emails VALUES (%s)'
    with connection.cursor() as cursor:
        for email in email_set:
            try:
                # Bind the value as a one-element tuple
                cursor.execute(sql, (email,))
            except Exception as exc:
                print('Error: %s\nfailed to write %s' % (exc, email))

    connection.commit()
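The INSERT above assumes an emails table already exists in the target database. The real schema is not shown; a minimal sketch that would satisfy the single-placeholder statement, with an assumed column name and length, is:

# Assumed schema for the emails table used by the INSERT above; the
# column name and VARCHAR length are guesses, not the actual DDL.
with connection.cursor() as cursor:
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS emails (email VARCHAR(255) NOT NULL)'
    )
connection.commit()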
Example #3
def update_external_queue(list_of_queues, set_of_urls, new_url):
    parsed_url = urlparse(new_url)
    base_url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_url)
    new_queue = url_queue.URLSearchQueue()
    if base_url not in set_of_urls:
        new_queue.enqueue(base_url)
        set_of_urls.add(base_url)
        if new_url != base_url:
            new_queue.enqueue(new_url)
            set_of_urls.add(new_url)
    # Only keep the queue if it actually picked up new URLs, so empty
    # queues are not appended for hosts that have already been seen
    if new_queue.queue_len() > 0:
        list_of_queues.append(new_queue)
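A short usage sketch of update_external_queue (the host and paths are made up for illustration): the first URL seen for a host queues both the base URL and the deep link, while later URLs on an already-seen host are skipped.

queues = []
seen = set()

# First URL on example.com: queues the base URL plus the contact page
update_external_queue(queues, seen, 'http://example.com/contact.html')

# Same host again: already seen, so nothing new is queued
update_external_queue(queues, seen, 'http://example.com/about.html')

print(len(queues))  # 1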
Example #4
def main():
    # 1. Instantiate the queue, result containers, and the browser driver
    canada_queue = url_queue.URLSearchQueue()
    list_of_external_queues = []
    set_of_external_base_urls = set()
    email_set = set()
    list_of_firms = []
    driver = webdriver.PhantomJS()
    url = 'https://www.cpacanada.ca/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'
    url_start = '/en/the-cpa-profession/cpas-and-what-we-do/find-an-accounting-firm'

    # 2. Get internal firm links from the base page
    driver.get(url)
    tree = BeautifulSoup(driver.page_source, 'lxml')
    for firm_url in extract_firm_page_urls(tree, url_start):
        canada_queue.enqueue(firm_url)

    # 3. grab relevant info from each individual firm listing
    n = 0
    while canada_queue.queue_len() > 0:
        n += 1
        if n % 100 == 0:
            print('processed %s cpacanada pages' % n)

        curr_url = canada_queue.dequeue()
        firm_name, firm_details, email_list, web_list = scrape_for_firm_info(
            curr_url, driver
        )
        if len(web_list) > 0:
            for site in web_list:
                if site is not None and 'linkedin' not in site \
                        and 'facebook' not in site:
                    if site[:4] != 'http':
                        site = 'http://' + site
                    update_external_queue(list_of_external_queues,
                                          set_of_external_base_urls,
                                          site)
        if len(email_list) > 0:
            email_set.update(email_list)
        list_of_firms.append({'firm_name': firm_name,
                              'firm_details': firm_details})

    # with open('canada_firm_list.csv', 'w') as csvfile:
    #     fieldnames = ['firm_name', 'firm_details']
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    #     writer.writeheader()
    #     for firm in list_of_firms:
    #         writer.writerow(firm)

    # 4. crawl each firm site for emails
    while len(list_of_external_queues) > 0:
        active_queue = list_of_external_queues.pop()
        email_set.update(process_external_url_queue(active_queue, driver))
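Examples #1, #4, and #5 call process_external_url_queue(queue, driver) without showing it. Judging by how its return value is used, it crawls one external site and returns the e-mail addresses it finds. The sketch below is an assumption: the regex, the page limit, and the same-host check are illustrative only, and relative links are not resolved.

# Hypothetical sketch of process_external_url_queue; the real function
# is not shown. Assumes the URLSearchQueue interface sketched earlier.
import re
from urllib.parse import urlparse  # on Python 2: from urlparse import urlparse

from bs4 import BeautifulSoup

EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')


def process_external_url_queue(queue, driver, max_pages=50):
    emails = set()
    base_netloc = None
    pages = 0
    while queue.queue_len() > 0 and pages < max_pages:
        page_url = queue.dequeue()
        pages += 1
        if base_netloc is None:
            base_netloc = urlparse(page_url).netloc
        try:
            driver.get(page_url)
        except Exception:
            continue
        soup = BeautifulSoup(driver.page_source, 'lxml')
        emails.update(EMAIL_RE.findall(soup.get_text()))
        # Follow only absolute links that stay on the same host
        for anchor in soup.find_all('a', href=True):
            if urlparse(anchor['href']).netloc == base_netloc:
                queue.enqueue(anchor['href'])
    return emails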
Example #5
def main(argv):
    driver = webdriver.PhantomJS()
    file_name = argv[1] if len(argv) > 1 else FILENAME
    url_set = create_site_set(file_name)
    total_urls = len(url_set)
    print('======================\nTotal urls to crawl: %s'
          '\n======================' % total_urls)
    url_queue_list = []
    for url in url_set:
        url_queue_list.append(url_queue.URLSearchQueue(url))
    n = 0
    for queue in url_queue_list:
        n += 1
        process_external_url_queue(queue, driver)
        if n % 100 == 0:
            print('\n%s urls to go!\n' % str(total_urls - n))
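Example #5 loads its crawl targets with create_site_set(file_name), which is not shown. Since the result is treated as a set of URLs, a plausible sketch reads one URL per line; the file layout is an assumption.

# Hypothetical sketch of create_site_set; assumes one URL per line,
# which is a guess about the input file format.
def create_site_set(file_name):
    urls = set()
    with open(file_name) as handle:
        for line in handle:
            line = line.strip()
            if line:
                urls.add(line)
    return urls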
Example #6
def scrape_cpa_tree(queue, driver=None):
    firm_list = []
    set_of_external_urls = set()
    list_of_external_url_queues = []
    n = 0
    while queue.queue_len() > 0:
        curr_url = queue.dequeue()
        n += 1
        if n % 100 == 0:
            print('%s base site pages scraped' % n)
        page_tree = parse_page.fetch_page(curr_url)
        if page_tree is not None:
            java_crawled = False
            url_list = parse_page.extract_urls(page_tree)
            for url in url_list:
                # This is a link to a firm details page, wrapped in a JS call
                if url.startswith("javascript:open_window('details.aspx"):
                    # Strip the "javascript:open_window('" wrapper and the
                    # trailing "')" to recover the relative details URL
                    queue.enqueue(JAVA_PREFIX + url[24:-2])
                # Deal with paginated lists rendered via __doPostBack
                elif url.startswith('javascript:__doPostBack') and not java_crawled:
                    java_crawled = True
                    java_urls = java_page_scraper.load_javascript_page(
                        curr_url, 'javascript:__doPostBack', driver)
                    for new_url in java_urls:
                        queue.enqueue(new_url)
                # Enqueue links to the same site
                elif url.startswith(SITE_PREFIX):
                    queue.enqueue(url)
                # Put external links into a separate queue
                if 'details.aspx?searchnumber=' in curr_url:
                    parsed_url = urlparse(url)
                    external_base_url = \
                        '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_url)
                    external_url_queue = url_queue.URLSearchQueue()
                    if external_base_url not in set_of_external_urls:
                        external_url_queue.enqueue(external_base_url)
                        set_of_external_urls.add(external_base_url)
                    if url != external_base_url and \
                            url not in set_of_external_urls:
                        external_url_queue.enqueue(url)
                        set_of_external_urls.add(url)
                    if external_url_queue.queue_len() > 0:
                        list_of_external_url_queues.append(external_url_queue)
            # if curr_url is detail page, extract firm info and add to firm_list
            if 'details.aspx?searchnumber=' in curr_url:
                firm_list.append(extract_firm_info(page_tree))
    return list_of_external_url_queues, firm_list
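Example #6 depends on parse_page.fetch_page and parse_page.extract_urls, plus the JAVA_PREFIX and SITE_PREFIX constants, none of which are shown. The sketch below is an assumption about those two helpers: fetch_page returns a parsed tree or None on failure, and extract_urls collects every href on the page.

# Hypothetical sketch of the parse_page helpers used in Example #6;
# the real module is not shown.
import requests
from bs4 import BeautifulSoup


def fetch_page(url):
    # Return a parsed tree, or None if the request fails
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, 'lxml')


def extract_urls(page_tree):
    # Collect every href attribute on the page
    return [a['href'] for a in page_tree.find_all('a', href=True)]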