def export_domains(site):
    db_export = CrawlerDb(site)
    db_export.connect()
    logger.info("=" * 40)
    logger.info("Processing...")
    domains = db_export.get_all_domains()
    logger.info("There are %d domains" % len(domains))
    with open(DOMAINS_FILENAME, "w+") as f:
        f.writelines("\n".join(domains))
    logger.info("All domains saved to ./data/domains.csv")
    logger.info("=" * 40)
def export_emails(site):
    # Set up the database
    db_export = CrawlerDb(site)
    db_export.connect()
    logger.info("=" * 40)
    logger.info("Processing...")
    emails = db_export.get_all_emails()
    logger.info("There are %d emails" % len(emails))
    with open(EMAILS_FILENAME, "w+") as f:
        f.writelines("\n".join(emails))
    logger.info("All emails saved to ./data/emails.csv")
    logger.info("=" * 40)
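# A minimal usage sketch (not part of the original module): a hypothetical
# main() that runs a crawl and then exports the results with the two helpers
# above. It assumes the crawl(site, keywords, ...) variant defined below; the
# site and keyword values are illustrative only.
def main():
    site = 'www.google.com'
    crawl(site, 'singapore web development')   # fill the database with crawled emails
    export_emails(site)                        # write data/emails.csv
    export_domains(site)                       # write data/domains.csv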
def crawl(site, keywords, output_ui: OutputUIInterface = None):
    """
    This method will
    1) Google the keywords, and extract MAX_SEARCH_RESULTS
    2) For every result (aka website), crawl the website 2 levels deep.
        That is the homepage (level 1) and all its links (level 2).
        But if level 1 has the email, then skip going to level 2.
    3) Store the html in /data/html/ and update the database of the crawled emails

    crawl(keywords):
        Extract Google search results and put all in database
        Process each search result, the webpage:
            Crawl webpage level 1, the homepage
            Crawl webpage level 2, a link away from the homepage
            Update all crawled pages in database, with has_crawled = True immediately
            Store the HTML
    """
    # Set up the database
    global db
    db = CrawlerDb(site)
    db.connect()

    logger.info("-" * 40)
    # logger.info("Keywords to Google for: %s" % keywords.decode('utf-8'))
    logger.info("Keywords to Google for: %s" % keywords)
    logger.info("-" * 40)

    # Step 1: Crawl Google Page
    # eg http://www.google.com/search?q=singapore+web+development&start=0
    # Next page: https://www.google.com/search?q=singapore+web+development&start=10
    # Google search results are paged with 10 urls each. There are also adurls
    for page_index in range(0, MAX_SEARCH_RESULTS, 10):
        query = {'q': keywords}
        url = 'http://%s/search?' % site
        url = url + urllib.parse.urlencode(query) + '&start=' + str(page_index)
        # query = {'wd': keywords}
        # url = 'http://www.baidu.com/s?' + urllib.parse.urlencode(query) + '&pn=' + str(page_index)
        try:
            data = retrieve_html(url)
            # print("data: \n%s" % data)
            for url in google_url_regex.findall(data):
                db.enqueue(str(url))
            for url in google_adurl_regex.findall(data):
                db.enqueue(str(url))
        except Exception as e:
            logger.error(e)
        # for url in baidu_url_regex.findall(data):
        #     db.enqueue(str(url))
        # for url in baidu_adurl_regex.findall(data):
        #     db.enqueue(str(url))

    # Step 2: Crawl each of the search results
    # We search till level 2 deep
    while True:
        # Dequeue an uncrawled webpage from db
        uncrawled = db.dequeue()
        if uncrawled == False:
            break
        email_set = find_emails_2_level_deep(uncrawled.url, output_ui)
        if len(email_set) > 0:
            db.crawled(uncrawled, ",".join(list(email_set)))
            if output_ui:
                output_ui.append(list(email_set))
        else:
            db.crawled(uncrawled, None)
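# find_emails_2_level_deep() is called above but not defined in this section.
# The sketch below is only an assumption of what it does, based on the crawl()
# docstring: scrape emails from the homepage (level 1) and, only if none are
# found, follow every homepage link one level down (level 2). It assumes
# retrieve_html(), email_regex and url_regex behave as they are used elsewhere
# in this module; it is not the original implementation.
def find_emails_2_level_deep(url, output_ui=None):
    emails = set()
    try:
        html = retrieve_html(url)
    except Exception as e:
        logger.error(e)
        return emails
    emails.update(email_regex.findall(html))      # level 1: the homepage
    if emails:                                    # level 1 already has emails, skip level 2
        return emails
    for link in url_regex.findall(html):          # level 2: every link on the homepage
        try:
            emails.update(email_regex.findall(retrieve_html(link)))
        except Exception as e:
            logger.error(e)
    return emails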
import re

google_adurl_regex = re.compile(r'adurl=(.*?)"')
google_url_regex = re.compile(r'url\?q=(.*?)&sa=')
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
url_regex = re.compile(r'<a\s.*?href=[\'"](.*?)[\'"].*?>')
# The url_regex below will run into 'Catastrophic Backtracking'!
# http://stackoverflow.com/questions/8010005/python-re-infinite-execution
# url_regex = re.compile(r'<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

# Maximum number of search results to start the crawl
MAX_SEARCH_RESULTS = 150

EMAILS_FILENAME = 'data/emails.csv'
DOMAINS_FILENAME = 'data/domains.csv'

# Set up the database
db = CrawlerDb()
db.connect()


def crawl(keywords):
    """
    This method will
    1) Google the keywords, and extract MAX_SEARCH_RESULTS
    2) For every result (aka website), crawl the website 2 levels deep.
        That is the homepage (level 1) and all its links (level 2).
        But if level 1 has the email, then skip going to level 2.
    3) Store the html in /data/html/ and update the database of the crawled emails

    crawl(keywords):
        Extract Google search results and put all in database
    """
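# retrieve_html() is used by crawl() but its definition is not part of this
# section. Below is a minimal sketch of a stand-in, assuming it simply fetches
# a URL and returns the response body as text; the original may also cache the
# HTML under /data/html/ as the docstring suggests. The User-Agent header and
# timeout are illustrative choices, not taken from the original code.
import urllib.request

def retrieve_html(url, timeout=10):
    request = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0'}     # many sites reject the default Python UA
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='ignore')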
def crawl(keywords, name):
    """
    This method will
    1) Google the keywords, and extract MAX_SEARCH_RESULTS
    2) For every result (aka website), crawl the website 3 levels deep.
        That is the homepage (level 1), all its links (level 2) and all its links' links (level 3).
    3) Store the html in /data/html/ and update the database of the crawled emails

    crawl(keywords):
        Extract Google search results and put all in database
        Process each search result, the webpage:
            Crawl webpage level 1, the homepage
            Crawl webpage level 2, a link away from the homepage
            Crawl webpage level 3, two links away from the homepage
            Update all crawled pages in database, with has_crawled = True immediately
            Store the HTML
    """
    if internet_on() == False:
        print("*******************NO INTERNET CONNECTION*******************")
        sys.exit(0)

    db = CrawlerDb()
    db.connect()

    logger.info("-" * 40)
    logger.info("Keywords to Google for: %s" % keywords)
    logger.info("-" * 40)

    # Step 1: Crawl Google Page
    # eg http://www.google.com/search?q=singapore+web+development&start=0
    # Next page: https://www.google.com/search?q=singapore+web+development&start=10
    # Google search results are paged with 10 urls each. There are also adurls
    for page_index in range(0, MAX_SEARCH_RESULTS, 10):
        query = {'q': keywords}
        url = 'http://www.google.com/search?' + urllib.parse.urlencode(query) + '&start=' + str(page_index)
        data = retrieve_html(url)
        # print("data: \n%s" % data)
        for url in google_url_regex.findall(data):
            db.enqueue(url)
        break

    # Step 2: Crawl each of the search results
    # We search till level 3 deep
    while True:
        # Dequeue an uncrawled webpage from db
        uncrawled = db.dequeue()
        if uncrawled == False:
            break
        email_set = find_emails_3_level_deep(uncrawled.url, db)
        if len(email_set) > 0:
            db.crawled(uncrawled, ",".join(list(email_set)))
        else:
            db.crawled(uncrawled, None)

    write(name, db)
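# internet_on() is called at the top of crawl() above but not defined in this
# section. A minimal sketch of a stand-in connectivity check, assuming it only
# needs to report whether an outbound HTTP request succeeds; the probe URL and
# timeout are illustrative choices, not taken from the original code.
import urllib.request

def internet_on(timeout=5):
    try:
        urllib.request.urlopen('http://www.google.com', timeout=timeout)
        return True
    except Exception:
        return False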