def main(argv):
    """Download every URL listed in an input file and save each page to disk.

    Each URL is first fetched with Requests. When the response is HTML but
    its text contains no ``<body`` tag, the page is requested again through
    the Selenium browser (with one retry) and the browser's page source and
    current URL are used instead. A failure on one URL is reported and
    recorded; the remaining URLs are still processed. A summary of failures
    is printed at the end.

    Args:
        argv: Argument list handed to ``getopt.getopt``. Both
            ``-i <input_file>`` (one URL per line) and ``-o <output_dir>``
            are required.

    Exits with status 2, after printing USAGE_MESSAGE, when the options
    cannot be parsed or either required option is missing.
    """
    input_file = None
    output_dir = None
    failed_urls = []
    errors_encountered = 0

    try:
        opts, _ = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError:
        print USAGE_MESSAGE
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-i":
            input_file = arg
        elif opt == "-o":
            output_dir = arg

    if not input_file or not output_dir:
        print USAGE_MESSAGE
        sys.exit(2)

    if os.path.isdir(output_dir):
        print "Found directory: %s" % output_dir
    else:
        mkdir_p(output_dir)
        print "Created directory: %s" % output_dir

    with open(input_file) as f:
        urls = f.readlines()
    print "Found %d URLs" % len(urls)

    browser = get_selenium_browser()
    try:
        for url in urls:
            url = url.strip()
            if not url:
                continue
            try:
                print "\nProcessing URL: %s" % url
                print "Requesting URL with Python Requests: ", url
                get_response = requests.get(url, headers=REQUEST_HEADERS,
                                            timeout=60)
                # A response may carry no Content-Type header; fall back to
                # '' so the membership test below cannot hit None.
                content_type = get_response.headers.get('content-type') or ''
                encoding = get_response.encoding
                page_source = get_response.text
                final_url = get_response.url

                if 'text/html' in content_type and "<body" not in page_source:
                    print "No <body> tag found in page source. Requesting URL with Selenium: ", final_url
                    try:
                        browser.get(final_url)
                        page_source = browser.page_source
                    except Exception:
                        # A second failure propagates to the per-URL handler.
                        print "First Selenium request failed. Trying one last time."
                        browser.get(final_url)
                        page_source = browser.page_source
                    else:
                        # The request succeeded but the page still has no
                        # body: try exactly once more.
                        if "<body" not in page_source:
                            print "No <body> tag found in page source. Requesting URL with Selenium one last time."
                            browser.get(final_url)
                            page_source = browser.page_source
                    # Use the URL the browser actually ended up on.
                    final_url = browser.current_url

                print "Found final URL: %s" % final_url
                encoded_data, encoding_used = get_encoded_data(
                    page_source, encoding)
                filepath = get_filepath(final_url, encoding_used, output_dir)
                # Binary mode: the data is already encoded, so no newline
                # translation must be applied (matters on Windows).
                with open(filepath, 'wb') as f:
                    f.write(encoded_data)
                print "Wrote file: %s with encoding: %s" % (filepath,
                                                            encoding_used)
            except Exception:
                # Catch Exception rather than everything so that Ctrl-C and
                # sys.exit() still stop the run.
                errors_encountered += 1
                failed_urls.append(url)
                print "*** ERROR PROCESSING: %s ***\nTraceback: %s\n" % (
                    url, traceback.format_exc())
    finally:
        # Assumes get_selenium_browser() returns a Selenium WebDriver;
        # quit() ends the browser session so no process is left behind.
        browser.quit()

    print "\nOperational Errors: %d\n" % errors_encountered
    if failed_urls:
        print "The following %d URLs failed:" % len(failed_urls)
        for url in failed_urls:
            print url
# Start from a clean URL list file when one is configured.
if url_file and os.path.isfile(url_file):
    print "Removing existing file: %s" % url_file
    os.remove(url_file)

# Run one crawl per entry in the config. crawl_url() is called without
# arguments, so it presumably reads the names assigned below as
# module-level state -- keep these names unchanged.
for d in urls_to_crawl:
    # Per-crawl counters.
    files_processed = 0
    files_written = 0
    errors_encountered = 0

    # The crawl starts from the seed URL, which also counts as seen.
    seed_url = d["url"]
    urls_to_visit = [seed_url]
    all_urls = [seed_url]
    follow_links_containing = d["follow_links_containing"]
    ignore_query_strings = d.get("ignore_query_strings", False)

    # Selenium browser
    browser = get_selenium_browser()

    # Regex filters are optional; compile them once up front when present.
    regex_filters = d.get("regex_filters")
    using_regex_filters = bool(regex_filters)
    if using_regex_filters:
        regex_filters = [
            re.compile(regex_filter) for regex_filter in regex_filters
        ]

    start_time = datetime.datetime.now()
    print "\nCurrent Time: %s" % start_time
    crawl_url()
    end_time = datetime.datetime.now()
    print "\nStart: %s\nFinish: %s\n" % (start_time, end_time)
def main(argv): input_file = None output_dir = None failed_urls = [] errors_encountered = 0 try: opts, args = getopt.getopt(argv, "i:o:") except getopt.GetoptError: print USAGE_MESSAGE sys.exit(2) for opt, arg in opts: if opt == "-i": input_file = arg if opt == "-o": output_dir = arg if not input_file or not output_dir: print USAGE_MESSAGE sys.exit(2) if os.path.isdir(output_dir): print "Found directory: %s" % output_dir else: mkdir_p(output_dir) print "Created directory: %s" % output_dir with open(input_file) as f: urls = f.readlines() print "Found %d URLs" % len(urls) browser = get_selenium_browser() for url in urls: url = url.strip() if not url: continue try: print "\nProcessing URL: %s" % url print "Requesting URL with Python Requests: ", url get_response = requests.get(url, headers=REQUEST_HEADERS, timeout=60) content_type = get_response.headers.get("content-type") encoding = get_response.encoding page_source = get_response.text final_url = get_response.url if "text/html" in content_type and not "<body" in page_source: print "No <body> tag found in page source. Requesting URL with Selenium: ", final_url try: browser.get(final_url) page_source = browser.page_source except: print "First Selenium request failed. Trying one last time." browser.get(final_url) page_source = browser.page_source else: if "text/html" in content_type and not "<body" in page_source: print "No <body> tag found in page source. Requesting URL with Selenium one last time." 
browser.get(final_url) page_source = browser.page_source final_url = browser.current_url print "Found final URL: %s" % final_url encoded_data, encoding_used = get_encoded_data(page_source, encoding) filepath = get_filepath(final_url, encoding_used, output_dir) with open(filepath, "w") as f: f.write(encoded_data) print "Wrote file: %s with encoding: %s" % (filepath, encoding_used) except: errors_encountered += 1 failed_urls.append(url) try: traceback_info = "\n".join(traceback.format_exception(*(sys.exc_info()))) except: traceback_info = "" print "*** ERROR PROCESSING: %s ***\nTraceback: %s\n" % (url, traceback_info) print "\nOperational Errors: %d\n" % errors_encountered if failed_urls: print "The following %d URLs failed:" % len(failed_urls) for url in failed_urls: print url