# Standard-library and third-party imports used by main(); the helpers
# mkdir_p, get_selenium_browser, get_encoded_data and get_filepath, and the
# USAGE_MESSAGE / REQUEST_HEADERS constants, are assumed to be defined
# elsewhere in the module.
import getopt
import os
import sys
import traceback

import requests


def main(argv):
    input_file = None
    output_dir = None
    failed_urls = []
    errors_encountered = 0
    try:
        opts, args = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError:
        print USAGE_MESSAGE
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-i":
            input_file = arg
        if opt == "-o":
            output_dir = arg
    if not input_file or not output_dir:
        print USAGE_MESSAGE
        sys.exit(2)
    if os.path.isdir(output_dir):
        print "Found directory: %s" % output_dir
    else:
        mkdir_p(output_dir)
        print "Created directory: %s" % output_dir
    with open(input_file) as f:
        urls = f.readlines()
    print "Found %d URLs" % len(urls)
    browser = get_selenium_browser()
    for url in urls:
        url = url.strip()
        if not url:
            continue
        try:
            print "\nProcessing URL: %s" % url
            print "Requesting URL with Python Requests: ", url
            get_response = requests.get(url,
                                        headers=REQUEST_HEADERS,
                                        timeout=60)
            content_type = get_response.headers.get('content-type')
            encoding = get_response.encoding
            page_source = get_response.text
            final_url = get_response.url
            if content_type and 'text/html' in content_type and "<body" not in page_source:
                print "No <body> tag found in page source. Requesting URL with Selenium: %s" % final_url
                try:
                    browser.get(final_url)
                    page_source = browser.page_source
                except Exception:
                    # Retry once if the first Selenium request fails.
                    print "First Selenium request failed. Trying one last time."
                    browser.get(final_url)
                    page_source = browser.page_source
                else:
                    # The request succeeded, but the rendered page may still
                    # lack a <body> tag; retry once more in that case.
                    if "<body" not in page_source:
                        print "No <body> tag found in page source. Requesting URL with Selenium one last time."
                        browser.get(final_url)
                        page_source = browser.page_source
                final_url = browser.current_url
            print "Found final URL: %s" % final_url
            encoded_data, encoding_used = get_encoded_data(
                page_source, encoding)
            filepath = get_filepath(final_url, encoding_used, output_dir)
            with open(filepath, 'w') as f:
                f.write(encoded_data)
            print "Wrote file: %s with encoding: %s" % (filepath,
                                                        encoding_used)
        except Exception:
            # Record the failure and continue with the remaining URLs.
            errors_encountered += 1
            failed_urls.append(url)
            traceback_info = traceback.format_exc()
            print "*** ERROR PROCESSING: %s ***\nTraceback: %s\n" % (
                url, traceback_info)

    print "\nOperational Errors: %d\n" % errors_encountered
    if failed_urls:
        print "The following %d URLs failed:" % len(failed_urls)
        for url in failed_urls:
            print url
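
The excerpt above does not show how main() is invoked. Assuming the script follows the usual getopt convention of passing everything after the program name, a typical entry point would be:

if __name__ == "__main__":
    main(sys.argv[1:])

The script would then be run as, for example, python fetch_pages.py -i urls.txt -o pages/ (the script name and paths here are hypothetical).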
Example #2
        if url_file:
            if os.path.isfile(url_file):
                print "Removing existing file: %s" % url_file
                os.remove(url_file)

    # Get URLs from config
    for d in urls_to_crawl:
        files_processed = 0
        files_written = 0
        errors_encountered = 0
        seed_url = d["url"]
        urls_to_visit = [seed_url]
        all_urls = [seed_url]
        follow_links_containing = d["follow_links_containing"]
        ignore_query_strings = d.get("ignore_query_strings", False)
        # Selenium browser
        browser = get_selenium_browser()
        # Optional regex filters from the config entry
        regex_filters = d.get("regex_filters")
        if regex_filters:
            using_regex_filters = True
            regex_filters = [re.compile(regex_filter) for regex_filter in regex_filters]
        else:
            using_regex_filters = False

        start_time = datetime.datetime.now()
        print "\nCurrent Time:  %s" % start_time
        crawl_url()
        end_time = datetime.datetime.now()
        print "\nStart:  %s\nFinish: %s\n" % (start_time, end_time)