def crawl_url():
    global errors_encountered, files_written, files_processed

    print "\n* NEW CRAWLING SESSION FOR CONFIG URL: %s *\n" % seed_url

    while len(urls_to_visit) > 0:
        current_url = urls_to_visit.pop(0)
        try:
            time.sleep(request_delay)

            page_source = None
            met_mimetype_criteria = False
            met_file_extension_criteria = False
            write_file = False

            # Log the URL to a file if a file name was provided
            if url_file:
                with open(url_file, "a") as fw:
                    fw.write(current_url + os.linesep)

            print "\nProcessing URL: %s\n" % current_url

            # Look for a valid HEAD response from the URL
            print "HEAD Request of URL: ", current_url
            head_response = requests.head(current_url, allow_redirects=True,
                                          headers=REQUEST_HEADERS, timeout=60)
            if head_response.status_code != requests.codes.ok:
                print "Received an invalid HEAD response for URL: ", current_url
            else:
                # Default to '' so the substring checks below cannot fail on None
                content_type = head_response.headers.get('content-type', '')
                encoding = head_response.encoding
                final_url = head_response.url

                # If we found an HTML file, grab all the links
                if 'text/html' in content_type:
                    print "Requesting URL with Python Requests: ", current_url
                    get_response = requests.get(current_url, headers=REQUEST_HEADERS,
                                                timeout=60)
                    content_type = get_response.headers.get('content-type', '')
                    encoding = get_response.encoding
                    page_source = get_response.text
                    final_url = get_response.url

                    # A page without a <body> tag is assumed to need JavaScript,
                    # so fall back to Selenium, retrying once on failure
                    if 'text/html' in content_type and "<body" not in page_source:
                        print "No <body> tag found in page source. Requesting URL with Selenium: ", final_url
                        try:
                            browser.get(final_url)
                            page_source = browser.page_source
                        except Exception:
                            print "First Selenium request failed. Trying one last time."
                            browser.get(final_url)
                            page_source = browser.page_source
                        else:
                            if 'text/html' in content_type and "<body" not in page_source:
                                print "No <body> tag found in page source. Requesting URL with Selenium one last time."
                                browser.get(final_url)
                                page_source = browser.page_source
                        final_url = browser.current_url

                    add_new_urls(final_url, page_source)

                # Check if we should write files with this mimetype or extension
                for mimetype in mimetypes_list:
                    if mimetype in content_type:
                        met_mimetype_criteria = True

                if not met_mimetype_criteria:
                    url_parsed = urlparse.urlsplit(final_url)
                    url_path = url_parsed.path.strip()
                    for file_extension in file_extensions_list:
                        if url_path.endswith(file_extension):
                            met_file_extension_criteria = True

                # Apply any regex restrictions, but only to URLs that already
                # passed the mimetype or extension tests
                if met_mimetype_criteria or met_file_extension_criteria:
                    if not using_regex_filters:
                        write_file = True
                    else:
                        for regex_filter in regex_filters:
                            if regex_filter.search(final_url):
                                write_file = True
                                break

            # Write the file if it passed the filters above
            if write_file:
                print "Need to write file"
                # The body may not have been fetched yet (non-HTML mimetypes)
                if not page_source:
                    print "Requesting URL with Python Requests: ", final_url
                    get_response = requests.get(final_url, headers=REQUEST_HEADERS,
                                                timeout=60)
                    encoding = get_response.encoding
                    page_source = get_response.text
                    final_url = get_response.url

                encoded_data, encoding_used = get_encoded_data(page_source, encoding)
                filepath = get_filepath(final_url, encoding_used, output_dir)
                with open(filepath, 'w') as f:
                    f.write(encoded_data)
                print "Wrote file: %s with encoding: %s" % (filepath, encoding_used)
                files_written += 1

            files_processed += 1
            print "Files Found: %d Processed: %d Remaining: %d Written: %d Operational Errors: %d" % (
                len(all_urls), files_processed, len(urls_to_visit),
                files_written, errors_encountered)

        except Exception:
            errors_encountered += 1
            try:
                traceback_info = '\n'.join(traceback.format_exception(*sys.exc_info()))
            except Exception:
                traceback_info = ''
            print "*** ERROR PROCESSING: %s ***\nTraceback: %s\n" % (
                current_url, traceback_info)
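# A minimal sketch (assumption) of the regex-filter globals crawl_url() relies
# on; the real values come from the crawler config elsewhere in this repo.
# Each entry of regex_filters must be a compiled pattern, since crawl_url()
# calls .search() on it. The pattern strings here are placeholders.
import re

regex_filter_strings = [r"\.pdf$", r"/reports/"]  # hypothetical config values
regex_filters = [re.compile(pattern) for pattern in regex_filter_strings]
using_regex_filters = len(regex_filters) > 0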
def main(argv):
    input_file = None
    output_dir = None
    failed_urls = []
    errors_encountered = 0

    try:
        opts, args = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError:
        print USAGE_MESSAGE
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-i":
            input_file = arg
        if opt == "-o":
            output_dir = arg
    if not input_file or not output_dir:
        print USAGE_MESSAGE
        sys.exit(2)

    if os.path.isdir(output_dir):
        print "Found directory: %s" % output_dir
    else:
        mkdir_p(output_dir)
        print "Created directory: %s" % output_dir

    with open(input_file) as f:
        urls = f.readlines()
    print "Found %d URLs" % len(urls)

    browser = get_selenium_browser()
    for url in urls:
        url = url.strip()
        if not url:
            continue
        try:
            print "\nProcessing URL: %s" % url
            print "Requesting URL with Python Requests: ", url
            get_response = requests.get(url, headers=REQUEST_HEADERS, timeout=60)
            # Default to '' so the substring checks below cannot fail on None
            content_type = get_response.headers.get('content-type', '')
            encoding = get_response.encoding
            page_source = get_response.text
            final_url = get_response.url

            # A page without a <body> tag is assumed to need JavaScript,
            # so fall back to Selenium, retrying once on failure
            if 'text/html' in content_type and "<body" not in page_source:
                print "No <body> tag found in page source. Requesting URL with Selenium: ", final_url
                try:
                    browser.get(final_url)
                    page_source = browser.page_source
                except Exception:
                    print "First Selenium request failed. Trying one last time."
                    browser.get(final_url)
                    page_source = browser.page_source
                else:
                    if 'text/html' in content_type and "<body" not in page_source:
                        print "No <body> tag found in page source. Requesting URL with Selenium one last time."
                        browser.get(final_url)
                        page_source = browser.page_source
                final_url = browser.current_url

            print "Found final URL: %s" % final_url
            encoded_data, encoding_used = get_encoded_data(page_source, encoding)
            filepath = get_filepath(final_url, encoding_used, output_dir)
            with open(filepath, 'w') as f:
                f.write(encoded_data)
            print "Wrote file: %s with encoding: %s" % (filepath, encoding_used)

        except Exception:
            errors_encountered += 1
            failed_urls.append(url)
            try:
                traceback_info = '\n'.join(traceback.format_exception(*sys.exc_info()))
            except Exception:
                traceback_info = ''
            print "*** ERROR PROCESSING: %s ***\nTraceback: %s\n" % (url, traceback_info)

    print "\nOperational Errors: %d\n" % errors_encountered
    if failed_urls:
        print "The following %d URLs failed:" % len(failed_urls)
        for url in failed_urls:
            print url
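# The module never invokes main() itself, so a standard entry-point guard like
# this sketch is the assumed way the script is launched. getopt expects the
# argument list without the program name, hence sys.argv[1:].
if __name__ == "__main__":
    main(sys.argv[1:])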
from functions import access_folder, get_filepath, read_folder, get_date_str
import os

data = get_date_str()      # date string used to name the archive folder
filepath = get_filepath()  # local folder holding the files to upload
files = read_folder(filepath)


def put_files_sftp(conn):
    # Starting from the SFTP root, move into the working folder, asking the
    # user for its name when the expected one is missing
    if conn.pwd == "/":
        print(conn.listdir())
        if "folder" in conn.listdir():
            access_folder(conn, "folder", conn.listdir())
        else:
            folder = input("Remote folder name: ")
            access_folder(conn, folder, conn.listdir())
    print(conn.listdir())
    arq = conn.listdir()
    arq.remove(arq[1])  # drop the second entry, keeping only the target folder
    if arq[0]:
        access_folder(conn, arq[0], arq)
    if conn.listdir():
        print("There are files from last week in the folder. They haven't been processed yet. Contact the developer.")
    else:
        localpath = filepath + "uploads/"
        for file in files:
            try:
                # Upload the local file, then archive it under a dated folder
                conn.put(filepath + file, file, callback=None, confirm=True)
                print("Transferring...{}".format(file))
                os.rename(filepath + file, localpath + data + "/" + file)
            except (IOError, OSError) as error:
                print("Failed to transfer {}: {}".format(file, error))
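# Usage sketch (assumption): the conn.pwd / conn.listdir() / conn.put() calls
# above match the pysftp Connection API, so a hypothetical caller could look
# like this. Host and credentials are placeholders, not values from this code.
import pysftp

with pysftp.Connection("sftp.example.com", username="user", password="secret") as conn:
    put_files_sftp(conn)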