def crawlprocess(url_objects, start, html_dir, status_dir, agent):
    """Fetch every Config.PROCESS_NUMBER-th URL beginning at `start`, writing crawl
    status to status_dir and (optionally) page content to html_dir."""
    status_file = open(status_dir + "/status_temp_" + str(start) + ".json", "w")
    content_file = None
    if Config.DATA_FORMAT == "ONE_FILE":
        # All pages fetched by this worker are appended to a single JSON file.
        content_file = open(html_dir + "/html_" + str(start) + ".json", "a+")
        save_content = save_content_one_file(content_file)
    elif Config.DATA_FORMAT == "MULTI_FILE":
        # Each page is written to its own file under html_dir.
        save_content = save_content_multi_file(html_dir)

    for i in range(start, len(url_objects), Config.PROCESS_NUMBER):
        url_obj = url_objects[i]
        url = url_obj["url"]
        try:
            if Config.USE_TOR:
                res = requests.get(url, headers=Config.HEADERS[agent], proxies=TOR_PROXY,
                                   verify=False, timeout=5)
            else:
                res = requests.get(url, headers=Config.HEADERS[agent], verify=False, timeout=5)
            if Config.SAVE_HTML:
                save_content(url, res)
            save_response(url, URLUtility.encode(url), str(res.status_code), None,
                          res.headers, agent, url_obj, status_file)
        except requests.ConnectionError:
            # In the event of a network problem (e.g. DNS failure, refused connection, etc.)
            save_response(url, URLUtility.encode(url), None, "ConnectionError", None,
                          agent, url_obj, status_file)
        except requests.HTTPError:
            # In the rare event of an invalid HTTP response
            save_response(url, URLUtility.encode(url), None, "HTTPError", None,
                          agent, url_obj, status_file)
        except requests.Timeout:
            save_response(url, URLUtility.encode(url), None, "Timeout", None,
                          agent, url_obj, status_file)
        except requests.TooManyRedirects:
            save_response(url, URLUtility.encode(url), None, "TooManyRedirects", None,
                          agent, url_obj, status_file)
        except Exception:
            save_response(url, URLUtility.encode(url), None, "OtherExceptions", None,
                          agent, url_obj, status_file)

    status_file.close()
    if content_file:
        content_file.close()
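
# The Config.PROCESS_NUMBER stride in the loop above suggests that crawlprocess is meant
# to run once per worker, each worker taking a different start offset. The driver below
# is an illustrative sketch under that assumption; the crawl_all name and the
# multiprocessing-based fan-out are not taken from the original module.
from multiprocessing import Process

def crawl_all(url_objects, html_dir, status_dir, agent):
    # Spawn one worker per offset; each processes every PROCESS_NUMBER-th URL.
    workers = [Process(target=crawlprocess,
                       args=(url_objects, start, html_dir, status_dir, agent))
               for start in range(Config.PROCESS_NUMBER)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()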
def save_content_multi_file(html_dir):
    # Factory used when Config.DATA_FORMAT == "MULTI_FILE": returns a writer that
    # stores each fetched page in its own <encoded-url>.html file under html_dir.
    def save_content(url, res):
        html_filename = html_dir + "/" + URLUtility.encode(url) + ".html"
        # Open in binary mode so the UTF-8 encoded body can be written directly.
        html_file = open(html_filename, "wb")
        html_file.write(res.text.encode('utf-8'))
        html_file.close()
    return save_content
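
# Hedged sketch of the ONE_FILE counterpart consumed in crawlprocess. Only the factory
# name save_content_one_file, the shared content_file handle, and the (url, res) call
# signature come from the code above; the JSON-lines record layout is an assumption.
import json

def save_content_one_file(content_file):
    # Returns a writer that appends one JSON record per fetched page to content_file.
    def save_content(url, res):
        record = {"url": url, "html": res.text}
        content_file.write(json.dumps(record) + "\n")
    return save_content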