Example 1
def crawlprocess(url_objects, start, html_dir, status_dir, agent):
    """Crawl every Config.PROCESS_NUMBER-th URL starting at index `start`,
    optionally saving page content and writing a per-URL status record."""
    status_file = open(status_dir + "/status_temp_" + str(start) + ".json", "w")
    content_file = None
    if Config.DATA_FORMAT == "ONE_FILE":
        content_file = open(html_dir + "/html_" + str(start) + ".json", "a+")
        save_content = save_content_one_file(content_file)
    elif Config.DATA_FORMAT == "MULTI_FILE":
        save_content = save_content_multi_file(html_dir)

    for i in range(start, len(url_objects), Config.PROCESS_NUMBER):
        url_obj = url_objects[i]
        url = url_obj["url"]
        try:
            if Config.USE_TOR:
                res = requests.get(url, headers=Config.HEADERS[agent], proxies=TOR_PROXY, verify=False, timeout=5)
            else:
                res = requests.get(url, headers=Config.HEADERS[agent], verify=False, timeout=5)
            if Config.SAVE_HTML:
                save_content(url, res)
            save_response(url, URLUtility.encode(url), str(res.status_code), None, res.headers, agent, url_obj, status_file)
        except requests.ConnectionError:
            #In the event of a network problem (e.g. DNS failure, refused connection, etc)
            save_response(url, URLUtility.encode(url), None, "ConnectionError", None, agent, url_obj, status_file)
        except requests.HTTPError:
            #In the rare event of an invalid HTTP response
            save_response(url, URLUtility.encode(url), None, "HTTPError", None, agent, url_obj, status_file)
        except requests.Timeout:
            save_response(url, URLUtility.encode(url), None, "Timeout", None, agent, url_obj, status_file)
        except requests.TooManyRedirects:
            save_response(url, URLUtility.encode(url), None, "TooManyRedirects", None, agent, url_obj, status_file)
        except Exception:
            save_response(url, URLUtility.encode(url), None, "OtherExceptions", None, agent, url_obj, status_file)
    status_file.close()
    if content_file:
        content_file.close()
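
Both examples call a save_response helper that is not included in the listings. From the call sites it evidently appends one record per crawled URL to status_file, carrying either a status code or an error label. The following is a minimal sketch of such a helper, assuming JSON-lines output; the field names are illustrative and not part of the original code:

import json

def save_response(url, encoded_url, status_code, error, headers, agent, url_obj, status_file):
    # Hypothetical reconstruction; the real helper is not shown in these examples.
    # One JSON object per line keeps the status file easy to stream back later.
    record = {
        "url": url,
        "encoded_url": encoded_url,
        "status_code": status_code,  # None when the request raised an exception
        "error": error,              # e.g. "ConnectionError" or "Timeout"; None on success
        "headers": dict(headers) if headers else None,
        "agent": agent,
        "url_obj": url_obj,
    }
    status_file.write(json.dumps(record) + "\n")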
Example 2
def crawlprocess(url_objects, start, html_dir, status_dir, agent):
    status_file = open(status_dir + "/status_temp_" + str(start) + ".json",
                       "w")
    content_file = None
    if Config.DATA_FORMAT == "ONE_FILE":
        content_file = open(html_dir + "/html_" + str(start) + ".json", "a+")
        save_content = save_content_one_file(content_file)
    elif Config.DATA_FORMAT == "MULTI_FILE":
        save_content = save_content_multi_file(html_dir)

    for i in range(start, len(url_objects), Config.PROCESS_NUMBER):
        url_obj = url_objects[i]
        url = url_obj["url"]
        try:
            if Config.USE_TOR:
                res = requests.get(url,
                                   headers=Config.HEADERS[agent],
                                   proxies=TOR_PROXY,
                                   verify=False,
                                   timeout=5)
            else:
                res = requests.get(url,
                                   headers=Config.HEADERS[agent],
                                   verify=False,
                                   timeout=5)
            if Config.SAVE_HTML:
                save_content(url, res)
            save_response(url, URLUtility.encode(url), str(res.status_code),
                          None, res.headers, agent, url_obj, status_file)
        except requests.ConnectionError:
            #In the event of a network problem (e.g. DNS failure, refused connection, etc)
            save_response(url, URLUtility.encode(url), None, "ConnectionError",
                          None, agent, url_obj, status_file)
        except requests.HTTPError:
            #In the rare event of an invalid HTTP response
            save_response(url, URLUtility.encode(url), None, "HTTPError", None,
                          agent, url_obj, status_file)
        except requests.Timeout:
            save_response(url, URLUtility.encode(url), None, "Timeout", None,
                          agent, url_obj, status_file)
        except requests.TooManyRedirects:
            save_response(url, URLUtility.encode(url), None,
                          "TooManyRedirects", None, agent, url_obj,
                          status_file)
        except Exception:
            save_response(url, URLUtility.encode(url), None, "OtherExceptions",
                          None, agent, url_obj, status_file)
    status_file.close()
    if content_file:
        content_file.close()
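
The ONE_FILE branch relies on a save_content_one_file factory that is likewise not shown. Judging from the multi-file counterpart in Example 3, it presumably returns a save_content(url, res) closure that appends each page to the single html_<start>.json file opened in append mode above. The sketch below is an assumption about its shape; the record keys are invented for illustration:

import json

def save_content_one_file(content_file):
    # Hypothetical reconstruction of the factory used by the ONE_FILE format.
    def save_content(url, res):
        record = {
            "url": URLUtility.encode(url),  # URLUtility comes from the surrounding project
            "html": res.text,
        }
        content_file.write(json.dumps(record) + "\n")
    return save_content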
Example 3
def save_content(url, res):
    # html_dir is a free variable here: this function is meant to be defined
    # inside a factory that captures it by closure (see the sketch below).
    html_filename = html_dir + "/" + URLUtility.encode(url) + ".html"
    html_file = open(html_filename, "wb")
    # Write explicitly encoded bytes so the same code behaves on Python 2 and 3.
    html_file.write(res.text.encode('utf-8'))
    html_file.close()
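
Since html_dir is not a parameter, this save_content is evidently the inner function of the save_content_multi_file(html_dir) factory used in Examples 1 and 2. A possible wrapper is sketched below, together with a hypothetical driver showing how crawlprocess might be fanned out over Config.PROCESS_NUMBER worker processes (an assumption suggested by the stride in the crawl loop; run_crawl is not part of the original code):

import multiprocessing

def save_content_multi_file(html_dir):
    # Captures html_dir by closure and returns the per-URL writer shown above.
    def save_content(url, res):
        html_filename = html_dir + "/" + URLUtility.encode(url) + ".html"
        with open(html_filename, "wb") as html_file:
            html_file.write(res.text.encode("utf-8"))
    return save_content

def run_crawl(url_objects, html_dir, status_dir, agent):
    # Hypothetical driver: each worker starts at a different offset, and the
    # stride of Config.PROCESS_NUMBER inside crawlprocess partitions the URLs.
    workers = []
    for start in range(Config.PROCESS_NUMBER):
        p = multiprocessing.Process(
            target=crawlprocess,
            args=(url_objects, start, html_dir, status_dir, agent))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()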