import codecs
import webbrowser
from pathlib import Path

import requests


def print_results_tda(file_name, data, key_word):
    # the record count travels in the last three characters of the data block
    tda_number_of_records = data[-3:]
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        html = f.read().replace("tda_data", data)
        html = html.replace("tda_number_of_records", tda_number_of_records)
    # output file name
    file_name = Path(FOLDER_NAME + SLASH + key_word + FILE_LASTNAME +
                     FILE_EXTENSION)
    # drop any stale copy, then write the fresh report
    if file_name.exists():
        try:
            file_name.unlink()
        except OSError as error:
            print("Error: {} - {}.".format(error.filename, error.strerror))
    create_data_files(FOLDER_NAME, key_word + FILE_LASTNAME, html)

    webbrowser.open(str(file_name), new=1, autoraise=True)
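
# A minimal sketch of the create_data_files helper every snippet here relies
# on; the real project defines it elsewhere, so this stub only illustrates
# the contract assumed above: write `content` into folder_name under
# file_name plus the configured extension.
def create_data_files(folder_name, file_name, content):
    out_dir = Path(folder_name)
    out_dir.mkdir(parents=True, exist_ok=True)  # create the folder on demand
    out_path = out_dir / (file_name + FILE_EXTENSION)
    with codecs.open(str(out_path), 'w', encoding='utf-8') as out:
        out.write(content)
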
def print_all_results(file_name, key_word, tda_data, poli_data, colma_data):
    # open the base template file
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        html = f.read()
        if tda_data:
            tda_number_of_records = tda_data[-3:]
            html = html.replace("tda_data", tda_data)
            html = html.replace("tda_number_of_records", tda_number_of_records)
        else:
            html = html.replace("tda_data", "No hay registros")
            html = html.replace("tda_number_of_records", "0")
        if poli_data:
            polijic_number_of_records = poli_data[-3:]
            html = html.replace("poli_data", poli_data)
            html = html.replace("polijic_number_of_records",
                                polijic_number_of_records)
        else:
            html = html.replace("poli_data", "No hay registros")
            html = html.replace("polijic_number_of_records", "0")
        if colma_data:
            colma_number_of_records = colma_data[-3:]
            html = html.replace("colma_data", colma_data)
            html = html.replace("colma_number_of_records",
                                colma_number_of_records)
        else:
            html = html.replace("colma_data", "No hay registros")
            html = html.replace("colma_number_of_records", "0")
    # output file name
    file_name = Path(FOLDER_NAME + SLASH + key_word + FILE_LASTNAME +
                     FILE_EXTENSION)
    # link to the exported XML file
    xml_link = key_word + XML_LAST_NAME + XML_EXTENSION
    html = html.replace("xml_data", xml_link)
    # drop any stale copy, then write the fresh report
    if file_name.exists():
        try:
            file_name.unlink()
        except OSError as error:
            print("Error: {} - {}.".format(error.filename, error.strerror))
    create_data_files(FOLDER_NAME, key_word + FILE_LASTNAME, html)

    webbrowser.open(str(file_name), new=1, autoraise=True)
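
# For reference, a minimal sketch of the template the replace() calls above
# operate on; the project's real template ships separately, so this markup is
# illustrative only and shows just the placeholder tokens.
SAMPLE_TEMPLATE = """<html><body>
<h2>TDA: tda_number_of_records</h2><div>tda_data</div>
<h2>POLIJIC: polijic_number_of_records</h2><div>poli_data</div>
<h2>COLMA: colma_number_of_records</h2><div>colma_data</div>
<a href="xml_data">XML</a>
</body></html>"""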
Example #3
def crawl_page_for_search(url_to_crawl, key_word, folder_name):
    try:
        # work out which institution the url belongs to
        if 'aleph' in str(url_to_crawl):
            url_institution = 'POLIJIC'
            # issue a GET request to the url
            webpage = requests.get(url_to_crawl, verify=False)
        elif 'tdea' in str(url_to_crawl):
            url_institution = 'TDA'
            # issue a POST request to the url
            webpage = requests.post(url_to_crawl, verify=False)
        else:
            url_institution = 'COLMA'
            # issue a POST request to the url
            webpage = requests.post(url_to_crawl, verify=False)

        content = webpage.text
        file_name = key_word + '-' + url_institution
        create_data_files(folder_name, file_name, content)
    except Exception as e:
        print(str(e))
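
# A usage sketch with a made-up catalogue URL; the 'tdea' substring routes it
# down the TDA branch above. verify=False disables TLS verification, so
# urllib3 emits InsecureRequestWarning unless it is silenced first.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
crawl_page_for_search('https://biblioteca.tdea.example/search?q=quimica',
                      'quimica', 'results')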
Example #4
def crawl_page(thread_name):
    url_info = Crawler.fetch_url_info()
    if url_info is not None:
        url_to_crawl = url_info[1]
        url_id = url_info[0]
        theme_id = url_info[2]
        url_institution = url_info[3]
        print(thread_name + ' crawling')
        try:
            # issue a GET request to the url
            webpage = requests.get(url_to_crawl, verify=False)
            # decode the body; requests resolves the charset from the response
            content = webpage.text
            file_name = Crawler.get_theme(theme_id) + '-' + url_institution
            create_data_files(Crawler.folder_name, file_name, content)
            Crawler.update_url(1, url_id)
        except Exception as e:
            print(str(e))
            Crawler.update_url(0, url_id)
    else:
        print('Error fetching url')
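
# A sketch of how crawl_page might be driven from worker threads; the
# Crawler class referenced above (fetch_url_info, get_theme, update_url,
# folder_name) is assumed to be defined elsewhere in the project.
import threading

threads = [threading.Thread(target=crawl_page, args=('Thread-' + str(i),))
           for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()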