Example #1
def convert_article_ALLxml(spy=False):
    """Converte todos os arquivos HTML/XML que estão na pasta fonte."""

    logger.debug("Starting XML conversion; it may take some time.")
    logger.warning("If you are facing problems with Python crashing during "
                   "conversion, try exporting this environment "
                   "variable: `OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`")

    xmls = [
        os.path.join(config.get("SOURCE_PATH"), xml)
        for xml in files.xml_files_list(config.get("SOURCE_PATH"))
    ]

    jobs = [{"file_xml_path": xml, "spy": spy} for xml in xmls]

    with tqdm(total=len(xmls)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not convert file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            convert_article_xml,
            jobs=jobs,
            executor=concurrent.futures.ProcessPoolExecutor,
            max_workers=int(config.get("PROCESSPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
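
None of these examples include `DoJobsConcurrently` itself. Assuming it wraps `concurrent.futures` in the obvious way (a sketch under that assumption, not the project's actual implementation; the ThreadPoolExecutor default matches the comment in Example #4), it could look roughly like this:

import concurrent.futures

def do_jobs_concurrently(
    func,
    jobs,
    executor=concurrent.futures.ThreadPoolExecutor,
    max_workers=2,
    success_callback=None,
    exception_callback=None,
    update_bar=None,
):
    """Hypothetical stand-in for DoJobsConcurrently: run func(**job) for
    each job dict and dispatch the callbacks as the futures complete."""
    with executor(max_workers=max_workers) as pool:
        future_to_job = {pool.submit(func, **job): job for job in jobs}
        for future in concurrent.futures.as_completed(future_to_job):
            job = future_to_job[future]
            try:
                result = future.result()
            except Exception as exception:
                if exception_callback is not None:
                    exception_callback(exception, job)
            else:
                if success_callback is not None:
                    success_callback(result)
            if update_bar is not None:
                update_bar()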
Example #2
def rollback_kernel_documents(
    session_db: object,
    import_output_path: str,
    extracted_title_path: str,
    output_path: str,
) -> None:
    """
    Baseado no arquivo `output_path`, desfaz o import dos documentos, o relacionamento 
    no `document bundle`s e o registro de mudança"""

    journals = get_journals_from_json(extracted_title_path)

    with open(import_output_path) as f:
        jobs = [{
            "doc_info": json.loads(doc_info),
            "session": session_db,
            "journals": journals,
        } for doc_info in f.readlines() if doc_info]

    with tqdm(total=len(jobs)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def write_result_to_file(result, path=output_path):
            with open(path, "a") as f:
                f.write(json.dumps(result) + "\n")

        def exception_callback(exception, job, logger=logger):
            logger.exception(
                "Could not roll back document '%s'. The following exception "
                "was raised: '%s'.",
                job["doc_info"].get("pid_v3"),
                exception,
            )

        # Keep max_workers=1 until transactional control is implemented.
        DoJobsConcurrently(
            rollback_document,
            jobs=jobs,
            max_workers=1,
            success_callback=write_result_to_file,
            exception_callback=exception_callback,
            update_bar=update_bar,
        )
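
Since `write_result_to_file` appends one JSON document per line, `output_path` ends up in JSON Lines format. Reading the rollback results back is straightforward (a usage sketch; the file name is hypothetical):

import json

with open("rollback_results.jsonl") as f:  # hypothetical output_path
    results = [json.loads(line) for line in f if line.strip()]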
Example #3
def extract_all_data(list_documents_pids: List[str]):
    """Extrai documentos XML a partir de uma lista de PIDS
    de entrada"""

    pids_to_extract, pids_extracteds, stage_path = files.fetch_stages_info(
        list_documents_pids, __name__)

    jobs = [{"pid": pid, "stage_path": stage_path} for pid in pids_to_extract]

    with tqdm(total=len(list_documents_pids)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        DoJobsConcurrently(
            get_and_write,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            update_bar=update_bar,
        )
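
Note that the bar's total counts every input pid, while jobs are created only for `pids_to_extract`. If `fetch_stages_info` filters out pids handled in a previous run (an assumption; the helper is not shown here), the bar can be pre-advanced inside the `with` block so it still reaches 100%:

    # Assumption: pids_extracteds holds the pids skipped as already done.
    pbar.update(len(pids_extracteds))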
Example #4
def import_documents_to_kernel(session_db, pid_database_engine, storage, folder, output_path) -> None:
    """Armazena os arquivos do pacote SPS em um object storage, registra o documento
    no banco de dados do Kernel e por fim associa-o ao seu `document bundle`"""

    jobs = [
        {
            "folder": package_folder,
            "session": session_db,
            "storage": storage,
            "pid_database_engine": pid_database_engine,
        }
        for package_folder, _, filenames in os.walk(folder)
        if filenames
    ]

    with tqdm(total=len(jobs)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def write_result_to_file(result, path=output_path):
            with open(path, "a") as f:
                f.write(json.dumps(result) + "\n")

        def exception_callback(exception, job, logger=logger):
            logger.error(
                "Could not import package '%s'. The following exception "
                "was raised: '%s'.",
                job["folder"],
                exception,
            )

        # The executor param defaults to concurrent.futures.ThreadPoolExecutor.
        # Switching to concurrent.futures.ProcessPoolExecutor can speed things
        # up, but the change has caveats worth understanding first; see:
        # https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor
        DoJobsConcurrently(
            register_document,
            jobs=jobs,
            max_workers=int(config.get("PROCESSPOOL_MAX_WORKERS")),
            success_callback=write_result_to_file,
            exception_callback=exception_callback,
            update_bar=update_bar,
        )
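
The job list above keeps one entry per directory under `folder` that directly contains at least one file. The same filter can be factored out as a small generator (a sketch mirroring the comprehension, not part of the original code):

import os

def package_dirs(folder):
    """Yield each directory that directly contains at least one file.
    os.walk always yields a list of names, so a truthiness check is enough."""
    for package_folder, _, filenames in os.walk(folder):
        if filenames:
            yield package_folder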
Example #5
    def run(self):
        fieldnames = (
            "pid",
            "aop_pid",
            "file_path",
            "date_collection",
            "date_created",
            "date_updated",
            "acron",
            "volnum",
            "lang",
        )
        with open(self.articles_csvfile, encoding="utf-8",
                  errors="replace") as csvfile:
            # pid, aoppid, file, pubdate, epubdate, update, acron, volnum
            articles_data_reader = csv.DictReader(csvfile,
                                                  fieldnames=fieldnames)
            jobs = [{"row": row} for row in articles_data_reader]
            with tqdm(total=len(jobs)) as pbar:

                def update_bar(pbar=pbar):
                    pbar.update(1)

                def exception_callback(exception, job, logger=logger):
                    logger.error(
                        "Could not import package '%s'. The following exception "
                        "was raised: '%s'.",
                        job["row"],
                        exception,
                    )

                DoJobsConcurrently(
                    self.start_collect,
                    jobs=jobs,
                    max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
                    exception_callback=exception_callback,
                    update_bar=update_bar,
                )
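# Editorial sketch (hypothetical sample values, not part of the original code):
# because `fieldnames` is passed explicitly above, csv.DictReader treats every
# CSV line as data rather than as a header. One line maps to a `row` dict:
#
#     import csv, io
#     sample = "S0001-37652021000101,,file.xml,2021-01,2021-01-05,2021-02-01,aabc,v93n1,en\n"
#     reader = csv.DictReader(io.StringIO(sample), fieldnames=fieldnames)
#     next(reader)["pid"]  # -> "S0001-37652021000101"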
def check_documents_availability_in_website(pids: List[str],
                                            target_string: str,
                                            output: IO = None) -> None:
    """ Dada uma lista de pids, esta função verifica no site informado quais
    pids não estão disponíveis.
    
    Params:
        pids (List[str]): Lista de pids para verificar.
        target_string (str): Endereço da página de artigo no site algo.
        output (IO): Arquivo onde as URLs não disponíveis serão registradas.

    Return:
        None
    """

    template = string.Template(target_string)

    if "$id" not in target_string:
        return logger.error(
            "The target string must contain a $id variable. If you are "
            "facing troubles, try to escape the variable, e.g. '\\$id'.")

    def access_website_and_report(url, output, poison_pill):
        """Acessa uma URL e reporta o seu status de resposta"""

        if poison_pill.poisoned:
            return

        response = requests.head(url)

        if response.status_code not in (200, 301, 302):
            logger.error(
                "The URL '%s' is not available. Returned the status code '%s'.",
                url,
                response.status_code,
            )

            if output is not None:
                try:
                    output.write(url + "\n")
                except IOError as exc:
                    logger.error(
                        "Cannot write to the file. The exception '%s' was raised.",
                        exc)

    jobs = [{
        "url": template.substitute({"id": pid.strip()}),
        "output": output
    } for pid in pids]

    with tqdm(total=len(jobs)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def exception_callback(exception, job, logger=logger):
            logger.error(
                "Could not check availability for URL '%s'. The following exception "
                "was raised: '%s'.",
                job["url"],
                exception,
            )

            logger.exception(exception)

        DoJobsConcurrently(
            access_website_and_report,
            jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            exception_callback=exception_callback,
            update_bar=update_bar,
        )
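
For reference, the `$id` placeholder is resolved by `string.Template.substitute`, which is why the function rejects target strings without it. A quick usage sketch (the URL is hypothetical):

import string

template = string.Template("https://www.scielo.br/article/$id")
print(template.substitute({"id": "S0001-37652021000101"}))
# -> https://www.scielo.br/article/S0001-37652021000101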