def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(asset_replacements))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        bad_pkg_path,
        sps_package.package_name,
    )

    files.write_file(
        os.path.join(package_path, "manifest.json"),
        json.dumps(renditions_metadata),
    )

    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )
def convert_article_ALLxml(spy=False):
    """Convert every HTML/XML file found in the source folder."""

    logger.debug("Starting XML conversion, it may take some time.")
    logger.warning(
        "If you are facing problems with Python crashing during "
        "conversion try to export this environment "
        "variable: `OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`"
    )

    xmls = [
        os.path.join(config.get("SOURCE_PATH"), xml)
        for xml in files.xml_files_list(config.get("SOURCE_PATH"))
    ]

    jobs = [{"file_xml_path": xml, "spy": spy} for xml in xmls]

    with tqdm(total=len(xmls)) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not convert file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            convert_article_xml,
            jobs=jobs,
            executor=concurrent.futures.ProcessPoolExecutor,
            max_workers=int(config.get("PROCESSPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
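# Minimal sketch of a DoJobsConcurrently-style helper, inferred from how it is
# called in this module (jobs as kwargs dicts, a pluggable executor, an
# exception callback and a progress callback). The real helper in this project
# may differ; this is only an assumption for illustration.
import concurrent.futures


def do_jobs_concurrently_sketch(
    func,
    jobs,
    executor=concurrent.futures.ThreadPoolExecutor,
    max_workers=4,
    exception_callback=None,
    update_bar=None,
):
    with executor(max_workers=max_workers) as pool:
        future_to_job = {pool.submit(func, **job): job for job in jobs}
        for future in concurrent.futures.as_completed(future_to_job):
            try:
                future.result()
            except Exception as exc:  # report the failure, keep processing
                if exception_callback:
                    exception_callback(exc, future_to_job[future])
            finally:
                if update_bar:
                    update_bar()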
def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    files.make_empty_dir(pkg_path)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(asset_replacements))

    package_path = packing_assets(
        asset_replacements, pkg_path, bad_pkg_path, sps_package.package_name
    )

    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )
def get_and_write(pid, stage_path, poison_pill):
    def save_file(stage_path, file_path, documents_pid, article_content):
        logger.debug("\t Salvando arquivo '%s'", file_path)
        files.write_file(file_path, article_content)
        files.register_latest_stage(stage_path, documents_pid)

    if poison_pill.poisoned:
        return

    documents_pid = pid.strip()

    logger.debug("\t coletando dados do Documento '%s'", documents_pid)

    xml_article = article.ext_article_txt(documents_pid)
    if xml_article:
        save_file(
            stage_path,
            os.path.join(config.get("SOURCE_PATH"), "%s.xml" % documents_pid),
            documents_pid,
            xml_article,
        )

    json_article = article.ext_article_json(documents_pid)
    if json_article:
        save_file(
            stage_path,
            os.path.join(config.get("SOURCE_PATH"), "%s.json" % documents_pid),
            documents_pid,
            json_article,
        )
def download_asset(old_path, new_fname, dest_path):
    """Returns msg, if error"""
    if old_path.startswith("http"):
        location = old_path
    else:
        try:
            location = urljoin(config.get("STATIC_URL_FILE"), old_path.strip())
        except ValueError as exc:
            return 'cannot join URL parts "%s" and "%s": %s' % (
                config.get("STATIC_URL_FILE"),
                old_path,
                exc,
            )

    # Check whether the file has already been downloaded
    filename_m, ext_m = files.extract_filename_ext_by_path(old_path)
    dest_path_file = os.path.join(dest_path, "%s%s" % (new_fname.strip(), ext_m))
    if os.path.exists(dest_path_file):
        logger.info("Arquivo ja baixado: %s", dest_path_file)
        return

    try:
        request_file = request.get(location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as e:
        try:
            msg = str(e)
        except TypeError:
            msg = "Unknown error"
        logger.error(e)
        return msg
    else:
        files.write_file_binary(dest_path_file, request_file.content)
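# Call-site sketch (paths and names are hypothetical): download_asset returns
# None when the file is written or already present, and an error message
# string on failure, so callers can collect the failures for a package.
#
#     error = download_asset(
#         "/img/revistas/aiss/v45n4/a01fig01.gif", "a01fig01", pkg_path
#     )
#     if error:
#         errors.append(error)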
def ext_identifiers():
    journals_id = request.get(
        "%s/journal/identifiers/" % config.get("AM_URL_API"),
        params={"collection": config.get("SCIELO_COLLECTION")},
    ).json()

    return journals_id
def list_converted_xml_view(request):
    list_files_xmls = files.xml_files_list(config.get("CONVERSION_PATH"))
    list_files_xmls += files.xml_files_list(config.get("VALID_XML_PATH"))

    xmls = Page(
        list_files_xmls,
        page=int(request.params.get("page", 1)),
        items_per_page=20,
        item_count=len(list_files_xmls),
    )

    return {"xmls": xmls, "page_title": "Lista de XMLS Convertidos"}
def ext_journal(issn):
    journal = request.get(
        "%s/journal" % config.get("AM_URL_API"),
        params={"collection": config.get("SCIELO_COLLECTION"), "issn": issn},
    ).json()

    return Journal(journal[0])
def ext_identifiers(issn_journal):
    articles_id = request.get(
        "%s/article/identifiers/" % config.get("AM_URL_API"),
        params={
            "collection": config.get("SCIELO_COLLECTION"),
            "issn": issn_journal,
        },
    )

    if articles_id:
        return articles_id.json()
def convert_article_ALLxml():
    logger.info("Iniciando Conversão do xmls")
    list_files_xmls = files.xml_files_list(config.get("SOURCE_PATH"))
    for file_xml in tqdm(list_files_xmls):
        try:
            convert_article_xml(os.path.join(config.get("SOURCE_PATH"), file_xml))
        except Exception as ex:
            logger.error(file_xml)
            logger.exception(ex)
def get_asset(old_path, new_fname, dest_path):
    """Fetch a digital asset from the file system and persist it in
    ``dest_path``.

    Args:
        old_path: original path of the asset
        new_fname: new file name for the asset
        dest_path: destination folder

    Returns:
        None. The asset is persisted in ``dest_path``.

    Raises:
        AssetNotFoundError: if the asset cannot be found in any of the
        source folders.
    """
    if old_path.startswith("http"):
        asset_path = urlparse(old_path).path
    else:
        asset_path = old_path
    asset_path = asset_path.strip('/')

    # Check whether the file has already been downloaded
    filename_m, ext_m = files.extract_filename_ext_by_path(old_path)
    dest_path_file = os.path.join(dest_path, "%s%s" % (new_fname.strip(), ext_m))
    if os.path.exists(dest_path_file):
        logger.debug("Arquivo já armazenado na pasta de destino: %s", dest_path_file)
        return

    paths = [
        os.path.join(config.get('SOURCE_IMG_FILE'), asset_path),
        os.path.join(config.get('SOURCE_PDF_FILE'), asset_path),
    ]
    if (filename_m, ext_m) == ("seta", ".gif"):
        seta_path = os.path.join(config.get('SOURCE_IMG_FILE'), "img", "seta.gif")
        paths.insert(0, seta_path)

    try:
        for path in paths:
            path = find_file(path)
            if path:
                break
        content = files.read_file_binary(path)
    except (TypeError, FileNotFoundError, IOError):
        raise AssetNotFoundError(f"Not found {old_path}")
    else:
        files.write_file_binary(dest_path_file, content)
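# Call-site sketch (paths are hypothetical): unlike download_asset, get_asset
# resolves the asset on the local file system (SOURCE_IMG_FILE, then
# SOURCE_PDF_FILE) and raises AssetNotFoundError instead of returning an
# error message.
#
#     try:
#         get_asset("/img/revistas/aiss/v45n4/a01fig01.gif", "a01fig01", pkg_path)
#     except AssetNotFoundError as exc:
#         logger.error(str(exc))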
def pack_article_ALLxml():
    logger.info("Empacotando os documentos XML")
    list_files_xmls = files.xml_files_list(config.get("VALID_XML_PATH"))
    for file_xml in tqdm(list_files_xmls):
        try:
            pack_article_xml(os.path.join(config.get("VALID_XML_PATH"), file_xml))
        except (PermissionError, OSError, etree.Error) as ex:
            logger.error("Falha no empacotamento de %s" % file_xml)
            logger.exception(ex)
def ext_article(code, **ext_params):
    params = ext_params
    params.update({"collection": config.get("SCIELO_COLLECTION"), "code": code})

    try:
        article = request.get("%s/article" % config.get("AM_URL_API"), params=params)
    except request.HTTPGetError:
        logger.error("Erro coletando dados do artigo PID %s" % code)
    else:
        return article
def ext_journal(issn):
    try:
        journal = request.get(
            "%s/journal" % config.get("AM_URL_API"),
            params={"collection": config.get("SCIELO_COLLECTION"), "issn": issn},
        )
    except request.HTTPGetError:
        logger.error(
            "Journal nao encontrado: %s: %s" % (config.get("SCIELO_COLLECTION"), issn)
        )
    else:
        return Journal(journal.json()[0])
def reading_article_ALLxml():
    logger.info("Iniciando Leituras do xmls")
    list_files_xmls = files.list_dir(config.get("CONVERSION_PATH"))
    for file_xml in list_files_xmls:
        try:
            reading_article_xml(
                os.path.join(config.get("CONVERSION_PATH"), file_xml),
                move_success=False,
            )
        except Exception as ex:
            logger.error(file_xml)
            logger.exception(ex)
def ext_issue(code, **ext_params):
    issue = request.get(
        "%s/issue" % config.get("AM_URL_API"),
        params={"collection": config.get("SCIELO_COLLECTION"), "code": code},
    ).json()

    obj_issue = Issue(issue)
    return obj_issue
def register_documents(session_db, storage, documents_sorter, folder) -> None:
    """Import the SPS packages found under the given directory.

    The import process has the following stages: registration of the
    assets/renditions in the given object storage, registration of the
    manifest in the given Kernel database, and insertion of the documents
    into a ``documents_sorter`` so they can later be associated with their
    respective issues."""

    err_filename = os.path.join(config.get("ERRORS_PATH"), "insert_documents.err")

    for path, _, sps_files in os.walk(folder):
        if not sps_files:
            continue

        try:
            xml = list(filter(lambda f: f.endswith(".xml"), sps_files))[0]
            xml_path = os.path.join(path, xml)
            constructor.article_xml_constructor(xml_path, path, False)
            registration_result = register_document(path, session_db, storage)

            if registration_result:
                document_xml, document_id = registration_result
                documents_sorter.insert_document(document_id, document_xml)

        except (IndexError, ValueError, TypeError, exceptions.XMLError) as ex:
            msg = "Falha ao registrar documento %s: %s" % (path, ex)
            logger.error(msg)
            files.write_file(err_filename, msg, "a")
def test_register_documents_in_documents_bundle_no_issn_in_document(
    self, mk_read_json_file, mk_open
):
    documents = [
        {
            "pid_v3": "JwqGdMDrdcV3Z7MFHgtKvVk",
            "acron": "aiss",
            "eissn": None,
            "issn": None,
            "number": "4",
            "order": "00349",
            "pid": "S0021-25712009000400001",
            "supplement": None,
            "volume": "45",
            "year": "2009",
        },
    ]
    journals = [SAMPLES_JOURNAL]

    mk_read_json_file.return_value = journals
    mock_file = MagicMock()
    mock_file.readlines.return_value = [
        json.dumps(document) for document in documents
    ]
    mk_open.return_value.__enter__.return_value = mock_file
    mk_open.return_value.__exit__.return_value = Mock(return_value=False)

    inserting.register_documents_in_documents_bundle(
        Session(), "/tmp/documents.json", "/tmp/journals.json"
    )

    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )
    self.assertEqual(os.path.isfile(err_filename), True)
    with open(err_filename) as fp:
        content = fp.read()
    self.assertEqual(content, "JwqGdMDrdcV3Z7MFHgtKvVk\n")
def extract_all_data(list_documents_pids: List[str]):
    """Extract XML documents from a list of input PIDs."""

    pids_to_extract, pids_extracteds, stage_path = files.fetch_stages_info(
        list_documents_pids, __name__
    )

    logger.info("Iniciando extração dos Documentos")
    count = 0
    try:
        for documents_pid in tqdm(
            iterable=pids_to_extract,
            initial=len(pids_extracteds),
            total=len(list_documents_pids),
        ):
            documents_pid = documents_pid.strip()

            logger.debug("\t coletando dados do Documento '%s'", documents_pid)
            xml_article = article.ext_article_txt(documents_pid)
            if xml_article:
                count += 1
                file_path = os.path.join(
                    config.get("SOURCE_PATH"), "%s.xml" % documents_pid
                )
                logger.debug("\t Salvando arquivo '%s'", file_path)
                files.write_file(file_path, xml_article)
                files.register_latest_stage(stage_path, documents_pid)
    except KeyboardInterrupt:
        ...

    logger.info("\t Total de %s artigos", count)
async def fetch_articles(session, pid, cut_off_mark, output_filepath):
    """
    Fetch the article from each site instance and build a dictionary with the
    information needed for the JSON output.

    The variable ``comp_data`` has the following structure:

        {
         'classic': 'Introduction One of the major current public health problems remains sepsis, which persists with hig',
         'new': 'One of the major current public health problems remains sepsis, which persists with high hospital mo',
         'url_classic': 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-86502017000300175',
         'url_new': 'http://new.scielo.br/article/S0102-86502017000300175',
         'similarity': '72.22%',
         'pid_v2': 'S0102-86502017000300175',
         'similarity_technique': 'jaccard',
         'cut_off_mark': 90,
         'found_text_classic': true,
         'found_text_new': false
        }

    Args:
        session: aiohttp HTTP session object
        pid: article PID
        cut_off_mark: similarity threshold
        output_filepath: output file path

    Returns:
        None

    Raises:
        Does not raise exceptions.
    """
    comp_data = {}

    for inst in config.get("SITE_INSTANCES"):
        html = await fetch_article(session, pid, inst.get("url"))
        comp_data["%s" % inst.get("name")] = extract(
            html,
            inst.get("html"),
            inst.get("remove_tags"),
            inst.get("remove_texts"),
            inst.get("compare_tags"),
        )
        comp_data["url_%s" % inst.get("name")] = inst.get("url").format(pid)

    sim, percent = sim_jaccard(
        normalize(comp_data["classic"]), normalize(comp_data["new"])
    )

    comp_data["similarity"] = percent
    comp_data["found_text_classic"] = bool(comp_data["classic"])
    comp_data["found_text_new"] = bool(comp_data["new"])
    if (sim * 100) > cut_off_mark:
        del comp_data["classic"]
        del comp_data["new"]
    comp_data["pid_v2"] = pid
    comp_data["similarity_technique"] = "jaccard"
    comp_data["cut_off_mark"] = cut_off_mark

    dump_jsonl(output_filepath, [json.dumps(comp_data)])
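# Illustrative sketch only: the real sim_jaccard/normalize helpers live
# elsewhere in this project. This is an assumption of how a word-level
# Jaccard similarity returning both a ratio and a percentage string (as used
# in the 'similarity' field above) could look.
def _sim_jaccard_sketch(text_a: str, text_b: str):
    tokens_a, tokens_b = set(text_a.split()), set(text_b.split())
    if not tokens_a and not tokens_b:
        return 1.0, "100.00%"
    ratio = len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
    return ratio, "%.2f%%" % (ratio * 100)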
def validate_article_xml(file_xml_path, print_error=True):
    result = {}
    logger.debug(file_xml_path)

    try:
        xmlvalidator = XMLValidator.parse(file_xml_path)
        if config.get("VALIDATE_ALL") == "TRUE":
            is_valid, errors = xmlvalidator.validate_all()
        else:
            is_valid, errors = xmlvalidator.validate()
    except (exceptions.XMLSPSVersionError, etree.LxmlError) as e:
        result[str(e)] = {
            "count": 1,
            "lineno": [1],
            "message": [str(e)],
            "filename": {file_xml_path},
        }
        return result

    if not is_valid:
        for error in errors:
            if print_error:
                logger.error("%s - %s - %s", error.level, error.line, error.message)

            message = error.message[:80]
            data = {
                "count": 1,
                "lineno": [error.line],
                "message": [error.message],
                "filename": {file_xml_path},
            }
            dicts.merge(result, message, data)

    return result
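# Illustrative shape of the dict returned by validate_article_xml, derived
# from the code above (values are made up; dicts.merge is assumed to
# accumulate counts and line numbers for errors sharing the same message):
#
#     {
#         "Element 'article-meta': Missing child element(s)": {
#             "count": 2,
#             "lineno": [57, 113],
#             "message": ["Element 'article-meta': Missing child element(s)", ...],
#             "filename": {"/data/valid_xml/S0102-8650.xml"},
#         },
#     }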
def configure_logger():
    l_config.dictConfig({
        "version": 1,
        "formatters": {
            "default": {
                "format": "%(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s",
                "datefmt": "%Y-%m-%d %H:%M:%S",
            }
        },
        "handlers": {
            "console": {
                "level": "DEBUG",
                "class": "logging.StreamHandler",
                "formatter": "default",
                "stream": "ext://sys.stdout",
            },
            "file": {
                "level": "ERROR",
                "class": "logging.handlers.RotatingFileHandler",
                "formatter": "default",
                "filename": os.path.join(config.get("LOGGER_PATH"), "migracao.log"),
                "maxBytes": 10 * 1024,
                "backupCount": 3,
            },
        },
        "loggers": {
            "": {"level": "DEBUG", "handlers": ["console", "file"]}
        },
        "disable_existing_loggers": False,
    })
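# Usage sketch: configure_logger assumes LOGGER_PATH points to an existing,
# writable directory. DEBUG and above go to stdout; ERROR and above are also
# appended to migracao.log, rotated at roughly 10 KB with 3 backups kept.
#
#     configure_logger()
#     logging.getLogger(__name__).error("something went wrong")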
def test_register_documents_in_documents_bundle(
    self, mk_link_documents_bundle_with_documents, mk_read_json_file
):
    documents = {
        "JwqGdMDrdcV3Z7MFHgtKvVk": {
            "acron": "aiss",
            "eissn": None,
            "issn": "0036-3634",
            "number": "04",
            "order": "00349",
            "pid": "S0021-25712009000400001",
            "pissn": "0036-3634",
            "supplement": None,
            "volume": "45",
            "year": "2009",
        }
    }
    journals = [SAMPLES_JOURNAL]
    mk_read_json_file.side_effect = [journals, documents]
    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )

    session_db = Session()
    manifest = inserting.ManifestDomainAdapter(SAMPLE_ISSUES_KERNEL[0])
    session_db.documents_bundles.add(manifest)

    inserting.register_documents_in_documents_bundle(
        session_db, "/tmp/documents.json", "/tmp/journals.json"
    )
    self.assertEqual(os.path.isfile(err_filename), True)
    with open(err_filename) as fp:
        content = fp.read()
    self.assertEqual(content, "0036-3634-2009-v45-n4\n")
def convert_article_xml(file_xml_path):
    obj_xmltree = xml.loadToXML(file_xml_path)
    obj_xml = obj_xmltree.getroot()

    obj_xml.set("specific-use", "sps-1.9")
    obj_xml.set("dtd-version", "1.1")

    xml_sps = SPS_Package(obj_xmltree)

    # Convert the AM body to SPS
    xml_sps.transform_body()

    # Convert pub-date to SPS 1.9
    xml_sps.transform_pubdate()

    # Build the scielo-id in the converted XML
    xml_sps.create_scielo_id()

    # Remove the <counts> tag from the XML
    xml_sps.transform_article_meta_count()

    languages = "-".join(xml_sps.languages)
    _, fname = os.path.split(file_xml_path)
    fname, fext = fname.rsplit(".", 1)

    new_file_xml_path = os.path.join(
        config.get("CONVERSION_PATH"), "%s.%s.%s" % (fname, languages, fext)
    )

    xml.objXML2file(new_file_xml_path, xml_sps.xmltree, pretty=True)
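# Naming example for the conversion output above: an input file named
# "S0102-86502017000300175.xml" whose SPS package declares the languages
# ["pt", "en"] is written to CONVERSION_PATH as
# "S0102-86502017000300175.pt-en.xml".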
def run(path: str, output_file: str):
    """Run the isis2json utility as a subprocess to extract data from an
    ISIS database in MST format. The extraction result is written as JSON
    to the file given by the ``output_file`` parameter.
    """
    command = "java -cp %s org.python.util.jython %s -t 3 -p 'v' -o %s %s" % (
        config.get("CLASSPATH"),
        ISIS2JSON_PATH,
        output_file,
        path,
    )

    try:
        logger.debug("Extracting database file: %s" % path)
        subprocess.run(
            shlex.split(command),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
        logger.debug("Writing extracted result as JSON file in: %s" % output_file)
    except Exception as exc:
        raise exceptions.ExtractError(str(exc)) from None
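# For illustration only (the CLASSPATH and ISIS2JSON_PATH values below are
# hypothetical), run("/bases/title/title.mst", "/tmp/title.json") builds
# roughly this command line:
#
#     java -cp /app/jars/* org.python.util.jython /app/isis2json.py \
#         -t 3 -p 'v' -o /tmp/title.json /bases/title/title.mst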
def setUp(self):
    self.data = dict(
        [
            ("eissn", "1234-5678"),
            ("pissn", "0001-3714"),
            ("issn", "0987-0987"),
            ("year", "1998"),
            ("volume", "29"),
            ("number", "3"),
            ("supplement", None),
        ]
    )
    self.aop_data = dict(
        [("eissn", "0001-3714"), ("issn", "0001-3714"), ("year", "2019")]
    )
    self.bundle_id = "0001-3714-1998-v29-n3"
    self.issn = "0987-0987"

    if not os.path.isdir(config.get("ERRORS_PATH")):
        os.makedirs(config.get("ERRORS_PATH"))
def download_asset(old_path, new_fname, dest_path):
    """Returns msg, if error"""
    location = urljoin(config.get("STATIC_URL_FILE"), old_path)
    try:
        request_file = request.get(location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as e:
        try:
            msg = str(e)
        except TypeError:
            msg = "Unknown error"
        logger.error(e)
        return msg
    else:
        filename_m, ext_m = files.extract_filename_ext_by_path(old_path)
        files.write_file_binary(
            os.path.join(dest_path, "%s%s" % (new_fname, ext_m)),
            request_file.content,
        )
def pack_article_ALLxml():
    """Build the SPS packages from a list of valid XML files.

    Args:
        No arguments.

    Returns:
        None. Persists the XML in ``package_path``.

    Example:
        pack_article_ALLxml()

    Raises:
        Does not raise exceptions.
    """
    xmls = [
        os.path.join(config.get("VALID_XML_PATH"), xml)
        for xml in files.xml_files_list(config.get("VALID_XML_PATH"))
    ]

    jobs = [{"file_xml_path": xml} for xml in xmls]

    with tqdm(total=len(xmls), initial=0) as pbar:

        def update_bar(pbar=pbar):
            pbar.update(1)

        def log_exceptions(exception, job, logger=logger):
            logger.error(
                "Could not pack file '%s'. The exception '%s' was raised.",
                job["file_xml_path"],
                exception,
            )

        DoJobsConcurrently(
            pack_article_xml,
            jobs=jobs,
            max_workers=int(config.get("THREADPOOL_MAX_WORKERS")),
            exception_callback=log_exceptions,
            update_bar=update_bar,
        )
def migrate_logos_to_website(session, website_img_dir):
    """Read all Journals from Website MongoDB collection and, for each one,
    get journal logo from current website, save to website media directory,
    create an image record in SQLite Image Table and update journal document
    with logo URL.

    session: SQLite DB session created in `connect_to_databases`
    website_img_dir: Website media directory
    """
    journals = Journal.objects.all()
    if len(journals) == 0:
        raise exceptions.NoJournalInWebsiteError(
            "No journals in Website Database. Migrate Isis Journals first."
        )

    for journal in journals:
        logger.debug("Journal acronym %s", journal.acronym)

        logo_old_filename = "glogo.gif"
        logo_url = "{}img/revistas/{}/glogo.gif".format(
            config.get("STATIC_URL_FILE"), journal.acronym
        )
        try:
            logger.debug("Getting Journal logo in %s", logo_url)
            request_file = request.get(
                logo_url, timeout=int(config.get("TIMEOUT") or 10)
            )
        except request.HTTPGetError as e:
            try:
                msg = str(e)
            except TypeError:
                msg = "Unknown error"
            logger.error(msg)
        else:
            logo_filename = "_".join([journal.acronym, logo_old_filename])
            dest_path_file = os.path.join(website_img_dir, logo_filename)
            logger.debug("Saving Journal logo in %s", dest_path_file)
            files.write_file_binary(dest_path_file, request_file.content)

            image_path = "images/%s" % logo_filename
            logger.debug("Saving logo as image in %s", image_path)
            session.add(Image(name=logo_filename, path=image_path))
            session.commit()

            journal.logo_url = "/media/%s" % image_path
            logger.debug("Updating Journal with logo_url %s", journal.logo_url)
            journal.save()
def register_documents(session_db, storage, documents_sorter) -> None:
    logger.info("Iniciando Envio dos xmls")
    list_folders = files.list_files(config.get("SPS_PKG_PATH"))
    err_filename = os.path.join(config.get("ERRORS_PATH"), "insert_documents.err")

    for folder in list_folders:
        try:
            document_path = os.path.join(config.get("SPS_PKG_PATH"), folder)
            registration_result = register_document(document_path, session_db, storage)

            if registration_result:
                document_xml, document_id = registration_result
                documents_sorter.insert_document(document_id, document_xml)

        except Exception as ex:
            msg = "Falha ao registrar documento %s: %s" % (document_path, ex)
            logger.error(msg)
            files.write_file(err_filename, msg, "a")