def test_extract_filename_ext_by_path(self):
    """A versioned XML path splits into its base filename and extension."""
    name, ext = files.extract_filename_ext_by_path(
        "xml/conversion/S0044-59672014000400003/S0044-59672014000400003.pt.xml"
    )
    self.assertEqual(name, "S0044-59672014000400003")
    self.assertEqual(ext, ".xml")
def pack_article_xml(file_xml_path):
    """Build an SPS package (XML, digital assets, renditions manifest) for one XML."""
    base_name, _ = files.extract_filename_ext_by_path(file_xml_path)
    tree = xml.file2objXML(file_xml_path)
    package = SPS_Package(tree, base_name)

    # Destination folders: complete packages vs. packages missing assets.
    target_dir = os.path.join(config.get("SPS_PKG_PATH"), base_name)
    incomplete_dir = os.path.join(config.get("INCOMPLETE_SPS_PKG_PATH"), base_name)

    assets = list(set(package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    renditions, renditions_metadata = package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        assets + renditions, target_dir, incomplete_dir, package.package_name
    )
    files.write_file(
        os.path.join(package_path, "manifest.json"), json.dumps(renditions_metadata)
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (package.package_name)), tree
    )
def download_asset(old_path, new_fname, dest_path):
    """Download one digital asset into ``dest_path``.

    Returns an error-message string on failure; returns ``None`` on
    success or when the file already exists locally.
    """
    if old_path.startswith("http"):
        location = old_path
    else:
        try:
            location = urljoin(config.get("STATIC_URL_FILE"), old_path.strip())
        except ValueError as exc:
            return 'cannot join URL parts "%s" and "%s": %s' % (
                config.get("STATIC_URL_FILE"),
                old_path,
                exc,
            )

    # Skip the download when the asset was already fetched in a previous run.
    _, extension = files.extract_filename_ext_by_path(old_path)
    target = os.path.join(dest_path, "%s%s" % (new_fname.strip(), extension))
    if os.path.exists(target):
        logger.info("Arquivo ja baixado: %s", target)
        return

    try:
        response = request.get(location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as e:
        try:
            msg = str(e)
        except TypeError:
            msg = "Unknown error"
        logger.error(e)
        return msg
    else:
        files.write_file_binary(target, response.content)
def parser_node(self, node):
    """Convert the node into a <graphic> and, when a table-wrap/fig with a
    matching ref-id exists, move a copy of it under that element."""
    node.tag = "graphic"
    attributes = deepcopy(node.attrib)
    source = attributes.pop("src")
    node.attrib.clear()
    node.set("{http://www.w3.org/1999/xlink}href", source)

    stem, __ = files.extract_filename_ext_by_path(source)
    # A "t" anywhere in the stem marks a table asset; otherwise treat as figure.
    if "t" in stem:
        target_tag, parts = "table-wrap", stem.split("t")
    else:
        target_tag, parts = "fig", stem.split("f")

    new_id = gera_id(target_tag[0] + parts[-1])
    if not new_id:
        return
    tree = node.getroottree()
    owner = tree.find("//%s[@ref-id='%s']" % (target_tag, new_id))
    if owner is not None:
        owner.append(deepcopy(node))
        node.getparent().remove(node)
def pack_article_xml(file_xml_path):
    """Pack one converted XML and its digital assets into an SPS package."""
    base_name, _ = files.extract_filename_ext_by_path(file_xml_path)
    tree = xml.file2objXML(file_xml_path)
    package = SPS_Package(tree, base_name)

    # Destination folders: complete packages vs. packages missing assets.
    dest_dir = os.path.join(config.get("SPS_PKG_PATH"), base_name)
    incomplete_dir = os.path.join(config.get("INCOMPLETE_SPS_PKG_PATH"), base_name)
    files.make_empty_dir(dest_dir)

    assets = list(set(package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    package_path = packing_assets(
        assets, dest_dir, incomplete_dir, package.package_name
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (package.package_name)), tree
    )
def tag_and_reftype_and_id_from_filepath(self, file_path, elem_name=None):
    """Infer ``(tag, ref_type, id)`` for *file_path* from filename prefixes.

    When *elem_name* is given, only rules for that tag are considered,
    plus a fallback prefix built from its first letter.  Returns ``None``
    (implicitly) when no rule matches.
    """
    stem, __ = files.extract_filename_ext_by_path(file_path)
    if elem_name:
        candidates = [
            (prefix, tag)
            for prefix, tag in STARTSWITH_RETURNS_TAG_AND_REFTYPE
            if tag == elem_name
        ]
        candidates.append((elem_name[0], elem_name))
    else:
        candidates = STARTSWITH_RETURNS_TAG_AND_REFTYPE
    for prefix, tag in candidates:
        if prefix == stem:
            return tag, self.ref_type(tag), stem
        if prefix not in stem:
            continue
        head, *tail = stem.split(prefix)
        if not tail:
            continue
        # Reject matches where the prefix sits inside a larger word.
        if head and head[-1].isalpha():
            continue
        if tail[0] and tail[0][0].isalpha():
            continue
        if tail[0]:
            return tag, self.ref_type(tag), prefix + "".join(tail)
def get_renditions_metadata(self):
    """Collect ``(url, filename)`` pairs for the article's PDF renditions.

    Returns the pair list together with the raw lang->url mapping."""
    renditions, metadata = [], {}
    obj_article = article.get_article(self.publisher_id)
    if obj_article:
        metadata = obj_article.fulltexts().get("pdf", {})
    for url in metadata.values():
        name, _ = files.extract_filename_ext_by_path(url)
        renditions.append((url, name))
    return renditions, metadata
def replace_assets_names(self):
    """Rename asset hrefs in the XML to package-local names.

    Returns a list of ``(original_href, new_basename)`` pairs."""
    href_attr = "{http://www.w3.org/1999/xlink}href"
    replacements = []
    for element in self.elements_which_has_xlink_href:
        original = element.get(href_attr)
        if not is_asset_href(original):
            continue
        basename, extension = files.extract_filename_ext_by_path(original)
        renamed = self.asset_name(basename)
        element.set(href_attr, "%s%s" % (renamed, extension))
        replacements.append((original, renamed))
    return replacements
def get_renditions_metadata(self):
    """Read ``<self-uri>`` rendition links from article-meta.

    Returns ``(url, filename)`` pairs and a lang->url mapping."""
    renditions, metadata = [], {}
    if self.article_meta is not None:
        for self_uri in self.article_meta.findall(".//self-uri"):
            url = self_uri.get("{http://www.w3.org/1999/xlink}href")
            lang = self_uri.get("{http://www.w3.org/XML/1998/namespace}lang")
            name, _ = files.extract_filename_ext_by_path(url)
            renditions.append((url, name))
            metadata[lang] = url
    return renditions, metadata
def replace_assets_names(self):
    """Rename asset hrefs in the XML and register the PDF renditions.

    Returns a list of ``(original_path_or_url, new_basename)`` pairs."""
    href_attr = "{http://www.w3.org/1999/xlink}href"
    replacements = []
    for element in self.elements_which_has_xlink_href:
        original = element.get(href_attr)
        if not is_asset_href(original):
            continue
        basename, extension = files.extract_filename_ext_by_path(original)
        renamed = self.asset_name(basename)
        element.set(href_attr, "%s%s" % (renamed, extension))
        replacements.append((original, renamed))

    # RENDITION PDF: also schedule the article's PDF fulltexts for packing.
    obj_article = article.get_article(self.publisher_id)
    if obj_article:
        for pdf_url in obj_article.fulltexts().get("pdf", {}).values():
            basename, _ = files.extract_filename_ext_by_path(pdf_url)
            replacements.append((pdf_url, self.asset_name(basename)))
    return replacements
def get_asset(old_path, new_fname, dest_path):
    """Fetch a digital asset from the local source folders and persist it
    into ``dest_path`` under ``new_fname`` (original extension kept).

    Args:
        old_path: original asset path or URL.
        new_fname: new base name for the asset.
        dest_path: destination folder.

    Returns:
        None. Persists the asset into ``dest_path``.

    Raises:
        AssetNotFoundError: when the asset exists in none of the
            configured source folders.
    """
    if old_path.startswith("http"):
        relative = urlparse(old_path).path
    else:
        relative = old_path
    relative = relative.strip('/')

    # Skip work when this asset was already materialised in dest_path.
    base, extension = files.extract_filename_ext_by_path(old_path)
    target = os.path.join(dest_path, "%s%s" % (new_fname.strip(), extension))
    if os.path.exists(target):
        logger.debug("Arquivo já armazenado na pasta de destino: %s", target)
        return

    candidates = [
        os.path.join(config.get('SOURCE_IMG_FILE'), relative),
        os.path.join(config.get('SOURCE_PDF_FILE'), relative),
    ]
    # "seta.gif" lives at a fixed well-known location; try that first.
    if (base, extension) == ("seta", ".gif"):
        candidates.insert(
            0, os.path.join(config.get('SOURCE_IMG_FILE'), "img", "seta.gif")
        )

    try:
        found = None
        for candidate in candidates:
            found = find_file(candidate)
            if found:
                break
        # ``found`` is None when nothing matched; read_file_binary then
        # raises TypeError, which we translate into AssetNotFoundError.
        content = files.read_file_binary(found)
    except (TypeError, FileNotFoundError, IOError):
        raise AssetNotFoundError(f"Not found {old_path}")
    else:
        files.write_file_binary(target, content)
def validate_article_ALLxml(move_to_processed_source=False, move_to_valid_xml=False):
    """Validate every converted XML, writing error reports and moving or
    copying valid files according to the flags."""
    logger.info("Iniciando Validação dos xmls")
    xml_names = files.xml_files_list(config.get("CONVERSION_PATH"))
    success_path = config.get("VALID_XML_PATH")
    errors_path = config.get("XML_ERRORS_PATH")
    # Move valid XMLs away, or just copy them, depending on the flag.
    transfer = shutil.move if move_to_valid_xml else shutil.copyfile

    error_stats = {}
    for xml_name in tqdm(xml_names):
        stem, _ = files.extract_filename_ext_by_path(xml_name)
        converted_file = os.path.join(config.get("CONVERSION_PATH"), xml_name)
        try:
            errors = validate_article_xml(converted_file, False)

            for kind, details in errors.items():
                dicts.merge(error_stats, kind, details)

            if errors_path:
                manage_error_file(
                    errors,
                    os.path.join(errors_path, "%s.err" % stem),
                    converted_file,
                )

            if not errors:
                if success_path:
                    transfer(converted_file, os.path.join(success_path, xml_name))
                if move_to_processed_source:
                    files.move_xml_to(
                        "%s.xml" % stem,
                        config.get("SOURCE_PATH"),
                        config.get("PROCESSED_SOURCE_PATH"),
                    )
        except Exception as ex:
            logger.exception(ex)
            raise

    # Report error kinds ordered by frequency, most common first.
    ranked = sorted(
        error_stats.items(), key=lambda item: item[1]["count"], reverse=True
    )
    for kind, details in ranked:
        logger.error("%s - %s", kind, details["count"])
def download_asset(old_path, new_fname, dest_path):
    """Download one asset into ``dest_path``.

    Returns an error-message string on failure, ``None`` on success."""
    location = urljoin(config.get("STATIC_URL_FILE"), old_path)
    try:
        response = request.get(location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as e:
        try:
            msg = str(e)
        except TypeError:
            msg = "Unknown error"
        logger.error(e)
        return msg
    else:
        # Keep the original extension; the base name is replaced.
        _, extension = files.extract_filename_ext_by_path(old_path)
        files.write_file_binary(
            os.path.join(dest_path, "%s%s" % (new_fname, extension)),
            response.content,
        )
def tag_and_reftype_and_id_from_filepath(self, file_path, elem_name=None):
    """Infer ``(tag, ref_type, id)`` for *file_path* from clue/tag rules.

    Args:
        file_path: path whose basename encodes the element kind and id.
        elem_name: when given, restrict matching to the rules for that
            tag, plus a fallback clue built from its first letter.

    Returns:
        ``(tag, ref_type, id)`` on a match, otherwise ``None``
        (implicitly, when no rule matches).
    """
    filename, __ = files.extract_filename_ext_by_path(file_path)
    if elem_name:
        # BUGFIX: copy before appending. ``sorted_by_tag`` hands back its
        # own stored list, and mutating it in place made every later call
        # for the same tag see an extra duplicated fallback entry.
        clue_and_tag_items = list(self.rules.sorted_by_tag.get(elem_name, []))
        clue_and_tag_items.append((elem_name[0], elem_name))
    else:
        clue_and_tag_items = self.rules.sorted_rules
    for clue, tag in clue_and_tag_items:
        if clue == filename:
            return tag, self.ref_type(tag), filename
        if clue in filename:
            parts = filename.split(clue)
            if len(parts) < 2:
                continue
            # Reject matches where the clue sits inside a larger word.
            if parts[0] and parts[0][-1].isalpha():
                continue
            if parts[1] and parts[1][0].isalpha():
                continue
            if parts[1]:
                return tag, self.ref_type(tag), clue + "".join(parts[1:])
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML and its digital assets into an SPS package.

    Args:
        file_xml_path: path to the XML file.
        poison_pill: cooperative-cancellation token; when poisoned the
            function returns immediately.
            NOTE(review): the default is a shared mutable instance
            evaluated once at import time — presumably intentional so a
            single pill can stop all default callers; confirm before
            changing.

    Returns:
        None. Persists the XML into ``package_path``.

    Example:
        packing.pack_article_xml(
            os.path.join("S0044-59672003000300002.xml")
        )

    Raises:
        Does not raise.
    """
    if poison_pill.poisoned:
        return
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    pid_v2 = sps_package.scielo_pid_v2
    # Derive the "other" article-id from the last five digits of the PID.
    sps_package.fix(
        "article_id_which_id_type_is_other",
        pid_v2 and pid_v2[-5:],
        silently=True,
    )
    # BUGFIX: guard against a missing PID before slicing it — the fix()
    # call above already tolerated ``None`` but the ISSN lookup did not
    # (``None[1:10]`` raises TypeError).
    new_issns = ISSNs and pid_v2 and ISSNs.get(pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    # Destination folders: complete packages vs. packages missing assets.
    pkg_path = os.path.join(config.get("SPS_PKG_PATH"), original_filename)
    incomplete_pkg_path = os.path.join(
        config.get("INCOMPLETE_SPS_PKG_PATH"), original_filename
    )

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug(
        "%s possui %s ativos digitais", file_xml_path, len(asset_replacements)
    )

    source_json = get_source_json(pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        pid_v2,
    )
    files.write_file(
        os.path.join(package_path, "manifest.json"), json.dumps(renditions_metadata)
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )
def get_renditions_metadata(self):
    """Return ``(url, filename)`` pairs derived from the fixed renditions
    map, along with the map itself."""
    pairs = [
        (url, files.extract_filename_ext_by_path(url)[0])
        for url in self.fixed_renditions_metadata.values()
    ]
    return pairs, self.fixed_renditions_metadata