Ejemplo n.º 1
0
    def test_extract_filename_ext_by_path(self):
        """Check the name/extension split of a language-suffixed XML path.

        For a path ending in ``.pt.xml`` the expected file name carries
        neither the directories nor the ``.pt`` suffix, and the expected
        extension is ``.xml``.
        """
        xml_path = (
            "xml/conversion/S0044-59672014000400003/S0044-59672014000400003.pt.xml"
        )
        name_and_ext = files.extract_filename_ext_by_path(xml_path)
        filename, extension = name_and_ext

        self.assertEqual(filename, "S0044-59672014000400003")
        self.assertEqual(extension, ".xml")
Ejemplo n.º 2
0
def pack_article_xml(file_xml_path):
    """Build the package for one article XML file.

    Loads the XML, wraps it in ``SPS_Package``, renames the asset
    references, collects the renditions, hands both lists to
    ``packing_assets`` and finally writes ``manifest.json`` and the XML into
    the folder that ``packing_assets`` returns.

    Args:
        file_xml_path: Path of the XML file to pack.
    """
    original_filename, _ext = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    # Both target folders are named after the original XML file name.
    pkg_path = os.path.join(config.get("SPS_PKG_PATH"), original_filename)
    bad_pkg_path = os.path.join(
        config.get("INCOMPLETE_SPS_PKG_PATH"), original_filename)

    # ``set`` drops repeated (old_path, new_name) pairs.
    assets = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        assets + renditions, pkg_path, bad_pkg_path, sps_package.package_name)

    manifest_path = os.path.join(package_path, "manifest.json")
    files.write_file(manifest_path, json.dumps(renditions_metadata))

    xml_path = os.path.join(
        package_path, "%s.xml" % (sps_package.package_name))
    xml.objXML2file(xml_path, obj_xml)
Ejemplo n.º 3
0
def download_asset(old_path, new_fname, dest_path):
    """Download an asset into ``dest_path`` unless it is already there.

    Args:
        old_path: Asset location. Used as is when it starts with ``http``;
            otherwise it is stripped and joined to the ``STATIC_URL_FILE``
            setting.
        new_fname: Base name for the stored file (surrounding whitespace is
            stripped); the extension is taken from ``old_path``.
        dest_path: Destination folder.

    Returns:
        An error message when the URL cannot be built or the request fails,
        ``None`` otherwise.
    """
    location = old_path
    if not old_path.startswith("http"):
        try:
            location = urljoin(config.get("STATIC_URL_FILE"), old_path.strip())
        except ValueError as exc:
            return 'cannot join URL parts "%s" and "%s": %s' % (
                config.get("STATIC_URL_FILE"),
                old_path,
                exc,
            )

    # Skip the download when the file was already fetched earlier.
    _name, extension = files.extract_filename_ext_by_path(old_path)
    target_name = "%s%s" % (new_fname.strip(), extension)
    dest_path_file = os.path.join(dest_path, target_name)
    if os.path.exists(dest_path_file):
        logger.info("Arquivo ja baixado: %s", dest_path_file)
        return None

    try:
        request_file = request.get(
            location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as error:
        try:
            message = str(error)
        except TypeError:
            message = "Unknown error"
        logger.error(error)
        return message
    else:
        files.write_file_binary(dest_path_file, request_file.content)
        def parser_node(self, node):
            """Turn ``node`` into a ``graphic`` and move it under its target.

            The node is renamed to ``graphic``, all of its attributes are
            dropped and its former ``src`` value is stored as the
            ``xlink:href`` attribute. An id is then derived from the file
            name in ``src``; when the tree holds a matching ``table-wrap`` or
            ``fig`` element, a copy of the node is appended to it and the
            original node is removed from its parent.

            Args:
                node: Element to convert; it must carry a ``src`` attribute
                    (``pop`` raises ``KeyError`` otherwise).
            """
            node.tag = "graphic"
            # Work on a copy so ``src`` can be read before the real
            # attributes are wiped below.
            _attrib = deepcopy(node.attrib)
            src = _attrib.pop("src")

            node.attrib.clear()
            node.set("{http://www.w3.org/1999/xlink}href", src)

            filename, __ = files.extract_filename_ext_by_path(src)
            # Any "t" in the file name selects ``table-wrap``; everything
            # else is treated as ``fig``. The id suffix is the text after
            # the last "t" / "f".
            if "t" in filename:
                new_element = "table-wrap"
                id_name = filename.split("t")
            else:
                id_name = filename.split("f")
                new_element = "fig"

            # ``gera_id`` receives the element's first letter plus the
            # suffix; NOTE(review): its exact output format is defined
            # elsewhere -- confirm.
            n_id = gera_id(new_element[0] + id_name[-1])
            if n_id:
                root = node.getroottree()
                # Look for an element of the chosen kind whose ``ref-id``
                # attribute equals the generated id.
                ref_node = root.find("//%s[@ref-id='%s']" %
                                     (new_element, n_id))
                if ref_node is not None:
                    _node = deepcopy(node)
                    ref_node.append(_node)

                    # The copy now lives under ``ref_node``; drop the
                    # original from where it was.
                    parent = node.getparent()
                    parent.remove(node)
Ejemplo n.º 5
0
def pack_article_xml(file_xml_path):
    """Build the package for one article XML file.

    Loads the XML, wraps it in ``SPS_Package``, empties the package folder,
    renames the asset references, hands them to ``packing_assets`` and
    writes the XML into the folder that ``packing_assets`` returns.

    Args:
        file_xml_path: Path of the XML file to pack.
    """
    original_filename, _ext = files.extract_filename_ext_by_path(file_xml_path)
    obj_xml = xml.file2objXML(file_xml_path)
    sps_package = SPS_Package(obj_xml, original_filename)

    # Both target folders are named after the original XML file name.
    pkg_path = os.path.join(config.get("SPS_PKG_PATH"), original_filename)
    bad_pkg_path = os.path.join(
        config.get("INCOMPLETE_SPS_PKG_PATH"), original_filename)

    files.make_empty_dir(pkg_path)

    # ``set`` drops repeated (old_path, new_name) pairs.
    assets = list(set(sps_package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    package_path = packing_assets(
        assets, pkg_path, bad_pkg_path, sps_package.package_name)

    xml_path = os.path.join(
        package_path, "%s.xml" % (sps_package.package_name))
    xml.objXML2file(xml_path, obj_xml)
Ejemplo n.º 6
0
    def tag_and_reftype_and_id_from_filepath(self, file_path, elem_name=None):
        """Derive ``(tag, ref_type, id)`` from the file name of ``file_path``.

        Each candidate ``(prefix, tag)`` pair is tried in order. A pair
        matches when the file name equals the prefix, or when the prefix
        occurs in the file name without a letter right before or right after
        its first occurrence and with some text following it.

        Args:
            file_path: Path whose file name (without extension) is inspected.
            elem_name: Optional tag name. When given, only the pairs of
                ``STARTSWITH_RETURNS_TAG_AND_REFTYPE`` for that tag are
                tried, followed by the pair built from the tag's first
                letter.

        Returns:
            ``(tag, self.ref_type(tag), id)`` for the first matching pair,
            or ``None`` when nothing matches.
        """
        filename, __ = files.extract_filename_ext_by_path(file_path)

        if elem_name:
            candidates = [
                (prefix, tag)
                for prefix, tag in STARTSWITH_RETURNS_TAG_AND_REFTYPE
                if tag == elem_name
            ] + [(elem_name[0], elem_name)]
        else:
            candidates = STARTSWITH_RETURNS_TAG_AND_REFTYPE

        for prefix, tag in candidates:
            if filename == prefix:
                return tag, self.ref_type(tag), filename
            if prefix not in filename:
                continue

            pieces = filename.split(prefix)
            if len(pieces) < 2:
                continue
            before, after = pieces[0], pieces[1]
            # A letter touching the prefix means it is part of a longer word.
            if before[-1:].isalpha() or after[:1].isalpha():
                continue
            if after:
                return tag, self.ref_type(tag), prefix + "".join(pieces[1:])
        return None
Ejemplo n.º 7
0
 def get_renditions_metadata(self):
     """Return the PDF renditions of the article and their metadata.

     Returns:
         A pair ``(renditions, metadata)``: ``metadata`` is the ``"pdf"``
         entry of the article's fulltexts (``{}`` when the article is not
         found or has no such entry) and ``renditions`` holds one
         ``(url, filename)`` tuple per metadata value.
     """
     obj_article = article.get_article(self.publisher_id)
     if not obj_article:
         return [], {}

     metadata = obj_article.fulltexts().get("pdf", {})
     renditions = [
         (url, files.extract_filename_ext_by_path(url)[0])
         for url in metadata.values()
     ]
     return renditions, metadata
 def replace_assets_names(self):
     """Rename asset references in place and report what was renamed.

     For every element of ``self.elements_which_has_xlink_href`` whose
     ``xlink:href`` is accepted by ``is_asset_href``, the attribute is
     rewritten to the name given by ``self.asset_name`` plus the original
     extension.

     Returns:
         A list of ``(old_path, new_name_without_extension)`` tuples, one
         per rewritten reference.
     """
     href = "{http://www.w3.org/1999/xlink}href"
     renamed = []
     for element in self.elements_which_has_xlink_href:
         current = element.get(href)
         if not is_asset_href(current):
             continue
         base_name, extension = files.extract_filename_ext_by_path(current)
         new_name = self.asset_name(base_name)
         element.set(href, "%s%s" % (new_name, extension))
         renamed.append((current, new_name))
     return renamed
Ejemplo n.º 9
0
 def get_renditions_metadata(self):
     """Collect renditions from the ``self-uri`` elements of the article.

     Returns:
         A pair ``(renditions, metadata)``: ``renditions`` holds one
         ``(url, filename)`` tuple per ``self-uri`` element and
         ``metadata`` maps each element's ``xml:lang`` to its
         ``xlink:href``. Both are empty when ``self.article_meta`` is
         ``None``.
     """
     renditions, metadata = [], {}
     if self.article_meta is None:
         return renditions, metadata

     for self_uri in self.article_meta.findall(".//self-uri"):
         url = self_uri.get("{http://www.w3.org/1999/xlink}href")
         lang = self_uri.get("{http://www.w3.org/XML/1998/namespace}lang")
         filename, _ext = files.extract_filename_ext_by_path(url)
         renditions.append((url, filename))
         metadata[lang] = url
     return renditions, metadata
    def replace_assets_names(self):
        """Rename asset references in place and list every replacement.

        Elements of ``self.elements_which_has_xlink_href`` whose
        ``xlink:href`` is accepted by ``is_asset_href`` get the attribute
        rewritten to the name given by ``self.asset_name`` plus the original
        extension. The PDF renditions of the article, when it is found, are
        added to the result as well; they are not written to any element.

        Returns:
            A list of ``(old_path_or_url, new_name_without_extension)``
            tuples.
        """
        href = "{http://www.w3.org/1999/xlink}href"
        renamed = []
        for element in self.elements_which_has_xlink_href:
            current = element.get(href)
            if not is_asset_href(current):
                continue
            base_name, extension = files.extract_filename_ext_by_path(current)
            new_name = self.asset_name(base_name)
            element.set(href, "%s%s" % (new_name, extension))
            renamed.append((current, new_name))

        # PDF renditions
        obj_article = article.get_article(self.publisher_id)
        if not obj_article:
            return renamed

        pdfs = obj_article.fulltexts().get("pdf", {})
        for pdf_url in pdfs.values():
            base_name, _ext = files.extract_filename_ext_by_path(pdf_url)
            renamed.append((pdf_url, self.asset_name(base_name)))
        return renamed
Ejemplo n.º 11
0
def get_asset(old_path, new_fname, dest_path):
    """Copy a digital asset from the source folders into ``dest_path``.

    The asset is looked up under the ``SOURCE_IMG_FILE`` and
    ``SOURCE_PDF_FILE`` settings (in that order) and stored as
    ``new_fname`` plus the extension of ``old_path``. Nothing is done when
    that destination file already exists.

    Args:
        old_path: Path or ``http`` URL of the asset; for a URL only its path
            component is used for the lookup.
        new_fname: New base name for the asset.
        dest_path: Destination folder.

    Returns:
        None. The asset is written into ``dest_path``.

    Raises:
        AssetNotFoundError: When no candidate path resolves to a file or the
            file cannot be read.
    """
    if old_path.startswith("http"):
        asset_path = urlparse(old_path).path
    else:
        asset_path = old_path

    asset_path = asset_path.strip('/')

    # Skip the copy when the file was already stored earlier.
    filename_m, ext_m = files.extract_filename_ext_by_path(old_path)
    dest_path_file = os.path.join(dest_path,
                                  "%s%s" % (new_fname.strip(), ext_m))
    if os.path.exists(dest_path_file):
        logger.debug("Arquivo já armazenado na pasta de destino: %s",
                     dest_path_file)
        return

    paths = [
        os.path.join(config.get('SOURCE_IMG_FILE'), asset_path),
        os.path.join(config.get('SOURCE_PDF_FILE'), asset_path),
    ]
    if (filename_m, ext_m) == ("seta", ".gif"):
        # "seta.gif" is looked up first in the fixed "img" folder.
        seta_path = os.path.join(config.get('SOURCE_IMG_FILE'), "img",
                                 "seta.gif")
        paths.insert(0, seta_path)

    try:
        source_path = None
        for candidate in paths:
            source_path = find_file(candidate)
            if source_path:
                break
        # State the "nothing found" case explicitly instead of relying on
        # the reader failing when handed a falsy path.
        if not source_path:
            raise FileNotFoundError(asset_path)
        content = files.read_file_binary(source_path)
    except (TypeError, FileNotFoundError, IOError) as exc:
        raise AssetNotFoundError(f"Not found {old_path}") from exc
    else:
        files.write_file_binary(dest_path_file, content)
Ejemplo n.º 12
0
def validate_article_ALLxml(move_to_processed_source=False,
                            move_to_valid_xml=False):
    """Validate every XML file found under the ``CONVERSION_PATH`` setting.

    Each file is validated with ``validate_article_xml``. Its errors are
    merged into an overall tally and, when ``XML_ERRORS_PATH`` is set,
    passed to ``manage_error_file``. Files without errors are copied or
    moved to ``VALID_XML_PATH`` when that setting is present. At the end
    the tally is logged, highest count first.

    Args:
        move_to_processed_source: When true, the source XML of each valid
            file is moved from ``SOURCE_PATH`` to ``PROCESSED_SOURCE_PATH``.
        move_to_valid_xml: When true, valid files are moved to
            ``VALID_XML_PATH`` instead of copied.

    Raises:
        Exception: Whatever the per-file processing raises is logged and
            re-raised.
    """
    logger.info("Iniciando Validação dos xmls")
    conversion_path = config.get("CONVERSION_PATH")
    list_files_xmls = files.xml_files_list(conversion_path)

    success_path = config.get("VALID_XML_PATH")
    errors_path = config.get("XML_ERRORS_PATH")
    if move_to_valid_xml:
        transfer = shutil.move
    else:
        transfer = shutil.copyfile

    result = {}
    for file_xml in tqdm(list_files_xmls):
        filename, _ = files.extract_filename_ext_by_path(file_xml)
        converted_file = os.path.join(conversion_path, file_xml)

        try:
            errors = validate_article_xml(converted_file, False)

            for error_key, error_value in errors.items():
                dicts.merge(result, error_key, error_value)

            if errors_path:
                error_file = os.path.join(errors_path, "%s.err" % filename)
                manage_error_file(errors, error_file, converted_file)

            # Only error-free files are promoted below.
            if errors:
                continue

            if success_path:
                transfer(converted_file, os.path.join(success_path, file_xml))

            if move_to_processed_source:
                files.move_xml_to(
                    "%s.xml" % filename,
                    config.get("SOURCE_PATH"),
                    config.get("PROCESSED_SOURCE_PATH"),
                )
        except Exception as ex:
            logger.exception(ex)
            raise

    ranking = list(result.items())
    ranking.sort(key=lambda item: item[1]["count"], reverse=True)
    for error_key, error_info in ranking:
        logger.error("%s - %s", error_key, error_info["count"])
Ejemplo n.º 13
0
def download_asset(old_path, new_fname, dest_path):
    """Download an asset and store it in ``dest_path``.

    Args:
        old_path: Asset path, joined to the ``STATIC_URL_FILE`` setting to
            build the download URL; it also provides the file extension.
        new_fname: Base name for the stored file.
        dest_path: Destination folder.

    Returns:
        The error message when the request fails, ``None`` otherwise.
    """
    location = urljoin(config.get("STATIC_URL_FILE"), old_path)
    try:
        response = request.get(
            location, timeout=int(config.get("TIMEOUT") or 10))
    except request.HTTPGetError as error:
        try:
            message = str(error)
        except TypeError:
            message = "Unknown error"
        logger.error(error)
        return message

    _name, extension = files.extract_filename_ext_by_path(old_path)
    target = os.path.join(dest_path, "%s%s" % (new_fname, extension))
    files.write_file_binary(target, response.content)
    return None
Ejemplo n.º 14
0
 def tag_and_reftype_and_id_from_filepath(self, file_path, elem_name=None):
     """Derive ``(tag, ref_type, id)`` from the file name of ``file_path``.

     Each candidate ``(clue, tag)`` pair is tried in order. A pair matches
     when the file name equals the clue, or when the clue occurs in the
     file name without a letter right before or right after its first
     occurrence and with some text following it.

     Args:
         file_path: Path whose file name (without extension) is inspected.
         elem_name: Optional tag name. When given, the pairs registered for
             it in ``self.rules.sorted_by_tag`` are tried, followed by the
             pair built from the tag's first letter; otherwise
             ``self.rules.sorted_rules`` is used.

     Returns:
         ``(tag, self.ref_type(tag), id)`` for the first matching pair, or
         ``None`` when nothing matches.
     """
     filename, __ = files.extract_filename_ext_by_path(file_path)
     if elem_name:
         # Copy before appending: ``get`` hands back the list stored in the
         # rules, and appending to it directly would make that shared list
         # grow by one entry on every call.
         clue_and_tag_items = list(
             self.rules.sorted_by_tag.get(elem_name, []))
         clue_and_tag_items.append((elem_name[0], elem_name))
     else:
         clue_and_tag_items = self.rules.sorted_rules
     for clue, tag in clue_and_tag_items:
         if clue == filename:
             return tag, self.ref_type(tag), filename
         if clue in filename:
             parts = filename.split(clue)
             if len(parts) < 2:
                 continue
             # A letter touching the clue means it is part of a longer word.
             if parts[0] and parts[0][-1].isalpha():
                 continue
             if parts[1] and parts[1][0].isalpha():
                 continue
             if parts[1]:
                 return tag, self.ref_type(tag), clue + "".join(parts[1:])
     return None
Ejemplo n.º 15
0
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML file and its digital assets.

    Args:
        file_xml_path: Path to the XML file.
        poison_pill: Object whose ``poisoned`` attribute, when truthy, makes
            the function return immediately without doing any work.

    Returns:
        None. The XML and a ``manifest.json`` holding the renditions
        metadata are written into the folder returned by
        ``packing_assets``.

    Example:
        packing.pack_article_xml(
                os.path.join("S0044-59672003000300002.xml")
            )

    Raises:
        Nothing is raised explicitly here; errors coming from the helpers
        it calls are not caught.
    """
    if poison_pill.poisoned:
        return

    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix("article_id_which_id_type_is_other",
                    sps_package.scielo_pid_v2
                    and sps_package.scielo_pid_v2[-5:],
                    silently=True)
    # Guard the slice the same way as above: a missing PID must not break
    # the ISSN lookup with a TypeError.
    pid_v2 = sps_package.scielo_pid_v2
    new_issns = ISSNs and pid_v2 and ISSNs.get(pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    # Both target folders are named after the original XML file name.
    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH,
                                       original_filename)

    # ``set`` drops repeated (old_path, new_name) pairs.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug("%s possui %s ativos digitais", file_xml_path,
                 len(asset_replacements))

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(os.path.join(package_path, "manifest.json"),
                     json.dumps(renditions_metadata))
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)),
        obj_xml)
Ejemplo n.º 16
0
 def get_renditions_metadata(self):
     """Return the renditions listed in ``self.fixed_renditions_metadata``.

     Returns:
         A pair ``(renditions, metadata)``: ``renditions`` holds one
         ``(url, filename)`` tuple per metadata value and ``metadata`` is
         ``self.fixed_renditions_metadata`` itself.
     """
     renditions = [
         (url, files.extract_filename_ext_by_path(url)[0])
         for url in self.fixed_renditions_metadata.values()
     ]
     return renditions, self.fixed_renditions_metadata