Esempio n. 1
0
def transform_ejp_zip(zip_file, tmp_dir, output_dir):
    """Transform an ejp zip file and write a new zip file output.

    The zip is unpacked into tmp_dir, its file map is passed through
    transform_ejp_files(), and the resulting file map is written by rezip()
    to a zip with the same file name as the input.

    Args:
        zip_file: path to the input zip file.
        tmp_dir: directory the input zip is unzipped into.
        output_dir: directory passed on for transformed files and the new zip.

    Returns:
        The value returned by rezip() for the new zip file.
    """
    # basename handles every separator the platform accepts (e.g. both "/"
    # and "\\" on Windows), unlike splitting on os.sep alone
    zip_file_name = os.path.basename(zip_file)

    # profile the zip contents
    asset_file_name_map = zip_lib.unzip_zip(zip_file, tmp_dir)

    # start logging
    LOGGER.info("%s starting to transform", zip_file_name)

    new_asset_file_name_map = transform_ejp_files(
        asset_file_name_map, output_dir, zip_file_name
    )

    # write new zip file
    return rezip(new_asset_file_name_map, output_dir, zip_file_name)
Esempio n. 2
0
def transform_xml_history_tags(root, soup, zip_file_name):
    """Remove history tags from the XML for particular article types.

    History tags are removed when the article type is correction, editorial
    or retraction, or when it is article-commentary and one of the display
    channel values is "insight" (compared case-insensitively).

    Args:
        root: XML root element, searched with find() / findall().
        soup: parsed article passed to the parser functions.
        zip_file_name: value used as a prefix in the log messages.

    Returns:
        The same root object, modified in place when tags were removed.
    """
    article_type = parser.article_type(soup)
    display_channel_list = parser.display_channel(soup)
    LOGGER.info(
        "%s article_type %s, display_channel %s",
        zip_file_name,
        article_type,
        display_channel_list,
    )

    # decide whether this article has its history stripped; the display
    # channel values are only inspected for article-commentary
    if article_type in ["correction", "editorial", "retraction"]:
        strip_history = True
    elif article_type == "article-commentary":
        channels = [value.lower() for value in display_channel_list if value]
        strip_history = "insight" in channels
    else:
        strip_history = False

    if not strip_history:
        return root

    LOGGER.info("%s transforming xml history tags", zip_file_name)
    # remove each history tag from its article-meta parent
    article_meta = root.find("./front/article-meta")
    for history_tag in root.findall("./front/article-meta/history"):
        article_meta.remove(history_tag)
    return root
Esempio n. 3
0
def check_multi_page_figure_pdf(figures, identifier):
    """Log a warning for each figure PDF considered to be multiple page.

    Only figures whose "pages" value is greater than 1 are inspected. When
    pdfimages is available it is used to find which pages hold images, and the
    PDF is flagged only if an image is found on a page after the first. When
    pdfimages is not available, or checking with it raises an exception, the
    PDF is flagged as multiple page.

    Args:
        figures: iterable of dict-like figure data, read via the keys
            "pages", "file_name" and "file_path".
        identifier: value used as a prefix in the log messages.
    """
    pdfimages_available = pdf_utils.pdfimages_exists()
    for pdf in figures:
        pages = pdf.get("pages")
        if not (pages and pages > 1):
            continue
        file_name = pdf.get("file_name")
        is_multi_page = False
        if pdfimages_available:
            LOGGER.info(
                "%s using pdfimages to check PDF figure file: %s",
                identifier,
                file_name,
            )
            try:
                image_pages = pdf_utils.pdf_image_pages(pdf.get("file_path"))
                LOGGER.info(
                    "%s pdfimages found images on pages %s in PDF figure file: %s",
                    identifier,
                    image_pages,
                    file_name,
                )
                is_multi_page = any(page > 1 for page in image_pages)
            except Exception:
                # catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit still propagate
                LOGGER.exception(
                    "%s exception using pdfimages to check PDF figure file: %s",
                    identifier,
                    file_name,
                )
                # consider it multi page in the case pdfimages raises an exception
                is_multi_page = True
        else:
            # without pdfimages the page count alone decides
            is_multi_page = True
        if is_multi_page:
            LOGGER.warning(
                "%s multiple page PDF figure file: %s",
                identifier,
                file_name,
            )
Esempio n. 4
0
def code_file_transformations(root, asset_file_name_map, output_dir, identifier):
    """Zip the code files found in root and collect the file transformations.

    For each entry returned by code_file_list(root), the file is looked up in
    asset_file_name_map, wrapped in an ArticleZipFile, and passed to
    zip_code_file() with output_dir.

    Args:
        root: XML root passed to code_file_list().
        asset_file_name_map: file name map searched by find_in_file_name_map().
        output_dir: directory passed to zip_code_file().
        identifier: value used as a prefix in the log messages.

    Returns:
        A list of (from_file, to_file) tuples, one per code file.
    """
    transformations = []
    for code_file in code_file_list(root):
        upload_name = code_file.get("upload_file_nm")
        LOGGER.info("%s code_file_name: %s", identifier, upload_name)

        # locate the original file name and path for the uploaded file
        asset_name, asset_path = find_in_file_name_map(
            upload_name, asset_file_name_map
        )
        source_file = ArticleZipFile(upload_name, asset_name, asset_path)
        LOGGER.info("%s from_file: %s", identifier, source_file)

        zipped_file = zip_code_file(source_file, output_dir)
        LOGGER.info("%s to_file: %s", identifier, zipped_file)

        # record the from file to file transformation
        transformations.append((source_file, zipped_file))
    return transformations
Esempio n. 5
0
 def test_configure_logging(self):
     # after configuring logging to self.log_file, one info message should
     # produce exactly one formatted line in that file
     configure_logging(self.log_file)
     LOGGER.info("test_configure_logging")
     with open(self.log_file, "r") as log_handle:
         log_content = log_handle.read()
     self.assertEqual(
         log_content,
         "INFO elifecleaner:test_init:test_configure_logging: test_configure_logging\n",
     )
Esempio n. 6
0
def rezip(asset_file_name_map, output_dir, zip_file_name):
    """Create a new zip file in output_dir from asset_file_name_map.

    Args:
        asset_file_name_map: file map passed to create_zip_from_file_map().
        output_dir: directory the new zip file is written into.
        zip_file_name: file name of the new zip, also used as the log prefix.

    Returns:
        The path of the new zip file, output_dir joined with zip_file_name.
    """
    zip_output_path = os.path.join(output_dir, zip_file_name)
    LOGGER.info("%s writing new zip file %s", zip_file_name, zip_output_path)
    create_zip_from_file_map(zip_output_path, asset_file_name_map)
    return zip_output_path
Esempio n. 7
0
def write_xml_file(root, xml_asset_path, identifier):
    """Serialise root to a string and write it to xml_asset_path.

    Args:
        root: XML element passed to xml_element_to_string().
        xml_asset_path: path of the file to write, overwritten if it exists.
        identifier: value used as a prefix in the log message.
    """
    serialised_xml = xml_element_to_string(root)
    LOGGER.info("%s writing xml to file %s", identifier, xml_asset_path)
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that matches what xml_element_to_string() produces
    with open(xml_asset_path, "w") as xml_handle:
        xml_handle.write(serialised_xml)
Esempio n. 8
0
def xml_rewrite_file_tags(xml_asset_path, file_transformations, identifier):
    """Rewrite the file tags of the XML at xml_asset_path and save it back.

    The XML is parsed from xml_asset_path, passed through
    transform_xml_file_tags() with file_transformations, and the result is
    written to the same path.

    Args:
        xml_asset_path: path of the XML file to read and then overwrite.
        file_transformations: data passed to transform_xml_file_tags().
        identifier: value used as a prefix in the log messages.
    """
    parsed_root = parse.parse_article_xml(xml_asset_path)
    LOGGER.info("%s rewriting xml tags", identifier)
    write_xml_file(
        transform_xml_file_tags(parsed_root, file_transformations),
        xml_asset_path,
        identifier,
    )
Esempio n. 9
0
def code_file_zip(file_transformations, output_dir, identifier):
    """Zip the from_file of each file transformation.

    Args:
        file_transformations: iterable of (from_file, to_file) pairs.
        output_dir: directory passed to zip_code_file().
        identifier: value used as a prefix in the log messages.
    """
    for from_file, to_file in file_transformations:
        LOGGER.info(
            "%s zipping from_file: %s, to_file: %s", identifier, from_file, to_file
        )
        # the result is not used here, so the loop variable is not rebound;
        # to_file is only needed for the log message above
        zip_code_file(from_file, output_dir)