Esempio n. 1
0
def check_missing_files(files, asset_file_name_map, identifier):
    "log a warning for every file that find_missing_files() reports"
    warning_format = "%s does not contain a file in the manifest: %s"
    for file_name in find_missing_files(files, asset_file_name_map):
        LOGGER.warning(warning_format, identifier, file_name)
Esempio n. 2
0
def check_missing_files_by_name(files, identifier):
    "check for files numbered out of sequence and log a warning when found"
    # one warning per entry returned by find_missing_files_by_name()
    for missing_file in find_missing_files_by_name(files):
        LOGGER.warning(
            "%s has file missing from expected numeric sequence: %s",
            identifier,
            missing_file,
        )
Esempio n. 3
0
def pdf_page_count(file_path):
    "count the pages of a PDF by opening it as an image, None when no path is given"
    if not file_path:
        # nothing to open
        return None
    try:
        with Image(filename=file_path) as pdf_image:
            page_count = len(pdf_image.sequence)
    except WandRuntimeError:
        # log with traceback, then let the caller see the original exception
        LOGGER.exception(
            "WandRuntimeError in pdf_page_count(), "
            "imagemagick may not be installed"
        )
        raise
    except PolicyError:
        LOGGER.exception(
            "PolicyError in pdf_page_count(), "
            "imagemagick policy.xml may not allow reading PDF files"
        )
        raise
    return page_count
Esempio n. 4
0
def parse_article_xml(xml_file):
    """parse an article XML file and return the root element

    The file is read as UTF-8 text and passed through html_entity_unescape()
    before parsing. If parsing raises ParseError and REPAIR_XML is true, the
    string is passed through repair_article_xml() and parsed a second time;
    a ParseError from that second attempt propagates to the caller. If
    REPAIR_XML is false the ParseError is logged and re-raised.
    """
    # explicit encoding so decoding does not depend on the locale default
    with open(xml_file, "r", encoding="utf-8") as open_file:
        xml_string = open_file.read()
    # unescape any HTML entities to avoid undefined entity XML exceptions later
    xml_string = html_entity_unescape(xml_string)
    try:
        return ElementTree.fromstring(xml_string)
    except ElementTree.ParseError:
        if not REPAIR_XML:
            LOGGER.exception("ParseError raised because REPAIR_XML flag is False")
            raise
        # try to repair the xml namespaces
        return ElementTree.fromstring(repair_article_xml(xml_string))
Esempio n. 5
0
def transform_ejp_zip(zip_file, tmp_dir, output_dir):
    """transform ejp zip file and write a new zip file output

    The zip is unpacked into tmp_dir with zip_lib.unzip_zip(), the resulting
    file map is passed to transform_ejp_files(), and the transformed file map
    is written by rezip() into output_dir under the same file name as the
    input zip. Returns the value returned by rezip().
    """
    # basename handles every separator the platform accepts, unlike a manual
    # split on os.sep
    zip_file_name = os.path.basename(zip_file)

    # profile the zip contents
    asset_file_name_map = zip_lib.unzip_zip(zip_file, tmp_dir)

    # start logging
    LOGGER.info("%s starting to transform", zip_file_name)

    new_asset_file_name_map = transform_ejp_files(
        asset_file_name_map, output_dir, zip_file_name
    )

    # write new zip file
    return rezip(new_asset_file_name_map, output_dir, zip_file_name)
Esempio n. 6
0
def transform_xml_history_tags(root, soup, zip_file_name):
    "strip history tags from article-meta for particular article types"
    article_type = parser.article_type(soup)
    display_channel_list = parser.display_channel(soup)
    LOGGER.info(
        "%s article_type %s, display_channel %s",
        zip_file_name,
        article_type,
        display_channel_list,
    )

    # these article types always qualify
    remove_history = article_type in ["correction", "editorial", "retraction"]
    if not remove_history and article_type == "article-commentary":
        # an article-commentary qualifies only with an "insight" display channel
        display_channels = [value.lower() for value in display_channel_list if value]
        remove_history = "insight" in display_channels
    if not remove_history:
        return root

    LOGGER.info("%s transforming xml history tags", zip_file_name)
    for history_tag in root.findall("./front/article-meta/history"):
        root.find("./front/article-meta").remove(history_tag)
    return root
Esempio n. 7
0
def check_multi_page_figure_pdf(figures, identifier):
    """log a warning for each PDF figure file considered to be multiple pages

    Only figures whose "pages" value is greater than 1 are examined. When
    pdf_utils.pdfimages_exists() is true, pdf_utils.pdf_image_pages() is
    called and the figure counts as multi page only if an image is reported
    on a page after the first. If that check raises, or pdfimages is not
    available, the figure is treated as multi page.
    """
    pdfimages_available = pdf_utils.pdfimages_exists()
    multi_page_pdfs = [
        pdf for pdf in figures if pdf.get("pages") and pdf.get("pages") > 1
    ]
    for pdf in multi_page_pdfs:
        file_name = pdf.get("file_name")
        # multi page unless a successful pdfimages check says otherwise
        is_multi_page = True
        if pdfimages_available:
            LOGGER.info(
                "%s using pdfimages to check PDF figure file: %s",
                identifier,
                file_name,
            )
            try:
                image_pages = pdf_utils.pdf_image_pages(pdf.get("file_path"))
                LOGGER.info(
                    "%s pdfimages found images on pages %s in PDF figure file: %s",
                    identifier,
                    image_pages,
                    file_name,
                )
                is_multi_page = any(page > 1 for page in image_pages)
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt and
                # SystemExit are not swallowed; is_multi_page stays True here
                LOGGER.exception(
                    "%s exception using pdfimages to check PDF figure file: %s",
                    identifier,
                    file_name,
                )
        if is_multi_page:
            LOGGER.warning(
                "%s multiple page PDF figure file: %s",
                identifier,
                file_name,
            )
Esempio n. 8
0
def code_file_transformations(root, asset_file_name_map, output_dir, identifier):
    "zip each file from code_file_list() and return the (from_file, to_file) pairs"
    transformations = []
    for code_file in code_file_list(root):
        upload_name = code_file.get("upload_file_nm")
        LOGGER.info("%s code_file_name: %s", identifier, upload_name)

        # look the file name up in the asset file name map
        asset_name, asset_path = find_in_file_name_map(
            upload_name, asset_file_name_map
        )
        from_file = ArticleZipFile(upload_name, asset_name, asset_path)
        LOGGER.info("%s from_file: %s", identifier, from_file)

        to_file = zip_code_file(from_file, output_dir)
        LOGGER.info("%s to_file: %s", identifier, to_file)

        # record the from file to file transformation
        transformations.append((from_file, to_file))
    return transformations
 def tearDown(self):
     "detach the log handler and empty the temp and output folders"
     LOGGER.removeHandler(self.log_handler)
     # .keepme files are excluded from deletion in both folders
     for folder_attr in ("temp_dir", "output_dir"):
         delete_files_in_folder(getattr(self, folder_attr), filter_out=[".keepme"])
Esempio n. 10
0
 def test_configure_logging(self):
     # a message logged after configure_logging() must appear in the log
     # file as a single line in the expected format
     configure_logging(self.log_file)
     LOGGER.info("test_configure_logging")
     with open(self.log_file, "r") as open_file:
         log_contents = open_file.read()
     self.assertEqual(
         log_contents,
         "INFO elifecleaner:test_init:test_configure_logging: test_configure_logging\n",
     )
Esempio n. 11
0
def rezip(asset_file_name_map, output_dir, zip_file_name):
    "create the zip file from the file map and return its path inside output_dir"
    zip_path = os.path.join(output_dir, zip_file_name)
    LOGGER.info("%s writing new zip file %s", zip_file_name, zip_path)
    create_zip_from_file_map(zip_path, asset_file_name_map)
    return zip_path
Esempio n. 12
0
def write_xml_file(root, xml_asset_path, identifier):
    "convert the XML element to a string and write it to xml_asset_path as UTF-8"
    xml_string = xml_element_to_string(root)
    LOGGER.info("%s writing xml to file %s", identifier, xml_asset_path)
    # explicit encoding so the bytes written do not depend on the locale default
    with open(xml_asset_path, "w", encoding="utf-8") as open_file:
        open_file.write(xml_string)
Esempio n. 13
0
def xml_rewrite_file_tags(xml_asset_path, file_transformations, identifier):
    "parse the XML file, transform its file tags and write it back to the same path"
    parsed_root = parse.parse_article_xml(xml_asset_path)
    LOGGER.info("%s rewriting xml tags", identifier)
    write_xml_file(
        transform_xml_file_tags(parsed_root, file_transformations),
        xml_asset_path,
        identifier,
    )
Esempio n. 14
0
def code_file_zip(file_transformations, output_dir, identifier):
    "log each file transformation and run zip_code_file() on its from_file"
    log_format = "%s zipping from_file: %s, to_file: %s"
    for from_file, to_file in file_transformations:
        LOGGER.info(log_format, identifier, from_file, to_file)
        # the value returned by zip_code_file() is not used here
        zip_code_file(from_file, output_dir)
Esempio n. 15
0
 def tearDown(self):
     "detach the log handler, empty the temp folder and restore parse.REPAIR_XML"
     LOGGER.removeHandler(self.log_handler)
     keep_files = [".keepme"]
     delete_files_in_folder(self.temp_dir, filter_out=keep_files)
     # put the module flag back to the value saved in original_repair_xml_value
     parse.REPAIR_XML = self.original_repair_xml_value
Esempio n. 16
0
def check_extra_files(files, asset_file_name_map, identifier):
    "log a warning for every file that find_extra_files() reports"
    warning_format = "%s has file not listed in the manifest: %s"
    for file_name in find_extra_files(files, asset_file_name_map):
        LOGGER.warning(warning_format, identifier, file_name)