def transform_ejp_zip(zip_file, tmp_dir, output_dir):
    """Transform an EJP zip file and write a new zip file as output.

    The zip is unpacked into ``tmp_dir``, the extracted files are handed to
    ``transform_ejp_files`` and the resulting file map is passed to ``rezip``
    together with ``output_dir`` and the input zip's file name.

    Args:
        zip_file: path to the zip file to transform.
        tmp_dir: directory the zip contents are extracted into.
        output_dir: directory passed to the transformations and to ``rezip``.

    Returns:
        The new zip file path, as returned by ``rezip``.
    """
    # basename understands every separator the platform accepts, whereas
    # splitting on os.sep alone misses "/" separated paths on Windows
    zip_file_name = os.path.basename(zip_file)

    # profile the zip contents
    asset_file_name_map = zip_lib.unzip_zip(zip_file, tmp_dir)

    # start logging
    LOGGER.info("%s starting to transform", zip_file_name)

    new_asset_file_name_map = transform_ejp_files(
        asset_file_name_map, output_dir, zip_file_name
    )

    # write new zip file
    return rezip(new_asset_file_name_map, output_dir, zip_file_name)
def transform_xml_history_tags(root, soup, zip_file_name):
    """Strip history tags from the XML of particular article types.

    History tags are removed when the article type is "correction",
    "editorial" or "retraction", or when it is "article-commentary" and one
    of the display channel values equals "insight" (case-insensitive).

    Args:
        root: XML root element searched for ./front/article-meta/history.
        soup: parsed article passed to the ``parser`` functions.
        zip_file_name: value prefixed to each log message.

    Returns:
        The same ``root`` object, modified in place when tags were removed.
    """
    article_type = parser.article_type(soup)
    display_channel_list = parser.display_channel(soup)
    LOGGER.info(
        "%s article_type %s, display_channel %s",
        zip_file_name,
        article_type,
        display_channel_list,
    )

    strip_history = article_type in ("correction", "editorial", "retraction")
    if not strip_history and article_type == "article-commentary":
        # display channels are only inspected for article-commentary
        channels = [channel.lower() for channel in display_channel_list if channel]
        strip_history = "insight" in channels

    if not strip_history:
        return root

    LOGGER.info("%s transforming xml history tags", zip_file_name)
    # remove history tag
    history_tags = root.findall("./front/article-meta/history")
    if history_tags:
        article_meta = root.find("./front/article-meta")
        for history_tag in history_tags:
            article_meta.remove(history_tag)
    return root
def check_multi_page_figure_pdf(figures, identifier):
    """Log a warning for each PDF figure file considered to be multi page.

    Only figures whose "pages" value is greater than 1 are checked. When
    pdfimages is available it is used to find which pages hold images, and
    the figure counts as multi page if any image is on a page after the
    first. Without pdfimages, or if the pdfimages check raises, the figure
    is treated as multi page.

    Args:
        figures: iterable of dicts; the keys read are "pages", "file_name"
            and "file_path".
        identifier: value prefixed to each log message.
    """
    pdfimages_available = pdf_utils.pdfimages_exists()
    candidates = [
        pdf for pdf in figures if pdf.get("pages") and pdf.get("pages") > 1
    ]
    for pdf in candidates:
        is_multi_page = False
        if pdfimages_available:
            LOGGER.info(
                "%s using pdfimages to check PDF figure file: %s",
                identifier,
                pdf.get("file_name"),
            )
            try:
                image_pages = pdf_utils.pdf_image_pages(pdf.get("file_path"))
                LOGGER.info(
                    "%s pdfimages found images on pages %s in PDF figure file: %s",
                    identifier,
                    image_pages,
                    pdf.get("file_name"),
                )
                is_multi_page = any(page > 1 for page in image_pages)
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed by this best-effort check
                LOGGER.exception(
                    "%s exception using pdfimages to check PDF figure file: %s",
                    identifier,
                    pdf.get("file_name"),
                )
                # consider it multi page in the case pdfimages raises an exception
                is_multi_page = True
        else:
            # no pdfimages to refine the result, rely on the page count alone
            is_multi_page = True

        if is_multi_page:
            LOGGER.warning(
                "%s multiple page PDF figure file: %s",
                identifier,
                pdf.get("file_name"),
            )
def code_file_transformations(root, asset_file_name_map, output_dir, identifier):
    """Zip each code file listed in the XML and collect the file changes.

    Args:
        root: XML root passed to ``code_file_list``.
        asset_file_name_map: file name map searched for each code file.
        output_dir: directory passed to ``zip_code_file``.
        identifier: value prefixed to each log message.

    Returns:
        A list of (from_file, to_file) tuples, one per code file.
    """
    transformations = []
    for code_file in code_file_list(root):
        upload_name = code_file.get("upload_file_nm")
        LOGGER.info("%s code_file_name: %s", identifier, upload_name)

        # look up the file name and path recorded for this code file
        asset_name, asset_path = find_in_file_name_map(
            upload_name, asset_file_name_map
        )
        source_file = ArticleZipFile(upload_name, asset_name, asset_path)
        LOGGER.info("%s from_file: %s", identifier, source_file)

        zipped_file = zip_code_file(source_file, output_dir)
        LOGGER.info("%s to_file: %s", identifier, zipped_file)

        # remember which file was turned into which
        transformations.append((source_file, zipped_file))
    return transformations
def test_configure_logging(self):
    # configure logging to self.log_file, emit one INFO record, then
    # compare the complete log file content with the expected line
    configure_logging(self.log_file)
    LOGGER.info("test_configure_logging")
    with open(self.log_file, "r") as log_handle:
        log_content = log_handle.read()
    self.assertEqual(
        log_content,
        "INFO elifecleaner:test_init:test_configure_logging: test_configure_logging\n",
    )
def rezip(asset_file_name_map, output_dir, zip_file_name):
    """Write a new zip file named ``zip_file_name`` inside ``output_dir``.

    Args:
        asset_file_name_map: file map handed to ``create_zip_from_file_map``.
        output_dir: directory the zip file path is built from.
        zip_file_name: file name of the zip, also prefixed to the log message.

    Returns:
        The path of the new zip file.
    """
    zip_path = os.path.join(output_dir, zip_file_name)
    LOGGER.info("%s writing new zip file %s", zip_file_name, zip_path)
    create_zip_from_file_map(zip_path, asset_file_name_map)
    return zip_path
def write_xml_file(root, xml_asset_path, identifier):
    """Convert ``root`` to a string and write it to ``xml_asset_path``.

    Args:
        root: XML element passed to ``xml_element_to_string``.
        xml_asset_path: path of the file to write, replaced if it exists.
        identifier: value prefixed to the log message.
    """
    xml_content = xml_element_to_string(root)
    LOGGER.info("%s writing xml to file %s", identifier, xml_asset_path)
    # NOTE(review): no explicit encoding is given, so the platform default
    # is used - confirm that suits the XML content being written
    with open(xml_asset_path, mode="w") as xml_file:
        xml_file.write(xml_content)
def xml_rewrite_file_tags(xml_asset_path, file_transformations, identifier):
    """Rewrite the file tags of the XML file at ``xml_asset_path`` in place.

    The file is parsed, passed through ``transform_xml_file_tags`` with
    ``file_transformations``, and written back to the same path.

    Args:
        xml_asset_path: path of the XML file to read and overwrite.
        file_transformations: value handed to ``transform_xml_file_tags``.
        identifier: value prefixed to each log message.
    """
    parsed_root = parse.parse_article_xml(xml_asset_path)

    # rewrite the XML tags
    LOGGER.info("%s rewriting xml tags", identifier)
    rewritten_root = transform_xml_file_tags(parsed_root, file_transformations)

    write_xml_file(rewritten_root, xml_asset_path, identifier)
def code_file_zip(file_transformations, output_dir, identifier):
    """Zip the from_file of each file transformation into ``output_dir``.

    Args:
        file_transformations: iterable of (from_file, to_file) pairs; the
            to_file is only used in the log message.
        output_dir: directory passed to ``zip_code_file``.
        identifier: value prefixed to each log message.
    """
    for from_file, to_file in file_transformations:
        LOGGER.info(
            "%s zipping from_file: %s, to_file: %s", identifier, from_file, to_file
        )
        # the return value is not needed here, so it is no longer bound
        # to the loop variable where it was never read
        zip_code_file(from_file, output_dir)