def repackage_archive_zip_to_pmc_zip(self, doi_id):
    "repackage the zip file in the TMP_DIR to a PMC zip format"
    # unzip contents
    zip_input_dir = os.path.join(self.get_tmp_dir(), self.TMP_DIR)
    zip_extracted_dir = os.path.join(self.get_tmp_dir(), self.JUNK_DIR)
    zip_renamed_files_dir = os.path.join(self.get_tmp_dir(), self.RENAME_DIR)
    pmc_zip_output_dir = os.path.join(self.get_tmp_dir(), self.INPUT_DIR)
    archive_zip_name = glob.glob(zip_input_dir + "/*.zip")[0]
    with zipfile.ZipFile(archive_zip_name, 'r') as myzip:
        myzip.extractall(zip_extracted_dir)

    # rename the files and profile the files
    file_name_map = article_processing.rename_files_remove_version_number(
        files_dir=zip_extracted_dir,
        output_dir=zip_renamed_files_dir
    )
    if self.logger:
        self.logger.info(
            "FTPArticle running %s workflow for article %s, file_name_map" %
            (self.workflow, self.doi_id))
        self.logger.info(file_name_map)

    # convert the XML
    article_xml_file = glob.glob(zip_renamed_files_dir + "/*.xml")[0]
    article_processing.convert_xml(xml_file=article_xml_file,
                                   file_name_map=file_name_map)

    # rezip the files into PMC zip format
    soup = parser.parse_document(article_xml_file)
    volume = parser.volume(soup)
    pmc_zip_file_name = article_processing.new_pmc_zip_filename(self.journal, volume, doi_id)
    with zipfile.ZipFile(os.path.join(pmc_zip_output_dir, pmc_zip_file_name), 'w',
                         zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
        dirfiles = article_processing.file_list(zip_renamed_files_dir)
        for df in dirfiles:
            filename = df.split(os.sep)[-1]
            new_zipfile.write(df, filename)

    return True
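# A minimal, self-contained sketch of the rezip step at the end of
# repackage_archive_zip_to_pmc_zip above: each file is written into the new zip under
# its base name only, giving the PMC zip a flat structure. The helper name rezip_flat
# and the use of glob instead of article_processing.file_list are illustrative
# assumptions, not part of the original code.
import glob
import os
import zipfile

def rezip_flat(files_dir, zip_path):
    # add every regular file from files_dir to zip_path at the archive root
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as new_zipfile:
        for df in glob.glob(os.path.join(files_dir, "*")):
            if os.path.isfile(df):
                new_zipfile.write(df, df.split(os.sep)[-1])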
def profile_article(self, document):
    """
    Temporary, profile the article by folder names in test data set
    In real code we still want this to return the same values
    """
    # Temporary setting of version values from directory names
    soup = self.article_soup(self.article_xml_file())

    # elife id / doi id / manuscript id
    fid = parser.doi(soup).split('.')[-1]

    # article status
    if parser.is_poa(soup) is True:
        status = 'poa'
    else:
        status = 'vor'

    # version
    version = self.version_number(document)

    # volume
    volume = parser.volume(soup)

    return (fid, status, version, volume)
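# A worked example of the manuscript id extraction in profile_article above: eLife DOIs
# take the form "10.7554/eLife.<id>", so splitting on "." and keeping the last piece
# yields the numeric id. The DOI value below is a hypothetical example.
assert "10.7554/eLife.00666".split('.')[-1] == "00666"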
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):

    # Register namespaces
    xmlio.register_xmlns()

    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

    soup = self.article_soup(xml_file)

    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)

        pub_date = None
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)

        if parser.volume(soup) is None:
            # Get the pub-date year to calculate the volume
            year = pub_date[0]
            volume = year - 2011
            self.add_volume_to_xml(doi_id, volume, root)

        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)

        # if pdf file then add self-uri tag
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)

        # if ds.zip file is there, then add it to the xml
        poa_ds_zip_file = None
        for f in new_filenames:
            if f.endswith('.zip'):
                poa_ds_zip_file = f
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

    with open(xml_file, 'wb') as open_file:
        open_file.write(reparsed_string)
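# A worked example of the volume calculation in convert_xml above, assuming the same
# rule of pub-date year minus 2011 (so volume 1 corresponds to 2012). The helper name
# volume_from_year is a hypothetical illustration, not part of the original code.
def volume_from_year(year):
    # volume 1 is the 2012 volume, so subtract 2011 as convert_xml does
    return year - 2011

assert volume_from_year(2012) == 1
assert volume_from_year(2015) == 4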
def build_article_from_xml(article_xml_filename, detail="brief"):
    """
    Parse JATS XML with elifetools parser, and populate an eLifePOA article object
    Basic data crossref needs: article_id, doi, title, contributors with names set
    detail="brief" is normally enough,
    detail="full" will populate all the contributor affiliations that are linked by xref tags
    """

    error_count = 0

    soup = parser.parse_document(article_xml_filename)

    # Get DOI
    doi = parser.doi(soup)

    # Create the article object
    article = eLifePOA(doi, title=None)

    # Related articles
    article.related_articles = build_related_articles(parser.related_article(soup))

    # Get publisher_id and set object manuscript value
    publisher_id = parser.publisher_id(soup)
    article.manuscript = publisher_id

    # Set the articleType
    article_type = parser.article_type(soup)
    if article_type:
        article.articleType = article_type

    # title
    article.title = parser.full_title(soup)
    #print article.title

    # abstract
    article.abstract = clean_abstract(parser.full_abstract(soup))

    # digest
    article.digest = clean_abstract(parser.full_digest(soup))

    # elocation-id
    article.elocation_id = parser.elocation_id(soup)

    # contributors
    all_contributors = parser.contributors(soup, detail)
    author_contributors = filter(lambda con: con.get('type')
                                 in ['author', 'on-behalf-of'], all_contributors)
    contrib_type = "author"
    contributors = build_contributors(author_contributors, contrib_type)

    contrib_type = "author non-byline"
    authors = parser.authors_non_byline(soup, detail)
    contributors_non_byline = build_contributors(authors, contrib_type)
    article.contributors = contributors + contributors_non_byline

    # license href
    license = eLifeLicense()
    license.href = parser.license_url(soup)
    article.license = license

    # article_category
    article.article_categories = parser.category(soup)

    # keywords
    article.author_keywords = parser.keywords(soup)

    # research organisms
    article.research_organisms = parser.research_organism(soup)

    # funding awards
    article.funding_awards = build_funding(parser.full_award_groups(soup))

    # references or citations
    article.ref_list = build_ref_list(parser.refs(soup))

    # components with component DOI
    article.component_list = build_components(parser.components(soup))

    # History dates
    date_types = ["received", "accepted"]
    for date_type in date_types:
        history_date = parser.history_date(soup, date_type)
        if history_date:
            date_instance = eLifeDate(date_type, history_date)
            article.add_date(date_instance)

    # Pub date
    pub_date = parser.pub_date(soup)
    if pub_date:
        date_instance = eLifeDate("pub", pub_date)
        article.add_date(date_instance)

    # Set the volume if present
    volume = parser.volume(soup)
    if volume:
        article.volume = volume

    article.is_poa = parser.is_poa(soup)

    return article, error_count
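# A minimal usage sketch for build_article_from_xml, assuming the module's own imports
# (elifetools parser, eLifePOA, clean_abstract, etc.) are available at runtime; the
# JATS XML file name below is a hypothetical example, not a file shipped with the code.
if __name__ == "__main__":
    article, error_count = build_article_from_xml("elife_poa_e00666.xml", detail="full")
    print(article.manuscript, article.title, error_count)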