def test_input_output(self, filename):
    """Round-trip a sample file through parse/output, retaining its doctype."""
    with open(sample_xml(filename), "rb") as xml_file:
        expected_bytes = xml_file.read()
    root, doctype_dict = xmlio.parse(sample_xml(filename), return_doctype_dict=True)
    round_tripped = xmlio.output(root, None, doctype_dict)
    self.assertEqual(round_tripped, expected_bytes)
def test_rewrite_subject_group(self, xml, subjects, subject_group_type, overwrite, xml_expected):
    """Rewriting subject groups should yield the expected serialized XML."""
    parsed_root = xmlio.parse(xml)
    xmlio.rewrite_subject_group(parsed_root, subjects, subject_group_type, overwrite)
    # encoding="unicode" returns a str rather than bytes (Python 3 option)
    serialized = ElementTree.tostring(parsed_root, encoding="unicode")
    self.assertEqual(serialized, xml_expected)
def test_output_root(self, xml, publicId, systemId, internalSubset, xml_expected):
    """output_root should serialize the tree with the constructed doctype."""
    parsed_root = xmlio.parse(BytesIO(xml))
    doctype = xmlio.build_doctype("article", publicId, systemId, internalSubset)
    serialized = xmlio.output_root(parsed_root, doctype, "UTF-8")
    self.assertEqual(serialized.decode("utf-8"), xml_expected)
def test_output_root(self, xml, publicId, systemId, internalSubset, xml_expected):
    """output_root should serialize the tree with the constructed doctype.

    NOTE(review): uses the Python 2 ``StringIO.StringIO`` — legacy variant.
    """
    parsed_root = xmlio.parse(StringIO.StringIO(xml))
    doctype = xmlio.build_doctype("article", publicId, systemId, internalSubset)
    serialized = xmlio.output_root(parsed_root, doctype, 'UTF-8')
    self.assertEqual(serialized, xml_expected)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite the article XML file in place for publishing.

    For PoA articles this capitalises subject group values, adds the
    pub date and volume when missing, overwrites the article-id, and
    adds self-uri (pdf) and ds.zip references when those files appear
    in new_filenames. The file is then re-serialized and overwritten.

    :param doi_id: numeric DOI id of the article
    :param xml_file: path of the XML file to rewrite
    :param filenames: original file names (unused here; kept for interface)
    :param new_filenames: renamed output files to reference in the XML
    """
    # Register namespaces so serialization keeps the expected prefixes
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        pub_date = None
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)
        if parser.volume(soup) is None:
            # Get the pub-date year to calculate the volume
            # (volume = years since 2011 — TODO confirm publisher convention)
            year = pub_date[0]
            volume = year - 2011
            self.add_volume_to_xml(doi_id, volume, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if pdf file then add self-uri tag (only when self-uri list is empty)
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)
        # if ds.zip file is there, then add it to the xml (last .zip wins)
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')
    # context manager guarantees the file handle is closed even on a write error
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def test_convert_xlink_href(self, name_map, xml_input_filename, xml_expected_filename):
    """Converting xlink:href values should match the expected sample file."""
    xmlio.register_xmlns()
    parsed_root = xmlio.parse(sample_xml(xml_input_filename))
    converted_count = xmlio.convert_xlink_href(parsed_root, name_map)
    produced = xmlio.output(parsed_root)
    with open(sample_xml(xml_expected_filename), "rb") as expected_file:
        expected_bytes = expected_file.read()
    self.assertEqual(produced, expected_bytes)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite the article XML file in place for publishing.

    For PoA articles this capitalises subject group values, adds the
    pub date and volume when missing, overwrites the article-id, and
    adds self-uri (pdf) and ds.zip references when those files appear
    in new_filenames. The file is then re-serialized and overwritten.

    :param doi_id: numeric DOI id of the article
    :param xml_file: path of the XML file to rewrite
    :param filenames: original file names (unused here; kept for interface)
    :param new_filenames: renamed output files to reference in the XML
    """
    # Register namespaces so serialization keeps the expected prefixes
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        pub_date = None
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            pub_date = self.get_pub_date_if_missing(doi_id)
            root = self.add_pub_date_to_xml(doi_id, pub_date, root)
        else:
            pub_date = parser.pub_date(soup)
        if parser.volume(soup) is None:
            # Get the pub-date year to calculate the volume
            # (volume = years since 2011 — TODO confirm publisher convention)
            year = pub_date[0]
            volume = year - 2011
            self.add_volume_to_xml(doi_id, volume, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if pdf file then add self-uri tag (only when self-uri list is empty)
        if parser.self_uri(soup) is not None and len(parser.self_uri(soup)) == 0:
            for filename in new_filenames:
                if filename.endswith('.pdf'):
                    root = self.add_self_uri_to_xml(doi_id, filename, root)
        # if ds.zip file is there, then add it to the xml (last .zip wins)
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')
    # context manager guarantees the file handle is closed even on a write error
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def rewrite_xml_file(self, xml_filename, file_name_map):
    """Rewrite xlink:href values in an XML file in the tmp directory.

    :param xml_filename: file name of the XML inside the tmp dir
    :param file_name_map: dict mapping old href values to new ones
    """
    local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)
    xmlio.register_xmlns()
    root = xmlio.parse(local_xml_filename)
    # Convert xlink href values
    total = xmlio.convert_xlink_href(root, file_name_map)
    # Serialize and overwrite the file; the with-block closes the
    # handle even if the write raises
    reparsed_string = xmlio.output(root)
    with open(local_xml_filename, 'wb') as out_file:
        out_file.write(reparsed_string)
def convert_xml(xml_file, file_name_map):
    """Rewrite an XML file in place, converting its xlink:href values.

    :param xml_file: path of the XML file to rewrite
    :param file_name_map: dict mapping old href values to new ones
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    # Convert xlink href values
    total = xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted
    # Serialize with the original doctype and overwrite the file;
    # the with-block closes the handle even if the write raises
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def convert_xml(self, xml_file, file_name_map):
    """Rewrite an XML file in place, converting its xlink:href values.

    :param xml_file: path of the XML file to rewrite
    :param file_name_map: dict mapping old href values to new ones
    """
    # Register namespaces
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)
    # Convert xlink href values
    total = xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted
    # Serialize with the original doctype and overwrite the file;
    # the with-block closes the handle even if the write raises
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def rewrite_xml_file(self, xml_filename, file_name_map):
    """Rewrite xlink:href values in an XML file, retaining its doctype.

    :param xml_filename: file name of the XML inside the tmp dir
    :param file_name_map: dict mapping old href values to new ones
    """
    local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)
    xmlio.register_xmlns()
    root, doctype_dict = xmlio.parse(local_xml_filename, return_doctype_dict=True)
    # Convert xlink href values
    total = xmlio.convert_xlink_href(root, file_name_map)
    # Serialize with the original doctype and overwrite the file;
    # the with-block closes the handle even if the write raises
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)
    with open(local_xml_filename, 'wb') as out_file:
        out_file.write(reparsed_string)
def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
    """Rewrite the article XML file in place for publishing.

    For PoA articles this capitalises subject group values, adds the
    pub date when missing, overwrites the article-id, and adds a
    ds.zip reference when one appears in new_filenames. The file is
    then re-serialized and overwritten.

    :param doi_id: numeric DOI id of the article
    :param xml_file: path of the XML file to rewrite
    :param filenames: original file names (unused here; kept for interface)
    :param new_filenames: renamed output files to reference in the XML
    """
    # Register namespaces so serialization keeps the expected prefixes
    xmlio.register_xmlns()
    root = xmlio.parse(xml_file)
    soup = self.article_soup(xml_file)
    if parser.is_poa(soup):
        # Capitalise subject group values in article categories
        root = self.subject_group_convert_in_xml(root)
        if parser.pub_date(soup) is None:
            # add the published date to the XML
            root = self.add_pub_date_to_xml(doi_id, root)
        # set the article-id, to overwrite the v2, v3 value if present
        root = self.set_article_id_xml(doi_id, root)
        # if ds.zip file is there, then add it to the xml (last .zip wins)
        poa_ds_zip_file = None
        for new_name in new_filenames:
            if new_name.endswith('.zip'):
                poa_ds_zip_file = new_name
        if poa_ds_zip_file:
            root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)
    # Start the file output
    reparsed_string = xmlio.output(root)
    # Remove extra whitespace here for PoA articles to clean up and one VoR file too
    reparsed_string = reparsed_string.replace("\n",'').replace("\t",'')
    # context manager guarantees the file handle is closed even on a write error
    with open(xml_file, 'wb') as out_file:
        out_file.write(reparsed_string)
def test_output(self, xml, type, xml_expected):
    """xmlio.output should serialize the parsed XML as expected."""
    parsed_root = xmlio.parse(StringIO.StringIO(xml))
    serialized = xmlio.output(parsed_root, type)
    self.assertEqual(serialized, xml_expected)
def test_parse_doctype(self, xml, doctype_dict_expected):
    """Parsing with the doctype flag should capture the doctype details."""
    _root, parsed_doctype = xmlio.parse(BytesIO(xml), True)
    self.assertEqual(parsed_doctype, doctype_dict_expected)
def test_get_first_element_index(self, filename, tag_name, index):
    """get_first_element_index should find the expected tag position."""
    parsed_root = xmlio.parse(sample_xml(filename))
    found_index = xmlio.get_first_element_index(parsed_root, tag_name)
    self.assertEqual(found_index, index)
def test_input_output_forcing_jats_doctype(self, filename):
    """Round-trip a sample file while forcing the JATS doctype on output."""
    with open(sample_xml(filename), "rb") as xml_file:
        expected_bytes = xml_file.read()
    root, _doctype_dict = xmlio.parse(sample_xml(filename), return_doctype_dict=True)
    self.assertEqual(xmlio.output(root, 'JATS'), expected_bytes)
def test_parse_doctype_processing(self, xml, pi_target_expected):
    """Parsing should collect processing instruction nodes from the XML."""
    parse_result = xmlio.parse(BytesIO(xml), True, True)
    pi_nodes = parse_result[2]
    self.assertEqual(pi_nodes[0].target, pi_target_expected)
def test_parse(self, filename):
    """Parsing a sample file should yield an Element root."""
    parsed_root = xmlio.parse(sample_xml(filename))
    self.assertEqual(type(parsed_root), Element)
def test_output(self, xml, doc_type, xml_expected):
    """Serialized output should decode to the expected unicode string."""
    parsed_root = xmlio.parse(BytesIO(xml))
    serialized = xmlio.output(parsed_root, doc_type)
    self.assertEqual(serialized.decode("utf-8"), xml_expected)
def test_output_processing_instructions(self, xml, doc_type, xml_expected):
    """Output should include the processing instructions captured at parse time."""
    root, doctype_dict, pi_nodes = xmlio.parse(BytesIO(xml), True, True)
    serialized = xmlio.output(root, doc_type, doctype_dict, pi_nodes)
    self.assertEqual(serialized.decode("utf-8"), xml_expected)