Exemple #1
0
 def test_input_output(self, filename):
     """Round-trip a sample file through parse/output, keeping its doctype."""
     # The raw bytes of the sample file are what the output should reproduce.
     with open(sample_xml(filename), "rb") as sample_file:
         expected_bytes = sample_file.read()
     parse_result = xmlio.parse(sample_xml(filename),
                                return_doctype_dict=True)
     root, doctype_dict = parse_result
     actual_bytes = xmlio.output(root, None, doctype_dict)
     self.assertEqual(actual_bytes, expected_bytes)
 def test_rewrite_subject_group(self, xml, subjects, subject_group_type,
                                overwrite, xml_expected):
     """Apply rewrite_subject_group to parsed XML and compare the result."""
     tree_root = xmlio.parse(xml)
     xmlio.rewrite_subject_group(
         tree_root, subjects, subject_group_type, overwrite)
     # encoding="unicode" is a Python 3 option of ElementTree.tostring
     serialised = ElementTree.tostring(tree_root, encoding="unicode")
     self.assertEqual(serialised, xml_expected)
 def test_output_root(self, xml, publicId, systemId, internalSubset,
                      xml_expected):
     """Output a parsed root with a built "article" doctype as UTF-8."""
     root = xmlio.parse(BytesIO(xml))
     doctype = xmlio.build_doctype(
         "article", publicId, systemId, internalSubset)
     output_bytes = xmlio.output_root(root, doctype, "UTF-8")
     # The result is decoded before it is compared with the expected text.
     output_text = output_bytes.decode("utf-8")
     self.assertEqual(output_text, xml_expected)
Exemple #4
0
 def test_output_root(self, xml, publicId, systemId, internalSubset,
                      xml_expected):
     """Output a parsed root with a built "article" doctype as UTF-8."""
     xml_stream = StringIO.StringIO(xml)
     root = xmlio.parse(xml_stream)
     doctype = xmlio.build_doctype(
         "article", publicId, systemId, internalSubset)
     # The output is compared directly, without decoding.
     self.assertEqual(xmlio.output_root(root, doctype, 'UTF-8'),
                      xml_expected)
Exemple #5
0
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Rewrite the article XML file in place, preserving its doctype.

        When parser.is_poa(soup) is true the tree is additionally passed
        through the subject group, pub date, volume, article-id, self-uri
        and ds.zip helpers before being written back to xml_file.

        doi_id is forwarded to the helper methods, xml_file is the path that
        is both read and overwritten, and new_filenames is scanned for
        '.pdf' and '.zip' entries. filenames is not used by this method.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            # Look the pub date up once; fetch and add it only when missing
            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # Get the pub-date year to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if pdf file then add self-uri tag, but only when the parser
            # returned an empty (not None) self-uri value
            self_uri = parser.self_uri(soup)
            if self_uri is not None and len(self_uri) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if ds.zip file is there, then add it to the xml
            # (the last '.zip' entry wins when there are several)
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file,
                                                  root)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        # Byte literals: the output is written to a binary file below, and
        # b"..." equals a plain str literal on Python 2.
        reparsed_string = reparsed_string.replace(b"\n", b"").replace(b"\t", b"")

        # Context manager closes the handle even if the write raises
        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
Exemple #6
0
 def test_convert_xlink_href(self, name_map, xml_input_filename,
                             xml_expected_filename):
     """Run convert_xlink_href with name_map and compare with the expected file."""
     xmlio.register_xmlns()
     input_path = sample_xml(xml_input_filename)
     root = xmlio.parse(input_path)
     # The value returned by convert_xlink_href is not asserted on.
     xmlio.convert_xlink_href(root, name_map)
     actual_output = xmlio.output(root)
     with open(sample_xml(xml_expected_filename), "rb") as expected_file:
         expected_output = expected_file.read()
     self.assertEqual(actual_output, expected_output)
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Rewrite the article XML file in place, preserving its doctype.

        When parser.is_poa(soup) is true the tree is additionally passed
        through the subject group, pub date, volume, article-id, self-uri
        and ds.zip helpers before being written back to xml_file.

        doi_id is forwarded to the helper methods, xml_file is the path that
        is both read and overwritten, and new_filenames is scanned for
        '.pdf' and '.zip' entries. filenames is not used by this method.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            # Look the pub date up once; fetch and add it only when missing
            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # Get the pub-date year to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if pdf file then add self-uri tag, but only when the parser
            # returned an empty (not None) self-uri value
            self_uri = parser.self_uri(soup)
            if self_uri is not None and len(self_uri) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if ds.zip file is there, then add it to the xml
            # (the last '.zip' entry wins when there are several)
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

        # Start the file output
        reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        # Byte literals: the output is written to a binary file below, and
        # b"..." equals a plain str literal on Python 2.
        reparsed_string = reparsed_string.replace(b"\n", b"").replace(b"\t", b"")

        # Context manager closes the handle even if the write raises
        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
    def rewrite_xml_file(self, xml_filename, file_name_map):
        """Run convert_xlink_href on an XML file in the tmp dir and rewrite it.

        xml_filename is joined onto self.get_tmp_dir(); that file is parsed,
        passed with file_name_map to xmlio.convert_xlink_href, and the
        xmlio.output result is written back over the same path.
        """

        local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)

        xmlio.register_xmlns()
        root = xmlio.parse(local_xml_filename)

        # Convert xlink href values (the returned value is not needed here)
        xmlio.convert_xlink_href(root, file_name_map)

        # Start the file output
        reparsed_string = xmlio.output(root)
        # Context manager closes the handle even if the write raises
        with open(local_xml_filename, 'wb') as open_file:
            open_file.write(reparsed_string)
def convert_xml(xml_file, file_name_map):
    """Run convert_xlink_href on xml_file and rewrite it with its doctype.

    The file at xml_file is parsed together with its doctype dict, passed
    with file_name_map to xmlio.convert_xlink_href, and the xmlio.output
    result is written back over the same path.
    """

    # Register namespaces
    xmlio.register_xmlns()

    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

    # Convert xlink href values (the returned value is not used yet)
    xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted

    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

    # Context manager closes the handle even if the write raises
    with open(xml_file, 'wb') as open_file:
        open_file.write(reparsed_string)
Exemple #10
0
    def convert_xml(self, xml_file, file_name_map):
        """Run convert_xlink_href on xml_file and rewrite it with its doctype.

        The file at xml_file is parsed together with its doctype dict, passed
        with file_name_map to xmlio.convert_xlink_href, and the xmlio.output
        result is written back over the same path.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        # Convert xlink href values (the returned value is not used yet)
        xmlio.convert_xlink_href(root, file_name_map)
        # TODO - compare whether all file names were converted

        # Start the file output
        reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

        # Context manager closes the handle even if the write raises
        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
    def rewrite_xml_file(self, xml_filename, file_name_map):
        """Run convert_xlink_href on an XML file in the tmp dir and rewrite it.

        xml_filename is joined onto self.get_tmp_dir(); that file is parsed
        together with its doctype dict, passed with file_name_map to
        xmlio.convert_xlink_href, and the xmlio.output result is written
        back over the same path.
        """

        local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)

        xmlio.register_xmlns()
        root, doctype_dict = xmlio.parse(local_xml_filename,
                                         return_doctype_dict=True)

        # Convert xlink href values (the returned value is not needed here)
        xmlio.convert_xlink_href(root, file_name_map)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)
        # Context manager closes the handle even if the write raises
        with open(local_xml_filename, 'wb') as open_file:
            open_file.write(reparsed_string)
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Rewrite the article XML file in place with newlines and tabs removed.

        When parser.is_poa(soup) is true the tree is additionally passed
        through the subject group, pub date, article-id and ds.zip helpers
        before being written back to xml_file.

        doi_id is forwarded to the helper methods, xml_file is the path that
        is both read and overwritten, and new_filenames is scanned for
        '.zip' entries. filenames is not used by this method.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root = xmlio.parse(xml_file)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            if parser.pub_date(soup) is None:
                # add the published date to the XML
                root = self.add_pub_date_to_xml(doi_id, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if ds.zip file is there, then add it to the xml
            # (the last '.zip' entry wins when there are several)
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

        # Start the file output
        reparsed_string = xmlio.output(root)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        # Byte literals: the output is written to a binary file below, and
        # b"..." equals a plain str literal on Python 2.
        reparsed_string = reparsed_string.replace(b"\n", b"").replace(b"\t", b"")

        # Context manager closes the handle even if the write raises
        with open(xml_file, 'wb') as open_file:
            open_file.write(reparsed_string)
Exemple #13
0
 def test_output(self, xml, type, xml_expected):
     """Parse XML from a StringIO and compare xmlio.output with the expected value."""
     parsed_root = xmlio.parse(StringIO.StringIO(xml))
     # The type argument is passed through to xmlio.output unchanged.
     self.assertEqual(xmlio.output(parsed_root, type), xml_expected)
 def test_parse_doctype(self, xml, doctype_dict_expected):
     """Parse with the second argument True and check the returned doctype dict."""
     parse_result = xmlio.parse(BytesIO(xml), True)
     # Only the second item of the returned pair is asserted on.
     _root, parsed_doctype = parse_result
     self.assertEqual(parsed_doctype, doctype_dict_expected)
Exemple #15
0
 def test_get_first_element_index(self, filename, tag_name, index):
     """Check get_first_element_index for tag_name in a parsed sample file."""
     sample_root = xmlio.parse(sample_xml(filename))
     found_index = xmlio.get_first_element_index(sample_root, tag_name)
     self.assertEqual(found_index, index)
Exemple #16
0
 def test_input_output_forcing_jats_doctype(self, filename):
     """Check that output with the 'JATS' type reproduces the sample file bytes."""
     with open(sample_xml(filename), "rb") as sample_file:
         expected_bytes = sample_file.read()
     # The doctype dict is returned by parse but not passed to output here.
     root, _doctype_dict = xmlio.parse(sample_xml(filename),
                                       return_doctype_dict=True)
     actual_bytes = xmlio.output(root, 'JATS')
     self.assertEqual(actual_bytes, expected_bytes)
 def test_parse_doctype_processing(self, xml, pi_target_expected):
     """Check the target of the first processing instruction node from parse."""
     parse_result = xmlio.parse(BytesIO(xml), True, True)
     # With both flags True, parse is unpacked into three values.
     _root, _doctype_dict, pi_nodes = parse_result
     first_target = pi_nodes[0].target
     self.assertEqual(first_target, pi_target_expected)
Exemple #18
0
 def test_parse(self, filename):
     """Check that parsing a sample file gives an object of exactly type Element."""
     parsed_type = type(xmlio.parse(sample_xml(filename)))
     self.assertEqual(parsed_type, Element)
 def test_output(self, xml, doc_type, xml_expected):
     """Parse XML bytes and compare the decoded xmlio.output with the expected text."""
     parsed_root = xmlio.parse(BytesIO(xml))
     output_bytes = xmlio.output(parsed_root, doc_type)
     # The output is decoded as UTF-8 before the comparison.
     output_text = output_bytes.decode("utf-8")
     self.assertEqual(output_text, xml_expected)
 def test_output_processing_instructions(self, xml, doc_type, xml_expected):
     """Pass parsed processing instructions back into output and check the text."""
     parse_result = xmlio.parse(BytesIO(xml), True, True)
     root, doctype_dict, pi_nodes = parse_result
     output_bytes = xmlio.output(root, doc_type, doctype_dict, pi_nodes)
     # The output is decoded as UTF-8 before the comparison.
     output_text = output_bytes.decode("utf-8")
     self.assertEqual(output_text, xml_expected)