Ejemplo n.º 1
0
 def test_input_output(self, filename):
     """Round-trip a sample file through parse/output, keeping its doctype.

     The bytes read straight from the sample file are the expected output.
     """
     with open(sample_xml(filename), "rb") as handle:
         expected_bytes = handle.read()
     parsed = xmlio.parse(sample_xml(filename), return_doctype_dict=True)
     root, doctype_dict = parsed
     actual_bytes = xmlio.output(root, None, doctype_dict)
     self.assertEqual(actual_bytes, expected_bytes)
Ejemplo n.º 2
0
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Rewrite the article XML file in place, filling in PoA metadata.

        The file is parsed together with its doctype dict, modified only when
        the parser reports it as a PoA article, stripped of newline and tab
        characters, and written back over ``xml_file``.

        Args:
            doi_id: article identifier passed on to the XML helper methods.
            xml_file: path of the XML file to read and then overwrite.
            filenames: not used by this method.
            new_filenames: iterable of file names, checked for names ending
                in ``.pdf`` and ``.zip``.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            # Look the pub date up once; fall back to the stored value
            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # The first element of pub_date is used as the year, and the
                # volume is that year minus 2011
                year = pub_date[0]
                volume = year - 2011
                # NOTE(review): unlike the other helpers the return value is
                # not assigned to root - presumably root is changed in place
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if pdf file then add self-uri tag, only when the parser
            # returned an empty (but not None) self-uri value
            self_uri_list = parser.self_uri(soup)
            if self_uri_list is not None and len(self_uri_list) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if ds.zip file is there, then add it to the xml
            # (when several names match, the last one is used)
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file,
                                                  root)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        # The context manager closes the file even if the write fails
        with open(xml_file, 'wb') as output_file:
            output_file.write(reparsed_string)
Ejemplo n.º 3
0
 def test_convert_xlink_href(self, name_map, xml_input_filename,
                             xml_expected_filename):
     """Run convert_xlink_href with name_map and compare against a sample.

     The input sample is parsed, converted and serialised; the result must
     equal the raw bytes of the expected sample file.
     """
     xmlio.register_xmlns()
     root = xmlio.parse(sample_xml(xml_input_filename))
     # The return value is not checked here; presumably root is modified
     # in place, since it is what gets serialised below
     xmlio.convert_xlink_href(root, name_map)
     xml_output = xmlio.output(root)
     with open(sample_xml(xml_expected_filename), "rb") as xml_file:
         xml_output_expected = xml_file.read()
     self.assertEqual(xml_output, xml_output_expected)
Ejemplo n.º 4
0
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Update the XML file at ``xml_file`` in place.

        Parses the file along with its doctype dict. When the parser says
        the article is PoA, adds any missing pub date, volume, self-uri and
        ds.zip entries and resets the article-id. The serialised result has
        all newline and tab characters removed before it is written back.

        Args:
            doi_id: article identifier passed on to the XML helper methods.
            xml_file: path of the XML file to read and then overwrite.
            filenames: not used by this method.
            new_filenames: iterable of file names, checked for names ending
                in ``.pdf`` and ``.zip``.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            # Single parser lookup; only fetch a replacement when missing
            pub_date = parser.pub_date(soup)
            if pub_date is None:
                # add the published date to the XML
                pub_date = self.get_pub_date_if_missing(doi_id)
                root = self.add_pub_date_to_xml(doi_id, pub_date, root)

            if parser.volume(soup) is None:
                # Get the pub-date year (first element) to calculate the volume
                year = pub_date[0]
                volume = year - 2011
                # NOTE(review): result is not assigned back to root, unlike
                # the other helpers - presumably root is changed in place
                self.add_volume_to_xml(doi_id, volume, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if pdf file then add self-uri tag; applies only when the parser
            # returned an empty, non-None self-uri value
            existing_self_uri = parser.self_uri(soup)
            if existing_self_uri is not None and len(existing_self_uri) == 0:
                for filename in new_filenames:
                    if filename.endswith('.pdf'):
                        root = self.add_self_uri_to_xml(doi_id, filename, root)

            # if ds.zip file is there, then add it to the xml
            # (the last matching name wins)
            poa_ds_zip_file = None
            for candidate in new_filenames:
                if candidate.endswith('.zip'):
                    poa_ds_zip_file = candidate
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

        # Start the file output
        reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        # Use a context manager so the handle is closed even on a failed write
        with open(xml_file, 'wb') as output_file:
            output_file.write(reparsed_string)
    def rewrite_xml_file(self, xml_filename, file_name_map):
        """Convert xlink href values in an XML file in the tmp dir, in place.

        Args:
            xml_filename: file name, joined onto ``self.get_tmp_dir()``.
            file_name_map: mapping passed to ``xmlio.convert_xlink_href``.
        """

        local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)

        xmlio.register_xmlns()
        root = xmlio.parse(local_xml_filename)

        # Convert xlink href values; the return value is not needed here
        xmlio.convert_xlink_href(root, file_name_map)

        # Start the file output
        reparsed_string = xmlio.output(root)
        # The context manager closes the file even if the write fails
        with open(local_xml_filename, 'wb') as output_file:
            output_file.write(reparsed_string)
Ejemplo n.º 6
0
def convert_xml(xml_file, file_name_map):
    """Convert xlink href values in ``xml_file`` and overwrite the file.

    The doctype dict returned by the parser is passed back to the output
    call when the file is serialised.

    Args:
        xml_file: path of the XML file to read and then overwrite.
        file_name_map: mapping passed to ``xmlio.convert_xlink_href``.
    """

    # Register namespaces
    xmlio.register_xmlns()

    root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

    # Convert xlink href values
    xmlio.convert_xlink_href(root, file_name_map)
    # TODO - compare whether all file names were converted

    # Start the file output
    reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

    # The context manager closes the file even if the write fails
    with open(xml_file, 'wb') as output_file:
        output_file.write(reparsed_string)
Ejemplo n.º 7
0
    def convert_xml(self, xml_file, file_name_map):
        """Convert xlink href values in ``xml_file`` and overwrite the file.

        The doctype dict returned by the parser is passed back to the output
        call when the file is serialised.

        Args:
            xml_file: path of the XML file to read and then overwrite.
            file_name_map: mapping passed to ``xmlio.convert_xlink_href``.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root, doctype_dict = xmlio.parse(xml_file, return_doctype_dict=True)

        # Convert xlink href values
        xmlio.convert_xlink_href(root, file_name_map)
        # TODO - compare whether all file names were converted

        # Start the file output
        reparsed_string = xmlio.output(root, type=None, doctype_dict=doctype_dict)

        # The context manager closes the file even if the write fails
        with open(xml_file, 'wb') as output_file:
            output_file.write(reparsed_string)
    def rewrite_xml_file(self, xml_filename, file_name_map):
        """Convert xlink href values in an XML file in the tmp dir, in place.

        The doctype dict returned by the parser is passed back to the output
        call when the file is serialised.

        Args:
            xml_filename: file name, joined onto ``self.get_tmp_dir()``.
            file_name_map: mapping passed to ``xmlio.convert_xlink_href``.
        """

        local_xml_filename = path.join(self.get_tmp_dir(), xml_filename)

        xmlio.register_xmlns()
        root, doctype_dict = xmlio.parse(local_xml_filename,
                                         return_doctype_dict=True)

        # Convert xlink href values; the return value is not needed here
        xmlio.convert_xlink_href(root, file_name_map)

        # Start the file output
        reparsed_string = xmlio.output(root,
                                       type=None,
                                       doctype_dict=doctype_dict)
        # The context manager closes the file even if the write fails
        with open(local_xml_filename, 'wb') as output_file:
            output_file.write(reparsed_string)
    def convert_xml(self, doi_id, xml_file, filenames, new_filenames):
        """Rewrite the article XML file in place, filling in PoA metadata.

        The file is parsed, modified only when the parser reports it as a
        PoA article, stripped of newline and tab characters, and written
        back over ``xml_file``.

        Args:
            doi_id: article identifier passed on to the XML helper methods.
            xml_file: path of the XML file to read and then overwrite.
            filenames: not used by this method.
            new_filenames: iterable of file names, checked for names ending
                in ``.zip``.
        """

        # Register namespaces
        xmlio.register_xmlns()

        root = xmlio.parse(xml_file)

        soup = self.article_soup(xml_file)

        if parser.is_poa(soup):
            # Capitalise subject group values in article categories
            root = self.subject_group_convert_in_xml(root)

            if parser.pub_date(soup) is None:
                # add the published date to the XML
                root = self.add_pub_date_to_xml(doi_id, root)

            # set the article-id, to overwrite the v2, v3 value if present
            root = self.set_article_id_xml(doi_id, root)

            # if ds.zip file is there, then add it to the xml
            # (when several names match, the last one is used)
            poa_ds_zip_file = None
            for new_filename in new_filenames:
                if new_filename.endswith('.zip'):
                    poa_ds_zip_file = new_filename
            if poa_ds_zip_file:
                root = self.add_poa_ds_zip_to_xml(doi_id, poa_ds_zip_file, root)

        # Start the file output
        reparsed_string = xmlio.output(root)

        # Remove extra whitespace here for PoA articles to clean up and one VoR file too
        reparsed_string = reparsed_string.replace("\n", '').replace("\t", '')

        # The context manager closes the file even if the write fails
        with open(xml_file, 'wb') as output_file:
            output_file.write(reparsed_string)
Ejemplo n.º 10
0
 def test_output(self, xml, type, xml_expected):
     """Parse the xml string and compare its output for type with xml_expected."""
     parsed_root = xmlio.parse(StringIO.StringIO(xml))
     self.assertEqual(xmlio.output(parsed_root, type), xml_expected)
Ejemplo n.º 11
0
 def test_input_output_forcing_jats_doctype(self, filename):
     """Output with the 'JATS' type and expect the sample file's raw bytes."""
     with open(sample_xml(filename), "rb") as handle:
         expected_bytes = handle.read()
     # The doctype dict is returned by parse but not passed to output
     root, _doctype_dict = xmlio.parse(sample_xml(filename),
                                       return_doctype_dict=True)
     actual_bytes = xmlio.output(root, 'JATS')
     self.assertEqual(actual_bytes, expected_bytes)
Ejemplo n.º 12
0
 def test_output_processing_instructions(self, xml, doc_type, xml_expected):
     """Parse xml bytes with both optional returns enabled and check the output.

     The doctype dict and processing instructions returned by parse are
     passed straight back to output, and the decoded result is compared
     with xml_expected.
     """
     parsed = xmlio.parse(BytesIO(xml), True, True)
     root, doctype_dict, processing_instructions = parsed
     output_bytes = xmlio.output(
         root, doc_type, doctype_dict, processing_instructions)
     self.assertEqual(output_bytes.decode("utf-8"), xml_expected)
Ejemplo n.º 13
0
 def test_output(self, xml, doc_type, xml_expected):
     """Parse xml bytes and compare the decoded output for doc_type with xml_expected."""
     parsed_root = xmlio.parse(BytesIO(xml))
     decoded_output = xmlio.output(parsed_root, doc_type).decode("utf-8")
     self.assertEqual(decoded_output, xml_expected)