Example #1
0
class Merge(Debuggable):
    """
     Standalone Processing object which merges current  JATS/BITS XML file in to the Body of a BITS-XML document.

    """
    def __init__(self):
        """
        Read the command line, create the helper objects and parse the input XML file.

        The input file is looked up relative to the ``<path>`` argument.
        """
        cli = self.read_command_line()

        self.args = cli
        self.debug = Debug()
        self.settings = Settings(cli)
        self.gv = GV(self.settings)
        self.dr = cli.get('<path>')
        self.f = cli.get('<input_file>')
        self.scheme = cli.get('<scheme>')
        self.set_numbering_tags = cli.get('--set-numbering-tags')
        input_path = os.path.join(self.dr, self.f)
        self.tr = etree.parse(input_path)

        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Parse the command line with docopt, using the module docstring as the usage text.

        Returns
        -------
        arguments : dictionary
          Maps the names of the command-line elements to their parsed values.
        """
        arguments = docopt(__doc__, version='xmlMerge 0.0.1')
        return arguments

    def create_output_bits(self):
        """
        Write the BITS output file for the current input.

        When the output file is absent, a complete book is built from the current input. When it already
        exists, the current input is wrapped into a book-part and appended to the book-body of that file.
        The resulting tree is post-processed and serialized as UTF-8 XML.

        See Also
        --------
        create_book_part_bits, create_book_bits, process, do_file_io

        """
        existing_file = os.path.join(self.dr, self.gv.uuid)
        target_file = os.path.join(self.dr, os.path.basename(self.gv.uuid))

        if not os.path.isfile(existing_file):
            book = self.create_book_bits()
        else:
            book = etree.parse(existing_file)
            book_body = book.find(".//book-body")
            chapter = self.create_book_part_bits()
            book_body.append(chapter)
        book = self.process(book)

        serialized = etree.tostring(book,
                                    pretty_print=False,
                                    xml_declaration=True,
                                    encoding='UTF-8',
                                    standalone='yes')
        self.do_file_io(serialized, 'w', target_file)

    def create_output_jats(self):
        """
        Write the JATS output file for the current input.

        When the output file is absent, a new article is generated from the current input. Otherwise the
        children of the input body, its references and its footnotes are appended to the existing output file.
        An input without body or back matter is tolerated, and a ``body``, ``back``, ``ref-list`` or
        ``fn-group`` container missing from the existing file is created only when there is content to put
        into it.

        See Also
        --------
        create_journal_jats, get_xml_parts, process, do_file_io

        """
        fuf = os.path.join(self.dr, self.gv.uuid)
        pt = os.path.join(self.dr, os.path.basename(self.gv.uuid))

        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            f, bd, bk = self.get_xml_parts()

            def back_container(tag):
                """Return the ``back/<tag>`` element of the output tree, creating it (and ``back``) if absent."""
                found = trf.find(".//back/" + tag)
                if found is None:
                    back = trf.find(".//back")
                    if back is None:
                        back = etree.SubElement(trf.getroot(), 'back')
                    found = etree.SubElement(back, tag)
                return found

            sections = list(bd) if bd is not None else []
            if sections:
                bpf = trf.find(".//body")
                if bpf is None:
                    # Keep document order: a new body belongs directly before the back matter.
                    bpf = etree.Element('body')
                    back = trf.find(".//back")
                    if back is not None:
                        back.addprevious(bpf)
                    else:
                        trf.getroot().append(bpf)
                for sec in sections:
                    bpf.append(sec)

            # An input file without back matter simply contributes no references or footnotes.
            if bk is not None:
                refs = bk.findall('.//ref-list/ref')
                if refs:
                    bkrf = back_container('ref-list')
                    for r in refs:
                        bkrf.append(r)

                footnotes = bk.findall('.//fn-group/fn')
                if footnotes:
                    bkff = back_container('fn-group')
                    for fn in footnotes:
                        bkff.append(fn)
        else:
            trf = self.create_journal_jats()

        trf = self.process(trf)
        self.do_file_io(
            etree.tostring(trf,
                           pretty_print=False,
                           xml_declaration=True,
                           encoding='UTF-8',
                           standalone='yes'), 'w', pt)

    def process(self, tr):
        """
        Process  BITS-XML file and do all transformations into the elementtree

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            transformed element tree

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()

        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(','),
                                        tr) if self.set_numbering_tags else tr

        self.set_book_part_attributes(tr)

        return tr

    def set_book_part_attributes(self, tr):
        """
        Mark every book-part as a chapter and give it a running id.

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            the same tree; each ``book-part`` carries ``id="ch_<n>"`` (n counted from 0 in document order)
            and ``book-part-type="chapter"``

        """
        for position, part in enumerate(tr.findall('.//book-part')):
            part.set('id', 'ch_%d' % position)
            part.set('book-part-type', 'chapter')
        return tr

    def create_metadata_path(self, metadata):
        """
        creates the correct folder path for the metadata file. Metadata files should be in a folder : metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.gv.uuid))
        file_name = [name, '.', metadata, '.', 'xml']
        p.append('metadata')
        p.append(''.join(file_name))

        pth = os.sep.join(p)
        self.debug.print_debug(self, 'merging headers' + str(pth))
        return pth

    def get_module_name(self):
        """
        Name under which this module identifies itself for debugging and logging.

        Returns
        -------
        name string
         Always ``'merge'``
        """
        return 'merge'

    def create_book_bits(self):
        """
        Create a full BITS XML book from the current input and add the book metadata.

        The ``--metadata`` argument is mandatory; the process exits when it is undefined. A metadata file
        that is missing, or that contains no ``book-meta`` element, is reported on the console and the book
        is built without metadata.

        Returns
        -------
        book : elementtree
            ``book`` element holding the optional ``book-meta`` and a ``book-body`` with one ``book-part``.

        See Also
        ---------
        create_metadata_path, create_book_part_bits

        """
        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        book = etree.Element(etree.QName('book'), nsmap=nsmap)
        book.attrib['dtd-version'] = "2.1"
        book.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"
        book.attrib['book-type'] = "proceedings"

        metadata = self.args.get('--metadata')
        if not metadata:
            sys.exit('Metadata argument undefined')

        pth = self.create_metadata_path(metadata)
        self.debug.print_console(self, 'merging headers' + str(pth))
        if os.path.isfile(pth):
            meta_root = etree.parse(pth).getroot()
            bp = meta_root.find('.//book-meta')
            # find() only searches below the root, so also accept a file whose root *is* book-meta.
            if bp is None and meta_root.tag == 'book-meta':
                bp = meta_root
            if bp is not None:
                book.insert(0, bp)
            else:
                self.debug.print_console(
                    self, 'no book-meta element found in ' + str(pth))
        else:
            # A missing metadata file is reported but deliberately not fatal.
            self.debug.print_console(
                self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + str(pth))

        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)

        return book

    def create_journal_jats(self):
        """
        Create a full JATS XML article from the current input and optionally replace its front matter.

        With ``--metadata`` the root of the metadata file is used as front matter when it is a ``front``
        element; any other root terminates the process. A metadata file that does not exist leaves the
        article without front matter. Without ``--metadata`` the front matter of the input is reused when
        it exists. Missing body or back matter in the input is replaced by an empty ``body`` and by a
        ``back`` holding an empty ``fn-group`` and ``ref-list``.

        Returns
        -------
        journal : elementtree
            ``article`` element with optional front matter, body and back matter.

        See Also
        ---------
        create_metadata_path, get_xml_parts

        """
        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        journal = etree.Element(etree.QName('article'), nsmap=nsmap)
        journal.attrib['dtd-version'] = "3.0"
        journal.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"

        f, bd, bk = self.get_xml_parts()

        metadata = self.args.get('--metadata')

        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                bpm = etree.parse(pth).getroot()
                if bpm.tag == 'front':
                    journal.insert(0, bpm)
                else:
                    self.debug.print_debug(self,
                                           'front metadata unspecified')
                    sys.exit(1)
        elif f is not None:
            journal.insert(0, f)

        # Later merges append into the body, so the article always gets one.
        journal.append(bd if bd is not None else etree.Element(etree.QName('body')))

        if bk is not None and len(bk) > 0:
            journal.append(bk)
        else:
            # Absent or empty back matter: provide the containers that later merges append to.
            back = etree.Element(etree.QName('back'))
            back.append(etree.Element(etree.QName('fn-group')))
            back.append(etree.Element(etree.QName('ref-list')))
            journal.append(back)
        return journal

    def create_book_part_bits(self):
        """
        Wrap front matter, body and back matter of the current input into a BITS book-part.

        Returns
        -------
        bp : elementtree
            ``book-part`` element; parts missing from the input and a front element without children are
            left out
        """
        front, body, back = self.get_xml_parts()
        book_part = etree.Element("book-part")

        # A front element without children carries no metadata worth copying.
        if front is not None and len(front):
            book_part.append(front)
        for section in (body, back):
            if section is not None:
                book_part.append(section)
        return book_part

    def get_xml_parts(self):
        """
        Returns  the front-matter , body and back-matter of a JATS XML file in the above order

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree
        bd : elementtree
            Body of JATS elementTree
        bk : elementtree
            Back-matter of JATS elementTree

        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path

        Parameters
        ----------
        s: str
            Content to be written or None for read
        mode: str
            w for write , r for r
        pth : str
            Path to the file to be read or written

        Raises
        ------
        IOError
            I/O operation fails

        """
        try:
            w = open(pth, mode)
            if mode == 'w':
                w.write(s.rstrip('\r\n'))
                w.close()
            if mode == 'r':
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
         Runs the configuration on the processing object. Process  JATS-XML file and merges it into the full BITS-XML file

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used

        """

        self.gv.create_dirs_recursive(self.dr.split('/'))
        if self.scheme == 'bits':
            self.create_output_bits()

        elif self.scheme == 'jats':
            self.tr = self.create_output_jats()
Example #2
0
class Prepare(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused references,
    set numbering, add unique ids to certain tag types, sort references

    """
    def __init__(self):
        """
        Read the command line, create the helper objects and parse the input XML file.

        The input file is looked up relative to the ``<path>`` argument.
        """
        cli = self.read_command_line()
        self.args = cli
        self.debug = Debug()
        self.settings = Settings(cli)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.stand_alone = self.args.get('--stand-alone')
        input_path = os.path.join(self.dr, self.f)
        self.tr = etree.parse(input_path)

    @staticmethod
    def read_command_line():
        """
        Parse the command line with docopt, using the module docstring as the usage text.

        Returns
        -------
        arguments : dictionary
          Maps the names of the command-line elements to their parsed values.
        """
        arguments = docopt(__doc__, version='xml 0.1')
        return arguments

    def citations_to_references(self):
        """ Turns every mixed-citation into a paragraph of a new "References" section of the body.

        When at least one mixed-citation exists, the first ref-list of the document is removed and an
        empty ref-list is appended to the back matter. Without mixed-citations the tree stays untouched.

        Returns
         -------
         tr : elementtree

        """
        root = self.tr.getroot()
        body = root.find('.//body')

        section = etree.Element('sec')
        heading = etree.Element('title')
        heading.text = 'References'
        section.append(heading)

        citations = root.findall('.//mixed-citation')
        if len(citations) <= 0:
            return self.tr

        for citation in citations:
            citation.tag = 'p'
            section.append(citation)
        body.append(section)

        ref_list = root.find('.//ref-list')
        ref_list.getparent().remove(ref_list)
        back = root.find('.//back')
        back.append(etree.Element('ref-list'))

        return self.tr

    def clean_references(self):
        """ Removes references which are not linked and unwraps citations whose reference is missing.

         The method takes no parameters and works on ``self.tr`` in two passes.

         Returns
         -------
         tr : elementtree

         See Also
         --------
         remove_element, remove_tags

        """
        r = self.tr.getroot()

        # Pass 1: drop every ref of the back matter that has no id, or whose id is not the rid of
        # any bibliographic cross reference.
        for e in r.findall('.//back/ref-list/ref'):
            if e.attrib.get('id'):
                if r.find(".//xref[@ref-type='bibr'][@rid='" +
                          e.attrib.get('id') + "']") is None:
                    self.remove_element(e)
            else:
                self.remove_element(e)
        # Pass 2: handle bibliographic cross references that point to no remaining ref.
        # NOTE(review): an xref without a rid attribute makes the string concatenation below
        # fail (str + None) -- confirm such input cannot occur.
        for e in r.findall(".//xref[@ref-type='bibr']"):
            if r.find(".//back/ref-list/ref[@id='" + e.attrib.get('rid') +
                      "']") is None:
                if e.getparent() is not None:
                    # NOTE(review): this walks the whole subtree of the parent and hands *every*
                    # bibliographic xref in it to remove_tags, not only `e`, and it removes
                    # elements while iterating -- confirm that this is intended.
                    for c in e.getparent().getiterator():
                        if c.tag == 'xref' and c.attrib.get(
                                'ref-type') == 'bibr':
                            self.remove_tags(c)
        return self.tr

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text

        The text and the tail of the element are merged into the tail of the previous sibling or, when the
        element is the first child, into the text of the parent. The element is then removed. Child
        elements of ``e`` are discarded. An element without a parent is left untouched.

        Parameters
        ----------
        e : element
            Element to be replaced

        """
        parent = e.getparent()
        if parent is None:
            return

        # The tail has to be carried over by hand before the element is taken out of the tree.
        replacement = (e.text or '') + (e.tail or '')
        previous = e.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + replacement
        else:
            parent.text = (parent.text or '') + replacement
        parent.remove(e)

    def remove_element(self, e):
        """
        Detach an element from its parent; an element without a parent is left alone.

        Parameters
        ----------
        e : element
            Element to be removed

        """
        parent = e.getparent()
        if parent is None:
            return
        parent.remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Add unique id tags to  any of the sub-elements of the back matter

        Parameters
        ----------
        tags: list
         list of elements

        Returns
        -------
        tr : elementtree

        """
        for s in tags:
            f = {}
            ref_type = 'bibr' if s == 'ref' else s
            fns = self.tr.getroot().findall(''.join(
                ['.//xref/[@ref-type="', ref_type, '"]']))
            for i in fns:
                rid = ''.join(['bibd', str(uuid.uuid4())])
                f[i.attrib['rid']] = rid
                i.set('rid', rid)
            for m in list(f.keys()):
                n = self.tr.getroot().find(''.join(
                    ['.//' + s + '/[@id="', m, '"]']))
                if n is not None:
                    n.set('id', f[m]) if len(n) > 0 else ''
        return self.tr

    def set_numbering_values(self, tag, attr, value, count, range_list):
        """
        Adds numerical values to  a  tag  in arguments list

        Parameters
        ---------
        tag: str
            xml tag name
        attr: str
            attribute name
        value :str
            value name
        count : int
            current sequence number
        range_list : list
           lower and upper level for the  numbering

        See Also
        --------
        set_roman_numbers

        """
        searchTag = './/' + tag + '[@' + attr + '="' + value + '"]'
        elems = self.tr.getroot().findall(searchTag)
        range_count = 1
        for elem in elems:
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1

        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number

        Parameters
        ---------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number

        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count = i // integer
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Converts a given set of elements defined by range_array into roman numbers

        Parameters
        ---------
        count :int
        r_count : int
        range_list : list
            lower and upper level for the  numbering

        Returns
        -------
        val : str
        r_count: int

        See Also
        --------
        convert_int_to_roman

        """

        val = str(count)
        if int(range_list[0]) <= count <= int(range_list[1]):
            val = self.convert_int_to_roman(r_count).lower()
            r_count += 1
        else:
            val = str(count - r_count + 1)
        return val, r_count

    def merge_metadata(self, metadata):
        """
        Replace the front matter of the current file with the content of a metadata file.

        The ``book-part-meta`` element of the metadata file is used when present, otherwise its root when
        that is a ``front`` element. The replacement is inserted as first child of the parent of the body.
        A current file with no front element, or with an empty one, is left untouched and a debug message
        is emitted.

        Parameters
        ----------
        metadata : str
             suffix  of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        Notes
        -----
        The process exits with status 1 when the metadata file does not exist or provides neither a
        ``book-part-meta`` element nor a ``front`` root.

        See Also
        -------
        create_metadata_path

        """
        r = self.tr.getroot()

        pth = self.create_metadata_path(metadata)

        if not os.path.isfile(pth):
            self.debug.print_debug(
                self, pth + self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)
            sys.exit(1)

        fr = r.find('.//front')
        # find() yields None when there is no front element at all; handle it like an empty one.
        if fr is None or not len(fr):
            self.debug.print_debug(self, 'front metadata unspecified')
            return self.tr

        bg = r.find('.//body').getparent()
        fr.getparent().remove(fr)

        meta_root = etree.parse(pth).getroot()
        bpm = meta_root.find('.//book-part-meta')
        if bpm is None:
            if meta_root.tag != 'front':
                self.debug.print_debug(
                    self, 'front or bookpart metadata unspecified')
                sys.exit(1)
            bpm = meta_root
        bg.insert(0, bpm)
        return self.tr

    def create_metadata_path(self, metadata):
        """
        creates the correct folder path for the metadata file. Metadata files should be in a folder : metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        f = os.path.basename(self.f)
        name, ext = os.path.splitext(f)
        file_name = [name, '.', metadata, ext]

        if not self.stand_alone or not os.path.exists(os.sep.join(p)):
            del p[-4:]
        p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        return pth

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts the children of an element alphabetically, in place.

        The sort key of a child is the text of its first descendant for each tag of ``tag_list``, in that
        order. A missing descendant counts as an empty string, so such children come first. Children with
        equal keys keep their relative order.

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified

        """
        def sort_key(child):
            # Only the texts take part in the comparison; elements themselves are not orderable.
            return tuple(child.findtext(".//" + tag) or '' for tag in tag_list)

        elem[:] = sorted(elem, key=sort_key)

    def sort_references(self, tag_list):
        """
        Sort references based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/ref-list')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/fn-group')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def process(self):
        """
        Process  JATS-XML file and do all transformations into the elementtree

        See Also
        --------
        merge_metadata, set_numbering_tags,set_uuids_for_back_matter,sort_footnotes,sort_references,set_numbering_values

        """

        citations_to_references = self.args.get('--citations-to-references')
        clean_references = self.args.get('--clean-references')
        set_numbering_tags = self.args.get('--set-numbering-tags')
        set_unique_ids = self.args.get('--set-uuids')
        sort_footnotes = self.args.get('--sort-footnotes')
        sort_references = self.args.get('--sort-references')
        set_numbering_values = self.args.get('--set-numbering-values')

        metadata = self.args.get('--metadata')
        self.tr = self.merge_metadata(metadata) if metadata else self.tr

        self.tr = self.citations_to_references(
        ) if citations_to_references else self.tr
        self.tr = self.clean_references() if clean_references else self.tr
        self.tr = self.gv.set_numbering_tags(
            set_numbering_tags.split(','),
            self.tr) if set_numbering_tags else self.tr
        self.tr = self.set_uuids_for_back_matter(
            set_unique_ids.split(',')) if set_unique_ids else self.tr
        self.tr = self.sort_footnotes(
            sort_footnotes.split(',')) if sort_footnotes else self.tr
        self.tr = self.sort_references(
            sort_references.split(',')) if sort_references else self.tr

        for s in set_numbering_values.split(';'):
            vals = s.split(',')

            count = 1
            range_count = [0, 0]

            if len(vals) > 3:
                r = vals[3].lstrip('{').rstrip('}').split(':')
                range_count = [int(r[0]), int(r[1])]
            self.tr, count = self.set_numbering_values(vals[0], vals[1],
                                                       vals[2], count,
                                                       range_count)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(os.path.join(self.dr, os.path.basename(self.f)))

    def get_module_name(self):
        """
        Name under which this module identifies itself for debugging and logging.

        Returns
        -------
        name string
         Always ``'prepare'``
        """
        return 'prepare'

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path

        Parameters
        ----------
        pth : str
            Correct path of the metadata file in the folder structure

        Raises
        ------
        IOError
            I/O operation fails

        Notes
        -----
        Default configuration writes a normalized XML file with XML scheme

        """

        try:

            self.tr.write(pth, pretty_print=False, xml_declaration=True)
            print()
        except IOError as e:
            print(e)
            self.debug.print_debug(self, self.XML_FILE_NOT_CREATED)

    def run(self):
        """
        Runs the configuration on the processing object

        See Also
        --------
        process


        """
        self.process()
Example #3
0
class Merge(Debuggable):
    """
     Standalone Processing object which merges current  JATS/BITS XML file in to the Body of a BITS-XML document.

    """

    def __init__(self):
        """
        Read the command line, create the helper objects and parse the input XML file.

        The input file is looked up relative to the ``<path>`` argument.
        """
        cli = self.read_command_line()

        self.args = cli
        self.debug = Debug()
        self.gv = GV()
        self.uid = self.gv.uuid
        self.dr = cli.get("<path>")
        self.f = cli.get("<input_file>")
        self.scheme = cli.get("<scheme>")
        self.set_numbering_tags = cli.get("--set-numbering-tags")
        input_path = os.path.join(self.dr, self.f)
        self.tr = etree.parse(input_path)

        Debuggable.__init__(self, "Main")
        if self.args.get("--debug"):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Parse the command line with docopt, using the module docstring as the usage text.

        Returns
        -------
        arguments : dictionary
          Maps the names of the command-line elements to their parsed values.
        """
        arguments = docopt(__doc__, version="xmlMerge 0.0.1")
        return arguments

    def create_output_bits(self):
        """
        Write the BITS output file for the current input.

        When the output file is absent, a complete book is built from the current input. When it already
        exists, the current input is wrapped into a book-part and appended to the book-body of that file.
        The resulting tree is post-processed and serialized as pretty-printed UTF-8 XML.

        See Also
        --------
        create_book_part_bits, create_book_bits, process, do_file_io

        """
        existing_file = os.path.join(self.dr, self.uid)
        target_file = os.path.join(self.dr, os.path.basename(self.uid))

        if not os.path.isfile(existing_file):
            book = self.create_book_bits()
        else:
            book = etree.parse(existing_file)
            book_body = book.find(".//book-body")
            chapter = self.create_book_part_bits()
            book_body.append(chapter)
        book = self.process(book)

        serialized = etree.tostring(
            book, pretty_print=True, xml_declaration=True, encoding="UTF-8", standalone="yes"
        )
        self.do_file_io(serialized, "w", target_file)

    def process(self, tr):
        """
        Process  BITS-XML file and do all transformations into the elementtree

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            transformed element tree

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()

        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(","), tr) if self.set_numbering_tags else tr

        self.set_book_part_attributes(tr)

        return tr

    def set_book_part_attributes(self, tr):
        """
        Mark every book-part as a chapter and give it a running id.

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            the same tree; each ``book-part`` carries ``id="ch_<n>"`` (n counted from 0 in document order)
            and ``book-part-type="chapter"``

        """
        for position, part in enumerate(tr.findall(".//book-part")):
            part.set("id", "ch_%d" % position)
            part.set("book-part-type", "chapter")
        return tr

    def create_metadata_path(self, metadata):
        """
        creates the correct folder path for the metadata file. Metadata files should be in a folder : metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.uid))
        file_name = [name, ".", metadata, ext]
        p.append("metadata")
        p.append("".join(file_name))
        pth = os.sep.join(p)
        return pth

    def create_book_bits(self):
        """
        Create a full BITS XML book from the current input and optionally add the book metadata.

        With ``--metadata`` the ``book-meta`` element of the metadata file becomes the first child of the
        book. A metadata file that does not exist or that contains no ``book-meta`` element leaves the book
        without metadata.

        Returns
        -------
        book : elementtree
            ``book`` element holding the optional ``book-meta`` and a ``book-body`` with one ``book-part``.

        See Also
        ---------
        create_metadata_path, create_book_part_bits

        """
        nsmap = {
            "xlink": "http://www.w3.org/1999/xlink",
            "mml": "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }
        book = etree.Element(etree.QName("book"), nsmap=nsmap)
        book.attrib["dtd-version"] = "2.1"
        book.attrib[etree.QName("{http://www.w3.org/XML/1998/namespace}lang")] = "de"
        book.attrib["book-type"] = "proceedings"

        metadata = self.args.get("--metadata")
        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                meta_root = etree.parse(pth).getroot()
                bp = meta_root.find(".//book-meta")
                # find() only searches below the root, so also accept a file whose root *is* book-meta.
                if bp is None and meta_root.tag == "book-meta":
                    bp = meta_root
                if bp is not None:
                    book.insert(0, bp)
                else:
                    self.debug.print_debug(self, "no book-meta element found in " + str(pth))

        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)

        return book

    def create_book_part_bits(self):
        """
        Reads a JATS XML file and creates a book-part element tree according to BITS-XML.

        Front matter, body and back matter are taken from the current input. Parts missing from the input
        and a front element without children are left out.

        Returns
        -------
        bp : elementtree
            Book part elementTree
        """

        f, bd, bk = self.get_xml_parts()

        bp = etree.Element("book-part")

        # A front element without children carries no metadata worth copying.
        if f is not None and len(f):
            bp.append(f)
        # get_xml_parts() yields None for absent parts, and None cannot be appended.
        if bd is not None:
            bp.append(bd)
        if bk is not None:
            bp.append(bk)
        return bp

    def get_xml_parts(self):
        """
        Returns  the front-matter , body and back-matter of a JATS XML file in the above order

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree
        bd : elementtree
            Body of JATS elementTree
        bk : elementtree
            Back-matter of JATS elementTree

        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path

        Parameters
        ----------
        s: str
            Content to be written or None for read
        mode: str
            w for write , r for r
        pth : str
            Path to the file to be read or written

        Raises
        ------
        IOError
            I/O operation fails

        """
        try:
            w = open(pth, mode)
            if mode == "w":
                w.write(s)
                w.close()
            if mode == "r":
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
         Runs the configuration on the processing object. Process  JATS-XML file and merges it into the full BITS-XML file

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used

        """

        self.gv.create_dirs_recursive(self.dr.split("/"))
        if self.scheme == "bits":
            self.create_output_bits()

        elif self.scheme == "jats":
            self.tr = self.create_output_jats(self.tr)
Example #4
0
class Process(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused references,
    set numbering, add unique ids to certain tag types, sort references

    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
          A dictionary, where keys are names of command-line elements and values are the parsed values of those
          elements.
        """
        return docopt(__doc__, version='xml 0.1')

    def remove_references(self):
        """
        Removes references which are not cited, and unwraps citations whose target is missing.

        A ``ref`` below ``back/ref-list`` is removed when it has no ``id`` or when no
        ``xref`` with ``ref-type="bibr"`` points at it. Afterwards every such ``xref``
        whose ``rid`` is missing or matches no remaining ``ref`` is replaced by its text.
        Only the dangling ``xref`` itself is unwrapped; valid citations next to it are kept.

        Returns
        -------
        tr : elementtree

        See Also
        --------
        remove_element, remove_tags

        """
        root = self.tr.getroot()
        ref_path = './/back/ref-list/ref'

        # Collect the citations once instead of searching the tree for every reference.
        citations = root.findall(".//xref[@ref-type='bibr']")
        cited_ids = set(x.get('rid') for x in citations)

        for ref in root.findall(ref_path):
            ref_id = ref.get('id')
            if not ref_id or ref_id not in cited_ids:
                self.remove_element(ref)

        remaining_ids = set(
            ref.get('id') for ref in root.findall(ref_path) if ref.get('id'))
        for xref in citations:
            if xref.get('rid') not in remaining_ids:
                self.remove_tags(xref)
        return self.tr

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text

        The text content of the element (including text of nested elements) and its
        tail are appended to the tail of the previous sibling or, when the element is
        the first child, to the text of its parent. Elements without a parent are
        left untouched.

        Parameters
        ----------
        e : element
            Element to be replaced

        """
        parent = e.getparent()
        if parent is None:
            return

        # lxml drops the tail together with the removed element, so it has to be
        # carried over explicitly.
        text = ''.join(e.itertext()) + (e.tail or '')
        if text:
            previous = e.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + text
            else:
                parent.text = (parent.text or '') + text
        parent.remove(e)

    def remove_element(self, e):
        """
        Remove any element only if it has a parent

        Parameters
        ----------
        e : element
            Element to be removed

        """
        parent = e.getparent()
        if parent is not None:
            parent.remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Replaces the ids of back-matter elements, and the ``rid`` of the ``xref``
        elements pointing at them, with generated unique ids.

        For every tag name the ``xref`` elements with the matching ``ref-type``
        (``bibr`` for ``ref``, otherwise the tag name itself) get a new ``rid`` of the
        form ``bibd<uuid hex>``. All ``xref`` elements that shared an old ``rid`` share
        the new one, and the first element of that tag carrying the old ``id`` is
        renamed accordingly. ``xref`` elements without ``rid`` are skipped.

        Parameters
        ----------
        tags: list
         list of element names

        Returns
        -------
        tr : elementtree

        """
        root = self.tr.getroot()
        for tag in tags:
            ref_type = 'bibr' if tag == 'ref' else tag
            renamed = {}

            xref_path = './/xref[@ref-type="{0}"]'.format(ref_type)
            for xref in root.findall(xref_path):
                old_rid = xref.get('rid')
                if old_rid is None:
                    continue
                # One new id per old id keeps several citations of one target linked.
                if old_rid not in renamed:
                    renamed[old_rid] = 'bibd' + uuid.uuid4().hex
                xref.set('rid', renamed[old_rid])

            for old_rid, new_rid in renamed.items():
                target = root.find(
                    './/{0}[@id="{1}"]'.format(tag, old_rid))
                if target is not None:
                    target.set('id', new_rid)
        return self.tr

    def set_numbering_values(
            self,
            tag,
            attr,
            value,
            count,
            range_list):
        """
        Adds numerical values as text to every tag matching the attribute/value pair

        Parameters
        ---------
        tag: str
            xml tag name
        attr: str
            attribute name
        value :str
            attribute value
        count : int
            current sequence number
        range_list : list
           lower and upper level for the roman numbering

        Returns
        -------
        tr : elementtree
        count : int
            sequence number following the last numbered element

        See Also
        --------
        set_roman_numbers

        """
        search_path = './/' + tag + '[@' + attr + '="' + value + '"]'
        range_count = 1
        for elem in self.tr.getroot().findall(search_path):
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1

        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number

        The conversion walks over the (integer, numeral) pairs of
        ``self.gv.numeral_map`` in the order given there.

        Parameters
        ---------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number

        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count, i = divmod(i, integer)
            result.append(numeral * count)
        return ''.join(result)

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Returns the label for one element: roman inside the range, arabic outside

        Parameters
        ---------
        count :int
            current sequence number
        r_count : int
            next roman number to hand out
        range_list : list
            lower and upper level for the roman numbering

        Returns
        -------
        val : str
            lower-case roman number when ``count`` is inside the range, otherwise
            ``count - r_count + 1`` as a string
        r_count: int
            incremented when a roman number was handed out

        See Also
        --------
        convert_int_to_roman

        """
        if int(range_list[0]) <= count <= int(range_list[1]):
            return self.convert_int_to_roman(r_count).lower(), r_count + 1
        return str(count - r_count + 1), r_count

    def merge_metadata(self, metadata):
        """
        reads a metadata file path and merges its content into the metadata section

        The ``book-part-meta`` element of the metadata file is inserted as first
        child of the parent of ``body``; an existing ``front`` element is removed.
        The tree is only modified when the metadata file exists and both
        ``book-part-meta`` and ``body`` are found; otherwise a debug message is
        printed and the tree is returned unchanged.

        Parameters
        ----------
        metadata : str
             suffix  of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        -------
        create_metadata_path

        """
        pth = self.create_metadata_path(metadata)

        if not os.path.isfile(pth):
            self.debug.print_debug(self, pth +
                                   self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)
            return self.tr

        root = self.tr.getroot()
        book_part_meta = etree.parse(pth).find('.//book-part-meta')
        body = root.find('.//body')

        # Validate before touching the tree, so a bad metadata file cannot
        # leave the document without any front matter.
        if book_part_meta is None or body is None:
            self.debug.print_debug(
                self,
                pth + ' could not be merged: book-part-meta or body is missing')
            return self.tr

        front = root.find('.//front')
        if front is not None:
            front.getparent().remove(front)
        body.getparent().insert(0, book_part_meta)

        return self.tr

    def create_metadata_path(self, metadata):
        """
        creates the correct folder path for the metadata file. Metadata files should be in a folder : metadata

        The last four components of the directory of ``self.f`` are dropped,
        ``metadata`` is appended as folder, and the file name becomes
        ``<name>.<metadata><ext>``.

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        parts = os.path.dirname(self.f).split(os.sep)
        del parts[-4:]
        name, ext = os.path.splitext(os.path.basename(self.f))
        parts.append('metadata')
        parts.append(''.join([name, '.', metadata, ext]))
        return os.sep.join(parts)

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts the children of an element alphabetically, in place

        The children are ordered by the text of the first descendant of each tag
        in ``tag_list``, in that order of priority. Children lacking a tag sort
        before those having it, and children with equal keys keep their document
        order. Nothing happens when ``elem`` is None.

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified

        """
        if elem is None:
            return

        def sort_key(child):
            key = []
            for tag in tag_list:
                text = child.findtext('.//' + tag)
                # Missing text cannot be compared with strings on Python 3;
                # the flag keeps missing values first, as Python 2 ordered them.
                key.append((text is not None, text or ''))
            return tuple(key)

        # Sorting on the key alone never compares the elements themselves.
        elem[:] = sorted(elem, key=sort_key)

    def sort_references(self, tag_list):
        """
        Sort references based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        self.sort_by_tags(tag_list, self.tr.find('./back/ref-list'))
        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        self.sort_by_tags(tag_list, self.tr.find('./back/fn-group'))
        return self.tr

    def _apply_numbering_spec(self, spec):
        """
        Applies one ``tag,attr,value[,{lower:upper}]`` numbering specification

        Parameters
        ----------
        spec : str
            a single entry of the ``--set-numbering-values`` option

        """
        vals = spec.split(',')
        if len(vals) < 3:
            self.debug.print_debug(
                self, 'Ignoring malformed numbering specification: ' + spec)
            return

        range_list = [0, 0]
        if len(vals) > 3:
            bounds = vals[3].lstrip('{').rstrip('}').split(':')
            range_list = [int(bounds[0]), int(bounds[1])]

        # Every specification starts counting at 1 again.
        self.tr, _ = self.set_numbering_values(
            vals[0], vals[1], vals[2], 1, range_list)

    def process(self):
        """
        Process  JATS-XML file and do all transformations into the elementtree

        Each transformation runs only when its command line option is set. The
        resulting tree is written to ``self.dr`` under the base name of the input file.

        See Also
        --------
        merge_metadata, set_numbering_tags,set_uuids_for_back_matter,sort_footnotes,sort_references,set_numbering_values

        """
        args = self.args

        metadata = args.get('--metadata')
        if metadata:
            self.tr = self.merge_metadata(metadata)

        if args.get('--clean-references'):
            self.tr = self.remove_references()

        numbering_tags = args.get('--set-numbering-tags')
        if numbering_tags:
            self.tr = self.gv.set_numbering_tags(
                numbering_tags.split(','), self.tr)

        unique_ids = args.get('--set-uuids')
        if unique_ids:
            self.tr = self.set_uuids_for_back_matter(unique_ids.split(','))

        footnote_tags = args.get('--sort-footnotes')
        if footnote_tags:
            self.tr = self.sort_footnotes(footnote_tags.split(','))

        reference_tags = args.get('--sort-references')
        if reference_tags:
            self.tr = self.sort_references(reference_tags.split(','))

        # The option is optional; without it there is nothing to number.
        numbering_values = args.get('--set-numbering-values')
        if numbering_values:
            for spec in numbering_values.split(';'):
                self._apply_numbering_spec(spec)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(
            os.path.join(self.dr, os.path.basename(self.f)))

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path

        An ``IOError`` raised while writing is caught: the error is printed and
        a debug message is emitted instead of propagating the exception.

        Parameters
        ----------
        pth : str
            Path of the XML file to be written

        Notes
        -----
        The file is written with an XML declaration and without pretty printing

        """
        try:
            self.tr.write(
                pth,
                pretty_print=False,
                xml_declaration=True
            )
            # Blank line on stdout after a successful write (legacy behaviour).
            print('')
        except IOError as e:
            print(e)
            # NOTE(review): the constant used to be read from ``self`` only, while
            # the other message constant in this class lives on ``self.gv`` --
            # confirm where XML_FILE_NOT_CREATED is actually defined.
            message = getattr(self, 'XML_FILE_NOT_CREATED', None)
            if message is None:
                message = getattr(
                    self.gv, 'XML_FILE_NOT_CREATED', 'XML file not created')
            self.debug.print_debug(self, message)

    def run(self):
        """
        Runs the configuration on the processing object

        See Also
        --------
        process


        """
        self.process()