def run(self):
        """
        Handle Zotero (and EndNote-style) in-line reference items

        The matching reference objects are gathered with get_object_list before any of the
        drop_addin / drop_addin_json calls are made, and the bibliography is handled last.

        @return: the list of reference objects gathered before the addins were processed
        """
        endnote_xpath = '//tei:ref[@rend="ref"]'
        any_ref_xpath = '//tei:ref'
        csl_marker = ' ADDIN ZOTERO_ITEM CSL_CITATION'

        manipulator = TeiManipulate(self.gv)

        # gather the references first; the drop_* calls below operate on the same markers
        stashed = manipulator.get_object_list(endnote_xpath, ' ADDIN EN.CITE', u'zoterobiblio')
        stashed += manipulator.get_object_list(any_ref_xpath, csl_marker, u'zoterobiblio')

        manipulator.drop_addin(endnote_xpath, ' ADDIN EN.CITE', 'EndNote',
                               'hi', 'reference_to_link', self, u'zoterobiblio', True)

        # the full CSL marker is processed before the shorter, more general ZOTERO_ITEM marker
        for marker in (csl_marker, ' ADDIN ZOTERO_ITEM'):
            manipulator.drop_addin_json(any_ref_xpath, marker,
                                        'hi', 'reference_to_link', self)

        # handle bibliography
        self.handle_bibliography(manipulator)

        total = len(stashed)
        if total > 0:
            self.debug.print_debug(self, u'Stashed {0} references for bibliography parsing'.format(total))

        return stashed
Esempio n. 2
0
    def pre_clean(self):
        """
        Remove leading body lines that repeat already-extracted metadata

        After calling extract_metadata_fields, the descendants of the TEI body are scanned in
        the order returned by the XPath query. Only head, p and cit elements are inspected. An
        element is removed when its stripped text contains every component of an author that
        has not been matched yet, or when it contains any of the values in self.metadata.
        Scanning stops once more than two inspected elements have been kept. The tree is saved
        at the end.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # get all elements in the body
        section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

        items_to_match = ['{http://www.tei-c.org/ns/1.0}head', '{http://www.tei-c.org/ns/1.0}p',
                          '{http://www.tei-c.org/ns/1.0}cit']

        # number of inspected elements that were kept (removed lines do not count)
        count = 0

        matched_authors = []

        for item in section:
            if count > 2:
                break

            if item.tag not in items_to_match:
                continue

            count += 1
            text = self.get_stripped_text(item)

            message = None

            # each author may only account for a single removed line
            for author in self.authors:
                if author in matched_authors:
                    continue

                if all(component in text for component in author):
                    matched_authors.append(author)
                    message = (u'Removed line "{0}" '
                               u'because it appears to be author metadata')
                    break

            # a single match is enough: the element is detached on removal, so removing it once
            # per matching value would call remove() on a parent that is already None
            if message is None and any(metadata in text for metadata in self.metadata):
                message = (u'Removed line "{0}" '
                           u'because it appears to be duplicated metadata')

            if message is None:
                continue

            # found a metadata line
            count -= 1
            item.getparent().remove(item)
            self.debug.print_debug(self, message.format(text))

        manipulate.save_tree(tree)
Esempio n. 3
0
    def run(self):
        """
        Handle Mendeley reference tags (fields marked 'ADDIN CSL_CITATION')

        The matching reference objects are gathered first, then drop_addin_json is applied to
        the same marker and the bibliography is handled.

        @return: the list of reference objects gathered before the addins were processed
        """
        marker = 'ADDIN CSL_CITATION'
        manipulator = TeiManipulate(self.gv)

        # note: gathering is restricted to rend="ref" while the drop below targets every tei:ref
        stashed = manipulator.get_object_list('//tei:ref[@rend="ref"]', marker, u'zoterobiblio')

        manipulator.drop_addin_json('//tei:ref', marker, 'hi', 'reference_to_link', self)

        self.handle_bibliography(manipulator)

        total = len(stashed)
        if total > 0:
            self.debug.print_debug(self, u'Stashed {0} references for bibliography parsing'.format(total))

        return stashed
Esempio n. 4
0
    def run(self):
        """
        Handle every remaining (unknown) type of addin

        Any element matching the generic ' ADDIN' marker is gathered and then passed through
        drop_addin. The last argument given to drop_addin is True only when the
        'drop-unknown-addins' setting is exactly the string 'True'.

        @return: the list of addin objects gathered before drop_addin was called
        """
        everything = '//*'
        marker = ' ADDIN'

        manipulator = TeiManipulate(self.gv)
        found = manipulator.get_object_list(everything, marker, u'addin')

        # the setting is compared as text, so only the literal 'True' enables it
        should_drop = self.gv.setting('drop-unknown-addins') == 'True'

        manipulator.drop_addin(everything, marker, 'EndNote',
                               'hi', 'unknown_addin_text', self, u'addin',
                               should_drop)

        handled = len(found)
        if handled > 0:
            self.debug.print_debug(self, u'Handled {0} unknown addin tags'.format(handled))

        return found
Esempio n. 5
0
    def pre_cleanup(self):
        """
        Free headed divs from disallowed wrappers and split styled hi runs into their own p

        Step one: for every tei:div that has a tei:head child, its ancestors are visited from
        the nearest one upwards. Ancestors that are neither tei:div nor tei:body are renamed to
        'REMOVE' until an allowed ancestor is reached. If anything was renamed, the 'REMOVE'
        tags are stripped with etree.strip_tags and the tree is saved.

        Step two: for every tei:p that has a direct tei:hi child whose rend contains "Indent",
        "Default Style" or "Text Body", each such hi is moved into a new p element placed
        after the original paragraph (in document order). The tree is saved after each paragraph.
        """
        tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        permitted = ['{http://www.tei-c.org/ns/1.0}div', '{http://www.tei-c.org/ns/1.0}body']

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # make sure that head elements are not encapsulated within any elements that will stop
        # them from being correctly transformed by the XSL
        renamed = 0

        for headed_div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
            ancestor = headed_div.getparent()

            while ancestor is not None:
                tag = ancestor.tag

                if tag:
                    if tag in permitted:
                        # reached a legitimate container; nothing above it needs unwrapping
                        break

                    ancestor.tag = 'REMOVE'
                    renamed += 1

                ancestor = ancestor.getparent()

        if renamed > 0:
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(renamed))

        # split any p tags with sub-tags hi rend="Indent" into new elements
        styled_hi = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                     'contains(@rend, "Text Body")]')

        for paragraph in tree.xpath('//tei:p[' + styled_hi + ']', namespaces=tei_ns):
            # each new p is inserted after the previous one so document order is kept
            anchor = paragraph

            for hi in paragraph.xpath(styled_hi, namespaces=tei_ns):
                fresh = etree.Element('p')

                rend = paragraph.get('rend')
                if rend is not None:
                    fresh.set('rend', rend)

                anchor.addnext(fresh)
                fresh.append(hi)
                anchor = fresh

            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(paragraph)))
Esempio n. 6
0
    def pre_cleanup(self):
        """
        Unwrap headed divs from invalid parents, then break styled hi children out of p tags

        Headings: each tei:div containing a tei:head has its ancestor chain walked upwards.
        Every ancestor whose tag is not tei:div or tei:body is renamed 'REMOVE'; the walk ends
        at the first allowed ancestor. When at least one rename happened the 'REMOVE' tags are
        stripped via etree.strip_tags and the tree is saved.

        Paragraphs: each tei:p with a direct tei:hi child whose rend contains "Indent",
        "Default Style" or "Text Body" has those hi children moved, one per new p element,
        to follow the paragraph. The paragraph's rend attribute, if any, is copied onto each
        new p. The tree is saved once per processed paragraph.
        """
        namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # make sure that head elements are not encapsulated within any elements that will stop
        # them from being correctly transformed by the XSL
        allowed = [
            '{http://www.tei-c.org/ns/1.0}div',
            '{http://www.tei-c.org/ns/1.0}body'
        ]

        divs_with_head = tree.xpath('//tei:div[tei:head]', namespaces=namespaces)

        extracted = 0

        for div in divs_with_head:
            wrapper = div.getparent()

            while wrapper is not None:
                wrapper_tag = wrapper.tag

                if wrapper_tag and wrapper_tag in allowed:
                    break

                if wrapper_tag:
                    # mark the wrapper so that strip_tags can dissolve it afterwards
                    wrapper.tag = 'REMOVE'
                    extracted += 1

                wrapper = wrapper.getparent()

        if extracted > 0:
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(
                self,
                u'Extracted {0} headings from inside invalid elements'.format(
                    extracted))

        # split any p tags with sub-tags hi rend="Indent" into new elements
        hi_query = (
            'tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
            'contains(@rend, "Text Body")]')
        p_query = '//tei:p[' + hi_query + ']'

        for source_p in tree.xpath(p_query, namespaces=namespaces):
            # insertion point advances so the new p elements keep the order of the hi children
            insert_after = source_p

            for styled in source_p.xpath(hi_query, namespaces=namespaces):
                split_p = etree.Element('p')

                inherited_rend = source_p.get('rend')
                if inherited_rend is not None:
                    split_p.set('rend', inherited_rend)

                insert_after.addnext(split_p)
                split_p.append(styled)
                insert_after = split_p

            manipulate.save_tree(tree)
            self.debug.print_debug(
                self, u'Separated out p {0}'.format(
                    manipulate.get_stripped_text(source_p)))
Esempio n. 7
0
    def run_modules(self):
        """
        Run the processing pipeline selected by the parsed command-line arguments

        When 'bibscan' is set only the bibliography database scan is run. Otherwise the input
        is converted to TEI (through unoconv first for doc/odt/other input), the classifiers
        and addin handlers are run, the TEI is transformed to NLM and the result is
        post-processed. The method returns early after the TEI stage when '--puretei' is set
        and after the NLM transform when '--purenlm' is set.

        @return: None
        """
        ag = int(self.gv.settings.args['--aggression'])
        self.debug.print_debug(
            self, u'Running at aggression level {0} {1}'.format(
                ag, "[grrr!]" if ag == 10 else ""))

        if ag > 10:
            self.debug.print_debug(
                self,
                "WARNING: safety bail-out features are disabled at aggression level 11"
            )

        if self.args['bibscan']:
            BibliographyDatabase(self.gv).scan()
            return

        # check for stylesheets
        self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

        # metadata file
        # (stored on self.gv: no bare 'gv' name is bound in this method, and every other
        # access to the global variables object goes through self.gv)
        self.gv.metadata_file = self.set_metadata_file()

        self.gv.mk_dir(self.gv.output_folder_path)

        if self.args['doc']:
            # run doc to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('doc')
            DocxToTei(self.gv).run(True, self.args['--proprietary'])
        elif self.args['odt']:
            # run odt to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('odt')
            DocxToTei(self.gv).run(True, self.args['--proprietary'])
        elif self.args['other']:
            # run other unoconv-supported format to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('unoconv')
            DocxToTei(self.gv).run(True, self.args['--proprietary'])
        elif self.args['docx']:
            # run docx to tei conversion
            # includes hooks for proprietary transforms if enabled
            DocxToTei(self.gv).run(True, self.args['--proprietary'])
        elif self.args['docxextracted']:
            self.debug.print_debug(self, u'Skipping docx extraction')
            DocxToTei(self.gv).run(False, self.args['--proprietary'])
        elif self.args['tei']:
            self.debug.print_debug(
                self, u'Skipping docx extraction; processing TEI file')
            DocxToTei(self.gv).run(False,
                                   self.args['--proprietary'],
                                   tei=True)

        if self.args['--puretei']:
            self.debug.print_debug(self,
                                   u'Exiting as TEI transform complete')
            return

        metadata = Metadata(self.gv)
        metadata.pre_clean()

        # run size classifier
        # aggression 5
        SizeClassifier(self.gv).run()

        # run bibliographic addins handler
        # aggression 4
        found_bibliography = BibliographyAddins(self.gv).run()

        # run list classifier
        # aggression 4
        ListClassifier(self.gv).run()

        # created up front because it is also needed for the interactive prompt further down
        bibliography_classifier = BibliographyClassifier(self.gv)

        if not found_bibliography:
            # run bibliographic classifier
            # aggression 4
            bibliography_classifier.run()

        # tei
        # aggression 3
        TeiManipulate(self.gv).run()

        # run tei to nlm conversion
        TeiToNlm(self.gv).run(not found_bibliography)

        if self.gv.settings.args['--purenlm']:
            self.debug.print_debug(self,
                                   u'Exiting as NLM transform complete')
            return

        manipulate = NlmManipulate(self.gv)

        if not self.gv.used_list_method:
            manipulate.fuse_references()

        # run reference linker
        if not self.args['--nolink']:
            rl = ReferenceLinker(self.gv)
            rl.run(self.args['--interactive'])
            rl.cleanup()

        # run table and graphic caption classifiers; each only runs when the requested
        # aggression level exceeds the threshold configured for it
        cc = CaptionClassifier(self.gv)
        if int(self.args['--aggression']) > int(
                self.gv.settings.get_setting(
                    'tablecaptions', self, domain='aggression')):
            cc.run_tables()

        if int(self.args['--aggression']) > int(
                self.gv.settings.get_setting(
                    'graphiccaptions', self, domain='aggression')):
            cc.run_graphics()

        # run metadata merge
        metadata.run()

        if self.args['--interactive']:
            bibliography_classifier.run_prompt(True)

        # process any bibliography entries that are possible
        BibliographyDatabase(self.gv).run()

        # remove stranded titles and cleanup
        manipulate.final_clean()

        if self.args['--identifiers']:
            IdGenerator(self.gv).run()

        if self.args['--chain']:
            # construct and run an XSLT chainer
            XslChain(self.gv).run()

        if self.args['--clean']:
            ComplianceEnforcer(self.gv).run()
Esempio n. 8
0
    def pre_clean(self):
        """
        Strip lines at the start of the body that duplicate extracted metadata

        extract_metadata_fields is called first. The descendants of the TEI body are then
        visited in the order returned by the XPath query, looking only at head, p and cit
        elements. An element is removed when its stripped text contains every component of an
        author not matched so far, or any one of the values in self.metadata. The scan ends
        once more than two inspected elements have been kept, and the tree is saved.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # get all elements in the body
        section = tree.xpath('//tei:body//*',
                             namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

        items_to_match = [
            '{http://www.tei-c.org/ns/1.0}head',
            '{http://www.tei-c.org/ns/1.0}p',
            '{http://www.tei-c.org/ns/1.0}cit'
        ]

        # counts the inspected elements that stayed in the document
        count = 0

        matched_authors = []

        for item in section:
            if count > 2:
                break

            if item.tag not in items_to_match:
                continue

            count += 1
            text = self.get_stripped_text(item)

            message = None

            # an author that already explained one removed line is not matched again
            for author in self.authors:
                if author in matched_authors:
                    continue

                if all(component in text for component in author):
                    matched_authors.append(author)
                    message = (u'Removed line "{0}" '
                               u'because it appears to be author metadata')
                    break

            # remove at most once per element: after removal the element has no parent, so a
            # second matching metadata value would otherwise call remove() on None
            if message is None and any(
                    metadata in text for metadata in self.metadata):
                message = (u'Removed line "{0}" '
                           u'because it appears to be duplicated metadata')

            if message is None:
                continue

            # found a metadata line
            count -= 1
            item.getparent().remove(item)
            self.debug.print_debug(self, message.format(text))

        manipulate.save_tree(tree)