Example #1
0
    def run(self):
        """
        Give every listed JATS/NLM element that lacks an ``id`` attribute a fresh one.

        The DOM tree is loaded through NlmManipulate; for each tag name below all
        matching elements are found with an XPath query and those without an ``id``
        attribute receive ``ID`` followed by a newly generated UUID4. The tree is
        finally written to ``self.gv.nlm_file_path`` and then to
        ``self.gv.nlm_temp_file_path``.
        """
        tag_names = (
            'abbrev', 'abstract', 'ack', 'address', 'aff', 'alt-text', 'app', 'app-group', 'array',
            'article-title', 'attrib', 'author-comment', 'author-notes', 'award-group',
            'bio', 'boxed-text',
            'caption', 'chem-struct', 'chem-struct-wrap', 'col', 'colgroup', 'collab', 'compound-kwd',
            'contrib', 'contrib-group', 'corresp', 'custom-meta',
            'def', 'def-item', 'def-list', 'disp-formula', 'disp-formula-group', 'disp-quote',
            'element-citation', 'ext-link',
            'fig', 'fig-group', 'fn', 'fn-group', 'funding-source',
            'glossary', 'glyph-data', 'graphic',
            'inline-formula', 'inline-graphic', 'inline-supplementary-material', 'institution',
            'kwd', 'kwd-group',
            'list', 'list-item', 'long-desc',
            'media', 'milestone-end', 'milestone-start', 'mixed-citation',
            'named-content', 'nlm-citation', 'note', 'notes',
            'p', 'person-group', 'preformat', 'product',
            'ref', 'ref-list', 'related-article', 'related-object', 'response',
            'sec', 'sig', 'sig-block', 'source', 'speech', 'statement', 'sub-article',
            'supplementary-material',
            'table', 'table-wrap', 'table-wrap-group', 'tbody', 'td', 'term', 'tex-math', 'tfoot',
            'th', 'thead', 'title', 'tr', 'trans-abstract', 'trans-source', 'trans-title',
            'trans-title-group',
            'verse-group',
            'xref',
        )

        dom = NlmManipulate(self.gv).load_dom_tree()

        for tag_name in tag_names:
            self.debug.print_debug(self, u'Assigning ID to all {0} elements'.format(tag_name))
            query = u'//{0}'.format(tag_name)

            for node in dom.xpath(query):
                # never overwrite an identifier that is already present
                if 'id' in node.attrib:
                    continue
                node.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

        # the same tree is persisted to both the main and the temporary file
        dom.write(self.gv.nlm_file_path)
        dom.write(self.gv.nlm_temp_file_path)
Example #2
0
    def scan(self):
        """
        Scan the input document for element-citations and store them in the database.

        The input path is taken from the ``<input>`` command-line argument and a fresh
        temporary file path is recorded on ``self.gv``. Journal, book and book-chapter
        citations are fetched through ``NlmManipulate.return_elements`` and handed to
        the matching ``store_*`` method together with the open shelve database.

        The database is closed even if one of the store steps raises, so that a failed
        scan does not leave the shelve file open.
        """
        self.gv.nlm_file_path = self.gv.settings.args['<input>']

        # mkstemp returns an open OS-level handle; only the path is needed here
        # NOTE(review): this sets ``nlm_temp_path``; confirm that is the attribute
        # NlmManipulate reads (other code writes to ``nlm_temp_file_path``)
        handle, self.gv.nlm_temp_path = tempfile.mkstemp()
        os.close(handle)

        manipulate = NlmManipulate(self.gv)

        # open the database
        self.debug.print_debug(
            self, u'Opening database: {0}'.format(self.gv.database_file_path))
        db = shelve.open(self.gv.database_file_path)

        try:
            # we /could/ use objectify, which would be cleaner, but it doesn't allow such rigidity of parsing

            # scan for journal items
            tree = manipulate.return_elements(
                '//element-citation[@publication-type="journal"]')
            self.store_journal_item(db, tree)

            # scan for books
            tree = manipulate.return_elements(
                '//element-citation[@publication-type="book"]')
            self.store_book(db, tree)

            # scan for book chapters
            tree = manipulate.return_elements(
                '//element-citation[@publication-type="bookchapter"]')
            self.store_book_chapter(db, tree)
        finally:
            # always release the shelve file, including on error
            db.close()
Example #3
0
    def run(self):
        """
        Assign an ``id`` attribute to every listed element type that does not have one.

        Elements are located per tag name with an XPath query on the DOM tree loaded
        through NlmManipulate. New identifiers are ``ID`` plus a generated UUID4. The
        resulting tree is written to ``self.gv.nlm_file_path`` and
        ``self.gv.nlm_temp_file_path``.
        """
        names = [
            'abbrev', 'abstract', 'ack', 'address', 'aff', 'alt-text', 'app', 'app-group',
            'array', 'article-title', 'attrib', 'author-comment', 'author-notes', 'award-group',
            'bio', 'boxed-text', 'caption', 'chem-struct', 'chem-struct-wrap', 'col', 'colgroup',
            'collab', 'compound-kwd', 'contrib', 'contrib-group', 'corresp', 'custom-meta', 'def',
            'def-item', 'def-list', 'disp-formula', 'disp-formula-group', 'disp-quote',
            'element-citation', 'ext-link', 'fig', 'fig-group', 'fn', 'fn-group', 'funding-source',
            'glossary', 'glyph-data', 'graphic', 'inline-formula', 'inline-graphic',
            'inline-supplementary-material', 'institution', 'kwd', 'kwd-group', 'list',
            'list-item', 'long-desc', 'media', 'milestone-end', 'milestone-start',
            'mixed-citation', 'named-content', 'nlm-citation', 'note', 'notes', 'p',
            'person-group', 'preformat', 'product', 'ref', 'ref-list', 'related-article',
            'related-object', 'response', 'sec', 'sig', 'sig-block', 'source', 'speech',
            'statement', 'sub-article', 'supplementary-material', 'table', 'table-wrap',
            'table-wrap-group', 'tbody', 'td', 'term', 'tex-math', 'tfoot', 'th', 'thead', 'title',
            'tr', 'trans-abstract', 'trans-source', 'trans-title', 'trans-title-group',
            'verse-group', 'xref',
        ]

        manipulator = NlmManipulate(self.gv)
        document = manipulator.load_dom_tree()

        for name in names:
            self.debug.print_debug(self, u'Assigning ID to all {0} elements'.format(name))

            # collect the elements of this type that still need an identifier
            unlabelled = [node for node in document.xpath(u'//{0}'.format(name))
                          if 'id' not in node.attrib]

            for node in unlabelled:
                node.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        # persist to the main file first, then to the temporary copy
        document.write(self.gv.nlm_file_path)
        document.write(self.gv.nlm_temp_file_path)
Example #4
0
    def process_database_references(self, db):
        """
        Replace free-text references with structured citations found in the database.

        Every ``//back/ref-list/ref`` whose stripped text contains a year (19xx/20xx)
        or ``n.d.`` is split on commas. Lookup keys are the matched year text followed
        by each ordered selection of 1 .. n-1 of the comma-separated pieces; the first
        key present in ``db`` wins. The stored object's citation XML then replaces the
        ``ref`` and every element whose ``rid`` pointed at the old ``ref`` id is
        re-pointed at the new id.

        :param db: database supporting ``key in db`` and ``db[key]``; stored objects
            must provide ``object_type()``, ``title`` and ``get_citation()``
        :return: tuple of the NlmManipulate helper and the modified DOM tree
        """
        manipulate = NlmManipulate(self.gv)
        master_tree = manipulate.load_dom_tree()

        # loop-invariant, so compiled once; raw string keeps the regex escapes literal
        year_test = re.compile(r'((19|20)\d{2})|(n\.d\.)')

        for element in master_tree.xpath('//back/ref-list/ref'):
            text = manipulate.get_stripped_text(element)

            match = year_test.search(text)

            if not match:
                continue

            # strip out elements in brackets that might scupper parsing
            text = re.sub(r'(.+?)(\(.+?\))(.*)', r'\1\3', text)

            list_split = [x.strip() for x in text.split(',')]

            # the permutation search below explodes for long lists
            if len(list_split) >= 10:
                continue

            # group(0) is the whole matched text: the year, or 'n.d.'. Indexing
            # match.groups(0) gave the int 0 for an 'n.d.' match, which cannot be
            # concatenated with a string.
            # NOTE(review): confirm the database stores undated items under 'n.d.'
            year = match.group(0)

            found = False

            for length in range(1, len(list_split)):
                for permute in itertools.permutations(list_split, length):
                    key = year + ''.join(permute).strip()

                    # database keys are stored as byte strings
                    if isinstance(key, unicode):
                        key = key.encode("utf-16le")

                    if key not in db:
                        continue

                    obj = db[key]
                    print('Found {0} in database "{1}"'.format(
                        obj.object_type(), obj.title))

                    new_element = etree.fromstring(obj.get_citation())

                    hex_dig = u'ID{0}'.format(unicode(uuid.uuid4()))

                    new_element.attrib['id'] = hex_dig

                    if 'id' in element.attrib:
                        current_id = element.attrib['id']

                        # pass the id as an XPath variable: an unquoted value in the
                        # expression is parsed as a child-element name and never
                        # matches any rid attribute
                        referrers = master_tree.xpath(
                            '//*[@rid=$rid]', rid=current_id)

                        for link in referrers:
                            link.attrib['rid'] = hex_dig

                    # put the structured citation in place of the old reference
                    element.addnext(new_element)
                    element.getparent().remove(element)
                    found = True
                    break

                if found:
                    break

        return manipulate, master_tree
Example #5
0
    def prune(self):
        """
        Process every unlinked bibliographic xref stub and save the document.

        Stubs are ``xref`` elements with ``ref-type="bibr"`` and ``rid="TO_LINK"``;
        each one is handed to ``self.extract_contents``. The tree is saved through
        NlmManipulate afterwards.
        """
        self.debug.print_debug(self, u'Deleting all stubs from article')

        manipulator = NlmManipulate(self.gv)
        document = manipulator.load_dom_tree()

        stubs = document.xpath('//xref[@ref-type="bibr" and @rid="TO_LINK"]')

        for stub in stubs:
            self.extract_contents(stub)

        manipulator.save_tree(document)
    def replace_in_text(self, id, element, replace_text, ref_type):
        """
        Wrap the first occurrence of ``replace_text`` in ``element.text`` in an xref.

        The element keeps the text before the occurrence; a new ``xref`` carrying
        ``replace_text`` is appended via ``NlmManipulate.append_safe`` and the text
        that followed the occurrence becomes the xref's tail.

        :param id: value for the xref's ``rid`` attribute (converted with ``unicode``)
        :param element: element whose ``text`` is split
        :param replace_text: text that becomes the content of the xref
        :param ref_type: value for the xref's ``ref-type`` attribute
        """
        pieces = element.text.split(replace_text, 1)

        leading = pieces[0]
        # with a maximum of one split there is at most one trailing piece
        trailing = pieces[1] if len(pieces) > 1 else ''

        element.text = leading

        xref = etree.Element('xref')
        xref.attrib['rid'] = unicode(id)
        xref.attrib['ref-type'] = ref_type
        xref.text = replace_text
        xref.tail = trailing

        NlmManipulate.append_safe(element, xref, self)
Example #7
0
    def process_database_references(self, db):
        """
        Replace free-text references with structured citations found in the database.

        Every ``//back/ref-list/ref`` whose stripped text contains a year (19xx/20xx)
        or ``n.d.`` is split on commas. Lookup keys are the matched year text followed
        by each ordered selection of 1 .. n-1 of the comma-separated pieces; the first
        key present in ``db`` wins. The stored object's citation XML then replaces the
        ``ref``, receives the SHA-256 hex digest of the key as its ``id``, and every
        element whose ``rid`` pointed at the old ``ref`` id is re-pointed at it.

        :param db: database supporting ``key in db`` and ``db[key]``; stored objects
            must provide ``object_type()``, ``title`` and ``get_citation()``
        :return: tuple of the NlmManipulate helper and the modified DOM tree
        """
        manipulate = NlmManipulate(self.gv)
        master_tree = manipulate.load_dom_tree()

        # loop-invariant, so compiled once; raw string keeps the regex escapes literal
        year_test = re.compile(r'((19|20)\d{2})|(n\.d\.)')

        for element in master_tree.xpath('//back/ref-list/ref'):
            text = manipulate.get_stripped_text(element)

            match = year_test.search(text)

            if not match:
                continue

            # strip out elements in brackets that might scupper parsing
            text = re.sub(r'(.+?)(\(.+?\))(.*)', r'\1\3', text)

            list_split = [x.strip() for x in text.split(',')]

            # the permutation search below explodes for long lists
            if len(list_split) >= 10:
                continue

            # group(0) is the whole matched text: the year, or 'n.d.'. Indexing
            # match.groups(0) gave the int 0 for an 'n.d.' match, which cannot be
            # concatenated with a string.
            # NOTE(review): confirm the database stores undated items under 'n.d.'
            year = match.group(0)

            found = False

            for length in range(1, len(list_split)):
                for permute in itertools.permutations(list_split, length):
                    key = year + ''.join(permute).strip()

                    # database keys are stored as byte strings
                    if isinstance(key, unicode):
                        key = key.encode("utf-16le")

                    if key not in db:
                        continue

                    obj = db[key]
                    print ('Found {0} in database "{1}"'.format(obj.object_type(), obj.title))

                    new_element = etree.fromstring(obj.get_citation())

                    # NOTE(review): a hex digest may start with a digit, which is
                    # not a valid XML ID start character; confirm this is acceptable
                    hash_object = hashlib.sha256(key)
                    hex_dig = hash_object.hexdigest()

                    new_element.attrib['id'] = hex_dig

                    if 'id' in element.attrib:
                        current_id = element.attrib['id']

                        # pass the id as an XPath variable: an unquoted value in the
                        # expression is parsed as a child-element name and never
                        # matches any rid attribute
                        referrers = master_tree.xpath('//*[@rid=$rid]', rid=current_id)

                        for link in referrers:
                            link.attrib['rid'] = hex_dig

                    # put the structured citation in place of the old reference
                    element.addnext(new_element)
                    element.getparent().remove(element)
                    found = True
                    break

                if found:
                    break

        return manipulate, master_tree
Example #8
0
    def replace_in_text(self, id, element, replace_text, ref_type):
        """
        Turn the first ``replace_text`` found in ``element.text`` into an xref child.

        Text ahead of the occurrence stays on ``element``; the occurrence itself
        becomes the text of a new ``xref`` (``rid`` = ``id``, ``ref-type`` =
        ``ref_type``) and whatever followed it becomes that xref's tail. The xref is
        attached with ``NlmManipulate.append_safe``.
        """
        parts = element.text.split(replace_text, 1)
        head = parts[0]
        remainder = parts[1:]  # zero or one item because of the split limit

        link = etree.Element('xref')
        link.text = replace_text
        link.tail = ''.join(remainder)
        link.attrib['rid'] = unicode(id)
        link.attrib['ref-type'] = ref_type

        element.text = head

        NlmManipulate.append_safe(element, link, self)
Example #9
0
    def run_prompt(self):
        """
        Walk the user through every bibliographic xref in the document interactively.

        After ``self.run(False)`` the tree is loaded and, for each
        ``xref[@ref-type="bibr"]``, the marker (and the reference it links to, if
        any) is shown, an option is read from the prompt (or delete is forced once
        "delete all" was chosen) and ``self.handle_input`` is called. The tree is
        saved after every marker; an ``'abort'`` result saves and returns at once.
        """
        self.run(False)
        self.debug.print_debug(self, u'Entering interactive mode')

        prompt = Interactive(self.gv)

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        ref_items = tree.xpath('//back/ref-list/ref')

        # note that we don't want to exit even if there are no references to link because the user may want to delete
        # some

        # the menu is the same for every marker
        opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid',
                'enter Link id', 'skip Rest', 'show Context')

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)
            prompt.print_(prompt.colorize('green', ("-" * 80)))

            rid = p.attrib.get('rid')

            if rid == 'TO_LINK':
                prompt.print_(
                    u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next(
                    (x for x in ref_items if x.attrib.get('id') == rid), None)

                # compare with None explicitly: an lxml element without children
                # is falsy, so a plain truth test would treat a text-only ref as
                # missing
                if remote is not None:
                    remote_text = manipulate.get_stripped_text(remote)
                else:
                    remote_text = ''

                prompt.print_(
                    u"Found a handled reference marker: \"{0}\" which links to \"{1}\""
                    .format(text, remote_text))

            # once "delete all" was chosen, stop asking and delete every marker
            if delete_all:
                sel = 'd'
            else:
                sel = prompt.input_options(opts)

            result = self.handle_input(manipulate,
                                       opts,
                                       p,
                                       prompt,
                                       ref_items,
                                       sel,
                                       tree=tree)

            if result == 'abort':
                manipulate.save_tree(tree)
                return
            elif result == 'delall':
                delete_all = True

            manipulate.save_tree(tree)
Example #10
0
def main():
    """
    Command-line entry point: parse arguments and, for ``process``, run the pipeline.

    The pipeline runs TeiToNlm, the reference linker, the caption classifier (gated
    by the ``--aggression`` level), the optional interactive bibliography prompt,
    the bibliography database, a final clean and then the optional identifier, XSLT
    chain and compliance steps.
    """
    args = docopt(__doc__, version='meTypeset 0.1')
    bare_gv = GV(args)

    if args['--debug']:
        bare_gv.debug.enable_debug(args['--nogit'])

    # NOTE(review): this instance is never used below; kept in case construction
    # has side effects
    nlm_instance = TeiToNlm(bare_gv)

    if not args['process']:
        return

    # run non-transform portions of teitonlm
    TeiToNlm(bare_gv).run(True, False)

    # run reference linker
    linker = ReferenceLinker(bare_gv)
    linker.run(args['--interactive'])
    linker.cleanup()

    bibliography_classifier = BibliographyClassifier(bare_gv)

    # run table classifier
    captions = CaptionClassifier(bare_gv)
    aggression = int(args['--aggression'])

    table_threshold = int(
        bare_gv.settings.get_setting('tablecaptions', None, domain='aggression'))
    if aggression > table_threshold:
        captions.run_tables()

    graphic_threshold = int(
        bare_gv.settings.get_setting('graphiccaptions', None, domain='aggression'))
    if aggression > graphic_threshold:
        captions.run_graphics()

    if args['--interactive']:
        bibliography_classifier.run_prompt(True)

    # process any bibliography entries that are possible
    BibliographyDatabase(bare_gv).run()

    # remove stranded titles
    NlmManipulate(bare_gv).final_clean()

    if args['--identifiers']:
        IdGenerator(bare_gv).run()

    if args['--chain']:
        # construct and run an XSLT chainer
        XslChain(bare_gv).run()

    if args['--clean']:
        ComplianceEnforcer(bare_gv).run()
    def run_ext_link_compliance(self):
        """
        Move every ``graphic`` nested directly inside an ``ext-link`` out of the link.

        Each such graphic is re-inserted into the ext-link's parent at the position
        immediately after the ext-link. The tree is then written to
        ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(self, u'Attempting to correct any mis-nested graphics elements')

        document = NlmManipulate(self.gv).load_dom_tree()

        for graphic in document.xpath('//ext-link/graphic'):
            ext_link = graphic.getparent()
            container = ext_link.getparent()

            # the slot directly after the ext-link within its own parent
            position = container.index(ext_link) + 1
            container.insert(position, graphic)

        document.write(self.gv.nlm_file_path)
        document.write(self.gv.nlm_temp_file_path)
Example #12
0
    def run_ext_link_compliance(self):
        """
        Hoist ``graphic`` children of ``ext-link`` elements up to follow the link.

        For every ``//ext-link/graphic`` the graphic is inserted into the
        grandparent right behind its former ext-link parent. Afterwards the tree is
        written to the main and the temporary NLM file paths held on ``self.gv``.
        """
        self.debug.print_debug(self, u'Attempting to correct any mis-nested graphics elements')

        manipulator = NlmManipulate(self.gv)
        dom = manipulator.load_dom_tree()

        misplaced = dom.xpath('//ext-link/graphic')

        for node in misplaced:
            wrapper = node.getparent()
            grandparent = wrapper.getparent()
            grandparent.insert(1 + grandparent.index(wrapper), node)

        dom.write(self.gv.nlm_file_path)
        dom.write(self.gv.nlm_temp_file_path)
Example #13
0
    def link_items(self, source_id, dest_id, manipulate=None, tree=None):
        """
        Link the xref with id ``source_id`` to the ref with id ``dest_id`` and save.

        :param source_id: ``id`` attribute of the ``xref`` to link from
        :param dest_id: ``id`` attribute of the ``ref`` to link to
        :param manipulate: NlmManipulate to use; a new one is created when None
        :param tree: DOM tree to operate on; loaded via ``manipulate`` when None
        :raises IndexError: if either element cannot be found in the tree
        """
        self.debug.print_debug(self, u'Attempting to link XREF {0} to REF {1}'.format(source_id, dest_id))

        manipulate = NlmManipulate(self.gv) if manipulate is None else manipulate
        tree = manipulate.load_dom_tree() if tree is None else tree

        # the first match is taken in both cases
        source = tree.xpath('//xref[@id="{0}"]'.format(source_id))[0]
        dest = tree.xpath('//ref[@id="{0}"]'.format(dest_id))[0]

        replacer = ReplaceObject(self.gv, source, dest)
        replacer.link()

        manipulate.save_tree(tree)
Example #14
0
def main():
    """
    Parse the command line and run the processing pipeline for ``process``.

    Steps, in order: TeiToNlm, reference linking, caption classification for tables
    and graphics (each only when ``--aggression`` exceeds the configured level),
    the interactive bibliography prompt, the bibliography database, a final clean,
    and the optional identifier, XSLT chain and compliance passes.
    """
    args = docopt(__doc__, version='meTypeset 0.1')
    bare_gv = GV(args)

    def aggression_exceeds(setting_name):
        # True when the requested aggression is above the configured level
        requested = int(args['--aggression'])
        configured = int(bare_gv.settings.get_setting(setting_name, None, domain='aggression'))
        return requested > configured

    if args['--debug']:
        bare_gv.debug.enable_debug(args['--nogit'])

    # NOTE(review): unused below; retained in case the constructor has side effects
    nlm_instance = TeiToNlm(bare_gv)

    if args['process']:
        # run non-transform portions of teitonlm
        TeiToNlm(bare_gv).run(True, False)

        # run reference linker
        reference_linker = ReferenceLinker(bare_gv)
        reference_linker.run(args['--interactive'])
        reference_linker.cleanup()

        bibliography_classifier = BibliographyClassifier(bare_gv)

        # run table classifier
        caption_classifier = CaptionClassifier(bare_gv)

        if aggression_exceeds('tablecaptions'):
            caption_classifier.run_tables()

        if aggression_exceeds('graphiccaptions'):
            caption_classifier.run_graphics()

        if args['--interactive']:
            bibliography_classifier.run_prompt(True)

        # process any bibliography entries that are possible
        BibliographyDatabase(bare_gv).run()

        # remove stranded titles
        cleaner = NlmManipulate(bare_gv)
        cleaner.final_clean()

        if args['--identifiers']:
            IdGenerator(bare_gv).run()

        if args['--chain']:
            # construct and run an XSLT chainer
            XslChain(bare_gv).run()

        if args['--clean']:
            ComplianceEnforcer(bare_gv).run()
Example #15
0
    def run_prompt(self):
        """
        Walk the user through every bibliographic xref in the document interactively.

        After ``self.run(False)`` the tree is loaded and, for each
        ``xref[@ref-type="bibr"]``, the marker (and the reference it links to, if
        any) is shown, an option is read from the prompt (or delete is forced once
        "delete all" was chosen) and ``self.handle_input`` is called. The tree is
        saved after every marker; an ``"abort"`` result saves and returns at once.
        """
        self.run(False)
        self.debug.print_debug(self, u"Entering interactive mode")

        prompt = Interactive(self.gv)

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        ref_items = tree.xpath("//back/ref-list/ref")

        # note that we don't want to exit even if there are no references to link because the user may want to delete
        # some

        # the menu is the same for every marker
        opts = (
            "Skip",
            "Delete",
            "deleTe all",
            "Enter search",
            "Ibid",
            "enter Link id",
            "skip Rest",
            "show Context",
        )

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)

            rid = p.attrib.get("rid")

            if rid == "TO_LINK":
                prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next((x for x in ref_items if x.attrib.get("id") == rid), None)

                # the rid may point at a reference that is not in the ref-list, so
                # do not hand None to get_stripped_text
                if remote is not None:
                    remote_text = manipulate.get_stripped_text(remote)
                else:
                    remote_text = ""

                prompt.print_(u'Found a handled reference marker: "{0}" which links to "{1}"'.format(text, remote_text))

            # once "delete all" was chosen, stop asking and delete every marker
            if delete_all:
                sel = "d"
            else:
                sel = prompt.input_options(opts)

            result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

            if result == "abort":
                manipulate.save_tree(tree)
                return
            elif result == "delall":
                delete_all = True

            manipulate.save_tree(tree)
Example #16
0
    def run_prompt(self):
        """
        Walk the user through every bibliographic xref in the document interactively.

        After ``self.run(False)`` the tree is loaded and, for each
        ``xref[@ref-type="bibr"]``, a separator line and the marker (plus the
        reference it links to, if any) are shown, an option is read from the prompt
        (or delete is forced once "delete all" was chosen) and ``self.handle_input``
        is called. The tree is saved after every marker; an ``'abort'`` result saves
        and returns at once.
        """
        self.run(False)
        self.debug.print_debug(self, u'Entering interactive mode')

        prompt = Interactive(self.gv)

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        ref_items = tree.xpath('//back/ref-list/ref')

        # note that we don't want to exit even if there are no references to link because the user may want to delete
        # some

        # the menu is the same for every marker
        opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id',
                'skip Rest', 'show Context')

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)
            prompt.print_(prompt.colorize('green', ("-" * 80)))

            rid = p.attrib.get('rid')

            if rid == 'TO_LINK':
                prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next((x for x in ref_items if x.attrib.get('id') == rid), None)

                # the rid may point at a reference that is not in the ref-list, so
                # do not hand None to get_stripped_text
                if remote is not None:
                    remote_text = manipulate.get_stripped_text(remote)
                else:
                    remote_text = ''

                prompt.print_(u"Found a handled reference marker: \"{0}\" which links to \"{1}\"".format(text,
                                                                                                         remote_text))

            # once "delete all" was chosen, stop asking and delete every marker
            if delete_all:
                sel = 'd'
            else:
                sel = prompt.input_options(opts)

            result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

            if result == 'abort':
                manipulate.save_tree(tree)
                return
            elif result == 'delall':
                delete_all = True

            manipulate.save_tree(tree)
    def scan(self):
        """
        Scan the input document for element-citations and store them in the database.

        The input path comes from the ``<input>`` command-line argument and a fresh
        temporary file path is recorded on ``self.gv``. Journal, book and
        book-chapter citations are fetched with ``NlmManipulate.return_elements``
        and passed, with the open shelve database, to the matching ``store_*``
        method.

        The database is closed even when a store step raises, so a failed scan does
        not leave the shelve file open.
        """
        self.gv.nlm_file_path = self.gv.settings.args['<input>']

        # mkstemp returns an open OS-level handle; only the path is needed here
        # NOTE(review): this sets ``nlm_temp_path``; confirm that is the attribute
        # NlmManipulate reads (other code writes to ``nlm_temp_file_path``)
        handle, self.gv.nlm_temp_path = tempfile.mkstemp()
        os.close(handle)

        manipulate = NlmManipulate(self.gv)

        # open the database
        self.debug.print_debug(self, u'Opening database: {0}'.format(self.gv.database_file_path))
        db = shelve.open(self.gv.database_file_path)

        try:
            # we /could/ use objectify, which would be cleaner, but it doesn't allow such rigidity of parsing

            # scan for journal items
            tree = manipulate.return_elements('//element-citation[@publication-type="journal"]')
            self.store_journal_item(db, tree)

            # scan for books
            tree = manipulate.return_elements('//element-citation[@publication-type="book"]')
            self.store_book(db, tree)

            # scan for book chapters
            tree = manipulate.return_elements('//element-citation[@publication-type="bookchapter"]')
            self.store_book_chapter(db, tree)
        finally:
            # always release the shelve file, including on error
            db.close()
Example #18
0
    def run_quirks(self, process_ref_lists):
        """
        Apply the post-transform clean-up passes to the document via NlmManipulate.

        Line-break marker comments are converted (to paragraph/title breaks unless
        the ``linebreaks-as-comments`` setting says otherwise, and always to break
        tags inside ``td`` and ``title``), empty elements are removed and, on
        request, reference lists are detected and tagged.

        :param process_ref_lists: when truthy, find reference lists and tag
            bibliography references
        """
        manipulate = NlmManipulate(self.gv)

        # XPath node test selecting the <!--meTypeset:br--> marker comments
        br_marker = 'comment()[. = "meTypeset:br"]'

        keep_as_comments = self.gv.settings.get_setting('linebreaks-as-comments', self)

        # the setting is compared as a string, not as a boolean
        if keep_as_comments == 'False':
            # we need to convert every instance of <!--meTypeset:br--> to a new paragraph
            manipulate.close_and_open_tag(br_marker, 'p')
            manipulate.close_and_open_tag_not_styled(br_marker, 'title')

        # we will replace inside table cells and titles regardless because these are real JATS break tags
        for host_tag in ('td', 'title'):
            manipulate.insert_break(br_marker, host_tag)

        manipulate.remove_empty_elements('//sec//p')

        if process_ref_lists:
            self.debug.print_debug(self, u'Finding potential reference lists')
            manipulate.find_reference_list()
            manipulate.tag_bibliography_refs()

        for empty_candidates in ('//sec/list', '//sec/disp-quote', '//back/ref-list/ref'):
            manipulate.remove_empty_elements(empty_candidates)
Example #19
0
    def run_tables(self):
        """
        Attach a label and caption to each ``table-wrap`` and link in-text mentions.

        For every ``//table-wrap`` caption-like text (some text, a number, then ``:``
        or ``.`` and more text) is looked for in the following sibling ``p``, then
        the preceding sibling ``p``, then the first ``title`` child of the table's
        parent. On a match the text before the separator becomes the table's
        ``label``, the rest becomes a ``caption`` paragraph, and the source element
        is removed. When the section title was used, the section is merged into a
        previous (or parent) section. If no caption was found that way, the first
        table row is promoted to a caption when it has fewer columns than the
        second row and its first cell differs more from row two than row two
        differs from row three.

        Collected table ids and titles are passed to ``self.link`` together with all
        paragraphs, and the tree is written to the main and temporary file paths.
        """
        self.debug.print_debug(
            self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        # raw strings so the regex escapes are not interpreted as string escapes
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        separator = ':'

        for table in tables:
            caption_element = None
            use_next = False
            use_previous = False
            used_title = False

            # get the next and previous siblings
            p = table.getnext()
            pprev = table.getprevious()
            old_title = None

            # a paragraph that holds a graphic is not treated as a caption
            if p is not None and p.tag == 'p':
                if not any(sub.tag == 'graphic' for sub in p):
                    text = manipulate.get_stripped_text(p)

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'

            # check for None before looking at children: a table-wrap that is the
            # first child of its parent has no previous sibling
            if not use_next and pprev is not None and pprev.tag == 'p':
                if not any(sub.tag == 'graphic' for sub in pprev):
                    text = manipulate.get_stripped_text(pprev)

                    if table_regex_colon.match(text):
                        use_previous = True
                        separator = ':'
                    elif table_regex_dot.match(text):
                        use_previous = True
                        separator = '.'

            # NOTE(review): this parses as ``(not use_next) or use_previous``, so the
            # title is also consulted when the previous paragraph already matched;
            # confirm whether ``not (use_next or use_previous)`` was intended
            if not use_next or use_previous:
                # see if the title in this section potentially contains text we can match
                parent = table.getparent()

                titles = parent.xpath('title')

                if len(titles) > 0:
                    p = titles[0]

                    text = manipulate.get_stripped_text(p)

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                        used_title = True
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'
                        used_title = True

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a table identifier; split at the first separator
                # only, so later separators stay in the caption text
                title_part, _found, caption_part = text.partition(separator)

                title = title_part.strip()
                caption = caption_part.strip()

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                if p.tag.endswith('title'):
                    # leave an empty title behind so it can be stripped later
                    new_title = etree.Element('title')
                    new_title.text = ''
                    old_title = new_title
                    p.addnext(new_title)
                    p.getparent().remove(p)
                else:
                    p.getparent().remove(p)

                p = new_p

                self.debug.print_debug(
                    self,
                    u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                try:
                    title_element = table.xpath('label')[0]
                except IndexError:
                    title_element = etree.Element('label')
                    table.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, p, self)
                table.insert(1, caption_element)

                if 'id' not in table.attrib:
                    table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

                table_titles.append(title)
                table_ids.append(table.attrib['id'])

                if used_title:
                    # if we took the title out, then we should move the parent into its previous sibling and then
                    # strip tags
                    old_title.tag = 'REMOVE'

                    etree.strip_elements(tree, 'REMOVE')

                    section = table.getparent()

                    previous = section.getprevious()

                    while previous is not None and not previous.tag.endswith(
                            'sec'):
                        previous = previous.getprevious()

                    if previous is not None:
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(
                            self,
                            u'Moved table and siblings to previous section')
                    else:
                        previous = section.getparent()

                        if previous is not None and previous.tag.endswith(
                                'sec'):
                            previous.append(section)
                            section.tag = 'REMOVE'

                            etree.strip_tags(tree, 'REMOVE')

                            self.debug.print_debug(
                                self,
                                u'Moved table and siblings to parent section')

            # If none of that worked, try to find caption in table rows
            if caption_element is None:
                inner_table = table.find("table")

                # a table-wrap without a table child has no rows to inspect
                if inner_table is not None:
                    table_rows = inner_table.getchildren()
                else:
                    table_rows = []

                # Check if first row has fewer columns than others
                # Therefore not likely to be data or a header
                columns_count = {}
                first_column = {}

                for row_number, row in enumerate(table_rows, 1):
                    cells = row.getchildren()
                    columns_count[row_number] = len(cells)

                    # a row without cells, or a first cell without direct text,
                    # counts as empty text
                    first_text = cells[0].text if cells else ""
                    first_column[row_number] = first_text if first_text is not None else ""

                if len(columns_count) > 2:
                    # smallest column COUNT over all rows (not the row number)
                    fewest_columns = min(columns_count.values())

                    if columns_count[1] == fewest_columns and columns_count[2] != fewest_columns:
                        # If it has fewest columns, also check Levenshtein distance
                        # To ensure this row is unlike the others
                        first_vs_second = editdistance.eval(first_column[1], first_column[2])
                        second_vs_third = editdistance.eval(first_column[2], first_column[3])

                        # only promote a row that has text to use as the caption;
                        # otherwise a row whose content sits in child elements
                        # would be dropped in exchange for an empty caption
                        if first_column[1] and first_vs_second > second_vs_third:
                            # OK, we have something, move it
                            caption_element = etree.Element('caption')
                            caption_element.text = first_column[1]
                            NlmManipulate.append_safe(table, caption_element, self)
                            inner_table.remove(table_rows[0])

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Example #20
0
    def run(self):
        """Give every listed NLM/JATS element an ``id`` attribute if it lacks one.

        The DOM tree is loaded through ``NlmManipulate``, each tag name below is
        looked up with an ``//<tag>`` XPath query, and any match without an
        ``id`` receives ``ID<uuid4>``.  The tree is then written to both
        ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        tag_names = (
            "abbrev", "abstract", "ack", "address", "aff", "alt-text", "app",
            "app-group", "array", "article-title", "attrib", "author-comment",
            "author-notes", "award-group", "bio", "boxed-text", "caption",
            "chem-struct", "chem-struct-wrap", "col", "colgroup", "collab",
            "compound-kwd", "contrib", "contrib-group", "corresp",
            "custom-meta", "def", "def-item", "def-list", "disp-formula",
            "disp-formula-group", "disp-quote", "element-citation", "ext-link",
            "fig", "fig-group", "fn", "fn-group", "funding-source", "glossary",
            "glyph-data", "graphic", "inline-formula", "inline-graphic",
            "inline-supplementary-material", "institution", "kwd", "kwd-group",
            "list", "list-item", "long-desc", "media", "milestone-end",
            "milestone-start", "mixed-citation", "named-content",
            "nlm-citation", "note", "notes", "p", "person-group", "preformat",
            "product", "ref", "ref-list", "related-article", "related-object",
            "response", "sec", "sig", "sig-block", "source", "speech",
            "statement", "sub-article", "supplementary-material", "table",
            "table-wrap", "table-wrap-group", "tbody", "td", "term",
            "tex-math", "tfoot", "th", "thead", "title", "tr",
            "trans-abstract", "trans-source", "trans-title",
            "trans-title-group", "verse-group", "xref",
        )

        document = NlmManipulate(self.gv).load_dom_tree()

        for tag_name in tag_names:
            self.debug.print_debug(self, u"Assigning ID to all {0} elements".format(tag_name))

            for node in document.xpath(u"//{0}".format(tag_name)):
                if "id" in node.attrib:
                    # never overwrite an identifier that is already present
                    continue
                node.set("id", u"ID{0}".format(unicode(uuid.uuid4())))

        # both the working file and its temporary copy receive the result
        for destination in (self.gv.nlm_file_path, self.gv.nlm_temp_file_path):
            document.write(destination)
    def run_tables(self):
        """Detect a label and caption for every ``table-wrap`` and link references to them.

        For each ``table-wrap`` a caption candidate is searched for in this
        order: the next sibling ``p``, the previous sibling ``p`` and the first
        ``title`` child of the table's parent.  A candidate matches when its
        stripped text looks like ``<words><number>:<text>`` or
        ``<words><number>.<text>``.  The text before the first separator becomes
        the ``label``, the remainder becomes a ``caption/p`` and the source
        element is removed from the tree.

        When nothing matches, the first row of the inner ``table`` is promoted
        to a caption if it has the fewest cells of all rows, the second row does
        not, and its first cell differs more from row two than row two differs
        from row three (edit distance).

        Finally the collected ids and titles are handed to ``self.link`` and the
        tree is written to both NLM file paths.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        separator = ':'

        for table in tables:
            caption_element = None
            use_next = False
            use_previous = False
            used_title = False

            # candidate caption paragraphs sit directly after or before the table
            p = table.getnext()
            pprev = table.getprevious()
            old_title = None

            # a paragraph that holds a graphic is never treated as a table caption
            if p is not None and p.tag == 'p' and not any(sub.tag == 'graphic' for sub in p):
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'

            # pprev is None when the table is the first child of its parent, so
            # it has to be checked before its children are inspected
            if not use_next and pprev is not None and pprev.tag == 'p' \
                    and not any(sub.tag == 'graphic' for sub in pprev):
                text = manipulate.get_stripped_text(pprev)

                if table_regex_colon.match(text):
                    use_previous = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_previous = True
                    separator = '.'

            # NOTE(review): this parses as ``(not use_next) or use_previous``, so a
            # matching section title takes precedence over a matching previous
            # paragraph; left unchanged to preserve existing behaviour
            if not use_next or use_previous:
                # see if the title in this section potentially contains text we can match
                parent = table.getparent()

                titles = parent.xpath('title') if parent is not None else []

                if len(titles) > 0:
                    p = titles[0]

                    text = manipulate.get_stripped_text(p)

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                        used_title = True
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'
                        used_title = True

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a table identifier; split on the FIRST separator
                # only so later colons/full stops stay inside the caption text
                title, _, caption = text.partition(separator)
                title = title.strip()
                caption = caption.strip()

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                if p.tag.endswith('title'):
                    # keep an (empty) title in place of the one that is consumed
                    new_title = etree.Element('title')
                    new_title.text = ''
                    old_title = new_title
                    p.addnext(new_title)

                p.getparent().remove(p)

                p = new_p

                self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                existing_labels = table.xpath('label')

                if existing_labels:
                    title_element = existing_labels[0]
                else:
                    title_element = etree.Element('label')
                    table.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, p, self)
                table.insert(1, caption_element)

                if 'id' not in table.attrib:
                    table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

                table_titles.append(title)
                table_ids.append(table.attrib['id'])

                if used_title:
                    # if we took the title out, then we should move the parent into its previous sibling and then
                    # strip tags
                    old_title.tag = 'REMOVE'

                    etree.strip_elements(tree, 'REMOVE')

                    section = table.getparent()

                    previous = section.getprevious()

                    while previous is not None and not previous.tag.endswith('sec'):
                        previous = previous.getprevious()

                    if previous is not None:
                        previous.append(section)
                        section.tag = 'REMOVE'

                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(self, u'Moved table and siblings to previous section')
                    else:
                        previous = section.getparent()

                        if previous is not None and previous.tag.endswith('sec'):
                            previous.append(section)
                            section.tag = 'REMOVE'

                            etree.strip_tags(tree, 'REMOVE')

                            self.debug.print_debug(self, u'Moved table and siblings to parent section')

            # If none of that worked, try to find caption in table rows
            if caption_element is None:
                inner_table = table.find("table")

                # a table-wrap without an inner table simply has no rows to inspect
                table_rows = list(inner_table) if inner_table is not None else []

                # Check if first row has fewer columns than others
                # Therefore not likely to be data or a header
                columns_count = {}
                first_column = {}

                for row_number, row in enumerate(table_rows, 1):
                    cells = list(row)
                    columns_count[row_number] = len(cells)
                    # an empty row, or a first cell without direct text, counts as ""
                    first_column[row_number] = (cells[0].text or "") if cells else ""

                if len(columns_count) > 2:
                    # smallest cell COUNT over all rows (not the number of the row holding it)
                    fewest_columns = min(columns_count.values())

                    if columns_count[1] == fewest_columns and columns_count[2] != fewest_columns:
                        # If it has fewest columns, also check Levenshtein distance
                        # To ensure this row is unlike the others
                        first_to_second = editdistance.eval(first_column[1], first_column[2])
                        second_to_third = editdistance.eval(first_column[2], first_column[3])

                        if first_to_second > second_to_third:
                            # OK, we have something, move it
                            caption_element = etree.Element('caption')
                            caption_element.text = first_column[1]
                            NlmManipulate.append_safe(table, caption_element, self)
                            inner_table.remove(table_rows[0])

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
    def process_zotero(self):
        """Replace reference-list entries with JATS records found in Zotero.

        The stripped text of every ``//back/ref-list/ref`` element is reduced to
        a search term (short numbers, punctuation and a fixed set of filler
        words removed) and passed to ``LibZotero.search``.  While a search
        returns nothing and the term still has more than two words, the last
        word is dropped and the search repeated.  Only when exactly one result
        is found, and its ``JATS_format()`` is not ``None``, is the parsed
        result inserted after the original element (keeping its ``id``),
        followed by an XML comment holding the original text; the original
        element is then removed.  The tree is saved through ``NlmManipulate``.
        """
        from zotero import libzotero
        library = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

        nlm = NlmManipulate(self.gv)
        document = nlm.load_dom_tree()
        references = document.xpath('//back/ref-list/ref')

        # (pattern, replacement) pairs; order matters and mirrors the clean-up sequence
        leading_patterns = (
            (r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r''),
            (r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', ''),
            (u'[^\s]+?\s[Ee]dition', u' '),
            (u'\s.\s', u' '),
            (u'(?<=[A-Z])\.', u' '),
        )

        # plain substring replacements applied after the patterns above
        literal_swaps = (
            (u'“', u''),
            (u'\'s', u''),
            (u'’s', u''),
            (u'’', u''),
            (u' Ed. ', u' '),
            (u' Ed ', u' '),
            (u' Trans. ', u' '),
            (u' Trans ', u' '),
            (u' trans ', u' '),
            (u' trans. ', u' '),
            (u' by. ', u' '),
            (u' by ', u' '),
            (u' ed. ', u' '),
            (u' ed ', u' '),
            (u' In ', u' '),
            (u' in ', u' '),
            (u' print ', u' '),
            (u' Print ', u' '),
            (u' and ', u' '),
            (u'”', u''),
        )

        # final tidy-up, ending with whitespace normalisation
        trailing_patterns = (
            (r'[Aa]ccessed', ''),
            (r'meTypesetbr', ''),
            (r'\s+', ' '),
        )

        for element in references:
            original_term = nlm.get_stripped_text(element)
            query = original_term

            for pattern, replacement in leading_patterns:
                query = re.sub(pattern, replacement, query)

            for needle, replacement in literal_swaps:
                query = query.replace(needle, replacement)

            for pattern, replacement in trailing_patterns:
                query = re.sub(pattern, replacement, query)

            hits = library.search(query.strip())

            while len(hits) == 0:
                # no results found: shorten the query from the end and retry
                words = query.strip().split(' ')
                if len(words) <= 2:
                    break
                query = ' '.join(words[:-1])
                hits = library.search(query.strip())

            if len(hits) != 1:
                # zero or ambiguous results: leave the reference untouched
                continue

            jats = hits[0].JATS_format()

            if jats is None:
                continue

            replacement_ref = etree.fromstring(jats)
            if 'id' in element.attrib:
                replacement_ref.attrib['id'] = element.attrib['id']

            element.addnext(replacement_ref)

            # the original text is kept next to the new entry as an XML comment
            original_term = re.sub(u'--', u'', original_term)
            replacement_ref.addnext(etree.Comment(original_term))

            element.tag = 'REMOVE'

        etree.strip_elements(document, 'REMOVE')

        nlm.save_tree(document)
Example #23
0
    def run_graphics_sibling(self):
        """Attach a label and caption to each ``graphic`` using neighbouring text.

        For every ``graphic`` a caption candidate is searched for in this order:
        the next sibling of the graphic's parent (if it is a ``p``), the
        previous sibling of the parent (if it is a ``p``), and the first
        ``title`` child of the nearest ancestor whose tag ends in ``sec``.  A
        candidate matches when its stripped text looks like
        ``<words><number>:<text>`` or ``<words><number>.<text>``.  The text
        before the first separator becomes the ``label``, the remainder becomes
        a ``caption/p``, and the source element is removed (a consumed title is
        replaced by an empty one).

        Finally the collected ids and titles are handed to ``self.link`` and the
        tree is written to both NLM file paths.
        """
        # images are hard to handle because Word/OO puts them in different places
        # for instance, the caption can come before or after;
        # <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
        # orientation="portrait" xlink:type="simple"/>

        self.debug.print_debug(
            self,
            u'Attempting to classify captions for graphics objects [sibling]')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath('//graphic')

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
        graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

        separator = ':'

        for graphic in graphics:
            use_next = False
            use_previous = False

            container = graphic.getparent()

            if container is None:
                # a root-level graphic has no siblings or section to take a caption from
                continue

            # get the siblings of the element holding the graphic
            p = container.getnext()
            pprev = container.getprevious()

            if p is not None and p.tag == 'p':
                text = manipulate.get_stripped_text(p)

                if graphic_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif graphic_regex_dot.match(text):
                    use_next = True
                    separator = '.'

            if not use_next:
                if pprev is not None and pprev.tag == 'p':
                    text = manipulate.get_stripped_text(pprev)

                    if graphic_regex_colon.match(text):
                        use_previous = True
                        separator = ':'
                    elif graphic_regex_dot.match(text):
                        use_previous = True
                        separator = '.'

            # NOTE(review): this parses as ``(not use_next) or use_previous``, so a
            # matching section title takes precedence over a matching previous
            # paragraph; left unchanged to preserve existing behaviour
            if not use_next or use_previous:
                # see if the title in this section potentially contains text we can match
                section = container

                while section is not None and not section.tag.endswith('sec'):
                    section = section.getparent()

                # always (re)computed per graphic: when the parent already is a
                # section the loop above does not run, and titles found for an
                # earlier graphic must never leak into this one
                titles = section.xpath('title') if section is not None else []

                if len(titles) > 0:
                    p = titles[0]

                    text = manipulate.get_stripped_text(p)

                    if graphic_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                    elif graphic_regex_dot.match(text):
                        use_next = True
                        separator = '.'

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a figure identifier; split on the FIRST separator
                # only so later colons/full stops stay inside the caption text
                title, _, caption = text.partition(separator)
                title = title.strip()
                caption = caption.strip()

                self.debug.print_debug(
                    self,
                    u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                existing_labels = graphic.xpath('label')

                if existing_labels:
                    title_element = existing_labels[0]
                else:
                    title_element = etree.Element('label')
                    graphic.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                new_p = etree.Element('p')
                new_p.text = caption

                NlmManipulate.append_safe(caption_element, new_p, self)
                NlmManipulate.append_safe(graphic, caption_element, self)

                if p.tag.endswith('title'):
                    # keep an (empty) title in place of the one that is consumed
                    new_title = etree.Element('title')
                    new_title.text = ''
                    p.addnext(new_title)

                p.getparent().remove(p)

                if graphic.tail:
                    # drop caption text that also trails the graphic itself
                    graphic.tail = graphic.tail.replace(title + separator, '')
                    graphic.tail = graphic.tail.replace(
                        caption + separator, '')
                    graphic.tail = graphic.tail.replace(caption, '')

                if 'id' not in graphic.attrib:
                    graphic.attrib['id'] = u'ID{0}'.format(
                        unicode(uuid.uuid4()))

                graphic_titles.append(title)
                graphic_ids.append(graphic.attrib['id'])

        paragraphs = tree.xpath('//p')

        self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Example #24
0
 def cleanup(self):
     """Call ``remove_reference_numbering()`` on a new ``NlmManipulate`` built from ``self.gv``."""
     NlmManipulate(self.gv).remove_reference_numbering()
Example #25
0
    def run(self, interactive):
        """Link in-text citations to the entries of the reference list.

        :param interactive: when truthy, ``self.run_prompt()`` is called and
            nothing else is done.

        Otherwise the method works on the tree loaded via ``NlmManipulate``:

        1. ``ext-link`` elements with an empty ``xlink:href`` are unwrapped.
        2. ``//back/ref-list/ref`` items are cleaned (``clean_ref_items``),
           indexed by any leading number, and passed to
           ``process_ibid_authors``.
        3. Every ``//sec//p`` without a direct MathML child, and every ``td``,
           is scanned.  Numeric square-bracket citations (ranges such as
           ``3-5`` are expanded) become ``ReplaceStub`` objects tagged
           ``TO_LINK_NUMBER``; if a paragraph has none, each
           parenthesised, ``;``-separated item shorter than 60 characters
           becomes a plain ``ReplaceStub``.
        4. ``TO_LINK_NUMBER`` xrefs are resolved either through the leading
           numbers of the references or, when the counts disagree, through the
           position of the reference in the list; failures fall back to
           ``TO_LINK``.
        5. ``TO_LINK`` xrefs are matched word-by-word against each reference
           text, first ignoring punctuation, then ignoring punctuation and
           digits (the last reference is excluded from this looser pass).

        The tree is saved through ``manipulate.save_tree`` at the end, and also
        after steps that changed it early.
        """
        if interactive:
            self.run_prompt()
            return

        xlink_href = '{http://www.w3.org/1999/xlink}href'

        # loop-invariant patterns, compiled once instead of once per element
        numbered_ref = re.compile(r'^(?P<number>\d+)\.*')
        reference_test = re.compile(r'\((?P<text>[^%]+?)\)')
        sub_match = re.compile(r'\[(?P<square>\d*[,\-;\d\s]*)\]')

        # character classes used both as regex and as str.strip() character set
        punctuation = '[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"]'
        punctuation_and_digits = '[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\d]'

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # pre-cleanup: remove all empty ext-links as these break the linker
        count = 0

        for item in tree.xpath('//ext-link'):
            if item.attrib.get(xlink_href) == '':
                count += 1
                item.tag = 'REMOVE'
                etree.strip_tags(item.getparent(), 'REMOVE')

        if count > 0:
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))

        ref_items = tree.xpath('//back/ref-list/ref')

        self.clean_ref_items(tree, ref_items, manipulate)

        # handle numbered reference items: map the leading number to its element
        references_and_numbers = {}

        for ref in ref_items:
            result = numbered_ref.match(manipulate.get_stripped_text(ref))

            if result:
                references_and_numbers[result.group('number')] = ref

        parsed = self.process_ibid_authors(ref_items)

        if parsed > 0:
            manipulate.save_tree(tree)

            self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))

        to_link = []
        to_stub = []

        # distinct items seen inside square brackets (used as a set)
        square_bracket_count = {}

        for p in tree.xpath('//sec//p[not(mml:math)] | //td',
                            namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):

            text = manipulate.get_stripped_text(p)

            # numeric square brackets take priority; parenthesised citations are
            # only considered when the text contains no such bracket at all
            if sub_match.search(text):
                for smatch in sub_match.finditer(text):
                    self.debug.print_debug(self, u'Handling references in square '
                                                 u'brackets: [{0}] '.format(smatch.group('square')))
                    for item in re.split(';|,', smatch.group('square')):
                        if '-' in item:
                            parent, tail = manipulate.find_text(p, item)

                            if parent is not None:
                                try:
                                    split_range = item.strip().split('-')
                                    numbers = range(int(split_range[0]), int(split_range[1]) + 1)
                                    # "3-5" becomes "3,4,5" in the document text
                                    new_string = ','.join(str(no) for no in numbers)
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    break

                                if new_string != '':
                                    if tail:
                                        parent.tail = parent.tail.replace(item, new_string)
                                    else:
                                        parent.text = parent.text.replace(item, new_string)

                                # best effort: a failing stub abandons the rest of this bracket
                                try:
                                    for no in numbers:
                                        self.debug.print_debug(self, u'Parsing reference '
                                                                     u'number in range {0}'.format(str(no)))

                                        to_stub.append(ReplaceStub(self.gv, p, str(no), tree, manipulate,
                                                                   'TO_LINK_NUMBER', length_ignore=True))
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    break

                            else:
                                # just replace the components
                                for link in item.strip().split('-'):
                                    to_stub.append(ReplaceStub(self.gv, p, link, tree, manipulate,
                                                               'TO_LINK_NUMBER', length_ignore=True))
                        else:
                            if len(item.strip()) < 60:
                                to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate, 'TO_LINK_NUMBER',
                                                           length_ignore=True))

                        square_bracket_count[item.strip()] = 1
            else:
                for match in reference_test.finditer(text):
                    for item in match.group('text').split(u';'):
                        if len(item.strip()) < 60:
                            to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate))

        for link in to_stub:
            link.link(to_stub)

        etree.strip_elements(tree, 'REMOVE')

        # When the number of distinct bracketed items differs from the number of
        # references that start with a number, the numbers cannot be trusted;
        # fall back to the /index/ of the reference item (-1 for zero-based
        # compensation)
        use_index_method = len(square_bracket_count) != len(references_and_numbers)

        if use_index_method:
            self.debug.print_debug(self, u'Using indexical method for square bracket correlation')

        if len(ref_items) == 0:
            self.debug.print_debug(self, u'Found no references to link')

            manipulate.save_tree(tree)

            return

        for p in tree.xpath('//xref[@rid="TO_LINK_NUMBER"]'):
            text = manipulate.get_stripped_text(p)

            if not use_index_method:
                if text in references_and_numbers:
                    ReplaceObject(self.gv, p, references_and_numbers[text]).link()
                else:
                    p.attrib['rid'] = 'TO_LINK'
            else:
                try:
                    ReplaceObject(self.gv, p, ref_items[int(text) - 1]).link()
                except Exception:
                    self.debug.print_debug(self, u'Failed to link to reference {0} + 1 using '
                                                 u'indexical method'.format(text))
                    p.attrib['rid'] = 'TO_LINK'

        last_ref_index = len(ref_items) - 1

        for p in tree.xpath('//xref[@rid="TO_LINK"]'):
            text = manipulate.get_stripped_text(p)

            bare_items = text.strip().replace(u',', '').split(u' ')

            for ref_index, ref in enumerate(ref_items):
                found = True

                bare_refs = manipulate.get_stripped_text(ref).split(' ')

                # first pass: every citation word must equal some reference word
                # once punctuation is ignored
                for sub_item in bare_items:
                    found_ref = False
                    cleaned_item = re.sub(punctuation, '', sub_item.strip()).strip()

                    for sub_ref in bare_refs:
                        if cleaned_item == sub_ref.strip(punctuation):
                            found_ref = True
                            break

                    if not found_ref:
                        found = False

                if len(bare_items) > 0 and found:
                    to_link.append(ReplaceObject(self.gv, p, ref))

                elif len(bare_items) > 0:
                    # second pass: additionally ignore digits on both sides
                    found = True

                    for sub_item in bare_items:
                        found_ref = False
                        subbed_text = re.sub(punctuation_and_digits, '', sub_item.strip()).strip()

                        for sub_ref in bare_refs:
                            sub_ref = re.sub(punctuation_and_digits, '', sub_ref.strip()).strip()

                            # a word made only of digits/punctuation is ignored
                            # as long as the citation has other words
                            if subbed_text == '' and len(bare_items) > 1:
                                found_ref = True
                                break

                            if subbed_text == sub_ref and subbed_text != '' and sub_ref != '':
                                found_ref = True
                                break

                        if not found_ref:
                            found = False

                    # we don't allow linking to the last item here because it is almost universally wrong
                    if len(bare_items) > 0 and found and ref_index != last_ref_index:
                        to_link.append(ReplaceObject(self.gv, p, ref))

        if len(to_link) == 0:
            self.debug.print_debug(self, u'Found no references to link')

        for link in to_link:
            link.link()

        manipulate.save_tree(tree)
    def run_graphics_sibling(self):
        """Attach a label and caption to each ``graphic`` using neighbouring text.

        For every ``graphic`` a caption candidate is searched for in this order:
        the next sibling of the graphic's parent (if it is a ``p``), the
        previous sibling of the parent (if it is a ``p``), and the first
        ``title`` child of the nearest ancestor whose tag ends in ``sec``.  A
        candidate matches when its stripped text looks like
        ``<words><number>:<text>`` or ``<words><number>.<text>``.  The text
        before the first separator becomes the ``label``, the remainder becomes
        a ``caption/p``, and the source element is removed (a consumed title is
        replaced by an empty one).

        Finally the collected ids and titles are handed to ``self.link`` and the
        tree is written to both NLM file paths.
        """
        # images are hard to handle because Word/OO puts them in different places
        # for instance, the caption can come before or after;
        # <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
        # orientation="portrait" xlink:type="simple"/>

        self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath('//graphic')

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
        graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

        separator = ':'

        for graphic in graphics:
            use_next = False
            use_previous = False

            container = graphic.getparent()

            if container is None:
                # a root-level graphic has no siblings or section to take a caption from
                continue

            # get the siblings of the element holding the graphic
            p = container.getnext()
            pprev = container.getprevious()

            if p is not None and p.tag == 'p':
                text = manipulate.get_stripped_text(p)

                if graphic_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif graphic_regex_dot.match(text):
                    use_next = True
                    separator = '.'

            if not use_next:
                if pprev is not None and pprev.tag == 'p':
                    text = manipulate.get_stripped_text(pprev)

                    if graphic_regex_colon.match(text):
                        use_previous = True
                        separator = ':'
                    elif graphic_regex_dot.match(text):
                        use_previous = True
                        separator = '.'

            # NOTE(review): this parses as ``(not use_next) or use_previous``, so a
            # matching section title takes precedence over a matching previous
            # paragraph; left unchanged to preserve existing behaviour
            if not use_next or use_previous:
                # see if the title in this section potentially contains text we can match
                section = container

                while section is not None and not section.tag.endswith('sec'):
                    section = section.getparent()

                # Looked up after the walk.  An ``else`` attached to the ``while``
                # would run on every normal loop exit and reset the list to
                # empty, which made the title check below unreachable.
                titles = section.xpath('title') if section is not None else []

                if len(titles) > 0:
                    p = titles[0]

                    text = manipulate.get_stripped_text(p)

                    if graphic_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                    elif graphic_regex_dot.match(text):
                        use_next = True
                        separator = '.'

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a figure identifier; split on the FIRST separator
                # only so later colons/full stops stay inside the caption text
                title, _, caption = text.partition(separator)
                title = title.strip()
                caption = caption.strip()

                self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists
                existing_labels = graphic.xpath('label')

                if existing_labels:
                    title_element = existing_labels[0]
                else:
                    title_element = etree.Element('label')
                    graphic.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                new_p = etree.Element('p')
                new_p.text = caption

                NlmManipulate.append_safe(caption_element, new_p, self)
                NlmManipulate.append_safe(graphic, caption_element, self)

                if p.tag.endswith('title'):
                    # keep an (empty) title in place of the one that is consumed
                    new_title = etree.Element('title')
                    new_title.text = ''
                    p.addnext(new_title)

                p.getparent().remove(p)

                if graphic.tail:
                    # drop caption text that also trails the graphic itself
                    graphic.tail = graphic.tail.replace(title + separator, '')
                    graphic.tail = graphic.tail.replace(caption + separator, '')
                    graphic.tail = graphic.tail.replace(caption, '')

                if 'id' not in graphic.attrib:
                    graphic.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

                graphic_titles.append(title)
                graphic_ids.append(graphic.attrib['id'])

        paragraphs = tree.xpath('//p')

        self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
    def run_tables(self):
        """
        Give every ``table-wrap`` element a ``label`` and a ``caption`` taken from nearby text.

        Caption text is looked for, in order of preference, in:

        1. the sibling paragraph directly after the table,
        2. the sibling paragraph directly before the table,
        3. the first ``title`` child of the table's parent element.

        Paragraphs that contain a ``graphic`` child are never used. A candidate is accepted when its
        stripped text matches "<text><number>:<text>" or "<text><number>.<text>"; the colon form is tested
        first. The text before the first separator becomes the label and the remainder becomes the caption
        (as plain text: inline formatting of the source element is not preserved).

        The element the caption came from is removed from the tree. When it was a section title, the
        parent element is additionally merged into the nearest preceding sibling whose tag ends in
        ``sec`` or, failing that, into its own parent if that tag ends in ``sec``.

        Tables without an ``id`` attribute receive a generated ``ID<uuid4>`` one. The collected ids and
        labels are handed to ``self.link`` together with every ``p`` element, and the tree is written to
        both ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        separator = ':'

        for table in tables:
            use_next = False
            use_previous = False
            used_title = False
            old_title = None

            # candidate caption holders: the siblings either side of the table
            p = table.getnext()
            pprev = table.getprevious()

            # first choice: the following paragraph, unless it wraps a graphic
            if p is not None and p.tag == 'p' and not any(sub.tag == 'graphic' for sub in p):
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'

            # second choice: the preceding paragraph. The None test has to come before the children are
            # inspected, because a table that is the first child has no previous sibling at all.
            if not use_next and pprev is not None and pprev.tag == 'p' \
                    and not any(sub.tag == 'graphic' for sub in pprev):
                text = manipulate.get_stripped_text(pprev)

                if table_regex_colon.match(text):
                    use_previous = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_previous = True
                    separator = '.'

            # last resort, only when neither sibling matched: see if the title in this section
            # potentially contains text we can match
            if not (use_next or use_previous):
                parent = table.getparent()

                titles = parent.xpath('title')

                if len(titles) > 0:
                    text = manipulate.get_stripped_text(titles[0])

                    if table_regex_colon.match(text):
                        use_next = True
                        separator = ':'
                    elif table_regex_dot.match(text):
                        use_next = True
                        separator = '.'

                    if use_next:
                        # from here on the title is handled exactly like a matching next paragraph
                        p = titles[0]
                        used_title = True

            if use_next or use_previous:

                if use_next:
                    text = manipulate.get_stripped_text(p)
                else:
                    text = manipulate.get_stripped_text(pprev)
                    p = pprev

                # likely this is a table identifier. Split at the first separator only: any later ':' or
                # '.' belongs to the caption and must survive (e.g. "3.5 mg").
                title, _, remainder = text.partition(separator)
                caption = remainder.strip()

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                if p.tag.endswith('title'):
                    # leave an empty placeholder title behind; it is stripped again further down
                    new_title = etree.Element('title')
                    new_title.text = ''
                    old_title = new_title
                    p.addnext(new_title)
                    p.getparent().remove(p)
                else:
                    p.getparent().remove(p)

                p = new_p

                self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

                # use an existing label element if one exists, otherwise create one as first child
                labels = table.xpath('label')

                if labels:
                    title_element = labels[0]
                else:
                    title_element = etree.Element('label')
                    table.insert(0, title_element)

                title_element.text = title

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, p, self)
                table.insert(1, caption_element)

                if not 'id' in table.attrib:
                    table.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

                table_titles.append(title)
                table_ids.append(table.attrib['id'])

                if used_title:
                    # if we took the title out, then we should move the parent into its previous sibling and then
                    # strip tags
                    old_title.tag = 'REMOVE'

                    etree.strip_elements(tree, 'REMOVE')

                    section = table.getparent()

                    previous = section.getprevious()

                    while previous is not None and not previous.tag.endswith('sec'):
                        previous = previous.getprevious()

                    if previous is not None:
                        previous.append(section)
                        section.tag = 'REMOVE'

                        # strip_tags drops the wrapper but keeps its children in place
                        etree.strip_tags(tree, 'REMOVE')

                        self.debug.print_debug(self, u'Moved table and siblings to previous section')
                    else:
                        previous = section.getparent()

                        if previous is not None and previous.tag.endswith('sec'):
                            previous.append(section)
                            section.tag = 'REMOVE'

                            etree.strip_tags(tree, 'REMOVE')

                            self.debug.print_debug(self, u'Moved table and siblings to parent section')

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Example #28
0
    def run_tables(self):
        """
        Attach a ``label`` and ``caption`` to each ``table-wrap`` element.

        The caption source is the first of these whose stripped text looks
        like "<text><number>:<text>" or "<text><number>.<text>" (colon form
        tested first):

        * the sibling ``p`` directly after the table,
        * the sibling ``p`` directly before the table,
        * the first ``title`` child of the table's parent.

        A sibling ``p`` that has a ``graphic`` child is skipped. The text up
        to the first separator becomes the label, the rest the caption (plain
        text only). The source element is removed from the tree; if it was a
        title, the parent element is then merged into the nearest preceding
        sibling whose tag ends in ``sec``, or else into its own parent when
        that tag ends in ``sec``.

        Tables lacking an ``id`` get ``ID<uuid4>``. Ids and labels are passed
        to ``self.link`` with all ``p`` elements, and the tree is written to
        ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
        """
        self.debug.print_debug(
            self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        tables = tree.xpath('//table-wrap')

        table_titles = []
        table_ids = []
        table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
        table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

        def find_separator(candidate_text):
            # Return the separator that splits label from caption, or None
            # when the text does not look like a table caption.
            if table_regex_colon.match(candidate_text):
                return ':'
            if table_regex_dot.match(candidate_text):
                return '.'
            return None

        for table in tables:
            used_title = False
            old_title = None
            separator = None
            source = None
            text = None

            # Siblings are tried in order: following paragraph first, then
            # the preceding one. Either may be None (table is first or last
            # child), so that is checked before touching tag or children.
            for candidate in (table.getnext(), table.getprevious()):
                if candidate is None or candidate.tag != 'p':
                    continue

                if any(sub.tag == 'graphic' for sub in candidate):
                    continue

                candidate_text = manipulate.get_stripped_text(candidate)
                separator = find_separator(candidate_text)

                if separator is not None:
                    source = candidate
                    text = candidate_text
                    break

            if separator is None:
                # Neither sibling matched: see if the title in this section
                # potentially contains text we can match. This is a
                # fallback only and never overrides a sibling match.
                titles = table.getparent().xpath('title')

                if len(titles) > 0:
                    candidate_text = manipulate.get_stripped_text(titles[0])
                    separator = find_separator(candidate_text)

                    if separator is not None:
                        source = titles[0]
                        text = candidate_text
                        used_title = True

            if separator is None:
                # nothing near this table looks like a caption
                continue

            # Split at the first separator only so that any later ':' or '.'
            # stays inside the caption (e.g. "3.5 mg").
            title, _, remainder = text.partition(separator)
            caption = remainder.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if source.tag.endswith('title'):
                # leave an empty placeholder title; it is stripped below
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                source.addnext(new_title)

            source.getparent().remove(source)

            self.debug.print_debug(
                self,
                u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists
            labels = table.xpath('label')

            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, new_p, self)
            table.insert(1, caption_element)

            if not 'id' in table.attrib:
                table.attrib['id'] = u'ID{0}'.format(unicode(uuid.uuid4()))

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if not used_title:
                continue

            # if we took the title out, then we should move the parent into
            # its previous sibling and then strip tags
            old_title.tag = 'REMOVE'

            etree.strip_elements(tree, 'REMOVE')

            section = table.getparent()

            previous = section.getprevious()

            while previous is not None and not previous.tag.endswith(
                    'sec'):
                previous = previous.getprevious()

            if previous is not None:
                previous.append(section)
                section.tag = 'REMOVE'

                # strip_tags removes the wrapper but keeps its children
                etree.strip_tags(tree, 'REMOVE')

                self.debug.print_debug(
                    self,
                    u'Moved table and siblings to previous section')
            else:
                previous = section.getparent()

                if previous is not None and previous.tag.endswith(
                        'sec'):
                    previous.append(section)
                    section.tag = 'REMOVE'

                    etree.strip_tags(tree, 'REMOVE')

                    self.debug.print_debug(
                        self,
                        u'Moved table and siblings to parent section')

        paragraphs = tree.xpath('//p')

        self.link(table_ids, table_titles, paragraphs, 'table')

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)
Example #29
0
    def run_quirks(self, process_ref_lists):
        """
        Apply a fixed sequence of clean-up passes through ``NlmManipulate``.

        :param process_ref_lists: when true, the manipulator is also asked to
            find potential reference lists and tag bibliography references.
        """
        manipulate = NlmManipulate(self.gv)

        # xpath selecting the line-break marker comments, <!--meTypeset:br-->
        line_break = 'comment()[. = "meTypeset:br"]'

        breaks_as_comments = self.gv.settings.get_setting(
            'linebreaks-as-comments', self)

        if breaks_as_comments == 'False':
            # every marker has to become a new paragraph (or a new title)
            manipulate.close_and_open_tag(line_break, 'p')
            manipulate.close_and_open_tag_not_styled(line_break, 'title')

        # inside table cells and titles the markers are replaced regardless,
        # because there they are real JATS break tags
        for container in ('td', 'title'):
            manipulate.insert_break(line_break, container)

        manipulate.remove_empty_elements('//sec//p')

        if process_ref_lists:
            self.debug.print_debug(self, u'Finding potential reference lists')
            manipulate.find_reference_list()
            manipulate.tag_bibliography_refs()

        # final sweep, run after the optional reference-list pass
        for emptied in ('//sec/list', '//sec/disp-quote',
                        '//back/ref-list/ref'):
            manipulate.remove_empty_elements(emptied)
Example #30
0
    def run_graphics(self):
        """
        Classify captions for ``graphic`` elements that sit inside their caption paragraph.

        Images are hard to handle because Word/OO puts them in different places; for instance, the
        caption can come before or after the image within the same paragraph:

            <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" .../></p>

        For every ``graphic`` whose parent is a ``p`` with stripped text of the form
        "<text><number>:<text>" or "<text><number>.<text>" (colon form tested first), the text before
        the first separator is stored in a ``label`` child of the graphic and the rest in a
        ``caption/p`` child. Label and caption text are also removed from the graphic's tail, and a
        generated ``ID<uuid4>`` id is assigned when the graphic has none.

        The collected ids and labels are passed to ``self.link`` with every ``p`` element, the tree is
        written to ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``, and finally
        ``self.run_graphics_sibling()`` is invoked.
        """
        self.debug.print_debug(self, u"Attempting to classify captions for graphics objects [plain]")

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        graphics = tree.xpath("//graphic")

        graphic_titles = []
        graphic_ids = []
        graphic_regex_dot = re.compile(r"^.+?\s*\d+\..+")
        graphic_regex_colon = re.compile(r"^.+?\s*\d+\:.+")

        for graphic in graphics:
            # the caption, if any, is the text of the paragraph that contains the graphic
            p = graphic.getparent()

            if p is None or p.tag != "p":
                continue

            text = manipulate.get_stripped_text(p)

            if graphic_regex_colon.match(text):
                separator = ":"
            elif graphic_regex_dot.match(text):
                separator = "."
            else:
                # the paragraph does not look like a figure caption
                continue

            # likely this is a figure identifier. Split at the first separator only: any later ':' or
            # '.' is part of the caption and has to be kept, both for the caption itself and so that
            # the caption can be found verbatim in the tail below.
            title, _, remainder = text.partition(separator)
            title = title.strip()
            caption = remainder.strip()

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing label element if one exists, otherwise create one as first child
            labels = graphic.xpath("label")

            if labels:
                title_element = labels[0]
            else:
                title_element = etree.Element("label")
                graphic.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element("caption")
            new_p = etree.Element("p")
            new_p.text = caption

            NlmManipulate.append_safe(caption_element, new_p, self)
            NlmManipulate.append_safe(graphic, caption_element, self)

            if graphic.tail:
                # drop the text that has just been moved into label/caption
                graphic.tail = graphic.tail.replace(title + separator, "")
                graphic.tail = graphic.tail.replace(caption + separator, "")
                graphic.tail = graphic.tail.replace(caption, "")

            if not "id" in graphic.attrib:
                graphic.attrib["id"] = u"ID{0}".format(unicode(uuid.uuid4()))

            graphic_titles.append(title)
            graphic_ids.append(graphic.attrib["id"])

        paragraphs = tree.xpath("//p")

        self.link(graphic_ids, graphic_titles, paragraphs, "fig")

        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)

        self.run_graphics_sibling()
Example #31
0
    def process_zotero(self):
        """
        Try to replace each ``//back/ref-list/ref`` element with a Zotero match.

        The stripped text of every reference is reduced to a search term by a
        fixed series of substitutions and looked up through ``LibZotero``.
        While a search returns nothing and the term has more than two
        space-separated words, the last word is dropped and the search is
        repeated. Only when exactly one result is returned, and its
        ``JATS_format()`` is not ``None``, is the parsed result inserted
        after the original element (keeping the original ``id`` if present),
        followed by an XML comment holding the original text with every
        ``--`` removed. Replaced elements are stripped from the tree, which
        is then saved through the manipulator.
        """
        from zotero import libzotero

        library_setting = unicode(
            self.gv.settings.get_setting(u'zotero', self))
        zotero = libzotero.LibZotero(library_setting, self.gv)

        manipulate = NlmManipulate(self.gv)
        master_tree = manipulate.load_dom_tree()
        references = master_tree.xpath('//back/ref-list/ref')

        # Regex clean-ups applied first, in this order.
        # NOTE: the second pattern deletes every '.', so later rules that
        # contain a literal '.' can no longer match; they are retained so
        # the sequence of substitutions stays exactly as it was.
        leading_patterns = (
            (r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r''),
            (r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', ''),
            (u'[^\s]+?\s[Ee]dition', u' '),
            (u'\s.\s', u' '),
            (u'(?<=[A-Z])\.', u' '),
        )

        # Literal substitutions; order matters and is preserved.
        literal_replacements = (
            (u'“', u''),
            (u'\'s', u''),
            (u'’s', u''),
            (u'’', u''),
            (u' Ed. ', u' '),
            (u' Ed ', u' '),
            (u' Trans. ', u' '),
            (u' Trans ', u' '),
            (u' trans ', u' '),
            (u' trans. ', u' '),
            (u' by. ', u' '),
            (u' by ', u' '),
            (u' ed. ', u' '),
            (u' ed ', u' '),
            (u' In ', u' '),
            (u' in ', u' '),
            (u' print ', u' '),
            (u' Print ', u' '),
            (u' and ', u' '),
            (u'”', u''),
        )

        # Regex clean-ups applied last; the final one collapses whitespace.
        trailing_patterns = (
            (r'[Aa]ccessed', ''),
            (r'meTypesetbr', ''),
            (r'\s+', ' '),
        )

        for element in references:
            original_term = manipulate.get_stripped_text(element)

            term = original_term
            for pattern, substitute in leading_patterns:
                term = re.sub(pattern, substitute, term)
            for needle, substitute in literal_replacements:
                term = term.replace(needle, substitute)
            for pattern, substitute in trailing_patterns:
                term = re.sub(pattern, substitute, term)

            results = zotero.search(term.strip())

            # no results found: shorten the term from the end, one word at
            # a time, until something is found or only two words remain
            while len(results) == 0:
                words = term.strip().split(' ')
                if len(words) <= 2:
                    break
                term = ' '.join(words[:-1])
                results = zotero.search(term.strip())

            # anything other than a single hit is left untouched
            if len(results) != 1:
                continue

            res = results[0].JATS_format()
            if res is None:
                continue

            ref = etree.fromstring(res)
            if 'id' in element.attrib:
                ref.attrib['id'] = element.attrib['id']

            element.addnext(ref)

            # keep the original text next to the replacement as a comment
            original_term = re.sub(u'--', u'', original_term)
            ref.addnext(etree.Comment(original_term))

            # mark the superseded element for removal after the loop
            element.tag = 'REMOVE'

        etree.strip_elements(master_tree, 'REMOVE')

        manipulate.save_tree(master_tree)