Esempio n. 1
0
    def run_prompt(self):
        """Walk every bibliographic cross-reference and let the user act on it.

        Calls ``self.run(False)`` first, then loads the document and, for each
        ``<xref ref-type="bibr">``, reports whether the marker is still
        unlinked (``rid="TO_LINK"``) or which reference-list entry it points
        at, asks the user for an action and passes the selection to
        ``self.handle_input``.  The tree is saved after every marker.

        A ``'delall'`` result from ``handle_input`` switches to delete-all
        mode: every remaining marker is sent the ``'d'`` selection without
        prompting.  An ``'abort'`` result saves the tree and returns.
        """
        self.run(False)
        self.debug.print_debug(self, u'Entering interactive mode')

        prompt = Interactive(self.gv)
        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()
        ref_items = tree.xpath('//back/ref-list/ref')

        # The menu never changes between markers, so build it once.
        opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid',
                'enter Link id', 'skip Rest', 'show Context')

        # note that we don't want to exit even if there are no references to
        # link because the user may want to delete some

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)
            prompt.print_(prompt.colorize('green', ("-" * 80)))

            rid = p.attrib.get('rid')

            if rid == 'TO_LINK':
                prompt.print_(
                    u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next(
                    (x for x in ref_items if x.attrib.get('id') == rid), None)
                # Compare against None explicitly: the truth value of an lxml
                # element reflects whether it has children, so a matched <ref>
                # holding only text would otherwise be treated as "not found".
                if remote is not None:
                    remote_text = manipulate.get_stripped_text(remote)
                else:
                    remote_text = ''
                prompt.print_(
                    u"Found a handled reference marker: \"{0}\" which links to \"{1}\""
                    .format(text, remote_text))

            # In delete-all mode the 'd' selection is replayed without asking.
            sel = 'd' if delete_all else prompt.input_options(opts)

            result = self.handle_input(manipulate,
                                       opts,
                                       p,
                                       prompt,
                                       ref_items,
                                       sel,
                                       tree=tree)

            # Persist after every marker, including the one that aborts.
            manipulate.save_tree(tree)

            if result == 'abort':
                return
            if result == 'delall':
                delete_all = True
Esempio n. 2
0
    def prune(self):
        """Pass every still-unlinked bibliographic xref stub to ``extract_contents``.

        Selects each ``<xref ref-type="bibr" rid="TO_LINK">`` in the document,
        hands it to ``self.extract_contents`` and then saves the tree.
        """
        self.debug.print_debug(self, u'Deleting all stubs from article')

        nlm = NlmManipulate(self.gv)
        document = nlm.load_dom_tree()

        unlinked_stubs = document.xpath('//xref[@ref-type="bibr" and @rid="TO_LINK"]')
        for stub in unlinked_stubs:
            self.extract_contents(stub)

        nlm.save_tree(document)
Esempio n. 3
0
    def run_prompt(self):
        """Interactively review each bibliographic xref in the document.

        ``self.run(False)`` is called first.  Then, for every
        ``<xref ref-type="bibr">``, the method prints whether the marker is
        unlinked (``rid="TO_LINK"``) or the text of the ``<ref>`` it points
        to, obtains a selection from the user and forwards it to
        ``self.handle_input``.  The tree is saved after each marker.

        When ``handle_input`` returns ``"delall"`` all remaining markers are
        given the ``"d"`` selection without prompting; when it returns
        ``"abort"`` the tree is saved and the method returns.
        """
        self.run(False)
        self.debug.print_debug(self, u"Entering interactive mode")

        prompt = Interactive(self.gv)
        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()
        ref_items = tree.xpath("//back/ref-list/ref")

        # Loop-invariant menu of actions offered for each marker.
        opts = (
            "Skip",
            "Delete",
            "deleTe all",
            "Enter search",
            "Ibid",
            "enter Link id",
            "skip Rest",
            "show Context",
        )

        # note that we don't want to exit even if there are no references to
        # link because the user may want to delete some

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)
            rid = p.attrib.get("rid")

            if rid == "TO_LINK":
                prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next((x for x in ref_items if x.attrib.get("id") == rid), None)
                # A dangling rid resolves to no <ref>; show an empty target
                # rather than passing None to get_stripped_text.
                remote_text = manipulate.get_stripped_text(remote) if remote is not None else ""
                prompt.print_(u'Found a handled reference marker: "{0}" which links to "{1}"'.format(text, remote_text))

            # Once "delete all" was chosen, replay the delete selection silently.
            sel = "d" if delete_all else prompt.input_options(opts)

            result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

            # Save after every marker, whatever the outcome.
            manipulate.save_tree(tree)

            if result == "abort":
                return
            if result == "delall":
                delete_all = True
Esempio n. 4
0
    def link_items(self, source_id, dest_id, manipulate=None, tree=None):
        """Link the ``<xref>`` with id ``source_id`` to the ``<ref>`` with id ``dest_id``.

        :param source_id: value of the ``id`` attribute of the xref to link
        :param dest_id: value of the ``id`` attribute of the target ref
        :param manipulate: NlmManipulate to reuse; created from ``self.gv`` when omitted
        :param tree: already-loaded DOM tree; loaded through ``manipulate`` when omitted
        :raises IndexError: if either id matches no element in the tree

        The tree is saved through ``manipulate`` after linking.
        """
        self.debug.print_debug(self, u'Attempting to link XREF {0} to REF {1}'.format(source_id, dest_id))

        if manipulate is None:
            manipulate = NlmManipulate(self.gv)

        if tree is None:
            tree = manipulate.load_dom_tree()

        # Pass the ids as XPath variables instead of formatting them into the
        # expression, so an id containing a quote cannot break the query.
        # string() keeps the comparison textual even for a numeric id.
        source = tree.xpath('//xref[@id=string($wanted)]', wanted=source_id)[0]
        dest = tree.xpath('//ref[@id=string($wanted)]', wanted=dest_id)[0]

        ReplaceObject(self.gv, source, dest).link()

        manipulate.save_tree(tree)
Esempio n. 5
0
    def run_prompt(self):
        """Step through all bibliographic xrefs, prompting the user for each one.

        After calling ``self.run(False)``, every ``<xref ref-type="bibr">`` is
        announced (either as unlinked, ``rid="TO_LINK"``, or together with the
        text of the ``<ref>`` its ``rid`` names), the user picks an action and
        the selection is handed to ``self.handle_input``.  The tree is saved
        after each marker.

        A ``'delall'`` result makes every later marker receive the ``'d'``
        selection without a prompt; an ``'abort'`` result saves and returns.
        """
        self.run(False)
        self.debug.print_debug(self, u'Entering interactive mode')

        prompt = Interactive(self.gv)
        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()
        ref_items = tree.xpath('//back/ref-list/ref')

        # The choices are identical for every marker; define them once.
        opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id',
                'skip Rest', 'show Context')

        # note that we don't want to exit even if there are no references to
        # link because the user may want to delete some

        delete_all = False

        for p in tree.xpath('//xref[@ref-type="bibr"]'):
            text = manipulate.get_stripped_text(p)
            prompt.print_(prompt.colorize('green', ("-" * 80)))

            rid = p.attrib.get('rid')

            if rid == 'TO_LINK':
                prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
            elif rid is not None:
                remote = next((x for x in ref_items if x.attrib.get('id') == rid), None)

                # The rid may name a reference that is not in the list; in
                # that case report an empty target instead of handing None
                # to get_stripped_text.
                if remote is None:
                    remote_text = ''
                else:
                    remote_text = manipulate.get_stripped_text(remote)

                prompt.print_(u"Found a handled reference marker: \"{0}\" which links to \"{1}\"".format(text,
                                                                                                         remote_text))

            # Delete-all mode answers 'd' on the user's behalf.
            sel = 'd' if delete_all else prompt.input_options(opts)

            result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

            # The tree is written out after each marker, abort included.
            manipulate.save_tree(tree)

            if result == 'abort':
                return
            if result == 'delall':
                delete_all = True
Esempio n. 6
0
    def process_zotero(self):
        """Replace reference-list entries with structured records found in Zotero.

        For every ``<ref>`` under ``//back/ref-list`` the entry's text is
        reduced to search keywords and sent to the Zotero library.  If no
        result comes back, the last word is dropped and the search repeated
        until something matches or only two words remain.  When exactly one
        result is found and its ``JATS_format()`` is not None, the parsed
        JATS element is inserted after the original (inheriting its ``id``),
        the original text is kept as an XML comment after it, and the
        original ``<ref>`` is removed.  The tree is saved at the end.
        """
        from zotero import libzotero
        zotero = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

        manipulate = NlmManipulate(self.gv)
        master_tree = manipulate.load_dom_tree()
        tree = master_tree.xpath('//back/ref-list/ref')

        # Literal substitutions applied in this exact order after the regex
        # clean-up.  Entries containing "." or "'" can no longer match because
        # that punctuation has already been stripped by then; they are kept so
        # the table mirrors the historical sequence.
        literal_replacements = (
            (u'“', u''),
            (u'\'s', u''),
            (u'’s', u''),
            (u'’', u''),
            (u' Ed. ', u' '),
            (u' Ed ', u' '),
            (u' Trans. ', u' '),
            (u' Trans ', u' '),
            (u' trans ', u' '),
            (u' trans. ', u' '),
            (u' by. ', u' '),
            (u' by ', u' '),
            (u' ed. ', u' '),
            (u' ed ', u' '),
            (u' In ', u' '),
            (u' in ', u' '),
            (u' print ', u' '),
            (u' Print ', u' '),
            (u' and ', u' '),
            (u'”', u''),
        )

        for element in tree:
            original_term = manipulate.get_stripped_text(element)
            term = original_term

            # Drop standalone numbers of one to three digits; longer numbers
            # (for example four-digit years) are left in place.
            term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', term)
            # Strip punctuation.
            term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '', term)
            # Remove "<word> edition" / "<word> Edition".
            term = re.sub(r'[^\s]+?\s[Ee]dition', u' ', term)
            # Remove single characters standing on their own.
            term = re.sub(r'\s.\s', u' ', term)
            term = re.sub(r'(?<=[A-Z])\.', u' ', term)

            for old, new in literal_replacements:
                term = term.replace(old, new)

            term = re.sub(r'[Aa]ccessed', '', term)
            term = re.sub(r'meTypesetbr', '', term)
            # Collapse whitespace runs so splitting on ' ' yields real words.
            term = re.sub(r'\s+', ' ', term)

            results = zotero.search(term.strip())

            while len(results) == 0 and len(term.strip().split(' ')) > 2:
                # no results found: retry with the last word removed
                term = ' '.join(term.strip().split(' ')[:-1])
                results = zotero.search(term.strip())

            if len(results) == 1:
                res = results[0].JATS_format()

                if res is not None:
                    ref = etree.fromstring(res)
                    if 'id' in element.attrib:
                        ref.attrib['id'] = element.attrib['id']

                    element.addnext(ref)

                    # An XML comment may neither contain "--" nor end with
                    # "-".  Removing "--" can leave a single trailing dash
                    # (e.g. from "---"), so strip that as well.
                    comment_text = re.sub(u'--', u'', original_term).rstrip(u'-')

                    comment = etree.Comment(comment_text)
                    ref.addnext(comment)

                    # Mark the original for removal once the loop is done.
                    element.tag = 'REMOVE'

        etree.strip_elements(master_tree, 'REMOVE')

        manipulate.save_tree(master_tree)
Esempio n. 7
0
    def run(self, interactive):
        """Stub and link in-text citation markers against the reference list.

        :param interactive: when true, delegate to ``self.run_prompt()`` and return.

        Non-interactive processing, in order:

        1. unwrap ``ext-link`` elements whose xlink:href attribute is empty;
        2. call ``self.clean_ref_items`` and index the references whose text
           starts with a number;
        3. call ``self.process_ibid_authors`` on the reference list;
        4. scan ``p`` elements inside ``sec`` that have no ``mml:math`` child,
           and ``td`` elements, for square-bracketed numbers or parenthesised
           text, creating a ``ReplaceStub`` per candidate (ranges such as
           ``3-5`` are expanded) and then linking the stubs;
        5. resolve ``rid="TO_LINK_NUMBER"`` xrefs either by the leading number
           of a reference or by its position in the list;
        6. resolve ``rid="TO_LINK"`` xrefs by matching the marker's words
           against the words of each reference.

        The tree is saved through ``manipulate.save_tree`` at the end and at
        some intermediate points.
        """
        if interactive:
            self.run_prompt()
            return

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # pre-cleanup: remove all empty ext-links as these break the linker
        xlink_href = '{http://www.w3.org/1999/xlink}href'
        count = 0

        for item in tree.xpath('//ext-link'):
            if item.attrib.get(xlink_href) == '':
                count += 1
                item.tag = 'REMOVE'
                etree.strip_tags(item.getparent(), 'REMOVE')

        if count > 0:
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))

        ref_items = tree.xpath('//back/ref-list/ref')

        self.clean_ref_items(tree, ref_items, manipulate)

        # handle numbered reference items
        references_and_numbers = {}
        ref_match = re.compile(r'^(?P<number>\d+)\.*')

        for ref in ref_items:
            result = ref_match.match(manipulate.get_stripped_text(ref))

            if result:
                references_and_numbers[result.group('number')] = ref

        parsed = self.process_ibid_authors(ref_items)

        if parsed > 0:
            manipulate.save_tree(tree)

            self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))

        to_link = []
        to_stub = []

        square_bracket_count = {}

        # Compiled once; both patterns are reused for every paragraph/cell.
        reference_test = re.compile(r'\((?P<text>[^%]+?)\)')
        # square brackets containing only digits, separators and whitespace
        sub_match = re.compile(r'\[(?P<square>\d*[,\-;\d\s]*)\]')

        for p in tree.xpath('//sec//p[not(mml:math)] | //td',
                            namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):

            text = manipulate.get_stripped_text(p)

            # If the text has any numeric square brackets, only those are
            # handled; parenthesised text is considered otherwise.
            if sub_match.search(text):
                for smatch in sub_match.finditer(text):
                    self.debug.print_debug(self, u'Handling references in square '
                                                 u'brackets: [{0}] '.format(smatch.group('square')))
                    for item in re.split(';|,', smatch.group('square')):
                        if '-' in item:
                            parent, tail = manipulate.find_text(p, item)

                            if parent is not None:
                                new_string = ''

                                # Expand "a-b" into "a,a+1,...,b,".
                                try:
                                    split_range = item.strip().split('-')
                                    for no in range(int(split_range[0]), int(split_range[1]) + 1):
                                        new_string += str(no) + ','
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    # give up on the rest of this bracket
                                    break

                                if new_string.endswith(',') and not item.endswith(','):
                                    new_string = new_string[0:len(new_string) - 1]

                                # Rewrite the range in the document text itself.
                                if tail and new_string != '':
                                    parent.tail = parent.tail.replace(item, new_string)
                                elif not tail and new_string != '':
                                    parent.text = parent.text.replace(item, new_string)

                                try:
                                    split_range = item.strip().split('-')
                                    for no in range(int(split_range[0]), int(split_range[1]) + 1):
                                        self.debug.print_debug(self, u'Parsing reference '
                                                                     u'number in range {0}'.format(str(no)))

                                        to_stub.append(ReplaceStub(self.gv, p, str(no), tree, manipulate,
                                                                   'TO_LINK_NUMBER', length_ignore=True))
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    break

                            else:
                                # just replace the components
                                split_range = item.strip().split('-')
                                for link in split_range:
                                    to_stub.append(ReplaceStub(self.gv, p, link, tree, manipulate,
                                                               'TO_LINK_NUMBER', length_ignore=True))
                        else:
                            if len(item.strip()) < 60:
                                to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate, 'TO_LINK_NUMBER',
                                                           length_ignore=True))

                        square_bracket_count[item.strip()] = 1
            else:
                for match in reference_test.finditer(text):
                    for item in match.group('text').split(u';'):
                        if len(item.strip()) < 60:
                            to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate))

        for link in to_stub:
            link.link(to_stub)

        etree.strip_elements(tree, 'REMOVE')

        use_index_method = False

        if len(square_bracket_count) != len(references_and_numbers):
            # the number of distinct square-bracket markers does not match the
            # number of references that begin with a number, so we will simply
            # try to use the /index/ of the reference item (-1 for zero-based
            # compensation)
            self.debug.print_debug(self, u'Using indexical method for square bracket correlation')
            use_index_method = True

        if len(ref_items) == 0:
            self.debug.print_debug(self, u'Found no references to link')

            manipulate.save_tree(tree)

            return

        for p in tree.xpath('//xref[@rid="TO_LINK_NUMBER"]'):
            text = manipulate.get_stripped_text(p)

            if not use_index_method:
                if text in references_and_numbers:
                    ReplaceObject(self.gv, p, references_and_numbers[text]).link()
                else:
                    p.attrib['rid'] = 'TO_LINK'
            else:
                try:
                    position = int(text) - 1
                    if position < 0:
                        # a marker of "0" must not wrap round to the last reference
                        raise IndexError(text)
                    ReplaceObject(self.gv, p, ref_items[position]).link()
                except Exception:
                    self.debug.print_debug(self, u'Failed to link to reference {0} + 1 using '
                                                 u'indexical method'.format(text))
                    p.attrib['rid'] = 'TO_LINK'

        # Character classes used both as regexes and (for the strict one) as
        # the character set handed to str.strip().
        strict_chars = r'[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"]'
        loose_chars = r'[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\d]'
        last_index = len(ref_items) - 1

        for p in tree.xpath('//xref[@rid="TO_LINK"]'):
            text = manipulate.get_stripped_text(p)

            # str.split always yields at least one item, so bare_items is
            # never empty.
            bare_items = text.strip().replace(u',', '').split(u' ')

            for ref_index, ref in enumerate(ref_items):
                bare_refs = manipulate.get_stripped_text(ref).split(' ')

                # First attempt: every word of the marker (punctuation removed)
                # must equal some word of the reference (punctuation trimmed).
                found = True

                for sub_item in bare_items:
                    cleaned_item = re.sub(strict_chars, '', sub_item.strip()).strip()
                    found_ref = False
                    for sub_ref in bare_refs:
                        if cleaned_item == sub_ref.strip(strict_chars):
                            found_ref = True
                            break

                    if not found_ref:
                        found = False

                if found:
                    to_link.append(ReplaceObject(self.gv, p, ref))
                    continue

                # Second attempt: same comparison with digits removed as well.
                found = True

                for sub_item in bare_items:
                    found_ref = False
                    subbed_text = re.sub(loose_chars, '', sub_item.strip()).strip()
                    for sub_ref in bare_refs:
                        sub_ref = re.sub(loose_chars, '', sub_ref.strip()).strip()

                        # a marker word that was nothing but digits/punctuation
                        # is ignored when the marker has other words
                        if subbed_text == '' and len(bare_items) > 1:
                            found_ref = True
                            break

                        if subbed_text == sub_ref and subbed_text != '' and sub_ref != '':
                            found_ref = True
                            break

                    if not found_ref:
                        found = False

                # we don't allow linking to the last item here because it is almost universally wrong
                if found and ref_index != last_index:
                    to_link.append(ReplaceObject(self.gv, p, ref))

        if len(to_link) == 0:
            self.debug.print_debug(self, u'Found no references to link')

        for link in to_link:
            link.link()

        manipulate.save_tree(tree)
Esempio n. 8
0
    def process_zotero(self):
        """Swap plain reference entries for structured JATS obtained from Zotero.

        Each ``<ref>`` under ``//back/ref-list`` has its text boiled down to
        keywords which are searched for in the Zotero library.  While a
        search returns nothing and more than two words remain, the final
        word is dropped and the search retried.  If exactly one result is
        found and its ``JATS_format()`` is not None, that JATS is parsed and
        inserted after the original entry (taking over its ``id``), followed
        by an XML comment holding the original text; the original entry is
        then removed.  The tree is saved at the end.
        """
        from zotero import libzotero
        zotero = libzotero.LibZotero(
            unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

        manipulate = NlmManipulate(self.gv)
        master_tree = manipulate.load_dom_tree()
        tree = master_tree.xpath('//back/ref-list/ref')

        # Substrings replaced verbatim, in order, once the regex clean-up has
        # run.  Pairs that include "." or "'" cannot match any more because
        # the punctuation pass removes those characters first; they are
        # retained so the order of operations stays as it always was.
        substitutions = (
            (u'“', u''),
            (u'\'s', u''),
            (u'’s', u''),
            (u'’', u''),
            (u' Ed. ', u' '),
            (u' Ed ', u' '),
            (u' Trans. ', u' '),
            (u' Trans ', u' '),
            (u' trans ', u' '),
            (u' trans. ', u' '),
            (u' by. ', u' '),
            (u' by ', u' '),
            (u' ed. ', u' '),
            (u' ed ', u' '),
            (u' In ', u' '),
            (u' in ', u' '),
            (u' print ', u' '),
            (u' Print ', u' '),
            (u' and ', u' '),
            (u'”', u''),
        )

        for element in tree:
            original_term = manipulate.get_stripped_text(element)
            term = original_term

            # Standalone numbers of up to three digits go; longer ones stay.
            term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', term)
            # Punctuation goes.
            term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '',
                          term)
            # "<word> edition" / "<word> Edition" goes.
            term = re.sub(r'[^\s]+?\s[Ee]dition', u' ', term)
            # Lone characters surrounded by whitespace go.
            term = re.sub(r'\s.\s', u' ', term)
            term = re.sub(r'(?<=[A-Z])\.', u' ', term)

            for needle, replacement in substitutions:
                term = term.replace(needle, replacement)

            term = re.sub(r'[Aa]ccessed', '', term)
            term = re.sub(r'meTypesetbr', '', term)
            # Normalise whitespace so that ' ' cleanly separates words below.
            term = re.sub(r'\s+', ' ', term)

            results = zotero.search(term.strip())

            while len(results) == 0 and len(term.strip().split(' ')) > 2:
                # nothing found: shorten the query by one trailing word
                term = ' '.join(term.strip().split(' ')[:-1])
                results = zotero.search(term.strip())

            if len(results) == 1:
                res = results[0].JATS_format()

                if res is not None:
                    ref = etree.fromstring(res)
                    if 'id' in element.attrib:
                        ref.attrib['id'] = element.attrib['id']

                    element.addnext(ref)

                    # "--" is illegal inside an XML comment and so is a
                    # trailing "-"; deleting "--" from a run such as "---"
                    # leaves one dash behind, so trim the end too.
                    safe_text = re.sub(u'--', u'', original_term)
                    safe_text = safe_text.rstrip(u'-')

                    comment = etree.Comment(safe_text)
                    ref.addnext(comment)

                    # Tag the superseded entry; it is stripped after the loop.
                    element.tag = 'REMOVE'

        etree.strip_elements(master_tree, 'REMOVE')

        manipulate.save_tree(master_tree)