def run_prompt(self):
    """Interactively review every bibliographic cross-reference.

    The automatic linker is run first (``self.run(False)``). Then each
    ``<xref ref-type="bibr">`` is shown to the user together with what it
    currently points at, and the chosen menu option is passed to
    ``self.handle_input``. The tree is saved when the loop completes or as
    soon as the handler returns ``'abort'``. Once the handler returns
    ``'delall'`` every remaining marker is processed with the ``'d'``
    selection without prompting.
    """
    self.run(False)
    self.debug.print_debug(self, u'Entering interactive mode')

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath('//back/ref-list/ref')

    # The menu never changes, so build it once rather than per marker.
    opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid',
            'enter Link id', 'skip Rest', 'show Context')

    # Deliberately no early exit when there are no references to link: the
    # user may still want to delete some markers.
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)
        prompt.print_(prompt.colorize('green', ("-" * 80)))

        rid = p.attrib.get('rid')

        if rid == 'TO_LINK':
            prompt.print_(
                u"Found an unhandled reference marker: {0}".format(text))
        elif rid is not None:
            remote = next(
                (x for x in ref_items if x.attrib.get('id') == rid), None)
            # Compare against None explicitly: the truth value of an lxml
            # element reflects whether it has children, so a matched <ref>
            # holding only text would otherwise be reported as empty.
            remote_text = manipulate.get_stripped_text(
                remote) if remote is not None else ''
            prompt.print_(
                u"Found a handled reference marker: \"{0}\" which links to \"{1}\""
                .format(text, remote_text))

        sel = 'd' if delete_all else prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items,
                                   sel, tree=tree)

        if result == 'abort':
            manipulate.save_tree(tree)
            return
        elif result == 'delall':
            delete_all = True

    manipulate.save_tree(tree)
def prune(self):
    """Remove every still-unlinked reference stub from the article.

    Loads the document tree, hands each ``<xref ref-type="bibr">`` whose
    ``rid`` is still ``TO_LINK`` to ``self.extract_contents`` and writes the
    tree back.
    """
    self.debug.print_debug(self, u'Deleting all stubs from article')

    nlm = NlmManipulate(self.gv)
    document = nlm.load_dom_tree()

    stub_query = '//xref[@ref-type="bibr" and @rid="TO_LINK"]'
    unlinked_stubs = document.xpath(stub_query)

    for stub in unlinked_stubs:
        self.extract_contents(stub)

    nlm.save_tree(document)
def run_prompt(self):
    """Interactively review every bibliographic cross-reference.

    The automatic linker is run first (``self.run(False)``). Then each
    ``<xref ref-type="bibr">`` is described to the user and the chosen menu
    option is passed to ``self.handle_input``. The tree is saved when the
    loop completes or as soon as the handler returns ``"abort"``. Once the
    handler returns ``"delall"`` every remaining marker is processed with
    the ``"d"`` selection without prompting.
    """
    self.run(False)
    self.debug.print_debug(self, u"Entering interactive mode")

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath("//back/ref-list/ref")

    # The menu never changes, so build it once rather than per marker.
    opts = (
        "Skip",
        "Delete",
        "deleTe all",
        "Enter search",
        "Ibid",
        "enter Link id",
        "skip Rest",
        "show Context",
    )

    # Deliberately no early exit when there are no references to link: the
    # user may still want to delete some markers.
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)

        rid = p.attrib.get("rid")

        if rid == "TO_LINK":
            prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
        elif rid is not None:
            remote = next((x for x in ref_items if x.attrib.get("id") == rid), None)
            # A marker may carry an rid that matches no <ref>; in that case
            # there is no element to extract text from, so show an empty
            # target instead of passing None to the text extractor.
            remote_text = manipulate.get_stripped_text(remote) if remote is not None else ""
            prompt.print_(u'Found a handled reference marker: "{0}" which links to "{1}"'.format(text, remote_text))

        sel = "d" if delete_all else prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

        if result == "abort":
            manipulate.save_tree(tree)
            return
        elif result == "delall":
            delete_all = True

    manipulate.save_tree(tree)
def link_items(self, source_id, dest_id, manipulate=None, tree=None):
    """Link the xref with id ``source_id`` to the ref with id ``dest_id``.

    :param source_id: value of the ``id`` attribute of the ``<xref>`` to link
    :param dest_id: value of the ``id`` attribute of the target ``<ref>``
    :param manipulate: optional ``NlmManipulate`` to reuse; a new one is
        created from ``self.gv`` when omitted
    :param tree: optional already-loaded document tree; loaded through
        ``manipulate`` when omitted
    :raises IndexError: when either element cannot be found in the tree

    The tree is saved after the link has been made.
    """
    self.debug.print_debug(self, u'Attempting to link XREF {0} to REF {1}'.format(source_id, dest_id))

    if manipulate is None:
        manipulate = NlmManipulate(self.gv)

    if tree is None:
        tree = manipulate.load_dom_tree()

    # Pass the ids as XPath variables instead of splicing them into the
    # expression text, so an id containing a quote cannot break the query.
    sources = tree.xpath('//xref[@id=$wanted_id]', wanted_id=source_id)
    if not sources:
        # Same exception type the bare ``[0]`` lookup raised, but with a
        # message that says which element is missing.
        raise IndexError(u'No xref element with id "{0}" was found'.format(source_id))

    dests = tree.xpath('//ref[@id=$wanted_id]', wanted_id=dest_id)
    if not dests:
        raise IndexError(u'No ref element with id "{0}" was found'.format(dest_id))

    ReplaceObject(self.gv, sources[0], dests[0]).link()

    manipulate.save_tree(tree)
def run_prompt(self):
    """Interactively review every bibliographic cross-reference.

    The automatic linker is run first (``self.run(False)``). Then each
    ``<xref ref-type="bibr">`` is shown to the user together with what it
    currently points at, and the chosen menu option is passed to
    ``self.handle_input``. The tree is saved when the loop completes or as
    soon as the handler returns ``'abort'``. Once the handler returns
    ``'delall'`` every remaining marker is processed with the ``'d'``
    selection without prompting.
    """
    self.run(False)
    self.debug.print_debug(self, u'Entering interactive mode')

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath('//back/ref-list/ref')

    # The menu never changes, so build it once rather than per marker.
    opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id', 'skip Rest', 'show Context')

    # Deliberately no early exit when there are no references to link: the
    # user may still want to delete some markers.
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)
        prompt.print_(prompt.colorize('green', ("-" * 80)))

        rid = p.attrib.get('rid')

        if rid == 'TO_LINK':
            prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
        elif rid is not None:
            remote = next((x for x in ref_items if x.attrib.get('id') == rid), None)
            # A marker may carry an rid that matches no <ref>; in that case
            # there is no element to extract text from, so show an empty
            # target instead of passing None to the text extractor.
            remote_text = manipulate.get_stripped_text(remote) if remote is not None else ''
            prompt.print_(u"Found a handled reference marker: \"{0}\" which links to \"{1}\"".format(text,
                                                                                                  remote_text))

        sel = 'd' if delete_all else prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

        if result == 'abort':
            manipulate.save_tree(tree)
            return
        elif result == 'delall':
            delete_all = True

    manipulate.save_tree(tree)
def process_zotero(self):
    """Swap reference-list entries for JATS records found in Zotero.

    For every ``//back/ref-list/ref`` element the plain text is reduced to a
    search term (short numbers, punctuation and common filler words are
    removed) and passed to ``zotero.search``. While nothing is found and the
    term still has more than two words, the last word is dropped and the
    search repeated. Only when exactly one result is returned, and its
    ``JATS_format()`` is not ``None``, is the parsed record inserted after
    the original element (keeping its ``id``), followed by an XML comment
    holding the original text; the original element is then removed. The
    tree is saved at the end.
    """
    from zotero import libzotero

    zotero = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Literal fragments and what each is replaced with. The order matters
    # (for example u'’s' has to be handled before the bare u'’').
    literal_replacements = (
        (u'“', u''),
        (u'\'s', u''),
        (u'’s', u''),
        (u'’', u''),
        (u' Ed. ', u' '),
        (u' Ed ', u' '),
        (u' Trans. ', u' '),
        (u' Trans ', u' '),
        (u' trans ', u' '),
        (u' trans. ', u' '),
        (u' by. ', u' '),
        (u' by ', u' '),
        (u' ed. ', u' '),
        (u' ed ', u' '),
        (u' In ', u' '),
        (u' in ', u' '),
        (u' print ', u' '),
        (u' Print ', u' '),
        (u' and ', u' '),
        (u'”', u''),
    )

    for element in master_tree.xpath('//back/ref-list/ref'):
        original_term = manipulate.get_stripped_text(element)
        term = original_term

        # drop stand-alone numbers of one to three digits
        term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', term)
        # drop punctuation
        term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '', term)
        # drop "<word> edition" / "<word> Edition"
        term = re.sub(u'[^\s]+?\s[Ee]dition', u' ', term)
        # drop single characters standing between whitespace
        term = re.sub(u'\s.\s', u' ', term)
        term = re.sub(u'(?<=[A-Z])\.', u' ', term)

        for fragment, replacement in literal_replacements:
            term = term.replace(fragment, replacement)

        term = re.sub(r'[Aa]ccessed', '', term)
        term = re.sub(r'meTypesetbr', '', term)
        term = re.sub(r'\s+', ' ', term)

        results = zotero.search(term.strip())

        while len(results) == 0 and len(term.strip().split(' ')) > 2:
            # nothing found: drop the last word and search again
            term = ' '.join(term.strip().split(' ')[:-1])
            results = zotero.search(term.strip())

        if len(results) == 1:
            res = results[0].JATS_format()

            if res is not None:
                ref = etree.fromstring(res)

                if 'id' in element.attrib:
                    ref.attrib['id'] = element.attrib['id']

                element.addnext(ref)

                # XML comments may neither contain '--' nor end with '-'.
                # Removing the double dashes alone can still leave a single
                # trailing dash, so strip that as well.
                original_term = re.sub(u'--', u'', original_term).rstrip(u'-')

                comment = etree.Comment(original_term)
                ref.addnext(comment)

                # mark the superseded element for removal after the loop
                element.tag = 'REMOVE'

    etree.strip_elements(master_tree, 'REMOVE')

    manipulate.save_tree(master_tree)
def run(self, interactive):
    """Link in-text citation markers to entries in the reference list.

    :param interactive: when true, delegate to ``self.run_prompt()`` and
        return immediately.

    Otherwise the method works through these stages on the loaded tree:

    1. ``ext-link`` elements whose ``xlink:href`` is the empty string are
       retagged ``REMOVE`` and passed to ``etree.strip_tags``.
    2. ``self.clean_ref_items`` and ``self.process_ibid_authors`` are run
       over the ``//back/ref-list/ref`` elements, and references whose text
       starts with digits are indexed by that number.
    3. Paragraphs and table cells are scanned for square-bracket markers
       (``[1]``, ``[2-4]``, ``[1,3]``) or, failing that, parenthetical
       text; a ``ReplaceStub`` is created for each candidate and linked.
    4. ``xref`` elements with ``rid="TO_LINK_NUMBER"`` are resolved either
       through the leading-number index or through the positional index
       of the reference list; failures are downgraded to ``TO_LINK``.
    5. ``xref`` elements with ``rid="TO_LINK"`` are compared word by word
       with the text of every reference, first exactly and then with
       digits ignored; matches are linked through ``ReplaceObject``.

    The tree is saved at the end (and earlier whenever a stage changed it).
    """
    if interactive:
        self.run_prompt()
        return

    manipulate = NlmManipulate(self.gv)

    tree = manipulate.load_dom_tree()

    # pre-cleanup: remove all empty ext-links as these break the linker
    items_to_clean = tree.xpath('//ext-link')

    count = 0

    for item in items_to_clean:
        if '{http://www.w3.org/1999/xlink}href' in item.attrib and \
                item.attrib['{http://www.w3.org/1999/xlink}href'] == '':
            count += 1
            # retag, then strip the tag from within the parent element
            item.tag = 'REMOVE'
            etree.strip_tags(item.getparent(), 'REMOVE')

    if count > 0:
        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))

    ref_items = tree.xpath('//back/ref-list/ref')

    self.clean_ref_items(tree, ref_items, manipulate)

    # handle numbered reference items: map the leading digits of a
    # reference's text (as a string) to the reference element
    references_and_numbers = {}

    for ref in ref_items:
        text = manipulate.get_stripped_text(ref)
        ref_match = re.compile('^(?P<number>\d+)\.*')
        result = ref_match.match(text)

        if result:
            references_and_numbers[result.group('number')] = ref

    parsed = self.process_ibid_authors(ref_items)

    if parsed > 0:
        manipulate.save_tree(tree)

        self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))

    to_link = []
    to_stub = []

    # distinct single (non-range) entries seen inside square brackets
    square_bracket_count = {}

    for p in tree.xpath('//sec//p[not(mml:math)] | //td',
                        namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):

        text = manipulate.get_stripped_text(p)

        # anything in round brackets that does not contain a percent sign
        reference_test = re.compile('\((?P<text>[^%]+?)\)')
        matches = reference_test.finditer(text)

        # exclude any square brackets with numbers inside
        sub_match = re.compile('\[(?P<square>\d*[,\-;\d\s]*)\]')
        smatch = sub_match.search(text)

        if smatch:
            smatches = sub_match.finditer(text)
            for smatch in smatches:
                self.debug.print_debug(self, u'Handling references in square '
                                             u'brackets: [{0}] '.format(smatch.group('square')))
                for item in re.split(';|,', smatch.group('square')):
                    if '-' in item:
                        # a range such as "2-4"
                        parent, tail = manipulate.find_text(p, item)

                        if parent is not None:
                            new_string = ''

                            # expand the range into "2,3,4,"
                            try:
                                split_range = item.strip().split('-')
                                for no in range(int(split_range[0]), int(split_range[1]) + 1):
                                    new_string += str(no) + ','
                            except:
                                self.debug.print_debug(self, u'Unable to parse reference '
                                                             u'number in range {0}'.format(item))
                                # gives up on the remaining items of this bracket group
                                break

                            # drop the trailing comma unless the original text had one
                            if new_string.endswith(',') and not item.endswith(','):
                                new_string = new_string[0:len(new_string) - 1]

                            # rewrite the range in the document text itself
                            if tail and new_string != '':
                                parent.tail = parent.tail.replace(item, new_string)
                            elif not tail and new_string != '':
                                parent.text = parent.text.replace(item, new_string)

                            # one stub per number in the range
                            try:
                                split_range = item.strip().split('-')
                                for no in range(int(split_range[0]), int(split_range[1]) + 1):
                                    self.debug.print_debug(self, u'Parsing reference '
                                                                 u'number in range {0}'.format(str(no)))

                                    to_stub.append(ReplaceStub(self.gv, p, str(no), tree, manipulate,
                                                               'TO_LINK_NUMBER', length_ignore=True))
                            except:
                                self.debug.print_debug(self, u'Unable to parse reference '
                                                             u'number in range {0}'.format(item))
                                break
                        else:
                            # just replace the components
                            split_range = item.strip().split('-')
                            for link in split_range:
                                to_stub.append(ReplaceStub(self.gv, p, link, tree, manipulate,
                                                           'TO_LINK_NUMBER', length_ignore=True))
                    else:
                        if len(item.strip()) < 60:
                            to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate,
                                                       'TO_LINK_NUMBER', length_ignore=True))

                            square_bracket_count[item.strip()] = 1

        else:
            # NOTE(review): this branch is read as pairing with `if smatch:`,
            # i.e. parenthetical text is only considered when the element has
            # no square-bracket markers - confirm the intended nesting.
            for match in matches:
                for item in match.group('text').split(u';'):
                    if len(item.strip()) < 60:
                        to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate))

    # every stub is handed the complete list of stubs when it is linked
    for link in to_stub:
        link.link(to_stub)

    etree.strip_elements(tree, 'REMOVE')

    use_index_method = False

    if len(square_bracket_count) != len(references_and_numbers):
        # the number of distinct square-bracket entries differs from the number of
        # references that begin with a number, so the leading-number index cannot be
        # trusted; instead use the /index/ of the reference item (-1 for zero-based
        # compensation)
        self.debug.print_debug(self, u'Using indexical method for square bracket correlation')
        use_index_method = True

    if len(ref_items) == 0:
        self.debug.print_debug(self, u'Found no references to link')

        manipulate.save_tree(tree)

        return

    for p in tree.xpath('//xref[@rid="TO_LINK_NUMBER"]'):
        text = manipulate.get_stripped_text(p)

        if not use_index_method:
            if text in references_and_numbers:
                ReplaceObject(self.gv, p, references_and_numbers[text]).link()
            else:
                # no reference starts with this number: leave it for the text matcher below
                p.attrib['rid'] = 'TO_LINK'
        else:
            try:
                ReplaceObject(self.gv, p, ref_items[int(text) - 1]).link()
            except:
                self.debug.print_debug(self, u'Failed to link to reference {0} + 1 using '
                                             u'indexical method'.format(text))
                p.attrib['rid'] = 'TO_LINK'

    for p in tree.xpath('//xref[@rid="TO_LINK"]'):
        text = manipulate.get_stripped_text(p)

        item = text

        # the words of the in-text marker, commas removed
        bare_items = item.strip().replace(u',', '').split(u' ')

        for ref in ref_items:
            found = True

            bare_ref = manipulate.get_stripped_text(ref)

            bare_refs = bare_ref.split(' ')

            # used as a regex character class for re.sub and as a plain
            # set of characters for str.strip below
            replace_chars = '[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"]'

            # first pass: every word of the marker must equal some word of the reference
            for sub_item in bare_items:
                found_ref = False
                for sub_ref in bare_refs:
                    if re.sub(replace_chars, '', sub_item.strip()).strip() == sub_ref.strip(replace_chars):
                        found_ref = True
                        break

                if not found_ref:
                    found = False

            if len(bare_items) > 0 and found:
                to_link.append(ReplaceObject(self.gv, p, ref))

            elif len(bare_items) > 0:
                # second pass: same comparison, but digits are removed as well
                replace_chars = '[,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\d]'

                found = True

                for sub_item in bare_items:
                    found_ref = False
                    subbed_text = re.sub(replace_chars, '', sub_item.strip()).strip()
                    for sub_ref in bare_refs:
                        sub_ref = re.sub(replace_chars, '', sub_ref.strip()).strip()
                        # a word that is empty after stripping counts as matched
                        # when the marker has other words
                        if subbed_text == '' and len(bare_items) > 1:
                            found_ref = True
                            break

                        if subbed_text == sub_ref and subbed_text != '' and sub_ref != '':
                            found_ref = True
                            break

                    if not found_ref:
                        found = False

                # we don't allow linking to the last item here because it is almost universally wrong
                if len(bare_items) > 0 and found and ref_items.index(ref) != len(ref_items) - 1:
                    to_link.append(ReplaceObject(self.gv, p, ref))

    if len(to_link) == 0:
        self.debug.print_debug(self, u'Found no references to link')

    for link in to_link:
        link.link()

    manipulate.save_tree(tree)
def process_zotero(self):
    """Swap reference-list entries for JATS records found in Zotero.

    For every ``//back/ref-list/ref`` element the plain text is reduced to a
    search term (short numbers, punctuation and common filler words are
    removed) and passed to ``zotero.search``. While nothing is found and the
    term still has more than two words, the last word is dropped and the
    search repeated. Only when exactly one result is returned, and its
    ``JATS_format()`` is not ``None``, is the parsed record inserted after
    the original element (keeping its ``id``), followed by an XML comment
    holding the original text; the original element is then removed. The
    tree is saved at the end.
    """
    from zotero import libzotero

    zotero = libzotero.LibZotero(
        unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Literal fragments and what each is replaced with. The order matters
    # (for example u'’s' has to be handled before the bare u'’').
    literal_replacements = (
        (u'“', u''),
        (u'\'s', u''),
        (u'’s', u''),
        (u'’', u''),
        (u' Ed. ', u' '),
        (u' Ed ', u' '),
        (u' Trans. ', u' '),
        (u' Trans ', u' '),
        (u' trans ', u' '),
        (u' trans. ', u' '),
        (u' by. ', u' '),
        (u' by ', u' '),
        (u' ed. ', u' '),
        (u' ed ', u' '),
        (u' In ', u' '),
        (u' in ', u' '),
        (u' print ', u' '),
        (u' Print ', u' '),
        (u' and ', u' '),
        (u'”', u''),
    )

    for element in master_tree.xpath('//back/ref-list/ref'):
        original_term = manipulate.get_stripped_text(element)
        term = original_term

        # drop stand-alone numbers of one to three digits
        term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', term)
        # drop punctuation
        term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '', term)
        # drop "<word> edition" / "<word> Edition"
        term = re.sub(u'[^\s]+?\s[Ee]dition', u' ', term)
        # drop single characters standing between whitespace
        term = re.sub(u'\s.\s', u' ', term)
        term = re.sub(u'(?<=[A-Z])\.', u' ', term)

        for fragment, replacement in literal_replacements:
            term = term.replace(fragment, replacement)

        term = re.sub(r'[Aa]ccessed', '', term)
        term = re.sub(r'meTypesetbr', '', term)
        term = re.sub(r'\s+', ' ', term)

        results = zotero.search(term.strip())

        while len(results) == 0 and len(term.strip().split(' ')) > 2:
            # nothing found: drop the last word and search again
            term = ' '.join(term.strip().split(' ')[:-1])
            results = zotero.search(term.strip())

        if len(results) == 1:
            res = results[0].JATS_format()

            if res is not None:
                ref = etree.fromstring(res)

                if 'id' in element.attrib:
                    ref.attrib['id'] = element.attrib['id']

                element.addnext(ref)

                # XML comments may neither contain '--' nor end with '-'.
                # Removing the double dashes alone can still leave a single
                # trailing dash, so strip that as well.
                original_term = re.sub(u'--', u'', original_term).rstrip(u'-')

                comment = etree.Comment(original_term)
                ref.addnext(comment)

                # mark the superseded element for removal after the loop
                element.tag = 'REMOVE'

    etree.strip_elements(master_tree, 'REMOVE')

    manipulate.save_tree(master_tree)