def scan_current_node_for_terms(self, event=None):
    """Scan the current node's text for occurrences of defined terms
    (singular or plural) that are not already marked up, record them in
    ``self.terms``, and highlight them in the text widget.

    :param event: optional Tk event, so this can be bound as a callback.
    """
    node_text = self.xml_text.get('1.0', tk.END)
    label = self.current_node.get('label')
    # Debug output of the node label being scanned.
    # FIX: was a Python 2 ``print label`` statement (a syntax error on
    # Python 3); the call form behaves identically on both versions.
    print(label)
    self.terms = []
    self.unmarked_defs.delete(0, tk.END)
    for term, def_label, _ in self.gather_defined_terms():
        term_locations = set(find_all_occurrences(node_text, term))
        plural_term = self.inf.plural(term)
        plural_term_locations = set(
            find_all_occurrences(node_text, plural_term))
        # Every location where either form occurs. (The original
        # ``plural | term ^ plural`` reduces to this plain union.)
        unmarked_locs = list(term_locations | plural_term_locations)
        for start in unmarked_locs:
            # Prefer the plural form when both match at the same offset.
            if start in plural_term_locations:
                term_to_use = plural_term
            elif start in term_locations:
                term_to_use = term
            end = start + len(term_to_use)
            start_index = '1.0 + {} chars'.format(start)
            end_index = '1.0 + {} chars'.format(end)
            # Skip occurrences already inside markup that handles terms.
            if not enclosed_in_tag(node_text, 'ref', start) and \
                    not enclosed_in_tag(node_text, 'def', start) and \
                    not enclosed_in_tag(node_text, 'title', start):
                term_data = (term_to_use, start, end,
                             start_index, end_index, def_label)
                if term_data not in self.terms:
                    self.terms.append(term_data)
    # Sort by end offset (tuple element 2).
    self.terms.sort(key=itemgetter(2))
    for term in self.terms:
        self.unmarked_defs.insert(
            tk.END, '{} [{}]'.format(term[0], term[1]))
        self.xml_text.tag_add('undefined_term', term[3], term[4])
    self.xml_text.tag_configure('undefined_term', background='yellow')
def test_find_all_occurrences(self):
    """find_all_occurrences locates whole-word matches only."""
    sample = "There are many days. Sunday is a day. Saturday is a day too. Days happen.".lower()
    # 'day' must match the standalone word, not 'days'/'sunday'/'saturday'.
    singular_hits = find_all_occurrences(sample, 'day')
    for offset in (33, 52):
        self.assertTrue(offset in singular_hits)
    self.assertEqual(len(singular_hits), 2)
    # 'days' must match both plural occurrences.
    plural_hits = find_all_occurrences(sample, 'days')
    for offset in (15, 61):
        self.assertTrue(offset in plural_hits)
    self.assertEqual(len(plural_hits), 2)
def test_find_all_occurrences(self):
    """Both singular and plural forms are found at the expected offsets."""
    haystack = "There are many days. Sunday is a day. Saturday is a day too. Days happen.".lower()
    # Each (needle, expected offsets) pair exercises whole-word matching.
    cases = (('day', (33, 52)), ('days', (15, 61)))
    for needle, expected in cases:
        found = find_all_occurrences(haystack, needle)
        for offset in expected:
            self.assertTrue(offset in found)
        self.assertEqual(len(found), 2)
def scan_current_node_for_terms(self, event=None):
    """Find unmarked occurrences of defined terms in the current node's
    text, store them in ``self.terms``, list them in the unmarked-defs
    listbox, and highlight them in the editor widget.

    :param event: optional Tk event so this can be used as a handler.
    """
    node_text = self.xml_text.get('1.0', tk.END)
    label = self.current_node.get('label')
    self.terms = []
    self.unmarked_defs.delete(0, tk.END)
    # Occurrences already inside these tags are considered marked up.
    excluded_tags = ('ref', 'def', 'title', 'subject')
    for term, def_label, _ in self.gather_defined_terms():
        singular_locs = set(find_all_occurrences(node_text, term))
        plural = self.inf.plural(term)
        plural_locs = set(find_all_occurrences(node_text, plural))
        # Same set as the original expression: all locations where
        # either the singular or the plural form occurs.
        candidates = plural_locs | singular_locs ^ plural_locs
        for start in candidates:
            # The plural form wins when both match at this offset.
            chosen = plural if start in plural_locs else term
            end = start + len(chosen)
            if any(enclosed_in_tag(node_text, tag, start)
                   for tag in excluded_tags):
                continue
            entry = (chosen, start, end,
                     '1.0 + {} chars'.format(start),
                     '1.0 + {} chars'.format(end),
                     def_label)
            if entry not in self.terms:
                self.terms.append(entry)
    # Order the results by end offset.
    self.terms.sort(key=itemgetter(2))
    for entry in self.terms:
        self.unmarked_defs.insert(
            tk.END, '{} [{}]'.format(entry[0], entry[1]))
        self.xml_text.tag_add('undefined_term', entry[3], entry[4])
    self.xml_text.tag_configure('undefined_term', background='yellow')
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of external citations,
        suitable for direct transformation into JSON for use with the eRegs
        frontend.
    :rtype: :class:`collections.OrderedDict`

    :raises IndexError: if a ``reftype="external"`` target lacks the
        ``type:citation`` form.
    """
    paragraphs = root.findall('.//{eregs}paragraph')
    layer_dict = OrderedDict()

    for paragraph in paragraphs:
        marker = paragraph.get('marker')
        # Prepend the marker so offsets match the rendered paragraph text.
        par_text = marker + ' ' + xml_node_text(
            paragraph.find('{eregs}content'))
        par_label = paragraph.get('label')
        cites = paragraph.findall('.//{eregs}ref[@reftype="external"]')
        citation_list = []
        for cite in cites:
            target = cite.get('target').split(':')
            citation_type = target[0]
            try:
                citation_target = target[1].split('-')
            except IndexError as e:
                # FIX: the original passed two arguments to print(), which
                # prints a tuple under Python 2's print statement; emit one
                # formatted message instead.
                print("Error in external citations: '{}' is not formatted "
                      "properly. Look for an empty or malformed target in a "
                      "reftype=\"external\".".format(target))
                raise e
            text = cite.text
            positions = find_all_occurrences(par_text, text)
            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = citation_type
            cite_dict['offsets'] = [
                [pos, pos + len(text)] for pos in positions
            ]
            # Only record new, non-empty citation entries.
            if cite_dict['offsets'] and cite_dict not in citation_list:
                citation_list.append(cite_dict)
        if citation_list:
            layer_dict[par_label] = citation_list

    return layer_dict
def build_external_citations_layer(root):
    """
    Build the external citations layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict mapping paragraph labels to lists of external
        citation entries, suitable for direct transformation into JSON for
        use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    layer_dict = OrderedDict()

    for paragraph in root.findall('.//{eregs}paragraph'):
        content = paragraph.find('{eregs}content')
        # Offsets are computed against "<marker> <content text>".
        par_text = paragraph.get('marker') + ' ' + xml_node_text(content)
        citation_list = []

        for cite in paragraph.findall('.//{eregs}ref[@reftype="external"]'):
            target = cite.get('target').split(':')
            try:
                citation_target = target[1].split('-')
            except IndexError as e:
                print("Error in external citations: '{}' is not formatted properly. ".format(target),
                      "Look for an empty or malformed target in a reftype=\"external\".")
                raise e

            text = cite.text
            cite_dict = OrderedDict()
            cite_dict['citation'] = citation_target
            cite_dict['citation_type'] = target[0]
            cite_dict['offsets'] = []
            for pos in find_all_occurrences(par_text, text):
                cite_dict['offsets'].append([pos, pos + len(text)])
            # Keep only new, non-empty entries.
            if cite_dict not in citation_list and cite_dict['offsets'] != []:
                citation_list.append(cite_dict)

        if citation_list != []:
            layer_dict[paragraph.get('label')] = citation_list

    return layer_dict
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable
        for direct transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    # 'bonus' is not pluralized correctly by inflect's defaults.
    inf_engine.defnoun('bonus', 'bonuses')
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')
    # FIX: removed a dead ``definitions = root.findall('.//{eregs}def')``
    # assignment here; it was always overwritten per-paragraph before use.

    paragraphs_with_defs = [
        par for par in paragraphs
        if par.find('{eregs}content') is not None
        and par.find('{eregs}content').find('{eregs}def') is not None
    ]

    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        title = paragraph.find('{eregs}title')
        content = apply_formatting(paragraph.find('{eregs}content'))
        par_text = xml_node_text(content).strip()
        definitions = content.findall('{eregs}def')
        total_offset = get_offset(paragraph, marker, title)

        for defn in definitions:
            defined_term = defn.get('term')
            # Key definitions by singular form unless the term is a known
            # "special" singular noun (one that merely looks plural).
            if inf_engine.singular_noun(defined_term.lower()) and not \
                    defined_term.lower() in settings.SPECIAL_SINGULAR_NOUNS:
                key = inf_engine.singular_noun(defined_term.lower()) + \
                    ':' + label
            else:
                key = defined_term.lower() + ':' + label
            def_text = defn.text
            # NOTE(review): assumes def_text occurs in par_text;
            # positions[0] raises IndexError otherwise — confirm upstream
            # validation guarantees this.
            positions = find_all_occurrences(par_text, def_text)
            def_dict = OrderedDict()
            pos = positions[0]
            def_dict['position'] = [
                pos + total_offset, pos + len(def_text) + total_offset
            ]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            if def_dict['position'] != []:
                definitions_dict[key] = def_dict

    for paragraph in paragraphs:
        content = apply_formatting(paragraph.find('{eregs}content'))
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''
        label = paragraph.get('label')

        # If this is a subparagraph of a type that wants an intro paragraph
        # and this paragraph is intro text, set the paragraph's label to
        # reference the parent's.
        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_tree
            label = paragraph.getparent().get('label')

        if len(terms) > 0:
            terms_dict[label] = []

        total_offset = get_offset(paragraph, marker, title)

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Accumulate the text preceding this <ref> to find its offset.
            running_par_text = content.text or ''
            for child in content.getchildren():
                if child != term:
                    tail = child.tail or ''
                    # FIX: guard against elements with no text
                    # (child.text is None), which raised a TypeError.
                    running_par_text += (child.text or '') + tail
                else:
                    break
            text = term.text
            target = term.get('target')
            defn_location = [
                key for key, defn in definitions_dict.items()
                if defn['reference'] == target
            ]
            if len(defn_location) > 0:
                defn_location = defn_location[0]
                term_position = len(running_par_text) + total_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            target = term_targets[term]
            ref_dict = OrderedDict()
            ref_dict['offsets'] = []
            for pos in positions:
                ref_dict['offsets'].append([pos, pos + len(term)])
            ref_dict['ref'] = target
            if len(ref_dict['offsets']) > 0 and \
                    ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict
def build_terms_layer(root):
    """
    Build the terms layer from the provided root of the XML tree.

    :param root: The root element of the XML tree.
    :type root: :class:`etree.Element`

    :return: An OrderedDict containing the locations of terms, suitable
        for direct transformation into JSON for use with the eRegs frontend.
    :rtype: :class:`collections.OrderedDict`
    """
    definitions_dict = OrderedDict()
    terms_dict = OrderedDict()

    inf_engine = inflect.engine()
    # 'bonus' is not pluralized correctly by inflect's defaults.
    inf_engine.defnoun('bonus', 'bonuses')
    paragraphs = root.findall('.//{eregs}paragraph') + \
        root.findall('.//{eregs}interpParagraph')
    # FIX: removed a dead ``definitions = root.findall('.//{eregs}def')``
    # assignment here; it was always overwritten per-paragraph before use.

    paragraphs_with_defs = [
        par for par in paragraphs
        if par.find('{eregs}content') is not None
        and par.find('{eregs}content').find('{eregs}def') is not None
    ]

    for paragraph in paragraphs_with_defs:
        label = paragraph.get('label')
        marker = paragraph.get('marker') or ''
        content = apply_formatting(paragraph.find('{eregs}content'))
        # Offsets for definitions are relative to "<marker> <content>".
        par_text = (marker + ' ' + xml_node_text(content)).strip()
        definitions = content.findall('{eregs}def')

        for defn in definitions:
            defined_term = defn.get('term')
            # Key definitions by singular form unless the term is a known
            # "special" singular noun (one that merely looks plural).
            if inf_engine.singular_noun(defined_term.lower()) and not \
                    defined_term.lower() in settings.SPECIAL_SINGULAR_NOUNS:
                key = inf_engine.singular_noun(defined_term.lower()) + \
                    ':' + label
            else:
                key = defined_term.lower() + ':' + label
            def_text = defn.text
            # NOTE(review): assumes def_text occurs in par_text;
            # positions[0] raises IndexError otherwise — confirm upstream
            # validation guarantees this.
            positions = find_all_occurrences(par_text, def_text)
            def_dict = OrderedDict()
            pos = positions[0]
            def_dict['position'] = [pos, pos + len(def_text)]
            def_dict['reference'] = label
            def_dict['term'] = defined_term
            if def_dict['position'] != []:
                definitions_dict[key] = def_dict

    for paragraph in paragraphs:
        content = apply_formatting(paragraph.find('{eregs}content'))
        terms = content.findall('.//{eregs}ref[@reftype="term"]')
        title = paragraph.find('{eregs}title')
        marker = paragraph.get('marker') or ''
        label = paragraph.get('label')

        if wants_intro_text(
                paragraph.getparent()) and is_intro_text(paragraph):
            # This intro paragraph will get attached to its parent node by
            # build_reg_text
            label = paragraph.getparent().get('label')

        if len(terms) > 0:
            terms_dict[label] = []

        if marker != '' and paragraph.tag != '{eregs}interpParagraph':
            marker_offset = len(marker + ' ')
        else:
            marker_offset = 0

        # Keyterm offset.
        # Note: reg-site treats interp-paragraphs as "special" — they
        # don't get the keyterm text included, so we don't include an
        # offset here.
        if title is not None and title.get('type') == 'keyterm' and \
                paragraph.tag != '{eregs}interpParagraph':
            keyterm_offset = len(title.text)
        else:
            keyterm_offset = 0

        term_positions = OrderedDict()
        term_targets = OrderedDict()

        for term in terms:
            # Accumulate the text preceding this <ref> to find its offset.
            running_par_text = content.text or ''
            for child in content.getchildren():
                if child != term:
                    tail = child.tail or ''
                    # FIX: guard against elements with no text
                    # (child.text is None), which raised a TypeError.
                    running_par_text += (child.text or '') + tail
                else:
                    break
            text = term.text
            target = term.get('target')
            defn_location = [key for key, defn in definitions_dict.items()
                             if defn['reference'] == target]
            if len(defn_location) > 0:
                defn_location = defn_location[0]
                term_position = len(running_par_text) + \
                    marker_offset + keyterm_offset
                term_positions.setdefault(text, []).append(term_position)
                term_targets[text] = defn_location

        for term, positions in term_positions.items():
            target = term_targets[term]
            ref_dict = OrderedDict()
            ref_dict['offsets'] = []
            for pos in positions:
                ref_dict['offsets'].append([pos, pos + len(term)])
            ref_dict['ref'] = target
            if len(ref_dict['offsets']) > 0 and \
                    ref_dict not in terms_dict[label]:
                terms_dict[label].append(ref_dict)

    terms_dict['referenced'] = definitions_dict

    return terms_dict