def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within
    a single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text. @todo: streamline logic"""
    # Keyterms are an acceptable prefix in addition to the marker regexes,
    # so mask any keyterm with dots of the same length to satisfy them
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.keyterm_in_node(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    matches = []
    for marker_regex in _first_markers:
        candidates = [(m, m.start(), m.end())
                      for m in marker_regex.finditer(node_text)
                      if m.start() > 0]
        candidates = remove_citation_overlaps(node_text, candidates)
        # Kill any match immediately followed by characters that signal a
        # citation or quotation rather than a new paragraph
        bad_suffixes = ("e.", ")", "”", '"', "'")
        survivors = [m for m, _, end in candidates
                     if not node_text[end:].startswith(bad_suffixes)]
        # As all "1." collapsed markers must be emphasized, run a quick
        # check to weed out some false positives
        if '<E T="03">1' not in tagged_text:
            survivors = [m for m in survivors if m.group(1) != '1']
        matches.extend(survivors)
    return matches
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within
    a single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text. @todo: streamline logic"""
    # Keyterms are also an acceptable prefix, so mask any keyterm with
    # dots of the same length so it satisfies the marker regexes
    keyterm_node = Node(
        node_text,
        node_type=Node.INTERP,
        tagged_text=tagged_text,
        label=[get_first_interp_marker(node_text)]
    )
    keyterm = KeyTerms.keyterm_in_node(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    results = []
    for marker_regex in _first_markers:
        spans = [(m, m.start(), m.end())
                 for m in marker_regex.finditer(node_text)]
        spans = remove_citation_overlaps(node_text, spans)
        # Keep only matches that survive the false-positive checks
        for match, _, _ in spans:
            if not false_collapsed_marker(match, node_text, tagged_text):
                results.append(match)
    return results
def test_emphasis_later(self):
    """Don't pick up something that is emphasized later in a paragraph as
    a key-term."""
    tagged = '(a) This has a list: apples <E T="03">et seq.</E>'
    node = Node('(a) This has a list: apples et seq.',
                label=['101', '22', 'a'], tagged_text=tagged)
    assert KeyTerms.keyterm_in_node(node) is None
def test_emphasis_later(self):
    """Emphasis that appears later in a paragraph must not be treated as
    a key-term."""
    plain = '(a) This has a list: apples et seq.'
    node = Node(plain, label=['101', '22', 'a'],
                tagged_text=('(a) This has a list: apples '
                             '<E T="03">et seq.</E>'))
    assert KeyTerms.keyterm_in_node(node) is None
def test_emphasis_close_to_front(self):
    """An emphasized word is close to the front, but is not a key term."""
    tagged = '(a) T <E T="03">et seq.</E> has a list: apples'
    node = Node('(a) T et seq. has a list: apples',
                label=['101', '22', 'a'], tagged_text=tagged)
    assert KeyTerms.keyterm_in_node(node) is None
def test_emphasis_close_to_front(self):
    """Emphasis near the front of the paragraph is still not necessarily
    a key term."""
    plain = '(a) T et seq. has a list: apples'
    node = Node(plain, label=['101', '22', 'a'],
                tagged_text=('(a) T <E T="03">et seq.</E> '
                             'has a list: apples'))
    assert KeyTerms.keyterm_in_node(node) is None
def replace_markerless(self, stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs"""
    if node.label[-1] != mtypes.MARKERLESS:
        return
    keyterm = KeyTerms.keyterm_in_node(node, ignore_definitions=False)
    if keyterm:
        # Keyterm paragraphs get a stable, hash-derived index
        p_num = hash_for_paragraph(keyterm)
    else:
        # Otherwise number sequentially among markerless siblings;
        # len(n.label[-1]) < 6 filters out keyterm nodes
        markerless = sum(n.is_markerless() and len(n.label[-1]) < 6
                         for n in stack.peek_level(depth))
        p_num = markerless + 1
    node.label[-1] = 'p{}'.format(p_num)
def replace_markerless(self, stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs"""
    if node.label[-1] == mtypes.MARKERLESS:
        keyterm = KeyTerms.keyterm_in_node(node, ignore_definitions=False)
        if keyterm:
            # Keyterm paragraphs use a hash of the keyterm as their index
            index = hash_for_paragraph(keyterm)
        else:
            # Sequential count of markerless siblings at this depth;
            # len(n.label[-1]) < 6 filters out keyterm nodes
            siblings = stack.peek_level(depth)
            index = 1 + sum(1 for n in siblings
                            if n.is_markerless() and len(n.label[-1]) < 6)
        node.label[-1] = 'p{}'.format(index)
def replace_markerless(stack, node, depth):
    """Assign a unique index to all of the MARKERLESS paragraphs.

    Keyterm paragraphs receive a hash-derived index (with a one-shot
    dedupe for repeated keyterms); other markerless paragraphs are
    numbered sequentially within their depth level.
    """
    if node.label[-1] == mtypes.MARKERLESS:
        keyterm = KeyTerms.keyterm_in_node(node, ignore_definitions=False)
        if keyterm:
            p_num = hash_for_paragraph(keyterm)
            # Sometimes key terms will be repeated and the hash will be
            # identical. This is here to catch that case. (Fixed: this
            # note was previously a bare string expression — a no-op
            # statement evaluated at runtime, not a comment.)
            existing = [item[1].label[0] for item in stack.m_stack[-1]]
            if 'p{0}'.format(p_num) in existing:
                # NOTE(review): only dedupes a single repeat; a third
                # occurrence of the same keyterm would collide again —
                # confirm whether that can happen in practice
                p_num = hash_for_paragraph(keyterm + "dedupe")
        else:
            # len(n.label[-1]) < 6 filters out keyterm nodes
            p_num = sum(n.is_markerless() and len(n.label[-1]) < 6
                        for n in stack.peek_level(depth)) + 1
        node.label[-1] = 'p{0}'.format(p_num)
def paragraph_with_marker(self, text, tagged_text): """The paragraph has a marker, like (a) or a. etc.""" # To aid in determining collapsed paragraphs, replace any # keyterms present node_for_keyterms = Node(text, node_type=Node.APPENDIX) node_for_keyterms.tagged_text = tagged_text node_for_keyterms.label = [initial_marker(text)[0]] keyterm = KeyTerms.keyterm_in_node(node_for_keyterms) if keyterm: mtext = text.replace(keyterm, '.'*len(keyterm)) else: mtext = text for mtext in split_paragraph_text(mtext): if keyterm: # still need the original text mtext = mtext.replace('.'*len(keyterm), keyterm) node = Node(mtext, node_type=Node.APPENDIX, label=[initial_marker(mtext)[0]]) self.nodes.append(node)