def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within a
    single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text.

    :param node_text: plain text of the XML node being inspected
    :param tagged_text: the same text with inline XML tags preserved
    :return: list of regex match objects for surviving collapsed markers
    @todo: streamline logic"""
    # In addition to the regex above, keyterms are an acceptable prefix. We
    # therefore convert keyterms to satisfy the above regex
    node_for_keyterms = Node(node_text, node_type=Node.INTERP,
                             label=[get_first_interp_marker(node_text)])
    node_for_keyterms.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        # Mask the keyterm with dots (same length, so offsets are preserved)
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    collapsed_markers = []
    for marker in _first_markers:
        # Matches at offset 0 are initial markers, not collapsed ones
        possible = ((m, m.start(), m.end())
                    for m in marker.finditer(node_text) if m.start() > 0)
        possible = remove_citation_overlaps(node_text, possible)
        # If certain characters follow, kill it
        for following in ("e.", ")", u"”", '"', "'"):
            possible = [(m, s, end) for m, s, end in possible
                        if not node_text[end:].startswith(following)]
        possible = [m for m, _, _ in possible]
        # As all "1." collapsed markers must be emphasized, run a quick
        # check to weed out some false positives. A list comprehension keeps
        # this a list under Python 3, where filter() is a lazy iterator
        if '<E T="03">1' not in tagged_text:
            possible = [m for m in possible if m.group(1) != '1']
        collapsed_markers.extend(possible)
    return collapsed_markers
def collapsed_markers_matches(node_text, tagged_text):
    """Locate "collapsed" markers -- paragraph markers that start partway
    through a single XML node's text -- filtering out citations and other
    false positives. Hacky: works mostly on the plain text with hints taken
    from the tagged text.
    @todo: streamline logic"""
    # Keyterms are an acceptable prefix, so blank any keyterm out with dots
    # of equal length to let the marker regexes match what follows
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    results = []
    for regex in _first_markers:
        # Candidate (match, start, end) triplets; offset 0 is an initial
        # marker, not a collapsed one
        candidates = [(m, m.start(), m.end())
                      for m in regex.finditer(node_text) if m.start() > 0]
        candidates = remove_citation_overlaps(node_text, candidates)
        # A match trailed by any of these strings is a false positive
        for suffix in ("e.", ")", u"”", '"', "'"):
            candidates = [(m, s, e) for m, s, e in candidates
                          if not node_text[e:].startswith(suffix)]
        survivors = [m for m, _, _ in candidates]
        # "1." collapsed markers must always appear emphasized; without the
        # emphasis tag, discard any "1" matches
        if '<E T="03">1' not in tagged_text:
            survivors = [m for m in survivors if m.group(1) != '1']
        results.extend(survivors)
    return results
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers -- tree node paragraphs that begin within a
    single XML node -- in this text, removing citations and other false
    positives. Hacky: focuses on the plain text but takes cues from the
    tagged text.
    @todo: streamline logic"""
    # Keyterms are an acceptable prefix; mask any keyterm with dots of equal
    # length so the marker regexes can match the text that follows it
    keyterm = KeyTerms.keyterm_in_node(Node(
        node_text, node_type=Node.INTERP, tagged_text=tagged_text,
        label=[get_first_interp_marker(node_text)]))
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    results = []
    for regex in _first_markers:
        triplets = [(m, m.start(), m.end())
                    for m in regex.finditer(node_text)]
        # Strip citation overlaps, then keep only the match objects
        matches = [t[0] for t in remove_citation_overlaps(node_text, triplets)]
        results.extend(
            m for m in matches
            if not false_collapsed_marker(m, node_text, tagged_text))
    return results
def collapsed_markers(text):
    """Not all paragraph markers are at the beginning of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent

    :param text: plain paragraph text to scan
    :return: list of marker strings, e.g. ['1', 'i']"""
    potential = list(_collapsed_grammar.scanString(text))
    # remove any that overlap with citations
    potential = list(remove_citation_overlaps(text, potential))
    # flatten the results
    potential = [pm for pms, _, _ in potential for pm in pms]
    # remove any matches that aren't (a), (1), (i), etc. -- All other
    # markers can't be collapsed. A set gives O(1) membership tests
    first_markers = {level[0] for level in p_levels}
    return [pm for pm in potential if pm in first_markers]
def collapsed_markers_matches(node_text):
    """Find collapsed markers -- tree node paragraphs that begin within a
    single XML node -- in this text, removing citations and other false
    positives"""
    found = []
    # A match immediately followed by one of these strings is a false
    # positive; str.startswith accepts the whole tuple at once
    bad_suffixes = ("e.", ")", u"”", '"', "'")
    for regex in _first_markers:
        # Offset-0 matches are initial markers, not collapsed ones
        inner = [(m, m.start(), m.end())
                 for m in regex.finditer(node_text) if m.start() > 0]
        inner = remove_citation_overlaps(node_text, inner)
        found.extend(m for m, _, end in inner
                     if not node_text[end:].startswith(bad_suffixes))
    return found
def collapsed_markers(text):
    """Paragraph markers aren't always at the start of the text; grab inner
    markers such as (1) and (i) in:
    (c) cContent —(1) 1Content (i) iContent"""
    scanned = list(_collapsed_grammar.scanString(text))
    # drop scan results overlapping citations, then flatten to markers
    markers = [pm
               for pms, _, _ in remove_citation_overlaps(text, scanned)
               for pm in pms]
    # only (a), (1), (i), etc. can be collapsed; discard everything else
    allowed = [level[0] for level in p_levels]
    return [pm for pm in markers if pm in allowed]
def get_collapsed_markers(text):
    """Not all paragraph markers are at the beginning of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent

    :param text: plain paragraph text to scan
    :return: flat list of marker tokens found mid-text"""
    matches = []
    for parser in _first_markers:
        matches.extend(parser.scanString(text))
    # remove matches at the beginning. Filter every match rather than only
    # the first list entry: matches are accumulated per-parser, so a later
    # parser's offset-0 match would otherwise slip through
    matches = [m for m in matches if m[1] != 0]
    # remove any that overlap with citations
    matches = [m for m, _, _ in remove_citation_overlaps(text, matches)]
    # flatten the per-parser token lists; a comprehension avoids both
    # functools.reduce and the quadratic list concatenation
    return [token for tokens in matches for token in tokens]
def get_collapsed_markers(text):
    """Paragraph markers aren't always at the start of the text; collect
    inner markers such as (1) and (i) in:
    (c) cContent —(1) 1Content (i) iContent"""
    found = []
    for grammar in _first_markers:
        found.extend(grammar.scanString(text))
    # drop the match at the very beginning of the text, if there is one
    if found and found[0][1] == 0:
        found = found[1:]
    # discard matches that overlap with citations
    found = [tokens for tokens, _, _ in remove_citation_overlaps(text, found)]
    # poor man's flatten of the remaining token groups
    return reduce(lambda acc, toks: list(acc) + list(toks), found, [])