def process(self, node):
    """Build the model-form layer element for ``node``.

    Returns ``None`` unless the node's label id has a truthy entry in
    ``self.model_forms_nodes`` and both the first and the last
    space-separated word of the node's text are non-empty.  Otherwise
    returns a one-element list containing a dict with the keys
    ``start_word``, ``start_locations``, ``end_word`` and
    ``end_locations``.
    """
    label = node.label_id()
    forms = self.model_forms_nodes
    if label not in forms or not forms[label]:
        return None

    if KeyTerms.get_keyterm(node):
        # Keep only what follows the first closing emphasis tag.
        # NOTE(review): when '</E>' is missing, find() returns -1 and the
        # slice silently starts at index 3 -- confirm node.text always
        # contains the tag whenever a keyterm is reported.
        closing_tag = '</E>'
        cut = node.text.find(closing_tag) + len(closing_tag)
        words = node.text[cut:].split(' ')
    else:
        words = KeyTerms.process_node_text(node).split(' ')

    first_word, last_word = words[0], words[-1]
    if not (first_word and last_word):
        return None

    # Number of times the last word occurs, minus one: the zero-based
    # position of the final occurrence among all of its occurrences.
    last_occurrence = words.count(last_word) - 1
    return [{
        'start_word': first_word,
        'start_locations': [0],
        'end_word': last_word,
        'end_locations': [last_occurrence]
    }]
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within a
    single XML node, within this text. Citations and other false positives
    are removed. The search runs over the plain text but takes cues from
    the tagged text.

    Returns a list of the match objects produced by the ``_first_markers``
    patterns that survive the filtering.

    @todo: streamline logic"""
    # A keyterm is an acceptable prefix for a collapsed marker, so mask it
    # with a run of periods of the same length before matching. Because the
    # length is preserved, offsets into node_text are unaffected.
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    # A candidate immediately followed by any of these is discarded
    disqualifying_suffixes = ("e.", ")", u"”", '"', "'")

    matches = []
    for pattern in _first_markers:
        # Only matches that begin after the first character are considered
        candidates = ((m, m.start(), m.end())
                      for m in pattern.finditer(node_text) if m.start() > 0)
        candidates = remove_citation_overlaps(node_text, candidates)
        # As all "1." collapsed markers must be emphasized, a "1" is only
        # accepted when the tagged text contains an emphasized 1
        reject_one = '<E T="03">1' not in tagged_text
        for match, _, end in candidates:
            if node_text.startswith(disqualifying_suffixes, end):
                continue
            if reject_one and match.group(1) == '1':
                continue
            matches.append(match)
    return matches
def collapsed_markers_matches(node_text, tagged_text):
    """Locate collapsed markers in ``node_text``.

    Collapsed markers are tree node paragraphs that begin within a single
    XML node. Citations and other false positives are filtered out. The
    matching is done on the plain text, with hints taken from
    ``tagged_text``. Returns the surviving match objects as a list.

    @todo: streamline logic"""
    # Keyterms may legitimately precede a collapsed marker. Replace the
    # keyterm with periods (same length) so the marker patterns can match
    # right after it.
    probe = Node(node_text, node_type=Node.INTERP,
                 label=[get_first_interp_marker(node_text)])
    probe.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(probe)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    # If any of these character sequences follows a match, kill it
    bad_followers = ("e.", ")", u"”", '"', "'")

    found = []
    for pattern in _first_markers:
        spans = ((m, m.start(), m.end())
                 for m in pattern.finditer(node_text) if m.start() > 0)
        kept = [m
                for m, _, stop in remove_citation_overlaps(node_text, spans)
                if not node_text.startswith(bad_followers, stop)]
        # Every "1." collapsed marker must be emphasized; without an
        # emphasized 1 in the tagged text, drop matches whose group is '1'
        if '<E T="03">1' not in tagged_text:
            kept = [m for m in kept if m.group(1) != '1']
        found.extend(kept)
    return found
def process(self, node):
    """Return the model-form layer data for ``node``, if any.

    A result is produced only when ``node.label_id()`` maps to a truthy
    value in ``self.model_forms_nodes``. The node's text is split on single
    spaces; if both the first and the last resulting word are non-empty, a
    one-element list is returned whose dict records those two words and
    their locations. In every other case the method returns ``None``.
    """
    label = node.label_id()
    is_model_form = (label in self.model_forms_nodes
                     and self.model_forms_nodes[label])
    if not is_model_form:
        return None

    keyterm = KeyTerms.get_keyterm(node)
    if keyterm:
        # Drop everything up to and including the first '</E>'.
        # NOTE(review): find() gives -1 when the tag is absent, which makes
        # the slice begin at index 3 -- verify the tag is always present.
        tag = '</E>'
        text_after_tag = node.text[node.text.find(tag) + len(tag):]
        tokens = text_after_tag.split(' ')
    else:
        tokens = KeyTerms.process_node_text(node).split(' ')

    opening = tokens[0]
    closing = tokens[-1]
    if opening and closing:
        # The end location is the count of tokens equal to the closing
        # word, minus one.
        end_location = tokens.count(closing) - 1
        return [{
            'start_word': opening,
            'start_locations': [0],
            'end_word': closing,
            'end_locations': [end_location]
        }]
    return None
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    The text is split with ``split_paragraph_text`` and one APPENDIX node
    per resulting piece is appended to ``self.nodes``. Each node is
    labelled with the initial marker of its own piece and is given the
    paragraph's ``tagged_text``. No de-duplication of repeated labels is
    attempted.
    """
    # A throwaway node lets KeyTerms look for a keyterm in this paragraph
    probe = Node(text, node_type=Node.APPENDIX)
    probe.tagged_text = tagged_text
    probe.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(probe)

    # To aid in determining collapsed paragraphs, hide any keyterm behind
    # an equally long run of semicolons while the text is being split
    mask = ';' * len(keyterm) if keyterm else None
    masked_text = text.replace(keyterm, mask) if keyterm else text

    for piece in split_paragraph_text(masked_text):
        if keyterm:
            # The nodes still need the original wording
            piece = piece.replace(mask, keyterm)
        para = Node(piece, node_type=Node.APPENDIX,
                    label=[initial_marker(piece)[0]])
        para.tagged_text = tagged_text
        self.nodes.append(para)
def paragraph_with_marker(self, text, tagged_text):
    """Handle a paragraph that has a marker, like (a) or a. etc.

    Splits ``text`` via ``split_paragraph_text`` and appends an APPENDIX
    node to ``self.nodes`` for every chunk. A chunk's label is the first
    element of ``initial_marker(chunk)``; ``tagged_text`` is attached to
    each node unchanged. Nodes are always labelled by their marker alone.
    """
    # Ask KeyTerms for a keyterm using a temporary node built from the
    # whole paragraph
    keyterm_node = Node(text, node_type=Node.APPENDIX)
    keyterm_node.tagged_text = tagged_text
    keyterm_node.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(keyterm_node)

    # To aid in determining collapsed paragraphs, swap the keyterm for a
    # same-length run of ';' before splitting
    if keyterm:
        placeholder = ';' * len(keyterm)
        to_split = text.replace(keyterm, placeholder)
    else:
        placeholder = None
        to_split = text

    for chunk in split_paragraph_text(to_split):
        # Put the keyterm back: the node must carry the original text
        restored = chunk.replace(placeholder, keyterm) if keyterm else chunk
        new_node = Node(restored, node_type=Node.APPENDIX,
                        label=[initial_marker(restored)[0]])
        new_node.tagged_text = tagged_text
        self.nodes.append(new_node)
def replace_markerless(self, stack, node, depth):
    """Assign an index to a MARKERLESS paragraph.

    When the last element of ``node.label`` equals ``mtypes.MARKERLESS`` it
    is replaced in place by ``'p<N>'``. ``N`` is ``keyterm_to_int(keyterm)``
    if the node has a keyterm; otherwise it is one more than the number of
    qualifying markerless nodes found at ``depth`` on ``stack``. Nodes with
    any other final label are left untouched.
    """
    if node.label[-1] != mtypes.MARKERLESS:
        return

    keyterm = KeyTerms.get_keyterm(node, ignore_definitions=False)
    if keyterm:
        p_num = keyterm_to_int(keyterm)
    else:
        level_nodes = stack.peek_level(depth)
        # Only labels shorter than six characters are counted; this
        # filters out keyterm nodes
        already_numbered = sum(
            sibling.is_markerless() and len(sibling.label[-1]) < 6
            for sibling in level_nodes)
        p_num = already_numbered + 1
    node.label[-1] = 'p{}'.format(p_num)
def replace_markerless(self, stack, node, depth):
    """Give a MARKERLESS paragraph a ``p<N>`` label.

    Only nodes whose final label element equals ``mtypes.MARKERLESS`` are
    changed; that element is overwritten in place. For a node with a
    keyterm, ``N`` is ``hash_for_paragraph(keyterm)``. Otherwise ``N`` is
    the count of qualifying markerless nodes at ``depth`` on ``stack``,
    plus one.
    """
    if node.label[-1] != mtypes.MARKERLESS:
        return

    keyterm = KeyTerms.get_keyterm(node, ignore_definitions=False)
    if keyterm:
        suffix = hash_for_paragraph(keyterm)
    else:
        # The len(...) < 6 test filters out keyterm nodes, whose final
        # label element is longer
        counted = sum(
            peer.is_markerless() and len(peer.label[-1]) < 6
            for peer in stack.peek_level(depth))
        suffix = counted + 1
    node.label[-1] = 'p{}'.format(suffix)
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    Splits ``text`` with ``split_paragraph_text`` and appends one APPENDIX
    node per piece to ``self.nodes``, each labelled with the initial marker
    of its piece. ``tagged_text`` is used only to detect a keyterm; it is
    not attached to the appended nodes.
    """
    # Temporary node used solely to ask KeyTerms for a keyterm
    probe = Node(text, node_type=Node.APPENDIX)
    probe.tagged_text = tagged_text
    probe.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(probe)

    # To aid in determining collapsed paragraphs, replace any keyterm with
    # a run of periods of the same length before splitting
    dots = '.' * len(keyterm) if keyterm else None
    masked_text = text.replace(keyterm, dots) if keyterm else text

    for piece in split_paragraph_text(masked_text):
        if keyterm:
            # Restore the original text.
            # NOTE(review): this also rewrites any run of periods of this
            # length that was already in the text (e.g. an ellipsis) --
            # confirm that cannot occur in practice.
            piece = piece.replace(dots, keyterm)
        self.nodes.append(Node(piece, node_type=Node.APPENDIX,
                               label=[initial_marker(piece)[0]]))