def _refine_split(offsets, original_text):
    """Refine sentence (start, end) offsets against original_text.

    Runs the candidate sentences through sspostproc.refine_split, merges
    offsets where the postprocessor joined sentences back together, guards
    against a missing document-final newline, and finally re-splits spans
    on the explicit newlines present in the original document so that the
    author's own line breaks are respected.

    Args:
        offsets: list of (start, end) character offsets into original_text,
            one pair per candidate sentence.
        original_text: the full text the offsets index into.

    Returns:
        A sorted list of (start, end) offsets reflecting the refined split.
    """
    # Postprocessor expects newlines, so add. Also, replace
    # sentence-internal newlines with spaces not to confuse it.
    new_text = "\n".join((original_text[o[0] : o[1]].replace("\n", " ") for o in offsets))
    from sspostproc import refine_split
    output = refine_split(new_text)
    # Align the texts and see where our offsets don't match
    old_offsets = offsets[::-1]
    # Protect against edge case of single-line docs missing
    # sentence-terminal newline
    if len(old_offsets) == 0:
        old_offsets.append((0, len(original_text)))
    new_offsets = []
    for refined_sentence in output.split("\n"):
        new_offset = old_offsets.pop()
        # Merge the offsets if we have received a corrected split
        while new_offset[1] - new_offset[0] < len(refined_sentence) - 1:
            _, next_end = old_offsets.pop()
            new_offset = (new_offset[0], next_end)
        new_offsets.append(new_offset)
    # Protect against missing document-final newline causing the last
    # sentence to fall out of offset scope
    if len(new_offsets) != 0 and new_offsets[-1][1] != len(original_text) - 1:
        start = new_offsets[-1][1] + 1
        # Skip leading whitespace before the trailing sentence, if any
        while start < len(original_text) and original_text[start].isspace():
            start += 1
        if start < len(original_text) - 1:
            new_offsets.append((start, len(original_text) - 1))
    # Finally, inject new-lines from the original document as to respect the
    # original formatting where it is made explicit.
    last_newline = -1
    while True:
        try:
            orig_newline = original_text.index("\n", last_newline + 1)
        except ValueError:
            # No more newlines
            break
        for o_start, o_end in new_offsets:
            if o_start <= orig_newline < o_end:
                # We need to split the existing offsets in two
                # (mutating new_offsets is safe here only because we
                # break out of the for-loop immediately afterwards)
                new_offsets.remove((o_start, o_end))
                new_offsets.extend(((o_start, orig_newline), (orig_newline + 1, o_end)))
                break
            elif o_end == orig_newline:
                # We have already respected this newline
                break
        else:
            # Stand-alone "null" sentence, just insert it
            new_offsets.append((orig_newline, orig_newline))
        last_newline = orig_newline
    new_offsets.sort()
    return new_offsets
def _refine_split(offsets, original_text):
    """Adjust sentence (start, end) offsets to match the corrected split
    produced by sspostproc.refine_split.

    Args:
        offsets: list of (start, end) character offsets into original_text,
            one pair per candidate sentence.
        original_text: the full text the offsets index into.

    Returns:
        A list of (start, end) offsets reflecting the refined split.
    """
    # The postprocessor works on newline-separated sentences; build that
    # representation, flattening newlines inside a sentence to spaces so
    # they are not mistaken for sentence boundaries.
    sentence_texts = (original_text[begin:end].replace('\n', ' ')
                      for begin, end in offsets)
    from sspostproc import refine_split
    refined = refine_split('\n'.join(sentence_texts))
    # Consume the original offsets front-to-back while walking the refined
    # sentences; a reversed copy lets us pop() cheaply from the end.
    pending = offsets[::-1]
    if not pending:
        # Single-line document missing its sentence-terminal newline:
        # fall back to one span covering the whole text.
        pending.append((0, len(original_text)))
    result = []
    for sentence in refined.split('\n'):
        span_start, span_end = pending.pop()
        # When the postprocessor merged sentences, absorb subsequent
        # offsets until the span is wide enough for the refined sentence.
        while span_end - span_start < len(sentence) - 1:
            span_end = pending.pop()[1]
        result.append((span_start, span_end))
    # A missing document-final newline can strand the last sentence
    # outside every offset; cover it explicitly.
    text_len = len(original_text)
    if result and result[-1][1] != text_len - 1:
        cursor = result[-1][1] + 1
        while cursor < text_len and original_text[cursor].isspace():
            cursor += 1
        if cursor < text_len - 1:
            result.append((cursor, text_len - 1))
    return result
def _refine_split(offsets, original_text):
    """Realign sentence (start, end) offsets with the corrected sentence
    split returned by sspostproc.refine_split.

    Args:
        offsets: list of (start, end) character offsets into original_text.
        original_text: the full text the offsets index into.

    Returns:
        A list of (start, end) offsets matching the refined split.
    """
    from sspostproc import refine_split
    # Present candidate sentences to the postprocessor one per line,
    # with any sentence-internal newlines replaced by spaces.
    joined = '\n'.join(
        original_text[s:e].replace('\n', ' ') for s, e in offsets)
    corrected = refine_split(joined)
    # Walk the original offsets in order alongside the corrected sentences.
    source = list(offsets)
    if not source:
        # Guard the single-line, no-terminal-newline edge case with a
        # span covering the whole document.
        source = [(0, len(original_text))]
    aligned = []
    idx = 0
    for line in corrected.split('\n'):
        span_start, span_end = source[idx]
        idx += 1
        # Extend across following offsets whenever the postprocessor
        # merged sentences that we had split apart.
        while span_end - span_start < len(line) - 1:
            span_end = source[idx][1]
            idx += 1
        aligned.append((span_start, span_end))
    # If the document lacks a final newline the last sentence may lie
    # beyond the last offset; append a span that covers it.
    end_of_text = len(original_text)
    if aligned and aligned[-1][1] != end_of_text - 1:
        pos = aligned[-1][1] + 1
        while pos < end_of_text and original_text[pos].isspace():
            pos += 1
        if pos < end_of_text - 1:
            aligned.append((pos, end_of_text - 1))
    return aligned
def sentences(passage, refine=True):
    """Split *passage* into sentences using the spaCy pipeline.

    Args:
        passage: the text to split.
        refine: when True, post-correct the spaCy split with refine_split.

    Returns:
        A list of sentence strings with trailing newlines removed.
    """
    split = []
    if not passage.endswith('\n'):
        passage += '\n'  # spacy needs the newline
    analyzed = pipeline(passage)
    for sentence in analyzed.sents:
        text = str(sentence)
        # Drop empty / whitespace-only "sentences"
        if text and not text.isspace():
            split.append(text.rstrip('\n'))
    if refine:
        # BUG FIX: rstrip only removes trailing newlines; a sentence may
        # still contain internal ones. refine_split treats each line as a
        # separate sentence, and the join/split round trip below would
        # otherwise fracture such sentences — flatten them to spaces first.
        flat = [s.replace('\n', ' ') for s in split]
        split = refine_split('\n'.join(flat)).split('\n')
    return split
def _refine_split(offsets, original_text):
    """Refine sentence (start, end) offsets against original_text.

    Passes the candidate sentences through sspostproc.refine_split, merges
    offsets where the postprocessor re-joined sentences, protects against a
    missing document-final newline, and then re-splits the spans on the
    newlines explicitly present in the original document.

    Args:
        offsets: list of (start, end) character offsets into original_text,
            one pair per candidate sentence.
        original_text: the full text the offsets index into.

    Returns:
        A sorted list of (start, end) offsets reflecting the refined split.
    """
    # Postprocessor expects newlines, so add. Also, replace
    # sentence-internal newlines with spaces not to confuse it.
    new_text = '\n'.join((original_text[o[0]:o[1]].replace('\n', ' ') for o in offsets))
    from sspostproc import refine_split
    output = refine_split(new_text)
    # Align the texts and see where our offsets don't match
    old_offsets = offsets[::-1]
    # Protect against edge case of single-line docs missing
    # sentence-terminal newline
    if len(old_offsets) == 0:
        old_offsets.append((0, len(original_text), ))
    new_offsets = []
    for refined_sentence in output.split('\n'):
        new_offset = old_offsets.pop()
        # Merge the offsets if we have received a corrected split
        while new_offset[1] - new_offset[0] < len(refined_sentence) - 1:
            _, next_end = old_offsets.pop()
            new_offset = (new_offset[0], next_end)
        new_offsets.append(new_offset)
    # Protect against missing document-final newline causing the last
    # sentence to fall out of offset scope
    if len(new_offsets) != 0 and new_offsets[-1][1] != len(original_text)-1:
        start = new_offsets[-1][1]+1
        # Skip any whitespace preceding the stranded trailing sentence
        while start < len(original_text) and original_text[start].isspace():
            start += 1
        if start < len(original_text)-1:
            new_offsets.append((start, len(original_text)-1))
    # Finally, inject new-lines from the original document as to respect the
    # original formatting where it is made explicit.
    last_newline = -1
    while True:
        try:
            orig_newline = original_text.index('\n', last_newline + 1)
        except ValueError:
            # No more newlines
            break
        for o_start, o_end in new_offsets:
            if o_start <= orig_newline < o_end:
                # We need to split the existing offsets in two
                # (mutating new_offsets mid-iteration is safe only
                # because of the immediate break below)
                new_offsets.remove((o_start, o_end))
                new_offsets.extend(((o_start, orig_newline, ),
                    (orig_newline + 1, o_end), ))
                break
            elif o_end == orig_newline:
                # We have already respected this newline
                break
        else:
            # Stand-alone "null" sentence, just insert it
            new_offsets.append((orig_newline, orig_newline, ))
        last_newline = orig_newline
    new_offsets.sort()
    return new_offsets