def _split_and_tokenize(s):
    """
    Helper; sentence-splits and tokenizes, returning a list comparable
    to what you would get from re.split(r'(\s+)', s).
    """
    from ssplit import en_sentence_boundary_gen
    from tokenise import en_token_boundary_gen

    tokens = []

    sprev = 0
    for sstart, send in en_sentence_boundary_gen(s):
        if sprev != sstart:
            # between-sentence space
            tokens.append(s[sprev:sstart])
        stext = s[sstart:send]
        # token offsets are relative to the sentence text, so the
        # previous-end marker must start at 0, not at sstart (which
        # would mix absolute and relative offsets)
        tprev = 0
        for tstart, tend in en_token_boundary_gen(stext):
            if tprev != tstart:
                # between-token space
                tokens.append(s[sstart + tprev:sstart + tstart])
            tokens.append(s[sstart + tstart:sstart + tend])
            tprev = tend
        sprev = send

    if sprev != len(s):
        # document-final space
        tokens.append(s[sprev:])

    assert "".join(tokens) == s, "INTERNAL ERROR\n'%s'\n'%s'" % ("".join(tokens), s)

    return tokens
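# Usage sketch (hypothetical demo helper, not part of the original code):
# illustrates the round-trip property that _split_and_tokenize enforces
# via its assert -- tokens plus the preserved whitespace pieces reassemble
# the input exactly. Assumes brat's `ssplit` and `tokenise` modules are
# importable.
def _demo_split_and_tokenize():
    doc = u"First sentence here.  And a second one."
    tokens = _split_and_tokenize(doc)
    # whitespace between sentences and tokens comes back as separate
    # list items, so joining the pieces restores the original string
    assert u"".join(tokens) == doc
    return tokens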
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry: the client currently crashes on Chrome
    # for two or more consecutive spaces, so replace every second space
    # with a literal non-breaking space. Note that this is just for the
    # client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    # Then, sentence splitting
    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
    j_dic['sentence_offsets'] = sentence_offsets

    return True
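# Usage sketch (hypothetical demo helper): enriching a fresh JSON dict
# from text that has already been read, so no file access happens.
# Assumes the module-level JAPANESE and NEWLINE_SS configuration flags
# are defined, as they are in the surrounding module.
def _demo_enrich_json():
    j_dic = {}
    _enrich_json_with_text(j_dic, 'unused.txt', raw_text=u"One sentence. Two.")
    # j_dic now holds 'text' plus 'token_offsets' and 'sentence_offsets',
    # each offset list being (start, end) character offset pairs
    return j_dic['sentence_offsets']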
def sentencebreaks_to_newlines(text):
    offsets = [o for o in en_sentence_boundary_gen(text)]

    # adjust to include any initial space skipped by the
    # boundary generator. (TODO: fix generator instead.)
    if offsets and offsets[0][0] > 0:
        offsets.insert(0, (0, offsets[0][0]))

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0] and
                    text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append('\n' + text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if offsets and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(orig_parts), "INTERNAL ERROR:\n '%s'\nvs\n '%s'" % (text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(splittext), "INTERNAL ERROR:\n '%s'\nvs\n '%s'" % (_normspace(text), _normspace(splittext))

    return splittext
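# Usage sketch (hypothetical demo helper): the key property of
# sentencebreaks_to_newlines, checked by its own asserts, is that it is
# length-preserving -- a space turns into a newline but nothing shifts --
# so stand-off annotation offsets computed against the original text
# remain valid for the split text.
def _demo_one_sentence_per_line():
    text = u"First sentence. Second sentence."
    split = sentencebreaks_to_newlines(text)
    assert len(split) == len(text)
    return split.split('\n')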
def split_sentences(text):
    offsets = [o for o in en_sentence_boundary_gen(text)]

    # adjust to include any initial space skipped by the
    # boundary generator. (TODO: fix generator instead.)
    if offsets and offsets[0][0] > 0:
        offsets.insert(0, (0, offsets[0][0]))

    # adjust to include any intervening space: each sentence runs from
    # its own start to the start of the next, and the last to the end
    # of the text, so the spans cover the text without gaps
    adjusted = []
    for i in range(len(offsets) - 1):
        adjusted.append((offsets[i][0], offsets[i + 1][0]))
    if offsets:
        adjusted.append((offsets[-1][0], len(text)))
    offsets = adjusted

    return [s for s in _text_by_offsets_gen(text, offsets)]
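# Usage sketch (hypothetical demo helper): because split_sentences
# extends each sentence span to the start of the next, the returned
# pieces concatenate back to the original text (assuming
# _text_by_offsets_gen yields the exact substring for each span).
def _demo_split_sentences():
    text = u"One here. Two here.  Three here."
    sents = split_sentences(text)
    assert u''.join(sents) == text
    return sents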
def _get_offset_sentence_map(s):
    """
    Helper; sentence-splits and returns a mapping from character
    offsets to sentence numbers.
    """
    from ssplit import en_sentence_boundary_gen

    m = {}  # TODO: why is this a dict and not an array?
    sprev, snum = 0, 1  # note: sentences indexed from 1
    for sstart, send in en_sentence_boundary_gen(s):
        # if there are extra newlines (i.e. more than one) between the
        # previous end and the current start, those need to be added to
        # the sentence number
        snum += max(0, s[sprev:sstart].count("\n") - 1)
        for o in range(sprev, send):
            m[o] = snum
        sprev = send
        snum += 1
    return m
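# Usage sketch (hypothetical demo helper): look up the 1-based sentence
# number for a character offset, e.g. to report which sentence an
# annotation span falls in.
def _demo_offset_sentence_map():
    s = u"First sentence. Second sentence."
    m = _get_offset_sentence_map(s)
    # offset 0 falls in sentence 1; the final character in sentence 2
    return m[0], m[len(s) - 1]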