def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Add the document text and token/sentence offsets to j_dic.

    If raw_text is given it is used directly; otherwise the text is read
    from txt_file_path. Tokenisation and sentence splitting follow the
    per-directory configuration.
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: '
                           'nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %s, '
                         'reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    # Then, sentence splitting
    ssplitter = options_get_ssplitter(dirname(txt_file_path))

    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %s, '
                         'reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
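# The offset lists built above are assumed to be plain (start, end)
# character-offset pairs. A minimal, self-contained sketch of a whitespace
# token boundary generator in that spirit (not brat's actual tokenise
# module, whose details may differ):
import re

def _example_whitespace_token_boundary_gen(text):
    # One (start, end) pair per maximal run of non-whitespace characters.
    for match in re.finditer(r'\S+', text):
        yield match.start(), match.end()

# Example: list(_example_whitespace_token_boundary_gen('Hello  brat world'))
# -> [(0, 5), (7, 11), (12, 17)]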
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Add the document text and token/sentence offsets to j_dic.

    Variant that additionally substitutes non-breaking spaces into runs
    of spaces as a client-side display workaround.
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: '
                           'nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry: the client currently crashes on Chrome
    # for two or more consecutive spaces, so replace every second space
    # with a literal non-breaking space. Note that this is just for the
    # client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %s, '
                         'reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    # Then, sentence splitting
    ssplitter = options_get_ssplitter(dirname(txt_file_path))

    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %s, '
                         'reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
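# Quick illustration of how the double-space substitution above behaves
# (assumes Python 2, where unichr() exists; chr(0x00A0) would be the
# Python 3 equivalent). Each non-overlapping pair of spaces becomes a
# space followed by U+00A0, so the text length is unchanged:
#
#     u'a    b'.replace('  ', ' ' + unichr(0x00A0))
#     # -> u'a \xa0 \xa0b'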
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Add the document text and token/sentence offsets to j_dic.

    Variant that additionally generates an XML rendering of the document
    via RepoModel and stores it under j_dic['xml'] (only when the text is
    read from file rather than passed in as raw_text).
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            # Generate <filename>.xml next to the .txt via RepoModel and
            # read it back in alongside the raw text.
            filepath, tempfilename = os.path.split(txt_file_path)
            filename, extension = os.path.splitext(tempfilename)
            r = RepoModel(filepath)
            r.save_xml(filepath)
            xml_file_path = os.path.join(filepath, filename + '.xml')
            with open(xml_file_path, 'r') as xml_file:
                xml = xml_file.read()
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            j_dic['xml'] = xml
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: '
                           'nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %s, '
                         'reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    # Then, sentence splitting
    ssplitter = options_get_ssplitter(dirname(txt_file_path))

    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %s, '
                         'reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
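# Hypothetical usage sketch -- the path below and the surrounding server
# context (open_textfile, Messager, the options_* helpers, RepoModel) are
# assumptions, not defined here. The function fills the passed-in dict in
# place and returns True:
#
#     j_dic = {}
#     _enrich_json_with_text(j_dic, '/path/to/collection/doc-1.txt')
#     # j_dic now holds 'text', 'token_offsets' and 'sentence_offsets'
#     # (plus 'xml' in the RepoModel variant, when the text is read from file).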