def _split_side_for_jsalign(text, lang, note):
    """
    Turn one side of a file pair into a sentence-split list.

    :type text: str
    :type lang: str
    :type note: str
    :rtype: list
    """
    segments = convert.file_to_list(text, lang)
    splitter = util.sentence_splitter(lang)
    segments = text_sent_splitter(segments, splitter)
    # note[0] is a one-character string, so it must be compared with '6'
    # (a comparison with the integer 6 can never be true).
    # NOTE(review): presumably a leading '6' marks the document class that
    # needs French quote handling - confirm against the note format.
    if lang == 'fr' and note[0] == '6':
        # The return value is not used; french_quotes is assumed to modify
        # the list in place - TODO confirm.
        french_quotes(segments)
    return segments


def jsalign_with_error(texts, s_lang, t_lang, note, align_file):
    """
    Write a manual-alignment HTML page for a file pair.

    Both texts are converted to lists, split into sentences and, for French
    texts whose note starts with '6', passed through french_quotes. Every row
    is tagged 'none' and the resulting table is written to
    align_file + '_manual.html'.

    :type texts: list
    :type s_lang: str
    :type t_lang: str
    :type note: str
    :type align_file: str
    :rtype: None
    """
    source_list = _split_side_for_jsalign(texts[0], s_lang, note)
    target_list = _split_side_for_jsalign(texts[1], t_lang, note)
    # One tag per row of the table; the longer side decides the row count.
    tag_list = ['none'] * max(len(source_list), len(target_list))
    jsalign = convert.jsalign_table(source_list, target_list, tag_list,
                                    s_lang, t_lang, note)
    with codecs.open(align_file + '_manual.html', 'w', 'utf-8') as fout:
        fout.write(jsalign)
def test_file_to_list_tries_3(self):
    """
    file_to_list called with 3 as its third argument.

    The expected list has surrounding whitespace stripped, an ordinary
    space in place of the no-break space, and neither the 'u', 'uu', 'uuu'
    lines nor the blank lines. numbering_separator is not covered here.
    """
    raw_text = (
        ' \t\nnon-breaking'
        u"\u00A0"
        'space \n \t '
        'u\n'
        'uu\n'
        'uuu\n'
        ' Another line!\n \n \n \n \n \n \n \n \n \n '
    )
    expected = ['non-breaking space', 'Another line!']
    actual = convert.file_to_list(raw_text, 'ro', 3)
    self.assertEqual(expected, actual)
def smart_aligner(texts, s_lang, t_lang, dictionary, align_file, note,
                  over=True, para_size=PARA_MAX, para_size_small=PARA_MIN,
                  make_dic=True, compress=False):
    """
    Align a pair of texts and write the result as TMX and manual HTML.

    With over=False nothing is done when align_file + '.tmx',
    align_file + '_manual.html' or align_file + '.tmx.gz' already exists.
    When the two converted lists differ in length, or the alignment raises
    StopIteration, the error is logged and jsalign_with_error writes a
    manual-alignment page instead.

    :type texts: list
    :type s_lang: str
    :type t_lang: str
    :type dictionary: str
    :type align_file: str
    :type note: str
    :type over: bool
    :type para_size: int
    :type para_size_small: int
    :type make_dic: bool
    :type compress: bool
    :rtype: None
    """
    if not over:
        existing_outputs = ('.tmx', '_manual.html', '.tmx.gz')
        if any(os.path.isfile(align_file + suffix)
               for suffix in existing_outputs):
            # Already aligned and overwriting was not requested.
            logging.warning("File pair already aligned: %s", align_file)
            return

    source_segments = convert.file_to_list(texts[0], s_lang)
    target_segments = convert.file_to_list(texts[1], t_lang)
    # Debugging aid: the two raw lists can be rendered with
    # convert.jsalign_table and written to align_file + '_manual_0.html'.

    if len(source_segments) != len(target_segments):
        logging.error('Smart alignment failed in %s: %s-%s', note, s_lang,
                      t_lang)
        jsalign_with_error(texts, s_lang, t_lang, note, align_file)
        return

    try:
        alignment_tab = parallel_aligner(
            source_segments, target_segments, s_lang, t_lang, dictionary,
            para_size=para_size, para_size_small=para_size_small,
            note=note, make_dic=make_dic)

        # TMX output.
        tmx_content = convert.tab_to_tmx(alignment_tab, s_lang, t_lang, note)
        with codecs.open(align_file + '.tmx', "w", "utf-8") as tmx_out:
            tmx_out.write(tmx_content)

        # Manual-alignment HTML output, built from the aligned columns.
        aligned_source, aligned_target, aligned_tags = (
            convert.tab_to_separate(alignment_tab))
        html_content = convert.jsalign_table(
            aligned_source, aligned_target, aligned_tags, s_lang, t_lang,
            note)
        with codecs.open(align_file + '_manual.html', 'w',
                         'utf-8') as html_out:
            html_out.write(html_content)

        if compress:
            convert.gzipper(align_file + '.tmx')
            convert.gzipper(align_file + '_manual.html')
    except StopIteration:
        logging.error('StopIteration in %s -> %s, %s', note, s_lang, t_lang)
        jsalign_with_error(texts, s_lang, t_lang, note, align_file)