def call_apertium(text, mode_path):
    """Run *text* through an Apertium mode and return the processed text.

    ``mode_path`` points at a mode file, as in
    ``'~/apertium-foo-bar/modes/foo-bar.mode'``.  The mode name is the file
    name up to its first dot (``foo-bar``) and the directory passed to
    ``apertium -d`` is the path with its last two components removed
    (``~/apertium-foo-bar``).

    The text is written to the process's stdin as UTF-8 and the process's
    stdout is decoded as UTF-8 before being returned.
    """
    chunks = mode_path.split('/')
    mode = chunks[-1].split('.')[0]
    data_dir = '/'.join(chunks[:-2])
    # Every option is its own argv element: sh does not split an argument on
    # whitespace, so '-u -d <dir>' would reach apertium as a single token.
    p = sh.apertium('-u', '-d', data_dir, mode,
                    _in=text.encode('utf-8'), _encoding='utf-8')
    return p.stdout.decode('utf-8')
def translate_with_apertium(self, input, pair):
    """Translate *input* with the language pair *pair* and return a string.

    The text is written to a temporary text-mode file whose name is passed
    to ``apertium`` together with ``-ud``, ``self.pack_dir`` and *pair*.

    Raises:
        Exception: if the command fails with ``ErrorReturnCode``.
    """
    try:
        # The temporary file only serves as an input buffer for the command.
        with NamedTemporaryFile(mode='w+t') as buffer_file:
            buffer_file.writelines(input)
            buffer_file.seek(0)
            translation = apertium(
                '-ud', self.pack_dir, pair, buffer_file.name)
    except ErrorReturnCode:
        raise Exception('Unable to translate this string.')
    return str(translation)
def launch_apertium(value, input):
    """Return Apertium output for *input*, or the contents of a text file.

    * If *value* ends with ``.mode`` it is treated as a path to a mode file:
      the mode name is the file name up to its first dot, the directory
      passed to ``apertium -d`` is the path minus its last two components,
      and *input* is sent to the process as UTF-8.  The decoded stdout is
      returned.
    * If *value* ends with ``.txt`` the file is read and returned as is;
      *input* is ignored.

    Raises:
        click.BadParameter: if *value* has neither suffix.
    """
    if value.endswith('.mode'):
        chunks = value.split('/')
        mode = chunks[-1].split('.')[0]
        data_dir = '/'.join(chunks[:-2])
        # Every option is its own argv element: sh does not split an
        # argument on whitespace.
        p = sh.apertium('-u', '-d', data_dir, mode,
                        _in=input.encode('utf-8'), _encoding='utf-8')
        return p.stdout.decode('utf-8')
    if value.endswith('.txt'):
        # Context manager so the handle is closed deterministically.
        with open(value) as txt_file:
            return txt_file.read()
    raise click.BadParameter('Invalid argument. Please specify either '
                             'path to .txt file or path to '
                             'Apertium translator / morhological analyzer for your language pair.')
def generate_anmor(value, text):
    """Run *text* through the ``-anmor`` mode guessed from a mode path.

    From a tagger or translation mode, try to guess the morphological
    analyzer mode: the mode name is the file name of *value* up to its first
    dot; a trailing ``-tagger`` component is dropped and ``-anmor`` is
    appended.  The directory passed to ``apertium -d`` is *value* minus its
    last two path components.

    Returns the decoded stdout of the command, or ``None`` when *value* does
    not end with ``.mode``.
    """
    if not value.endswith('.mode'):
        # well, should do without it
        return None

    chunks = value.split('/')
    mode = chunks[-1].split('.')[0]
    data_dir = '/'.join(chunks[:-2])
    if mode.endswith('tagger'):
        mode = '-'.join(mode.split('-')[:-1])
    mode += '-anmor'
    # Every option is its own argv element: sh does not split an argument on
    # whitespace.
    p = sh.apertium('-u', '-d', data_dir, mode,
                    _in=text.encode('utf-8'), _encoding='utf-8')
    return p.stdout.decode('utf-8')
def launch_apertium(value, input):
    """Return Apertium output for *input*, or the contents of a text file.

    * If *value* ends with ``.mode`` it is treated as a path to a mode file:
      the mode name is the file name up to its first dot, the directory
      passed to ``apertium -d`` is the path minus its last two components,
      and *input* is sent to the process as UTF-8.  The decoded stdout is
      returned.
    * If *value* ends with ``.txt`` the file is read and returned as is;
      *input* is ignored.

    Raises:
        click.BadParameter: if *value* has neither suffix.
    """
    if value.endswith('.mode'):
        chunks = value.split('/')
        mode = chunks[-1].split('.')[0]
        data_dir = '/'.join(chunks[:-2])
        # '-d' and its value are separate argv elements: sh does not split an
        # argument on whitespace, so '-d <dir>' would arrive as one token.
        p = sh.apertium('-d', data_dir, mode,
                        _in=input.encode('utf-8'), _encoding='utf-8')
        return p.stdout.decode('utf-8')
    if value.endswith('.txt'):
        # Context manager so the handle is closed deterministically.
        with open(value) as txt_file:
            return txt_file.read()
    # Adjacent literals are concatenated verbatim, so each fragment carries
    # its own trailing space.
    raise click.BadParameter(
        'Invalid argument. Please specify either '
        'path to .txt file or path to '
        'Apertium translator / POS tagger for your language pair.')
def launch_apertium(value, input):
    """Return text obtained from Apertium, a text file, or a list of files.

    The suffix of *value* selects the source:

    * ``.mode`` -- path to a mode file.  The mode name is the file name up to
      its first dot, the directory passed to ``apertium -d`` is the path
      minus its last two components, and *input* is sent to the process as
      UTF-8.  The decoded stdout is returned.
    * ``.txt`` -- the file contents with trailing newlines removed.
    * ``.lst`` -- a list of machine-translated documents.  The first line is
      an offset (directory prefix, a trailing ``/`` is added if missing), the
      remaining lines are file names relative to it.  Each document
      contributes one output line in which its newlines are replaced by
      ``"@@@"``; that marker is processed later by a custom filter in the
      docgisting.html template (ugly hack).  Blank lines in the list are
      skipped.

    *input* is only used for ``.mode``.

    Raises:
        click.BadParameter: if *value* has none of the suffixes above.
    """
    if value.endswith('.mode'):
        chunks = value.split('/')
        mode = chunks[-1].split('.')[0]
        data_dir = '/'.join(chunks[:-2])
        # Every option is its own argv element: sh does not split an
        # argument on whitespace.
        p = sh.apertium('-u', '-d', data_dir, mode,
                        _in=input.encode('utf-8'), _encoding='utf-8')
        return p.stdout.decode('utf-8')

    if value.endswith('.txt'):
        with open(value) as txt_file:
            return txt_file.read().rstrip('\n')

    if value.endswith('.lst'):
        documents = []
        with open(value) as listing:
            offset = listing.readline().rstrip('\n')
            if not offset.endswith('/'):
                offset += '/'
            for entry in listing:
                mtfilename = entry.rstrip('\n')
                if not mtfilename:
                    # A blank line names no document; opening the bare
                    # offset directory would only raise.
                    continue
                with open(offset + mtfilename) as mtfile:
                    mtcontents = mtfile.read()
                # not sure about this last \n
                documents.append(mtcontents.replace('\n', '@@@') + '\n')
        return ''.join(documents)

    raise click.BadParameter(
        'Invalid argument. Please specify '
        'path to .txt file, '
        'path to .lst file, '
        'or path to '
        'Apertium translator / morhological analyzer for your language pair.'
    )
PATH_TO_APERTIUM = "/home/zhake/Source/apertium-eng-kaz"
TRANSLATION_DIRECTION = "eng-kaz"
# NOTE: the backslash continues the string literal onto the next line, so the
# continuation has to start at column 0 -- any indentation would become part
# of the path.
PATH_TO_MORPH_LANG_NMT_MODEL = "/media/zhake/Data/Универ/NMT/Experiments/МЯ с и без \
сегментации/С сегментацией без слов/dataset/nmt_attention_model"

# read the text from standard input
for source_text_line in sys.stdin:
    # ==========
    # run morphological analysis to obtain MorphLang_sl
    # ==========
    # apertium -d '/home/zhake/Source/apertium-eng-kaz' eng-kaz-tagger
    morph_lang_sl = str(
        sh.apertium(
            sh.echo(source_text_line),
            "-d",
            PATH_TO_APERTIUM,
            TRANSLATION_DIRECTION + "-tagger",
        ))
    # lowercase the whole text because the vocabulary is lowercase;
    # this must not happen before morphological analysis, for which case matters
    morph_lang_sl = morph_lang_sl.lower()

    # ==========
    # separate tags and words
    # ==========
    # "clean-up" so that words are separated only by "$^"
    # (raw string: "\$" and "\^" are regex escapes, not string escapes)
    morph_lang_sl = re.sub(pattern=r"\$.+?\^", repl="$^", string=morph_lang_sl)
    # strip surrounding whitespace, the first "^" character and the last "$" character
def local_pairs(self):
    """Return the entries printed by ``apertium -d <self.pack_dir> -l``.

    The command output is split on newlines and every entry is stripped of
    surrounding whitespace.  Blank entries are dropped, so empty output
    yields ``[]`` rather than ``['']``.
    """
    listing = apertium('-d', self.pack_dir, '-l').strip()
    entries = (line.strip() for line in listing.split('\n'))
    return [entry for entry in entries if entry]