import math
import subprocess
from tempfile import NamedTemporaryFile
from typing import Dict, List, Tuple

import numpy

import eflomal


def tokenize(an_file, tok_file, full_tags):
    """Map Apertium-style analyses to integer token IDs and write them to
    tok_file in eflomal's text format. Returns the number of sentences."""
    tok_to_id = {}
    id_to_tok = []
    sents = []
    an_file.seek(0)
    for line in an_file:
        cur = []
        for word in line.split('$'):
            if '^' not in word:
                continue
            # Keep only the first analysis of each lexical unit.
            an = word.split('^')[-1].split('/')[0]
            # If the first tag is unknown, truncate the analysis to its
            # first tag only (truncating `an` itself rather than `word`,
            # so the leading '^' marker does not leak into the token).
            if '<' in an and an.split('<')[1][:-1] not in full_tags:
                an = an.split('>')[0] + '>'
            if an not in tok_to_id:
                tok_to_id[an] = len(id_to_tok)
                id_to_tok.append(an)
            cur.append(tok_to_id[an])
        sents.append(numpy.asarray(cur, dtype=numpy.uint32))
    # Manual equivalent of eflomal's text format, kept for reference:
    # tok_file.write('%d %d\n' % (len(sents), len(id_to_tok)))
    # for sn in sents:
    #     if len(sn) > 0x400:
    #         tok_file.write('0\n')
    #     else:
    #         tok_file.write(' '.join(str(n) for n in sn) + '\n')
    eflomal.write_text(tok_file, tuple(sents), len(id_to_tok))
    return len(sents)
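# A minimal usage sketch for tokenize(). The Apertium-style input
# (^lemma<tags>$ tokens) and the tag set below are illustrative
# assumptions, not something the function itself documents.
def _example_tokenize():
    import io
    an_stream = io.StringIO('^dog<n><pl>$ ^bark<vblex><pri><p3><sg>$\n')
    with NamedTemporaryFile('wb') as tok_file:
        n_sents = tokenize(an_stream, tok_file, full_tags={'n', 'vblex'})
        print('tokenized %d sentence(s)' % n_sents)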
def run_eflomal(
        sents: List[Tuple[List[int], List[int]]]) -> List[Dict[int, int]]:
    """Run the eflomal binary on numeric sentence pairs and return one
    source-index -> target-index dictionary per sentence."""
    sl_nums = NamedTemporaryFile('wb+')
    sl = tuple(numpy.asarray(x[0], dtype=numpy.uint32) for x in sents)
    eflomal.write_text(sl_nums, sl, 1 + max(max(x[0]) for x in sents))
    tl_nums = NamedTemporaryFile('wb+')
    tl = tuple(numpy.asarray(x[1], dtype=numpy.uint32) for x in sents)
    eflomal.write_text(tl_nums, tl, 1 + max(max(x[1]) for x in sents))
    # These values mirror the defaults of the eflomal Python interface:
    # model 3 (IBM1+HMM+fertility), one sampler, NULL prior 0.2, and an
    # iteration count scaled to the corpus size.
    defaults = ['-m', '3', '-n', '1', '-N', '0.2']
    iters = max(2, int(round(5000.0 / math.sqrt(len(sents)))))
    iters4 = max(1, iters // 4)
    defaults += ['-1', str(max(2, iters4)),
                 '-2', str(iters4),
                 '-3', str(iters)]
    # align = NamedTemporaryFile('w+')
    align = open('jam-alignments.txt', 'w+')
    subprocess.run(['eflomal',
                    '-s', sl_nums.name,
                    '-t', tl_nums.name,
                    '-f', align.name,
                    '-q'] + defaults)
    align.seek(0)
    ret = []
    for line in align:
        dct = {}
        for nums in line.split():
            src, trg = nums.split('-')
            # A source token aligned to several targets keeps only the
            # last link.
            dct[int(src)] = int(trg)
        ret.append(dct)
    align.close()
    return ret
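# A minimal sketch of driving run_eflomal() with toy numeric sentence
# pairs. The token IDs are arbitrary placeholders; in practice they would
# come from a tokenizer such as tokenize() above. Requires the `eflomal`
# binary on $PATH, and writes jam-alignments.txt to the working directory.
def _example_run_eflomal():
    sents = [([0, 1, 2], [0, 1]),
             ([2, 1], [1, 0]),
             ([0, 2], [0])]
    for i, links in enumerate(run_eflomal(sents)):
        print(i, sorted(links.items()))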
import argparse
import io
import os
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def main():
    parser = argparse.ArgumentParser(
        description='eflomal: efficient low-memory aligner')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true', help='Enable verbose output')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--overwrite', dest='overwrite', action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior', dest='null_prior', default=0.2,
                        metavar='X', type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m', '--model', dest='model', default=3, metavar='N', type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('-M', '--score-model', dest='score_model', default=0,
                        metavar='N', type=int,
                        help='Model used for sentence scoring '
                        '(1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix', dest='source_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix', dest='source_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix', dest='target_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix', dest='target_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l', '--length', dest='length', default=1.0,
                        metavar='X', type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1', '--ibm1-iters', dest='iters1', default=None,
                        metavar='X', type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2', '--hmm-iters', dest='iters2', default=None,
                        metavar='X', type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3', '--fert-iters', dest='iters3', default=None,
        metavar='X', type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers', dest='n_samplers', default=3,
                        metavar='X', type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s', '--source', dest='source_filename', type=str,
                        metavar='filename', help='Source text filename')
    parser.add_argument('-t', '--target', dest='target_filename', type=str,
                        metavar='filename', help='Target text filename')
    parser.add_argument('-i', '--input', dest='joint_filename', type=str,
                        metavar='filename',
                        help='fast_align style ||| separated file')
    parser.add_argument(
        '-f', '--forward-links', dest='links_filename_fwd', type=str,
        metavar='filename',
        help='Filename to write forward direction alignments to')
    parser.add_argument(
        '-r', '--reverse-links', dest='links_filename_rev', type=str,
        metavar='filename',
        help='Filename to write reverse direction alignments to')
    parser.add_argument(
        '-F', '--forward-scores', dest='scores_filename_fwd', type=str,
        metavar='filename',
        help='Filename to write alignment scores to (generation '
        'probability of target sentences)')
    parser.add_argument(
        '-R', '--reverse-scores', dest='scores_filename_rev', type=str,
        metavar='filename',
        help='Filename to write alignment scores to (generation '
        'probability of source sentences)')
    parser.add_argument('-p', '--priors', dest='priors_filename', type=str,
                        metavar='filename', help='File to read priors from')
    args = parser.parse_args()

    if not (args.joint_filename or
            (args.source_filename and args.target_filename)):
        print('ERROR: need to specify either -s and -t, or -i',
              file=sys.stderr, flush=True)
        sys.exit(1)

    for filename in ((args.joint_filename, ) if args.joint_filename else
                     (args.source_filename, args.target_filename)):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    for filename in (args.links_filename_fwd, args.links_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' %
                  filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.priors_filename:
        if args.verbose:
            print('Reading lexical priors from %s...' % args.priors_filename,
                  file=sys.stderr, flush=True)

        priors_list = []   # list of (srcword, trgword, alpha)
        ferf_priors = []   # list of (srcword, fert, alpha)
        ferr_priors = []   # list of (trgword, fert, alpha)
        hmmf_priors = {}   # dict of jump: alpha
        hmmr_priors = {}   # dict of jump: alpha
        with open(args.priors_filename, 'r', encoding='utf-8') as f:
            # 5 types of lines valid:
            #
            # LEX   srcword  trgword  alpha  | lexical prior
            # HMMF  jump     alpha           | target-side HMM prior
            # HMMR  jump     alpha           | source-side HMM prior
            # FERF  srcword  fert     alpha  | source-side fertility p.
            # FERR  trgword  fert     alpha  | target-side fertility p.
            for i, line in enumerate(f):
                fields = line.rstrip('\n').split('\t')
                try:
                    alpha = float(fields[-1])
                except ValueError:
                    print('ERROR: priors file %s line %d contains alpha '
                          'value of "%s" which is not numeric' %
                          (args.priors_filename, i + 1, fields[-1]),
                          file=sys.stderr, flush=True)
                    sys.exit(1)
                if fields[0] == 'LEX' and len(fields) == 4:
                    priors_list.append((fields[1], fields[2], alpha))
                elif fields[0] == 'HMMF' and len(fields) == 3:
                    hmmf_priors[int(fields[1])] = alpha
                elif fields[0] == 'HMMR' and len(fields) == 3:
                    hmmr_priors[int(fields[1])] = alpha
                elif fields[0] == 'FERF' and len(fields) == 4:
                    ferf_priors.append((fields[1], int(fields[2]), alpha))
                elif fields[0] == 'FERR' and len(fields) == 4:
                    ferr_priors.append((fields[1], int(fields[2]), alpha))
                else:
                    print('ERROR: priors file %s line %d is invalid' %
                          (args.priors_filename, i + 1),
                          file=sys.stderr, flush=True)
                    sys.exit(1)

    if args.joint_filename:
        if args.verbose:
            print('Reading source/target sentences from %s...' %
                  args.joint_filename,
                  file=sys.stderr, flush=True)
        with open(args.joint_filename, 'r', encoding='utf-8') as f:
            src_sents_text = []
            trg_sents_text = []
            for i, line in enumerate(f):
                fields = line.strip().split(' ||| ')
                if len(fields) != 2:
                    print('ERROR: line %d of %s does not contain a single '
                          '||| separator, or sentence(s) are empty!'
                          % (i + 1, args.joint_filename),
                          file=sys.stderr, flush=True)
                    sys.exit(1)
                src_sents_text.append(fields[0])
                trg_sents_text.append(fields[1])
            src_text = '\n'.join(src_sents_text) + '\n'
            trg_text = '\n'.join(trg_sents_text) + '\n'
            src_sents_text = None
            trg_sents_text = None

        with io.StringIO(src_text) as f:
            src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                             args.source_suffix_len)
            n_src_sents = len(src_sents)
            src_voc_size = len(src_index)
            srcf = NamedTemporaryFile('wb', delete=False)
            write_text(srcf, tuple(src_sents), src_voc_size)
            src_sents = None
            src_text = None
            srcf.close()

        with io.StringIO(trg_text) as f:
            trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                             args.target_suffix_len)
            trg_voc_size = len(trg_index)
            n_trg_sents = len(trg_sents)
            trgf = NamedTemporaryFile('wb', delete=False)
            write_text(trgf, tuple(trg_sents), trg_voc_size)
            trg_sents = None
            trg_text = None
            trgf.close()
    else:
        if args.verbose:
            print('Reading source text from %s...' % args.source_filename,
                  file=sys.stderr, flush=True)
        with open(args.source_filename, 'r', encoding='utf-8') as f:
            src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                             args.source_suffix_len)
            n_src_sents = len(src_sents)
            src_voc_size = len(src_index)
            srcf = NamedTemporaryFile('wb', delete=False)
            write_text(srcf, tuple(src_sents), src_voc_size)
            src_sents = None
            srcf.close()

        if args.verbose:
            print('Reading target text from %s...' % args.target_filename,
                  file=sys.stderr, flush=True)
        with open(args.target_filename, 'r', encoding='utf-8') as f:
            trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                             args.target_suffix_len)
            trg_voc_size = len(trg_index)
            n_trg_sents = len(trg_sents)
            trgf = NamedTemporaryFile('wb', delete=False)
            write_text(trgf, tuple(trg_sents), trg_voc_size)
            trg_sents = None
            trgf.close()

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr, flush=True)
        sys.exit(1)

    def get_src_index(src_word):
        src_word = src_word.lower()
        if args.source_prefix_len != 0:
            src_word = src_word[:args.source_prefix_len]
        if args.source_suffix_len != 0:
            src_word = src_word[-args.source_suffix_len:]
        e = src_index.get(src_word)
        if e is not None:
            e = e + 1
        return e

    def get_trg_index(trg_word):
        trg_word = trg_word.lower()
        if args.target_prefix_len != 0:
            trg_word = trg_word[:args.target_prefix_len]
        if args.target_suffix_len != 0:
            trg_word = trg_word[-args.target_suffix_len:]
        f = trg_index.get(trg_word)
        if f is not None:
            f = f + 1
        return f

    if args.priors_filename:
        priors_indexed = {}
        for src_word, trg_word, alpha in priors_list:
            if src_word == '<NULL>':
                e = 0
            else:
                e = get_src_index(src_word)
            if trg_word == '<NULL>':
                f = 0
            else:
                f = get_trg_index(trg_word)
            if (e is not None) and (f is not None):
                priors_indexed[(e, f)] = priors_indexed.get((e, f), 0.0) \
                    + alpha

        ferf_indexed = {}
        for src_word, fert, alpha in ferf_priors:
            e = get_src_index(src_word)
            if e is not None:
                ferf_indexed[(e, fert)] = \
                    ferf_indexed.get((e, fert), 0.0) + alpha

        ferr_indexed = {}
        for trg_word, fert, alpha in ferr_priors:
            f = get_trg_index(trg_word)
            if f is not None:
                ferr_indexed[(f, fert)] = \
                    ferr_indexed.get((f, fert), 0.0) + alpha

        if args.verbose:
            print('%d (of %d) pairs of lexical priors used' %
                  (len(priors_indexed), len(priors_list)),
                  file=sys.stderr)

        priorsf = NamedTemporaryFile('w', encoding='utf-8', delete=False)
        print('%d %d %d %d %d %d %d' %
              (len(src_index) + 1, len(trg_index) + 1, len(priors_indexed),
               len(hmmf_priors), len(hmmr_priors), len(ferf_indexed),
               len(ferr_indexed)),
              file=priorsf)

        for (e, f), alpha in sorted(priors_indexed.items()):
            print('%d %d %g' % (e, f, alpha), file=priorsf)

        for jump, alpha in sorted(hmmf_priors.items()):
            print('%d %g' % (jump, alpha), file=priorsf)

        for jump, alpha in sorted(hmmr_priors.items()):
            print('%d %g' % (jump, alpha), file=priorsf)

        for (e, fert), alpha in sorted(ferf_indexed.items()):
            print('%d %d %g' % (e, fert, alpha), file=priorsf)

        for (f, fert), alpha in sorted(ferr_indexed.items()):
            print('%d %d %g' % (f, fert, alpha), file=priorsf)

        priorsf.flush()
        priorsf.close()

    trg_index = None
    src_index = None

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr, flush=True)

    align(srcf.name, trgf.name,
          links_filename_fwd=args.links_filename_fwd,
          links_filename_rev=args.links_filename_rev,
          statistics_filename=None,
          scores_filename_fwd=args.scores_filename_fwd,
          scores_filename_rev=args.scores_filename_rev,
          priors_filename=(None if args.priors_filename is None
                           else priorsf.name),
          model=args.model,
          score_model=args.score_model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()
    if args.priors_filename:
        priorsf.close()
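# A hedged sketch of a priors file in the format main() parses above:
# tab-separated lines of type LEX/HMMF/HMMR/FERF/FERR with the alpha
# value last. The words and alpha values are illustrative placeholders.
def _example_write_priors(path='priors.txt'):
    rows = [('LEX', 'house', 'maison', 2.0),   # lexical prior
            ('HMMF', 1, 0.5),                  # target-side HMM jump prior
            ('HMMR', -1, 0.5),                 # source-side HMM jump prior
            ('FERF', 'house', 1, 1.0),         # source-side fertility prior
            ('FERR', 'maison', 1, 1.0)]        # target-side fertility prior
    with open(path, 'w', encoding='utf-8') as f:
        for row in rows:
            f.write('\t'.join(str(x) for x in row) + '\n')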
import argparse
import os
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def main():
    parser = argparse.ArgumentParser(
        description='eflomal: efficient low-memory aligner')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true', help='Enable verbose output')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--overwrite', dest='overwrite', action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior', dest='null_prior', default=0.2,
                        metavar='X', type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m', '--model', dest='model', default=3, metavar='N', type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix', dest='source_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix', dest='source_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix', dest='target_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix', dest='target_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l', '--length', dest='length', default=1.0,
                        metavar='X', type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1', '--ibm1-iters', dest='iters1', default=None,
                        metavar='X', type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2', '--hmm-iters', dest='iters2', default=None,
                        metavar='X', type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3', '--fert-iters', dest='iters3', default=None,
        metavar='X', type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers', dest='n_samplers', default=3,
                        metavar='X', type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s', '--source', dest='source_filename', type=str,
                        metavar='filename', help='Source text filename',
                        required=True)
    parser.add_argument('-t', '--target', dest='target_filename', type=str,
                        metavar='filename', help='Target text filename',
                        required=True)
    parser.add_argument(
        '-f', '--forward-links', dest='links_filename_fwd', type=str,
        metavar='filename',
        help='Filename to write forward direction alignments to')
    parser.add_argument(
        '-r', '--reverse-links', dest='links_filename_rev', type=str,
        metavar='filename',
        help='Filename to write reverse direction alignments to')
    args = parser.parse_args()

    for filename in (args.source_filename, args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    for filename in (args.links_filename_fwd, args.links_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' %
                  filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...'
              % args.source_filename,
              file=sys.stderr, flush=True)
    with open(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr, flush=True)
    with open(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr, flush=True)
        sys.exit(1)

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr, flush=True)

    align(srcf.name, trgf.name,
          links_filename_fwd=args.links_filename_fwd,
          links_filename_rev=args.links_filename_rev,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()
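# Example invocation of this simplified CLI (a sketch; the script and file
# names are placeholders):
#
#     python align.py -s source.txt -t target.txt \
#         -f forward.links -r reverse.links -v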
import io
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def do_align(f_name, rev_f_name, seed=None):
    # rev_f_name and seed are currently unused; rev_f_name is only needed
    # by the commented-out separate-file code path below.
    print('Reading source/target sentences from %s...' % f_name,
          file=sys.stderr, flush=True)
    with open(f_name, 'r', encoding='utf-8') as f:
        src_sents_text = []
        trg_sents_text = []
        for i, line in enumerate(f):
            fields = line.strip().split(' ||| ')
            if len(fields) != 2:
                print('ERROR: line %d of %s does not contain a single |||'
                      ' separator, or sentence(s) are empty!'
                      % (i + 1, f_name),
                      file=sys.stderr, flush=True)
                sys.exit(1)
            src_sents_text.append(fields[0])
            trg_sents_text.append(fields[1])
        src_text = '\n'.join(src_sents_text) + '\n'
        trg_text = '\n'.join(trg_sents_text) + '\n'
        src_sents_text = None
        trg_sents_text = None

    source_prefix_len = 0
    source_suffix_len = 0
    target_prefix_len = 0
    target_suffix_len = 0

    with io.StringIO(src_text) as f:
        src_sents, src_index = read_text(
            f, True, source_prefix_len, source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None
        src_text = None

    with io.StringIO(trg_text) as f:
        trg_sents, trg_index = read_text(
            f, True, target_prefix_len, target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None
        trg_text = None

    # Alternative code path reading source and target from separate files,
    # kept for reference:
    """
    print("source")
    with open(f_name, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, True, 0, 0)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None
    print("target")
    with open(rev_f_name, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, True, 0, 0)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None
    """

    fwd_links_file = NamedTemporaryFile('r+')
    rev_links_file = NamedTemporaryFile('r+')
    stat_file = NamedTemporaryFile('r+')
    print("start align")
    align(srcf.name, trgf.name,
          statistics_filename=stat_file.name,
          quiet=False,
          links_filename_fwd=fwd_links_file.name,
          links_filename_rev=rev_links_file.name)
    # Not using stat_file at the moment
    result = fwd_links_file.readlines()
    rev_result = rev_links_file.readlines()
    fwd_links_file.close()
    rev_links_file.close()
    stat_file.close()
    srcf.close()
    trgf.close()
    """
    if discretize:
        ibm_print(aaa, reverse, output.fileno())
    else:
        # Not used at the moment, but keeping this for the future
        with open(output_prob, 'wb') as f:
            pickle.dump(aaa, f, -1)
    output.seek(0)
    result = []
    for line in output:
        result.append(line.decode('ascii').strip())
    """
    return result, rev_result
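# A minimal sketch of calling do_align() on a fast_align-style ||| file.
# The corpus line is a placeholder; rev_f_name is only used by the
# commented-out code path, so None is passed here.
def _example_do_align():
    with NamedTemporaryFile('w+', encoding='utf-8') as f:
        f.write('the dog barks ||| le chien aboie\n')
        f.flush()
        fwd, rev = do_align(f.name, None)
        print(fwd, rev)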
import argparse
import os
import sys
from tempfile import NamedTemporaryFile

from xopen import xopen

from eflomal import read_text, write_text, align

# Helpers used below (Logger, preprocess, make_voc, compute_counts_fwd,
# compute_counts_rev, compute_p, IBM1, save_p) are assumed to be defined
# elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description='mkmodel.py: compute IBM-1 translation probabilities '
        'using eflomal, the efficient low-memory aligner')
    parser.add_argument('-v', '--verbose', dest='verbose', action='count',
                        default=0, help='Enable verbose output')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--no-lowercase', dest='lowercase',
                        action='store_false', default=True,
                        help='Do not lowercase input text')
    parser.add_argument('--overwrite', dest='overwrite', action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior', dest='null_prior', default=0.2,
                        metavar='X', type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m', '--model', dest='model', default=3, metavar='N', type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix', dest='source_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix', dest='source_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix', dest='target_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix', dest='target_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l', '--length', dest='length', default=1.0,
                        metavar='X', type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1', '--ibm1-iters', dest='iters1', default=None,
                        metavar='X', type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2', '--hmm-iters', dest='iters2', default=None,
                        metavar='X', type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3', '--fert-iters', dest='iters3', default=None,
        metavar='X', type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers', dest='n_samplers', default=3,
                        metavar='X', type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s', '--source', dest='source_filename', type=str,
                        metavar='filename', help='Source text filename',
                        required=True)
    parser.add_argument('-t', '--target', dest='target_filename', type=str,
                        metavar='filename', help='Target text filename',
                        required=True)
    parser.add_argument(
        '-f', '--forward-probabilities', dest='p_filename_fwd', type=str,
        metavar='filename',
        help='Filename to write forward direction probabilities to, '
        'as pickle dump')
    parser.add_argument(
        '-r', '--reverse-probabilities', dest='p_filename_rev', type=str,
        metavar='filename',
        help='Filename to write reverse direction probabilities to, '
        'as pickle dump')
    parser.add_argument(
        '-F', '--forward-probabilities-human', dest='p_filename_fwd_h',
        type=str, metavar='filename',
        help='Filename to write forward direction probabilities to, '
        'as human readable dump')
    parser.add_argument(
        '-R', '--reverse-probabilities-human', dest='p_filename_rev_h',
        type=str, metavar='filename',
        help='Filename to write reverse direction probabilities to, '
        'as human readable dump')
    args = parser.parse_args()

    logger = Logger(args.verbose)

    if args.p_filename_fwd is None and args.p_filename_rev is None:
        print('ERROR: no file to save probabilities (-f/-r), will do '
              'nothing.',
              file=sys.stderr, flush=True)
        sys.exit(1)

    for filename in (args.source_filename,
                     args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    for filename in (args.p_filename_fwd, args.p_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' %
                  filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...' % args.source_filename,
              file=sys.stderr, flush=True)
    with xopen(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, args.lowercase,
                                         args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr, flush=True)
    with xopen(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, args.lowercase,
                                         args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr, flush=True)
        sys.exit(1)

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr, flush=True)

    fwd_alignment_file = NamedTemporaryFile('w')
    rev_alignment_file = NamedTemporaryFile('w')
    align(srcf.name, trgf.name,
          links_filename_fwd=fwd_alignment_file.name,
          links_filename_rev=rev_alignment_file.name,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)
    srcf.close()
    trgf.close()

    # split and, if requested, lowercase tokens
    logger.info("Preprocessing sentences for probability estimation...")
    with xopen(args.source_filename, 'r', encoding='utf-8') as fsrc, \
            xopen(args.target_filename, 'r', encoding='utf-8') as ftgt:
        src_sents = preprocess(fsrc.readlines(), args.lowercase)
        trg_sents = preprocess(ftgt.readlines(), args.lowercase)

    # extract token --> index hash table
    logger.info("Extracting vocabulary...")
    voc_s = make_voc(src_sents)
    voc_t = make_voc(trg_sents)

    if args.p_filename_fwd is not None:
        logger.info("Estimating forward counts...")
        counts, s_counts = compute_counts_fwd(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              fwd_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating forward probabilities...")
        p = compute_p(voc_s, voc_t, counts, s_counts)
        logger.info("Saving forward probabilities...")
        model = IBM1(p, voc_s, voc_t)
        save_p(model, args.p_filename_fwd)
        if args.p_filename_fwd_h is not None:
            with xopen(args.p_filename_fwd_h, "w") as f:
                model.dump(f)

    if args.p_filename_rev is not None:
        logger.info("Estimating reverse counts...")
        counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              rev_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating reverse probabilities...")
        p = compute_p(voc_t, voc_s, counts, t_counts)
        logger.info("Saving reverse probabilities...")
        model = IBM1(p, voc_t, voc_s)
        save_p(model, args.p_filename_rev)
        if args.p_filename_rev_h is not None:
            with xopen(args.p_filename_rev_h, "w") as f:
                model.dump(f)

    fwd_alignment_file.close()
    rev_alignment_file.close()
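# Example invocation of mkmodel.py (a sketch; the file names are
# placeholders):
#
#     python mkmodel.py -s source.txt -t target.txt \
#         -f fwd.model.pickle -r rev.model.pickle \
#         -F fwd.model.txt -R rev.model.txt -v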