def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename,
                       lowercase):
    # Sparse (target x source) count matrix; lil_matrix is cheap to fill
    # incrementally. t_counts holds the marginal count per target token.
    # (The lowercase flag is unused here: sentences are already preprocessed
    # by the caller.)
    counts = lil_matrix((len(voc_t), len(voc_s)))
    t_counts = zeros(len(voc_t))
    with xopen(alignment_filename, "r") as afile:
        pbar = ProgressBar(widgets=[Percentage(), Bar()],
                           maxval=len(src_sents)).start()
        i = 0
        s = src_sents[i]
        t = trg_sents[i]
        aline = afile.readline()
        while aline != "":
            # One line of Pharaoh-format links: "srcidx-trgidx srcidx-trgidx ..."
            a = [(int(x), int(y))
                 for x, y in [apair.split("-") for apair in aline.split()]]
            for s_i, t_i in a:
                token_s = s[s_i]
                token_t = t[t_i]
                token_s_id = voc_s[token_s]
                token_t_id = voc_t[token_t]
                counts[token_t_id, token_s_id] += 1
                t_counts[token_t_id] += 1
            i += 1
            pbar.update(i)
            if i < len(src_sents):
                s = src_sents[i]
                t = trg_sents[i]
            aline = afile.readline()
        pbar.finish()
    return counts, t_counts
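# Illustrative only (hypothetical helper, not called by the pipeline): the
# alignment files written by eflomal use the Pharaoh "i-j" link format, one
# sentence pair per line, which the parsing loop above consumes. A minimal,
# self-contained parse of one such line:
def _demo_parse_pharaoh_line(aline="0-0 1-2 2-1"):
    """Return [(0, 0), (1, 2), (2, 1)] for the default input."""
    return [(int(x), int(y))
            for x, y in (apair.split("-") for apair in aline.split())]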
def save_p(model, fname):
    """Serialize a model to fname as a pickle dump."""
    with xopen(fname, "wb") as f:
        pickle.dump(model, f)
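# This script only saves models; loading is the matching pickle round-trip.
# Minimal sketch (load_p is a hypothetical name; the project may already
# provide an equivalent helper elsewhere):
def load_p(fname):
    """Load a model previously written by save_p()."""
    with xopen(fname, "rb") as f:
        return pickle.load(f)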
def main():
    parser = argparse.ArgumentParser(
        description='mkmodel.py: compute IBM-1 translation probabilities '
                    'using eflomal, the efficient low-memory aligner')
    parser.add_argument('-v', '--verbose', dest='verbose', action='count',
                        default=0, help='Enable verbose output')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--no-lowercase', dest='lowercase',
                        action='store_false', default=True,
                        help='Do not lowercase input text')
    parser.add_argument('--overwrite', dest='overwrite', action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior', dest='null_prior', default=0.2,
                        metavar='X', type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument('-m', '--model', dest='model', default=3,
                        metavar='N', type=int,
                        help='Model (1 = IBM1, 2 = IBM1+HMM, '
                             '3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix', dest='source_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix', dest='source_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix', dest='target_prefix_len',
                        default=0, metavar='N', type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix', dest='target_suffix_len',
                        default=0, metavar='N', type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l', '--length', dest='length', default=1.0,
                        metavar='X', type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1', '--ibm1-iters', dest='iters1', default=None,
                        metavar='X', type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2', '--hmm-iters', dest='iters2', default=None,
                        metavar='X', type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument('-3', '--fert-iters', dest='iters3', default=None,
                        metavar='X', type=int,
                        help='Number of HMM+fertility iterations '
                             '(overrides --length)')
    parser.add_argument('--n-samplers', dest='n_samplers', default=3,
                        metavar='X', type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s', '--source', dest='source_filename', type=str,
                        metavar='filename', help='Source text filename',
                        required=True)
    parser.add_argument('-t', '--target', dest='target_filename', type=str,
                        metavar='filename', help='Target text filename',
                        required=True)
    parser.add_argument('-f', '--forward-probabilities',
                        dest='p_filename_fwd', type=str, metavar='filename',
                        help='Filename to write forward direction '
                             'probabilities to, as pickle dump')
    parser.add_argument('-r', '--reverse-probabilities',
                        dest='p_filename_rev', type=str, metavar='filename',
                        help='Filename to write reverse direction '
                             'probabilities to, as pickle dump')
    parser.add_argument('-F', '--forward-probabilities-human',
                        dest='p_filename_fwd_h', type=str, metavar='filename',
                        help='Filename to write forward direction '
                             'probabilities to, as human readable dump')
    parser.add_argument('-R', '--reverse-probabilities-human',
                        dest='p_filename_rev_h', type=str, metavar='filename',
                        help='Filename to write reverse direction '
                             'probabilities to, as human readable dump')
    args = parser.parse_args()
    logger = Logger(args.verbose)

    if args.p_filename_fwd is None and args.p_filename_rev is None:
        print('ERROR: no file to save probabilities (-f/-r), will do nothing.',
              file=sys.stderr, flush=True)
        sys.exit(1)

    for filename in (args.source_filename, args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    for filename in (args.p_filename_fwd, args.p_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' %
                  filename, file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...' % args.source_filename,
              file=sys.stderr, flush=True)
    with xopen(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, args.lowercase,
                                         args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr, flush=True)
    with xopen(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, args.lowercase,
                                         args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differs in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents), file=sys.stderr, flush=True)
        sys.exit(1)

    # Explicit iteration counts are only honoured if they are given for
    # every submodel up to the requested one; otherwise fall back to
    # --length-based scaling.
    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr, flush=True)
    fwd_alignment_file = NamedTemporaryFile('w')
    rev_alignment_file = NamedTemporaryFile('w')
    align(srcf.name, trgf.name,
          links_filename_fwd=fwd_alignment_file.name,
          links_filename_rev=rev_alignment_file.name,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)
    srcf.close()
    trgf.close()

    # Split and, if requested, lowercase tokens.
    logger.info("Preprocessing sentences for probability estimation...")
    with xopen(args.source_filename, 'r', encoding='utf-8') as fsrc, \
            xopen(args.target_filename, 'r', encoding='utf-8') as ftgt:
        src_sents = preprocess(fsrc.readlines(), args.lowercase)
        trg_sents = preprocess(ftgt.readlines(), args.lowercase)

    # Extract token --> index hash tables.
    logger.info("Extracting vocabulary...")
    voc_s = make_voc(src_sents)
    voc_t = make_voc(trg_sents)

    if args.p_filename_fwd is not None:
        logger.info("Estimating forward counts...")
        counts, s_counts = compute_counts_fwd(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              fwd_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating forward probabilities...")
        p = compute_p(voc_s, voc_t, counts, s_counts)
        logger.info("Saving forward probabilities...")
        model = IBM1(p, voc_s, voc_t)
        save_p(model, args.p_filename_fwd)
        if args.p_filename_fwd_h is not None:
            with xopen(args.p_filename_fwd_h, "w") as f:
                model.dump(f)

    if args.p_filename_rev is not None:
        logger.info("Estimating reverse counts...")
        counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              rev_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating reverse probabilities...")
        p = compute_p(voc_t, voc_s, counts, t_counts)
        logger.info("Saving reverse probabilities...")
        model = IBM1(p, voc_t, voc_s)
        save_p(model, args.p_filename_rev)
        if args.p_filename_rev_h is not None:
            with xopen(args.p_filename_rev_h, "w") as f:
                model.dump(f)

    fwd_alignment_file.close()
    rev_alignment_file.close()


if __name__ == '__main__':
    main()
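# Example invocation (illustrative; the file names are placeholders):
#
#   python mkmodel.py -s corpus.src -t corpus.trg \
#       -f model.fwd.pickle -r model.rev.pickle \
#       -F model.fwd.txt -R model.rev.txt -v
#
# This aligns the sentence-parallel input files with eflomal and writes
# IBM-1 lexical probabilities for both directions, pickled (-f/-r) and,
# optionally, as human-readable dumps (-F/-R).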
import argparse
import pickle

# NOTE: xopen (transparent gzip-aware open) and Logger are assumed to be
# provided by the project's shared helper modules, as in mkmodel.py above.
from convenience import header, blue, green, yellow, orange, red, bold, \
    underline

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "input",
        help="Input model file. Pickle or text, gzipped or not. "
             "Automatically processed extensions are .pickle, .pickle.gz, "
             ".gz. Required.")
    parser.add_argument(
        "--delimiter", "-d", type=str, dest="delimiter", default="\t",
        help="Delimiter used in model file, if in text format. Use plain "
             "string between quotes. Default=<tab>.")
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="increase verbosity")
    args = parser.parse_args()
    logger = Logger(args.verbosity)

    logger.info("Loading model")
    if args.input.endswith(".pickle.gz") or args.input.endswith(".pickle"):
        logger.debug("Pickle detected")
        with xopen(args.input, "rb") as f:
            model = pickle.load(f)
        print(blue("Source vocabulary size:\t" + bold(str(len(model.voc_s)))))
        print(blue("Target vocabulary size:\t" + bold(str(len(model.voc_t)))))
        print(green("Number of entries:\t" +
                    bold(str(model.p.count_nonzero()))))
    else:
        logger.debug("Text format detected")
        with xopen(args.input, "r") as f:
            n_entries = 0
            voc_s = set()
            voc_t = set()
            for line in f:
                entry = line.split(args.delimiter, maxsplit=2)
                voc_s.add(entry[0])
                voc_t.add(entry[1])
                n_entries += 1
            print(blue("Source vocabulary size:\t" + bold(str(len(voc_s)))))
            print(blue("Target vocabulary size:\t" + bold(str(len(voc_t)))))
            print(green("Number of entries:\t" + bold(str(n_entries))))
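# For reference, the text format consumed by the branch above is expected to
# be one entry per line, delimiter-separated (configurable via -d), e.g.:
#
#   source_token<TAB>target_token<TAB>probability
#
# This presumably mirrors what IBM1.dump() writes in mkmodel.py; only the
# first two fields are used here, and the probability field is ignored.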