Code example #1
File: mkmodel.py  Project: sylvainraybaud/maceflomal
from numpy import zeros
from progressbar import Bar, Percentage, ProgressBar
from scipy.sparse import lil_matrix
from xopen import xopen


def compute_counts_rev(voc_s, voc_t, src_sents, trg_sents, alignment_filename,
                       lowercase):
    # Note: `lowercase` is unused here; sentences are already lowercased by
    # preprocess() before this function is called (see main() below).

    # Sparse target-by-source co-occurrence counts plus per-target totals.
    counts = lil_matrix((len(voc_t), len(voc_s)))
    t_counts = zeros(len(voc_t))

    with xopen(alignment_filename, "r") as afile:
        pbar = ProgressBar(widgets=[Percentage(), Bar()],
                           maxval=len(src_sents)).start()
        i = 0
        s = src_sents[i]
        t = trg_sents[i]
        aline = afile.readline()
        while aline != "":
            a = [(int(x), int(y))
                 for x, y in [apair.split("-") for apair in aline.split()]]

            for s_i, t_i in a:
                token_s = s[s_i]
                token_t = t[t_i]
                token_s_id = voc_s[token_s]
                token_t_id = voc_t[token_t]
                counts[token_t_id, token_s_id] += 1
                t_counts[token_t_id] += 1

            i += 1
            pbar.update(i)
            if i < len(src_sents):
                s = src_sents[i]
                t = trg_sents[i]
            aline = afile.readline()

    pbar.finish()

    return counts, t_counts
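main() in code example #3 also calls compute_counts_fwd, which this page does not show. Judging from its call site (its per-source totals are passed on as compute_p(voc_s, voc_t, counts, s_counts)), it is presumably the mirror image of compute_counts_rev, with the count matrix indexed source-first. A minimal sketch under that assumption, with the progress bar omitted:

def compute_counts_fwd(voc_s, voc_t, src_sents, trg_sents, alignment_filename,
                       lowercase):
    # Assumed mirror of compute_counts_rev: source-by-target co-occurrence
    # counts plus per-source link totals, read from the forward alignments.
    counts = lil_matrix((len(voc_s), len(voc_t)))
    s_counts = zeros(len(voc_s))
    with xopen(alignment_filename, "r") as afile:
        for i, aline in enumerate(afile):
            s, t = src_sents[i], trg_sents[i]
            for apair in aline.split():
                s_i, t_i = (int(x) for x in apair.split("-"))
                counts[voc_s[s[s_i]], voc_t[t[t_i]]] += 1
                s_counts[voc_s[s[s_i]]] += 1
    return counts, s_counts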
Code example #2
File: mkmodel.py  Project: sylvainraybaud/maceflomal
def save_p(model, fname):
    # Pickle the model to fname; xopen adds gzip compression for ".gz" names.
    with xopen(fname, "wb") as f:
        pickle.dump(model, f)
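mkmodel.py only writes models; the matching loader is a one-liner. The name load_p is an assumption for illustration (code example #4 below loads pickled models the same way):

def load_p(fname):
    # Counterpart to save_p: read a model pickled to (possibly gzipped) fname.
    with xopen(fname, "rb") as f:
        return pickle.load(f)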
Code example #3
File: mkmodel.py  Project: sylvainraybaud/maceflomal
import argparse
import os
import sys
from tempfile import NamedTemporaryFile

from eflomal import align, read_text, write_text

# preprocess, make_voc, compute_p, compute_counts_fwd, IBM1 and Logger are
# defined elsewhere in mkmodel.py / this project (not shown on this page).


def main():
    parser = argparse.ArgumentParser(
        description=
        'mkmodel.py: compute IBM-1 translation probabilities using eflomal, the efficient low-memory aligner'
    )
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action="count",
                        default=0,
                        help='Enable verbose output')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--no-lowercase',
                        dest='lowercase',
                        action='store_false',
                        default=True,
                        help='Do not lowercase input text')
    parser.add_argument('--overwrite',
                        dest='overwrite',
                        action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior',
                        dest='null_prior',
                        default=0.2,
                        metavar='X',
                        type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m',
        '--model',
        dest='model',
        default=3,
        metavar='N',
        type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix',
                        dest='source_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix',
                        dest='source_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix',
                        dest='target_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix',
                        dest='target_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l',
                        '--length',
                        dest='length',
                        default=1.0,
                        metavar='X',
                        type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1',
                        '--ibm1-iters',
                        dest='iters1',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2',
                        '--hmm-iters',
                        dest='iters2',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3',
        '--fert-iters',
        dest='iters3',
        default=None,
        metavar='X',
        type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers',
                        dest='n_samplers',
                        default=3,
                        metavar='X',
                        type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s',
                        '--source',
                        dest='source_filename',
                        type=str,
                        metavar='filename',
                        help='Source text filename',
                        required=True)
    parser.add_argument('-t',
                        '--target',
                        dest='target_filename',
                        type=str,
                        metavar='filename',
                        help='Target text filename',
                        required=True)
    parser.add_argument(
        '-f',
        '--forward-probabilities',
        dest='p_filename_fwd',
        type=str,
        metavar='filename',
        help=
        'Filename to write forward direction probabilities to, as pickle dump')
    parser.add_argument(
        '-r',
        '--reverse-probabilities',
        dest='p_filename_rev',
        type=str,
        metavar='filename',
        help=
        'Filename to write reverse direction probabilities to, as pickle dump')
    parser.add_argument(
        '-F',
        '--forward-probabilities-human',
        dest='p_filename_fwd_h',
        type=str,
        metavar='filename',
        help=
        'Filename to write forward direction probabilities to, as human readable dump'
    )
    parser.add_argument(
        '-R',
        '--reverse-probabilities-human',
        dest='p_filename_rev_h',
        type=str,
        metavar='filename',
        help=
        'Filename to write reverse direction probabilities to, as human readable dump'
    )

    args = parser.parse_args()

    logger = Logger(args.verbose)

    if args.p_filename_fwd is None and args.p_filename_rev is None:
        print('ERROR: no file to save probabilities (-f/-r), will do nothing.',
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    for filename in (args.source_filename, args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr,
                  flush=True)
            sys.exit(1)

    for filename in (args.p_filename_fwd, args.p_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' % \
                    filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...' % args.source_filename,
              file=sys.stderr,
              flush=True)
    with xopen(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, args.lowercase,
                                         args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr,
              flush=True)
    with xopen(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, args.lowercase,
                                         args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    # If any explicit per-model iteration count is missing, discard them all
    # and let eflomal derive the counts from --length (rel_iterations).
    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr,
              flush=True)

    fwd_alignment_file = NamedTemporaryFile('w')
    rev_alignment_file = NamedTemporaryFile('w')

    align(srcf.name,
          trgf.name,
          links_filename_fwd=fwd_alignment_file.name,
          links_filename_rev=rev_alignment_file.name,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()

    # split and, if requested, lowercase tokens
    logger.info("Preprocessing sentences for probability estimation...")
    with xopen(args.source_filename, 'r',
               encoding='utf-8') as fsrc, xopen(args.target_filename,
                                                'r',
                                                encoding='utf-8') as ftgt:
        src_sents = preprocess(fsrc.readlines(), args.lowercase)
        trg_sents = preprocess(ftgt.readlines(), args.lowercase)

    # extract token --> index hash table
    logger.info("Extracting vocabulary...")
    voc_s = make_voc(src_sents)
    voc_t = make_voc(trg_sents)

    if args.p_filename_fwd is not None:
        logger.info("Estimating forward counts...")
        counts, s_counts = compute_counts_fwd(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              fwd_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating forward probabilities...")
        p = compute_p(voc_s, voc_t, counts, s_counts)
        logger.info("Saving forward probabilities...")
        model = IBM1(p, voc_s, voc_t)
        save_p(model, args.p_filename_fwd)
        if args.p_filename_fwd_h is not None:
            with xopen(args.p_filename_fwd_h, "w") as f:
                model.dump(f)

    if args.p_filename_rev is not None:
        logger.info("Estimating reverse counts...")
        counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              rev_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating reverse probabilities...")
        p = compute_p(voc_t, voc_s, counts, t_counts)
        logger.info("Saving reverse probabilities...")
        model = IBM1(p, voc_t, voc_s)
        save_p(model, args.p_filename_rev)
        if args.p_filename_rev_h is not None:
            with xopen(args.p_filename_rev_h, "w") as f:
                model.dump(f)

    fwd_alignment_file.close()
    rev_alignment_file.close()
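A typical invocation (all filenames here are illustrative, not taken from the project) might be:

python mkmodel.py -s corpus.src -t corpus.trg \
    -f model.fwd.pickle.gz -r model.rev.pickle.gz -v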
Code example #4
import argparse
import pickle

from xopen import xopen

# header, blue, green, etc. are this project's ANSI colour helpers; Logger is
# assumed to come from the same project (its module is not shown here).
from convenience import header, blue, green, yellow, orange, red, bold, underline

if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("input", help="Input model file. Pickle or text, gzipped or not. Automatically processed extensions are .pickle, .pickle.gz, .gz. Required.")
    parser.add_argument("--delimiter", "-d", type=str, dest="delimiter", default="\t",
                        help="Delimiter used in model file, if in text format. Use plain string between quotes. Default=<tab>.")
    parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase verbosity")
    args = parser.parse_args()
    logger = Logger(args.verbosity)
    
    logger.info("Loading model")
    
    if args.input.endswith(".pickle.gz") or args.input.endswith(".pickle"):
        logger.debug("Pickle detected")
        with xopen(args.input, "rb") as f:
            model = pickle.load(f)
        print(blue("Source vocabulary size:\t"+bold(str(len(model.voc_s)))))
        print(blue("Target vocabulary size:\t"+bold(str(len(model.voc_t)))))
        print(green("Number of entries:\t"+bold(str(model.p.count_nonzero()))))
    
    else:
        logger.debug("Text format detected")
        with xopen(args.input, "r") as f:
            n_entries = 0
            voc_s = set()
            voc_t = set()
            for line in f:
                entry = line.split(args.delimiter, maxsplit=2)
                voc_s.add(entry[0])
                voc_t.add(entry[1])
                n_entries += 1
        # Mirror the summary printed for pickle input above.
        print(blue("Source vocabulary size:\t"+bold(str(len(voc_s)))))
        print(blue("Target vocabulary size:\t"+bold(str(len(voc_t)))))
        print(green("Number of entries:\t"+bold(str(n_entries))))
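This listing does not show the script's own filename; assuming it is saved as inspect_model.py (a hypothetical name), it would be invoked along these lines:

python inspect_model.py model.pickle.gz
python inspect_model.py model.txt -d "|" -v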