Example #1
import math
import subprocess
from tempfile import NamedTemporaryFile
from typing import Dict, List, Tuple

import numpy
import eflomal


def tokenize(an_file, tok_file, full_tags):
    # Convert Apertium-style analyses to integer ids and write them in
    # eflomal's binary input format.  Returns the number of sentences.
    tok_to_id = {}
    id_to_tok = []
    sents = []
    an_file.seek(0)
    for line in an_file:
        cur = []
        # Apertium stream units look like '^analysis$'; anything without
        # '^' is inter-unit material and is skipped.
        for word in line.split('$'):
            if '^' not in word:
                continue
            an = word.split('^')[-1].split('/')[0]
            # If the first tag is not a known full tag, keep only the lemma
            # plus its first tag.
            if '<' in an and an.split('<')[1][:-1] not in full_tags:
                an = an.split('>')[0] + '>'
            tid = tok_to_id.get(an)
            if tid is None:
                # New analysis: assign the next free id.
                tid = len(id_to_tok)
                tok_to_id[an] = tid
                id_to_tok.append(an)
            cur.append(tid)
        sents.append(numpy.asarray(cur, dtype=numpy.uint32))
    #tok_file.write('%d %d\n' % (len(sents), len(id_to_tok)))
    #for sn in sents:
    #    if len(sn) > 0x400:
    #        tok_file.write('0\n')
    #    else:
    #        tok_file.write(' '.join(str(n) for n in sn) + '\n')
    # write_text is part of eflomal's Python bindings; it writes the id
    # sequences in the aligner's binary input format.
    eflomal.write_text(tok_file, tuple(sents), len(id_to_tok))
    return len(sents)
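
# A minimal illustration of the expected input, assuming tagger-style
# Apertium stream lines and that both first tags are in full_tags
# (hypothetical data):
#
#   ^dog<n><sg>$ ^bark<vblex><pres>$
#
# tokenize() would assign ids 0 and 1 to the two analyses and write the
# sentence [0, 1] to tok_file.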
def run_eflomal(
        sents: List[Tuple[List[int], List[int]]]) -> List[Dict[int, int]]:
    # Dump each side in eflomal's binary format; the vocabulary size is one
    # more than the largest id in use.
    sl_nums = NamedTemporaryFile('wb+')
    sl = tuple([numpy.asarray(x[0], dtype=numpy.uint32) for x in sents])
    eflomal.write_text(sl_nums, sl, 1 + max(map(lambda x: max(x[0]), sents)))
    tl_nums = NamedTemporaryFile('wb+')
    tl = tuple([numpy.asarray(x[1], dtype=numpy.uint32) for x in sents])
    eflomal.write_text(tl_nums, tl, 1 + max(map(lambda x: max(x[1]), sents)))

    # These mirror the default arguments of the eflomal Python interface:
    # model 3 (IBM1+HMM+fertility), one sampler, and a NULL prior of 0.2,
    # with iteration counts scaled by 1/sqrt(corpus size) below.
    defaults = ['-m', '3', '-n', '1', '-N', '0.2']
    iters = max(2, int(round(5000.0 / math.sqrt(len(sents)))))
    iters4 = max(1, iters // 4)
    defaults += [
        '-1', str(max(2, iters4)),
        '-2', str(iters4),
        '-3', str(iters),
    ]

    #align = NamedTemporaryFile('w+')
    # A regular file is used instead of the temp file above, so the
    # alignments stay on disk after the run.
    align = open('jam-alignments.txt', 'w+')
    subprocess.run([
        'eflomal', '-s', sl_nums.name, '-t', tl_nums.name, '-f', align.name,
        '-q'
    ] + defaults, check=True)  # fail loudly if the aligner exits non-zero
    align.seek(0)
    ret = []
    for line in align:
        dct = {}
        # Each link is a Moses-format 'src-trg' index pair.
        for nums in line.split():
            s, t = nums.split('-')
            dct[int(s)] = int(t)
        ret.append(dct)
    align.close()
    return ret
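
# A minimal usage sketch (hypothetical data): sentence pairs are parallel
# lists of token ids, and each returned dict maps a source position to a
# target position, e.g.:
#
#   pairs = [([0, 1], [1, 0]), ([2], [2])]
#   links = run_eflomal(pairs)   # links[0] might be {0: 1, 1: 0}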
Example #3
import argparse
import io
import os
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def main():
    parser = argparse.ArgumentParser(
        description='eflomal: efficient low-memory aligner')
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='Enable verbose output')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--overwrite',
                        dest='overwrite',
                        action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior',
                        dest='null_prior',
                        default=0.2,
                        metavar='X',
                        type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m',
        '--model',
        dest='model',
        default=3,
        metavar='N',
        type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('-M',
                        '--score-model',
                        dest='score_model',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Model used for sentence scoring '
                        '(1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix',
                        dest='source_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix',
                        dest='source_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix',
                        dest='target_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix',
                        dest='target_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l',
                        '--length',
                        dest='length',
                        default=1.0,
                        metavar='X',
                        type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1',
                        '--ibm1-iters',
                        dest='iters1',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2',
                        '--hmm-iters',
                        dest='iters2',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3',
        '--fert-iters',
        dest='iters3',
        default=None,
        metavar='X',
        type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers',
                        dest='n_samplers',
                        default=3,
                        metavar='X',
                        type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s',
                        '--source',
                        dest='source_filename',
                        type=str,
                        metavar='filename',
                        help='Source text filename')
    parser.add_argument('-t',
                        '--target',
                        dest='target_filename',
                        type=str,
                        metavar='filename',
                        help='Target text filename')
    parser.add_argument('-i',
                        '--input',
                        dest='joint_filename',
                        type=str,
                        metavar='filename',
                        help='fast_align style ||| separated file')
    parser.add_argument(
        '-f',
        '--forward-links',
        dest='links_filename_fwd',
        type=str,
        metavar='filename',
        help='Filename to write forward direction alignments to')
    parser.add_argument(
        '-r',
        '--reverse-links',
        dest='links_filename_rev',
        type=str,
        metavar='filename',
        help='Filename to write reverse direction alignments to')
    parser.add_argument(
        '-F',
        '--forward-scores',
        dest='scores_filename_fwd',
        type=str,
        metavar='filename',
        help='Filename to write alignment scores to (generation '
        'probability of target sentences)')
    parser.add_argument(
        '-R',
        '--reverse-scores',
        dest='scores_filename_rev',
        type=str,
        metavar='filename',
        help='Filename to write alignment scores to (generation '
        'probability of source sentences)')
    parser.add_argument('-p',
                        '--priors',
                        dest='priors_filename',
                        type=str,
                        metavar='filename',
                        help='File to read priors from')

    args = parser.parse_args()

    if not (args.joint_filename or
            (args.source_filename and args.target_filename)):
        print('ERROR: need to specify either -s and -t, or -i',
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    for filename in ((args.joint_filename, ) if args.joint_filename else
                     (args.source_filename, args.target_filename)):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr,
                  flush=True)
            sys.exit(1)

    for filename in (args.links_filename_fwd, args.links_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' % \
                    filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.priors_filename:
        if args.verbose:
            print('Reading lexical priors from %s...' % args.priors_filename,
                  file=sys.stderr,
                  flush=True)

        priors_list = []  # list of (srcword, trgword, alpha)
        ferf_priors = []  # list of (wordform, alpha)
        ferr_priors = []  # list of (wordform, alpha)
        hmmf_priors = {}  # dict of jump: alpha
        hmmr_priors = {}  # dict of jump: alpha
        with open(args.priors_filename, 'r', encoding='utf-8') as f:
            # 5 types of lines valid:
            #
            # LEX   srcword     trgword     alpha   | lexical prior
            # HMMF  jump        alpha               | target-side HMM prior
            # HMMR  jump        alpha               | source-side HMM prior
            # FERF  srcword     fert   alpha        | source-side fertility p.
            # FERR  trgword     fert    alpha       | target-side fertility p.
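            #
            # For illustration (hypothetical, tab-separated values):
            #
            # LEX   house   haus    10.0
            # HMMF  1       2.5
            # FERF  house   2       3.0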
            for i, line in enumerate(f):
                fields = line.rstrip('\n').split('\t')
                try:
                    alpha = float(fields[-1])
                except ValueError:
                    print('ERROR: priors file %s line %d contains alpha '
                          'value of "%s" which is not numeric' %
                          (args.priors_filename, i + 1, fields[-1]),
                          file=sys.stderr,
                          flush=True)
                    sys.exit(1)

                if fields[0] == 'LEX' and len(fields) == 4:
                    priors_list.append((fields[1], fields[2], alpha))
                elif fields[0] == 'HMMF' and len(fields) == 3:
                    hmmf_priors[int(fields[1])] = alpha
                elif fields[0] == 'HMMR' and len(fields) == 3:
                    hmmr_priors[int(fields[1])] = alpha
                elif fields[0] == 'FERF' and len(fields) == 4:
                    ferf_priors.append((fields[1], int(fields[2]), alpha))
                elif fields[0] == 'FERR' and len(fields) == 4:
                    ferr_priors.append((fields[1], int(fields[2]), alpha))
                else:
                    print('ERROR: priors file %s line %d is invalid' %
                          (args.priors_filename, i + 1),
                          file=sys.stderr,
                          flush=True)
                    sys.exit(1)

    if args.joint_filename:
        if args.verbose:
            print('Reading source/target sentences from %s...' %
                  args.joint_filename,
                  file=sys.stderr,
                  flush=True)
        with open(args.joint_filename, 'r', encoding='utf-8') as f:
            src_sents_text = []
            trg_sents_text = []
            for i, line in enumerate(f):
                fields = line.strip().split(' ||| ')
                if len(fields) != 2:
                    print('ERROR: line %d of %s does not contain a single |||'
                          ' separator, or sentence(s) are empty!' %
                          (i + 1, args.joint_filename),
                          file=sys.stderr,
                          flush=True)
                    sys.exit(1)
                src_sents_text.append(fields[0])
                trg_sents_text.append(fields[1])
            src_text = '\n'.join(src_sents_text) + '\n'
            trg_text = '\n'.join(trg_sents_text) + '\n'
            src_sents_text = None
            trg_sents_text = None

        with io.StringIO(src_text) as f:
            src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                             args.source_suffix_len)
            n_src_sents = len(src_sents)
            src_voc_size = len(src_index)
            srcf = NamedTemporaryFile('wb', delete=False)
            write_text(srcf, tuple(src_sents), src_voc_size)
            src_sents = None
            src_text = None
            srcf.close()

        with io.StringIO(trg_text) as f:
            trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                             args.target_suffix_len)
            trg_voc_size = len(trg_index)
            n_trg_sents = len(trg_sents)
            trgf = NamedTemporaryFile('wb', delete=False)
            write_text(trgf, tuple(trg_sents), trg_voc_size)
            trg_sents = None
            trg_text = None
            trgf.close()

    else:
        if args.verbose:
            print('Reading source text from %s...' % args.source_filename,
                  file=sys.stderr,
                  flush=True)
        with open(args.source_filename, 'r', encoding='utf-8') as f:
            src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                             args.source_suffix_len)
            n_src_sents = len(src_sents)
            src_voc_size = len(src_index)
            srcf = NamedTemporaryFile('wb', delete=False)
            write_text(srcf, tuple(src_sents), src_voc_size)
            src_sents = None
            srcf.close()

        if args.verbose:
            print('Reading target text from %s...' % args.target_filename,
                  file=sys.stderr,
                  flush=True)
        with open(args.target_filename, 'r', encoding='utf-8') as f:
            trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                             args.target_suffix_len)
            trg_voc_size = len(trg_index)
            n_trg_sents = len(trg_sents)
            trgf = NamedTemporaryFile('wb', delete=False)
            write_text(trgf, tuple(trg_sents), trg_voc_size)
            trg_sents = None
            trgf.close()

        if n_src_sents != n_trg_sents:
            print(
                'ERROR: number of sentences differ in input files (%d vs %d)' %
                (n_src_sents, n_trg_sents),
                file=sys.stderr,
                flush=True)
            sys.exit(1)

    def get_src_index(src_word):
        src_word = src_word.lower()
        if args.source_prefix_len != 0:
            src_word = src_word[:args.source_prefix_len]
        if args.source_suffix_len != 0:
            src_word = src_word[-args.source_suffix_len:]
        e = src_index.get(src_word)
        if e is not None:
            e = e + 1
        return e

    def get_trg_index(trg_word):
        trg_word = trg_word.lower()
        if args.target_prefix_len != 0:
            trg_word = trg_word[:args.target_prefix_len]
        if args.target_suffix_len != 0:
            trg_word = trg_word[-args.target_suffix_len:]
        f = trg_index.get(trg_word)
        if f is not None:
            f = f + 1
        return f
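
    # For illustration: with target_suffix_len=3, get_trg_index('Walking')
    # lowercases to 'walking', stems it to the suffix 'ing', and returns the
    # vocabulary id plus one, since index 0 is reserved for <NULL> below.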

    if args.priors_filename:
        priors_indexed = {}
        for src_word, trg_word, alpha in priors_list:
            if src_word == '<NULL>':
                e = 0
            else:
                e = get_src_index(src_word)

            if trg_word == '<NULL>':
                f = 0
            else:
                f = get_trg_index(trg_word)

            if (e is not None) and (f is not None):
                priors_indexed[(e,f)] = priors_indexed.get((e,f), 0.0) \
                        + alpha

        ferf_indexed = {}
        for src_word, fert, alpha in ferf_priors:
            e = get_src_index(src_word)
            if e is not None:
                ferf_indexed[(e, fert)] = \
                        ferf_indexed.get((e, fert), 0.0) + alpha

        ferr_indexed = {}
        for trg_word, fert, alpha in ferr_priors:
            f = get_trg_index(trg_word)
            if f is not None:
                ferr_indexed[(f, fert)] = \
                        ferr_indexed.get((f, fert), 0.0) + alpha

        if args.verbose:
            print('%d (of %d) pairs of lexical priors used' %
                  (len(priors_indexed), len(priors_list)),
                  file=sys.stderr)
        priorsf = NamedTemporaryFile('w', encoding='utf-8', delete=False)
        print('%d %d %d %d %d %d %d' %
              (len(src_index) + 1, len(trg_index) + 1, len(priors_indexed),
               len(hmmf_priors), len(hmmr_priors), len(ferf_indexed),
               len(ferr_indexed)),
              file=priorsf)

        for (e, f), alpha in sorted(priors_indexed.items()):
            print('%d %d %g' % (e, f, alpha), file=priorsf)

        for jump, alpha in sorted(hmmf_priors.items()):
            print('%d %g' % (jump, alpha), file=priorsf)

        for jump, alpha in sorted(hmmr_priors.items()):
            print('%d %g' % (jump, alpha), file=priorsf)

        for (e, fert), alpha in sorted(ferf_indexed.items()):
            print('%d %d %g' % (e, fert, alpha), file=priorsf)

        for (f, fert), alpha in sorted(ferr_indexed.items()):
            print('%d %d %g' % (f, fert, alpha), file=priorsf)

        priorsf.flush()
        priorsf.close()

    trg_index = None
    src_index = None

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None
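    # Explicit iteration counts are only honored if every stage up to the
    # chosen model has one: e.g. with --model 3, all of -1/-2/-3 must be
    # given; otherwise the counts are derived from --length inside align().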

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr,
              flush=True)

    align(srcf.name,
          trgf.name,
          links_filename_fwd=args.links_filename_fwd,
          links_filename_rev=args.links_filename_rev,
          statistics_filename=None,
          scores_filename_fwd=args.scores_filename_fwd,
          scores_filename_rev=args.scores_filename_rev,
          priors_filename=(None
                           if args.priors_filename is None else priorsf.name),
          model=args.model,
          score_model=args.score_model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()
    if args.priors_filename:
        priorsf.close()
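
# A hypothetical invocation of this script (filenames are illustrative):
#
#   python align.py -s corpus.src -t corpus.trg -f fwd.links -r rev.links -v
#   python align.py -i joint.txt -f fwd.links --overwrite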
Example #4
import argparse
import os
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def main():
    parser = argparse.ArgumentParser(
        description='eflomal: efficient low-memory aligner')
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='Enable verbose output')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--overwrite',
                        dest='overwrite',
                        action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior',
                        dest='null_prior',
                        default=0.2,
                        metavar='X',
                        type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m',
        '--model',
        dest='model',
        default=3,
        metavar='N',
        type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix',
                        dest='source_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix',
                        dest='source_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix',
                        dest='target_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix',
                        dest='target_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l',
                        '--length',
                        dest='length',
                        default=1.0,
                        metavar='X',
                        type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1',
                        '--ibm1-iters',
                        dest='iters1',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2',
                        '--hmm-iters',
                        dest='iters2',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3',
        '--fert-iters',
        dest='iters3',
        default=None,
        metavar='X',
        type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers',
                        dest='n_samplers',
                        default=3,
                        metavar='X',
                        type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s',
                        '--source',
                        dest='source_filename',
                        type=str,
                        metavar='filename',
                        help='Source text filename',
                        required=True)
    parser.add_argument('-t',
                        '--target',
                        dest='target_filename',
                        type=str,
                        metavar='filename',
                        help='Target text filename',
                        required=True)
    parser.add_argument(
        '-f',
        '--forward-links',
        dest='links_filename_fwd',
        type=str,
        metavar='filename',
        help='Filename to write forward direction alignments to')
    parser.add_argument(
        '-r',
        '--reverse-links',
        dest='links_filename_rev',
        type=str,
        metavar='filename',
        help='Filename to write reverse direction alignments to')

    args = parser.parse_args()

    for filename in (args.source_filename, args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr,
                  flush=True)
            sys.exit(1)

    for filename in (args.links_filename_fwd, args.links_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' % \
                    filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...' % args.source_filename,
              file=sys.stderr,
              flush=True)
    with open(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, True, args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr,
              flush=True)
    with open(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, True, args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr,
              flush=True)

    align(srcf.name,
          trgf.name,
          links_filename_fwd=args.links_filename_fwd,
          links_filename_rev=args.links_filename_rev,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()
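
# A small sketch of consuming the Moses-format link files written above
# (hypothetical filename); each line holds space-separated 'i-j' pairs:
#
#   with open('fwd.links') as f:
#       links = [[tuple(map(int, p.split('-'))) for p in line.split()]
#                for line in f]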
Example #5
import io
import sys
from tempfile import NamedTemporaryFile

from eflomal import read_text, write_text, align


def do_align(f_name, rev_f_name, seed=None):
    print('Reading source/target sentences from %s...' % f_name,
          file=sys.stderr, flush=True)
    with open(f_name, 'r', encoding='utf-8') as f:
        src_sents_text = []
        trg_sents_text = []
        for i, line in enumerate(f):
            fields = line.strip().split(' ||| ')
            if len(fields) != 2:
                print('ERROR: line %d of %s does not contain a single |||'
                      ' separator, or sentence(s) are empty!' %
                      (i + 1, f_name),
                      file=sys.stderr, flush=True)
                sys.exit(1)
            src_sents_text.append(fields[0])
            trg_sents_text.append(fields[1])
        src_text = '\n'.join(src_sents_text) + '\n'
        trg_text = '\n'.join(trg_sents_text) + '\n'
        src_sents_text = None
        trg_sents_text = None

    source_prefix_len = 0
    source_suffix_len = 0

    target_prefix_len = 0
    target_suffix_len = 0

    with io.StringIO(src_text) as f:
        src_sents, src_index = read_text(
                f, True, source_prefix_len, source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None
        src_text = None

    with io.StringIO(trg_text) as f:
        trg_sents, trg_index = read_text(
                f, True, target_prefix_len, target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None
        trg_text = None
    """
    print("source")
    with open(f_name, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(
                f, True, 0, 0)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None
    
    print("target")
    with open(rev_f_name, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(
                f, True, 0, 0)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None
    """
    
    fwd_links_file = NamedTemporaryFile('r+')
    rev_links_file = NamedTemporaryFile('r+')
    stat_file = NamedTemporaryFile('r+')
    print("start align")

    align(srcf.name,
          trgf.name,
          statistics_filename=stat_file.name,
          quiet=False,
          links_filename_fwd=fwd_links_file.name,
          links_filename_rev=rev_links_file.name)
    # Not using stat_file at the moment
    result = fwd_links_file.readlines()
    rev_result = rev_links_file.readlines()

    fwd_links_file.close()
    rev_links_file.close()
    stat_file.close()
    srcf.close()
    trgf.close()

    """
    if discretize:
        ibm_print(aaa, reverse, output.fileno())
    else: # Not used at the moment, but keeping this for the future
        with open(output_prob, 'wb') as f:
            pickle.dump(aaa, f, -1)

    output.seek(0)
    result = []
    for line in output:
        result.append(line.decode('ascii').strip())
    """
    return result, rev_result
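
# A minimal usage sketch (hypothetical file): the input holds one
# 'source ||| target' pair per line; each returned list contains one
# Moses-format 'i-j' link line per sentence.
#
#   fwd, rev = do_align('parallel.txt', None)  # rev_f_name unused here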
Example #6
import argparse
import os
import sys
from tempfile import NamedTemporaryFile

from xopen import xopen
from eflomal import read_text, write_text, align

# Logger, preprocess, make_voc, compute_counts_fwd/_rev, compute_p, IBM1 and
# save_p are helpers defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description='mkmodel.py: compute IBM-1 translation probabilities '
        'using eflomal, the efficient low-memory aligner')
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action="count",
                        default=0,
                        help='Enable verbose output')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Enable gdb debugging of eflomal binary')
    parser.add_argument('--no-lowercase',
                        dest='lowercase',
                        action='store_false',
                        default=True,
                        help='Do not lowercase input text')
    parser.add_argument('--overwrite',
                        dest='overwrite',
                        action='store_true',
                        help='Overwrite existing output files')
    parser.add_argument('--null-prior',
                        dest='null_prior',
                        default=0.2,
                        metavar='X',
                        type=float,
                        help='Prior probability of NULL alignment')
    parser.add_argument(
        '-m',
        '--model',
        dest='model',
        default=3,
        metavar='N',
        type=int,
        help='Model (1 = IBM1, 2 = IBM1+HMM, 3 = IBM1+HMM+fertility)')
    parser.add_argument('--source-prefix',
                        dest='source_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (source)')
    parser.add_argument('--source-suffix',
                        dest='source_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (source)')
    parser.add_argument('--target-prefix',
                        dest='target_prefix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of prefix for stemming (target)')
    parser.add_argument('--target-suffix',
                        dest='target_suffix_len',
                        default=0,
                        metavar='N',
                        type=int,
                        help='Length of suffix for stemming (target)')
    parser.add_argument('-l',
                        '--length',
                        dest='length',
                        default=1.0,
                        metavar='X',
                        type=float,
                        help='Relative number of sampling iterations')
    parser.add_argument('-1',
                        '--ibm1-iters',
                        dest='iters1',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of IBM1 iterations (overrides --length)')
    parser.add_argument('-2',
                        '--hmm-iters',
                        dest='iters2',
                        default=None,
                        metavar='X',
                        type=int,
                        help='Number of HMM iterations (overrides --length)')
    parser.add_argument(
        '-3',
        '--fert-iters',
        dest='iters3',
        default=None,
        metavar='X',
        type=int,
        help='Number of HMM+fertility iterations (overrides --length)')
    parser.add_argument('--n-samplers',
                        dest='n_samplers',
                        default=3,
                        metavar='X',
                        type=int,
                        help='Number of independent samplers to run')
    parser.add_argument('-s',
                        '--source',
                        dest='source_filename',
                        type=str,
                        metavar='filename',
                        help='Source text filename',
                        required=True)
    parser.add_argument('-t',
                        '--target',
                        dest='target_filename',
                        type=str,
                        metavar='filename',
                        help='Target text filename',
                        required=True)
    parser.add_argument(
        '-f',
        '--forward-probabilities',
        dest='p_filename_fwd',
        type=str,
        metavar='filename',
        help=
        'Filename to write forward direction probabilities to, as pickle dump')
    parser.add_argument(
        '-r',
        '--reverse-probabilities',
        dest='p_filename_rev',
        type=str,
        metavar='filename',
        help=
        'Filename to write reverse direction probabilities to, as pickle dump')
    parser.add_argument(
        '-F',
        '--forward-probabilities-human',
        dest='p_filename_fwd_h',
        type=str,
        metavar='filename',
        help=
        'Filename to write forward direction probabilities to, as human readable dump'
    )
    parser.add_argument(
        '-R',
        '--reverse-probabilities-human',
        dest='p_filename_rev_h',
        type=str,
        metavar='filename',
        help=
        'Filename to write reverse direction probabilities to, as human readable dump'
    )

    args = parser.parse_args()

    logger = Logger(args.verbose)

    if args.p_filename_fwd is None and args.p_filename_rev is None:
        print('ERROR: no file to save probabilities (-f/-r), will do nothing.',
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    for filename in (args.source_filename, args.target_filename):
        if not os.path.exists(filename):
            print('ERROR: input file %s does not exist!' % filename,
                  file=sys.stderr,
                  flush=True)
            sys.exit(1)

    for filename in (args.p_filename_fwd, args.p_filename_rev):
        if (not args.overwrite) and (filename is not None) \
                and os.path.exists(filename):
            print('ERROR: output file %s exists, will not overwrite!' % \
                    filename,
                  file=sys.stderr, flush=True)
            sys.exit(1)

    if args.verbose:
        print('Reading source text from %s...' % args.source_filename,
              file=sys.stderr,
              flush=True)
    with xopen(args.source_filename, 'r', encoding='utf-8') as f:
        src_sents, src_index = read_text(f, args.lowercase,
                                         args.source_prefix_len,
                                         args.source_suffix_len)
        n_src_sents = len(src_sents)
        src_voc_size = len(src_index)
        src_index = None
        srcf = NamedTemporaryFile('wb')
        write_text(srcf, tuple(src_sents), src_voc_size)
        src_sents = None

    if args.verbose:
        print('Reading target text from %s...' % args.target_filename,
              file=sys.stderr,
              flush=True)
    with xopen(args.target_filename, 'r', encoding='utf-8') as f:
        trg_sents, trg_index = read_text(f, args.lowercase,
                                         args.target_prefix_len,
                                         args.target_suffix_len)
        trg_voc_size = len(trg_index)
        n_trg_sents = len(trg_sents)
        trg_index = None
        trgf = NamedTemporaryFile('wb')
        write_text(trgf, tuple(trg_sents), trg_voc_size)
        trg_sents = None

    if n_src_sents != n_trg_sents:
        print('ERROR: number of sentences differ in input files (%d vs %d)' %
              (n_src_sents, n_trg_sents),
              file=sys.stderr,
              flush=True)
        sys.exit(1)

    iters = (args.iters1, args.iters2, args.iters3)
    if any(x is None for x in iters[:args.model]):
        iters = None

    if args.verbose:
        print('Aligning %d sentences...' % n_src_sents,
              file=sys.stderr,
              flush=True)

    fwd_alignment_file = NamedTemporaryFile('w')
    rev_alignment_file = NamedTemporaryFile('w')

    align(srcf.name,
          trgf.name,
          links_filename_fwd=fwd_alignment_file.name,
          links_filename_rev=rev_alignment_file.name,
          statistics_filename=None,
          scores_filename=None,
          model=args.model,
          n_iterations=iters,
          n_samplers=args.n_samplers,
          quiet=not args.verbose,
          rel_iterations=args.length,
          null_prior=args.null_prior,
          use_gdb=args.debug)

    srcf.close()
    trgf.close()

    # split and, if requested, lowercase tokens
    logger.info("Preprocessing sentences for probability estimation...")
    with xopen(args.source_filename, 'r', encoding='utf-8') as fsrc, \
            xopen(args.target_filename, 'r', encoding='utf-8') as ftgt:
        src_sents = preprocess(fsrc.readlines(), args.lowercase)
        trg_sents = preprocess(ftgt.readlines(), args.lowercase)

    # extract token --> index hash table
    logger.info("Extracting vocabulary...")
    voc_s = make_voc(src_sents)
    voc_t = make_voc(trg_sents)

    if args.p_filename_fwd is not None:
        logger.info("Estimating forward counts...")
        counts, s_counts = compute_counts_fwd(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              fwd_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating forward probabilities...")
        p = compute_p(voc_s, voc_t, counts, s_counts)
        logger.info("Saving forward probabilities...")
        model = IBM1(p, voc_s, voc_t)
        save_p(model, args.p_filename_fwd)
        if args.p_filename_fwd_h is not None:
            with xopen(args.p_filename_fwd_h, "w") as f:
                model.dump(f)

    if args.p_filename_rev is not None:
        logger.info("Estimating reverse counts...")
        counts, t_counts = compute_counts_rev(voc_s, voc_t, src_sents,
                                              trg_sents,
                                              rev_alignment_file.name,
                                              args.lowercase)
        logger.info("Estimating reverse probabilities...")
        p = compute_p(voc_t, voc_s, counts, t_counts)
        logger.info("Saving reverse probabilities...")
        model = IBM1(p, voc_t, voc_s)
        save_p(model, args.p_filename_rev)
        if args.p_filename_rev_h is not None:
            with xopen(args.p_filename_rev_h, "w") as f:
                model.dump(f)

    fwd_alignment_file.close()
    rev_alignment_file.close()
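
# A short sketch of loading a saved model (hypothetical filename), assuming
# save_p writes a plain pickle as the -f/-r help text suggests:
#
#   import pickle
#   with open('fwd.probs', 'rb') as f:
#       model = pickle.load(f)   # an IBM1 instance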