Example #1
def main():
    args = argparser().parse_args()
    if args.hypotheses:
        if not os.path.exists(args.hypotheses):
            raise FileNotFoundError('Hypotheses file not found: %s' %
                                    args.hypotheses)
        hstream = smart_ropen(args.hypotheses).readlines()
    else:
        hstream = sys.stdin.readlines()
    if not os.path.exists(args.references):
        raise FileNotFoundError('Reference file not found: %s' %
                                args.references)
    rstream = smart_ropen(args.references).readlines()

    # compute bleu
    bleu, pn, bp = stream_doc_bleu(hstream, rstream, args.order,
                                   args.smoothing)
    print(bleu)

    # log brevity penalty, n-gram precisions, and BLEU-1 to BLEU-order
    print('grasp.mt.bleu loaded %d segments' % len(hstream), file=sys.stderr)
    bleus = []
    for max_order in range(1, args.order + 1):
        bleus.append(bp * np.exp(np.mean(np.log(pn[:max_order]))))
    print('bp=%.4f ||| %s ||| %s' %
          (bp, ' '.join('p%d=%.4f' % (i, x) for i, x in enumerate(pn, 1)),
           ' '.join('bleu-%d=%.4f' % (i, x) for i, x in enumerate(bleus, 1))),
          file=sys.stderr)
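
For orientation, the output this produces: the document-level BLEU goes to stdout and the diagnostics line to stderr, in the formats printed above. The values below are purely illustrative:

# stdout:
#   0.2543
# stderr (hypothetical values, consistent with the formula above):
#   grasp.mt.bleu loaded 100 segments
#   bp=0.9876 ||| p1=0.6000 p2=0.3500 p3=0.2200 p4=0.1500 ||| bleu-1=0.5926 bleu-2=0.4526 bleu-3=0.3544 bleu-4=0.2849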
Example #2
def mteval(args, staticdir, model, segments, hyp_path, ref_path, eval_path,
           ranking_path):
    """
    Decode and evaluate with an external tool.
    :return: BLEU score.
    """

    if ranking_path:
        os.makedirs(ranking_path, exist_ok=True)

    # decode
    with Pool(args.jobs) as workers:
        results = workers.map(
            partial(training_decode,
                    args=args,
                    n_samples=args.samples[1],
                    staticdir=staticdir,
                    decisiondir=ranking_path,
                    model=model,
                    redo=args.redo,
                    log=logging.info), segments)

    # write best decisions to file
    with smart_wopen(hyp_path) as fo:
        for y in results:
            print(y, file=fo)

    # call scoring tool
    cmd_str = '{0} -r {1}'.format(args.scoring_tool, ref_path)
    logging.info('Scoring: %s', cmd_str)
    # prepare args
    cmd_args = shlex.split(cmd_str)
    # assess
    score = None
    with smart_ropen(hyp_path) as fin:
        with smart_wopen('{0}.stdout'.format(eval_path)) as fout:
            with smart_wopen('{0}.stderr'.format(eval_path)) as ferr:
                with sp.Popen(cmd_args, stdin=fin, stdout=fout,
                              stderr=ferr) as proc:
                    proc.wait()
    try:
        with smart_ropen('{0}.stdout'.format(eval_path)) as fi:
            line = next(fi)
            score = float(line.strip())
    except (StopIteration, ValueError):
        logging.error('Problem reading %s.stdout', eval_path)

    return score
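
The external scorer only needs to honor a simple contract: hypotheses arrive on stdin, -r names the reference file, and the score must be the first line of stdout (that is all the float(...) parse above relies on). A minimal stand-in, purely illustrative (a real tool would compute e.g. BLEU):

import argparse
import sys

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('-r', required=True, help='reference file')
    opts = ap.parse_args()
    hyps = [line.strip() for line in sys.stdin]
    with open(opts.r) as fi:
        refs = [line.strip() for line in fi]
    # toy score: fraction of exact matches
    score = sum(h == r for h, r in zip(hyps, refs)) / max(len(refs), 1)
    print('%.4f' % score)

if __name__ == '__main__':
    main()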
Example #3
def load_grammar(path):
    """
    Load a grammar from a text file.
    :param path:
    :return:
    """
    return SCFG(iterrules(smart_ropen(path)))
Example #4
def construct_extractors(path):
    """
    Read a configuration file and construct the extractors specified in each line.
    :param path: path to configuration file
    :return: list of extractors (in the order they were listed in the configuration file)
    """
    extractors = []
    names = set()
    with smart_ropen(path) as fi:
        for i, line in enumerate(fi, 1):
            line = line.strip()
            if not line or line.startswith('#'):  # skip empty lines and comments
                continue

            try:
                cfg, [cls] = re_sub('^([^ ]+)', '', line)
            except ValueError:
                raise ValueError('In line %d, expected class name: %s' %
                                 (i, line))
            cfg, name = re_key_value('name', cfg)
            if not name:
                name = cls
            if name in names:
                raise ValueError(
                    'In line %d, duplicate name (%s), rename your extractor with name=<CustomName>'
                    % (i, name))
            names.add(name)
            cfg, pkg = re_key_value('pkg', cfg)
            impl = get_extractor_implementation(cls, pkg)
            extractor = impl.construct(len(extractors), name, cfg)
            extractors.append(extractor)
    return extractors
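
The configuration format this accepts, reconstructed from the parsing above: one extractor per line, starting with the class name and followed by key=value options, where name= (optional, defaults to the class name) must be unique and pkg= selects the implementing package. The extractor classes and options below are hypothetical:

# lines starting with '#' are comments; empty lines are skipped
WordPenalty
LanguageModel name=LM2 pkg=my.extractors order=3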
Example #5
def read_weights(path,
                 default=None,
                 random=False,
                 temperature=1.0,
                 u=0,
                 std=0.01):
    """
    Read a sequence of key-value pairs.
    :param path: path to the file to read the sequence from
    :param default: if set, overrides the values read from the file
    :param random: if set, sample values from a normal with mean u and standard deviation std
    :param temperature: scales the final weight: weight/temperature
    :param u: mean of the normal
    :param std: standard deviation of the normal
    :return: dict mapping feature names to weights
    """
    wmap = {}
    with smart_ropen(path) as fi:
        for line in fi:
            fields = line.split()
            if len(fields) != 2:
                continue
            w = float(fields[1])
            if default is not None:
                w = default
            elif random:
                w = np.random.normal(u, std)
            w /= temperature
            wmap[fields[0]] = w
    return wmap
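
The expected file format, inferred from the loop above: one whitespace-separated '<name> <value>' pair per line (anything else is silently skipped). A usage sketch with a hypothetical path:

wmap = read_weights('weights.txt')                        # values as read from file
wmap = read_weights('weights.txt', default=0.0)           # ignore file values, use 0.0
wmap = read_weights('weights.txt', random=True, std=0.1)  # resample from a normal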
Example #6
def read_factorisation(path):
    """
    Return a joint and a conditional factorisation of the model.
    :param path: path to a file with the complete factorisation of a model
    """
    joint_cfg = defaultdict(set)
    conditional_cfg = defaultdict(set)
    if path:
        with smart_ropen(path) as fi:
            changes = None
            for line in fi:
                line = line.strip()
                if not line or line.startswith('#'):  # ignore comments and empty lines
                    continue
                if line == '[joint]':
                    changes = joint_cfg
                elif line == '[conditional]':
                    changes = conditional_cfg
                elif changes is None:
                    raise ValueError('Syntax error in factorisation file')
                elif line.startswith('local='):
                    names = line.replace('local=', '', 1)
                    changes['local'].update(names.split())
                elif line.startswith('nonlocal='):
                    names = line.replace('nonlocal=', '', 1)
                    changes['nonlocal'].update(names.split())

    return joint_cfg, conditional_cfg
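
A factorisation file this parser accepts, reconstructed from the branches above (the feature names are hypothetical):

[joint]
local=LanguageModel WordPenalty
nonlocal=StatefulLM
[conditional]
local=WordPenalty
nonlocal=StatefulLM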
Example #7
def read_empirical_distribution(path):
    """
    Return the empirical distribution (a numpy array) and the support (tuples).
    :param path: path of distribution over projections
    :return:
    """
    Y = []
    P = []
    with smart_ropen(path) as fi:
        for line in fi:
            line = line.strip()
            if not line or line.startswith('#'):  # skip empty lines and comments
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    'Bad format: I expected the first column to be an estimate and the last to be the solution.'
                )
            estimate = float(fields[0])
            projection = tuple(fields[-1].split())
            Y.append(projection)
            P.append(estimate)
    return np.array(P), tuple(Y)
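
The input format, inferred from the code: tab-separated fields where the first is the probability estimate and the last is the whitespace-tokenised projection; lines starting with '#' are comments. For example (values hypothetical, <TAB> standing for a literal tab):

0.7<TAB>the black dog
0.3<TAB>a black dog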
Example #8
def mteval(args, workspace, iteration, proxy, target, segments, alias):
    decisions = sample_and_decode(
        args, '{0}/{1}'.format(workspace, alias),
        '{0}/iterations/{1}/{2}'.format(workspace, iteration,
                                        alias), proxy, target, segments)
    evaldir = '{0}/iterations/{1}/{2}'.format(workspace, iteration, alias)
    os.makedirs(evaldir, exist_ok=True)
    with smart_wopen('{0}/hyps'.format(evaldir)) as fo:
        for y, p, l in decisions:
            print(y, file=fo)
    bleu, pn, bp = stream_doc_bleu(smart_ropen('{0}/hyps'.format(evaldir)),
                                   smart_ropen('{0}/{1}/refs'.format(
                                       workspace, alias)),
                                   max_order=args.bleu_order,
                                   smoothing=args.bleu_smoothing)
    logging.info('BLEU %s: %.4f', alias, bleu)
    return bleu
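
For orientation, the layout implied by the format strings above:

# {workspace}/{alias}/refs                         reference translations (read)
# {workspace}/iterations/{iteration}/{alias}/hyps  best decisions (written)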
Example #9
def read_segments_from_file(path, grammar_dir=None, shuffle=False) -> 'tuple':
    """
    Read cdec-formatted input segments (possibly along with their reference translations) from a file.
    :param path: path to file (possibly gzipped)
    :param grammar_dir: overwrites grammar directory
    :param shuffle: shuffle segments in place
    :return: tuple of SegmentMetaData objects
    """
    return read_segments_from_stream(smart_ropen(path), grammar_dir=grammar_dir, shuffle=shuffle)
Example #10
def decode(seg, args, n_samples, model, saving, redo, log=dummyfunc):

    # first we check whether the decisions have been completed before
    if is_step_complete('decisions', saving, redo):
        log('[%d] Reusing decisions', seg.id)
        with smart_ropen(saving['decisions']) as fi:
            for line in fi:
                line = line.strip()
                if not line or line.startswith('#'):  # skip empty lines and comments
                    continue
                fields = line.split(' ||| ')  # that should be (loss, posterior, solution)
                if len(fields) == 3:
                    return fields[2]  # that's the solution

    forest, lfunc, tsort, sampler = make_slice_sampler(seg,
                                                       model,
                                                       extra_grammar_paths=args.extra_grammar,
                                                       glue_grammar_paths=args.glue_grammar,
                                                       pass_through=args.pass_through,
                                                       default_symbol=args.default_symbol,
                                                       goal_str=args.goal,
                                                       start_str=args.start,
                                                       saving=saving,
                                                       redo=args.redo,
                                                       log=log)

    d0, markov_chain = sampler.sample(n_samples=n_samples,
                                      batch_size=args.batch,
                                      within=args.within,
                                      initial=args.initial,
                                      prior=args.prior,
                                      burn=args.burn,
                                      lag=args.lag,
                                      temperature0=args.temperature0)

    # TODO: save stuff

    samples = apply_filters(markov_chain,
                            burn=args.burn,
                            lag=args.lag)

    decisions = consensus(seg, forest, samples)
    if 'decisions' in saving:
        # write all decisions to file
        with smart_wopen(saving['decisions']) as fo:
            print('# co-loss ||| posterior ||| solution', file=fo)
            for l, p, y in decisions:
                print('{0} ||| {1} ||| {2}'.format(l, p, y), file=fo)
    return decisions[0][2]  # return best translation
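
The decisions file written (and later reused) by this function therefore looks as follows, with hypothetical values:

# co-loss ||| posterior ||| solution
0.1234 ||| 0.5000 ||| the black dog barked
0.2345 ||| 0.2500 ||| a black dog barked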
Example #11
def iterrules(path, transform, fname='Prob'):
    with smart_ropen(path) as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            fields = line.split()
            lhs = fields[0]
            num, den = fields[-1].split('/')
            num = float(num)
            den = float(den)
            rhs = fields[1:-2]  # fields[-2] is the yield function, which we are ignoring
            yield CFGProduction(Nonterminal(lhs), [Nonterminal(s) for s in rhs],
                                {fname: transform(num / den)})
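
The rule format this expects, reconstructed from the field indexing: the LHS nonterminal, then the RHS nonterminals, then the yield function (ignored), then a num/den fraction, all whitespace-separated. For example (symbols hypothetical):

S NP VP f1 3/10
NP DT NN f2 7/10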
Example #12
def iterlexicon(path, transform, fname='Prob'):
    with smart_ropen(path) as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            word = fields[0]
            for pair in fields[1:]:
                tag, fraction = pair.split(' ')
                num, den = fraction.split('/')
                num = float(num)
                den = float(den)
                yield CFGProduction(Nonterminal(tag), (Terminal(word),),
                                    {fname: transform(num / den)})
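
The lexicon format, again inferred from the parsing: the word, then tab-separated 'tag num/den' pairs. For example (entries hypothetical, <TAB> standing for a literal tab):

dog<TAB>NN 9/10<TAB>VB 1/10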
Example #13
def read_grammar(istream, transform=float, cdec_adapt=False, fprefix='UnnamedFeature', ply_based=True):
    """
    Read a grammar from an input stream.
    :param istream: an input stream or a path to grammar file.
    :param transform: a transformation (e.g. log).
    :param cdec_adapt: whether or not the input grammar is in cdec format
    :param fprefix: prefix used in naming unnamed features
    :param ply_based: whether or not to use a lex-yacc parser
    :return: a CFG
    """

    if isinstance(istream, str):
        istream = smart_ropen(istream)
    if cdec_adapt:
        istream = cdec_adaptor(istream)
    if ply_based:
        parser = CFGYacc(transform=transform, fprefix=fprefix)
        parser.build(debug=False, optimize=True, write_tables=True, tabmodule='cfg_yacctab')
        return CFG(parser.parse(istream))
    else:
        return CFG(read_basic(istream, transform))
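
A usage sketch with hypothetical paths (smart_ropen suggests gzipped input is fine, as in the other readers):

import math
cfg = read_grammar('rules.gz', transform=math.log)    # PLY-based parsing (default)
cfg = read_grammar('cdec-rules.gz', cdec_adapt=True)  # adapt a cdec-formatted grammar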
Example #14
def get_factorised_models(model: Model, path='') -> (ModelView, ModelView):
    """
    Return a joint and a conditional factorisation of the model.

    :param model: a Model
    :param path: (optional) path to a file changing the default way of factorising a model
    :return: joint view and conditional view
    """
    joint_changes = defaultdict(set)
    conditional_changes = defaultdict(set)
    if path:
        with smart_ropen(path) as fi:
            changes = None
            for line in fi:
                line = line.strip()
                if not line or line.startswith('#'):  # ignore comments and empty lines
                    continue
                if line == '[joint]':
                    changes = joint_changes
                elif line == '[conditional]':
                    changes = conditional_changes
                elif changes is None:
                    raise ValueError('Syntax error in factorisation file')
                elif line.startswith('local='):
                    names = line.replace('local=', '', 1)
                    changes['local'].update(names.split())
                elif line.startswith('nonlocal='):
                    names = line.replace('nonlocal=', '', 1)
                    changes['nonlocal'].update(names.split())

    joint_model = ModelView(model.wmap, model.extractors(),
                            local_names=joint_changes['local'],
                            nonlocal_names=joint_changes['nonlocal'])
    conditional_model = ModelView(model.wmap, model.extractors(),
                                  local_names=conditional_changes['local'],
                                  nonlocal_names=conditional_changes['nonlocal'])
    return joint_model, conditional_model
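
A usage sketch, assuming a constructed Model and a factorisation file in the format shown in Example #6 (the path is hypothetical):

joint_view, conditional_view = get_factorised_models(model, path='factorisation.ini')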