Ejemplo n.º 1
0
    def _PREPARE_DEVSET_(workspace,
                         path,
                         config,
                         alias='dev',
                         input_format='cdec',
                         grammar_dir=None):
        """
        Load a dev(test) set and dump its inputs and references to the workspace.

        :param workspace: directory where '<alias>.input' and '<alias>.refs' are written
        :param path: path to the file containing the segments
        :param config: configuration object (currently unused; kept for interface compatibility)
        :param alias: alias of the set (determines the output file names)
        :param input_format: input format understood by SegmentMetaData.parse
        :param grammar_dir: optional directory containing per-segment grammars
        :returns: list of parsed SegmentMetaData instances
        """
        # load dev set and separate input and references
        logging.info('Reading %s set: %s', alias, path)
        with smart_ropen(path) as f:
            devset = [SegmentMetaData.parse(line.strip(),
                                            input_format,
                                            grammar_dir=grammar_dir)
                      for line in f]
        logging.info('%d %s instances', len(devset), alias)

        # dump source and references;
        # write() (instead of the Python 2-only 'print >>') works on both Python 2 and 3
        with smart_wopen('{0}/{1}.input'.format(workspace, alias)) as fi:
            with smart_wopen('{0}/{1}.refs'.format(workspace, alias)) as fr:
                for seg in devset:
                    fi.write('{0}\n'.format(seg.to_sgm(dump_refs=False)))
                    fr.write('{0}\n'.format(' ||| '.join(str(ref) for ref in seg.refs)))
        return devset
Ejemplo n.º 2
0
    def sample(self,
               run,
               alias,
               config,
               samples=1000,
               grammar=None,
               extra_parameters=''):
        """
        Sample derivations for a certain set of segments.

        :param run: current iteration (determines the run folder)
        :param alias: alias of the set (determines the workspace)
        :param config: the number of the configuration file to be used (if not given, we assume the same as iteration)
            For example, sample(1, 'dev', 0)  will sample at the beginning of iteration 1 using config0.ini.
            Alternatively, sample(1, 'devtest', 1) will sample at the end of iteration 1 using config1.ini.
        :param samples: how many samples to draw
        :param grammar: path to a grammar (typically necessary when sampling for a devtest set)
        :param extra_parameters: additional parameters to chisel.sampler
        :returns: path to samples
        """
        # substitution map for the command line template below
        options = {
            'config': '{0}/{1}'.format(self.workspace, config),
            'workspace': '{0}/{1}/{2}'.format(self.workspace, run, alias),
            'samples': samples
        }
        mkdir(options['workspace'])
        # command line
        cmd_str = 'python -m chisel.sampler %(config)s %(workspace)s --samples %(samples)d' % options
        # additional parameters including --grammar
        if grammar is not None:
            cmd_str = '{0} --grammar {1}'.format(cmd_str, grammar)
        if extra_parameters:
            cmd_str = '{0} {1}'.format(cmd_str, extra_parameters)
        logging.debug('[%s] Run: %s', run, cmd_str)
        # prepare args
        cmd_args = shlex.split(cmd_str)
        # sample: the sampler reads segments from stdin and we capture its
        # stdout/stderr into the run's log files
        t0 = time()
        logging.info('[%s] Sampling %d solutions (%s)...', run, samples, alias)
        with smart_ropen('{0}/{1}.input'.format(self.workspace, alias)) as fi:
            with smart_wopen(
                    self.path_to_log('sampling-{0}'.format(alias), run)) as fo:
                with smart_wopen(
                        self.path_to_log('sampling-{0}'.format(alias),
                                         run,
                                         err=True)) as fe:
                    # record the exact command at the top of the error log for debugging
                    fe.write('{0}\n'.format(cmd_str))
                    proc = sp.Popen(cmd_args, stdin=fi, stdout=fo, stderr=fe)
                    proc.wait()
        dt = time() - t0
        logging.info('[%s]  sampling took %f seconds', run, dt)
        return '{0}/samples'.format(options['workspace'])
Ejemplo n.º 3
0
    def decide(self, run, alias, config, extra_parameters=''):
        """
        Apply a decision rule to previously drawn samples.

        :param run: current iteration (determines the run folder)
        :param alias: alias of the set (determines the workspace)
        :param config: the number of the configuration file to be used (if not given, we assume the same as iteration)
            For example, decide(1, 'dev', 0)  will decide from samples drawn at the beginning of iteration 1 using config0.ini.
            Alternatively, decide(1, 'devtest', 1) will decide from sample drawn at the end of iteration 1 using config1.ini.
        :param extra_parameters: additional parameters to chisel.fast_consensus
        :returns: (path to ranked decisions, path to 1-best outputs)
        """
        # substitution map for the command line template
        options = {
            'config': '{0}/{1}'.format(self.workspace, config),
            'workspace': '{0}/{1}/{2}'.format(self.workspace, run, alias)
        }
        # assemble the command line (plus any extra parameters)
        cmd_str = 'python -m chisel.fast_consensus %(config)s %(workspace)s ' % options
        if extra_parameters:
            cmd_str = '{0} {1}'.format(cmd_str, extra_parameters)
        logging.debug('[%s] Run: %s', run, cmd_str)
        cmd_args = shlex.split(cmd_str)
        # run the decision rule, capturing stdout/stderr into the run's log files
        started = time()
        logging.info('[%s] Deciding (%s)...', run, alias)
        log_out = self.path_to_log('decision-{0}'.format(alias), run)
        log_err = self.path_to_log('decision-{0}'.format(alias), run, err=True)
        with smart_wopen(log_out) as fo:
            with smart_wopen(log_err) as fe:
                proc = sp.Popen(cmd_args, stdin=None, stdout=fo, stderr=fe)
                proc.wait()
        dt = time() - started
        logging.info('[%s]  deciding took %f seconds', run, dt)
        decisions_dir = '{0}/decisions'.format(options['workspace'])
        output_dir = '{0}/output'.format(options['workspace'])
        return decisions_dir, output_dir
Ejemplo n.º 4
0
    def _BASE_CONFIG_(config, workspace, proxy_wmap, target_wmap):
        """
        Overwrite the [proxy] and [target] sections of a configuration object
        and dump it as the iteration-0 configuration file.

        :param config: configuration object supporting remove_section/add_section/set/write
        :param workspace: directory where config0.ini is written
        :param proxy_wmap: mapping of proxy feature names to weights
        :param target_wmap: mapping of target feature names to weights
        :returns: path to config0.ini
        """
        # plain for-loops instead of side-effect list comprehensions;
        # items() (instead of Python 2-only iteritems()) works on both Python 2 and 3
        config.remove_section('proxy')
        config.add_section('proxy')
        for f, v in proxy_wmap.items():
            config.set('proxy', f, v)

        config.remove_section('target')
        config.add_section('target')
        for f, v in target_wmap.items():
            config.set('target', f, v)

        config_path = '{0}/config0.ini'.format(workspace)
        with smart_wopen(config_path) as fo:
            config.write(fo)

        return config_path
Ejemplo n.º 5
0
    def save(self, raw_samples, odir, suffix=''):
        """
        Dump the weight maps and the raw samples of this segment to a gzipped file.

        :param raw_samples: samples, each exposing count, projection and vector
        :param odir: output directory
        :param suffix: optional suffix appended to the file name
        """
        with smart_wopen('{0}/{1}{2}.gz'.format(odir, self.segment_.id, suffix)) as fo:
            # write() and items() (instead of the Python 2-only 'print >>',
            # iteritems() and tuple-unpacking lambda) work on both Python 2 and 3;
            # keys are unique, so sorting the (k, v) pairs orders by key
            fo.write('[proxy]\n')
            fo.write('\n'.join('{0}={1}'.format(k, v) for k, v in sorted(self.proxy_weights_.items())))
            fo.write('\n\n')

            fo.write('[target]\n')
            fo.write('\n'.join('{0}={1}'.format(k, v) for k, v in sorted(self.target_weights_.items())))
            fo.write('\n\n')

            fo.write('[samples]\n')
            fo.write('# count projection vector\n')
            # most frequent samples first
            for sample in sorted(raw_samples, key=lambda r: r.count, reverse=True):
                fo.write('{0}\t{1}\t{2}\n'.format(sample.count, sample.projection, sample.vector))
Ejemplo n.º 6
0
 def assess(self, run, alias):
     """
     Score the consensus-bleu output of a run with the external scoring tool.

     :param run: current iteration (determines the run folder)
     :param alias: alias of the set (determines the workspace)
     :returns: the score as a float, or None if it could not be read
     """
     # where samples, decisions and outputs can be found
     workspace = '{0}/{1}/{2}'.format(self.workspace, run, alias)
     # command line
     cmd_str = '{0} -r {1}'.format(
         self.args.scoring_tool,
         '{0}/{1}.refs'.format(self.workspace, alias))
     logging.debug('[%s] Run: %s', run, cmd_str)
     # prepare args
     cmd_args = shlex.split(cmd_str)
     # assess: the tool reads translations from stdin and prints the score on stdout
     t0 = time()
     trans_path = '{0}/output/consensus-bleu'.format(workspace)
     logging.info('[%s] Assessing (%s)...', run, alias)
     score = None
     with smart_ropen(trans_path) as fin:
         bleu_out = '{0}.bleu.stdout'.format(splitext(trans_path)[0])
         bleu_err = '{0}.bleu.stderr'.format(splitext(trans_path)[0])
         with smart_wopen(bleu_out) as fout:
             with smart_wopen(bleu_err) as ferr:
                 proc = sp.Popen(cmd_args,
                                 stdin=fin,
                                 stdout=fout,
                                 stderr=ferr)
                 proc.wait()
                 try:
                     with smart_ropen(bleu_out) as fi:
                         line = next(fi)
                         score = float(line.strip())
                 except Exception:
                     # except Exception (not bare except): a missing/empty/garbled
                     # stdout file is tolerated and logged, but KeyboardInterrupt
                     # and SystemExit still propagate
                     logging.error('[%s] Problem reading %s for %s', run,
                                   bleu_out, alias)
     dt = time() - t0
     logging.info('[%s]  assessing took %f seconds', run, dt)
     return score
Ejemplo n.º 7
0
def main():
    """
    Entry point: apply the consensus-bleu decision rule to previously drawn
    samples and collect the 1-best yields into a single file.
    """
    options, config = argparse_and_config()

    # samples must already exist under the workspace
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception(
            'If a workspace is set, samples are expected to be found under $workspace/samples'
        )
    logging.info('Reading samples from %s', samples_dir)

    # make sure the output folders exist
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dir = create_decision_rule_dir(options.workspace, 'consensus',
                                          'bleu')
    one_best_file = '{0}/output/{1}-{2}'.format(options.workspace, 'consensus',
                                                'bleu')
    logging.info("Writing '%s' solutions to %s", 'consensus', output_dir)
    logging.info("Writing 1-best '%s' yields to %s", 'consensus',
                 one_best_file)

    # TODO: generalise this
    headers = {
        'derivation': 'd',
        'vector': 'v',
        'count': 'n',
        'log_ur': 'log_ur',
        'importance': 'importance'
    }

    # one job per numbered sample file found in the workspace
    jobs = [(fid, input_file)
            for fid, input_file in list_numbered_files(samples_dir)]
    logging.info('%d jobs', len(jobs))

    # decide for every job in parallel; each job saves its ranking to its own file
    worker = partial(decide_and_save,
                     headers=headers,
                     options=options,
                     output_dir=output_dir)
    results = Pool(options.jobs).map(worker, jobs)

    # collect the 1-best yield of every job into a single file
    with smart_wopen(one_best_file) as fout:
        for y, l, p, q in results:
            fout.write('{0}\n'.format(y))
Ejemplo n.º 8
0
def decide_and_save(job_desc, headers, options, output_dirs):
    """
    Run the decision rules for one job and dump each ranking to disk.

    :param job_desc: pair (job id, path to the samples file)
    :param headers: column headers expected in the samples file
    :param options: parsed command line options
    :param output_dirs: maps a decision rule name to its output directory
    :returns: dict mapping each decision rule to its top-ranked solution
    """
    jid, path = job_desc
    # this code runs in a Pool, thus we wrap in try/except in order to have more informative exceptions
    try:
        # make decisions
        decisions = make_decisions(job_desc, headers, options)
        # write each rule's ranking to its own gzipped file;
        # write() and items() (instead of the Python 2-only 'print >>' and
        # iteritems()) work on both Python 2 and 3
        for rule, ranking in decisions.items():
            with smart_wopen('{0}/{1}.gz'.format(output_dirs[rule], jid)) as out:
                out.write('{0}\n'.format('\t'.join(['#target', '#p', '#q', '#yield'])))
                for solution in ranking:
                    out.write('{0}\n'.format(solution.format_str(keys=['p', 'q', 'yield'])))
                out.write('\n')
        return {rule: solutions[0] for rule, solutions in decisions.items()}
    except Exception:
        # except Exception (not bare except) so KeyboardInterrupt/SystemExit propagate
        raise Exception('job={0} exception={1}'.format(jid, ''.join(traceback.format_exception(*sys.exc_info()))))
Ejemplo n.º 9
0
def decide_and_save(job_desc, headers, options, output_dirs):
    """
    Run the decision rules for one job and dump each ranking to disk.

    :param job_desc: pair (job id, path to the samples file)
    :param headers: column headers expected in the samples file
    :param options: parsed command line options
    :param output_dirs: maps a decision rule name to its output directory
    :returns: dict mapping each decision rule to its top-ranked solution
    """
    jid, path = job_desc
    # this code runs in a Pool, thus we wrap in try/except in order to have more informative exceptions
    try:
        # make decisions
        decisions = make_decisions(job_desc, headers, options)
        # write each rule's ranking to its own gzipped file;
        # write() and items() (instead of the Python 2-only 'print >>' and
        # iteritems()) work on both Python 2 and 3
        for rule, ranking in decisions.items():
            with smart_wopen('{0}/{1}.gz'.format(output_dirs[rule],
                                                 jid)) as out:
                out.write('{0}\n'.format('\t'.join(
                    ['#target', '#p', '#q', '#yield'])))
                for solution in ranking:
                    out.write('{0}\n'.format(
                        solution.format_str(keys=['p', 'q', 'yield'])))
                out.write('\n')
        return {
            rule: solutions[0]
            for rule, solutions in decisions.items()
        }
    except Exception:
        # except Exception (not bare except) so KeyboardInterrupt/SystemExit propagate
        raise Exception('job={0} exception={1}'.format(
            jid, ''.join(traceback.format_exception(*sys.exc_info()))))
Ejemplo n.º 10
0
    def update_config_file(self,
                           before,
                           after,
                           proxy_scaling=None,
                           target_scaling=None,
                           proxy=None,
                           target=None):
        """
        Derive a new configuration file from an existing one, optionally
        overriding scalings and weight maps.

        :param before: name of the existing config file (relative to the workspace)
        :param after: name of the new config file (relative to the workspace)
        :param proxy_scaling: optional scaling of the proxy model
        :param target_scaling: optional scaling of the target model
        :param proxy: optional proxy weight map (defaults to self.wmap.proxy)
        :param target: optional target weight map (defaults to self.wmap.target)
        :returns: path to the new config file
        :raises IOError: if the input config file does not exist
        """
        config_path = '{0}/{1}'.format(self.workspace, before)
        if not os.path.exists(config_path):
            # bug fix: the original interpolated the undefined name 'path',
            # which would raise NameError instead of the intended IOError
            raise IOError(
                'Perhaps iteration %s did not complete successfully?' % config_path)

        config = Config(config_path)

        config.add_section('chisel:model')
        if proxy_scaling is not None:
            config.set('chisel:model', 'proxy_scaling', proxy_scaling)
        if target_scaling is not None:
            config.set('chisel:model', 'target_scaling', target_scaling)

        # plain for-loops instead of side-effect list comprehensions;
        # items() (instead of Python 2-only iteritems()) works on both Python 2 and 3
        config.add_section('proxy')
        proxy_map = self.wmap.proxy if proxy is None else proxy
        for f, v in proxy_map.items():
            config.set('proxy', f, v)

        config.add_section('target')
        target_map = self.wmap.target if target is None else target
        for f, v in target_map.items():
            config.set('target', f, v)

        config_path = '{0}/{1}'.format(self.workspace, after)
        with smart_wopen(config_path) as fo:
            config.write(fo)
        return config_path
Ejemplo n.º 11
0
def decide_and_save(job_desc, headers, options, output_dir):
    """
    Run the decision rule for one job and dump the ranking to disk.

    :param job_desc: pair (job id, path to the samples file)
    :param headers: column headers expected in the samples file
    :param options: parsed command line options (nbest limits how much is saved)
    :param output_dir: directory where the ranking is written
    :returns: the top-ranked (yield, loss, p, q) tuple
    """
    jid, path = job_desc
    # this code runs in a Pool, thus we wrap in try/except in order to have more informative exceptions
    try:
        # make decisions
        ranking = make_decisions(job_desc, headers, options)
        # truncate to the n best solutions if requested (single loop instead of
        # two duplicated write loops)
        to_save = ranking[0:options.nbest] if options.nbest > 0 else ranking
        with smart_wopen('{0}/{1}.gz'.format(output_dir,
                                             jid)) as out:  # TODO: save nbest
            out.write('{0}\n'.format('\t'.join(
                ['#target', '#p', '#q', '#yield'])))
            for y, l, p, q in to_save:
                out.write('{0}\n'.format('\t'.join(
                    str(x) for x in [l, p, q, y])))
        return ranking[0]
    except Exception:
        # except Exception (not bare except) so KeyboardInterrupt/SystemExit propagate
        raise Exception('job={0} exception={1}'.format(
            jid, ''.join(traceback.format_exception(*sys.exc_info()))))
Ejemplo n.º 12
0
    def training_loss(self, run, alias, segments, samples):
        """
        Compute the training loss of each sampled derivation's yield.

        :param run: current iteration (determines the run folder)
        :param alias: alias of the set
        :param segments: segments (each carrying reference translations)
        :param samples: derivations sampled for each segment (parallel to segments)
        :returns: list of {projection: loss} maps, one per segment
        """
        L = []

        if self.args.save_loss:
            # NOTE(review): 'alias' is passed but the format string has no {2},
            # so every alias shares '{workspace}/{run}/loss' -- confirm intended
            loss_dir = '{0}/{1}/loss'.format(self.workspace, run, alias)
            mkdir(loss_dir)

        logging.info('[%s] Computing loss (%s)...', run, alias)
        t0 = time()
        # run fast bleu implementation
        # TODO: generalise to other metrics
        for seg, derivations in zip(segments, samples):
            # score each distinct yield only once
            projections = frozenset(d.tree.projection for d in derivations)
            scorer = TrainingBLEU(seg.refs)
            lmap = {y: scorer.loss(y.split()) for y in projections}
            L.append(lmap)
            if self.args.save_loss:
                with smart_wopen('{0}/{1}.gz'.format(loss_dir, seg.id)) as fo:
                    for d in derivations:
                        fo.write('{0}\n'.format(lmap[d.tree.projection]))
        dt = time() - t0
        # fixed typo ('loos') and use %f for the float duration, consistent
        # with the other timing messages in this project
        logging.info('[%s]  computing loss took %f seconds', run, dt)
        return L
Ejemplo n.º 13
0
def main():
    """
    Entry point: apply the requested decision rules (MAP/MBR/consensus) to
    previously drawn samples and save rankings plus 1-best outputs per rule.
    """
    options, config = argparse_and_config()

    # loads mteval modules (default to BLEU when no metric section is given)
    if config.has_section('chisel:metrics'):
        metrics_map = dict(config.items('chisel:metrics'))
    else:
        metrics_map = {'bleu': 'chisel.mteval.bleu'}
    mteval.load(metrics_map, frozenset([options.metric]))

    if not mteval.sanity_check(options.metric):
        raise Exception(
            "Perhaps you forgot to include the metric '%s' in the configuration file?"
            % options.metric)

    # configure mteval metrics
    if config.has_section('chisel:metrics:config'):
        metrics_config = dict(config.items('chisel:metrics:config'))
    else:
        metrics_config = {}
    logging.debug('chisel:metrics:config: %s', metrics_config)
    mteval.configure(metrics_config)

    # gather decision rules to be run
    decision_rules = []
    if options.map:
        decision_rules.append('MAP')
    if options.mbr:
        decision_rules.append('MBR')
    if options.consensus:
        decision_rules.append('consensus')

    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception(
            'If a workspace is set, samples are expected to be found under $workspace/samples'
        )
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        if rule == 'MAP':
            # MAP does not depend on the evaluation metric
            output_dirs[rule] = create_decision_rule_dir(
                options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(
                options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(
                options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(
                options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule,
                     one_best_files[rule])

    # TODO: generalise this
    headers = {
        'derivation': 'd',
        'vector': 'v',
        'count': 'n',
        'log_ur': 'log_ur',
        'importance': 'importance'
    }

    # read jobs from workspace (one per numbered sample file)
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))

    # run decision rules in parallel and save them to files
    pool = Pool(options.jobs)
    results = pool.map(
        partial(decide_and_save,
                headers=headers,
                options=options,
                output_dirs=output_dirs), jobs)
    # save the 1-best solution for each decision rule in a separate file;
    # write() (instead of the Python 2-only 'print >>') works on both Python 2 and 3
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                fout.write('{0}\n'.format(best.solution.Dy.projection))
Ejemplo n.º 14
0
def main():
    """
    Entry point: apply the requested decision rules (MAP/MBR/consensus) to
    previously drawn samples and save rankings plus 1-best outputs per rule.
    """
    options, config = argparse_and_config()

    # loads mteval modules (default to BLEU when no metric section is given)
    if config.has_section('chisel:metrics'):
        metrics_map = dict(config.items('chisel:metrics'))
    else:
        metrics_map = {'bleu': 'chisel.mteval.bleu'}
    mteval.load(metrics_map, frozenset([options.metric]))

    if not mteval.sanity_check(options.metric):
        raise Exception("Perhaps you forgot to include the metric '%s' in the configuration file?" % options.metric)

    # configure mteval metrics
    if config.has_section('chisel:metrics:config'):
        metrics_config = dict(config.items('chisel:metrics:config'))
    else:
        metrics_config = {}
    logging.debug('chisel:metrics:config: %s', metrics_config)
    mteval.configure(metrics_config)

    # gather decision rules to be run
    decision_rules = []
    if options.map:
        decision_rules.append('MAP')
    if options.mbr:
        decision_rules.append('MBR')
    if options.consensus:
        decision_rules.append('consensus')

    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception('If a workspace is set, samples are expected to be found under $workspace/samples')
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        if rule == 'MAP':
            # MAP does not depend on the evaluation metric
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule, one_best_files[rule])

    # TODO: generalise this
    headers = {'derivation': 'd', 'vector': 'v', 'count': 'n', 'log_ur': 'log_ur', 'importance': 'importance'}

    # read jobs from workspace (one per numbered sample file)
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))

    # run decision rules in parallel and save them to files
    pool = Pool(options.jobs)
    results = pool.map(partial(decide_and_save,
                               headers=headers,
                               options=options,
                               output_dirs=output_dirs),
                       jobs)
    # save the 1-best solution for each decision rule in a separate file;
    # write() (instead of the Python 2-only 'print >>') works on both Python 2 and 3
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                fout.write('{0}\n'.format(best.solution.Dy.projection))