Beispiel #1
0
    def KL(self):
        """Estimate the KL divergence between proxy and target distributions.

        Reads one sample file per devset segment, scores each segment in
        parallel with `wrapped_divergence`, and gathers the results.

        Returns:
            (divergences, derivatives): two float numpy arrays, one entry
            per devset segment.

        Raises:
            Exception: if the samples directory for this iteration is missing.
        """
        t0 = time()
        # read list of input files
        samples_dir = self.path_to_samples()
        if not os.path.isdir(samples_dir):
            raise Exception('[%d/%d] could not find samples' %
                            (self.parent_iteration, self.iteration))
        input_files = list_numbered_files(samples_dir)

        # pair each devset segment with its sample file
        jobs = [(self.devset[fid], input_file)
                for fid, input_file in input_files]

        # run jobs in parallel; close/join the pool so worker processes
        # are reclaimed even if a job raises
        pool = Pool(self.args.jobs)
        try:
            results = pool.map(
                partial(wrapped_divergence,
                        iteration=self.iteration,
                        q_wmap=self.wmap.proxy,
                        p_wmap=self.wmap.target,
                        sample_headers=KLDriver.SAMPLING_HEADERS), jobs)
        finally:
            pool.close()
            pool.join()
        # gather divergences and their derivatives into arrays
        divergences = np.array([result.KL for result in results], float)
        derivatives = np.array([result.dKL for result in results], float)
        dt = time() - t0
        logging.info('[%d/%d] assessing divergence took %f seconds',
                     self.parent_iteration, self.iteration, dt)
        return divergences, derivatives
Beispiel #2
0
 def read_samples(self, run, alias):
     """Load sampled derivations for every segment of `run`/`alias`.

     Returns a list with one entry (the derivations of a segment) per
     numbered sample file found under the workspace.
     """
     samples_dir = '{0}/{1}/{2}/samples'.format(self.workspace, run, alias)
     logging.info('[%s] Reading samples (%s)...', run, alias)
     numbered_files = list_numbered_files(samples_dir)
     samples = []
     start = time()
     for fid, path in numbered_files:
         logging.debug(' reading %s from %s', fid, path)
         # q/p maps are not needed here, only the derivations
         derivations, _qmap, _pmap = sampled_derivations_from_file(path)
         samples.append(derivations)
     logging.info('[%s]  reading samples took %f seconds', run, time() - start)
     return samples
Beispiel #3
0
def main():
    """Run the 'consensus' decision rule (bleu) over sampled derivations.

    Expects `$workspace/samples` to exist; writes per-segment solutions to
    a decision-rule directory and the 1-best yields to a single file.
    """
    options, config = argparse_and_config()

    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception(
            'If a workspace is set, samples are expected to be found under $workspace/samples'
        )
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))

    output_dir = create_decision_rule_dir(options.workspace, 'consensus',
                                          'bleu')
    one_best_file = '{0}/output/{1}-{2}'.format(options.workspace, 'consensus',
                                                'bleu')
    logging.info("Writing '%s' solutions to %s", 'consensus', output_dir)
    logging.info("Writing 1-best '%s' yields to %s", 'consensus',
                 one_best_file)

    # TODO: generalise this
    headers = {
        'derivation': 'd',
        'vector': 'v',
        'count': 'n',
        'log_ur': 'log_ur',
        'importance': 'importance'
    }

    # read jobs from workspace
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))

    # run decision rules in parallel and save them to files;
    # close/join the pool so worker processes are reclaimed
    pool = Pool(options.jobs)
    try:
        results = pool.map(
            partial(decide_and_save,
                    headers=headers,
                    options=options,
                    output_dir=output_dir), jobs)
    finally:
        pool.close()
        pool.join()
    # save the 1-best solution for each segment; only the yield is needed
    with smart_wopen(one_best_file) as fout:
        for y, _loss, _p, _q in results:
            fout.write('{0}\n'.format(y))
Beispiel #4
0
    def risk(self):
        """Assess expected loss (risk) for every devset segment in parallel.

        Reads one sample file per segment, ensures the estimate/loss/risk
        output folders exist, runs `wrapped_risk` over all segments, and
        gathers the results.

        Returns:
            (risks, jacs, kls): numpy arrays with one entry per segment —
            risks, their Jacobians, and KL estimates.

        Raises:
            Exception: if the samples directory for this iteration is missing.
        """
        t0 = time()
        # read list of input files
        samples_dir = self.path_to_samples()
        if not os.path.isdir(samples_dir):
            raise Exception('[%d] could not find samples' % self.iteration)
        input_files = list_numbered_files(samples_dir)

        # pair each devset segment with its sample file
        jobs = [(self.devset[fid], input_file)
                for fid, input_file in input_files]

        # make sure every output folder exists
        for folder in (self.path_to_estimates(), self.path_to_loss(),
                       self.path_to_risk()):
            if not os.path.exists(folder):
                os.makedirs(folder)

        # run jobs in parallel; close/join the pool so worker processes
        # are reclaimed even if a job raises
        pool = Pool(self.args.jobs)
        try:
            results = pool.map(
                partial(wrapped_risk,
                        iteration=self.iteration,
                        q_wmap=self.wmap.proxy,
                        p_wmap=self.wmap.target,
                        metric=self.args.metric,
                        sample_headers=Driver.SAMPLING_HEADERS,
                        save_to=(self.path_to_estimates(), self.path_to_loss(),
                                 self.path_to_risk())), jobs)
        finally:
            pool.close()
            pool.join()
        # gather risks, jacobians and KL estimates into arrays
        risks = np.array([result.R for result in results])
        jacs = np.array([result.dR for result in results])
        dt = time() - t0
        logging.info('[%d] assessing risk took %f seconds', self.iteration, dt)
        kls = np.array([result.KL for result in results])
        return risks, jacs, kls
Beispiel #5
0
def main():
    """Run the selected decision rules (MAP/MBR/consensus) over samples.

    Loads and configures mteval metrics, checks `$workspace/samples`,
    creates per-rule output folders, decides every segment in parallel,
    and writes the 1-best yield of each rule to its own file.
    """
    options, config = argparse_and_config()

    # loads mteval modules
    if config.has_section('chisel:metrics'):
        metrics_map = dict(config.items('chisel:metrics'))
    else:
        metrics_map = {'bleu': 'chisel.mteval.bleu'}
    mteval.load(metrics_map, frozenset([options.metric]))

    if not mteval.sanity_check(options.metric):
        raise Exception(
            "Perhaps you forgot to include the metric '%s' in the configuration file?"
            % options.metric)

    # configure mteval metrics
    if config.has_section('chisel:metrics:config'):
        metrics_config = dict(config.items('chisel:metrics:config'))
    else:
        metrics_config = {}
    logging.debug('chisel:metrics:config: %s', metrics_config)
    mteval.configure(metrics_config)

    # gather decision rules to be run
    decision_rules = []
    if options.map:
        decision_rules.append('MAP')
    if options.mbr:
        decision_rules.append('MBR')
    if options.consensus:
        decision_rules.append('consensus')

    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception(
            'If a workspace is set, samples are expected to be found under $workspace/samples'
        )
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        # MAP does not depend on the metric; the other rules do
        if rule == 'MAP':
            output_dirs[rule] = create_decision_rule_dir(
                options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(
                options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(
                options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(
                options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule,
                     one_best_files[rule])

    # TODO: generalise this
    headers = {
        'derivation': 'd',
        'vector': 'v',
        'count': 'n',
        'log_ur': 'log_ur',
        'importance': 'importance'
    }

    # read jobs from workspace
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))

    # run decision rules in parallel and save them to files;
    # close/join the pool so worker processes are reclaimed
    pool = Pool(options.jobs)
    try:
        results = pool.map(
            partial(
                decide_and_save,
                headers=headers,
                options=options,
                output_dirs=output_dirs),
            jobs)
    finally:
        pool.close()
        pool.join()
    # save the 1-best solution for each decision rule in a separate file
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                fout.write('{0}\n'.format(best.solution.Dy.projection))
Beispiel #6
0
def main():
    """Run the selected decision rules (MAP/MBR/consensus) over samples.

    Loads and configures mteval metrics, checks `$workspace/samples`,
    creates per-rule output folders, decides every segment in parallel,
    and writes the 1-best yield of each rule to its own file.
    """
    options, config = argparse_and_config()

    # loads mteval modules
    if config.has_section('chisel:metrics'):
        metrics_map = dict(config.items('chisel:metrics'))
    else:
        metrics_map = {'bleu': 'chisel.mteval.bleu'}
    mteval.load(metrics_map, frozenset([options.metric]))

    if not mteval.sanity_check(options.metric):
        raise Exception("Perhaps you forgot to include the metric '%s' in the configuration file?" % options.metric)

    # configure mteval metrics
    if config.has_section('chisel:metrics:config'):
        metrics_config = dict(config.items('chisel:metrics:config'))
    else:
        metrics_config = {}
    logging.debug('chisel:metrics:config: %s', metrics_config)
    mteval.configure(metrics_config)

    # gather decision rules to be run
    decision_rules = []
    if options.map:
        decision_rules.append('MAP')
    if options.mbr:
        decision_rules.append('MBR')
    if options.consensus:
        decision_rules.append('consensus')

    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception('If a workspace is set, samples are expected to be found under $workspace/samples')
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        # MAP does not depend on the metric; the other rules do
        if rule == 'MAP':
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule, one_best_files[rule])

    # TODO: generalise this
    headers = {'derivation': 'd', 'vector': 'v', 'count': 'n', 'log_ur': 'log_ur', 'importance': 'importance'}

    # read jobs from workspace
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))

    # run decision rules in parallel and save them to files;
    # close/join the pool so worker processes are reclaimed
    pool = Pool(options.jobs)
    try:
        results = pool.map(partial(decide_and_save,
                                   headers=headers,
                                   options=options,
                                   output_dirs=output_dirs),
                           jobs)
    finally:
        pool.close()
        pool.join()
    # save the 1-best solution for each decision rule in a separate file
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                fout.write('{0}\n'.format(best.solution.Dy.projection))