Code example #1
def run_predict(dataset,
                inference,
                do_execute,
                inference_output_path,
                evaluate_on_all=False):
    """Runs inference of given model on eval set, and executes resulting code.

    Args:
        tag: str, tag of the run to save report.
        dataset: Dataset, iterable of CodeExample to evaluate on.
        inference: func, produces code for given CodeExamples.
        do_execute: func, runs given code with given arguments.
        show_info: Show specific example additional information.
    """
    assert inference_output_path is not None, "must provide path"
    assert not os.path.exists(
        inference_output_path), "must be a path that doesn't exist"
    assert os.path.isdir(
        os.path.dirname(inference_output_path)), "parent folder must exist"
    predictions = []
    success = total = 0
    pdataset = tqdm.tqdm(dataset)
    for batch in pdataset:
        results = inference(batch)
        for res, example in zip(results, batch.orig_examples):
            tests = []
            if evaluate_on_all:
                tests += list(example.input_tests)
            tests += list(example.tests)
            stats = executor.evaluate_code(res.code_sequence,
                                           example.schema.args, tests,
                                           do_execute)
            prediction = dict(
                output=res.info['candidates'][0],
                beams=res.info['candidates'],
                beams_correct=[
                    executor.evaluate_code(hypothesis, example.schema.args,
                                           tests, do_execute)
                    for hypothesis in res.info['candidates']
                ],
                is_correct=stats['correct'] == stats['total'],
                individual=stats['individual'],
                guid=example.guid,
            )
            if evaluate_on_all:
                prediction['passes_given_tests'] = all(
                    stats['individual'][:len(example.input_tests)])
            predictions.append(prediction)
            success += stats['correct'] == stats['total']
            total += 1
            pdataset.set_description("Accuracy: {:.2f}%".format(success /
                                                                total * 100))
    with open(inference_output_path, "w") as f:
        json.dump(predictions, f)
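
The file written above is a plain JSON list of the prediction dicts built in run_predict. As a usage sketch (the field names come from that dict; summarize_predictions itself is a hypothetical helper, not part of the project):

import json

def summarize_predictions(inference_output_path):
    # Load the list of prediction dicts written by run_predict.
    with open(inference_output_path) as f:
        predictions = json.load(f)
    accuracy = sum(p["is_correct"] for p in predictions) / len(predictions)
    print("Accuracy on held-out tests: {:.2f}%".format(accuracy * 100))
    # 'passes_given_tests' is only written when evaluate_on_all=True.
    given = [p["passes_given_tests"] for p in predictions if "passes_given_tests" in p]
    if given:
        print("Pass rate on given tests: {:.2f}%".format(sum(given) / len(given) * 100))
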
Code example #2
 def _try_sequences(self, vocab, sequences, batch, beam_size):
     # Decode each example's beam candidates back to tokens and keep the first
     # candidate (up to max_eval_trials) that passes all of its input tests.
     result = [[] for _ in range(len(batch))]
     counters = [0 for _ in range(len(batch))]
     candidates = [[] for _ in range(len(batch))]
     max_eval_trials = self.args.max_eval_trials or beam_size
     for batch_id, outputs in enumerate(sequences):
         example = batch[batch_id]
         #print("===", example.code_tree)
         candidates[batch_id] = [[vocab.itos(idx) for idx in ids]
                                 for ids in outputs]
         for code in candidates[batch_id][:max_eval_trials]:
             counters[batch_id] += 1
             stats = executor.evaluate_code(code, example.schema.args,
                                            example.input_tests,
                                            self.executor.execute)
             ok = (stats['correct'] == stats['total'])
             #print(code, stats)
             if ok:
                 result[batch_id] = code
                 break
     return [
         InferenceResult(code_sequence=seq,
                         info={
                             'trees_checked': c,
                             'candidates': cand
                         })
         for seq, c, cand in zip(result, counters, candidates)
     ]
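
These snippets only rely on executor.evaluate_code returning a dict with 'correct', 'total', and 'individual' keys. The mock below illustrates that assumed contract; it is inferred from the call sites on this page, not taken from the library, and the do_execute signature used here is a placeholder:

def mock_evaluate_code(code, arg_schema, tests, do_execute):
    # Assumed contract: run every test and count how many outputs match.
    individual = []
    for test in tests:
        try:
            out = do_execute(code, arg_schema, test['input'])  # placeholder signature
            individual.append(out == test['output'])
        except Exception:
            individual.append(False)
    return {'total': len(tests), 'correct': sum(individual), 'individual': individual}
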
Code example #3
File: karel_model.py  Project: sunblaze-ucb/SED
 def _try_sequences(self, vocab, sequences, input_grids, output_grids,
                    beam_size):
     # Like _try_sequences in the previous example, but the tests are rebuilt
     # from the Karel I/O grids (flat indices of the set cells) rather than
     # read from the example.
     result = [[] for _ in range(len(sequences))]
     counters = [0 for _ in range(len(sequences))]
     candidates = [[] for _ in range(len(sequences))]
     max_eval_trials = self.args.max_eval_trials or beam_size
     for batch_id, outputs in enumerate(sequences):
         input_tests = [{
             'input': np.where(inp.numpy().ravel())[0].tolist(),
             'output': np.where(out.numpy().ravel())[0].tolist(),
         } for inp, out in zip(
             torch.split(input_grids[batch_id].data.cpu(), 1),
             torch.split(output_grids[batch_id].data.cpu(), 1),
         )]
         candidates[batch_id] = [[vocab.itos(idx) for idx in ids]
                                 for ids in outputs]
         for code in candidates[batch_id][:max_eval_trials]:
             counters[batch_id] += 1
             stats = executor.evaluate_code(code, None, input_tests,
                                            self.executor.execute)
             ok = (stats['correct'] == stats['total'])
             if ok:
                 result[batch_id] = code
                 break
     return [
         InferenceResult(code_sequence=seq,
                         info={
                             'trees_checked': c,
                             'candidates': cand
                         })
         for seq, c, cand in zip(result, counters, candidates)
     ]
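
The input_tests built above encode each Karel I/O grid as the flat indices of its non-zero cells. A small standalone demo of that conversion, with made-up grid shapes and values:

import numpy as np
import torch

grid = torch.zeros(1, 4, 4)   # toy stand-in for one I/O grid
grid[0, 1, 2] = 1.0
grid[0, 3, 0] = 1.0
# Flatten the grid and keep the indices of the cells that are set.
indices = np.where(grid.numpy().ravel())[0].tolist()
print(indices)  # [6, 12]
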
Code example #4
File: karel_model.py  Project: sunblaze-ucb/SED
 def calculate_policy_gradient_loss(self, input_grids, io_embed,
                                    orig_examples, ref_code,
                                    ref_code_memory, ref_trace_memory):
     init_state = self.model.decoder.init_state(ref_code_memory,
                                                ref_trace_memory,
                                                io_embed.shape[0],
                                                io_embed.shape[1])
     memory = self.model.decoder.prepare_memory(io_embed, ref_code_memory,
                                                ref_trace_memory, ref_code)
     sequences = beam_search.beam_search(
         len(input_grids),
         init_state,
         memory,
         self.model.decode_token,
         self.args.max_beam_trees,
         cuda=self.args.cuda,
         max_decoder_length=self.args.max_decoder_length,
         return_beam_search_result=True,
         volatile=False,
         differentiable=True,
         use_length_penalty=self.args.use_length_penalty,
         factor=self.args.length_penalty_factor)
     output_code = self.model.decoder.postprocess_output(
         [[x.sequence for x in y] for y in sequences], memory)
     all_logits = []
     rewards = []
     for logit_beam, code_beam, example in zip(sequences, output_code,
                                               orig_examples):
         for i, (logits, code) in enumerate(zip(logit_beam, code_beam)):
             code = list(map(self.vocab.itos, code))
             all_logits.append(
                 torch.sum(
                     torch.cat([x.view(1)
                                for x in logits.log_probs_torch])))
             run_cases = lambda tests: executor.evaluate_code(
                 code, example.schema.args, tests, self.executor.execute)
             input_tests = run_cases(example.input_tests)
             reward = input_tests['correct'] / input_tests['total']
             if self.args.use_held_out_test_for_rl:
                 held_out_test = run_cases(example.tests)
                  # Worth as much as all the other ones combined.
                  reward += held_out_test['correct']
             rewards.append(reward)
     all_logits = torch.cat([x.view(1) for x in all_logits])
     print(np.mean(rewards))
      rewards = torch.tensor(rewards)
      if not self.args.no_baseline:
          rewards = rewards - rewards.mean()
     if all_logits.is_cuda:
         rewards = rewards.cuda()
     return -(rewards * all_logits).mean()
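
The value returned above is a REINFORCE-style objective: each beam's reward (fraction of tests passed) is optionally centered by the mean reward as a baseline and used to weight that beam's summed token log-probabilities, and the negated mean is minimized. A tiny standalone sketch of the same formula with placeholder numbers:

import torch

log_probs = torch.tensor([-1.2, -0.7, -2.3], requires_grad=True)  # summed log-probs per beam
rewards = torch.tensor([1.0, 0.5, 0.0])                           # fraction of tests passed
baseline = rewards.mean()            # omitted when no_baseline is set
loss = -((rewards - baseline) * log_probs).mean()
loss.backward()
print(loss.item(), log_probs.grad)
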
Code example #5
def run_eval(tag,
             dataset,
             inference,
             do_execute,
             show_info=True,
             report_path=None,
             limit=None,
             evaluate_on_all=False):
    """Runs inference of given model on eval set, and executes resulting code.

    Args:
        tag: str, tag of the run to save report.
        dataset: Dataset, iterable of CodeExample to evaluate on.
        inference: func, produces code for given CodeExamples.
        do_execute: func, runs given code with given arguments.
        show_info: Show specific example additional information.
    """
    report = EvalReport(tag=tag, show_info=show_info, report_path=report_path)
    done = False
    try:
        for batch in limited(dataset, limit):
            start = time.time()
            results = inference(batch)
            for res, example in zip(results, batch.orig_examples):
                tests = []
                if evaluate_on_all:
                    tests += list(example.input_tests)
                tests += list(example.tests)
                stats = executor.evaluate_code(
                    res.code_tree if res.code_tree else res.code_sequence,
                    example.schema.args, tests, do_execute)
                report.add_example(example, res, stats)
            print("[Eval] Elapsed time for %d examples: %f" %
                  (len(batch.orig_examples), time.time() - start))
            report.display()
        done = True
    finally:
        print("Stopped.")
        report.save(done)
        report.display()
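
run_eval depends on a limited() helper that is not shown on this page. A minimal sketch, assuming it simply caps how many batches are drawn from the dataset (and passes everything through when limit is None):

import itertools

def limited(iterable, limit):
    # Assumed behavior: yield at most `limit` batches, or all of them if limit is None.
    if limit is None:
        return iterable
    return itertools.islice(iterable, limit)
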
Code example #6
 def test_results(self, code, example):
     # Evaluate the candidate code on the example's given (input) tests only.
     return evaluate_code(code, example.schema.args, example.input_tests, self.executor.execute)