Example no. 1
	def loadSyntheticData(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.json')
		if not self.evict_cache and os.path.isfile(cache_file):
			data = load_json(cache_file)
			prog_items = data['raw_programs']
			anon_progs = data['anon_programs']
		else:
			standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
			uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
			tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
			with open(standard_path, 'rb') as f:
				standardDict = pickle.load(f)
			with open(uniform_path, 'rb') as f:
				uniformDict = pickle.load(f)
			with open(tempered_path, 'rb') as f:
				temperedDict = pickle.load(f)

			all_dicts = [standardDict, uniformDict, temperedDict]

			# the set union below is not stable across runs, so if the
			# forest is cached, this result must be cached too
			prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
			anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
			data = dict(raw_programs=prog_items, anon_programs=anon_progs)

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_json(data, cache_file)

			# if we don't load the cache here, the forest must be regenerated too
			self.evict_cache = True

		return prog_items, anon_progs
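
All of these examples lean on the same save_json/load_json helper pair (imported from src.utils.io_utils in a later example). Their implementation is not shown in these excerpts; what follows is a minimal sketch of the assumed behavior, with the caveat that whatever is passed in must be JSON-serializable (tuples become lists, dict keys become strings):

import json

def save_json(obj, path):
    # assumed helper: dump a JSON-serializable object to disk
    with open(path, 'w') as f:
        json.dump(obj, f)

def load_json(path):
    # assumed helper: inverse of save_json
    with open(path, 'r') as f:
        return json.load(f)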
Example no. 2
    def finalise(self):
        self.logger.info("Saving final versions of model...")
        self.save_checkpoint(filename='final.pth.tar')

        save_json(self.accuracies,
                  os.path.join(self.config.out_dir, 'accuracies.json'))
        self.summary_writer.export_scalars_to_json(
            os.path.join(self.config.summary_dir, "all_scalars.json"))
        self.summary_writer.close()
Example no. 3
    def backup(self):
        save_json(self.accuracies,
                  os.path.join(self.config.out_dir, 'accuracies.json'))
        self.summary_writer.export_scalars_to_json(
            os.path.join(self.config.summary_dir, "all_scalars.json"))
        self.summary_writer.close()

        self.logger.info("Backing up current version of model...")
        self.save_checkpoint(filename='backup.pth.tar')
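
finalise() and backup() duplicate the same metric-export block; a refactor could pull it into one helper. A hypothetical sketch (the _export_metrics name is not from the original code):

    def _export_metrics(self):
        # shared by finalise() and backup(): persist accuracies and
        # export TensorBoard scalars before closing the writer
        save_json(self.accuracies,
                  os.path.join(self.config.out_dir, 'accuracies.json'))
        self.summary_writer.export_scalars_to_json(
            os.path.join(self.config.summary_dir, "all_scalars.json"))
        self.summary_writer.close()

Note that both methods close summary_writer, so a backup() taken mid-training ends scalar logging; if periodic backups are intended, the close() probably belongs only in finalise().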
Example no. 4
    def run(self, results_dir, problem, exp_dir, error_rate=0):
        grammar_dir = paths.grammar_path(problem)
        inf_e = EngineGuided(grammar_dir, exp_dir)

        N = len(inf_e.dataset)

        n_lim = 100

        data, raw_prgs = self.create_data_loader(inf_e.dataset, N)
        data_loader = DataLoader(data, batch_size=1)
        tqdm_batch = tqdm(data_loader, total=N)

        time_data = []
        failed = []

        for i, data_list in enumerate(tqdm_batch):
            # only the program features are used; the last four entries of
            # data_list are ignored here
            program_args = data_list[:-4]
            program = raw_prgs[i]

            if error_rate > 0:
                program_args = self.dropout_words(program_args, error_rate)

            num_steps = self.infer_matches(inf_e,
                                           program,
                                           program_args,
                                           n_lim=n_lim)

            if num_steps < 0:
                failed.append(program)

            time_data.append(num_steps)

        error_rate_str = str(error_rate).replace('.', '_')
        # TODO: make sure failed programs are safe to save
        out_json = dict(failed_progs=failed,
                        time_data=time_data,
                        error_rate=error_rate)
        out_file = os.path.join(results_dir, f'data_err_{error_rate_str}.json')
        save_json(out_json, out_file)

        plt.figure()
        plt.hist(time_data, bins=50, color='purple')
        plt.title(
            f'Num samples to recover validation program (N={N}, max_steps={n_lim}, error_rate={error_rate})'
        )
        plt.xlabel('Number of steps (neg. = not recovered)')
        plt.ylabel('Frequency')
        plt.savefig(
            os.path.join(results_dir,
                         f'validation_recovery_err_{error_rate_str}.png'))
        plt.close()
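
With plt.close() at the end, run() can be invoked repeatedly from one process without histograms piling onto a shared figure. A hypothetical driver sweeping error rates (the Runner name and its construction are assumptions, not code from this repo):

    runner = Runner()  # hypothetical owner of run()
    for err in [0, 0.25, 0.5]:
        runner.run(results_dir, problem, exp_dir, error_rate=err)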
Example no. 5
def make_vocabs(problem, domain='education'):
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(vocab_paths['data_path'], exist_ok=True)

    all_programs, all_anon_programs = [], []

    for sampling_strategy in ['standard', 'uniform', 'tempered']:
        (counts_paths, labels_paths, rv_order_paths,
         tiers_paths, anon_mapping_paths, all_rvs_path) = \
            paths.raw_data_paths(problem, 'train', domain, sampling_strategy)
        n_shards = len(counts_paths)

        for i in range(n_shards):
            programs_i, anon_programs_i, _, _, _, _ = load_raw_rubric_data(
                counts_paths[i], labels_paths[i], rv_order_paths[i],
                tiers_paths[i], anon_mapping_paths[i])

            all_programs.extend(programs_i)
            all_anon_programs.extend(anon_programs_i)

    vocab = build_vocab_rnn(all_programs, character_level=False)
    char_vocab = build_vocab_rnn(all_programs, character_level=True)
    anon_vocab = build_vocab_rnn(all_anon_programs, character_level=False)
    anon_char_vocab = build_vocab_rnn(all_anon_programs, character_level=True)

    io.save_json(vocab, vocab_paths['vocab_path'])
    io.save_json(char_vocab, vocab_paths['char_vocab_path'])
    io.save_json(anon_vocab, vocab_paths['anon_vocab_path'])
    io.save_json(anon_char_vocab, vocab_paths['anon_char_vocab_path'])
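
build_vocab_rnn is not defined in these excerpts; from its call sites (a character_level flag, JSON-serializable output) it presumably maps tokens or characters to integer ids. A minimal sketch of that assumption, not the project's actual implementation:

import itertools

def build_vocab_rnn(programs, character_level=False):
    # assumed behavior: index every distinct token (or character),
    # reserving the usual special symbols first
    units = itertools.chain.from_iterable(
        p if character_level else p.split() for p in programs)
    vocab = {'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}
    for u in units:
        if u not in vocab:
            vocab[u] = len(vocab)
    return vocab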
Example no. 6
    def run(self, n, out_dir, grammar_dir, problem):
        std_data = self._run_base(n, grammar_dir, SAMPLE_STANDARD)
        unif_data = self._run_base(n, grammar_dir, SAMPLE_UNIFORM)

        data = dict(standard=std_data, uniform=unif_data, tempered=dict())

        for r in [0.01, 0.1, 1.]:
            for d in [0.3, 0.6, 0.9]:
                data['tempered'][f'{r}_{d}'] = self._run_base(n,
                                                              grammar_dir,
                                                              SAMPLE_TEMPERED,
                                                              reward=r,
                                                              discount=d)

        save_json(data, os.path.join(out_dir, f'{problem}.json'))
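
The tempered sweep stores nine entries keyed by f'{r}_{d}' (e.g. '0.01_0.3'). Reading one back with the same load_json helper might look like:

    data = load_json(os.path.join(out_dir, f'{problem}.json'))
    tempered_run = data['tempered']['0.1_0.6']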
Example no. 7
    def evaluateAgent(self, agent_name, inf_nn):
        tqdm_batch = tqdm(self.dataloader, total=len(self.student_data))

        distance_data = dict()
        nn_map = dict()

        for i, data_list in enumerate(tqdm_batch):
            # indexing for the non-anonymized, non-char model
            program_args, program, anon_program = (data_list[:2],
                                                   data_list[-2],
                                                   data_list[-1])
            # for the anon xor char model:
            # program_args, program, anon_program = data_list[:4], data_list[-2], data_list[-1]

            # TODO: hack to make it work right now. Investigate why this is happening
            if program_args[1].item() == 0:
                print('SKIPPING STUDENT PROG OF LENGTH 0')
                continue

            # access 0 index to unbatch the batch of size 1
            program = program[0]
            anon_program = anon_program[0]
            clean_prog = self.transformRawProgram(program)

            if agent_name == 'lsh':
                top_k_progs = self.lsh_nn.findNearestNeighbours(anon_program)
            elif agent_name == 'random':
                top_k_progs = self.random_nn.findNearestNeighbours(
                    anon_program)
            else:
                top_k_progs = inf_nn.findNearestNeighbours(
                    program, program_args=program_args)

            if len(top_k_progs) == 0:
                print(f'SKIPPING programs with 0 top_k for agent={agent_name}')
                print(clean_prog)
                continue

            if len(top_k_progs) != self.top_k:
                print(
                    f'[WARNING] Found programs with not enough top_k. Expected {self.top_k}, received [{agent_name}_len={len(top_k_progs)}]'
                )

            best_prog, _ = self.getBestProgram(clean_prog, top_k_progs)
            nn_map[program] = best_prog
            self.computeDistances(distance_data, clean_prog, top_k_progs)

        all_data = dict(distances=distance_data, nns=nn_map)
        save_json(all_data, os.path.join(self.results_dir,
                                         f'{agent_name}.json'))
Example no. 8
    def compareNNs(self):
        tqdm_batch = tqdm(self.dataloader, total=len(self.student_data))

        metadata = dict(top_k=self.top_k)
        distance_data = dict(metadata=metadata)

        for i, data_list in enumerate(tqdm_batch):
            # TODO: this indexing is super hardcoded
            # indexing for the non-anonymized, non-char model
            program_args, program, anon_program = (data_list[:2],
                                                   data_list[-2],
                                                   data_list[-1])
            # for the anon, non-char model:
            # program_args, program, anon_program = data_list[:4], data_list[-2], data_list[-1]

            # TODO: hack to make it work right now. Investigate why this is happening
            if program_args[1].item() == 0:
                print('SKIPPING STUDENT PROG OF LENGTH 0')
                continue

            # access 0 index to unbatch the batch of size 1
            program = program[0]
            anon_program = anon_program[0]
            clean_prog = self.transformRawProgram(program)

            top_k_lsh, top_k_inf = self.getNNs(program, anon_program,
                                               program_args)

            if len(top_k_lsh) == 0:
                print('SKIPPING programs with 0 LSH')
                print(clean_prog)
                continue

            if len(top_k_lsh) != self.top_k or len(top_k_inf) != self.top_k:
                print(
                    f'[WARNING] Found programs with not enough top_k. Expected {self.top_k}, received (lsh={len(top_k_lsh)}, inf={len(top_k_inf)})'
                )

            self.computeDistances(distance_data, clean_prog, top_k_lsh,
                                  top_k_inf)

        save_json(
            distance_data,
            os.path.join(self.results_dir,
                         f'recovery_similarity_k_{self.top_k}.json'))
Example no. 9
def make_rnn_data(problem,
                  split,
                  domain='education',
                  sampling_strategy='standard'):
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths,
     tiers_paths, anon_mapping_paths, all_rvs_path) = \
         paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])

        # assumes equally sized shards (except smaller remaining last one)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)

        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len, character_level=True)

        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab, anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))

        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])

    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, rnn_paths['metadata_path'])
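
featurise_programs_rnn is likewise external to these excerpts; given raw programs, a vocab, and a max length, it presumably returns padded id matrices plus the true sequence lengths. A sketch of that assumption (padding and unknown-token handling are guesses):

import numpy as np

def featurise_programs_rnn(programs, vocab, max_len, character_level=False):
    # assumed behavior: map units through the vocab (unknowns -> <unk>),
    # pad/truncate each row to max_len, and record pre-padding lengths
    feats = np.full((len(programs), max_len), vocab['<pad>'], dtype=np.int64)
    lengths = np.zeros(len(programs), dtype=np.int64)
    for r, p in enumerate(programs):
        units = list(p) if character_level else p.split()
        ids = [vocab.get(u, vocab['<unk>']) for u in units][:max_len]
        feats[r, :len(ids)] = ids
        lengths[r] = len(ids)
    return feats, lengths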
Example no. 10
        offset_idx = render.index(to_find)
        end_idx = offset_idx + len(to_find)
        render = render[:offset_idx] + format_data[key] + render[end_idx:]
        for rv_set, (idx, n) in rv_data[key]:
            ret_idxs.append((rv_set, (idx + offset_idx, n)))

    if len(rvs) > 0:
        ret_idxs.append((rvs, (0, len(render))))
    return nonterminal, render, ret_idxs

def tagged_data(data):
    ret = []
    for d in tqdm(data):
        # d[0] (presumably the raw program) is unused; only the
        # sample in d[1] gets converted
        ret.append(conv_sample(d[1]))
    return ret

if __name__ == "__main__":
    import argparse
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('data_file', type=str, help='which results to process')
    args = arg_parser.parse_args()

    data = load_json(args.data_file)
    conv_data = tagged_data(data)

    save_json(conv_data, 'output.json')
Example no. 11
from src.utils.io_utils import save_json

if __name__ == "__main__":
    import argparse
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('results_dir',
                            type=str,
                            help='where to save results')
    args = arg_parser.parse_args()

    MODEL_DIR = 'experiments/liftoff_final_100k_tempered/2019-04-20--13_57_14'
    GRAMMAR_DIR = 'src/rubricsampling/grammars/liftoff_hacks'

    student_data = StudentPrograms('liftoff', include_anonymized=False)
    dataloader = DataLoader(student_data, batch_size=1, shuffle=False)
    tqdm_batch = tqdm(dataloader, total=len(student_data))

    inf_nn = InferenceNNHighlight(GRAMMAR_DIR, MODEL_DIR)

    results = []
    for i, (seq_tokens, seq_lengths, program) in enumerate(tqdm_batch):
        program_args = (seq_tokens, seq_lengths)
        try:
            program, highlights, decisions = inf_nn.guided_sample(program_args)
        except Exception:
            # skip programs where guided sampling fails
            continue
        results.append((program, highlights))

    save_json(results, os.path.join(args.results_dir,
                                    'student_highlights.json'))
Example no. 12
def make_scene_graph_data(device,
                          problem,
                          split,
                          sampling_strategy='standard',
                          use_resnet=False):
    data_paths = paths.scene_graph_data_paths(problem, split,
                                              sampling_strategy)
    os.makedirs(data_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, images_paths, rv_order_paths,
     tiers_paths, all_rvs_path) = \
         paths.raw_scene_graph_data_paths(problem, split, sampling_strategy)
    n_shards = len(counts_paths)

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    if use_resnet:
        # load huge model :(
        print('loading deep net for feature extraction...')
        net, expected_input_dim = load_classification_model()
        net = net.to(device)
        image_transforms = load_data_transforms(expected_input_dim)

    for i in range(n_shards):
        scene_graphs_i, images_i, labels_i, rv_order_i, tiers_i, _ = load_raw_scene_graph_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            images_paths[i], tiers_paths[i])

        n_items_i = len(scene_graphs_i)

        if use_resnet:
            feat_images_i = featurise_images(images_i, net, device,
                                             image_transforms)
        else:
            feat_images_i = images_i

        # assumes equally sized shards (except smaller remaining last one)
        shard_size = max(shard_size, n_items_i)
        data_len += n_items_i

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        image_mats_i = dict(images=feat_images_i, tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.savemat(image_mats_i, data_paths['feat_images_path'].format(i))
        io.save_np(feat_labels_i, data_paths['feat_labels_path'].format(i))
        io.save_pickle(rv_order_i, data_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, data_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, data_paths['rv_info_path'])

    metadata = dict(data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, data_paths['metadata_path'])
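
featurise_images and load_classification_model are also not shown; under the resnet path each image is presumably transformed and pushed through the network for features. A minimal, assumed sketch (batching and output shape are guesses):

import torch

def featurise_images(images, net, device, image_transforms):
    # assumed behavior: preprocess each image and collect the network's
    # output features with gradients disabled
    feats = []
    net.eval()
    with torch.no_grad():
        for img in images:
            x = image_transforms(img).unsqueeze(0).to(device)
            feats.append(net(x).squeeze(0).cpu())
    return torch.stack(feats).numpy()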
Example no. 13
    def getModelConfig(self):
        return self.inf_e.config

    def guided_sample(self, program_args):
        # sentinel assignments: a value large enough to fail loudly if it
        # is ever read without being overwritten
        initAssignments = 1000000 * torch.ones(1, self.inf_e.model.num_nodes)
        program, highlights, labels, decisions, rvOrder, rvAssignments_pred = \
            self.inf_e.renderProgram(program_args, initAssignments)

        return program, highlights, decisions


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--N', type=int, default=1)
    args = parser.parse_args()

    data = []
    e = EngineHighlight('src/rubricsampling/grammars/liftoff_hacks')
    for i in range(args.N):
        program, highlights = e.renderProgram()
        print(program)
        print(highlights)
        print('----')
        print()
        data.append((program, highlights))

    save_json(data, 'in_grammar_highlights.json')