def preprocess_human_annotations_p1(problem_id, source_pickle, annotation_pickle):
    r"""Similar to <preprocess_source_pickle> but for programs where humans
    annotated each program.

    @param problem_id: integer 1|2|3|4|5|6|7|8
    @param source_pickle: cPickle file containing unique ASTs by index.
    @param annotation_pickle: cPickle file containing annotations by index.
    """
    label_dim, _, label_to_ix, _, _ = get_label_params(problem_id)

    with open(annotation_pickle, 'rb') as fp:
        annotations = cPickle.load(fp)
        n_annotations = len(annotations)

    with open(source_pickle, 'rb') as fp:
        ast_programs = cPickle.load(fp)

    programs = []
    program2label = {}

    for i in tqdm(range(n_annotations)):
        annotation = annotations[i]
        program_id = int(annotation['answerId'])
        targets = json.loads(annotation['gradeData'])
        targets = convert_V2_to_V3(targets)
        targets = labels_to_numpy(targets, label_dim, label_to_ix).tolist()
        ast = ast_programs[program_id]
        removeColors(ast)
        ast = flatten_ast(ast)
        program = ' '.join(ast)
        programs.append(program)
        program2label[program] = targets

    # important: make sure there are no duplicate programs; if two
    # annotations map to the same program, the last label seen wins.
    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
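# Usage sketch: the paths below are hypothetical and only illustrate how the
# helper is called; the actual on-disk layout may differ.
def _example_preprocess_p1():
    programs, labels = preprocess_human_annotations_p1(
        1, 'data/p1/sources.pkl', 'data/p1/annotations.pkl')
    with open('data/p1/train.pickle', 'wb') as fp:
        cPickle.dump({'programs': programs, 'labels': labels}, fp)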
def preprocess_human_annotations_p8(problem_id, source_pickle, annotation_csv):
    r"""The annotations for p8 are stored in CSV form so we need slightly
    different logic to handle it.

    @param problem_id: integer 1|2|3|4|5|6|7|8
    @param source_pickle: cPickle file containing unique ASTs by index.
    @param annotation_csv: CSV file containing annotations per cell.
    """
    label_dim, ix_to_label, label_to_ix, _, _ = get_label_params(problem_id)

    annotation_df = pd.read_csv(annotation_csv)
    annotation_df = annotation_df.fillna(0)
    n_annotations = len(annotation_df)

    with open(source_pickle, 'rb') as fp:
        ast_programs = cPickle.load(fp)

    programs = []
    program2label = {}

    for i in tqdm(range(n_annotations)):
        annotation_frame = annotation_df.iloc[i]
        program_id = int(annotation_frame['ID'])
        targets = [
            int(annotation_frame[ix_to_label[ix]])
            for ix in range(label_dim)
        ]
        ast = ast_programs[program_id]
        removeColors(ast)
        ast = flatten_ast(ast)
        program = ' '.join(ast)
        programs.append(program)
        program2label[program] = targets

    # important: make sure there are no duplicate programs here
    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
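# Expected CSV layout (inferred from the loop above): one row per program, an
# integer 'ID' column indexing into <source_pickle>, and one numeric column
# per label name in <ix_to_label>; blank cells are treated as 0. The label
# names below are hypothetical:
#
#   ID,standsAtGoal,movesForward
#   0,1,0
#   1,0,1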
def __init__(self, dataset, problem_id, split, vocab=None, max_seq_len=50, min_occ=3):
    super(CodeOrgDataset, self).__init__()
    self.problem_id = problem_id
    self.split = split
    self.vocab = vocab
    self.max_seq_len = max_seq_len
    self.min_occ = min_occ
    self.label_dim, _, _, _, _ = get_label_params(problem_id)
    self.data_dir = get_codeorg_data_root(problem_id, dataset)
    self.raw_dir = get_codeorg_data_root(problem_id, 'raw')

    with open(os.path.join(self.data_dir, '%s.pickle' % self.split), 'rb') as fp:
        raw_data = cPickle.load(fp)

    assert 'programs' in raw_data
    self.raw_programs = raw_data['programs']
    if 'labels' in raw_data:
        self.raw_labels = raw_data['labels']
    else:
        self.raw_labels = None

    count_map = self.build_count_map()
    rank_map = build_rank_map_from_count_map(count_map)
    self.zipf_map = build_zipf_map(count_map, rank_map)

    if self.vocab is None:
        self.vocab = self.create_vocab(self.raw_programs)

    self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']
    self.vocab_size = len(self.w2i)
    self.sequences, self.lengths = self.process_programs(self.raw_programs)

    if self.raw_labels is not None:
        self.labels = self.process_labels(self.raw_labels)
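# Minimal usage sketch, assuming CodeOrgDataset subclasses
# torch.utils.data.Dataset (suggested by the super() call above) and that the
# train split builds the vocabulary shared by the other splits.
def _example_dataset():
    train = CodeOrgDataset('annotated', 1, 'train', vocab=None,
                           max_seq_len=50, min_occ=3)
    val = CodeOrgDataset('annotated', 1, 'val', vocab=train.vocab,
                         max_seq_len=50, min_occ=3)
    return torch.utils.data.DataLoader(train, batch_size=64, shuffle=True)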
def preprocess_synthetic_samples(problem_id, sample_pickle):
    r"""Similar to <preprocess_source_pickle> but cleans up synthetic samples
    from a probabilistic model.

    @param problem_id: integer 1|2|3|4|5|6|7|8
    @param sample_pickle: cPickle file containing raw string programs and a
                          list of labels.
    """
    label_dim, _, label_to_ix, _, _ = get_label_params(problem_id)

    with open(sample_pickle, 'rb') as fp:
        data = cPickle.load(fp)

    programs = data['programs']
    labels = data['labels']
    n_rows = len(programs)

    program2label = {}

    pbar = tqdm(total=n_rows)
    for program, label in zip(programs, labels):
        # labels are stored as a single comma-separated string; newline
        # separation would print as several lines and lose structure.
        label_lst = label.split(',')
        label = labels_to_numpy(label_lst, label_dim, label_to_ix).tolist()
        program2label[program] = label
        pbar.update()
    pbar.close()

    # important: make sure there are no duplicate programs here
    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
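# Usage sketch with a hypothetical path; the sample pickle is expected to hold
# {'programs': [...], 'labels': [...]} where each label entry is one
# comma-separated string of label names.
def _example_preprocess_synthetic():
    programs, labels = preprocess_synthetic_samples(1, 'data/p1/samples.pkl')
    assert len(programs) == len(labels)
    return programs, labels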
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', type=str, help='annotated|synthetic')
    parser.add_argument('problem_id', type=int, help='1|2|3|4|5|6|7|8')
    parser.add_argument('out_dir', type=str, help='where to save outputs')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='enables CUDA training [default: False]')
    args = parser.parse_args()
    args.cuda = args.cuda and torch.cuda.is_available()
    merge_args_with_dict(args, default_hyperparams)
    device = torch.device('cuda' if args.cuda else 'cpu')
    args.max_seq_len = get_max_seq_len(args.problem_id)
    label_dim, _, _, _, _ = get_label_params(args.problem_id)

    # reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    train_dataset = load_dataset(
        args.dataset, args.problem_id, 'train', vocab=None,
        max_seq_len=args.max_seq_len, min_occ=args.min_occ)
    val_dataset = load_dataset(
        args.dataset, args.problem_id, 'val', vocab=train_dataset.vocab,
        max_seq_len=args.max_seq_len, min_occ=args.min_occ)
    test_dataset = load_dataset(
        args.dataset, args.problem_id, 'test', vocab=train_dataset.vocab,
        max_seq_len=args.max_seq_len, min_occ=args.min_occ)
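    # Sketch of the step that typically follows (an assumption, not part of
    # the original excerpt): batch the splits with DataLoaders. batch_size is
    # assumed to be supplied by default_hyperparams via merge_args_with_dict.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False)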
if not os.path.isdir(args.out_dir):
    os.makedirs(args.out_dir)

checkpoint = torch.load(args.checkpoint_path)
train_args = checkpoint['cmd_line_args']
state_dict = checkpoint['state_dict']
vocab = checkpoint['vocab']

w2i, i2w = vocab['w2i'], vocab['i2w']
sos_idx = vocab['w2i'][SOS_TOKEN]
eos_idx = vocab['w2i'][EOS_TOKEN]
pad_idx = vocab['w2i'][PAD_TOKEN]
unk_idx = vocab['w2i'][UNK_TOKEN]

label_dim, ix_to_label, label_to_ix, _, _ = get_label_params(
    train_args.problem_id)

model = ProgramMVAE(
    train_args.z_dim, label_dim, len(vocab['w2i']),
    sos_idx, eos_idx, pad_idx, unk_idx,
    embedding_dim=train_args.embedding_dim,
    hidden_dim=train_args.hidden_dim,
    word_dropout=train_args.word_dropout,
    num_layers=train_args.num_layers)
model = model.to(device)
model.load_state_dict(state_dict)
model.eval()
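# Sketch (assumption): turning a raw program string into model-ready indices
# with the loaded vocabulary, mirroring the SOS/EOS/PAD/UNK conventions above.
# The repo's own preprocessing helper may differ.
def _encode_program(program, w2i, max_seq_len):
    tokens = [SOS_TOKEN] + program.split()[:max_seq_len - 2] + [EOS_TOKEN]
    ids = [w2i.get(tok, w2i[UNK_TOKEN]) for tok in tokens]
    ids += [w2i[PAD_TOKEN]] * (max_seq_len - len(ids))
    return torch.tensor([ids], dtype=torch.long)

# e.g. seq = _encode_program(prog, w2i, get_max_seq_len(train_args.problem_id)).to(device)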