Example #1
def preprocess_human_annotations_p1(problem_id, source_pickle,
                                    annotation_pickle):
    r"""Similar to <preprocess_source_pickle> but for programs where
    humans annotated each program.

    @param problem_id: integer
                       1|2|3|4|5|6|7|8
    @param source_pickle: cPickle file
                          contains unique AST by index.
    @param annotation_pickle: cPickle file
                              contains annotations by index.
    """
    label_dim, _, label_to_ix, _, _ = get_label_params(problem_id)

    with open(annotation_pickle, 'rb') as fp:
        annotations = cPickle.load(fp)
        n_annotations = len(annotations)

    with open(source_pickle, 'rb') as fp:
        ast_programs = cPickle.load(fp)

    programs = []
    program2label = {}

    for i in tqdm(range(n_annotations)):
        annotation = annotations[i]
        program_id = int(annotation['answerId'])
        targets = json.loads(annotation['gradeData'])
        targets = convert_V2_to_V3(targets)
        targets = labels_to_numpy(targets, label_dim, label_to_ix).tolist()

        ast = ast_programs[program_id]
        removeColors(ast)
        ast = flatten_ast(ast)
        program = ' '.join(ast)

        programs.append(program)
        program2label[program] = targets

    # important: deduplicate programs so each appears exactly once
    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
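
A quick usage sketch (both pickle file names below are hypothetical placeholders; problem_id=1 is just one of the valid values):

# hypothetical file names; substitute the real pickles for your problem
programs, labels = preprocess_human_annotations_p1(
    1, 'unique_asts.pickle', 'human_annotations.pickle')
print('%d unique programs' % len(programs))
print(labels[0])  # one label list per deduplicated program
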
Example #2
def preprocess_human_annotations_p8(problem_id, source_pickle, annotation_csv):
    r"""The annotations for p8 are stored in CSV form so we need slightly 
    different logic to handle it.

    @param problem_id: integer
                       1|2|3|4|5|6|7|8
    @param source_pickle: cPickle file
                          contains unique AST by index.
    @param annotation_csv: CSV file
                           contains annotations per cell.
    """
    label_dim, ix_to_label, label_to_ix, _, _ = get_label_params(problem_id)

    annotation_df = pd.read_csv(annotation_csv)
    annotation_df = annotation_df.fillna(0)
    n_annotations = len(annotation_df)

    with open(source_pickle, 'rb') as fp:
        ast_programs = cPickle.load(fp)

    programs = []
    program2label = {}

    for i in tqdm(range(n_annotations)):
        annotation_frame = annotation_df.iloc[i]
        program_id = int(annotation_frame['ID'])
        targets = [
            int(annotation_frame[ix_to_label[ix]]) for ix in xrange(label_dim)
        ]

        ast = ast_programs[program_id]
        removeColors(ast)
        ast = flatten_ast(ast)
        program = ' '.join(ast)

        programs.append(program)
        program2label[program] = targets

    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
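
The CSV is assumed to contain an 'ID' column plus one column per label name, the names coming from ix_to_label; a hypothetical frame in that layout:

import pandas as pd

# hypothetical label columns; the real names come from ix_to_label
df = pd.DataFrame({
    'ID': [0, 1],
    'movesForward': [1, 0],
    'turnsWrongDirection': [0, 1],
})
df.to_csv('p8_annotations.csv', index=False)
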
Example #3
    def __init__(self,
                 dataset,
                 problem_id,
                 split,
                 vocab=None,
                 max_seq_len=50,
                 min_occ=3):
        super(CodeOrgDataset, self).__init__()

        self.problem_id = problem_id
        self.split = split
        self.vocab = vocab
        self.max_seq_len = max_seq_len
        self.min_occ = min_occ

        self.label_dim, _, _, _, _ = get_label_params(problem_id)
        self.data_dir = get_codeorg_data_root(problem_id, dataset)
        self.raw_dir = get_codeorg_data_root(problem_id, 'raw')

        pickle_path = os.path.join(self.data_dir, '%s.pickle' % self.split)
        with open(pickle_path, 'rb') as fp:
            raw_data = cPickle.load(fp)
            assert 'programs' in raw_data
            self.raw_programs = raw_data['programs']
            if 'labels' in raw_data:
                self.raw_labels = raw_data['labels']
            else:
                self.raw_labels = None

        count_map = self.build_count_map()
        rank_map = build_rank_map_from_count_map(count_map)
        self.zipf_map = build_zipf_map(count_map, rank_map)

        if self.vocab is None:
            self.vocab = self.create_vocab(self.raw_programs)

        self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']
        self.vocab_size = len(self.w2i)

        self.sequences, self.lengths = self.process_programs(self.raw_programs)

        if self.raw_labels is not None:
            self.labels = self.process_labels(self.raw_labels)
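
A sketch of how the vocabulary is meant to be shared across splits, assuming the constructor signature above (the dataset name and arguments mirror the command-line example further down):

# build the vocab on train, then reuse it for the other splits
train = CodeOrgDataset('annotated', 1, 'train', vocab=None,
                       max_seq_len=50, min_occ=3)
val = CodeOrgDataset('annotated', 1, 'val', vocab=train.vocab,
                     max_seq_len=50, min_occ=3)
assert val.vocab_size == train.vocab_size  # token indices agree across splits
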
Example #4
def preprocess_synthetic_samples(problem_id, sample_pickle):
    r"""Similar to <preprocess_source_pickle> but cleans up synthetic
    samples from a probabilistic model.

    @param problem_id: integer
                       1|2|3|4|5|6|7|8
    @param sample_pickle: cPickle file
                          contains raw string program and list of labels.
    """
    label_dim, _, label_to_ix, _, _ = get_label_params(problem_id)

    with open(sample_pickle, 'rb') as fp:
        data = cPickle.load(fp)
        programs = data['programs']
        labels = data['labels']
        n_rows = len(programs)

    program2label = {}

    pbar = tqdm(total=n_rows)
    for program, label in zip(programs, labels):
        # labels are comma-separated rather than newline-separated so that
        # a printed label stays on one line and keeps its structure
        label_lst = label.split(',')
        label = labels_to_numpy(label_lst, label_dim, label_to_ix).tolist()

        program2label[program] = label

        pbar.update()
    pbar.close()

    # important: deduplicate programs so each appears exactly once
    programs = list(set(programs))
    labels = [program2label[prog] for prog in programs]

    return programs, labels
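
labels_to_numpy is not shown in these examples; judging from how it is called, it presumably builds a multi-hot vector of size label_dim. A hypothetical stand-in:

import numpy as np

def labels_to_numpy_sketch(label_lst, label_dim, label_to_ix):
    # hypothetical reimplementation: set a 1 at the index of every
    # label present in the annotation
    vec = np.zeros(label_dim)
    for label in label_lst:
        vec[label_to_ix[label]] = 1
    return vec
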
Example #5
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', type=str, help='annotated|synthetic')
    parser.add_argument('problem_id', type=int, help='1|2|3|4|5|6|7|8')
    parser.add_argument('out_dir', type=str, help='where to save outputs')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='enables CUDA training [default: False]')
    args = parser.parse_args()
    args.cuda = args.cuda and torch.cuda.is_available()
    merge_args_with_dict(args, default_hyperparams)
    device = torch.device('cuda' if args.cuda else 'cpu')
    args.max_seq_len = get_max_seq_len(args.problem_id)

    label_dim, _, _, _, _ = get_label_params(args.problem_id)

    # reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    train_dataset = load_dataset(args.dataset, args.problem_id, 'train',
                                 vocab=None, max_seq_len=args.max_seq_len,
                                 min_occ=args.min_occ)
    val_dataset = load_dataset(args.dataset, args.problem_id, 'val',
                               vocab=train_dataset.vocab,
                               max_seq_len=args.max_seq_len,
                               min_occ=args.min_occ)
    test_dataset = load_dataset(args.dataset, args.problem_id, 'test',
                                vocab=train_dataset.vocab,
                                max_seq_len=args.max_seq_len,
                                min_occ=args.min_occ)
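
merge_args_with_dict is assumed to behave roughly as sketched below, filling in default hyperparameters (seed, min_occ, ...) that were not passed on the command line; this is a guess at its behavior, not the project's actual helper:

def merge_args_with_dict_sketch(args, dic):
    # hypothetical behavior: copy each default onto the argparse
    # namespace unless the attribute is already set
    for key, value in dic.items():
        if not hasattr(args, key):
            setattr(args, key, value)
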
Example #6
    if not os.path.isdir(args.out_dir):
        os.makedirs(args.out_dir)

    checkpoint = torch.load(args.checkpoint_path)
    train_args = checkpoint['cmd_line_args']
    state_dict = checkpoint['state_dict']
    vocab = checkpoint['vocab']
    w2i, i2w = vocab['w2i'], vocab['i2w']

    sos_idx = w2i[SOS_TOKEN]
    eos_idx = w2i[EOS_TOKEN]
    pad_idx = w2i[PAD_TOKEN]
    unk_idx = w2i[UNK_TOKEN]

    label_dim, ix_to_label, label_to_ix, _, _ = get_label_params(
        train_args.problem_id)

    model = ProgramMVAE(train_args.z_dim,
                        label_dim,
                        len(w2i),
                        sos_idx,
                        eos_idx,
                        pad_idx,
                        unk_idx,
                        embedding_dim=train_args.embedding_dim,
                        hidden_dim=train_args.hidden_dim,
                        word_dropout=train_args.word_dropout,
                        num_layers=train_args.num_layers)
    model = model.to(device)
    model.load_state_dict(state_dict)
    model.eval()
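
For this loading code to work, the training script must have saved a checkpoint with matching keys; a sketch of the implied torch.save call (args and train_dataset are assumed to exist in the training script):

torch.save({
    'state_dict': model.state_dict(),
    'cmd_line_args': args,         # needs z_dim, hidden_dim, etc.
    'vocab': train_dataset.vocab,  # {'w2i': ..., 'i2w': ...}
}, checkpoint_path)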