def make_vocabs(problem, domain='education'):
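    """Build word- and character-level vocabularies (and their anonymised
    variants) from every training shard across all sampling strategies, and
    save them as JSON under the paths given by `paths.vocab_paths`."""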
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(vocab_paths['data_path'], exist_ok=True)

    all_programs, all_anon_programs = [], []

    for sampling_strategy in ['standard', 'uniform', 'tempered']:
        (counts_paths, labels_paths, rv_order_paths,
         tiers_paths, anon_mapping_paths, all_rvs_path) = \
            paths.raw_data_paths(problem, 'train', domain, sampling_strategy)
        n_shards = len(counts_paths)

        for i in range(n_shards):
            programs_i, anon_programs_i, _, _, _, _ = load_raw_rubric_data(
                counts_paths[i], labels_paths[i], rv_order_paths[i],
                tiers_paths[i], anon_mapping_paths[i])

            all_programs.extend(programs_i)
            all_anon_programs.extend(anon_programs_i)

    vocab = build_vocab_rnn(all_programs, character_level=False)
    char_vocab = build_vocab_rnn(all_programs, character_level=True)
    anon_vocab = build_vocab_rnn(all_anon_programs, character_level=False)
    anon_char_vocab = build_vocab_rnn(all_anon_programs, character_level=True)

    io.save_json(vocab, vocab_paths['vocab_path'])
    io.save_json(char_vocab, vocab_paths['char_vocab_path'])
    io.save_json(anon_vocab, vocab_paths['anon_vocab_path'])
    io.save_json(anon_char_vocab, vocab_paths['anon_char_vocab_path'])
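
# NOTE: `load_vocabs` is used by the functions below but is not defined in this
# snippet. A minimal sketch of what it presumably does, assuming `io.load_json`
# is the inverse of the `io.save_json` calls in `make_vocabs` above:
def load_vocabs(vocab_paths):
    vocab = io.load_json(vocab_paths['vocab_path'])
    char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
    return vocab, char_vocab, anon_vocab, anon_char_vocab
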
def process_student_data(problem):
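    """Featurise real student submissions with the rubric-sample vocabularies,
    using the max lengths recorded in the training metadata, and save the
    resulting program matrices and raw-program pickles."""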
    # TODO: fix this, it's outdated

    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    max_len = metadata['max_len']
    char_max_len = metadata['char_max_len']
    anon_max_len = metadata['anon_max_len']
    anon_char_max_len = metadata['anon_char_max_len']

    programs, anon_programs = raw_student_data(problem)

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
    def _load_metadata(self):
        '''
            Loads all housekeeping data
        '''
        rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')
        vocab_paths = paths.vocab_paths(self.problem, 'education')

        for _, path in rnn_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))

        metadata = io.load_json(rnn_paths['metadata_path'])
        self.max_len = metadata['max_len'] 
        self.char_max_len = metadata['char_max_len'] 
        self.anon_max_len = metadata['anon_max_len'] 
        self.anon_char_max_len = metadata['anon_char_max_len']

        self.vocab = io.load_json(vocab_paths['vocab_path'])
        self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']

        self.char_vocab = io.load_json(vocab_paths['char_vocab_path'])
        self.char_w2i, self.char_i2w = self.char_vocab['w2i'], self.char_vocab['i2w']

        assert self.char_w2i[PAD_TOKEN] == self.w2i[PAD_TOKEN]
        assert self.char_w2i[START_TOKEN] == self.w2i[START_TOKEN]
        assert self.char_w2i[END_TOKEN] == self.w2i[END_TOKEN]
        assert self.char_w2i[UNK_TOKEN] == self.w2i[UNK_TOKEN]

        self.anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
        self.anon_w2i, self.anon_i2w = self.anon_vocab['w2i'], self.anon_vocab['i2w']

        self.anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
        self.anon_char_w2i, self.anon_char_i2w = self.anon_char_vocab['w2i'], self.anon_char_vocab['i2w']

        assert self.anon_char_w2i[PAD_TOKEN] == self.anon_w2i[PAD_TOKEN]
        assert self.anon_char_w2i[START_TOKEN] == self.anon_w2i[START_TOKEN]
        assert self.anon_char_w2i[END_TOKEN] == self.anon_w2i[END_TOKEN]
        assert self.anon_char_w2i[UNK_TOKEN] == self.anon_w2i[UNK_TOKEN]
def make_rnn_data(problem,
                  split,
                  domain='education',
                  sampling_strategy='standard'):
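    """Featurise rubric-sampled programs shard by shard: encode token- and
    character-level programs (original and anonymised), labels, and RV
    orderings; save one set of matrices per shard; and write out the RV info
    plus dataset metadata (max lengths, number of shards, shard size)."""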
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths,
     tiers_paths, anon_mapping_paths, all_rvs_path) = \
         paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])

        # assumes equally sized shards (except for a possibly smaller final shard)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)

        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len, character_level=True)

        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab, anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))

        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])

    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, rnn_paths['metadata_path'])
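
# NOTE: `create_rv_info` is not defined in this snippet. A purely hypothetical
# sketch, assuming (per the comment in `_load_data` below) that rv_info holds
# w2i/i2w maps and category counts for every random variable, and that
# `all_rvs` maps each RV name to the list of values it can take:
def create_rv_info(all_rvs):
    rv_info = {'w2i': {}, 'i2w': {}, 'num_categories': {}}
    for rv, values in all_rvs.items():
        rv_info['w2i'][rv] = {v: i for i, v in enumerate(values)}
        rv_info['i2w'][rv] = {i: v for i, v in enumerate(values)}
        rv_info['num_categories'][rv] = len(values)
    return rv_info
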
def process_student_data(problem, account_for_counts=False):
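    """Featurise real student submissions with the rubric-sample vocabularies.
    Max lengths are recomputed from the student programs themselves (they may
    exceed the lengths seen in training), and the student labels and `zipfs`
    are saved alongside the program matrices."""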
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    os.makedirs(rnn_paths['student_data_path'], exist_ok=True)
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    # max_len = metadata['max_len']
    # char_max_len = metadata['char_max_len']
    # anon_max_len = metadata['anon_max_len']
    # anon_char_max_len = metadata['anon_char_max_len']

    # We do not want to load these from the training metadata because some
    # student programs may be longer than those seen in training; instead,
    # recompute the maximum lengths from the student programs themselves.
    programs, labels, zipfs, anon_programs = raw_student_data(
        problem, account_for_counts)

    # +2 to include the start and end tokens
    max_len = max(len(x.split()) for x in programs) + 2
    char_max_len = max(len(x) for x in programs) + 2
    anon_max_len = max(len(x.split()) for x in anon_programs) + 2
    anon_char_max_len = max(len(x) for x in anon_programs) + 2

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])

    io.save_np(labels, rnn_paths['feat_labels_path'])
    io.save_np(zipfs, rnn_paths['feat_zipfs_path'])
    def _load_data(self):
        '''
            Loads all shard-independent data
        '''
        rv_info_list, metadata_dict = [], {}
        num_shards_list, shard_size_list, data_len_list = [], [], []
        w2i_list, i2w_list = [], []
        char_w2i_list, char_i2w_list = [], []
        anon_w2i_list, anon_i2w_list = [], []
        anon_char_w2i_list, anon_char_i2w_list = [], []
        shard_num_to_sampling_strategy = []
        shard_num_to_sampling_shard_num = []
        max_len_list, char_max_len_list = [], []
        anon_max_len_list, anon_char_max_len_list = [], []

        for sampling_strategy in self.sampling_strategy_list:
            rnn_paths = paths.rnn_data_paths(self.problem, self.split,
                                             self.domain, sampling_strategy)
            vocab_paths = paths.vocab_paths(self.problem, self.domain)
            """
            for _, path in rnn_paths.items():
                if not os.path.exists(path) and not os.path.exists(path.format(0)):
                    if 'student' not in path:
                        raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))
            """

            # contains w2i, i2w, num_categories for all rvs
            rv_info = io.load_json(rnn_paths['rv_info_path'])
            metadata = io.load_json(rnn_paths['metadata_path'])
            num_shards = metadata['num_shards']
            shard_size = metadata['shard_size']
            data_len = metadata['data_len']
            max_len = metadata['max_len']
            char_max_len = metadata['char_max_len']
            anon_max_len = metadata['anon_max_len']
            anon_char_max_len = metadata['anon_char_max_len']

            vocab = io.load_json(vocab_paths['vocab_path'])
            w2i, i2w = vocab['w2i'], vocab['i2w']

            char_vocab = io.load_json(vocab_paths['char_vocab_path'])
            char_w2i, char_i2w = char_vocab['w2i'], char_vocab['i2w']

            assert char_w2i[PAD_TOKEN] == w2i[PAD_TOKEN]
            assert char_w2i[START_TOKEN] == w2i[START_TOKEN]
            assert char_w2i[END_TOKEN] == w2i[END_TOKEN]
            assert char_w2i[UNK_TOKEN] == w2i[UNK_TOKEN]

            anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
            anon_w2i, anon_i2w = anon_vocab['w2i'], anon_vocab['i2w']

            anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
            anon_char_w2i = anon_char_vocab['w2i']
            anon_char_i2w = anon_char_vocab['i2w']

            assert anon_char_w2i[PAD_TOKEN] == anon_w2i[PAD_TOKEN]
            assert anon_char_w2i[START_TOKEN] == anon_w2i[START_TOKEN]
            assert anon_char_w2i[END_TOKEN] == anon_w2i[END_TOKEN]
            assert anon_char_w2i[UNK_TOKEN] == anon_w2i[UNK_TOKEN]

            rv_info_list.append(rv_info)
            metadata_dict[sampling_strategy] = metadata
            num_shards_list.append(num_shards)
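            # map each global shard index to its sampling strategy and to its
            # local shard index within that strategy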
            shard_num_to_sampling_strategy.extend([sampling_strategy] *
                                                  num_shards)
            shard_num_to_sampling_shard_num.extend(range(num_shards))
            shard_size_list.append(shard_size)
            data_len_list.append(data_len)
            w2i_list.append(w2i)
            i2w_list.append(i2w)
            char_w2i_list.append(char_w2i)
            char_i2w_list.append(char_i2w)
            anon_w2i_list.append(anon_w2i)
            anon_i2w_list.append(anon_i2w)
            anon_char_w2i_list.append(anon_char_w2i)
            anon_char_i2w_list.append(anon_char_i2w)
            max_len_list.append(max_len)
            char_max_len_list.append(char_max_len)
            anon_max_len_list.append(anon_max_len)
            anon_char_max_len_list.append(anon_char_max_len)

        self.rv_info = rv_info_list[0]  # assume all of these are the same
        self.metadata_dict = metadata_dict
        self.num_shards = sum(num_shards_list)  # consider all shards
        self.shard_size_list = shard_size_list
        self.data_len = sum(data_len_list)
        self.w2i = merge_dicts(*w2i_list)
        self.i2w = merge_dicts(*i2w_list)
        self.vocab = {'w2i': self.w2i, 'i2w': self.i2w}
        self.char_w2i = merge_dicts(*char_w2i_list)
        self.char_i2w = merge_dicts(*char_i2w_list)
        self.char_vocab = {'w2i': self.char_w2i, 'i2w': self.char_i2w}
        self.anon_w2i = merge_dicts(*anon_w2i_list)
        self.anon_i2w = merge_dicts(*anon_i2w_list)
        self.anon_vocab = {'w2i': self.anon_w2i, 'i2w': self.anon_i2w}
        self.anon_char_w2i = merge_dicts(*anon_char_w2i_list)
        self.anon_char_i2w = merge_dicts(*anon_char_i2w_list)
        self.anon_char_vocab = {
            'w2i': self.anon_char_w2i,
            'i2w': self.anon_char_i2w
        }
        self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
        self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
        self.max_len_list = max_len_list
        self.char_max_len_list = char_max_len_list
        self.anon_max_len_list = anon_max_len_list
        self.anon_char_max_len_list = anon_char_max_len_list
        # take the max across strategies; sequences will be padded to this size
        self.max_len = max(max_len_list)
        self.char_max_len = max(char_max_len_list)
        self.anon_max_len = max(anon_max_len_list)
        self.anon_char_max_len = max(anon_char_max_len_list)