def _load_data(self):
        rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')

        self.raw_programs = io.load_pickle(rnn_paths['raw_student_programs_path'])
        self.anon_raw_programs = io.load_pickle(rnn_paths['anon_raw_student_programs_path'])

        # Shape: (n x seq_len)
        programs_mat = io.loadmat(rnn_paths['student_programs_path'])
        char_programs_mat = io.loadmat(rnn_paths['student_char_programs_path'])

        anon_programs_mat = io.loadmat(rnn_paths['anon_student_programs_path'])
        anon_char_programs_mat = io.loadmat(rnn_paths['anon_student_char_programs_path'])

        self.programs = programs_mat['programs']
        self.lengths = programs_mat['lengths'].squeeze()
        # self.tiers = programs_mat['tiers'][0]

        self.char_programs = char_programs_mat['programs']
        self.char_lengths = char_programs_mat['lengths'].squeeze()

        self.anon_programs = anon_programs_mat['programs']
        self.anon_lengths = anon_programs_mat['lengths'].squeeze()

        self.anon_char_programs = anon_char_programs_mat['programs']
        self.anon_char_lengths = anon_char_programs_mat['lengths'].squeeze()
    def __init__(self, problem, character_level=False, include_anonymized=False):
        super().__init__(problem, character_level=character_level,
                         include_anonymized=include_anonymized)

        rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')
        self.labels = io.load_np(rnn_paths['feat_labels_path'])
        self.zipfs = io.load_np(rnn_paths['feat_zipfs_path'])
def process_student_data(problem):
    # TODO: fix this, it's outdated

    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    max_len = metadata['max_len']
    char_max_len = metadata['char_max_len']
    anon_max_len = metadata['anon_max_len']
    anon_char_max_len = metadata['anon_char_max_len']

    programs, anon_programs = raw_student_data(problem)

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
    def _load_metadata(self):
        '''
            Loads all housekeeping data
        '''
        rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')
        vocab_paths = paths.vocab_paths(self.problem, 'education')

        for _, path in rnn_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))

        metadata = io.load_json(rnn_paths['metadata_path'])
        self.max_len = metadata['max_len'] 
        self.char_max_len = metadata['char_max_len'] 
        self.anon_max_len = metadata['anon_max_len'] 
        self.anon_char_max_len = metadata['anon_char_max_len']

        self.vocab = io.load_json(vocab_paths['vocab_path'])
        self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']

        self.char_vocab = io.load_json(vocab_paths['char_vocab_path'])
        self.char_w2i, self.char_i2w = self.char_vocab['w2i'], self.char_vocab['i2w']

        assert self.char_w2i[PAD_TOKEN] == self.w2i[PAD_TOKEN]
        assert self.char_w2i[START_TOKEN] == self.w2i[START_TOKEN]
        assert self.char_w2i[END_TOKEN] == self.w2i[END_TOKEN]
        assert self.char_w2i[UNK_TOKEN] == self.w2i[UNK_TOKEN]

        self.anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
        self.anon_w2i, self.anon_i2w = self.anon_vocab['w2i'], self.anon_vocab['i2w']

        self.anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
        self.anon_char_w2i, self.anon_char_i2w = self.anon_char_vocab['w2i'], self.anon_char_vocab['i2w']

        assert self.anon_char_w2i[PAD_TOKEN] == self.anon_w2i[PAD_TOKEN]
        assert self.anon_char_w2i[START_TOKEN] == self.anon_w2i[START_TOKEN]
        assert self.anon_char_w2i[END_TOKEN] == self.anon_w2i[END_TOKEN]
        assert self.anon_char_w2i[UNK_TOKEN] == self.anon_w2i[UNK_TOKEN]
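For reference, a minimal sketch of the vocab structure `_load_metadata` assumes; the concrete ids below are illustrative assumptions, not taken from this snippet. Note that JSON object keys are strings, so an `i2w` mapping loaded this way maps `'0'`-style string keys back to tokens.

# Hypothetical vocab layout, inferred from the 'w2i'/'i2w' accesses above
# (ids are illustrative assumptions only):
example_vocab = {
    'w2i': {PAD_TOKEN: 0, START_TOKEN: 1, END_TOKEN: 2, UNK_TOKEN: 3},
    'i2w': {'0': PAD_TOKEN, '1': START_TOKEN, '2': END_TOKEN, '3': UNK_TOKEN},
}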
def make_rnn_data(problem,
                  split,
                  domain='education',
                  sampling_strategy='standard'):
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths,
     tiers_paths, anon_mapping_paths, all_rvs_path) = \
         paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])

        # assumes equally sized shards (except for a possibly smaller final shard)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)

        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len, character_level=True)

        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab, anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))

        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])

    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, rnn_paths['metadata_path'])
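A hypothetical driver for make_rnn_data; the problem name and split list below are assumptions for illustration, not taken from this snippet.

if __name__ == '__main__':
    # Build rubric-sample RNN data for each split of an assumed problem.
    for split in ['train', 'val', 'test']:
        make_rnn_data('example_problem', split,
                      domain='education',
                      sampling_strategy='standard')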
def process_student_data(problem, account_for_counts=False):
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    os.makedirs(rnn_paths['student_data_path'], exist_ok=True)
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    # max_len = metadata['max_len']
    # char_max_len = metadata['char_max_len']
    # anon_max_len = metadata['anon_max_len']
    # anon_char_max_len = metadata['anon_char_max_len']

    # We do not load these from metadata because some student programs may be
    # longer than those seen in training; instead we recompute the maximum
    # lengths from the student data itself.
    programs, labels, zipfs, anon_programs = raw_student_data(
        problem, account_for_counts)

    # +2 to account for the start and end tokens
    max_len = max(len(x.split()) for x in programs) + 2
    char_max_len = max(len(x) for x in programs) + 2
    anon_max_len = max(len(x.split()) for x in anon_programs) + 2
    anon_char_max_len = max(len(x) for x in anon_programs) + 2

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])

    io.save_np(labels, rnn_paths['feat_labels_path'])
    io.save_np(zipfs, rnn_paths['feat_zipfs_path'])
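A minimal usage sketch, assuming the vocab and metadata for the problem were already generated by the preprocessing step (see make_rnn_data above); 'example_problem' is a placeholder name.

# Featurise real student submissions with the vocab built from rubric samples.
process_student_data('example_problem', account_for_counts=False)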
Example #7
    def _load_data(self):
        '''
            Loads all shard-independent data
        '''
        rv_info_list, metadata_dict = [], {}
        num_shards_list, shard_size_list, data_len_list = [], [], []
        w2i_list, i2w_list = [], []
        char_w2i_list, char_i2w_list = [], []
        anon_w2i_list, anon_i2w_list = [], []
        anon_char_w2i_list, anon_char_i2w_list = [], []
        shard_num_to_sampling_strategy = []
        shard_num_to_sampling_shard_num = []
        max_len_list, char_max_len_list = [], []
        anon_max_len_list, anon_char_max_len_list = [], []

        for sampling_strategy in self.sampling_strategy_list:
            rnn_paths = paths.rnn_data_paths(self.problem, self.split,
                                             self.domain, sampling_strategy)
            vocab_paths = paths.vocab_paths(self.problem, self.domain)
            """
            for _, path in rnn_paths.items():
                if not os.path.exists(path) and not os.path.exists(path.format(0)):
                    if 'student' not in path:
                        raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))
            """

            # contains w2i, i2w, num_categories for all rvs
            rv_info = io.load_json(rnn_paths['rv_info_path'])
            metadata = io.load_json(rnn_paths['metadata_path'])
            num_shards = metadata['num_shards']
            shard_size = metadata['shard_size']
            data_len = metadata['data_len']
            max_len = metadata['max_len']
            char_max_len = metadata['char_max_len']
            anon_max_len = metadata['anon_max_len']
            anon_char_max_len = metadata['anon_char_max_len']

            vocab = io.load_json(vocab_paths['vocab_path'])
            w2i, i2w = vocab['w2i'], vocab['i2w']

            char_vocab = io.load_json(vocab_paths['char_vocab_path'])
            char_w2i, char_i2w = char_vocab['w2i'], char_vocab['i2w']

            assert char_w2i[PAD_TOKEN] == w2i[PAD_TOKEN]
            assert char_w2i[START_TOKEN] == w2i[START_TOKEN]
            assert char_w2i[END_TOKEN] == w2i[END_TOKEN]
            assert char_w2i[UNK_TOKEN] == w2i[UNK_TOKEN]

            anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
            anon_w2i, anon_i2w = anon_vocab['w2i'], anon_vocab['i2w']

            anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
            anon_char_w2i = anon_char_vocab['w2i']
            anon_char_i2w = anon_char_vocab['i2w']

            assert anon_char_w2i[PAD_TOKEN] == anon_w2i[PAD_TOKEN]
            assert anon_char_w2i[START_TOKEN] == anon_w2i[START_TOKEN]
            assert anon_char_w2i[END_TOKEN] == anon_w2i[END_TOKEN]
            assert anon_char_w2i[UNK_TOKEN] == anon_w2i[UNK_TOKEN]

            rv_info_list.append(rv_info)
            metadata_dict[sampling_strategy] = metadata
            num_shards_list.append(num_shards)
            shard_num_to_sampling_strategy.extend([sampling_strategy] *
                                                  num_shards)
            shard_num_to_sampling_shard_num.extend(range(num_shards))
            shard_size_list.append(shard_size)
            data_len_list.append(data_len)
            w2i_list.append(w2i)
            i2w_list.append(i2w)
            char_w2i_list.append(char_w2i)
            char_i2w_list.append(char_i2w)
            anon_w2i_list.append(anon_w2i)
            anon_i2w_list.append(anon_i2w)
            anon_char_w2i_list.append(anon_char_w2i)
            anon_char_i2w_list.append(anon_char_i2w)
            max_len_list.append(max_len)
            char_max_len_list.append(char_max_len)
            anon_max_len_list.append(anon_max_len)
            anon_char_max_len_list.append(anon_char_max_len)

        self.rv_info = rv_info_list[0]  # assume all of these are the same
        self.metadata_dict = metadata_dict
        self.num_shards = sum(num_shards_list)  # consider all shards
        self.shard_size_list = shard_size_list
        self.data_len = sum(data_len_list)
        self.w2i = merge_dicts(*w2i_list)
        self.i2w = merge_dicts(*i2w_list)
        self.vocab = {'w2i': self.w2i, 'i2w': self.i2w}
        self.char_w2i = merge_dicts(*char_w2i_list)
        self.char_i2w = merge_dicts(*char_i2w_list)
        self.char_vocab = {'w2i': self.char_w2i, 'i2w': self.char_i2w}
        self.anon_w2i = merge_dicts(*anon_w2i_list)
        self.anon_i2w = merge_dicts(*anon_i2w_list)
        self.anon_vocab = {'w2i': self.anon_w2i, 'i2w': self.anon_i2w}
        self.anon_char_w2i = merge_dicts(*anon_char_w2i_list)
        self.anon_char_i2w = merge_dicts(*anon_char_i2w_list)
        self.anon_char_vocab = {
            'w2i': self.anon_char_w2i,
            'i2w': self.anon_char_i2w
        }
        self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
        self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
        self.max_len_list = max_len_list
        self.char_max_len_list = char_max_len_list
        self.anon_max_len_list = anon_max_len_list
        self.anon_char_max_len_list = anon_char_max_len_list
        # take the max across sampling strategies; shards will be padded to this size
        self.max_len = max(max_len_list)
        self.char_max_len = max(char_max_len_list)
        self.anon_max_len = max(anon_max_len_list)
        self.anon_char_max_len = max(anon_char_max_len_list)
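merge_dicts is used above but not defined in this snippet; a minimal sketch of the assumed behaviour, with later dicts winning on key collisions:

def merge_dicts(*dicts):
    # Fold all mappings into a single dict; later dicts override earlier keys.
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged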
Example #8
    def _load_shard(self, _shard_num):
        self.curr_shard = _shard_num

        sampling_strategy = self.shard_num_to_sampling_strategy[_shard_num]
        # we need to recover the actual shard_num for a single sampling strategy
        shard_num = self.shard_num_to_sampling_shard_num[_shard_num]
        rnn_paths = paths.rnn_data_paths(self.problem, self.split, self.domain,
                                         sampling_strategy)

        self.raw_programs = io.load_pickle(
            rnn_paths['raw_programs_path'].format(shard_num))
        self.anon_raw_programs = io.load_pickle(
            rnn_paths['anon_raw_programs_path'].format(shard_num))
        self.raw_rvOrders = io.load_pickle(
            rnn_paths['raw_rvOrder_path'].format(shard_num))

        # Shape: (n x seq_len)
        programs_mat = io.loadmat(
            rnn_paths['feat_programs_path'].format(shard_num))
        char_programs_mat = io.loadmat(
            rnn_paths['char_feat_programs_path'].format(shard_num))

        anon_programs_mat = io.loadmat(
            rnn_paths['anon_feat_programs_path'].format(shard_num))
        anon_char_programs_mat = io.loadmat(
            rnn_paths['anon_char_feat_programs_path'].format(shard_num))

        self.programs = programs_mat['programs']
        self.lengths = programs_mat['lengths'].squeeze()
        self.tiers = programs_mat['tiers'][0]

        self.char_programs = char_programs_mat['programs']
        self.char_lengths = char_programs_mat['lengths'].squeeze()

        self.anon_programs = anon_programs_mat['programs']
        self.anon_lengths = anon_programs_mat['lengths'].squeeze()

        self.anon_char_programs = anon_char_programs_mat['programs']
        self.anon_char_lengths = anon_char_programs_mat['lengths'].squeeze()

        # pad programs to single shape
        self.programs = self._pad_program(self.w2i, self.programs,
                                          self.max_len)
        self.char_programs = self._pad_program(self.char_w2i,
                                               self.char_programs,
                                               self.char_max_len)
        self.anon_programs = self._pad_program(self.anon_w2i,
                                               self.anon_programs,
                                               self.anon_max_len)
        self.anon_char_programs = self._pad_program(self.anon_char_w2i,
                                                    self.anon_char_programs,
                                                    self.anon_char_max_len)

        # Shape: (n x num_labels); entry is 1 if the label applies, 0 otherwise
        self.labels = io.load_np(
            rnn_paths['feat_labels_path'].format(shard_num))

        rvOrders_mat = io.loadmat(
            rnn_paths['feat_rvOrder_path'].format(shard_num))
        self.rvOrders = rvOrders_mat['rv_orders']
        self.rvOrders_lengths = rvOrders_mat['lengths'].squeeze()
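_pad_program is called above but not defined in this snippet; a minimal sketch under the assumption that it right-pads each (n x seq_len) id matrix with the PAD id up to a target length:

import numpy as np

def _pad_program_sketch(w2i, programs, target_len):
    # Right-pad every row of token ids with the PAD id so all shards
    # share a single (n x target_len) shape.
    n, seq_len = programs.shape
    if seq_len >= target_len:
        return programs
    padded = np.full((n, target_len), w2i[PAD_TOKEN], dtype=programs.dtype)
    padded[:, :seq_len] = programs
    return padded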