def make_vocabs(problem, domain='education'):
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(vocab_paths['data_path'], exist_ok=True)

    all_programs, all_anon_programs = [], []
    for sampling_strategy in ['standard', 'uniform', 'tempered']:
        (counts_paths, labels_paths, rv_order_paths, tiers_paths,
         anon_mapping_paths, all_rvs_path) = \
            paths.raw_data_paths(problem, 'train', domain, sampling_strategy)
        n_shards = len(counts_paths)
        for i in range(n_shards):
            programs_i, anon_programs_i, _, _, _, _ = load_raw_rubric_data(
                counts_paths[i], labels_paths[i], rv_order_paths[i],
                tiers_paths[i], anon_mapping_paths[i])
            all_programs.extend(programs_i)
            all_anon_programs.extend(anon_programs_i)

    vocab = build_vocab_rnn(all_programs, character_level=False)
    char_vocab = build_vocab_rnn(all_programs, character_level=True)
    anon_vocab = build_vocab_rnn(all_anon_programs, character_level=False)
    anon_char_vocab = build_vocab_rnn(all_anon_programs, character_level=True)

    io.save_json(vocab, vocab_paths['vocab_path'])
    io.save_json(char_vocab, vocab_paths['char_vocab_path'])
    io.save_json(anon_vocab, vocab_paths['anon_vocab_path'])
    io.save_json(anon_char_vocab, vocab_paths['anon_char_vocab_path'])
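# `load_vocabs` is called by several functions below but is not defined in
# this section; it may be imported from elsewhere in the repo. A minimal
# sketch of what it presumably does, given that make_vocabs saves exactly
# these four JSON files and callers unpack the result in this order:
def load_vocabs(vocab_paths):
    '''Load the four vocabs written by make_vocabs (sketch; the real helper
    may live elsewhere in the repo).'''
    vocab = io.load_json(vocab_paths['vocab_path'])
    char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
    return vocab, char_vocab, anon_vocab, anon_char_vocab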
def _load_metadata(self):
    '''Loads all housekeeping data.'''
    rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education',
                                     'standard')
    vocab_paths = paths.vocab_paths(self.problem, 'education')

    for _, path in rnn_paths.items():
        if not os.path.exists(path) and not os.path.exists(path.format(0)):
            if 'student' not in path:
                raise RuntimeError(
                    'Data path does not exist: [{}]. Generate using '
                    'preprocessing script.'.format(path))

    metadata = io.load_json(rnn_paths['metadata_path'])
    self.max_len = metadata['max_len']
    self.char_max_len = metadata['char_max_len']
    self.anon_max_len = metadata['anon_max_len']
    self.anon_char_max_len = metadata['anon_char_max_len']

    self.vocab = io.load_json(vocab_paths['vocab_path'])
    self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']
    self.char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    self.char_w2i, self.char_i2w = \
        self.char_vocab['w2i'], self.char_vocab['i2w']

    # word- and character-level vocabs must agree on the special tokens
    assert self.char_w2i[PAD_TOKEN] == self.w2i[PAD_TOKEN]
    assert self.char_w2i[START_TOKEN] == self.w2i[START_TOKEN]
    assert self.char_w2i[END_TOKEN] == self.w2i[END_TOKEN]
    assert self.char_w2i[UNK_TOKEN] == self.w2i[UNK_TOKEN]

    self.anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    self.anon_w2i, self.anon_i2w = \
        self.anon_vocab['w2i'], self.anon_vocab['i2w']
    self.anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
    self.anon_char_w2i, self.anon_char_i2w = \
        self.anon_char_vocab['w2i'], self.anon_char_vocab['i2w']

    assert self.anon_char_w2i[PAD_TOKEN] == self.anon_w2i[PAD_TOKEN]
    assert self.anon_char_w2i[START_TOKEN] == self.anon_w2i[START_TOKEN]
    assert self.anon_char_w2i[END_TOKEN] == self.anon_w2i[END_TOKEN]
    assert self.anon_char_w2i[UNK_TOKEN] == self.anon_w2i[UNK_TOKEN]
def make_rnn_data(problem, split, domain='education',
                  sampling_strategy='standard'):
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths, tiers_paths,
     anon_mapping_paths, all_rvs_path) = \
        paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0
    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = \
            load_raw_rubric_data(counts_paths[i], labels_paths[i],
                                 rv_order_paths[i], tiers_paths[i],
                                 anon_mapping_paths[i])
        # assumes equally sized shards (except a smaller final remainder)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)
        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)
        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len,
                                   character_level=True)
        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab,
                                   anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))
        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])

    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)
    io.save_json(metadata, rnn_paths['metadata_path'])
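# `get_merged_info` (called above) is not defined in this section. Judging by
# how its result is unpacked, it scans every shard once and returns the four
# maximum sequence lengths (word, char, anon word, anon char). A minimal
# sketch under that assumption, under a distinct name so it cannot shadow the
# real helper; the actual implementation may gather more than lengths:
def _get_merged_info_sketch(counts_paths, labels_paths, rv_order_paths,
                            tiers_paths, anon_mapping_paths):
    max_len = char_max_len = anon_max_len = anon_char_max_len = 0
    for i in range(len(counts_paths)):
        programs_i, anon_programs_i, _, _, _, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])
        # +2 accounts for start/end tokens, mirroring process_student_data
        max_len = max(max_len, max(len(p.split()) for p in programs_i) + 2)
        char_max_len = max(char_max_len, max(len(p) for p in programs_i) + 2)
        anon_max_len = max(
            anon_max_len, max(len(p.split()) for p in anon_programs_i) + 2)
        anon_char_max_len = max(
            anon_char_max_len, max(len(p) for p in anon_programs_i) + 2)
    return max_len, char_max_len, anon_max_len, anon_char_max_len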
def process_student_data(problem, account_for_counts=False):
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    os.makedirs(rnn_paths['student_data_path'], exist_ok=True)
    vocab_paths = paths.vocab_paths(problem, 'education')
    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate '
            'vocab file.')
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)

    # We deliberately do NOT reuse the training max-lengths stored in the
    # metadata file, because some student programs may be longer than any
    # program seen in training. Instead, recompute the maximum lengths over
    # the student data itself.
    programs, labels, zipfs, anon_programs = raw_student_data(
        problem, account_for_counts)
    # +2 to include the start and end tokens
    max_len = max(len(x.split()) for x in programs) + 2
    char_max_len = max(len(x) for x in programs) + 2
    anon_max_len = max(len(x.split()) for x in anon_programs) + 2
    anon_char_max_len = max(len(x) for x in anon_programs) + 2

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)
    anon_feat_programs, anon_program_lengths, anon_raw_programs = \
        featurise_programs_rnn(anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = \
        featurise_programs_rnn(anon_programs, anon_char_vocab,
                               anon_char_max_len, character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])
    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
    io.save_np(labels, rnn_paths['feat_labels_path'])
    io.save_np(zipfs, rnn_paths['feat_zipfs_path'])
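# A typical end-to-end preprocessing run, assuming this module is executed as
# a script. The problem name and split names below are hypothetical; the real
# driver may live elsewhere in the repo.
if __name__ == '__main__':
    problem = 'liftoff'  # hypothetical problem name
    make_vocabs(problem)                    # 1. vocabs from rubric samples
    for split in ['train', 'val', 'test']:  # 2. featurised rubric shards
        make_rnn_data(problem, split)
    process_student_data(problem)           # 3. featurised student programs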
def _load_data(self):
    '''Loads all shard-independent data.'''
    rv_info_list, metadata_dict = [], {}
    num_shards_list, shard_size_list, data_len_list = [], [], []
    w2i_list, i2w_list = [], []
    char_w2i_list, char_i2w_list = [], []
    anon_w2i_list, anon_i2w_list = [], []
    anon_char_w2i_list, anon_char_i2w_list = [], []
    shard_num_to_sampling_strategy = []
    shard_num_to_sampling_shard_num = []
    max_len_list, char_max_len_list = [], []
    anon_max_len_list, anon_char_max_len_list = [], []

    for sampling_strategy in self.sampling_strategy_list:
        rnn_paths = paths.rnn_data_paths(self.problem, self.split,
                                         self.domain, sampling_strategy)
        vocab_paths = paths.vocab_paths(self.problem, self.domain)
        """
        for _, path in rnn_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError(
                        "Data path does not exist: [{}]. Generate using "
                        "preprocessing script".format(path))
        """
        # contains w2i, i2w, num_categories for all rvs
        rv_info = io.load_json(rnn_paths['rv_info_path'])
        metadata = io.load_json(rnn_paths['metadata_path'])
        num_shards = metadata['num_shards']
        shard_size = metadata['shard_size']
        data_len = metadata['data_len']
        max_len = metadata['max_len']
        char_max_len = metadata['char_max_len']
        anon_max_len = metadata['anon_max_len']
        anon_char_max_len = metadata['anon_char_max_len']

        vocab = io.load_json(vocab_paths['vocab_path'])
        w2i, i2w = vocab['w2i'], vocab['i2w']
        char_vocab = io.load_json(vocab_paths['char_vocab_path'])
        char_w2i, char_i2w = char_vocab['w2i'], char_vocab['i2w']
        assert char_w2i[PAD_TOKEN] == w2i[PAD_TOKEN]
        assert char_w2i[START_TOKEN] == w2i[START_TOKEN]
        assert char_w2i[END_TOKEN] == w2i[END_TOKEN]
        assert char_w2i[UNK_TOKEN] == w2i[UNK_TOKEN]

        anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
        anon_w2i, anon_i2w = anon_vocab['w2i'], anon_vocab['i2w']
        anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
        anon_char_w2i, anon_char_i2w = \
            anon_char_vocab['w2i'], anon_char_vocab['i2w']
        assert anon_char_w2i[PAD_TOKEN] == anon_w2i[PAD_TOKEN]
        assert anon_char_w2i[START_TOKEN] == anon_w2i[START_TOKEN]
        assert anon_char_w2i[END_TOKEN] == anon_w2i[END_TOKEN]
        assert anon_char_w2i[UNK_TOKEN] == anon_w2i[UNK_TOKEN]

        rv_info_list.append(rv_info)
        metadata_dict[sampling_strategy] = metadata
        num_shards_list.append(num_shards)
        shard_num_to_sampling_strategy.extend([sampling_strategy] * num_shards)
        shard_num_to_sampling_shard_num.extend(range(num_shards))
        shard_size_list.append(shard_size)
        data_len_list.append(data_len)
        w2i_list.append(w2i)
        i2w_list.append(i2w)
        char_w2i_list.append(char_w2i)
        char_i2w_list.append(char_i2w)
        anon_w2i_list.append(anon_w2i)
        anon_i2w_list.append(anon_i2w)
        anon_char_w2i_list.append(anon_char_w2i)
        anon_char_i2w_list.append(anon_char_i2w)
        max_len_list.append(max_len)
        char_max_len_list.append(char_max_len)
        anon_max_len_list.append(anon_max_len)
        anon_char_max_len_list.append(anon_char_max_len)

    self.rv_info = rv_info_list[0]  # assume all of these are the same
    self.metadata_dict = metadata_dict
    self.num_shards = sum(num_shards_list)  # consider all shards
    self.shard_size_list = shard_size_list
    self.data_len = sum(data_len_list)
    self.w2i = merge_dicts(*w2i_list)
    self.i2w = merge_dicts(*i2w_list)
    self.vocab = {'w2i': self.w2i, 'i2w': self.i2w}
    self.char_w2i = merge_dicts(*char_w2i_list)
    self.char_i2w = merge_dicts(*char_i2w_list)
    self.char_vocab = {'w2i': self.char_w2i, 'i2w': self.char_i2w}
    self.anon_w2i = merge_dicts(*anon_w2i_list)
    self.anon_i2w = merge_dicts(*anon_i2w_list)
    self.anon_vocab = {'w2i': self.anon_w2i, 'i2w': self.anon_i2w}
    self.anon_char_w2i = merge_dicts(*anon_char_w2i_list)
    self.anon_char_i2w = merge_dicts(*anon_char_i2w_list)
    self.anon_char_vocab = {'w2i': self.anon_char_w2i,
                            'i2w': self.anon_char_i2w}
    self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
    self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
    self.max_len_list = max_len_list
    self.char_max_len_list = char_max_len_list
    self.anon_max_len_list = anon_max_len_list
    self.anon_char_max_len_list = anon_char_max_len_list
    # take the max; sequences will need to be padded to this size
    self.max_len = max(max_len_list)
    self.char_max_len = max(char_max_len_list)
    self.anon_max_len = max(anon_max_len_list)
    self.anon_char_max_len = max(anon_char_max_len_list)
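# `merge_dicts` (used above) is not defined in this section. It presumably
# behaves like a left-to-right dict union, with later vocabs overriding
# earlier ones on key collisions. A minimal sketch under that assumption,
# under a distinct name so it cannot shadow the real helper:
def _merge_dicts_sketch(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged
# Note that vocab_paths depends only on (problem, domain), not on the
# sampling strategy, and make_vocabs builds the vocabs jointly over all
# strategies, so each loop iteration in _load_data loads identical vocab
# files and the merge appears to be a harmless union of equal dicts.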