def raw_student_data(problem, account_for_counts=False):
    """Load raw student programs together with their per-program metadata.

    Four pickled mappings, all keyed by program, are read from the paths
    returned by ``paths.raw_codeorg_student_paths(problem)``: counts, labels,
    zipfs and the anonymized form of each program.

    Args:
        problem: identifier forwarded to ``paths.raw_codeorg_student_paths``.
        account_for_counts: when true, every program is repeated ``count``
            times (``count`` taken from the counts mapping), and so are its
            label, zipf and anonymized entries. When false, each distinct
            program appears exactly once.

    Returns:
        A tuple ``(programs, labels, zipfs, anon_programs)`` of four parallel
        lists.

    Raises:
        KeyError: if a program present in the counts mapping is missing from
            one of the other mappings.
    """
    counts_path, labels_path, zipfs_path, anon_mapping_path = \
        paths.raw_codeorg_student_paths(problem)
    prog2counts = io.load_pickle(counts_path)
    prog2labels = io.load_pickle(labels_path)
    prog2zipfs = io.load_pickle(zipfs_path)
    prog2anons = io.load_pickle(anon_mapping_path)

    if account_for_counts:
        # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` is Python 2 only and raises AttributeError on 3.
        programs = []
        for prog, count in prog2counts.items():
            programs.extend(prog for _ in range(count))
    else:
        programs = list(prog2counts.keys())

    # The metadata lists are derived from ``programs`` in both modes, so the
    # four returned lists are always index-aligned.
    labels = [prog2labels[p] for p in programs]
    zipfs = [prog2zipfs[p] for p in programs]
    anon_programs = [prog2anons[p] for p in programs]

    return programs, labels, zipfs, anon_programs
def raw_student_data(problem):
    """Load raw student programs and their anonymized counterparts.

    Two pickles are read from the paths given by
    ``paths.raw_student_paths(problem)``: a mapping keyed by program, and a
    mapping from each program to its anonymized form.

    Returns:
        A tuple ``(programs, anon_programs)`` of two parallel lists.
    """
    data_path, anon_mapping_path = paths.raw_student_paths(problem)
    student_progs = io.load_pickle(data_path)
    prog_to_anon = io.load_pickle(anon_mapping_path)

    programs = []
    anon_programs = []
    for prog in student_progs.keys():
        programs.append(prog)
        anon_programs.append(prog_to_anon[prog])

    return programs, anon_programs
    def _load_data(self):
        """Read raw and featurized student programs into instance attributes.

        File locations come from ``paths.rnn_data_paths`` called with
        ``self.problem`` and the fixed arguments ``'train'``, ``'education'``
        and ``'standard'``.

        Sets ``raw_programs`` and ``anon_raw_programs`` from pickles, and the
        ``programs`` / ``lengths`` attribute pairs (plain, ``char_``,
        ``anon_`` and ``anon_char_`` prefixed) from the matching .mat files.
        """
        data_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')

        self.raw_programs = io.load_pickle(data_paths['raw_student_programs_path'])
        self.anon_raw_programs = io.load_pickle(data_paths['anon_raw_student_programs_path'])

        # Shape: (n x seq_len)
        token_mat = io.loadmat(data_paths['student_programs_path'])
        char_mat = io.loadmat(data_paths['student_char_programs_path'])
        anon_token_mat = io.loadmat(data_paths['anon_student_programs_path'])
        anon_char_mat = io.loadmat(data_paths['anon_student_char_programs_path'])

        # NOTE: ``tiers`` is not read from the programs .mat file here (the
        # assignment was left commented out in the original code).
        loaded = (
            ('', token_mat),
            ('char_', char_mat),
            ('anon_', anon_token_mat),
            ('anon_char_', anon_char_mat),
        )
        for prefix, mat in loaded:
            setattr(self, prefix + 'programs', mat['programs'])
            setattr(self, prefix + 'lengths', mat['lengths'].squeeze())
Ejemplo n.º 4
0
def load_raw_rubric_data(counts_path, labels_path, rv_order_path, tiers_path, anon_mapping_path):
    """Load the pickled rubric data and align it by program.

    Each path points to a pickle holding a mapping keyed by program. The keys
    of the counts mapping define which programs are returned and in what
    order.

    Returns:
        A tuple ``(programs, anon_programs, p_labels, p_rvorders, p_tiers,
        counts)``. The first five are parallel lists; labels are passed
        through ``fix_labels``. ``counts`` is the loaded counts mapping
        itself.
    """
    counts = io.load_pickle(counts_path)
    label_table = io.load_pickle(labels_path)
    anon_table = io.load_pickle(anon_mapping_path)
    tier_table = io.load_pickle(tiers_path)
    rv_order_table = io.load_pickle(rv_order_path)

    programs = list(counts.keys())

    def _aligned(table):
        # Values of ``table`` in the same order as ``programs``.
        return [table[prog] for prog in programs]

    anon_programs = _aligned(anon_table)
    p_labels = [fix_labels(label_table[prog]) for prog in programs]
    p_rvorders = _aligned(rv_order_table)
    p_tiers = _aligned(tier_table)

    return programs, anon_programs, p_labels, p_rvorders, p_tiers, counts
Ejemplo n.º 5
0
def load_raw_scene_graph_data(counts_path, labels_path, rv_order_path,
                              images_path, tiers_path):
    """Load the pickled scene-graph data and align it by scene graph.

    Each path points to a pickle holding a mapping keyed by scene graph. The
    keys of the counts mapping define which scene graphs are returned and in
    what order.

    Returns:
        A tuple ``(scene_graphs, p_images, p_labels, p_rvorders, p_tiers,
        counts)``. The first five are parallel lists; labels are passed
        through ``fix_labels``. ``counts`` is the loaded counts mapping
        itself.
    """
    counts = io.load_pickle(counts_path)
    label_table = io.load_pickle(labels_path)
    image_table = io.load_pickle(images_path)
    tier_table = io.load_pickle(tiers_path)
    rv_order_table = io.load_pickle(rv_order_path)

    scene_graphs = list(counts.keys())

    def _aligned(table):
        # Values of ``table`` in the same order as ``scene_graphs``.
        return [table[graph] for graph in scene_graphs]

    p_images = _aligned(image_table)
    p_labels = [fix_labels(label_table[graph]) for graph in scene_graphs]
    p_rvorders = _aligned(rv_order_table)
    p_tiers = _aligned(tier_table)

    return (scene_graphs, p_images, p_labels, p_rvorders, p_tiers, counts)
Ejemplo n.º 6
0
    def _load_shard(self, _shard_num):
        """Load one shard of scene-graph data into instance attributes.

        ``_shard_num`` is looked up in ``self.shard_num_to_sampling_strategy``
        and ``self.shard_num_to_sampling_shard_num``; the resulting strategy
        and per-strategy shard number select the files to read.

        Sets ``curr_shard``, ``raw_rvOrders``, ``images``, ``tiers``,
        ``labels``, ``rvOrders`` and ``rvOrders_lengths``.
        """
        self.curr_shard = _shard_num

        strategy = self.shard_num_to_sampling_strategy[_shard_num]
        # Recover the shard number local to that single sampling strategy.
        local_shard = self.shard_num_to_sampling_shard_num[_shard_num]
        scene_paths = paths.scene_graph_data_paths(self.problem, self.split,
                                                   strategy)

        def shard_file(key):
            # Path templates contain a placeholder for the shard number.
            return scene_paths[key].format(local_shard)

        image_data = io.loadmat(shard_file('feat_images_path'))
        self.raw_rvOrders = io.load_pickle(shard_file('raw_rvOrder_path'))

        # Shape: (n x 3 x 64 x 64)
        self.images = image_data['images']
        self.tiers = image_data['tiers'].squeeze()

        # Shape: (n x num_labels).  1 if label, 0 otherwise
        self.labels = io.load_np(shard_file('feat_labels_path'))

        rv_order_data = io.loadmat(shard_file('feat_rvOrder_path'))
        self.rvOrders = rv_order_data['rv_orders']
        self.rvOrders_lengths = rv_order_data['lengths'].squeeze()
0
	def create_lsh_forest(self):
		"""Populate ``self.forest``, from the pickle cache when allowed.

		If ``self.evict_cache`` is falsy and ``lsh_forest.pkl`` exists under
		``self.CACHE_DIR``, the forest is loaded from that file. Otherwise
		it is built from ``self.sampledData``, indexed, and written back to
		the cache file (creating ``self.CACHE_DIR`` if needed).
		"""
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')

		if not self.evict_cache and os.path.isfile(cache_file):
			# A precomputed forest is available; reuse it.
			print('Loading cached forest')
			self.forest = load_pickle(cache_file)
			return

		# Build the forest from scratch.
		sampled_sets = self.processData(self.sampledData)
		self.sampledMinHashes = self.createMinHashSet(sampled_sets)

		self.forest = MinHashLSHForest(num_perm=self.num_perm)
		# Each MinHash is keyed by its position in ``self.sampledMinHashes``.
		for position, min_hash in enumerate(self.sampledMinHashes):
			self.forest.add(position, min_hash)
		self.forest.index()

		# Persist the forest for later runs.
		os.makedirs(self.CACHE_DIR, exist_ok=True)
		save_pickle(self.forest, cache_file)
Ejemplo n.º 8
0
    def _load_shard(self, _shard_num):
        """Load one shard of program data into instance attributes.

        ``_shard_num`` is looked up in ``self.shard_num_to_sampling_strategy``
        and ``self.shard_num_to_sampling_shard_num``; the resulting strategy
        and per-strategy shard number select the files to read.

        Sets ``curr_shard``, the raw pickled programs and rvOrders, the four
        ``programs`` / ``lengths`` attribute pairs (plain, ``char_``,
        ``anon_`` and ``anon_char_`` prefixed), ``tiers``, ``labels``,
        ``rvOrders`` and ``rvOrders_lengths``. Each ``*programs`` attribute
        is passed through ``self._pad_program`` after loading.
        """
        self.curr_shard = _shard_num

        strategy = self.shard_num_to_sampling_strategy[_shard_num]
        # Recover the shard number local to that single sampling strategy.
        local_shard = self.shard_num_to_sampling_shard_num[_shard_num]
        rnn_paths = paths.rnn_data_paths(self.problem, self.split, self.domain,
                                         strategy)

        def shard_file(key):
            # Path templates contain a placeholder for the shard number.
            return rnn_paths[key].format(local_shard)

        self.raw_programs = io.load_pickle(shard_file('raw_programs_path'))
        self.anon_raw_programs = io.load_pickle(
            shard_file('anon_raw_programs_path'))
        self.raw_rvOrders = io.load_pickle(shard_file('raw_rvOrder_path'))

        # Shape: (n x seq_len)
        token_mat = io.loadmat(shard_file('feat_programs_path'))
        char_mat = io.loadmat(shard_file('char_feat_programs_path'))
        anon_token_mat = io.loadmat(shard_file('anon_feat_programs_path'))
        anon_char_mat = io.loadmat(shard_file('anon_char_feat_programs_path'))

        self.programs = token_mat['programs']
        self.lengths = token_mat['lengths'].squeeze()
        self.tiers = token_mat['tiers'][0]

        remaining = (
            ('char_', char_mat),
            ('anon_', anon_token_mat),
            ('anon_char_', anon_char_mat),
        )
        for prefix, mat in remaining:
            setattr(self, prefix + 'programs', mat['programs'])
            setattr(self, prefix + 'lengths', mat['lengths'].squeeze())

        # Pad programs to a single shape; each variant has its own
        # vocabulary (``*w2i``) and maximum length (``*max_len``).
        pad = self._pad_program
        for prefix in ('', 'char_', 'anon_', 'anon_char_'):
            vocab = getattr(self, prefix + 'w2i')
            unpadded = getattr(self, prefix + 'programs')
            limit = getattr(self, prefix + 'max_len')
            setattr(self, prefix + 'programs', pad(vocab, unpadded, limit))

        # Shape: (n x num_labels).  1 if label, 0 otherwise
        self.labels = io.load_np(shard_file('feat_labels_path'))

        rv_order_data = io.loadmat(shard_file('feat_rvOrder_path'))
        self.rvOrders = rv_order_data['rv_orders']
        self.rvOrders_lengths = rv_order_data['lengths'].squeeze()