Esempio n. 1
0
    def __init__(self, data_folder, shuffle=True, load_only_dicts=False):
        """Load the dictionaries and, optionally, the example splits.

        Args:
            data_folder: Directory handed to ``load_dictionaries`` and to
                ``self._deserialize``.
            shuffle: When true, the splits are read from
                ``<data_folder>/shuffled``. If that raises ``IOError`` the
                splits are read from ``data_folder`` instead, each one is
                shuffled in place with the fixed-seed RNG, and the results
                are saved under ``<data_folder>/shuffled``. When false the
                splits are read from ``data_folder`` as they are.
            load_only_dicts: When true, return right after the dictionaries
                are loaded; ``train_ex``/``valid_ex``/``test_ex`` are then
                never assigned.
        """
        # Fixed seed, so a regenerated shuffle uses the same RNG sequence.
        self.rng = np.random.RandomState(1189)

        dictionaries = load_dictionaries(data_folder)
        self.tl_dict, self.rev_tl_dict = dictionaries
        assert not (self.tl_dict is None or self.rev_tl_dict is None)

        if load_only_dicts:
            return

        if shuffle:
            shuffled_folder = os.path.join(data_folder, 'shuffled')
            try:
                # The status message stays inside the ``try`` so that an
                # IOError raised while reporting is handled the same way as
                # one raised while loading.
                cached = self._deserialize(shuffled_folder)
                self.train_ex, self.valid_ex, self.test_ex = cached

                print("Successfully loaded shuffled data.")
                sys.stdout.flush()

            except IOError:
                print("Generating shuffled data...")
                sys.stdout.flush()

                fresh = self._deserialize(data_folder)
                self.train_ex, self.valid_ex, self.test_ex = fresh

                # Shuffle train, validation, test -- in that order, because
                # all three draw from the same RNG stream.
                for examples in (self.train_ex, self.valid_ex, self.test_ex):
                    self.rng.shuffle(examples)

                make_dir_if_not_exists(shuffled_folder)

                for file_name, examples in (
                        ('examples-train.npy', self.train_ex),
                        ('examples-validation.npy', self.valid_ex),
                        ('examples-test.npy', self.test_ex)):
                    np.save(os.path.join(shuffled_folder, file_name), examples)
        else:
            plain = self._deserialize(data_folder)
            self.train_ex, self.valid_ex, self.test_ex = plain
        # For every problem in ``bin_seeded_test_data``, build the list of
        # (code_id, incorrect program, correct program) triples stored under
        # ``seeded_test_data[bin_id][problem_id]``.
        # ``bin_id``, ``cursor``, ``bin_seeded_test_data`` and
        # ``seeded_test_data`` come from an enclosing scope that is not
        # visible here.
        for problem_id, test_programs_ in bin_seeded_test_data.iteritems():
            seeded_test_data[bin_id][problem_id] = []
            # Only ``incorrect_program`` and ``code_id`` are used below; the
            # other tuple fields are unpacked and ignored.
            for incorrect_program, name_dict, name_sequence, user_id, code_id in test_programs_:
                # Fetch the tokenized code stored in the ``Code`` table for
                # this ``code_id``; if several rows match, the last one wins.
                # NOTE(review): ``correct_program`` is only assigned inside
                # this loop. If the query returns no row it keeps the value
                # from the previous entry (or is unbound on the very first
                # one) -- confirm every ``code_id`` is present in ``Code``.
                for row in cursor.execute(
                        'SELECT tokenized_code from Code where code_id=?;',
                    (code_id, )):
                    correct_program = str(row[0])
                # Both programs go through ``convert_to_rla_format`` before
                # being stored.
                seeded_test_data[bin_id][problem_id].append(
                    (code_id, convert_to_rla_format(incorrect_program),
                     convert_to_rla_format(correct_program)))

skipped = 0
for bin_id in range(5):
    print 'bin_%d' % bin_id,
    target_bin_dir = os.path.join(RLAssist_base_dir, 'bin_%d' % bin_id)
    tl_dict, _ = load_dictionaries(target_bin_dir)

    for which, test_data in [('raw', raw_test_data),
                             ('seeded', seeded_test_data)]:
        test_data_this_fold = {}
        for problem_id in test_data[bin_id]:
            for code_id, inc_tokens, cor_tokens in test_data[bin_id][
                    problem_id]:
                inc_vector = vectorize(inc_tokens, tl_dict)
                corr_vector = vectorize(cor_tokens, tl_dict)
                if inc_vector is None or corr_vector is None:
                    skipped += 1
                    continue
                test_data_this_fold[code_id] = (inc_vector, corr_vector)
        print which, len(test_data_this_fold),
        np.save(os.path.join(target_bin_dir, 'test_%s.npy' % which),