def __init__(self, data_folder, shuffle=True, load_only_dicts=False):
    """Load the token dictionaries and, optionally, the example splits.

    Args:
        data_folder: Directory handed to ``load_dictionaries`` and to
            ``self._deserialize``.
        shuffle: When true, load the splits from the ``shuffled``
            sub-folder of ``data_folder``; if that load raises
            ``IOError``, load the splits from ``data_folder``, shuffle
            them in place and save them under ``shuffled``. When false,
            load the splits from ``data_folder`` and leave their order
            untouched.
        load_only_dicts: When true, return as soon as the dictionaries
            are loaded; no example split is read.

    Raises:
        AssertionError: If ``load_dictionaries`` yields ``None`` for
            either dictionary.
    """
    # Fixed seed, so a freshly generated shuffle is reproducible.
    self.rng = np.random.RandomState(1189)

    self.tl_dict, self.rev_tl_dict = load_dictionaries(data_folder)
    assert self.tl_dict is not None and self.rev_tl_dict is not None

    if load_only_dicts:
        return

    if not shuffle:
        self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
            data_folder)
        return

    shuffled_folder = os.path.join(data_folder, 'shuffled')

    # Only the cache load is guarded: an IOError here means "no usable
    # shuffled copy yet", which is handled by generating one below.
    try:
        self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
            shuffled_folder)
    except IOError:
        pass
    else:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Successfully loaded shuffled data.")
        sys.stdout.flush()
        return

    print("Generating shuffled data...")
    sys.stdout.flush()

    self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
        data_folder)

    # The splits are shuffled in this exact order; changing it would
    # change the permutation each split receives from the seeded RNG.
    self.rng.shuffle(self.train_ex)
    self.rng.shuffle(self.valid_ex)
    self.rng.shuffle(self.test_ex)

    # Persist the shuffled splits so later runs take the fast path above.
    # NOTE(review): presumably these are the file names _deserialize
    # reads back -- confirm against its implementation.
    make_dir_if_not_exists(shuffled_folder)
    for file_name, examples in (
            ('examples-train.npy', self.train_ex),
            ('examples-validation.npy', self.valid_ex),
            ('examples-test.npy', self.test_ex)):
        np.save(os.path.join(shuffled_folder, file_name), examples)
for problem_id, test_programs_ in bin_seeded_test_data.iteritems(): seeded_test_data[bin_id][problem_id] = [] for incorrect_program, name_dict, name_sequence, user_id, code_id in test_programs_: for row in cursor.execute( 'SELECT tokenized_code from Code where code_id=?;', (code_id, )): correct_program = str(row[0]) seeded_test_data[bin_id][problem_id].append( (code_id, convert_to_rla_format(incorrect_program), convert_to_rla_format(correct_program))) skipped = 0 for bin_id in range(5): print 'bin_%d' % bin_id, target_bin_dir = os.path.join(RLAssist_base_dir, 'bin_%d' % bin_id) tl_dict, _ = load_dictionaries(target_bin_dir) for which, test_data in [('raw', raw_test_data), ('seeded', seeded_test_data)]: test_data_this_fold = {} for problem_id in test_data[bin_id]: for code_id, inc_tokens, cor_tokens in test_data[bin_id][ problem_id]: inc_vector = vectorize(inc_tokens, tl_dict) corr_vector = vectorize(cor_tokens, tl_dict) if inc_vector is None or corr_vector is None: skipped += 1 continue test_data_this_fold[code_id] = (inc_vector, corr_vector) print which, len(test_data_this_fold), np.save(os.path.join(target_bin_dir, 'test_%s.npy' % which),