def save_bins(destination, tl_dict, token_vectors, bins):
    # Flatten the bins into the full list of problem ids.
    full_list = []
    for bin_ in bins:
        for problem_id in bin_:
            full_list.append(problem_id)

    for i, bin_ in enumerate(bins):
        test_problems = bin_
        training_problems = list(set(full_list) - set(bin_))
        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}

        for problem_id in training_problems:
            if problem_id in token_vectors['train']:
                token_vectors_this_fold['train'] += token_vectors['train'][problem_id]
            # Guard the validation lookup separately: a problem id present in
            # the train split is not guaranteed to appear in the validation
            # split, and an unguarded lookup would raise KeyError.
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['validation'] += token_vectors['validation'][problem_id]

        for problem_id in test_problems:
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['test'] += token_vectors['validation'][problem_id]

        make_dir_if_not_exists(os.path.join(destination, 'bin_%d' % i))
        print "Fold %d: Train:%d Validation:%d Test:%d" % (
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test']))
        save_pairs(os.path.join(destination, 'bin_%d' % i),
                   token_vectors_this_fold, tl_dict)
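# make_dir_if_not_exists is called throughout this section but not defined in
# it. A minimal sketch, assuming it simply wraps os.makedirs and tolerates an
# already-existing directory (a hypothetical stand-in, not the project's
# actual helper):
import errno
import os

def make_dir_if_not_exists(path):
    try:
        os.makedirs(path)
    except OSError as e:
        # Ignore "already exists"; re-raise anything else (permissions, etc.).
        if e.errno != errno.EEXIST:
            raise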
def my_save_bins(destination, tl_dict, token_vectors, rng):
    # list() so the key view can be shuffled in place.
    full_list = list(get_cs_tokenized().keys())
    rng.shuffle(full_list)
    fold_n = 5
    bins = []
    # Split the shuffled problem ids into fold_n nearly equal contiguous bins.
    for i in range(fold_n):
        bins.append(full_list[len(full_list) * i // fold_n:
                              len(full_list) * (i + 1) // fold_n])
    for i, bin_ in enumerate(bins):
        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}
        # Problems outside the held-out bin are split 80/20 into
        # train/validation; the bin itself becomes the test set.
        for problem_id in set(full_list) - set(bin_):
            if rng.rand() < 0.8:
                token_vectors_this_fold['train'] += token_vectors['train'][problem_id]
            else:
                token_vectors_this_fold['validation'] += token_vectors['train'][problem_id]
        for problem_id in bin_:
            token_vectors_this_fold['test'] += token_vectors['train'][problem_id]
        print('Fold {}: (Train, Validation, Test) == ({} {} {})'.format(
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test'])))
        make_dir_if_not_exists(os.path.join(destination, 'bin_{}'.format(i)))
        save_pairs(os.path.join(destination, 'bin_{}'.format(i)),
                   token_vectors_this_fold, tl_dict)
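# The integer-division slicing above spreads the shuffled ids over fold_n
# nearly equal contiguous bins, with any remainder absorbed by the later
# bins. A standalone illustration with hypothetical ids:
ids = list(range(11))
fold_n = 5
bins = [ids[len(ids) * i // fold_n:len(ids) * (i + 1) // fold_n]
        for i in range(fold_n)]
assert bins == [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9, 10]]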
def __init__(self, data_folder, shuffle=True, load_only_dicts=False):
    self.rng = np.random.RandomState(1189)
    self.tl_dict, self.rev_tl_dict = load_dictionaries(data_folder)
    assert self.tl_dict is not None and self.rev_tl_dict is not None
    if load_only_dicts:
        return
    if not shuffle:
        self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
            data_folder)
    else:
        try:
            # Prefer the cached shuffled copy if a previous run saved one.
            self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                os.path.join(data_folder, 'shuffled'))
            print "Successfully loaded shuffled data."
            sys.stdout.flush()
        except IOError:
            print "Generating shuffled data..."
            sys.stdout.flush()
            self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                data_folder)
            self.rng.shuffle(self.train_ex)
            self.rng.shuffle(self.valid_ex)
            self.rng.shuffle(self.test_ex)
            # Cache the shuffled splits for later runs.
            make_dir_if_not_exists(os.path.join(data_folder, 'shuffled'))
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-train.npy'), self.train_ex)
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-validation.npy'), self.valid_ex)
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-test.npy'), self.test_ex)
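# The hard-coded seed (1189) makes the shuffle-and-cache step reproducible:
# two RandomState instances built from the same seed yield the same
# permutation, so regenerating the 'shuffled' cache produces identical files.
import numpy as np

a = list(range(5))
b = list(range(5))
np.random.RandomState(1189).shuffle(a)
np.random.RandomState(1189).shuffle(b)
assert a == b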
def save_bins(destination, tl_dict, token_vectors, rng):
    fold_n = 5
    bins = []
    # list() so the key view can be shuffled in place.
    full_list = list(get_cs_tokenized().keys())
    rng.shuffle(full_list)
    for i in range(fold_n):
        bins.append(full_list[len(full_list) * i // fold_n:
                              len(full_list) * (i + 1) // fold_n])
    for i, bin_ in enumerate(bins):
        test_problems = bin_
        training_problems = list(set(full_list) - set(bin_))
        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}
        for problem_id in training_problems:
            if problem_id in token_vectors['train']:
                token_vectors_this_fold['train'] += token_vectors['train'][problem_id]
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['validation'] += token_vectors['validation'][problem_id]
        for problem_id in test_problems:
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['test'] += token_vectors['validation'][problem_id]
        make_dir_if_not_exists(os.path.join(destination, 'bin_%d' % i))
        print "Fold %d: Train:%d Validation:%d Test:%d" % (
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test']))
        save_pairs(os.path.join(destination, 'bin_%d' % i),
                   token_vectors_this_fold, tl_dict)
drop_ids = kind_mutations == 'typo'
max_program_length = 450
min_program_length = 75
max_fix_length = 25
max_mutations = 5
max_variants = 4 if kind_mutations == 'ids' else 2

db_path = os.path.join('data', 'iitk-dataset', 'dataset.db')
validation_users = np.load(
    os.path.join('data', 'iitk-dataset', 'validation_users.npy')).item()
bins = np.load(os.path.join('data', 'iitk-dataset', 'bins.npy'))

seed = 1189
output_directory = os.path.join('data/network_inputs',
                                'iitk-%s-%d' % (kind_mutations, seed))
print('output_directory:', output_directory)
make_dir_if_not_exists(output_directory)

result = generate_training_data(db_path, bins, validation_users,
                                min_program_length, max_program_length,
                                max_fix_length, kind_mutations,
                                max_mutations, max_variants, seed)
np.save(os.path.join(output_directory, 'testing-tokenized-examples.npy'),
        result)
print('\n\n--------------- all outputs written to {} ---------------\n\n'.
      format(output_directory))
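# validation_users above is read back with .item() because np.save pickles a
# plain dict into a 0-d object array; .item() unwraps it again. A minimal
# round trip with a hypothetical payload (recent numpy additionally requires
# allow_pickle=True on load):
import numpy as np

payload = {'user_1': [1, 2, 3]}
np.save('demo.npy', payload)
restored = np.load('demo.npy', allow_pickle=True).item()
assert restored == payload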
def __init__(self, data_folder, load_real_test_data=False,
             load_seeded_test_data=False, load_only_dicts=False,
             shuffle=False, seed=1189):
    self.rng = np.random.RandomState(seed)
    self.tl_dict, self.rev_tl_dict = load_dictionaries(data_folder)
    assert self.tl_dict is not None and self.rev_tl_dict is not None
    assert self.tl_dict['-new-line-'] == 2
    if load_only_dicts:
        return
    if load_real_test_data:
        try:
            self.real_test_data = np.load(
                os.path.join(data_folder, 'test_real_raw.npy'),
                allow_pickle=True).item()
        except IOError:
            self.real_test_data = np.load(
                os.path.join(data_folder, 'test_raw.npy'),
                allow_pickle=True).item()
    if load_seeded_test_data:
        try:
            self.seeded_test_data = np.load(
                os.path.join(data_folder, 'test_real_seeded.npy'),
                allow_pickle=True).item()
        except IOError:
            self.seeded_test_data = np.load(
                os.path.join(data_folder, 'test_seeded.npy'),
                allow_pickle=True).item()
    try:
        self.name_dict_store = np.load(
            os.path.join(data_folder, 'name_dict_store.npy'),
            allow_pickle=True).item()
        # Also load raw_test's name_dict here, then merge the two together.
        self.test_name_dict_store = np.load(
            os.path.join(data_folder, 'test_name_dict_store.npy'),
            allow_pickle=True).item()
        self.name_dict_store = dict(self.name_dict_store,
                                    **self.test_name_dict_store)
    except IOError:
        print 'init name_dict_store with {}'
        self.name_dict_store = {}
    if not shuffle:
        # Load originals
        self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
            data_folder)
        print "Successfully loaded data."
    else:
        try:
            # Try to load pre-generated shuffled data...
            self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                os.path.join(data_folder, 'shuffled'))
            print "Successfully loaded shuffled data."
        # ...or generate it
        except IOError:
            print "Generating shuffled data..."
            sys.stdout.flush()
            # Load originals
            self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                data_folder)
            # Shuffle
            self.rng.shuffle(self.train_ex)
            self.rng.shuffle(self.valid_ex)
            self.rng.shuffle(self.test_ex)
            # Save for later
            make_dir_if_not_exists(os.path.join(data_folder, 'shuffled'))
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-train.npy'), self.train_ex)
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-validation.npy'), self.valid_ex)
            np.save(os.path.join(data_folder, 'shuffled',
                                 'examples-test.npy'), self.test_ex)
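# dict(a, **b), as used for name_dict_store above, merges two dicts with the
# second one winning on key collisions. A quick illustration with
# hypothetical entries:
a = {'prog_1': 'x', 'prog_2': 'y'}
b = {'prog_2': 'z'}
assert dict(a, **b) == {'prog_1': 'x', 'prog_2': 'z'}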
parser.add_argument('-v', '--vram', help='Fraction of GPU memory to use',
                    type=float, default=1.0)
args = parser.parse_args()

# Default checkpoints directory
if args.checkpoints_directory is None:
    checkpoints_directory = os.path.join('checkpoints', 'fold_%d' % args.fold)
else:
    checkpoints_directory = args.checkpoints_directory

# Make checkpoint directories
make_dir_if_not_exists(checkpoints_directory)
make_dir_if_not_exists(os.path.join(checkpoints_directory, 'best'))

# Print options
print 'Checkpoint every:', args.ckpt_every
print 'Batch size:', args.batch_size
print 'Embedding dim:', args.embedding_dim
print 'Memory dim:', args.memory_dim
print 'Layers:', args.num_layers
print 'Epochs:', args.epochs
print 'Resume at:', args.resume_at
print 'Resume epoch:', args.resume_epoch
print 'Resume training minibatch:', args.resume_training_minibatch
print 'RNN cell:', args.rnn_cell
print 'Bidirectional:', args.bidirectional
sys.stdout.flush()
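# The --vram fraction is presumably handed to TensorFlow to cap this
# process's share of GPU memory; a sketch assuming the TF1-era API
# (hypothetical wiring, not shown in this section):
import tensorflow as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.vram)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))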
'''data/results/''')
args.checkpoint_directory = args.checkpoint_directory + \
    ('' if args.checkpoint_directory.endswith('/') else '/')

# Recover the bin id from the checkpoint path. With the trailing slash
# ensured above, index -2 is the digit in 'bin_<d>/', so this only works
# for single-digit bin ids.
if 'bin_' not in args.checkpoint_directory:
    raise ValueError('ERROR: failed to find the bin id')
bin_id = int(args.checkpoint_directory[-2])
print 'bin_id:', bin_id

if args.database:
    database = args.database
else:
    make_dir_if_not_exists(database_path)
    database_name = args.which + '_' + args.task + '.db'
    database = os.path.join(database_path, database_name)
print 'using database:', database

if not args.data_directory:
    # Fall back to the data directory recorded at training time.
    training_args = np.load(
        os.path.join(args.checkpoint_directory,
                     'experiment-configuration.npy')).item()['args']
    args.data_directory = training_args.data_directory
print 'data directory:', args.data_directory

conn = sqlite3.connect(database)
c = conn.cursor()
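# How the bin-id parse above behaves: with the trailing slash guaranteed,
# index -2 is the character just before it, so 'bin_3/' yields 3 while a
# two-digit 'bin_12/' would be misread. Hypothetical path:
path = 'data/results/bin_3/'
assert int(path[-2]) == 3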
def __init__(self, fold, shuffle=True):
    data_folder = os.path.join('network_inputs')
    fold_folder = os.path.join(data_folder, 'fold_%d' % fold)
    self.tl_dict = np.load(os.path.join(data_folder,
                                        'translate_dict.npy')).item()
    if not shuffle:
        # Load originals
        self.train_x = np.load(os.path.join(fold_folder, 'mutated-train.npy'))
        self.train_y = np.load(os.path.join(fold_folder, 'fixes-train.npy'))
        self.valid_x = np.load(
            os.path.join(fold_folder, 'mutated-validation.npy'))
        self.valid_y = np.load(
            os.path.join(fold_folder, 'fixes-validation.npy'))
        self.test_x = np.load(os.path.join(fold_folder, 'mutated-test.npy'))
        self.test_y = np.load(os.path.join(fold_folder, 'fixes-test.npy'))
    else:
        try:
            self.train_x = np.load(
                os.path.join(fold_folder, 'shuffled/mutated-train.npy'))
            self.train_y = np.load(
                os.path.join(fold_folder, 'shuffled/fixes-train.npy'))
            self.valid_x = np.load(
                os.path.join(fold_folder, 'shuffled/mutated-validation.npy'))
            self.valid_y = np.load(
                os.path.join(fold_folder, 'shuffled/fixes-validation.npy'))
            self.test_x = np.load(
                os.path.join(fold_folder, 'shuffled/mutated-test.npy'))
            self.test_y = np.load(
                os.path.join(fold_folder, 'shuffled/fixes-test.npy'))
            print "Successfully loaded shuffled data."
            sys.stdout.flush()
        # If not, generate it
        except IOError:
            print "Generating shuffled data..."
            sys.stdout.flush()
            # Load originals
            self.train_x = np.load(
                os.path.join(fold_folder, 'mutated-train.npy'))
            self.train_y = np.load(
                os.path.join(fold_folder, 'fixes-train.npy'))
            self.valid_x = np.load(
                os.path.join(fold_folder, 'mutated-validation.npy'))
            self.valid_y = np.load(
                os.path.join(fold_folder, 'fixes-validation.npy'))
            self.test_x = np.load(
                os.path.join(fold_folder, 'mutated-test.npy'))
            self.test_y = np.load(
                os.path.join(fold_folder, 'fixes-test.npy'))
            # Shuffle programs and fixes together so pairs stay aligned
            self.train_x, self.train_y = zip(
                *_shuffle(zip(list(self.train_x), list(self.train_y))))
            self.valid_x, self.valid_y = zip(
                *_shuffle(zip(list(self.valid_x), list(self.valid_y))))
            self.test_x, self.test_y = zip(
                *_shuffle(zip(list(self.test_x), list(self.test_y))))
            # Convert to np array
            self.train_x, self.train_y = np.array(self.train_x), np.array(
                self.train_y)
            self.valid_x, self.valid_y = np.array(self.valid_x), np.array(
                self.valid_y)
            self.test_x, self.test_y = np.array(self.test_x), np.array(
                self.test_y)
            # Save for later
            make_dir_if_not_exists(os.path.join(fold_folder, 'shuffled'))
            np.save(os.path.join(fold_folder, 'shuffled/mutated-train.npy'),
                    self.train_x)
            np.save(os.path.join(fold_folder, 'shuffled/fixes-train.npy'),
                    self.train_y)
            np.save(
                os.path.join(fold_folder, 'shuffled/mutated-validation.npy'),
                self.valid_x)
            np.save(
                os.path.join(fold_folder, 'shuffled/fixes-validation.npy'),
                self.valid_y)
            np.save(os.path.join(fold_folder, 'shuffled/mutated-test.npy'),
                    self.test_x)
            np.save(os.path.join(fold_folder, 'shuffled/fixes-test.npy'),
                    self.test_y)
    # Check that inputs and targets stay paired
    assert len(self.train_x) == len(self.train_y)
    assert len(self.valid_x) == len(self.valid_y)
    assert len(self.test_x) == len(self.test_y)
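# _shuffle is assumed to return a shuffled copy of a list of (program, fix)
# pairs; zip(*...) above then separates them again. A minimal stand-in (the
# real helper may seed its RNG differently):
import random

def _shuffle(pairs):
    pairs = list(pairs)
    random.shuffle(pairs)
    return pairs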