def _train(args):
    """Train the NADE model.

    :param args: parsed arguments
    """
    if K.backend() != 'tensorflow':
        print("This repository only supports the TensorFlow backend.")
        raise NotImplementedError()

    batch_size_ = 512
    nb_users = 6040
    nb_movies = 3706
    data_sample = 1.0
    input_dim0 = 6040
    input_dim1 = 5
    std = 0.0
    alpha = 1.0

    print('Loading data...')
    train_file_list = sorted(glob.glob(os.path.join('data/train_set', 'part*')))
    val_file_list = sorted(glob.glob(os.path.join('data/val_set/', 'part*')))
    test_file_list = sorted(glob.glob(os.path.join('data/test_set/', 'part*')))
    # Drop empty part files.
    train_file_list = [dfile for dfile in train_file_list if os.stat(dfile).st_size != 0]
    val_file_list = [dfile for dfile in val_file_list if os.stat(dfile).st_size != 0]
    test_file_list = [dfile for dfile in test_file_list if os.stat(dfile).st_size != 0]

    print("Shuffle the data...")
    random.shuffle(train_file_list)
    random.shuffle(val_file_list)
    random.shuffle(test_file_list)
    train_file_list = train_file_list[:max(int(len(train_file_list) * data_sample), 1)]

    print('Instantiate DataSet classes...')
    train_set = DataSet(train_file_list, num_users=nb_users,
                        num_items=nb_movies, batch_size=batch_size_, mode=0)
    val_set = DataSet(val_file_list, num_users=nb_users,
                      num_items=nb_movies, batch_size=batch_size_, mode=1)
    test_set = DataSet(test_file_list, num_users=nb_users,
                       num_items=nb_movies, batch_size=batch_size_, mode=2)

    # Initialise the output bias from the marginal rating frequencies
    # observed on the validation batches.
    rating_freq = np.zeros((6040, 5))
    init_b = np.zeros((6040, 5))
    for batch in val_set.generate(max_iters=1):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']
        rating_freq += inp_r.sum(axis=0)
    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]
    # Items that never appear in the validation batches.
    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1), name='output_ratings')
    input_masks = Input(shape=(input_dim0,), name='input_masks')
    output_masks = Input(shape=(input_dim0,), name='output_masks')

    print("Build NADE architecture...")
    # nade_layer = Dropout(0.0)(input_layer)
    nade_layer = input_layer
    nade_layer = NADE(hidden_dim=args.hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02),
                      args=args)(nade_layer)
    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)
    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)
    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)
    loss_out = Lambda(rating_cost_lambda_func, output_shape=(1,), name='nade_loss')(
        [nade_layer, output_ratings, input_masks, output_masks, D, d])

    cf_nade_model = Model(
        inputs=[input_layer, output_ratings, input_masks, output_masks],
        outputs=[loss_out, predicted_ratings])
    print("Get NADE model summary...")
    cf_nade_model.summary()

    # Use the Adam optimizer; the loss is computed inside the graph, so the
    # compiled loss simply passes it through.
    adam = Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    cf_nade_model.compile(loss={'nade_loss': lambda y_true, y_pred: y_pred},
                          optimizer=adam)

    # Create an EvaluationCallback for the NADE model on the train and
    # validation sets.
    train_evaluation_callback = EvaluationCallback(data_set=train_set,
                                                   new_items=new_items,
                                                   training_set=True)
    valid_evaluation_callback = EvaluationCallback(data_set=val_set,
                                                   new_items=new_items,
                                                   training_set=False)

    print('Training...')
    cf_nade_model.fit_generator(
        train_set.generate(),
        steps_per_epoch=(train_set.get_corpus_size() // batch_size_),
        epochs=args.n_epochs,
        validation_data=val_set.generate(),
        validation_steps=(val_set.get_corpus_size() // batch_size_),
        shuffle=True,
        callbacks=[train_set, val_set,
                   train_evaluation_callback, valid_evaluation_callback],
        verbose=1)

    print('Testing...')
    rate_score = np.array([1, 2, 3, 4, 5], np.float32)
    squared_error = []
    n_samples = []
    for i, batch in enumerate(test_set.generate(max_iters=1)):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']

        pred_batch = cf_nade_model.predict(batch[0])[1]
        true_r = out_r.argmax(axis=2) + 1
        # Expected rating under the predicted distribution.
        pred_r = (pred_batch * rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)
        # Items unseen during initialisation get the neutral rating 3.
        pred_r[:, new_items] = 3
        mask = out_r.sum(axis=2)

        se = np.sum(np.square(true_r - pred_r) * mask)
        n = np.sum(mask)
        squared_error.append(se)
        n_samples.append(n)

    total_squared_error = np.array(squared_error).sum()
    total_n_samples = np.array(n_samples).sum()
    rmse = np.sqrt(total_squared_error / (total_n_samples * 1.0 + 1e-8))
    print("test set RMSE is %f" % rmse)
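# A minimal sketch of how `_train` might be invoked. Assumptions: only
# hidden_dim, learning_rate and n_epochs are read from `args` above, but the
# NADE layer also receives the whole `args` object and may read further
# fields; the flag names and defaults here are illustrative, not from the source.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Train CF-NADE on the rating data.')
    parser.add_argument('--hidden_dim', type=int, default=500)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=30)
    _train(parser.parse_args())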
def init_model(opts):
    model_config = opts["model"]
    # 28x28 images flattened to 784 visible units.
    model_config["in_feats"] = 28 * 28
    model = NADE(**model_config)
    return model
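# A minimal usage sketch. Assumptions: `opts` is a parsed experiment config and
# the NADE constructor accepts `in_feats` plus the remaining keys of
# opts["model"]; the key name `hidden_dim` is illustrative, not from the source.
opts = {"model": {"hidden_dim": 500}}
model = init_model(opts)  # NADE over 784 visible units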
import copy
import pickle
import random

import numpy as np
import theano
from scipy.spatial.distance import cdist

# Project-local modules assumed available: NADE, dA, HIFF, SequenceDataset,
# sgd_optimizer, ensure_dir.

cache = False  # assumed module-level flag used by the fitness functions


class AESolver(object):
    """The Denoising Autoencoder Genetic Algorithm."""

    def __init__(self, fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128, K=2, P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            # Load the knapsack instance (as in the extended variant below).
            self.knapsack = pickle.load(open("weing8.pkl"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness

    def generate_random_string(self, l=20):
        return [random.choice([0, 1]) for i in range(l)]

    def knapsack_fitness(self, string):
        knapsack = self.knapsack
        weights = []
        for i, c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i]) * string))
        # Penalise overweight solutions by their total excess weight.
        over = 0
        for i, w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values) * string)
            return _fitness

    def hiff_fitness(self, string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self, string):
        fitness = np.sum(string ^ self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self, _string):
        string = _string ^ self.mask
        fitness = sum(string[0:len(string) / 2]) - sum(string[len(string) / 2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def tournament_selection_replacement(self, population,
                                         fitnesses=None, pop_size=None):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            child_1 = int(np.random.random() * pop_size)
            child_2 = int(np.random.random() * pop_size)
            if fitnesses[child_1] > fitnesses[child_2]:
                new_population.append(copy.deepcopy(population[child_1]))
            else:
                new_population.append(copy.deepcopy(population[child_2]))
        return new_population

    def get_good_strings(self, strings, lim=20, unique=False, fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        # Indices of the strings, sorted by descending fitness.
        sorted_fitnesses = sorted(range(len(fitnesses)), key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(fitnesses[sorted_fitnesses[index]])
                index += 1
            if len(good_pop) == lim:
                return [good_pop, good_pop_fitnesses]
            else:
                # Pad with random strings if too few unique ones were found.
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(l=len(strings[0])))
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop, good_pop_fitnesses]

    def RTR(self, population, sampled_population,
            population_fitnesses, sample_fitnesses, w=None):
        # Restricted tournament replacement: each sampled individual replaces
        # its nearest (Hamming) neighbour within a random window of size w,
        # if it is fitter.
        if w is None:
            w = len(population) / 20
        _population = np.array(population)
        for ind_i, individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes], [individual], "hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self, strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self, data, corruption_level=0.2, num_epochs=200,
                 lr=0.1, output_folder="", iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data, batch_size=20, number_batches=None)
        sgd_optimizer(self.dA.params, [self.dA.input], self.dA.cost, train_set,
                      lr=lr, num_epochs=num_epochs, save=False,
                      output_folder=output_folder, iteration=iteration)

    def train_NADE(self, data, num_epochs=200, lr=0.1,
                   output_folder="", iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data, batch_size=20, number_batches=None)
        sgd_optimizer(self.NADE.params, [self.NADE.v], self.NADE.cost, train_set,
                      lr=lr, num_epochs=num_epochs, save=False,
                      output_folder=output_folder, iteration=iteration)

    def build_sample_dA(self):
        self.sample_dA = theano.function([self.dA.input], self.dA.sample)

    def iterative_algorithm(self, name, pop_size=100, genome_length=20,
                            lim_percentage=20, corruption_level=0.2,
                            num_epochs=50, lr=0.1, max_evaluations=200000,
                            unique_training=False, hiddens=300, rtr=True, w=10):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        trials = max_evaluations / pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(
            np.mean(self.population_fitnesses),
            np.min(self.population_fitnesses),
            np.max(self.population_fitnesses),
            np.std(self.population_fitnesses)))
        for iteration in range(0, trials):
            print "iteration:", iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population)
            good_strings, good_strings_fitnesses = self.get_good_strings(
                population,
                population_limit,
                unique=unique_training,
                fitnesses=self.population_fitnesses)
            print "training A/E"
            training_data = np.array(good_strings)
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print "sampling..."
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                # Elitism: keep the best training string and its fitness.
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:", (
                self.fitness(new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
        fitfile.close()
        return new_population
import configparser
import os

import numpy as np
import pandas as pd

# Project-local modules assumed available: SMILESEncoder, FBRNN, ForwardRNN,
# BIMODAL, NADE, clean_molecule, check_valid.


class Sampler():

    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read the parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]

        # Strip start, end and padding tokens from the data
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)

    def sample(self, N=100, stor_dir='../evaluation', T=0.7, fold=[1],
               epoch=[9], valid=True, novel=True, unique=True, write_csv=True):
        '''Sample from a model where the number of novel, valid, unique
        molecules is fixed.

        :param stor_dir:  directory where the generated SMILES are saved
        :param N:         number of samples
        :param T:         sampling temperature
        :param fold:      folds to use for sampling
        :param epoch:     epochs to use for sampling
        :param valid:     if True, only accept valid SMILES
        :param novel:     if True, only accept novel SMILES
        :param unique:    if True, only accept unique SMILES
        :param write_csv: if True, the generated SMILES are written to stor_dir
        :return: res_molecules: list with all the generated SMILES
        '''
        res_molecules = []
        print('Sampling: started')
        for f in fold:
            for e in epoch:
                self._model.build(stor_dir + '/' + self._experiment_name +
                                  '/models/model_fold_' + str(f) +
                                  '_epochs_' + str(e))

                new_molecules = []
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove remains from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get a new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get a new molecule
                    if unique and (new_mol in new_molecules):
                        continue

                    # If not novel, get a new molecule
                    if novel and (new_mol in self._data):
                        continue

                    # All conditions passed: keep the molecule
                    new_molecules.append(new_mol)

                # Prepare the file name
                name = ('molecules_fold_' + str(f) + '_epochs_' + str(e) +
                        '_T_' + str(T) + '_N_' + str(N) + '.csv')
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store the final molecules
                if write_csv:
                    if not os.path.exists(stor_dir + '/' +
                                          self._experiment_name + '/molecules/'):
                        os.makedirs(stor_dir + '/' +
                                    self._experiment_name + '/molecules/')
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/molecules/' + name, header=None)

                res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
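# A minimal usage sketch. Assumptions: '../experiments/BIMODAL_test.ini' and
# the corresponding trained models exist under '../evaluation/'; the
# experiment name is illustrative, not from the source.
sampler = Sampler('BIMODAL_test')
molecules = sampler.sample(N=100, T=0.7, fold=[1], epoch=[9],
                           valid=True, novel=True, unique=True, write_csv=True)
print(molecules[0][:5])  # first five accepted SMILES for fold 1, epoch 9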
import configparser
import os

import numpy as np
import pandas as pd

# Project-local modules assumed available: SMILESEncoder, FBRNN, ForwardRNN,
# BIMODAL, NADE, clean_molecule, check_model, check_molecules.


class FineTuner():

    def __init__(self, experiment_name='ForwardRNN'):
        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        # Read the starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)

    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning and store the statistics.

        NOTE: The directory should be prepared with the correct name and model.
        NOTE: No validation is performed here; to sample molecules, the
              Sampler should be used.

        :param stor_dir: directory to store data
        :return:
        '''
        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE with random generation
        if self._model_type == 'NADE' and self._generation == 'random':
            # The first column stores the correct SMILES and the second column
            # stores the SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build the model from the pretrained starting point
        self._model.build(stor_dir + '/' + self._experiment_name + '/' +
                          self._start_model)

        # Store total statistics
        tot_stat = []

        # Only a single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # If the current epoch was already completed successfully,
                # restore it and skip; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())

                    # Skip this epoch
                    continue

                else:
                    restart = False

            # Train model (data reshaped from
            # (N_samples, N_augmentation, molecular_size, encoding_size)
            # to (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                self._data.reshape(-1, self._molecular_size, self._encoding_size),
                label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) +
                             '_epochs_' + str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) +
                '_epochs_' + str(i) + '.csv', header=None)

            # Store statistics
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv', header=None)
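# A minimal usage sketch. Assumptions: the .ini file provides a [FINETUNING]
# section whose start_model entry points to a pretrained model stored under
# '../evaluation/<experiment_name>/'; the experiment name is illustrative.
fine_tuner = FineTuner(experiment_name='ForwardRNN')
fine_tuner.fine_tuning(stor_dir='../evaluation/', restart=False)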
# Extended variant of AESolver (same imports as above), adding royal-road and
# Churchill's-road fitness functions, configurable tournament size, and early
# stopping at a known optimum.
class AESolver(object):
    """The Denoising Autoencoder Genetic Algorithm."""

    def __init__(self, fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128, K=2, P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.knapsack = pickle.load(open("weing8.pkl"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness
        elif self.FITNESS_F == "royal_road":
            self.fitness = self.royal_road
        elif self.FITNESS_F == "churchill":
            self.fitness = self.churchills_road
        # Known optimum of the Churchill's road instance (k=4, l=4); used for
        # early stopping in iterative_algorithm.
        self.optimum = 33

    def generate_random_string(self, l=20):
        return [random.choice([0, 1]) for i in range(l)]

    def churchills_road(self, input, k=4, l=4):
        fitness = 0
        for partitions in range(0, l):
            first_part = sum(input[partitions * k * 2:partitions * k * 2 + k])
            second_part = sum(
                input[(partitions * k * 2) + k:(partitions * k * 2) + k * 2])
            if first_part == k and second_part == 0:
                fitness += 8
            if first_part == 0 and second_part == k:
                fitness += 8
        # Bonus for matching blocks at both ends of the string.
        if sum(input[0:k]) == k and sum(input[len(input) - k:]) == k:
            fitness += 1
        if sum(input[0:k]) == 0 and sum(input[len(input) - k:]) == 0:
            fitness += 1
        return fitness

    def knapsack_fitness(self, string):
        knapsack = self.knapsack
        weights = []
        for i, c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i]) * string))
        # Penalise overweight solutions by their total excess weight.
        over = 0
        for i, w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values) * string)
            return _fitness

    def hiff_fitness(self, string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self, string):
        fitness = np.sum(string ^ self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self, _string):
        string = _string ^ self.mask
        fitness = sum(string[0:len(string) / 2]) - sum(string[len(string) / 2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def royal_road(self, string, order=8):
        """Royal Road function R1 as presented by Melanie Mitchell in
        "An Introduction to Genetic Algorithms"."""
        individual = string ^ self.mask
        nelem = len(individual) / order
        max_value = int(2**order - 1)
        total = 0
        for i in xrange(nelem):
            value = int(
                "".join(map(str, individual[i * order:i * order + order])), 2)
            total += int(order) * int(value / max_value)
        return total

    def tournament_selection_replacement(self, population, fitnesses=None,
                                         pop_size=None, tournament_size=2):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            contenders = np.random.randint(0, len(population), tournament_size)
            t_fitnesses = [fitnesses[c] for c in contenders]
            winner = copy.deepcopy(population[contenders[np.argmax(t_fitnesses)]])
            new_population.append(winner)
        return new_population

    def get_good_strings(self, strings, lim=20, unique=False, fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        # Indices of the strings, sorted by descending fitness.
        sorted_fitnesses = sorted(range(len(fitnesses)), key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(fitnesses[sorted_fitnesses[index]])
                index += 1
            if len(good_pop) == lim:
                return [good_pop, good_pop_fitnesses]
            else:
                # Pad with random strings if too few unique ones were found.
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(l=len(strings[0])))
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop, good_pop_fitnesses]

    def RTR(self, population, sampled_population,
            population_fitnesses, sample_fitnesses, w=None):
        # Restricted tournament replacement, as in the first variant above.
        if w is None:
            w = len(population) / 20
        _population = np.array(population)
        for ind_i, individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes], [individual], "hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self, strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self, data, corruption_level=0.2, num_epochs=200,
                 lr=0.1, output_folder="", iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data, batch_size=20, number_batches=None)
        sgd_optimizer(self.dA.params, [self.dA.input], self.dA.cost, train_set,
                      lr=lr, num_epochs=num_epochs, save=False,
                      output_folder=output_folder, iteration=iteration)

    def train_NADE(self, data, num_epochs=200, lr=0.1,
                   output_folder="", iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data, batch_size=20, number_batches=None)
        sgd_optimizer(self.NADE.params, [self.NADE.v], self.NADE.cost, train_set,
                      lr=lr, num_epochs=num_epochs, save=True,
                      output_folder=output_folder, iteration=iteration)

    def build_sample_dA(self):
        self.sample_dA = theano.function([self.dA.input], self.dA.sample)

    def iterative_algorithm(self, name, pop_size=100, genome_length=20,
                            lim_percentage=20, corruption_level=0.2,
                            num_epochs=50, lr=0.1, max_evaluations=200000,
                            unique_training=False, hiddens=300, rtr=True, w=10):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        trials = max_evaluations / pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(
            np.mean(self.population_fitnesses),
            np.min(self.population_fitnesses),
            np.max(self.population_fitnesses),
            np.std(self.population_fitnesses)))
        for iteration in range(0, trials):
            print "iteration:", iteration
            population = new_population
            self.population = new_population
            # Select training data by size-4 tournaments.
            rw = self.tournament_selection_replacement(
                population,
                fitnesses=self.population_fitnesses,
                pop_size=population_limit,
                tournament_size=4)
            if not rtr:
                good_strings, good_strings_fitnesses = self.get_good_strings(
                    population,
                    population_limit,
                    unique=unique_training,
                    fitnesses=self.population_fitnesses)
                training_data = np.array(good_strings)
            else:
                training_data = np.array(rw)
            print "training A/E"
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print "sampling..."
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                # Elitism: keep the best training string and its fitness.
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:", (self.fitness(
                new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
            # Stop once the known optimum is reached and dump the final state.
            if np.max(self.population_fitnesses) == self.optimum:
                pickle.dump(
                    {"pop": self.population,
                     "fitnesses": self.population_fitnesses,
                     "iteration": iteration},
                    open("final_shit.pkl", "w"))
                break
        fitfile.close()
        return new_population
# Define datapoints for each class: four clusters of binary-ish patterns
# with small Gaussian noise around distinct prototypes.
inps = torch.stack([
    torch.randn(num_samples_per_class, inp_dimensions) / 10 +
    torch.tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 1]),
    torch.randn(num_samples_per_class, inp_dimensions) / 10 +
    torch.tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 1]),
    torch.randn(num_samples_per_class, inp_dimensions) / 10 +
    torch.tensor([1, 1, 0, 1, 1, 0, 0, 1, 0, 0]),
    torch.randn(num_samples_per_class, inp_dimensions) / 10 +
    torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])],
    dim=0)

# Define one model per class.
models = [NADE(inp_dimensions, inp_dimensions // 2) for _ in range(num_classes)]

# Train each model one by one.
for inp, model in zip(inps, models):
    # Optimization scheme.
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for _ in range(num_training_iterations):
        # Zero out previous gradients.
        model.zero_grad()
        # Compute log-likelihoods per sample.
        log_likelihoods = model(inp)
        # Completion of the truncated snippet, assuming model(inp) returns
        # per-sample log-likelihoods: minimise the negative mean log-likelihood.
        loss = -log_likelihoods.mean()
        loss.backward()
        optimizer.step()
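# Why one model per class: each NADE acts as a class-conditional density, so a
# test point can be scored by every model and assigned to the class whose model
# gives it the highest log-likelihood (a uniform class prior is assumed; this
# classification sketch is an illustration, not part of the original snippet).
def classify(x, models):
    # (N, num_classes) matrix of per-model log-likelihoods.
    scores = torch.stack([model(x) for model in models], dim=1)
    return scores.argmax(dim=1)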
import configparser
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.utils import shuffle

# Project-local modules assumed available: SMILESEncoder, FBRNN, ForwardRNN,
# BIMODAL, NADE, clean_molecule, check_model, check_molecules.


class Trainer():

    def __init__(self, experiment_name='ForwardRNN'):
        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)

    def complete_run(self, stor_dir='../evaluation/', restart=False):
        '''Training on the complete data, without validation.'''

        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE with random generation
        if self._model_type == 'NADE' and self._generation == 'random':
            # The first column stores the correct SMILES and the second column
            # stores the SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model
        self._model.build()

        # Store total statistics
        tot_stat = []

        # Only a single fold
        fold = 1

        # Shuffle data before training (data reshaped from
        # (N_samples, N_augmentation, molecular_size, encoding_size)
        # to (all_SMILES, molecular_size, encoding_size))
        self._data, label = shuffle(
            self._data.reshape(-1, self._molecular_size, self._encoding_size),
            label.reshape(-1, self._molecular_size))

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # With restart, read the existing statistics file
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # If the current epoch was already completed successfully,
                # restore it and skip; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model
            statistic = self._model.train(self._data, label, epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) +
                             '_epochs_' + str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) +
                '_epochs_' + str(i) + '.csv', header=None)

            # Store statistics
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv', header=None)

    def single_run(self, stor_dir='../evaluation/', restart=False):
        '''Training with validation; store the data.'''

        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/validation')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE with random generation
        if (self._model_type == 'NADE' or
                self._model_type == 'NADE_v2') and self._generation == 'random':
            # The first column stores the correct SMILES and the second column
            # stores the SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        train_data, test_data, train_label, test_label = train_test_split(
            self._data, label, test_size=1. / 5, random_state=1, shuffle=True)

        # Build model
        self._model.build()

        # Store total statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # Only a single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_val_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # If the current epoch was already completed successfully,
                # restore it and skip; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_val_file.shape[0] > i and \
                        tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss lists
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                    tot_loss.append(tmp_val_file[i, 1])

                    # Skip this epoch
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model (data reshaped from
            # (N_samples, N_augmentation, molecular_size, encoding_size)
            # to (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                train_data.reshape(-1, self._molecular_size, self._encoding_size),
                train_label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) +
                             '_epochs_' + str(i))

            # Test model on validation set
            tot_loss.append(self._model.validate(
                test_data.reshape(-1, self._molecular_size, self._encoding_size),
                test_label.reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) +
                '_epochs_' + str(i) + '.csv', header=None)

            # Store statistics
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv', header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/validation/val_fold_' + str(fold) + '.csv', header=None)

    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store the data.'''

        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/validation')

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Count iterations
        fold = 0

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE with random generation
        if (self._model_type == 'NADE') and self._generation == 'random':
            # The first column stores the correct SMILES and the second column
            # stores the SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test sets
        for train, test in self._kf.split(self._data):

            # Shuffle indices within the train and test sets
            np.random.shuffle(train)
            np.random.shuffle(test)

            fold += 1

            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    tmp_val_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/validation/val_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()
                    tmp_stat_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/statistic/stat_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    # If the current epoch was already completed successfully,
                    # restore it and skip; otherwise continue with normal training
                    if check_model(self._model_type, self._experiment_name,
                                   stor_dir, fold, i) and check_molecules(
                                       self._experiment_name, stor_dir, fold,
                                       i) and tmp_val_file.shape[0] > i and \
                            tmp_stat_file.shape[0] > i:

                        # Load model
                        self._model.build(
                            stor_dir + '/' + self._experiment_name +
                            '/models/model_fold_' + str(fold) +
                            '_epochs_' + str(i))

                        # Fill statistic and loss lists
                        tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    else:
                        restart = False

                # Train model (data reshaped from
                # (N_samples, N_augmentation, molecular_size, encoding_size)
                # to (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    self._data[train].reshape(-1, self._molecular_size,
                                              self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)
                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(stor_dir + '/' + self._experiment_name +
                                 '/models/model_fold_' + str(fold) +
                                 '_epochs_' + str(i))

                # Test model on validation set
                tot_loss.append(self._model.validate(
                    self._data[test].reshape(-1, self._molecular_size,
                                             self._encoding_size),
                    label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for s in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(clean_molecule(mol[0], self._model_type))

                # Store new molecules
                new_molecules = np.array(new_molecules)
                pd.DataFrame(new_molecules).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/molecules/molecule_fold_' + str(fold) +
                    '_epochs_' + str(i) + '.csv', header=None)

                # Store statistics
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(np.array(store_stat)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv', header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv', header=None)
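# A minimal usage sketch. Assumptions: '../experiments/BIMODAL_test.ini' and
# the data file it references exist; the experiment name is illustrative.
trainer = Trainer(experiment_name='BIMODAL_test')
trainer.cross_validation(stor_dir='../evaluation/', restart=False)  # n_folds from the .ini
# Alternatively: trainer.single_run() for a single train/validation split, or
# trainer.complete_run() to train on the complete data set.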