def __init__(self, experiment_name):
    """Restore training parameters, model and data for an experiment.

    Reads ``../experiments/<experiment_name>.ini``, instantiates the model
    named in its ``[MODEL]`` section and loads the SMILES data named in its
    ``[DATA]`` section.

    :param experiment_name: basename of the .ini file in ../experiments/
    :raises ValueError: if the configured model type is not supported
    :raises FileNotFoundError: if no matching data file exists in ../data/
    """
    self._encoder = SMILESEncoder()

    # Read parameters used during training
    self._config = configparser.ConfigParser()
    self._config.read('../experiments/' + experiment_name + '.ini')

    self._model_type = self._config['MODEL']['model']
    self._experiment_name = experiment_name
    self._hidden_units = int(self._config['MODEL']['hidden_units'])

    self._file_name = self._config['DATA']['data']
    self._encoding_size = int(self._config['DATA']['encoding_size'])
    self._molecular_size = int(self._config['DATA']['molecular_size'])

    self._epochs = int(self._config['TRAINING']['epochs'])
    self._n_folds = int(self._config['TRAINING']['n_folds'])
    self._learning_rate = float(self._config['TRAINING']['learning_rate'])
    self._batch_size = int(self._config['TRAINING']['batch_size'])

    self._samples = int(self._config['EVALUATION']['samples'])
    self._T = float(self._config['EVALUATION']['temp'])
    self._starting_token = self._encoder.encode(
        [self._config['EVALUATION']['starting_token']])

    # Instantiate the model matching the configured type.
    if self._model_type == 'FBRNN':
        self._model = FBRNN(self._molecular_size, self._encoding_size,
                            self._learning_rate, self._hidden_units)
    elif self._model_type == 'ForwardRNN':
        self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                 self._learning_rate, self._hidden_units)
    elif self._model_type == 'BIMODAL':
        self._model = BIMODAL(self._molecular_size, self._encoding_size,
                              self._learning_rate, self._hidden_units)
    elif self._model_type == 'NADE':
        self._generation = self._config['MODEL']['generation']
        self._missing_token = self._encoder.encode(
            [self._config['TRAINING']['missing_token']])
        self._model = NADE(self._molecular_size, self._encoding_size,
                           self._learning_rate, self._hidden_units,
                           self._generation, self._missing_token)
    else:
        # Fail loudly: the original fell through silently and left
        # self._model undefined, surfacing later as an AttributeError.
        raise ValueError('Unsupported model type: ' + self._model_type)

    # Read data (plain CSV, or xz-compressed archive as fallback)
    if os.path.isfile('../data/' + self._file_name + '.csv'):
        self._data = pd.read_csv(
            '../data/' + self._file_name + '.csv', header=None).values[:, 0]
    elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
        # Skip first line since empty and last line since nan
        self._data = pd.read_csv(
            '../data/' + self._file_name + '.tar.xz', compression='xz',
            header=None).values[1:-1, 0]
    else:
        # Fail loudly instead of leaving self._data undefined.
        raise FileNotFoundError(
            'No data file found in ../data/ for: ' + self._file_name)

    # Clean data from start, end and padding token
    for i, mol_dat in enumerate(self._data):
        self._data[i] = clean_molecule(mol_dat, self._model_type)
def __init__(self, experiment_name='ForwardRNN'):
    """Set up fine-tuning from the parameters of an experiment .ini file.

    Reads ``../experiments/<experiment_name>.ini``, builds the configured
    model, records the checkpoint to fine-tune from and encodes the
    fine-tuning SMILES data.

    :param experiment_name: basename of the .ini file in ../experiments/
    :raises ValueError: if the configured model type is not supported
    """
    self._encoder = SMILESEncoder()

    # Read all parameters from the .ini file
    self._config = configparser.ConfigParser()
    self._config.read('../experiments/' + experiment_name + '.ini')

    self._model_type = self._config['MODEL']['model']
    self._experiment_name = experiment_name
    self._hidden_units = int(self._config['MODEL']['hidden_units'])

    self._file_name = '../data/' + self._config['DATA']['data']
    self._encoding_size = int(self._config['DATA']['encoding_size'])
    self._molecular_size = int(self._config['DATA']['molecular_size'])

    self._epochs = int(self._config['TRAINING']['epochs'])
    self._learning_rate = float(self._config['TRAINING']['learning_rate'])
    self._batch_size = int(self._config['TRAINING']['batch_size'])

    self._samples = int(self._config['EVALUATION']['samples'])
    self._T = float(self._config['EVALUATION']['temp'])
    self._starting_token = self._encoder.encode(
        [self._config['EVALUATION']['starting_token']])

    # Checkpoint used as the starting point for fine-tuning
    self._start_model = self._config['FINETUNING']['start_model']

    # Instantiate the model matching the configured type.
    if self._model_type == 'FBRNN':
        self._model = FBRNN(self._molecular_size, self._encoding_size,
                            self._learning_rate, self._hidden_units)
    elif self._model_type == 'ForwardRNN':
        self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                 self._learning_rate, self._hidden_units)
    elif self._model_type == 'BIMODAL':
        self._model = BIMODAL(self._molecular_size, self._encoding_size,
                              self._learning_rate, self._hidden_units)
    elif self._model_type == 'NADE':
        self._generation = self._config['MODEL']['generation']
        self._missing_token = self._encoder.encode(
            [self._config['TRAINING']['missing_token']])
        self._model = NADE(self._molecular_size, self._encoding_size,
                           self._learning_rate, self._hidden_units,
                           self._generation, self._missing_token)
    else:
        # Fail loudly: the original fell through silently and left
        # self._model undefined, surfacing later as an AttributeError.
        raise ValueError('Unsupported model type: ' + self._model_type)

    # One-hot encoded SMILES used for fine-tuning
    self._data = self._encoder.encode_from_file(self._file_name)
def _train(args):
    """Train and evaluate the CF-NADE recommender.

    Builds the Keras graph, initialises rating-frequency statistics from the
    validation set, trains with ``fit_generator`` and reports RMSE on the
    test set.

    :param args: parsed arguments; uses ``hidden_dim``, ``learning_rate``
        and ``n_epochs``.
    :raises NotImplementedError: if the Keras backend is not TensorFlow
    """
    if K.backend() != 'tensorflow':
        print("This repository only support tensorflow backend.")
        raise NotImplementedError()
    # Corpus constants: 6040 users x 3706 movies, ratings encoded over 5
    # levels (input_dim1). input_dim0 mirrors nb_users.
    batch_size_ = 512
    nb_users = 6040
    nb_movies = 3706
    data_sample = 1.0  # fraction of training shards to keep (1.0 = all)
    input_dim0 = 6040
    input_dim1 = 5
    std = 0.0    # NOTE(review): unused in this function
    alpha = 1.0  # NOTE(review): unused in this function
    print('Loading data...')
    # Each split is stored as Hadoop-style 'part*' shards; empty shards are
    # filtered out before shuffling.
    train_file_list = sorted(glob.glob(os.path.join('data/train_set', 'part*')))
    val_file_list = sorted(glob.glob(os.path.join('data/val_set/', 'part*')))
    test_file_list = sorted(glob.glob(os.path.join('data/test_set/', 'part*')))
    train_file_list = [
        dfile for dfile in train_file_list if os.stat(dfile).st_size != 0
    ]
    val_file_list = [
        dfile for dfile in val_file_list if os.stat(dfile).st_size != 0
    ]
    test_file_list = [
        dfile for dfile in test_file_list if os.stat(dfile).st_size != 0
    ]
    print("Shuffle the data...")
    random.shuffle(train_file_list)
    random.shuffle(val_file_list)
    random.shuffle(test_file_list)
    # Optionally sub-sample the training shards (at least one shard kept).
    train_file_list = train_file_list[:max(
        int(len(train_file_list) * data_sample), 1)]
    print('Instantiate DataSet classes...')
    train_set = DataSet(train_file_list,
                        num_users=nb_users,
                        num_items=nb_movies,
                        batch_size=batch_size_,
                        mode=0)
    val_set = DataSet(val_file_list,
                      num_users=nb_users,
                      num_items=nb_movies,
                      batch_size=batch_size_,
                      mode=1)
    test_set = DataSet(test_file_list,
                       num_users=nb_users,
                       num_items=nb_movies,
                       batch_size=batch_size_,
                       mode=2)
    # Accumulate per-user rating frequencies over one pass of the validation
    # set; used to derive an initial bias (init_b) and to find items with no
    # observed ratings (new_items).
    rating_freq = np.zeros((6040, 5))
    init_b = np.zeros((6040, 5))
    for batch in val_set.generate(max_iters=1):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']  # NOTE(review): unused in this loop
        inp_m = batch[0]['input_masks']     # NOTE(review): unused in this loop
        out_m = batch[0]['output_masks']    # NOTE(review): unused in this loop
        rating_freq += inp_r.sum(axis=0)
    # init_b holds log-frequency deltas between adjacent rating levels.
    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]
    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]
    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')
    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')
    print("Build NADE architecture...")
    # nade_layer = Dropout(0.0)(input_layer)
    nade_layer = input_layer
    nade_layer = NADE(hidden_dim=args.hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02),
                      args=args)(nade_layer)
    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)
    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)
    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)
    # The loss is computed inside the graph; 'nade_loss' below just passes
    # this tensor through as the model loss.
    loss_out = Lambda(rating_cost_lambda_func, output_shape=(1, ),
                      name='nade_loss')([
                          nade_layer, output_ratings, input_masks,
                          output_masks, D, d
                      ])
    cf_nade_model = Model(
        inputs=[input_layer, output_ratings, input_masks, output_masks],
        outputs=[loss_out, predicted_ratings])
    print("Get NADE model summary...")
    cf_nade_model.summary()
    # Use Adam optimizer
    adam = Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # Compile NADE model; y_pred already IS the loss tensor (see loss_out).
    cf_nade_model.compile(loss={
        'nade_loss': lambda y_true, y_pred: y_pred
    }, optimizer=adam)
    # Create EvaluationCallback for NADE model on train and validation sets
    train_evaluation_callback = EvaluationCallback(data_set=train_set,
                                                   new_items=new_items,
                                                   training_set=True)
    valid_evaluation_callback = EvaluationCallback(data_set=val_set,
                                                   new_items=new_items,
                                                   training_set=False)
    print('Training...')
    # NOTE(review): train_set/val_set are passed as callbacks — presumably
    # DataSet implements the Keras callback interface (e.g. to reshuffle
    # per epoch); confirm against the DataSet class.
    cf_nade_model.fit_generator(
        train_set.generate(),
        steps_per_epoch=(train_set.get_corpus_size() // batch_size_),
        epochs=args.n_epochs,
        validation_data=val_set.generate(),
        validation_steps=(val_set.get_corpus_size() // batch_size_),
        shuffle=True,
        callbacks=[
            train_set, val_set, train_evaluation_callback,
            valid_evaluation_callback
        ],
        verbose=1)
    print('Testing...')
    rmses = []  # NOTE(review): never appended to; dead variable
    rate_score = np.array([1, 2, 3, 4, 5], np.float32)
    new_items = new_items  # NOTE(review): no-op self-assignment
    squared_error = []
    n_samples = []
    for i, batch in enumerate(test_set.generate(max_iters=1)):
        inp_r = batch[0]['input_ratings']   # NOTE(review): unused below
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']     # NOTE(review): unused below
        out_m = batch[0]['output_masks']    # NOTE(review): unused below
        # Second model output is the per-rating-level distribution.
        pred_batch = cf_nade_model.predict(batch[0])[1]
        true_r = out_r.argmax(axis=2) + 1
        # Expected rating under the predicted distribution.
        pred_r = (pred_batch *
                  rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)
        # Items never rated in the validation pass get the midpoint rating.
        pred_r[:, new_items] = 3
        mask = out_r.sum(axis=2)
        se = np.sum(np.square(true_r - pred_r) * mask)
        n = np.sum(mask)
        squared_error.append(se)
        n_samples.append(n)
    total_squared_error = np.array(squared_error).sum()
    total_n_samples = np.array(n_samples).sum()
    rmse = np.sqrt(total_squared_error / (total_n_samples * 1.0 + 1e-8))
    print("test set RMSE is %f" % rmse)
def init_model(opts):
    """Build a NADE model from the ``"model"`` section of *opts*.

    The input size is fixed to a flattened 28x28 image (784 features).
    Note that the ``"model"`` sub-dict of *opts* is updated in place.
    """
    cfg = opts["model"]
    cfg["in_feats"] = 28 * 28  # flattened 28x28 input
    return NADE(**cfg)
def iterative_algorithm(
    self,
    name,
    pop_size=100,
    genome_length=20,
    lim_percentage=20,
    corruption_level=0.2,
    num_epochs=50,
    lr = 0.1,
    max_evaluations=200000,
    unique_training=False,
    hiddens=300,
    rtr = True,
    w=10
):
    """Run the NADE-driven estimation-of-distribution algorithm.

    Each generation: select good genomes, train the NADE on them, sample a
    new population from the NADE and (optionally) merge it back via
    restricted tournament replacement (RTR). Per-generation fitness stats
    are appended to results/autoencoder/<name>/fitnesses.dat.

    NOTE(review): this body uses Python 2 ``print`` statements and relies on
    Python 2 integer division for ``trials`` below; it will not run under
    Python 3 as written.

    :param name: run identifier, used to build the results directory
    :param pop_size: number of genomes per generation
    :param genome_length: number of bits per genome
    :param lim_percentage: percent of the population kept for NADE training
    :param corruption_level: unused here (only referenced by the
        commented-out dA variant)
    :param num_epochs: NADE training epochs per generation
    :param lr: NADE learning rate
    :param max_evaluations: total fitness-evaluation budget; generations =
        max_evaluations / pop_size
    :param unique_training: forwarded to get_good_strings (deduplication)
    :param hiddens: number of NADE hidden units
    :param rtr: if True use RTR merging, else elitist replacement
    :param w: RTR window size
    :return: the final population array
    """
    results_path = "results/autoencoder/{0}/".format(name)
    ensure_dir(results_path)
    fitfile = open("{0}fitnesses.dat".format(results_path),"w")
    self.mask = np.random.binomial(1,0.5,genome_length)
    # Python 2 integer division: number of generations within the budget.
    trials = max_evaluations/pop_size
    population_limit = int(pop_size*(lim_percentage/100.0))
    # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
    # self.dA.build_dA(corruption_level)
    # self.build_sample_dA()
    self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
    # self.NADE.build_NADE()
    # Random initial population of 0/1 genomes.
    new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
    self.population_fitnesses = self.fitness_many(new_population)
    # Log mean,min,max,std of fitness for generation 0.
    fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
    for iteration in range(0,trials):
        print "iteration:",iteration
        population = new_population
        self.population = new_population
        # NOTE(review): rw is computed but never used in this body.
        rw = self.tournament_selection_replacement(population)
        # Top slice of the population used as NADE training data.
        good_strings,good_strings_fitnesses=self.get_good_strings(
            population,
            population_limit,
            unique=unique_training,
            fitnesses=self.population_fitnesses
        )
        print "training A/E"
        training_data = np.array(good_strings)
        self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
        print "sampling..."
        # Sample a full replacement population as bytes ("b" dtype).
        sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
        self.sample_fitnesses = self.fitness_many(sampled_population)
        if rtr:
            # Restricted tournament replacement: merge samples into the
            # existing population by local similarity contests.
            new_population = self.RTR(
                population,
                sampled_population,
                population_fitnesses=self.population_fitnesses,
                sample_fitnesses=self.sample_fitnesses,
                w=w
            )
        else:
            # Plain replacement with single-genome elitism.
            new_population = sampled_population
            new_population[0:1] = good_strings[0:1]
            self.population_fitnesses = self.sample_fitnesses
            self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
        print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                     np.min(self.population_fitnesses),
                                     np.max(self.population_fitnesses))
        print "best from previous:",(
            self.fitness(new_population[np.argmax(self.population_fitnesses)])
        )
        fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
        fitfile.flush()
    fitfile.close()
    return new_population
init_b[:, 1:] = log_rating_freq_diff init_b[:, 0] = log_rating_freq[:, 0] new_items = np.where(rating_freq.sum(axis=1) == 0)[0] input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings') output_ratings = Input(shape=(input_dim0, input_dim1), name='output_ratings') input_masks = Input(shape=(input_dim0, ), name='input_masks') output_masks = Input(shape=(input_dim0, ), name='output_masks') nade_layer = Dropout(0.0)(input_layer) nade_layer = NADE(hidden_dim=hidden_dim, activation='tanh', bias=True, W_regularizer=keras.regularizers.l2(0.02), V_regularizer=keras.regularizers.l2(0.02), b_regularizer=keras.regularizers.l2(0.02), c_regularizer=keras.regularizers.l2(0.02))(nade_layer) predicted_ratings = Lambda(prediction_layer, output_shape=prediction_output_shape, name='predicted_ratings')(nade_layer) d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks) sum_masks = add([input_masks, output_masks]) D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks) loss_out = Lambda(rating_cost_lambda_func, output_shape=(1, ),
# Define datapoints for each class. inps = torch.stack([ torch.randn(num_samples_per_class, inp_dimensions) / 10 + torch.tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 1]), torch.randn(num_samples_per_class, inp_dimensions) / 10 + torch.tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 1]), torch.randn(num_samples_per_class, inp_dimensions) / 10 + torch.tensor([1, 1, 0, 1, 1, 0, 0, 1, 0, 0]), torch.randn(num_samples_per_class, inp_dimensions) / 10 + torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1]) ], dim=0) # Define one model per class. models = [ NADE(inp_dimensions, inp_dimensions // 2) for _ in range(num_classes) ] # Train each model one by one. for inp, model in zip(inps, models): # Optimization scheme. optimizer = optim.SGD(model.parameters(), lr=0.01) for _ in range(num_training_iterations): # Zero out previous gradients. model.zero_grad() # Compute log-likehoods per sample. log_likelihoods = model(inp)