def train_epoch(self, dataloader, model, loss_compute, device):
    pad = cfgd.DATA_DEFAULT['padding_value']
    total_loss = 0
    total_tokens = 0
    for i, batch in enumerate(ul.progress_bar(dataloader, total=len(dataloader))):
        src, source_length, trg, src_mask, trg_mask, max_length_target, _ = batch
        trg_y = trg[:, 1:].to(device)  # skip start token
        # number of tokens without padding
        ntokens = (trg_y != pad).data.sum()

        # Move to GPU
        src = src.to(device)
        trg = trg[:, :-1].to(device)  # keep start token, skip end token
        src_mask = src_mask.to(device)
        trg_mask = trg_mask.to(device)

        # Compute loss
        out = model.forward(src, trg, src_mask, trg_mask)
        loss = loss_compute(out, trg_y, ntokens)
        total_tokens += ntokens
        total_loss += loss

    loss_epoch = total_loss / total_tokens
    return loss_epoch
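# `loss_compute` is only assumed here to take (out, trg_y, ntokens), optionally run the
# backward pass, and return the loss summed over non-padding tokens (train_epoch divides
# by total_tokens at the end). A minimal sketch under those assumptions; the class name
# `LossCompute` and its internals are illustrative, not the project's actual helper:
import torch.nn as nn

class LossCompute:
    def __init__(self, generator, pad_idx, optimizer=None):
        self.generator = generator  # final linear + log-softmax of the model
        self.criterion = nn.NLLLoss(reduction='sum', ignore_index=pad_idx)
        self.optimizer = optimizer  # None during validation

    def __call__(self, out, trg_y, ntokens):
        log_probs = self.generator(out)
        # per-token loss over all non-padding positions
        loss = self.criterion(log_probs.reshape(-1, log_probs.size(-1)),
                              trg_y.reshape(-1)) / ntokens
        if self.optimizer is not None:
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        return loss.item() * ntokens  # summed loss, so the caller can re-normalize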
def train_epoch(self, data_loader, model, optimizer_encoder, optimizer_decoder,
                clip_gradient_norm, device):
    model.network.encoder.train()
    model.network.decoder.train()
    pad = cfgd.DATA_DEFAULT['padding_value']
    total_loss = 0
    total_tokens = 0
    for i, batch in enumerate(ul.progress_bar(data_loader, total=len(data_loader))):
        encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch

        # Move to GPU
        encoder_input = encoder_input.to(device)
        decoder_output = decoder_output.to(device)
        source_length = source_length.to(device)
        mask = torch.squeeze(mask, 1).to(device)

        # Per-sequence loss, normalized by the number of non-padding tokens
        loss_b_sq = model.loss_step(encoder_input, source_length, decoder_output,
                                    mask, max_length_target, device)
        ntokens = (decoder_output != pad).data.sum()
        loss = loss_b_sq.sum() / ntokens

        # Backprop
        optimizer_encoder.zero_grad()
        optimizer_decoder.zero_grad()
        loss.backward()
        if clip_gradient_norm > 0:
            tnnu.clip_grad_norm_(model.network.encoder.parameters(), clip_gradient_norm)
            tnnu.clip_grad_norm_(model.network.decoder.parameters(), clip_gradient_norm)

        # Update weights
        optimizer_encoder.step()
        optimizer_decoder.step()

        # loss
        total_tokens += ntokens
        total_loss += loss_b_sq.sum()

    loss_epoch = total_loss / total_tokens
    return loss_epoch
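# Hypothetical call site for the seq2seq variant above, assuming the model exposes
# separate encoder/decoder modules; the `trainer` name and learning rate are
# illustrative, not taken from the project's configuration:
optimizer_encoder = torch.optim.Adam(model.network.encoder.parameters(), lr=1e-3)
optimizer_decoder = torch.optim.Adam(model.network.decoder.parameters(), lr=1e-3)
epoch_loss = trainer.train_epoch(data_loader, model, optimizer_encoder,
                                 optimizer_decoder, clip_gradient_norm=1.0,
                                 device=torch.device("cuda"))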
def main():
    """Main function."""
    args = parse_args()

    model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval")

    input_scaffolds = list(uc.read_smi_file(args.input_scaffold_path))

    if args.output_smiles_path:
        if args.use_gzip:
            args.output_smiles_path += ".gz"
        output_file = uc.open_file(args.output_smiles_path, "w+")
        write_func = functools.partial(output_file.write)
    else:
        output_file = tqdm.tqdm
        write_func = functools.partial(output_file.write, end="")

    sample_model = ma.SampleModel(model, args.batch_size)
    for scaff, dec, nll in ul.progress_bar(sample_model.run(input_scaffolds),
                                           total=len(input_scaffolds)):
        output_row = [scaff, dec]
        if args.with_nll:
            output_row.append("{:.8f}".format(nll))
        write_func("\t".join(output_row) + "\n")

    if args.output_smiles_path:
        output_file.close()
def main():
    """Main function."""
    args = parse_args()

    model = mm.Model.load_from_file(args.model_path, mode="eval")

    open_func = gzip.open if args.use_gzip else open

    if args.output_smiles_path:
        # only touch the path when one was actually given (appending ".gz" to a
        # missing path would raise a TypeError)
        if args.use_gzip:
            args.output_smiles_path += ".gz"
        csv_file = open_func(args.output_smiles_path, "wt+")
        write_func = functools.partial(csv_file.write)
    else:
        csv_file = tqdm.tqdm
        write_func = functools.partial(csv_file.write, end="")

    sample_model = ma.SampleModel(model, args.batch_size)
    for smi, nll in ul.progress_bar(sample_model.run(args.num), total=args.num):
        output_row = [smi]
        if args.with_nll:
            output_row.append("{:.8f}".format(nll))
        write_func("\t".join(output_row) + "\n")

    if args.output_smiles_path:
        csv_file.close()
def main():
    """Main function."""
    args = parse_args()

    ut.set_default_device("cuda")
    model = mm.Model.load_from_file(args.model_path, mode="sampling")

    input_csv = uc.open_file(args.input_csv_path, mode="rt")
    if args.use_gzip:
        args.output_csv_path += ".gz"
    output_csv = uc.open_file(args.output_csv_path, mode="wt+")

    calc_nlls_action = ma.CalculateNLLsFromModel(model, batch_size=args.batch_size, logger=LOG)
    smiles_list = list(uc.read_smi_file(args.input_csv_path))
    for nll in ul.progress_bar(calc_nlls_action.run(smiles_list), total=len(smiles_list)):
        input_line = input_csv.readline().strip()
        output_csv.write("{}\t{:.8f}\n".format(input_line, nll))

    input_csv.close()
    output_csv.close()
def main():
    """Main function."""
    args = parse_args()

    model = mm.DecoratorModel.load_from_file(args.model_path, mode="sampling")

    input_csv = uc.open_file(args.input_csv_path, mode="rt")
    if args.use_gzip:
        args.output_csv_path += ".gz"
    output_csv = uc.open_file(args.output_csv_path, mode="wt+")

    calc_nlls_action = ma.CalculateNLLsFromModel(model, batch_size=args.batch_size, logger=LOG)
    scaffold_decoration_list = [fields[0:2] for fields in uc.read_csv_file(args.input_csv_path)]
    for nll in ul.progress_bar(calc_nlls_action.run(scaffold_decoration_list),
                               total=len(scaffold_decoration_list)):
        input_line = input_csv.readline().strip()
        output_csv.write("{}\t{:.8f}\n".format(input_line, nll))

    input_csv.close()
    output_csv.close()
def validation_stat(self, dataloader, model, loss_compute, device, vocab):
    pad = cfgd.DATA_DEFAULT['padding_value']
    total_loss = 0
    n_correct = 0
    total_n_trg = 0
    total_tokens = 0
    tokenizer = mv.SMILESTokenizer()

    for i, batch in enumerate(ul.progress_bar(dataloader, total=len(dataloader))):
        src, source_length, trg, src_mask, trg_mask, max_length_target, _ = batch
        trg_y = trg[:, 1:].to(device)  # skip start token
        # number of tokens without padding
        ntokens = (trg_y != pad).data.sum()

        # Move to GPU
        src = src.to(device)
        trg = trg[:, :-1].to(device)  # keep start token, skip end token
        src_mask = src_mask.to(device)
        trg_mask = trg_mask.to(device)

        # Compute loss with teacher forcing
        out = model.forward(src, trg, src_mask, trg_mask)
        loss = loss_compute(out, trg_y, ntokens)
        total_loss += loss
        total_tokens += ntokens

        # Decode
        max_length_target = cfgd.DATA_DEFAULT['max_sequence_length']
        smiles = decode(model, src, src_mask, max_length_target, type='greedy')

        # Compute accuracy (exact match between greedy decode and target)
        for j in range(trg.size()[0]):
            seq = smiles[j, :]
            target = trg[j]
            target = tokenizer.untokenize(vocab.decode(target.cpu().numpy()))
            seq = tokenizer.untokenize(vocab.decode(seq.cpu().numpy()))
            if seq == target:
                n_correct += 1

        # number of samples in current batch
        n_trg = trg.size()[0]
        # total samples
        total_n_trg += n_trg

    # Accuracy
    accuracy = n_correct * 1.0 / total_n_trg
    loss_epoch = total_loss / total_tokens
    return loss_epoch, accuracy
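# `decode` is not shown in this section. A minimal greedy sketch, assuming an
# annotated-transformer-style API (model.encode / model.decode / model.generator) and
# illustrative start-token handling; all of these names are assumptions, not the
# project's actual decode function:
import torch

def subsequent_mask(size):
    # standard causal mask: position i may attend to positions <= i
    return torch.tril(torch.ones(1, size, size, dtype=torch.bool))

def decode(model, src, src_mask, max_len, type='greedy', start_idx=1):
    assert type == 'greedy'
    memory = model.encode(src, src_mask)
    ys = torch.full((src.size(0), 1), start_idx, dtype=src.dtype, device=src.device)
    for _ in range(max_len - 1):
        trg_mask = subsequent_mask(ys.size(1)).to(src.device)
        out = model.decode(memory, src_mask, ys, trg_mask)
        log_probs = model.generator(out[:, -1])          # distribution over next token
        next_token = log_probs.argmax(dim=-1, keepdim=True)
        ys = torch.cat([ys, next_token], dim=1)
    return ys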
def main():
    """Main function."""
    params = parse_args()
    lr_params = params["learning_rate"]
    cs_params = params["collect_stats"]
    params = params["other"]

    ut.set_default_device("cuda")

    if params["collect_stats_frequency"] != 1 and lr_params["mode"] == "ada":
        LOG.warning("Changed collect-stats-frequency to 1 to work well with adaptive training.")
        params["collect_stats_frequency"] = 1

    model = mm.Model.load_from_file(params["input_model_path"])
    optimizer = torch.optim.Adam(model.network.parameters(), lr=lr_params["start"])

    training_sets = load_sets(params["training_set_path"])
    validation_sets = []
    if params["collect_stats_frequency"] > 0:
        validation_sets = load_sets(cs_params["validation_set_path"])

    if lr_params["mode"] == "ada":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=lr_params["gamma"],
            patience=lr_params["patience"], threshold=lr_params["threshold"])
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=lr_params["step"], gamma=lr_params["gamma"])

    post_epoch_hook = TrainModelPostEpochHook(
        params["output_model_prefix_path"], params["epochs"], validation_sets,
        lr_scheduler, cs_params["log_path"], cs_params, lr_params,
        collect_stats_frequency=params["collect_stats_frequency"],
        save_frequency=params["save_every_n_epochs"], logger=LOG)

    epochs_it = ma.TrainModel(model, optimizer, training_sets, params["batch_size"],
                              params["clip_gradients"], params["epochs"],
                              post_epoch_hook, logger=LOG).run()

    for total, epoch_it in epochs_it:
        for _ in ul.progress_bar(epoch_it, total=total):
            pass  # we could do something here, but it's not needed
def generate(self, opt):
    # set device
    device = ut.allocate_gpu()

    # Data loader
    dataloader_test = self.initialize_dataloader(opt, self.vocab, opt.test_file_name)

    # Load model
    file_name = os.path.join(opt.model_path, f'model_{opt.epoch}.pt')
    if opt.model_choice == 'transformer':
        model = EncoderDecoder.load_from_file(file_name)
        model.to(device)
        model.eval()
    elif opt.model_choice == 'seq2seq':
        model = Model.load_from_file(file_name, evaluation_mode=True)
        # move to GPU
        model.network.encoder.to(device)
        model.network.decoder.to(device)

    max_len = cfgd.DATA_DEFAULT['max_sequence_length']
    df_list = []
    sampled_smiles_list = []
    for j, batch in enumerate(ul.progress_bar(dataloader_test, total=len(dataloader_test))):
        src, source_length, _, src_mask, _, max_length_target, df = batch

        # Move to GPU
        src = src.to(device)
        src_mask = src_mask.to(device)

        smiles = self.sample(opt.model_choice, model, src, src_mask, source_length,
                             opt.decode_type, num_samples=opt.num_samples,
                             max_len=max_len, device=device)
        df_list.append(df)
        sampled_smiles_list.extend(smiles)

    # prepare dataframe: one column per sampled SMILES
    data_sorted = pd.concat(df_list)
    sampled_smiles_list = np.array(sampled_smiles_list)
    for i in range(opt.num_samples):
        data_sorted['Predicted_smi_{}'.format(i + 1)] = sampled_smiles_list[:, i]

    result_path = os.path.join(self.save_path, "generated_molecules.csv")
    LOG.info("Save to {}".format(result_path))
    data_sorted.to_csv(result_path, index=False)
def main():
    """Main function."""
    params = parse_args()
    lr_params = params["learning_rate"]
    cs_params = params["collect_stats"]
    params = params["other"]

    # ut.set_default_device("cuda")

    model = mm.DecoratorModel.load_from_file(params["input_model_path"])
    optimizer = torch.optim.Adam(model.network.parameters(), lr=lr_params["start"])
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=lr_params["step"], gamma=lr_params["gamma"])

    training_sets = load_sets(params["training_set_path"])
    validation_sets = []
    if params["collect_stats_frequency"] > 0:
        validation_sets = load_sets(cs_params["validation_set_path"])

    post_epoch_hook = TrainModelPostEpochHook(
        params["output_model_prefix_path"], params["epochs"], validation_sets,
        lr_scheduler, cs_params, lr_params,
        collect_stats_frequency=params["collect_stats_frequency"],
        save_frequency=params["save_every_n_epochs"], logger=LOG)

    epochs_it = ma.TrainModel(model, optimizer, training_sets, params["batch_size"],
                              params["clip_gradients"], params["epochs"],
                              post_epoch_hook, logger=LOG).run()

    for num, (total, epoch_it) in enumerate(epochs_it):
        for _ in ul.progress_bar(epoch_it, total=total, desc="#{}".format(num)):
            pass  # we could do something here, but it's not needed
def run(self):
    """Calculates likelihoods of a set of molecules."""
    ut.set_default_device("cuda")
    model = mm.Model.load_from_file(self._model_path, sampling_mode=True)

    nll_iterator, size = md.calculate_nlls_from_model(
        model, uc.read_smi_file(self._input_csv_path), batch_size=self._batch_size)

    with open(self._input_csv_path, "r") as input_csv, \
            open(self._output_csv_path, "w+") as output_csv:
        for nlls in ul.progress_bar(nll_iterator, size):
            for nll in nlls:
                line = input_csv.readline().strip()
                output_csv.write("{},{:.12f}\n".format(line, nll))
def _sample_and_write_scaffolds_to_disk(self, scaffolds, total_scaffolds):
    def _update_file(out_file, buffer):
        for scaff, dec, _ in self._sample_model_action.run(buffer):
            out_file.write("{}\t{}\n".format(scaff, dec))

    out_file = open(self._tmp_path("sampled_decorations"), "w+")
    scaffold_buffer = []
    for scaffold in ul.progress_bar(scaffolds, total=total_scaffolds, desc="Sampling"):
        # repeat each scaffold so that several decorations are sampled for it
        scaffold_buffer += [scaffold] * self.num_decorations_per_scaffold
        if len(scaffold_buffer) == self.batch_size * self.num_decorations_per_scaffold:
            _update_file(out_file, scaffold_buffer)
            scaffold_buffer = []

    # flush whatever is left in the buffer
    if scaffold_buffer:
        _update_file(out_file, scaffold_buffer)
    out_file.close()
def _train_epoch(self, epoch, training_set_path, validation_set_path):
    data_loader = self._initialize_dataloader(training_set_path)
    for _, batch in enumerate(ul.progress_bar(data_loader, total=len(data_loader))):
        input_vectors = batch.long()
        loss = self._calculate_loss(input_vectors)

        self._optimizer.zero_grad()
        loss.backward()
        if self._clip_gradient_norm > 0:
            tnnu.clip_grad_norm_(self._model.network.parameters(), self._clip_gradient_norm)
        self._optimizer.step()

    if self._save_every_n_epochs > 0 and epoch % self._save_every_n_epochs == 0:
        self.last_checkpoint_path = self._save_model(epoch)

    if self._collect_stats_frequency > 0 and epoch % self._collect_stats_frequency == 0:
        self._collect_stats(epoch, training_set_path, validation_set_path)

    self._update_lr_scheduler(epoch)
    # keep training only while the learning rate is above the configured minimum
    return self._get_lr() >= self._learning_rate_args["min"]
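# `_calculate_loss` is not shown in this section. A minimal sketch of what such a helper
# typically does for a SMILES language model: score each sequence under the model and
# average over the batch. `self._model.likelihood` is an assumption about the model API,
# not a confirmed method of this project:
def _calculate_loss(self, input_vectors):
    log_p = self._model.likelihood(input_vectors)  # per-sequence NLL, shape (batch,)
    return log_p.mean()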
def validation_stat(self, dataloader, model, device, vocab):
    pad = cfgd.DATA_DEFAULT['padding_value']
    total_loss = 0
    total_tokens = 0
    n_correct = 0
    total_n_trg = 0
    tokenizer = mv.SMILESTokenizer()

    model.network.encoder.eval()
    model.network.decoder.eval()
    for _, batch in enumerate(ul.progress_bar(dataloader, total=len(dataloader))):
        encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch

        # Move to GPU
        encoder_input = encoder_input.to(device)
        decoder_output = decoder_output.to(device)
        source_length = source_length.to(device)
        mask = torch.squeeze(mask, 1).to(device)

        # Loss
        with torch.no_grad():
            loss_b_sq = model.loss_step(encoder_input, source_length, decoder_output,
                                        mask, max_length_target, device)
        ntokens = (decoder_output != pad).data.sum()
        total_tokens += ntokens
        total_loss += loss_b_sq.sum()

        # Sample greedily and compute exact-match accuracy
        predicted_seqs, predicted_nlls = model.greedy_sample(encoder_input, source_length,
                                                             decoder_output, mask, device)
        for j, seq in enumerate(predicted_seqs):
            target = tokenizer.untokenize(vocab.decode(decoder_output[j].cpu().numpy()))
            smi = tokenizer.untokenize(vocab.decode(seq.cpu().numpy()))
            if smi == target:
                n_correct += 1
        total_n_trg += decoder_output.shape[0]

    accuracy = n_correct * 1.0 / total_n_trg
    loss = total_loss / total_tokens
    return loss, accuracy
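# Hypothetical per-epoch usage of the two validation helpers above; the `trainer` name
# and logging format are illustrative:
val_loss, val_acc = trainer.validation_stat(val_loader, model, device, vocab)
LOG.info("validation loss {:.4f}, exact-match accuracy {:.3f}".format(val_loss, val_acc))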