コード例 #1
0
    def train_epoch(self, dataloader, model, loss_compute, device):
        """Run a single training epoch.

        Returns the loss averaged over all non-padding target tokens.
        """
        padding_value = cfgd.DATA_DEFAULT['padding_value']
        loss_sum = 0
        token_sum = 0
        for _, batch in enumerate(
                ul.progress_bar(dataloader, total=len(dataloader))):
            src, source_length, trg, src_mask, trg_mask, max_length_target, _ = batch

            # Expected output: target shifted left past the start token.
            trg_y = trg[:, 1:].to(device)

            # Normalise only by real (non-padding) target tokens.
            ntokens = (trg_y != padding_value).data.sum()

            # Move inputs to the target device.
            src = src.to(device)
            trg = trg[:, :-1].to(device)  # keep start token, drop end token
            src_mask = src_mask.to(device)
            trg_mask = trg_mask.to(device)

            # Teacher-forced forward pass and loss.
            out = model.forward(src, trg, src_mask, trg_mask)
            batch_loss = loss_compute(out, trg_y, ntokens)
            token_sum += ntokens
            loss_sum += batch_loss

        return loss_sum / token_sum
コード例 #2
0
    def train_epoch(self, data_loader, model, optimizer_encoder, optimizer_decoder, clip_gradient_norm, device):
        """Train the encoder/decoder pair for one epoch.

        Returns the epoch loss averaged over non-padding target tokens.
        """
        model.network.encoder.train()
        model.network.decoder.train()
        pad = cfgd.DATA_DEFAULT['padding_value']
        total_loss = 0
        total_tokens = 0
        for i, batch in enumerate(ul.progress_bar(data_loader, total=len(data_loader))):
            encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch
            # Move to GPU
            encoder_input = encoder_input.to(device)
            decoder_output = decoder_output.to(device)
            source_length = source_length.to(device)
            mask = torch.squeeze(mask, 1).to(device)
            loss_b_sq = model.loss_step(encoder_input, source_length, decoder_output, mask, max_length_target, device)

            # Sum once (the original computed loss_b_sq.sum() twice) and
            # normalise by the number of non-padding target tokens.
            batch_loss = loss_b_sq.sum()
            ntokens = (decoder_output != pad).data.sum()
            loss = batch_loss / ntokens

            # Backprop
            optimizer_encoder.zero_grad()
            optimizer_decoder.zero_grad()
            loss.backward()

            if clip_gradient_norm > 0:
                tnnu.clip_grad_norm_(model.network.encoder.parameters(), clip_gradient_norm)
                tnnu.clip_grad_norm_(model.network.decoder.parameters(), clip_gradient_norm)
            # Update weights
            optimizer_encoder.step()
            optimizer_decoder.step()

            # Accumulate statistics. detach() keeps the value identical but
            # stops the per-batch autograd graphs from being chained into the
            # accumulator for the whole epoch (memory leak in the original).
            total_tokens += ntokens
            total_loss += batch_loss.detach()

        loss_epoch = total_loss / total_tokens
        return loss_epoch
コード例 #3
0
def main():
    """Sample decorations for a list of scaffolds and write TSV rows."""
    args = parse_args()

    model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval")

    input_scaffolds = list(uc.read_smi_file(args.input_scaffold_path))

    if args.output_smiles_path:
        if args.use_gzip:
            args.output_smiles_path += ".gz"
        output_file = uc.open_file(args.output_smiles_path, "w+")
        write_func = functools.partial(output_file.write)
    else:
        # No output path: route rows through tqdm so they interleave
        # cleanly with the progress bar.
        output_file = tqdm.tqdm
        write_func = functools.partial(output_file.write, end="")

    sample_model = ma.SampleModel(model, args.batch_size)

    sampled = ul.progress_bar(sample_model.run(input_scaffolds),
                              total=len(input_scaffolds))
    for scaffold, decoration, nll in sampled:
        fields = [scaffold, decoration]
        if args.with_nll:
            fields.append("{:.8f}".format(nll))
        write_func("\t".join(fields) + "\n")

    if args.output_smiles_path:
        output_file.close()
コード例 #4
0
def main():
    """Sample molecules from a trained model and write them as TSV rows."""
    args = parse_args()

    model = mm.Model.load_from_file(args.model_path, mode="eval")

    if args.output_smiles_path:
        open_func = open
        # Pick gzip (and suffix the path) only when an output path was
        # actually given. The original appended ".gz" before checking,
        # which raised TypeError (None + str) when --use-gzip was set
        # without an output path.
        if args.use_gzip:
            open_func = gzip.open
            args.output_smiles_path += ".gz"
        csv_file = open_func(args.output_smiles_path, "wt+")
        write_func = functools.partial(csv_file.write)
    else:
        # No output path: echo rows through tqdm alongside the progress bar.
        csv_file = tqdm.tqdm
        write_func = functools.partial(csv_file.write, end="")

    sample_model = ma.SampleModel(model, args.batch_size)

    for smi, nll in ul.progress_bar(sample_model.run(args.num), total=args.num):
        output_row = [smi]
        if args.with_nll:
            output_row.append("{:.8f}".format(nll))
        write_func("\t".join(output_row) + "\n")

    if args.output_smiles_path:
        csv_file.close()
コード例 #5
0
def main():
    """Annotate every input SMILES line with its model NLL."""
    args = parse_args()

    ut.set_default_device("cuda")

    model = mm.Model.load_from_file(args.model_path, mode="sampling")

    source = uc.open_file(args.input_csv_path, mode="rt")
    if args.use_gzip:
        args.output_csv_path += ".gz"
    sink = uc.open_file(args.output_csv_path, mode="wt+")

    action = ma.CalculateNLLsFromModel(model,
                                       batch_size=args.batch_size,
                                       logger=LOG)
    molecules = list(uc.read_smi_file(args.input_csv_path))

    # NLLs come back in input order, so each one pairs with the next
    # raw line of the input file.
    nll_stream = ul.progress_bar(action.run(molecules), total=len(molecules))
    for nll in nll_stream:
        original_row = source.readline().strip()
        sink.write("{}\t{:.8f}\n".format(original_row, nll))

    source.close()
    sink.close()
コード例 #6
0
def main():
    """Annotate every (scaffold, decoration) row with its model NLL."""
    args = parse_args()

    model = mm.DecoratorModel.load_from_file(args.model_path, mode="sampling")

    source = uc.open_file(args.input_csv_path, mode="rt")
    if args.use_gzip:
        args.output_csv_path += ".gz"
    sink = uc.open_file(args.output_csv_path, mode="wt+")

    action = ma.CalculateNLLsFromModel(model,
                                       batch_size=args.batch_size,
                                       logger=LOG)
    # Only the first two columns (scaffold, decoration) feed the model.
    pairs = [row[0:2] for row in uc.read_csv_file(args.input_csv_path)]

    # NLLs come back in input order, so each one pairs with the next
    # raw line of the input file.
    for nll in ul.progress_bar(action.run(pairs), total=len(pairs)):
        original_row = source.readline().strip()
        sink.write("{}\t{:.8f}\n".format(original_row, nll))

    source.close()
    sink.close()
コード例 #7
0
    def validation_stat(self, dataloader, model, loss_compute, device, vocab):
        """Compute validation loss (teacher forcing) and greedy-decode
        exact-match accuracy over the whole dataloader."""
        pad = cfgd.DATA_DEFAULT['padding_value']
        tokenizer = mv.SMILESTokenizer()

        total_loss = 0
        total_tokens = 0
        n_correct = 0
        total_n_trg = 0

        for _, batch in enumerate(
                ul.progress_bar(dataloader, total=len(dataloader))):

            src, source_length, trg, src_mask, trg_mask, max_length_target, _ = batch

            trg_y = trg[:, 1:].to(device)  # skip start token

            # Normalise only by real (non-padding) target tokens.
            ntokens = (trg_y != pad).data.sum()

            # Move inputs to the target device.
            src = src.to(device)
            trg = trg[:, :-1].to(device)  # keep start token, drop end token
            src_mask = src_mask.to(device)
            trg_mask = trg_mask.to(device)

            # Teacher-forced loss.
            out = model.forward(src, trg, src_mask, trg_mask)
            total_loss += loss_compute(out, trg_y, ntokens)
            total_tokens += ntokens

            # Greedy decoding, capped at the configured max sequence length.
            max_length_target = cfgd.DATA_DEFAULT['max_sequence_length']
            smiles = decode(model,
                            src,
                            src_mask,
                            max_length_target,
                            type='greedy')

            # Exact-string match between decoded and reference SMILES.
            batch_size = trg.size()[0]
            for j in range(batch_size):
                reference = tokenizer.untokenize(
                    vocab.decode(trg[j].cpu().numpy()))
                candidate = tokenizer.untokenize(
                    vocab.decode(smiles[j, :].cpu().numpy()))
                if candidate == reference:
                    n_correct += 1

            total_n_trg += batch_size

        accuracy = n_correct * 1.0 / total_n_trg
        loss_epoch = total_loss / total_tokens
        return loss_epoch, accuracy
コード例 #8
0
def main():
    """Train a model, optionally collecting stats and adapting the LR."""
    params = parse_args()
    lr_params = params["learning_rate"]
    cs_params = params["collect_stats"]
    params = params["other"]

    ut.set_default_device("cuda")

    # Adaptive LR scheduling needs fresh validation stats every epoch.
    if lr_params["mode"] == "ada" and params["collect_stats_frequency"] != 1:
        LOG.warning(
            "Changed collect-stats-frequency to 1 to work well with adaptative training."
        )
        params["collect_stats_frequency"] = 1

    model = mm.Model.load_from_file(params["input_model_path"])
    optimizer = torch.optim.Adam(model.network.parameters(),
                                 lr=lr_params["start"])

    training_sets = load_sets(params["training_set_path"])
    if params["collect_stats_frequency"] > 0:
        validation_sets = load_sets(cs_params["validation_set_path"])
    else:
        validation_sets = []

    # Scheduler choice: plateau-based for adaptive mode, fixed step otherwise.
    if lr_params["mode"] == "ada":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=lr_params["gamma"],
            patience=lr_params["patience"],
            threshold=lr_params["threshold"])
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=lr_params["step"], gamma=lr_params["gamma"])

    post_epoch_hook = TrainModelPostEpochHook(
        params["output_model_prefix_path"],
        params["epochs"],
        validation_sets,
        lr_scheduler,
        cs_params["log_path"],
        cs_params,
        lr_params,
        collect_stats_frequency=params["collect_stats_frequency"],
        save_frequency=params["save_every_n_epochs"],
        logger=LOG)

    epoch_iterator = ma.TrainModel(model,
                                   optimizer,
                                   training_sets,
                                   params["batch_size"],
                                   params["clip_gradients"],
                                   params["epochs"],
                                   post_epoch_hook,
                                   logger=LOG).run()

    # Drain each epoch's iterator — the training happens as a side effect
    # of iterating; the progress bar is all we need here.
    for total, epoch_it in epoch_iterator:
        for _ in ul.progress_bar(epoch_it, total=total):
            pass
コード例 #9
0
    def generate(self, opt):
        """Sample molecules for every source batch in the test set and save
        them, together with the source dataframe, as a CSV.

        Raises:
            ValueError: if ``opt.model_choice`` is neither 'transformer'
                nor 'seq2seq'.
        """
        # set device
        device = ut.allocate_gpu()

        # Data loader
        dataloader_test = self.initialize_dataloader(opt, self.vocab, opt.test_file_name)

        # Load model
        file_name = os.path.join(opt.model_path, f'model_{opt.epoch}.pt')
        if opt.model_choice == 'transformer':
            model = EncoderDecoder.load_from_file(file_name)
            model.to(device)
            model.eval()
        elif opt.model_choice == 'seq2seq':
            model = Model.load_from_file(file_name, evaluation_mode=True)
            # move to GPU
            model.network.encoder.to(device)
            model.network.decoder.to(device)
        else:
            # The original fell through here, leaving `model` unbound and
            # crashing later with NameError; fail fast with a clear message.
            raise ValueError(
                "Unknown model_choice: {}".format(opt.model_choice))

        max_len = cfgd.DATA_DEFAULT['max_sequence_length']
        df_list = []
        sampled_smiles_list = []
        for j, batch in enumerate(ul.progress_bar(dataloader_test, total=len(dataloader_test))):

            src, source_length, _, src_mask, _, max_length_target, df = batch

            # Move to GPU
            src = src.to(device)
            src_mask = src_mask.to(device)
            smiles = self.sample(opt.model_choice, model, src, src_mask,
                                 source_length,
                                 opt.decode_type,
                                 num_samples=opt.num_samples,
                                 max_len=max_len,
                                 device=device)

            df_list.append(df)
            sampled_smiles_list.extend(smiles)

        # Prepare output dataframe: one column per sampled molecule.
        data_sorted = pd.concat(df_list)
        sampled_smiles_list = np.array(sampled_smiles_list)

        for i in range(opt.num_samples):
            data_sorted['Predicted_smi_{}'.format(i + 1)] = sampled_smiles_list[:, i]

        result_path = os.path.join(self.save_path, "generated_molecules.csv")
        LOG.info("Save to {}".format(result_path))
        data_sorted.to_csv(result_path, index=False)
コード例 #10
0
def main():
    """Train a decorator model epoch by epoch."""
    params = parse_args()
    lr_params = params["learning_rate"]
    cs_params = params["collect_stats"]
    params = params["other"]

    # ut.set_default_device("cuda")

    model = mm.DecoratorModel.load_from_file(params["input_model_path"])
    optimizer = torch.optim.Adam(model.network.parameters(),
                                 lr=lr_params["start"])
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=lr_params["step"],
                                                   gamma=lr_params["gamma"])

    training_sets = load_sets(params["training_set_path"])
    if params["collect_stats_frequency"] > 0:
        validation_sets = load_sets(cs_params["validation_set_path"])
    else:
        validation_sets = []

    post_epoch_hook = TrainModelPostEpochHook(
        params["output_model_prefix_path"],
        params["epochs"],
        validation_sets,
        lr_scheduler,
        cs_params,
        lr_params,
        collect_stats_frequency=params["collect_stats_frequency"],
        save_frequency=params["save_every_n_epochs"],
        logger=LOG)

    epoch_iterator = ma.TrainModel(model,
                                   optimizer,
                                   training_sets,
                                   params["batch_size"],
                                   params["clip_gradients"],
                                   params["epochs"],
                                   post_epoch_hook,
                                   logger=LOG).run()

    # Drain each epoch's iterator — training happens as a side effect of
    # iterating; the labelled progress bar is all we need here.
    for num, (total, epoch_it) in enumerate(epoch_iterator):
        bar = ul.progress_bar(epoch_it, total=total, desc="#{}".format(num))
        for _ in bar:
            pass
コード例 #11
0
    def run(self):
        """Calculate the NLL of every molecule in the input file and write
        each input line back out with its NLL appended as a CSV column."""
        ut.set_default_device("cuda")

        model = mm.Model.load_from_file(self._model_path, sampling_mode=True)

        nll_iterator, size = md.calculate_nlls_from_model(
            model,
            uc.read_smi_file(self._input_csv_path),
            batch_size=self._batch_size)

        # NLL batches come back in input order, so each value pairs with
        # the next raw line of the input file.
        with open(self._input_csv_path, "r") as input_csv:
            with open(self._output_csv_path, "w+") as output_csv:
                for nll_batch in ul.progress_bar(nll_iterator, size):
                    for nll in nll_batch:
                        row = input_csv.readline().strip()
                        output_csv.write("{},{:.12f}\n".format(row, nll))
コード例 #12
0
    def _sample_and_write_scaffolds_to_disk(self, scaffolds, total_scaffolds):
        """Sample decorations for each scaffold and write them to a temp file.

        Each scaffold is repeated ``num_decorations_per_scaffold`` times, and
        the model is invoked once per full buffer of
        ``batch_size * num_decorations_per_scaffold`` scaffold copies.
        """
        def _update_file(out_file, buffer):
            # One sampled (scaffold, decoration) row per buffered copy.
            for scaff, dec, _ in self._sample_model_action.run(buffer):
                out_file.write("{}\t{}\n".format(scaff, dec))

        # The context manager guarantees the file is closed even when
        # sampling raises (the original leaked the handle on error).
        with open(self._tmp_path("sampled_decorations"), "w+") as out_file:
            scaffold_buffer = []
            for scaffold in ul.progress_bar(scaffolds,
                                            total=total_scaffolds,
                                            desc="Sampling"):
                scaffold_buffer += [scaffold] * self.num_decorations_per_scaffold
                if len(scaffold_buffer
                       ) == self.batch_size * self.num_decorations_per_scaffold:
                    _update_file(out_file, scaffold_buffer)
                    scaffold_buffer = []

            # Flush the remainder that did not fill a whole batch.
            if scaffold_buffer:
                _update_file(out_file, scaffold_buffer)
コード例 #13
0
    def _train_epoch(self, epoch, training_set_path, validation_set_path):
        """Run one training epoch, then checkpoint / collect stats / step the
        LR scheduler as configured.

        Returns True while the learning rate is still at or above the
        configured minimum (i.e. training should continue).
        """
        data_loader = self._initialize_dataloader(training_set_path)
        for batch in ul.progress_bar(data_loader, total=len(data_loader)):
            input_vectors = batch.long()
            loss = self._calculate_loss(input_vectors)

            self._optimizer.zero_grad()
            loss.backward()
            if self._clip_gradient_norm > 0:
                tnnu.clip_grad_norm_(self._model.network.parameters(),
                                     self._clip_gradient_norm)
            self._optimizer.step()

        save_now = (self._save_every_n_epochs > 0
                    and epoch % self._save_every_n_epochs == 0)
        if save_now:
            self.last_checkpoint_path = self._save_model(epoch)

        collect_now = (self._collect_stats_frequency > 0
                       and epoch % self._collect_stats_frequency == 0)
        if collect_now:
            self._collect_stats(epoch, training_set_path, validation_set_path)

        self._update_lr_scheduler(epoch)

        return self._get_lr() >= self._learning_rate_args["min"]
コード例 #14
0
    def validation_stat(self, dataloader, model, device, vocab):
        """Compute validation loss per non-padding token and greedy-sampling
        exact-match accuracy over the whole dataloader."""
        pad = cfgd.DATA_DEFAULT['padding_value']
        tokenizer = mv.SMILESTokenizer()

        total_loss = 0
        total_tokens = 0
        n_correct = 0
        total_n_trg = 0

        model.network.encoder.eval()
        model.network.decoder.eval()

        for batch in ul.progress_bar(dataloader, total=len(dataloader)):
            encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch

            # Move inputs to the target device.
            encoder_input = encoder_input.to(device)
            decoder_output = decoder_output.to(device)
            source_length = source_length.to(device)
            mask = torch.squeeze(mask, 1).to(device)

            # Loss without gradient tracking.
            with torch.no_grad():
                loss_b_sq = model.loss_step(encoder_input, source_length,
                                            decoder_output, mask,
                                            max_length_target, device)
            total_tokens += (decoder_output != pad).data.sum()
            total_loss += loss_b_sq.sum()

            # Greedy sampling; count exact sequence matches against targets.
            sampled_seqs, sampled_nlls = model.greedy_sample(
                encoder_input, source_length, decoder_output, mask, device)
            for idx, sampled in enumerate(sampled_seqs):
                reference = tokenizer.untokenize(
                    vocab.decode(decoder_output[idx].cpu().numpy()))
                candidate = tokenizer.untokenize(
                    vocab.decode(sampled.cpu().numpy()))
                if candidate == reference:
                    n_correct += 1
            total_n_trg += decoder_output.shape[0]

        accuracy = n_correct * 1.0 / total_n_trg
        loss = total_loss / total_tokens
        return loss, accuracy