def load_from_file(cls, file_path, evaluation_mode=False, LOG=None):
        """
        Rebuild a model from a checkpoint stored on disk
        :param file_path: checkpoint file, read with torch.load
        :param evaluation_mode: when true, encoder and decoder are switched
            to eval mode after their weights are restored
        :param LOG: optional logger; when truthy, the encoder and decoder
            modules are written to it with LOG.info
        :return: the reconstructed Model
        """
        # Without CUDA, map every storage to itself so tensors stay where
        # they were deserialized instead of being moved to a GPU.
        load_kwargs = {}
        if not torch.cuda.is_available():
            load_kwargs["map_location"] = lambda storage, loc: storage
        checkpoint = torch.load(file_path, **load_kwargs)

        # Constructor arguments of both halves; empty dict when not stored
        encoder_params = checkpoint.get("encoder_params", {})
        decoder_params = checkpoint.get("decoder_params", {})

        # A fresh SMILESTokenizer is the fallback when none was saved
        model = Model(
            vocabulary=checkpoint['vocabulary'],
            tokenizer=checkpoint.get('tokenizer', mv.SMILESTokenizer()),
            encoder_params=encoder_params,
            decoder_params=decoder_params,
            max_sequence_length=checkpoint['max_sequence_length'],
        )

        # The checkpoint keys share the names of the network attributes
        parts = ("encoder", "decoder")
        for part in parts:
            getattr(model.network, part).load_state_dict(checkpoint[part])

        if evaluation_mode:
            for part in parts:
                getattr(model.network, part).eval()

        if LOG:
            for part in parts:
                LOG.info(getattr(model.network, part))

        return model
    def validation_stat(self, dataloader, model, loss_compute, device, vocab):
        """
        Run one pass over a dataloader and report loss and accuracy.

        For each batch the teacher-forced loss is accumulated, then the
        source is decoded greedily and every decoded string is compared with
        the string obtained from the decoder-input target row.
        :param dataloader: iterable with a length, yielding 7-element batches
        :param model: object exposing forward(src, trg, src_mask, trg_mask)
        :param loss_compute: callable invoked as (output, target, ntokens)
        :param device: device the batch tensors are moved to
        :param vocab: vocabulary whose decode() turns ids into tokens
        :return: (accumulated loss / accumulated non-padding target tokens,
            fraction of rows whose decoded string equals the target string)
        """
        padding_value = cfgd.DATA_DEFAULT['padding_value']
        loss_sum = 0
        exact_matches = 0
        sample_count = 0
        token_count = 0

        smiles_tokenizer = mv.SMILESTokenizer()

        def to_string(token_ids):
            # ids -> tokens -> string
            return smiles_tokenizer.untokenize(
                vocab.decode(token_ids.cpu().numpy()))

        for batch in ul.progress_bar(dataloader, total=len(dataloader)):
            # positions 2, 6 and 7 of the batch are not used here
            src, _, trg, src_mask, trg_mask, _, _ = batch

            # expected outputs: target without its first position (start token)
            expected = trg[:, 1:].to(device)
            # count of expected positions that are not padding
            non_padding = (expected != padding_value).data.sum()

            # move the remaining tensors to the device; the decoder input
            # keeps the start token and drops the last position
            src = src.to(device)
            trg = trg[:, :-1].to(device)
            src_mask = src_mask.to(device)
            trg_mask = trg_mask.to(device)

            # teacher-forced loss
            out = model.forward(src, trg, src_mask, trg_mask)
            loss_sum += loss_compute(out, expected, non_padding)
            token_count += non_padding

            # greedy decoding up to the configured maximum length
            max_length_target = cfgd.DATA_DEFAULT['max_sequence_length']
            generated = decode(model,
                               src,
                               src_mask,
                               max_length_target,
                               type='greedy')

            # exact string match per row
            rows = trg.size()[0]
            for row in range(rows):
                generated_ids = generated[row, :]
                reference = to_string(trg[row])
                candidate = to_string(generated_ids)
                exact_matches += int(candidate == reference)

            sample_count += rows

        accuracy = float(exact_matches) / sample_count
        loss_epoch = loss_sum / token_count
        return loss_epoch, accuracy
Exemple #3
0
 def initialize_dataloader(self, data_path, batch_size, vocab, data_type):
     """
     Build a shuffling DataLoader over `<data_path>/<data_type>.csv`.

     :param data_path: directory containing the csv file
     :param batch_size: batch size handed to the DataLoader
     :param vocab: vocabulary passed to md.Dataset
     :param data_type: file name without the '.csv' extension
     :return: torch DataLoader using md.Dataset.collate_fn
     """
     csv_path = os.path.join(data_path, data_type + '.csv')
     frame = pd.read_csv(csv_path, sep=",")

     # prediction_mode is off for this dataset
     dataset = md.Dataset(
         data=frame,
         vocabulary=vocab,
         tokenizer=mv.SMILESTokenizer(),
         prediction_mode=False,
     )
     return torch.utils.data.DataLoader(
         dataset,
         batch_size,
         shuffle=True,
         collate_fn=md.Dataset.collate_fn,
     )
    def get_model(self, opt, vocab, device):
        """
        Create a new model or reload a checkpoint, then move it to `device`.

        When opt.starting_epoch is 1 a model is built with Model.make_model;
        otherwise the checkpoint of the previous epoch is loaded from
        `<self.save_path>/checkpoint/`.
        :param opt: options object providing the model hyper-parameters and
            starting_epoch
        :param vocab: vocabulary passed to Model.make_model
        :param device: device the encoder and decoder are moved to
        :return: the model
        """
        if opt.starting_epoch == 1:
            # positional arguments, in the order Model.make_model is called with
            make_args = (
                opt.num_layers,
                opt.layer_size,
                opt.cell_type,
                opt.embedding_layer_size,
                opt.dropout,
                opt.bidirectional,
                opt.bidirect_model,
                opt.attn_model,
                cfgd.DATA_DEFAULT['max_sequence_length'],
                vocab,
                mv.SMILESTokenizer(),
                self.LOG,
            )
            model = Model.make_model(*make_args)
        else:
            # resume from the checkpoint written for the previous epoch
            checkpoint_path = os.path.join(
                self.save_path,
                f'checkpoint/model_{opt.starting_epoch-1}.pt')
            model = Model.load_from_file(checkpoint_path)

        # move both sub-networks to the requested device
        for module in (model.network.encoder, model.network.decoder):
            module.to(device)
        return model
    def __init__(self, opt):
        """
        Set up the output directory path, the module-level logger, the
        vocabulary and the tokenizer.

        :param opt: options object; save_directory, test_file_name, epoch and
            data_path are read from it
        """
        # LOG is rebound at module level so code outside this method sees it
        global LOG

        self.save_path = os.path.join(
            'experiments',
            opt.save_directory,
            opt.test_file_name,
            f'evaluation_{opt.epoch}',
        )

        log_file = os.path.join(self.save_path, 'generate.log')
        LOG = ul.get_logger(name="generate", log_path=log_file)
        LOG.info(opt)
        LOG.info("Save directory: {}".format(self.save_path))

        # The vocabulary is unpickled from vocab.pkl inside opt.data_path
        vocab_file = os.path.join(opt.data_path, 'vocab.pkl')
        with open(vocab_file, "rb") as handle:
            self.vocab = pkl.load(handle)

        self.tokenizer = mv.SMILESTokenizer()
    def validation_stat(self, dataloader, model, device, vocab):
        """
        Compute loss and exact-match accuracy over a dataloader.

        The encoder and decoder are put in eval mode (and left in it). For
        each batch the loss from model.loss_step is accumulated, sequences
        are sampled with model.greedy_sample, and each sampled string is
        compared with the string decoded from the corresponding row of the
        decoder output. Both model calls run under torch.no_grad() so no
        autograd state is kept during validation.
        :param dataloader: iterable with a length, yielding 7-element batches
        :param model: model exposing network.encoder, network.decoder,
            loss_step and greedy_sample
        :param device: device the batch tensors are moved to
        :param vocab: vocabulary whose decode() turns ids into tokens
        :return: (summed loss / number of non-padding decoder-output tokens,
            fraction of rows whose sampled string equals the target string)
        :raises ZeroDivisionError: if the dataloader yields no batches
        """
        pad = cfgd.DATA_DEFAULT['padding_value']
        total_loss = 0
        total_tokens = 0
        n_correct = 0
        total_n_trg = 0
        tokenizer = mv.SMILESTokenizer()
        model.network.encoder.eval()
        model.network.decoder.eval()
        for batch in ul.progress_bar(dataloader, total=len(dataloader)):
            encoder_input, source_length, decoder_output, mask, _, max_length_target, _ = batch

            # Move to the target device; dimension 1 of the mask is dropped
            # when it has size 1
            encoder_input = encoder_input.to(device)
            decoder_output = decoder_output.to(device)
            source_length = source_length.to(device)
            mask = torch.squeeze(mask, 1).to(device)

            # Loss
            with torch.no_grad():
                loss_b_sq = model.loss_step(encoder_input, source_length, decoder_output, mask,
                                            max_length_target, device)
            # number of decoder-output tokens that are not padding
            ntokens = (decoder_output != pad).data.sum()
            total_tokens += ntokens
            total_loss += loss_b_sq.sum()

            # Sample using greedy, compute accuracy. Sampling is also done
            # without gradient tracking; the returned NLLs are not used here.
            with torch.no_grad():
                predicted_seqs, _ = model.greedy_sample(encoder_input, source_length, decoder_output,
                                                        mask, device)
            for j, seq in enumerate(predicted_seqs):
                target = tokenizer.untokenize(vocab.decode(decoder_output[j].cpu().numpy()))
                smi = tokenizer.untokenize(vocab.decode(seq.cpu().numpy()))
                if smi == target:
                    n_correct += 1
            total_n_trg += decoder_output.shape[0]
        accuracy = n_correct * 1.0 / total_n_trg
        loss = total_loss / total_tokens
        return loss, accuracy
    # add property name before property change; save to file
    property_condition = []
    for property_name in cfgd.PROPERTIES:
        if property_name == 'LogD':
            intervals, _ = property_change_encoder[property_name]
            property_condition.extend(intervals)
        else:
            intervals = property_change_encoder[property_name]
            for name in intervals:
                property_condition.append("{}_{}".format(property_name, name))
    LOG.info("Property condition tokens: {}".format(len(property_condition)))

    encoded_file = pdp.save_df_property_encoded(args.input_data_path, property_change_encoder, LOG)

    LOG.info("Building vocabulary")
    tokenizer = mv.SMILESTokenizer()
    smiles_list = pdp.get_smiles_list(args.input_data_path)
    vocabulary = mv.create_vocabulary(smiles_list, tokenizer=tokenizer, property_condition=property_condition)
    tokens = vocabulary.tokens()
    LOG.info("Vocabulary contains %d tokens: %s", len(tokens), tokens)

    # Save vocabulary to file
    parent_path = uf.get_parent_dir(args.input_data_path)
    output_file = os.path.join(parent_path, 'vocab.pkl')
    with open(output_file, 'wb') as pickled_file:
        pickle.dump(vocabulary, pickled_file)
    LOG.info("Save vocabulary to file: {}".format(output_file))

    # Split data into train, validation, test
    train, validation, test = pdp.split_data(encoded_file, LOG)