Esempio n. 1
0
def main():
    """Build a vocabulary from the input SMILES file and save a new model.

    Reads the command-line arguments through ``parse_args()``, builds the
    vocabulary from the SMILES read at ``args.input_smiles_path``, creates
    a ``mm.Model`` with ``no_cuda=True`` and saves it to
    ``args.output_model_path``.
    """
    args = parse_args()
    smiles = uc.read_smi_file(args.input_smiles_path)

    LOG.info("Building vocabulary")
    smiles_tokenizer = mv.SMILESTokenizer()
    vocab = mv.create_vocabulary(smiles, tokenizer=smiles_tokenizer)

    vocab_tokens = vocab.tokens()
    LOG.info("Vocabulary contains %d tokens: %s",
             len(vocab_tokens), vocab_tokens)

    # Every network hyper-parameter is taken verbatim from the CLI argument
    # of the same name.
    param_names = (
        'num_layers',
        'layer_size',
        'cell_type',
        'embedding_layer_size',
        'dropout',
    )
    network_params = {name: getattr(args, name) for name in param_names}

    new_model = mm.Model(
        no_cuda=True,
        vocabulary=vocab,
        tokenizer=smiles_tokenizer,
        network_params=network_params,
        max_sequence_length=args.max_sequence_length,
    )

    LOG.info("Saving model at %s", args.output_model_path)
    new_model.save(args.output_model_path)
Esempio n. 2
0
 def _initialize_dataloader(self, training_set):
     """Wrap the training SMILES in a shuffling DataLoader.

     :param training_set: SMILES list passed to ``md.Dataset`` as ``smiles_list``.
     :return: A ``tud.DataLoader`` over the dataset that uses
              ``self.batch_size`` and ``md.Dataset.collate_fn``.
     """
     model_vocabulary = self.model.vocabulary
     smiles_tokenizer = mv.SMILESTokenizer()
     smiles_dataset = md.Dataset(
         smiles_list=training_set,
         vocabulary=model_vocabulary,
         tokenizer=smiles_tokenizer,
     )
     loader = tud.DataLoader(
         smiles_dataset,
         batch_size=self.batch_size,
         shuffle=True,
         collate_fn=md.Dataset.collate_fn,
     )
     return loader
Esempio n. 3
0
 def _initialize_dataloader(self, path):
     """Read a SMILES file and expose it through a DataLoader.

     :param path: Path of the SMILES file, read with ``uc.read_smi_file``.
     :return: A ``torch.utils.data.DataLoader`` configured with
              ``self._batch_size`` and ``self._shuffle_each_epoch``.
     """
     smiles = uc.read_smi_file(path)
     model_vocabulary = self._model.vocabulary
     smiles_tokenizer = mv.SMILESTokenizer()
     smiles_dataset = md.Dataset(
         smiles_list=smiles,
         vocabulary=model_vocabulary,
         tokenizer=smiles_tokenizer,
     )
     return torch.utils.data.DataLoader(
         smiles_dataset,
         batch_size=self._batch_size,
         shuffle=self._shuffle_each_epoch,
         collate_fn=md.Dataset.collate_fn,
     )
Esempio n. 4
0
    def run(self):
        """
        Creates the model and saves it inside a new, uniquely named folder.

        The vocabulary is built from ``self._smiles_list``. The model is then
        stored under ``<self._output_model_path>/<model folder>``, where the
        model folder is the model name up to its first dot. If that folder
        already exists, a numeric suffix ``(0)``, ``(1)``, ... is appended
        until an unused name is found.

        Side effects: creates the storage folder, overwrites
        ``self._output_model_path`` with the full path of the saved model
        file and sets ``model.model_dir`` to the storage folder.

        :return: None. Returns immediately when ``self._already_run`` is truthy.
        """
        # NOTE(review): this method never sets ``_already_run`` itself;
        # presumably the caller or another method does - confirm.
        if self._already_run:
            return

        LOG.info("Building vocabulary")
        tokenizer = mv.SMILESTokenizer()
        vocabulary = mv.create_vocabulary(self._smiles_list,
                                          tokenizer=tokenizer)

        tokens = vocabulary.tokens()
        LOG.info("Vocabulary contains %d tokens: %s", len(tokens), tokens)
        # At this point the path is still the parent output folder, not the
        # final model file.
        LOG.info("Saving model at %s", self._output_model_path)
        network_params = {
            'num_layers': self._num_layers,
            'layer_size': self._layer_size,
            'embedding_layer_size': self._embedding_layer_size,
            'dropout': self._dropout,
            'memory_cells': self._memory_cells,
            'cell_size': self._cell_size,
            'read_heads': self._read_heads,
            'num_controller_layers': self._num_controller_layers,
            'controller_type': self._controller_type,
            'model_type': self._model_type
        }
        model = mm.Model(vocabulary=vocabulary,
                         tokenizer=tokenizer,
                         network_params=network_params,
                         model_type=self._model_type,
                         max_sequence_length=self._max_sequence_length)

        model_folder = model.model_name.split('.')[0]
        base_folder_path = os.path.join(self._output_model_path,
                                        model_folder)

        # Always derive the candidate from the unsuffixed base path. Stripping
        # a fixed number of characters from the previous candidate breaks as
        # soon as the counter has more than one digit ("(10)" is 4 chars).
        storage_folder_path = base_folder_path
        suffix = 0
        while os.path.exists(storage_folder_path):
            storage_folder_path = '%s(%s)' % (base_folder_path, suffix)
            suffix += 1

        os.makedirs(storage_folder_path)
        self._output_model_path = os.path.join(storage_folder_path,
                                               model.model_name)
        model.model_dir = storage_folder_path

        model.save(self._output_model_path)
        LOG.info('Model saved!')
        LOG.info(model.__dict__)
Esempio n. 5
0
 def _initialize_dataloader(self, path):
     """Read a SMILES file according to the config and build its DataLoader.

     :param path: Path of the SMILES file, read with
                  ``chem_smiles.read_smiles_file`` using the ``standardize``
                  and ``randomize`` settings of ``self._config``.
     :return: A ``torch.utils.data.DataLoader`` using the configured
              ``batch_size`` and ``shuffle_each_epoch``.
     """
     config = self._config
     smiles = chem_smiles.read_smiles_file(
         path,
         standardize=config.standardize,
         randomize=config.randomize,
     )
     model_vocabulary = self._model.vocabulary
     smiles_tokenizer = reinvent_vocabulary.SMILESTokenizer()
     smiles_dataset = reinvent_dataset.Dataset(
         smiles_list=smiles,
         vocabulary=model_vocabulary,
         tokenizer=smiles_tokenizer,
     )
     return torch.utils.data.DataLoader(
         smiles_dataset,
         batch_size=config.batch_size,
         shuffle=config.shuffle_each_epoch,
         collate_fn=reinvent_dataset.Dataset.collate_fn,
     )
Esempio n. 6
0
    def run(self):
        """
        Builds the vocabulary, creates the model and saves it.

        :return: The newly created ``reinvent.Model``, after it has been
                 saved to ``self._output_model_path``.
        """
        smiles_tokenizer = voc.SMILESTokenizer()
        vocab = voc.create_vocabulary(self._smiles_list,
                                      tokenizer=smiles_tokenizer)

        # Each hyper-parameter lives on the instance under the same name
        # with a leading underscore.
        param_names = (
            'num_layers',
            'layer_size',
            'cell_type',
            'embedding_layer_size',
            'dropout',
            'layer_normalization',
        )
        network_params = {name: getattr(self, '_' + name)
                          for name in param_names}

        new_model = reinvent.Model(
            no_cuda=True,
            vocabulary=vocab,
            tokenizer=smiles_tokenizer,
            network_params=network_params,
            max_sequence_length=self._max_sequence_length,
        )
        new_model.save(self._output_model_path)
        return new_model
Esempio n. 7
0
    def load_from_file(cls, file_path, mode="train"):
        """
        Restores a model from a file written with ``torch`` serialization.

        :param file_path: Path of the file where the model data was previously stored.
        :param mode: Value forwarded to the ``Model`` constructor as ``mode``.
        :return: A new ``Model`` whose network weights were loaded from the file.
                 Errors from ``torch.load`` or from missing keys propagate.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # Without CUDA every tensor is mapped to its CPU storage.
        load_kwargs = {}
        if not torch.cuda.is_available():
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(file_path, **load_kwargs)

        network_params = checkpoint.get("network_params", {})
        vocabulary = checkpoint['vocabulary']
        tokenizer = checkpoint.get('tokenizer', mv.SMILESTokenizer())
        max_sequence_length = checkpoint['max_sequence_length']

        restored = Model(
            vocabulary=vocabulary,
            tokenizer=tokenizer,
            network_params=network_params,
            max_sequence_length=max_sequence_length,
            mode=mode,
        )
        restored.network.load_state_dict(checkpoint["network"])
        return restored
Esempio n. 8
0
    def load_from_file(cls, file_path: str, sampling_mode=False):
        """
        Restores a model from a file written with ``torch`` serialization.

        :param file_path: input file path
        :param sampling_mode: when truthy, ``eval()`` is called on the loaded network.
        :return: new ``Model`` instance; errors from ``torch.load`` or from
                 missing keys propagate to the caller.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # Without CUDA every tensor is mapped to its CPU storage.
        load_kwargs = {}
        if not torch.cuda.is_available():
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(file_path, **load_kwargs)

        network_params = checkpoint.get("network_params", {})
        vocabulary = checkpoint['vocabulary']
        tokenizer = checkpoint.get('tokenizer', mv.SMILESTokenizer())
        max_sequence_length = checkpoint['max_sequence_length']

        restored = Model(
            vocabulary=vocabulary,
            tokenizer=tokenizer,
            network_params=network_params,
            max_sequence_length=max_sequence_length,
        )
        restored.network.load_state_dict(checkpoint["network"])

        if not sampling_mode:
            return restored
        restored.network.eval()
        return restored
Esempio n. 9
0
    def load_from_file(cls, file_path, sampling_mode=False):
        """
        Loads a model from a single file.

        The model name is derived from the directory that contains the file
        (directory name plus ``Model.MODEL_EXTENSION``) and that directory is
        passed to the model as ``model_dir``.

        :param file_path: path of the saved model file, as a string
        :param sampling_mode: when truthy, ``eval()`` is called on the loaded
            network. This does NOT disable gradient tracking; callers that
            want gradient-free sampling must wrap their sampling code in
            ``with torch.no_grad():``.
        :return: new ``Model`` instance
        :raises KeyError: if the file lacks ``vocabulary``,
            ``max_sequence_length``, ``model_type`` or ``network``. Errors
            raised by ``torch.load`` also propagate.
        """
        # Local import keeps this block self-contained.
        import os

        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        if torch.cuda.is_available():
            save_dict = torch.load(file_path)
        else:
            # Without CUDA every tensor is mapped to its CPU storage.
            save_dict = torch.load(file_path, map_location=lambda storage, loc: storage)

        network_params = save_dict.get("network_params", {})

        # Use os.path instead of splitting on '/' so that paths using the
        # platform's native separator are handled as well.
        model_dir = os.path.dirname(file_path)

        # name of the model is the name of the dir
        model_name = os.path.basename(model_dir) + Model.MODEL_EXTENSION

        model = Model(
            vocabulary=save_dict['vocabulary'],
            tokenizer=save_dict.get('tokenizer', mv.SMILESTokenizer()),
            network_params=network_params,
            max_sequence_length=save_dict['max_sequence_length'],
            model_type=save_dict['model_type'],
            model_name=model_name,
            model_dir=model_dir,
        )

        model.network.load_state_dict(save_dict["network"])

        if sampling_mode:
            # A bare ``torch.no_grad()`` call has no effect (it only builds a
            # context manager that is then discarded), so it is not used here.
            model.network.eval()

        return model
Esempio n. 10
0
 def setUp(self):
     """Store a new ``mv.SMILESTokenizer`` on the test instance."""
     smiles_tokenizer = mv.SMILESTokenizer()
     self.tokenizer = smiles_tokenizer
Esempio n. 11
0
 def test_create(self):
     """The vocabulary built from the fixture SMILES equals ``tfv.simple()``."""
     built_vocabulary = mv.create_vocabulary(
         smiles_list=tfv.SMILES_LIST,
         tokenizer=mv.SMILESTokenizer(),
     )
     expected_vocabulary = tfv.simple()
     self.assertEqual(built_vocabulary, expected_vocabulary)