def main():
    """Build a vocabulary from the input SMILES file, construct a model and save it."""
    args = parse_args()
    smiles_list = uc.read_smi_file(args.input_smiles_path)

    LOG.info("Building vocabulary")
    smiles_tokenizer = mv.SMILESTokenizer()
    vocab = mv.create_vocabulary(smiles_list, tokenizer=smiles_tokenizer)
    vocab_tokens = vocab.tokens()
    LOG.info("Vocabulary contains %d tokens: %s", len(vocab_tokens), vocab_tokens)

    # Every network hyper-parameter is read from the command-line argument of the same name.
    param_names = (
        "num_layers",
        "layer_size",
        "cell_type",
        "embedding_layer_size",
        "dropout",
    )
    network_params = {name: getattr(args, name) for name in param_names}

    model = mm.Model(
        no_cuda=True,
        vocabulary=vocab,
        tokenizer=smiles_tokenizer,
        network_params=network_params,
        max_sequence_length=args.max_sequence_length,
    )

    destination = args.output_model_path
    LOG.info("Saving model at %s", destination)
    model.save(destination)
def _initialize_dataloader(self, training_set):
    """
    Wraps the given SMILES in a dataset and returns a shuffling DataLoader over it.
    :param training_set: SMILES passed to the dataset as ``smiles_list``.
    :return: A DataLoader using ``self.batch_size`` and the dataset's ``collate_fn``.
    """
    smiles_dataset = md.Dataset(
        smiles_list=training_set,
        vocabulary=self.model.vocabulary,
        tokenizer=mv.SMILESTokenizer(),
    )
    loader_options = {
        "batch_size": self.batch_size,
        "shuffle": True,
        "collate_fn": md.Dataset.collate_fn,
    }
    return tud.DataLoader(smiles_dataset, **loader_options)
def _initialize_dataloader(self, path):
    """
    Reads the SMILES file at ``path`` and returns a DataLoader over it.
    :param path: Path handed to ``uc.read_smi_file``.
    :return: A DataLoader using the configured batch size and per-epoch shuffle flag.
    """
    smiles = uc.read_smi_file(path)
    smiles_dataset = md.Dataset(
        smiles_list=smiles,
        vocabulary=self._model.vocabulary,
        tokenizer=mv.SMILESTokenizer(),
    )
    loader_options = {
        "batch_size": self._batch_size,
        "shuffle": self._shuffle_each_epoch,
        "collate_fn": md.Dataset.collate_fn,
    }
    return torch.utils.data.DataLoader(smiles_dataset, **loader_options)
def run(self):
    """
    Performs the creation of the model.

    Builds a vocabulary from ``self._smiles_list``, instantiates the model and saves it
    inside a newly created folder below ``self._output_model_path``. The folder is named
    after the model name (the part before the first '.'); when that folder already exists
    a numeric suffix ``(0)``, ``(1)``, ... is appended until an unused path is found.

    Does nothing when ``self._already_run`` is truthy. On completion
    ``self._output_model_path`` is rebound to the path of the saved model file.
    """
    if self._already_run:
        return

    LOG.info("Building vocabulary")
    tokenizer = mv.SMILESTokenizer()
    vocabulary = mv.create_vocabulary(self._smiles_list, tokenizer=tokenizer)
    tokens = vocabulary.tokens()
    LOG.info("Vocabulary contains %d tokens: %s", len(tokens), tokens)

    LOG.info("Saving model at %s", self._output_model_path)
    network_params = {
        'num_layers': self._num_layers,
        'layer_size': self._layer_size,
        'embedding_layer_size': self._embedding_layer_size,
        'dropout': self._dropout,
        'memory_cells': self._memory_cells,
        'cell_size': self._cell_size,
        'read_heads': self._read_heads,
        'num_controller_layers': self._num_controller_layers,
        'controller_type': self._controller_type,
        'model_type': self._model_type
    }
    model = mm.Model(vocabulary=vocabulary, tokenizer=tokenizer, network_params=network_params,
                     model_type=self._model_type, max_sequence_length=self._max_sequence_length)

    model_folder = model.model_name.split('.')[0]
    base_folder_path = os.path.join(self._output_model_path, model_folder)

    # Always derive the candidate from the untouched base path. Stripping a fixed number
    # of characters from the previous candidate breaks as soon as the counter has more
    # than one digit (e.g. "name(10)" would turn into "name((11)").
    storage_folder_path = base_folder_path
    suffix = 0
    while os.path.exists(storage_folder_path):
        storage_folder_path = '%s(%d)' % (base_folder_path, suffix)
        suffix += 1

    os.makedirs(storage_folder_path)

    self._output_model_path = os.path.join(storage_folder_path, model.model_name)
    model.model_dir = storage_folder_path
    model.save(self._output_model_path)
    LOG.info('Model saved!')
    LOG.info(model.__dict__)
def _initialize_dataloader(self, path):
    """
    Reads the SMILES file at ``path`` and returns a DataLoader over it.

    Reading (standardize / randomize flags), batch size and shuffling are all taken
    from ``self._config``.
    :param path: Path handed to ``chem_smiles.read_smiles_file``.
    :return: A DataLoader over the resulting dataset.
    """
    config = self._config
    smiles = chem_smiles.read_smiles_file(
        path,
        standardize=config.standardize,
        randomize=config.randomize,
    )
    smiles_dataset = reinvent_dataset.Dataset(
        smiles_list=smiles,
        vocabulary=self._model.vocabulary,
        tokenizer=reinvent_vocabulary.SMILESTokenizer(),
    )
    return torch.utils.data.DataLoader(
        smiles_dataset,
        batch_size=config.batch_size,
        shuffle=config.shuffle_each_epoch,
        collate_fn=reinvent_dataset.Dataset.collate_fn,
    )
def run(self):
    """
    Carries out the creation of the model.

    A vocabulary is built from ``self._smiles_list``, a model is constructed with the
    configured network parameters and saved to ``self._output_model_path``.
    :return: The model that was created and saved.
    """
    smiles_tokenizer = voc.SMILESTokenizer()
    vocab = voc.create_vocabulary(self._smiles_list, tokenizer=smiles_tokenizer)

    model = reinvent.Model(
        no_cuda=True,
        vocabulary=vocab,
        tokenizer=smiles_tokenizer,
        network_params={
            'num_layers': self._num_layers,
            'layer_size': self._layer_size,
            'cell_type': self._cell_type,
            'embedding_layer_size': self._embedding_layer_size,
            'dropout': self._dropout,
            'layer_normalization': self._layer_normalization,
        },
        max_sequence_length=self._max_sequence_length,
    )
    model.save(self._output_model_path)
    return model
def load_from_file(cls, file_path, mode="train"):
    """
    Loads a model from a single file
    :param file_path: Path of the file where the model data was previously stored.
    :param mode: Mode forwarded to the Model constructor (default "train").
    :return: A new Model instance with the stored network weights loaded; any error
             raised while reading the file propagates to the caller.
    """
    # NOTE(review): torch.load unpickles the file - only load checkpoints from trusted sources.
    load_kwargs = {}
    if not torch.cuda.is_available():
        # Without CUDA, keep every storage where it was deserialized (CPU).
        load_kwargs["map_location"] = lambda storage, loc: storage
    checkpoint = torch.load(file_path, **load_kwargs)

    network_params = checkpoint.get("network_params", {})
    vocabulary = checkpoint['vocabulary']
    tokenizer = checkpoint.get('tokenizer', mv.SMILESTokenizer())
    max_sequence_length = checkpoint['max_sequence_length']

    model = Model(
        vocabulary=vocabulary,
        tokenizer=tokenizer,
        network_params=network_params,
        max_sequence_length=max_sequence_length,
        mode=mode,
    )
    model.network.load_state_dict(checkpoint["network"])
    return model
def load_from_file(cls, file_path: str, sampling_mode=False):
    """
    Loads a model from a single file
    :param file_path: input file path
    :param sampling_mode: when truthy, the network is put into evaluation mode.
    :return: new Model instance with the stored network weights loaded; any error
             raised while reading the file propagates to the caller.
    """
    # NOTE(review): torch.load unpickles the file - only load checkpoints from trusted sources.
    load_kwargs = {}
    if not torch.cuda.is_available():
        # Without CUDA, keep every storage where it was deserialized (CPU).
        load_kwargs["map_location"] = lambda storage, loc: storage
    checkpoint = torch.load(file_path, **load_kwargs)

    network_params = checkpoint.get("network_params", {})
    vocabulary = checkpoint['vocabulary']
    tokenizer = checkpoint.get('tokenizer', mv.SMILESTokenizer())
    max_sequence_length = checkpoint['max_sequence_length']

    model = Model(
        vocabulary=vocabulary,
        tokenizer=tokenizer,
        network_params=network_params,
        max_sequence_length=max_sequence_length,
    )
    model.network.load_state_dict(checkpoint["network"])

    if sampling_mode:
        model.network.eval()
    return model
def load_from_file(cls, file_path, sampling_mode=False):
    """
    Loads a model from a single file

    The model name is derived from the directory that contains the file (directory name
    plus ``Model.MODEL_EXTENSION``) and that directory is stored as the model dir.
    ``file_path`` is split on '/', so it is expected to use forward slashes.

    :param file_path: path of the saved model file, as a string
    :param sampling_mode: when truthy, the network is put into evaluation mode
    :return: new Model instance with the stored network weights loaded; any error
             raised while reading the file propagates to the caller
    """
    # NOTE(review): torch.load unpickles the file - only load checkpoints from trusted sources.
    if torch.cuda.is_available():
        save_dict = torch.load(file_path)
    else:
        # Without CUDA, keep every storage where it was deserialized (CPU).
        save_dict = torch.load(file_path, map_location=lambda storage, loc: storage)

    network_params = save_dict.get("network_params", {})

    path_parts = file_path.split('/')
    # name of the model is the name of the dir
    model_name = path_parts[len(path_parts) - 2] + Model.MODEL_EXTENSION
    # dropping the model file name, leaving only the dir path
    model_dir = '/'.join(path_parts[:-1])

    model = Model(
        vocabulary=save_dict['vocabulary'],
        tokenizer=save_dict.get('tokenizer', mv.SMILESTokenizer()),
        network_params=network_params,
        max_sequence_length=save_dict['max_sequence_length'],
        model_type=save_dict['model_type'],
        model_name=model_name,
        model_dir=model_dir,
    )
    model.network.load_state_dict(save_dict["network"])

    if sampling_mode:
        # A bare `torch.no_grad()` statement used to stand here. It only created and
        # discarded a context manager, so it never disabled gradient tracking and has
        # been removed. Callers that want gradient-free sampling must wrap their
        # sampling calls in `with torch.no_grad():`.
        model.network.eval()
    return model
def setUp(self):
    """Store a new SMILESTokenizer on the test instance as ``self.tokenizer``."""
    smiles_tokenizer = mv.SMILESTokenizer()
    self.tokenizer = smiles_tokenizer
def test_create(self):
    """The vocabulary built from the fixture SMILES list equals the fixture vocabulary."""
    smiles_tokenizer = mv.SMILESTokenizer()
    built_vocabulary = mv.create_vocabulary(
        smiles_list=tfv.SMILES_LIST,
        tokenizer=smiles_tokenizer,
    )
    expected_vocabulary = tfv.simple()
    self.assertEqual(built_vocabulary, expected_vocabulary)