import os

from tqdm import tqdm

# Preprocessor and SmilesTokenizer are assumed to be importable from this
# project's own modules (import paths not shown in this excerpt).


def main(input_file, output_file, **kwargs):
    assert os.path.exists(input_file)
    assert not os.path.exists(output_file), f'{output_file} already exists.'
    print('finetune mode:', kwargs['finetune'])

    pp = Preprocessor()
    with open(input_file, 'r') as f:
        smiles = [l.rstrip() for l in f]
    print(f'input SMILES num: {len(smiles)}')

    print('start to clean up')
    pp_smiles = [pp.process(smi) for smi in tqdm(smiles)]
    print('Step 1 / 3 completed')

    # drop empty results and deduplicate
    cl_smiles = list(set([s for s in pp_smiles if s]))
    print('Step 2 / 3 completed')

    # token-length filter: keep SMILES with 34 to 128 tokens
    out_smiles = []
    print('initiating tokenizer')
    st = SmilesTokenizer()
    print('tokenizer initiated')

    if kwargs['finetune']:
        total = len(cl_smiles)
        print(f'filtering {total} SMILES for finetuning')
        count = 0
        skip_count = 0
        timeout_count = 0
        for cl_smi in cl_smiles:
            try:
                tokenized_smi = st.tokenize(cl_smi)
                if tokenized_smi == []:
                    # tokenizer returned nothing (e.g. hit its time limit)
                    timeout_count += 1
                elif 34 <= len(tokenized_smi) <= 128:
                    out_smiles.append(cl_smi)
            except Exception:
                skip_count += 1
            count += 1
            if count % 25000 == 0:
                print(f'{count} completed out of {total}. '
                      f'Skipped {skip_count}. Timed out {timeout_count}.')
    else:
        print('no token-length filter applied')
        out_smiles = cl_smiles
    print('Step 3 / 3 completed')

    print('done.')
    print(f'output SMILES num: {len(out_smiles)}')
    with open(output_file, 'w') as f:
        for smi in out_smiles:
            f.write(smi + '\n')
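# Usage sketch for the preprocessing entry point above. The CLI wiring and
# file names here are hypothetical; the real script may parse its arguments
# differently.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Clean and filter SMILES.')
    parser.add_argument('input_file', help='raw SMILES, one per line')
    parser.add_argument('output_file', help='cleaned SMILES, one per line')
    parser.add_argument('--finetune', action='store_true',
                        help='apply the 34-128 token-length filter')
    args = parser.parse_args()

    main(args.input_file, args.output_file, finetune=args.finetune)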
def build_model(self):
    st = SmilesTokenizer()
    n_table = len(st.table)
    weight_init = RandomNormal(mean=0.0, stddev=0.05, seed=self.config.seed)

    self.model = Sequential()
    # two stacked LSTM layers over one-hot encoded SMILES tokens
    self.model.add(
        LSTM(units=self.config.units,
             input_shape=(None, n_table),
             return_sequences=True,
             kernel_initializer=weight_init,
             dropout=0.3))
    self.model.add(
        LSTM(units=self.config.units,
             input_shape=(None, n_table),
             return_sequences=True,
             kernel_initializer=weight_init,
             dropout=0.5))
    # per-timestep softmax over the token table
    self.model.add(
        Dense(units=n_table,
              activation='softmax',
              kernel_initializer=weight_init))

    arch = self.model.to_json(indent=2)
    self.config.model_arch_filename = os.path.join(self.config.exp_dir,
                                                   'model_arch.json')
    with open(self.config.model_arch_filename, 'w') as f:
        f.write(arch)

    self.model.compile(optimizer=self.config.optimizer,
                       loss='categorical_crossentropy')
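# Shape sketch for the model built above (illustrative only): with
# return_sequences=True on both LSTM layers and a per-timestep softmax Dense,
# the network maps a batch of one-hot token sequences (batch, timesteps,
# n_table) to token probabilities of the same shape. `modeler` is a
# hypothetical instance of the class that owns build_model().
import numpy as np

modeler.build_model()
n_table = len(SmilesTokenizer().table)
dummy_batch = np.zeros((2, 40, n_table), dtype=np.float32)
probs = modeler.model.predict_on_batch(dummy_batch)
print(probs.shape)  # expected: (2, 40, n_table), softmax over the last axis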
import numpy as np
from tqdm import tqdm

# SmilesTokenizer is assumed to come from this project's tokenizer module.


class LSTMChemGenerator(object):
    def __init__(self, modeler):
        self.session = modeler.session
        self.model = modeler.model
        self.config = modeler.config
        self.st = SmilesTokenizer()

    def _generate(self, sequence):
        # extend the sequence one token at a time until the end token 'E'
        # is produced or the maximum SMILES length is reached
        while (sequence[-1] != 'E') and (len(self.st.tokenize(sequence)) <=
                                         self.config.smiles_max_length):
            x = self.st.one_hot_encode(self.st.tokenize(sequence))
            preds = self.model.predict_on_batch(x)[0][-1]
            next_idx = self.sample_with_temp(preds)
            sequence += self.st.table[next_idx]
        # drop the start token 'G' and any trailing end token 'E'
        sequence = sequence[1:].rstrip('E')
        return sequence

    def sample_with_temp(self, preds):
        # temperature-scaled sampling from the predicted token distribution
        stretched = np.log(preds) / self.config.sampling_temp
        stretched_probs = np.exp(stretched) / np.sum(np.exp(stretched))
        return np.random.choice(range(len(stretched)), p=stretched_probs)

    def sample(self, num=1, start='G'):
        sampled = []
        if self.session == 'generate':
            for _ in tqdm(range(num)):
                sampled.append(self._generate(start))
            return sampled
        else:
            # outside plain generation, keep only sequences that RDKit can
            # parse and return them in canonical form
            from rdkit import Chem, RDLogger
            RDLogger.DisableLog('rdApp.*')
            while len(sampled) < num:
                sequence = self._generate(start)
                mol = Chem.MolFromSmiles(sequence)
                if mol is not None:
                    canon_smiles = Chem.MolToSmiles(mol)
                    sampled.append(canon_smiles)
            return sampled
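# Usage sketch for LSTMChemGenerator (illustrative): `modeler` is assumed to
# expose .session, .model (a trained Keras model) and .config (with
# smiles_max_length and sampling_temp), as the constructor expects.
generator = LSTMChemGenerator(modeler)
smiles_list = generator.sample(num=100, start='G')
for smi in smiles_list[:5]:
    print(smi)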
def __init__(self, modeler, finetune_data_loader):
    self.session = modeler.session
    self.model = modeler.model
    self.config = modeler.config
    self.finetune_data_loader = finetune_data_loader
    self.st = SmilesTokenizer()
def __init__(self, modeler):
    self.session = modeler.session
    self.model = modeler.model
    self.config = modeler.config
    self.st = SmilesTokenizer()
import numpy as np
from tqdm import tqdm

# Sequence is the Keras Sequence base class (tensorflow.keras.utils.Sequence
# in the TF-bundled Keras); SmilesTokenizer is this project's tokenizer.


class DataLoader(Sequence):
    def __init__(self, config, data_type='train'):
        self.config = config
        self.data_type = data_type
        assert self.data_type in ['train', 'valid', 'finetune']

        self.max_len = 0
        if self.data_type == 'train':
            self.smiles = self._load(self.config.data_filename)
        elif self.data_type == 'finetune':
            self.smiles = self._load(self.config.finetune_data_filename)
        else:
            # a 'valid' loader is expected to reuse the data of an existing
            # 'train' loader (e.g. a copy with data_type switched), so
            # nothing is loaded here
            pass

        self.st = SmilesTokenizer()
        self.one_hot_dict = self.st.one_hot_dict

        self.tokenized_smiles = self._tokenize(self.smiles)

        if self.data_type in ['train', 'valid']:
            # reproducible shuffled train/validation split
            self.idx = np.arange(len(self.tokenized_smiles))
            self.valid_size = int(
                np.ceil(
                    len(self.tokenized_smiles) *
                    self.config.validation_split))
            np.random.seed(self.config.seed)
            np.random.shuffle(self.idx)

    def _set_data(self):
        if self.data_type == 'train':
            # training portion of the shuffled split
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[self.valid_size:]
            ]
        elif self.data_type == 'valid':
            # validation portion of the shuffled split
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[:self.valid_size]
            ]
        else:
            ret = self.tokenized_smiles
        return ret

    def _load(self, data_filename):
        length = self.config.data_length
        print('loading SMILES...')
        with open(data_filename) as f:
            smiles = [s.rstrip() for s in f]
        if length != 0:
            smiles = smiles[:length]
        print('done.')
        return smiles

    def _tokenize(self, smiles):
        assert isinstance(smiles, list)
        print('tokenizing SMILES...')
        tokenized_smiles = [self.st.tokenize(smi) for smi in tqdm(smiles)]

        if self.data_type == 'train':
            # record the longest tokenized SMILES for padding
            for tokenized_smi in tokenized_smiles:
                length = len(tokenized_smi)
                if self.max_len < length:
                    self.max_len = length
            self.config.train_smi_max_len = self.max_len
        print('done.')
        return tokenized_smiles

    def __len__(self):
        target_tokenized_smiles = self._set_data()
        if self.data_type in ['train', 'valid']:
            ret = int(
                np.ceil(
                    len(target_tokenized_smiles) /
                    float(self.config.batch_size)))
        else:
            ret = int(
                np.ceil(
                    len(target_tokenized_smiles) /
                    float(self.config.finetune_batch_size)))
        return ret

    def __getitem__(self, idx):
        target_tokenized_smiles = self._set_data()
        if self.data_type in ['train', 'valid']:
            data = target_tokenized_smiles[idx *
                                           self.config.batch_size:(idx + 1) *
                                           self.config.batch_size]
        else:
            data = target_tokenized_smiles[idx *
                                           self.config.finetune_batch_size:
                                           (idx + 1) *
                                           self.config.finetune_batch_size]
        data = self._padding(data)

        self.X, self.y = [], []
        for tp_smi in data:
            # next-token prediction: y is X shifted left by one token
            X = [self.one_hot_dict[symbol] for symbol in tp_smi[:-1]]
            self.X.append(X)
            y = [self.one_hot_dict[symbol] for symbol in tp_smi[1:]]
            self.y.append(y)

        self.X = np.array(self.X, dtype=np.float32)
        self.y = np.array(self.y, dtype=np.float32)

        # third element is a placeholder for Keras sample weights
        return self.X, self.y, [None]

    def _pad(self, tokenized_smi):
        # wrap with start 'G' / end 'E' tokens and pad with 'A' to max_len
        return ['G'] + tokenized_smi + ['E'] + [
            'A' for _ in range(self.max_len - len(tokenized_smi))
        ]

    def _padding(self, data):
        padded_smiles = [self._pad(t_smi) for t_smi in data]
        return padded_smiles
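# Usage sketch for DataLoader (illustrative): a 'valid' loader is derived from
# the 'train' loader by copying it and switching data_type, so both share the
# same shuffled index split. `config` and `modeler` are assumed to carry the
# attributes referenced above (data_filename, batch_size, seed, ...).
from copy import copy

train_dl = DataLoader(config, data_type='train')
valid_dl = copy(train_dl)
valid_dl.data_type = 'valid'

modeler.model.fit(train_dl,
                  epochs=10,
                  validation_data=valid_dl)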