def encode(self, chunk, use_random=False):
    """
    Encode a batch of molecules into the mean of their latent distribution.

    Args:
        chunk: a non-empty list of `n` items. When the first item is a
            string the whole list is treated as SMILES and parsed with
            `self.grammar`; otherwise the list is used as-is as the
            already-parsed tree list. Only `chunk[0]` is inspected, so an
            empty list raises IndexError.
        use_random: not read by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        A numpy array of dtype np.float32, of shape (n, latent_dim)

    Note:
        Each row should be the *mean* of the latent space distribution
        rather than a sampled point from that distribution. (It can be
        anything as long as it fits what self.decode expects.) The second
        value returned by the encoder is discarded.
    """
    # Parse exactly once. An earlier per-SMILES parsing loop produced a list
    # that was immediately overwritten here, so string input was parsed twice
    # and pre-parsed input crashed before reaching the `else` branch.
    # NOTE(review): assumes `parse` performs the same per-SMILES validation
    # the removed loop did -- confirm against its definition.
    if isinstance(chunk[0], str):
        cfg_tree_list = parse(chunk, self.grammar)
    else:
        cfg_tree_list = chunk

    onehot, _ = batch_make_att_masks(
        cfg_tree_list,
        self.tree_decoder,
        self.onehot_walker,
        dtype=np.float32,
    )

    # Swap the last two axes before handing the batch to the encoder.
    x_inputs = np.transpose(onehot, [0, 2, 1])
    x_inputs = paddle.to_tensor(x_inputs)

    z_mean, _ = self.ae.encoder(x_inputs)
    return z_mean.numpy()
def parse_single(smiles, grammar):
    """
    Parse one SMILES string and convert the result to a mol tree.

    Args:
        smiles: the SMILES string handed to `parser.parse`.
        grammar: the grammar object handed to `parser.parse`.

    Returns:
        The value of `AnnotatedTree2MolTree` applied to the single parse
        result.

    Raises:
        AssertionError: if `parser.parse` does not return a list holding
            exactly one element.
    """
    parsed = parser.parse(smiles, grammar)
    # Exactly one parse result is expected for the input.
    assert isinstance(parsed, list) and len(parsed) == 1
    return AnnotatedTree2MolTree(parsed[0])
def parse_smiles_with_cfg(smiles_file, grammar_file):
    """
    Parse every line of a SMILES file into a mol tree.

    Args:
        smiles_file: path of a text file read line by line; each line is
            stripped of surrounding whitespace and parsed as one SMILES.
        grammar_file: passed to `parser.Grammar` to build the grammar.

    Returns:
        A list with one `AnnotatedTree2MolTree` result per line, in file
        order.

    Raises:
        AssertionError: if `parser.parse` does not return a list holding
            exactly one element for some line.
    """
    grammar = parser.Grammar(grammar_file)
    mol_trees = []
    with open(smiles_file, 'r') as handle:
        # tqdm wraps the file iterator to report progress over its lines.
        for line in tqdm(handle):
            parsed = parser.parse(line.strip(), grammar)
            assert isinstance(parsed, list) and len(parsed) == 1
            mol_trees.append(AnnotatedTree2MolTree(parsed[0]))
    return mol_trees
def process_chunk(smiles_list):
    """
    Turn a list of SMILES strings into one-hot encodings and masks.

    The grammar is loaded from `cmd_args.grammar_file` on every call, and a
    fresh `OnehotBuilder` and tree decoder are created for the batch.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        A tuple `(onehot, masks)` as produced by `batch_make_att_masks`
        called with `dtype=np.byte`.

    Raises:
        AssertionError: if `parser.parse` does not return a list holding
            exactly one element for some SMILES.
    """
    grammar = parser.Grammar(cmd_args.grammar_file)

    def _to_mol_tree(smiles):
        # Parse one SMILES and require exactly one parse result.
        parsed = parser.parse(smiles, grammar)
        assert isinstance(parsed, list) and len(parsed) == 1
        return AnnotatedTree2MolTree(parsed[0])

    mol_trees = [_to_mol_tree(smiles) for smiles in smiles_list]

    onehot_walker = OnehotBuilder()
    decoder = create_tree_decoder()
    onehot, masks = batch_make_att_masks(
        mol_trees, decoder, onehot_walker, dtype=np.byte
    )
    return (onehot, masks)
def parse_smiles_with_cfg(smiles_file, grammar_file):
    """
    Parse a SMILES file and return the strings, mol trees and raw parse trees.

    Args:
        smiles_file: path of a text file read line by line; each line is
            stripped of surrounding whitespace and parsed as one SMILES.
        grammar_file: passed to `parser.Grammar` to build the grammar.

    Returns:
        A tuple `(smiles_list, cfg_tree_list, annotated_trees)` of three
        parallel lists in file order: the stripped SMILES strings, the
        `AnnotatedTree2MolTree` results, and the raw parse trees.

    Raises:
        AssertionError: if `parser.parse` does not return a list holding
            exactly one element, or if `get_smiles_from_tree` does not
            reproduce the input SMILES exactly.
    """
    # Honour the `grammar_file` argument. It used to be ignored in favour of
    # the global `cmd_args.grammar_file`, so callers could not choose the
    # grammar.
    grammar = parser.Grammar(grammar_file)

    smiles_list = []
    cfg_tree_list = []
    annotated_trees = []
    with open(smiles_file, 'r') as f:
        # tqdm wraps the file iterator to report progress over its lines.
        for row in tqdm(f):
            smiles = row.strip()
            smiles_list.append(smiles)

            ts = parser.parse(smiles, grammar)
            assert isinstance(ts, list) and len(ts) == 1
            annotated_trees.append(ts[0])

            n = AnnotatedTree2MolTree(ts[0])
            cfg_tree_list.append(n)

            # Round-trip check: the tree must regenerate the exact input.
            st = get_smiles_from_tree(n)
            assert st == smiles
    return (smiles_list, cfg_tree_list, annotated_trees)
h = self.w1(flatten) h = F.relu(h) z_mean = self.mean_w(h) z_log_var = self.log_var_w(h) return (z_mean, z_log_var) if __name__ == '__main__': smiles_list = ['N\SCPP#IOS', 'CP\P', 'PINI'] cfg_trees = [] cfg_onehots = [] grammar = parser.Grammar(cmd_args.grammar_file) for smiles in smiles_list: ts = parser.parse(smiles, grammar) assert isinstance(ts, list) and len(ts) == 1 n = AnnotatedTree2MolTree(ts[0]) cfg_trees.append(n) cfg_onehots.append(AnnotatedTree2Onehot(ts[0], 50)) cfg_onehots = np.stack(cfg_onehots, axis=0) encoder = CNNEncoder(max_len=50, latent_dim=64) if cmd_args.mode == 'gpu': encoder.cuda() z = encoder(cfg_onehots) print(z[0].size())