Esempio n. 1
0
    def encode(self, chunk, use_random=False):
        """
        Encode a batch of molecules into latent vectors.

        Args:
            chunk: a non-empty list of `n` items. If the first item is a
                string, the whole list is treated as SMILES strings and is
                parsed with `self.grammar`; otherwise the list is used as-is
                (assumed to hold already-parsed trees -- TODO confirm the
                exact type expected by batch_make_att_masks).
            use_random: not read by this method; the encoder's first output
                (the mean) is always returned.

        Returns:
            A numpy array of dtype np.float32, of shape (n, latent_dim).
            Note: Each row should be the *mean* of the latent space
            distribution rather than a sampled point from that distribution.
            (It can be anything as long as it fits what self.decode expects.)
        """
        # Parse only once, and only when the input is raw SMILES. Parsing
        # unconditionally beforehand would duplicate the work for strings
        # and fail outright for pre-parsed input.
        if isinstance(chunk[0], str):
            cfg_tree_list = parse(chunk, self.grammar)
        else:
            cfg_tree_list = chunk

        # The second return value (the masks) is not needed for encoding.
        onehot, _ = batch_make_att_masks(cfg_tree_list,
                                         self.tree_decoder,
                                         self.onehot_walker,
                                         dtype=np.float32)

        # Swap the last two axes before handing the batch to the encoder.
        x_inputs = np.transpose(onehot, [0, 2, 1])

        x_inputs = paddle.to_tensor(x_inputs)
        # The encoder returns a pair; only the first element is used here.
        z_mean, _ = self.ae.encoder(x_inputs)

        return z_mean.numpy()
Esempio n. 2
0
def parse_single(smiles, grammar):
    """
    Parse one SMILES string and convert its parse tree.

    Args:
        smiles: the SMILES string handed to parser.parse.
        grammar: the grammar object handed to parser.parse.

    Returns:
        The result of AnnotatedTree2MolTree applied to the single parse tree.

    Raises:
        AssertionError: if parser.parse does not return a list holding
            exactly one element.
    """
    parses = parser.parse(smiles, grammar)
    # Exactly one parse is expected for the input.
    assert isinstance(parses, list) and len(parses) == 1
    return AnnotatedTree2MolTree(parses[0])
Esempio n. 3
0
def parse_smiles_with_cfg(smiles_file, grammar_file):
    """
    Parse every line of a SMILES file with the given grammar.

    Args:
        smiles_file: path of a text file read line by line; each line is
            stripped of surrounding whitespace and parsed as one SMILES.
        grammar_file: passed to parser.Grammar to build the grammar.

    Returns:
        A list with one AnnotatedTree2MolTree result per line, in file order.

    Raises:
        AssertionError: if parser.parse does not return a list holding
            exactly one element for some line.
    """
    cfg = parser.Grammar(grammar_file)
    mol_trees = []

    with open(smiles_file, 'r') as handle:
        # tqdm wraps the file iterator to report progress per line.
        for line in tqdm(handle):
            parses = parser.parse(line.strip(), cfg)
            assert isinstance(parses, list) and len(parses) == 1
            mol_trees.append(AnnotatedTree2MolTree(parses[0]))

    return mol_trees
Esempio n. 4
0
def process_chunk(smiles_list):
    """
    Turn a list of SMILES strings into one-hot encodings and masks.

    The grammar is built from cmd_args.grammar_file on every call.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        The (onehot, masks) pair produced by batch_make_att_masks with
        dtype=np.byte.

    Raises:
        AssertionError: if parser.parse does not return a list holding
            exactly one element for some SMILES.
    """
    cfg = parser.Grammar(cmd_args.grammar_file)

    mol_trees = []
    for item in smiles_list:
        parses = parser.parse(item, cfg)
        assert isinstance(parses, list) and len(parses) == 1
        mol_trees.append(AnnotatedTree2MolTree(parses[0]))

    # Build the walker before the decoder, as the two constructors may not
    # be independent of each other.
    onehot_builder = OnehotBuilder()
    decoder = create_tree_decoder()
    onehot, masks = batch_make_att_masks(
        mol_trees, decoder, onehot_builder, dtype=np.byte)

    return (onehot, masks)
Esempio n. 5
0
def parse_smiles_with_cfg(smiles_file, grammar_file):
    """
    Parse a SMILES file and verify that every tree round-trips to its input.

    Args:
        smiles_file: path of a text file read line by line; each line is
            stripped of surrounding whitespace and parsed as one SMILES.
        grammar_file: passed to parser.Grammar to build the grammar. When it
            is falsy (e.g. None or ''), cmd_args.grammar_file is used instead.

    Returns:
        A tuple (smiles_list, cfg_tree_list, annotated_trees) of three
        parallel lists in file order: the stripped SMILES strings, the
        AnnotatedTree2MolTree results, and the raw parse trees.

    Raises:
        AssertionError: if parser.parse does not return a list holding
            exactly one element, or if get_smiles_from_tree does not
            reproduce the original SMILES string.
    """
    # Honor the caller-supplied grammar file rather than always reading the
    # global command-line value; keep the global only as a fallback.
    grammar = parser.Grammar(grammar_file or cmd_args.grammar_file)

    smiles_list = []
    cfg_tree_list = []
    annotated_trees = []
    with open(smiles_file, 'r') as f:
        for row in tqdm(f):
            smiles = row.strip()
            smiles_list.append(smiles)
            ts = parser.parse(smiles, grammar)
            assert isinstance(ts, list) and len(ts) == 1
            annotated_trees.append(ts[0])
            n = AnnotatedTree2MolTree(ts[0])
            cfg_tree_list.append(n)

            # Sanity check: the converted tree must reproduce the input.
            st = get_smiles_from_tree(n)
            assert st == smiles

    return (smiles_list, cfg_tree_list, annotated_trees)
Esempio n. 6
0
        h = self.w1(flatten)
        h = F.relu(h)

        z_mean = self.mean_w(h)
        z_log_var = self.log_var_w(h)

        return (z_mean, z_log_var)


if __name__ == '__main__':
    # Smoke test: parse a few sample strings, one-hot encode them and run
    # them through a freshly constructed CNNEncoder.

    # Raw strings keep the literal backslashes without relying on invalid
    # escape sequences (`\S`, `\P`), which modern Python warns about.
    smiles_list = [r'N\SCPP#IOS', r'CP\P', 'PINI']

    cfg_trees = []
    cfg_onehots = []
    grammar = parser.Grammar(cmd_args.grammar_file)
    for smiles in smiles_list:
        ts = parser.parse(smiles, grammar)
        assert isinstance(ts, list) and len(ts) == 1
        n = AnnotatedTree2MolTree(ts[0])
        cfg_trees.append(n)
        # 50 matches the max_len given to CNNEncoder below.
        cfg_onehots.append(AnnotatedTree2Onehot(ts[0], 50))

    # Stack the per-sample encodings along a new leading axis.
    cfg_onehots = np.stack(cfg_onehots, axis=0)

    encoder = CNNEncoder(max_len=50, latent_dim=64)
    if cmd_args.mode == 'gpu':
        encoder.cuda()
    z = encoder(cfg_onehots)
    # Print the size of the first element of the encoder output.
    print(z[0].size())