Code example #1
File: datahelper.py  Project: Maryghysr/DeepDTA
def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Integer-encode a SELFIES string into a fixed-length vector of symbol indices."""
    X = np.zeros(MAX_SMI_LEN)
    smi_to_sel = list(sf.split_selfies(line))
    for i, ch in enumerate(smi_to_sel[:MAX_SMI_LEN]):  # truncate sequences longer than MAX_SMI_LEN
        X[i] = smi_ch_ind[ch]

    return X
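A minimal usage sketch for the function above, assuming selfies >= 1.0 imported as sf and numpy as np; the molecule, the ad-hoc 1-based vocabulary, and the MAX_SMI_LEN value are illustrative, not part of the original project.

import numpy as np
import selfies as sf

selfies_str = sf.encoder("CC(=O)Oc1ccccc1C(=O)O")   # aspirin, SMILES -> SELFIES
alphabet = sorted(sf.get_alphabet_from_selfies([selfies_str]))
smi_ch_ind = {symbol: idx + 1 for idx, symbol in enumerate(alphabet)}  # 1-based; 0 is left for padding

encoded = label_smiles(selfies_str, 100, smi_ch_ind)
print(encoded[:10])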
Code example #2
    def tokenize(self, selfie):
        """
        Convert a SELFIES string into a sequence of tokens.
        """
        tokens = list(sf.split_selfies(selfie))
        tokens = ["SOS"] + tokens + ["EOS"]
        return tokens
Code example #3
File: datahelper.py  Project: Maryghysr/DeepDTA
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SELFIES string; smi_ch_ind maps symbols to 1-based indices."""
    X = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    smi_to_sel = list(sf.split_selfies(line))
    for i, ch in enumerate(smi_to_sel[:MAX_SMI_LEN]):  # truncate sequences longer than MAX_SMI_LEN
        X[i, smi_ch_ind[ch] - 1] = 1  # shift the 1-based vocabulary index to a 0-based column

    return X
Code example #4
    def test_tokenize_selfies_match(self) -> None:
        """Test deprecated tokenize_selfies."""
        for smiles in ['c1cnoc1', '[O-][n+]1ccccc1S', 'c1snnc1-c1ccccn1']:
            transform = Selfies()
            selfies = transform(smiles)
            self.assertListEqual(
                tokenize_selfies(selfies), list(sf.split_selfies(selfies))
            )
Code example #5
    def tokenize_smiles(self, mol):
        """Split a SELFIES string into symbols and return integer token ids,
        wrapped with <sos>/<eos>. The SMILES -> SELFIES conversion (sf.encoder)
        is assumed to happen before this call."""
        ints = [self.vocab['<sos>']]

        selfies_list = list(sf.split_selfies(mol))
        for token in selfies_list:
            ints.append(self.vocab[token])

        ints.append(self.vocab['<eos>'])

        return ints
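The method above relies on a token-to-id mapping in self.vocab. The project's actual vocabulary construction is not shown; a plausible sketch, with the special tokens and molecules below being assumptions, looks like this:

import selfies as sf

corpus = [sf.encoder(s) for s in ["CCO", "CC(=O)O", "c1ccccc1"]]   # illustrative molecules
symbols = sorted(sf.get_alphabet_from_selfies(corpus))
vocab = {tok: i for i, tok in enumerate(['<sos>', '<eos>', '<pad>'] + symbols)}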
Code example #6
def get_smiles_length(input_file, tokenizer):
    molecule_length = []
    end_token_id = tokenizer.encode('<end>').ids[0]
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip()
            # count the tokens produced by the trained tokenizer, stopping at <end>
            encoding = tokenizer.encode(l)
            cur_molecule_length = 0
            for token_id in encoding.ids:
                if token_id == end_token_id:
                    break
                cur_molecule_length += 1
            # alternative count over SELFIES symbols (only meaningful if the line is a SELFIES string):
            # cur_molecule_length = len(list(sf.split_selfies(l)))
            molecule_length.append(cur_molecule_length)
    return molecule_length
Code example #7
def get_selfies_length(input_file, remove_null):
    molecule_length = []
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip()
            cur_molecule_length = len(list(sf.split_selfies(l)))
            if remove_null:
                if cur_molecule_length > 0:
                    molecule_length.append(cur_molecule_length)
            else:
                molecule_length.append(cur_molecule_length)
    print("######################################")
    print(
        "The tokenized caption lengths using SELFIES for {}.\n"
        "The statistics are as follows: Average: {}, Min: {}, Max: {}, Median: {}, "
        "5th percentile: {}, 95th percentile: {}, 50th percentile: {}.".format(
            input_file,
            np.average(molecule_length), np.min(molecule_length),
            np.max(molecule_length), np.median(molecule_length),
            np.percentile(molecule_length, 5),    # np.percentile expects values in [0, 100]
            np.percentile(molecule_length, 95),
            np.percentile(molecule_length, 50)))
    print("######################################")
Code example #8
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single selfies string to a one-hot encoding.
    """

    symbol_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with [nop]
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # integer encode
    symbol_list = sf.split_selfies(selfie)
    integer_encoded = [symbol_to_int[symbol] for symbol in symbol_list]

    # one hot-encode the integer encoded selfie
    onehot_encoded = list()
    for index in integer_encoded:
        letter = [0] * len(alphabet)
        letter[index] = 1
        onehot_encoded.append(letter)

    return integer_encoded, np.array(onehot_encoded)
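A short driver sketch for selfies_to_hot, assuming selfies >= 1.0 as sf and numpy as np; the dataset is illustrative. Note the alphabet must contain the '[nop]' padding symbol used above.

import numpy as np
import selfies as sf

dataset = [sf.encoder(s) for s in ["CCO", "c1ccccc1", "CC(=O)O"]]
alphabet = sf.get_alphabet_from_selfies(dataset)
alphabet.add("[nop]")                                  # padding symbol expected by selfies_to_hot
alphabet = sorted(alphabet)
largest_selfie_len = max(sf.len_selfies(s) for s in dataset)

ints, onehot = selfies_to_hot(dataset[0], largest_selfie_len, alphabet)
print(onehot.shape)                                    # (largest_selfie_len, len(alphabet))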
Code example #9
def create_tokenized_smiles_json(tokenizer, data_dir, split, config_output_name, max_length, label_filename, idx2selfies, selfies2idx):
    data = {"images" : []}
    start_token_id = tokenizer.encode('<start>').ids[0]
    end_token_id = tokenizer.encode('<end>').ids[0]
    pad_token_id = tokenizer.get_vocab_size()
    print("The <start> token id is:{}\nThe <end> token id is:{}\nThe <pad> token id is {}".format(start_token_id, end_token_id, pad_token_id)) 
    with open(os.path.join(data_dir, label_filename), "r") as f:
        for i, l in enumerate(tqdm(f)):
            try:
                smiles, idx = l.strip().split("\t")
                encoding = tokenizer.encode(smiles)
                selfies = sf.encoder(smiles)
                if selfies is None:
                    selfies = ''
                selfies_cap = [selfies2idx['[start]']] + [selfies2idx[i] for i in sf.split_selfies(selfies)]
                if len(selfies_cap) > max_length-1:
                    selfies_cap = selfies_cap[:max_length-1]
                selfies_cap = selfies_cap + [selfies2idx['[end]']]
                selfies_len = len(selfies_cap)
                selfies_len_orig = selfies_len
                while selfies_len < max_length:
                    selfies_cap = selfies_cap + [selfies2idx['[pad]']]
                    selfies_len += 1
                encodingids = encoding.ids
                encodingids = [start_token_id] + encodingids[:-1]  # prepend <start> and drop the last id so the sequence stays at the fixed length (150)
                cap_len = max_length
                for j in range(0, len(encodingids)):
                    if encodingids[j] == pad_token_id:
                        cap_len = j+1
                        encodingids[j] = end_token_id
                        break
                current_sample = {
                    "filepath": data_dir, "filename": "{}".format(idx), "imgid": 0, "split": split,
                    "sentences": [{"tokens": encoding.tokens, "raw": smiles, "ids": encodingids,
                                   "length": cap_len, "selfies_raw": selfies, "selfies_ids": selfies_cap,
                                   "selfies_length": selfies_len_orig}],
                }  # note: if image augmentation ever happens, a sentence id token is needed; see the MSCOCO json for an example
                data["images"].append(current_sample)
            except Exception:
                # skip malformed or unparsable lines instead of aborting the whole run
                pass
    pickle.dump(data, open(os.path.join(data_dir, config_output_name),'wb'))
    del data
Code example #10
def sf_tokenizer(sfi):
    return ' '.join("." if tok == "." else tok[1:-1]
                    for tok in sf.split_selfies(sfi))
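For reference (illustrative input, not from the original project), the tokenizer above strips the surrounding brackets from every symbol and keeps the '.' fragment separator as-is:

import selfies as sf

print(sf_tokenizer(sf.encoder("CCO")))   # expected: "C C O" with selfies >= 1.0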
Code example #11
    def __call__(self, smi):
        tokens = selfies.split_selfies(smi)
        return ' '.join(tokens)
Code example #12
    def test_selfies_split(self) -> None:
        """Test tokenization by selfies package has not changed."""
        benzene = 'c1ccccc1'
        encoded_selfies = sf.encoder(benzene)
        # '[c][c][c][c][c][c][Ring1][Branch1_1]' v0.2.4
        # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' v1.0.2 (no aromatic)

        # sf.split_selfies returns generator
        symbols_benzene = list(sf.split_selfies(encoded_selfies))
        # before selfies 2.0.0 the last token is [Branch1_2]
        self.assertListEqual(
            symbols_benzene,
            ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'],
        )

        for smiles, ground_truth in [
            (
                'c1cnoc1',
                # before selfies 2.0.0 the last two tokens are [Expl=Ring1] and [Branch1_1]
                ['[C]', '[C]', '[=N]', '[O]', '[C]', '[=Ring1]', '[Branch1]'],
            ),
            (
                '[O-][n+]1ccccc1S',
                # before selfies 2.0.0 it is: [O-expl], [N+expl] and [=Branch1_2]
                [
                    '[O-1]',
                    '[N+1]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[Ring1]',
                    '[=Branch1]',
                    '[S]',
                ],
            ),
            (
                'c1snnc1-c1ccccn1',
                # before selfies 2.0.0 it is: [Expl=Ring1], [Branch1_1] and [Branch1_2]
                [
                    '[C]',
                    '[S]',
                    '[N]',
                    '[=N]',
                    '[C]',
                    '[=Ring1]',
                    '[Branch1]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=N]',
                    '[Ring1]',
                    '[=Branch1]',
                ],
            ),
        ]:
            self.assertListEqual(
                # list wrapping version
                split_selfies(sf.encoder(smiles)),
                ground_truth,
            )
Code example #13
def tokenzie_smile(smi):
    tokens = selfies.split_selfies(smi)
    return ' '.join(tokens)
Code example #14
def test_split_selfies(test_cases):
    for s, (_, symbols) in test_cases.items():
        assert list(sf.split_selfies(s)) == symbols
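The fixture for the test above is not shown; an illustrative shape that satisfies it (an assumption, not the project's actual data) is:

import selfies as sf

test_cases = {
    "[C][C][O]": (None, ["[C]", "[C]", "[O]"]),        # SELFIES string -> (unused, expected symbols)
}
test_split_selfies(test_cases)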
Code example #15
    def _tokenize_selfies(self, selfies_string):
        return list(sf.split_selfies(selfies_string))
Code example #16
if __name__ == "__main__":

    # 6.655716427238491
    vocab = get_arpa_vocab(
        '../resources/chemts_250k_selfies_klm_10gram_210908.arpa')
    lm = KenLMSELFIESLanguageModel(
        '../resources/chemts_250k_selfies_klm_10gram_210908.klm', vocab)

    def smiles_to_selfies(smiles):
        canonical = pybel.readstring("smi", smiles).write("can").strip()
        return sf.encoder(canonical)

    with open("../resources/zinc12_enaminebb_smiles_corpus.txt") as f:
        all_smiles = [s.strip() for s in f.readlines()]

    # the sum of log10 probs of each sentence in the corpus
    sum_log_prob = 0.0

    # the total number of "words" (i.e. tokens) in the corpus
    M = 0

    for smiles in all_smiles:
        s = smiles_to_selfies(smiles)
        if s is None:
            continue  # skip molecules that SELFIES could not encode
        tokens = list(sf.split_selfies(s))
        M += len(tokens)
        sum_log_prob += lm.log_prob(' '.join(tokens))

    perplexity = 10**(-sum_log_prob / M)  # log probs are in base 10

    print(perplexity)
Code example #17
    logger.info("best generated text: %s" % generated_text)
    decoded = SELFIESLanguageModelUtils.decode(generated_text,
                                               start='<s>',
                                               end='</s>')
    smiles = SELFIESLanguageModelUtils.sanitize(decoded)
    logger.info("best SMILES: %s, J: %s (%s seconds)" %
                (smiles, scorer.score(smiles), str((end - start))))

    log_top_best(all_smiles, 5, logger)

    logger.info("writing dataset...")
    name = 'molexit-%d' % n
    dataset = '../models/molexit/%s.txt' % name
    with open(dataset, 'w') as f:
        for smi in list(
                reversed(sorted(all_smiles.items(),
                                key=lambda kv: kv[1][0])))[:keep_top_n]:
            ssmi = smiles_to_selfies(smi[0].strip())
            if ssmi is None:
                logger.info("WARNING: could not convert: %s" % smi[0].strip())
                continue
            tokens = sf.split_selfies(ssmi)
            f.write(' '.join(tokens))
            f.write("\n")

    logger.info('training new LM...')
    lm_trainer.train(10, dataset, '../models/molexit', name)

    vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
    lm = KenLMSELFIESLanguageModel('../models/molexit/%s.klm' % name, vocab)
Code example #18
File: test_selfies_utils.py  Project: ncfrey/selfies
def test_split_selfies(dataset):
    for entry in dataset[0]:
        assert list(sf.split_selfies(entry.selfies)) == entry.symbols