Example #1
def time_roundtrip(file_path: str, sample_size: int = -1):
    """Tests the amount of time it takes to encode and then decode an
    entire .txt file of SMILES strings <n> times. If <sample_size> is positive,
    then a random sample is taken from the file instead.
    """

    curr_dir = os.path.dirname(__file__)
    file_path = os.path.join(curr_dir, file_path)

    # load data
    with open(file_path, 'r') as file:
        smiles = [line.rstrip() for line in file.readlines()]
        smiles.pop(0)

        if sample_size > 0:
            smiles = random.sample(smiles, sample_size)
        selfies = list(map(sf.encoder, smiles))

    print(f"Timing {len(smiles)} SMILES from {file_path}")

    # time sf.encoder
    start = time.time()
    for s in smiles:
        sf.encoder(s)
    enc_time = time.time() - start
    print(f"--> selfies.encoder: {enc_time:0.7f}s")

    # time sf.decoder
    start = time.time()
    for s in selfies:
        sf.decoder(s)
    dec_time = time.time() - start
    print(f"--> selfies.decoder: {dec_time:0.7f}s")
Example #2
def test_standardized_alphabet():
    """Tests that equivalent SMILES atom symbols are translated into the
    same SELFIES atom symbol.
    """

    assert sf.encoder("[C][O][N][P][F]") == "[CH0][OH0][NH0][PH0][FH0]"
    assert sf.encoder("[Fe][Si]") == "[Fe][Si]"
    assert sf.encoder("[Fe++][Fe+2]") == "[Fe+2][Fe+2]"
    assert sf.encoder("[CH][CH1]") == "[CH1][CH1]"
Example #3
def get_selfie_and_smiles_encodings_for_dataset(filename_data_set_file_smiles):
    """
    Returns encoding, alphabet and length of largest molecule in SMILES and SELFIES, given a file containing SMILES molecules.
    input:
        csv file with molecules. The SMILES column must be named 'smiles'.
    output:
        - selfies encoding
        - selfies alphabet
        - longest selfies string
        - smiles encoding (equivalent to file content)
        - smiles alphabet (character based)
        - longest smiles string
    """

    df = pd.read_csv(filename_data_set_file_smiles)
    smiles_list = np.asanyarray(df.smiles)
    smiles_alphabet = list(set(''.join(smiles_list)))
    largest_smiles_len = len(max(smiles_list, key=len))
    selfies_list = []
    selfies_len = []
    print('--> Translating SMILES to SELFIES...')
    for individual_smile in smiles_list:
        individual_selfie = selfies.encoder(individual_smile)
        selfies_list.append(individual_selfie)
        # number of SELFIES symbols, counted via the '[' delimiters
        selfies_len.append(individual_selfie.count('['))
    selfies_alphabet_pre = list(set(''.join(selfies_list)[1:-1].split('][')))
    selfies_alphabet = ['[' + s + ']' for s in selfies_alphabet_pre]
    largest_selfies_len = max(selfies_len)
    print('Finished translating SMILES to SELFIES.')
    return (selfies_list, selfies_alphabet, largest_selfies_len,
            smiles_list, smiles_alphabet, largest_smiles_len)
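A usage sketch, assuming a hypothetical 'molecules.csv' with a 'smiles' column, plus the imports the function body relies on:

import numpy as np
import pandas as pd
import selfies

(selfies_list, selfies_alphabet, largest_selfies_len,
 smiles_list, smiles_alphabet, largest_smiles_len) = \
    get_selfie_and_smiles_encodings_for_dataset('molecules.csv')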
Example #4
def selfies(dir="../data/xyz/"):

    temp = xyz_to_smiles(dir)
    ret = []

    for i in temp:
        selfies_temp = encoder(i)
        ret.append(selfies_temp)
    ret = np.array(ret)

    return ret
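Assumed imports for the snippet above (xyz_to_smiles is the module's own XYZ-to-SMILES helper, not shown here):

import numpy as np
from selfies import encoder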
Example #5
def mol2string(mol):
    smiles = Chem.MolToSmiles(mol)

    if string_type == 'selfies':
        return encoder(smiles).split('][')

    if string_type == 'deepsmiles':
        string = converter.encode(smiles)
        return list(string)

    return list(smiles)
Example #6
def mol2string(mol):
    smiles = Chem.MolToSmiles(mol)

    if string_type == 'SELFIES':
        return encoder(smiles).split('][')

    if string_type == 'DeepSMILES':
        string = converter.encode(smiles)
        return list(string)

    return list(smiles)
Example #7
def generate_ops(smi):
    smi = smi.strip()
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        print(f"None value: {smi}\n")
        return []
    scaffold = Chem.Scaffolds.MurckoScaffold.GetScaffoldForMol(mol)
    frags = sg.get_next_murcko_fragments(scaffold)

    smi_selfies = selfies.encoder(smi)
    scaffold_selfies = selfies.encoder(Chem.MolToSmiles(scaffold))

    data = []
    data += [('SCAFFOLD', smi_selfies, scaffold_selfies),
             ('EXPAND', scaffold_selfies, smi_selfies)]
    for frag in frags:
        frag_selfies = selfies.encoder(Chem.MolToSmiles(frag))
        data.append(('LOWER', scaffold_selfies, frag_selfies))
        data.append(('UPPER', frag_selfies, scaffold_selfies))
    return data
Example #8
    def __init__(self, smiles_file, percentage, vocab):
        """
        smiles_file: path to the .smi file containing SMILES.
        percentage: percentage of the dataset to use.
        """
        super(SMILESDataset, self).__init__()
        assert (0 < percentage <= 1)

        self.percentage = percentage
        self.vocab = vocab

        # load an equal portion of data from each tranche
        self.data = self.read_smiles_file(smiles_file)
        print("total number of SMILES loaded: ", len(self.data))

        # convert the SMILES to SELFIES (encode once, then drop failures)
        if self.vocab.name == "selfies":
            encoded = (sf.encoder(x) for x in self.data)
            self.data = [s for s in encoded if s is not None]
            print("total number of valid SELFIES: ", len(self.data))
Example #9
def mol2string(mol):
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol, canonical=False)

    if string_type == 'selfies':
        return encoder(smiles).split('][')

    if string_type == 'deepsmiles':
        string = converter.encode(smiles)
        return list(string)
    
    return list(smiles)
Example #10
def create_annotations(input_file, output_file, idx2selfies, selfies2idx, max_length):
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip().split('\t')
            if len(l) > 1:
                smiles = l[0]
                img = l[1]
            try:
                # split into symbols, re-attaching the ']' that split() strips
                selfies = [i + ']' for i in sf.encoder(l[0]).split(']')[:-1]]
            except Exception:
                pass
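The manual split-on-']' tokenization above can be replaced by the library's own tokenizer; a minimal sketch, assuming selfies >= 1.0 where sf.split_selfies yields one bracketed symbol at a time:

import selfies as sf

selfies_str = sf.encoder('CCO')  # hypothetical input
tokens = list(sf.split_selfies(selfies_str))  # ['[C]', '[C]', '[O]']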
Example #11
def test_invalid_or_unsupported_smiles_encoder():
    malformed_smiles = [
        "",
        "(",
        "C(Cl)(Cl)CC[13C",
        "C(CCCOC",
        "C=(CCOC",
        "CCCC)",
        "C1CCCCC",
        "C(F)(F)(F)(F)(F)F",  # violates bond constraints
        "C=C1=CCCCCC1",  # violates bond constraints
        "CC*CC",  # uses wildcard
        "C$C",  # uses $ bond
        "S[As@TB1](F)(Cl)(Br)N",  # unrecognized chirality,
        "SOMETHINGWRONGHERE",
        "1243124124",
    ]

    for smiles in malformed_smiles:
        with pytest.raises(sf.EncoderError):
            sf.encoder(smiles)
Example #12
def selfies_scanner(*, parent_smiles: str):
    # get the parent mol set up properly with defined aromaticity
    parent_mol = Chem.MolFromSmiles(parent_smiles, sanitize=True)
    Chem.rdmolops.Kekulize(parent_mol)
    parent_smiles = Chem.MolToSmiles(parent_mol,
                                     isomericSmiles=True,
                                     kekuleSmiles=True)
    logger.info(f"Generating children from: {parent_smiles}")

    children = []  # finished children
    spawns = 0  # counter for children produced
    parent_selfies = encoder(parent_smiles)
    symbols = re.findall(r"[^[]*\[([^]]*)\]",
                         parent_selfies)  # get the SELFIES symbols into a list

    for i, symb in enumerate(symbols):
        if not (symb == "epsilon" or "Branch" in symb or "Ring" in symb):
            if symb in ALLOWED_SUBS:  # if we have rules for how to handle this symbol
                for replacement in ALLOWED_SUBS[symb]:
                    mut_symbols = symbols.copy()  # don't manipulate the original
                    mut_symbols[i] = replacement
                    child_symbols = [f"[{symb}]" for symb in mut_symbols]
                    child = "".join(child_symbols)
                    child_smiles = decoder(child)  # get the smiles
                    # test that smiles is valid
                    try:
                        # same as parent, have to have explicit aromaticity
                        child_mol = Chem.MolFromSmiles(child_smiles,
                                                       sanitize=True)
                        Chem.rdmolops.Kekulize(child_mol)
                        child_smiles = Chem.MolToSmiles(child_mol,
                                                        isomericSmiles=True,
                                                        kekuleSmiles=True)
                        assert child_mol  # if MolToSmiles fails, it will be a None
                        if child_smiles == parent_smiles:  # ignore this child if it's the same as the parent
                            continue
                    except Exception:  # pylint: disable=broad-except
                        logger.warning(
                            f"Produced improper SELFIES. Ignoring and trying again. Details below:"
                        )
                        logger.warning(f"Child SELFIES: {child}")
                        logger.warning(f"Parent SELFIES:{parent_selfies}")
                        logger.warning(f"Child SMILES: {child_smiles}")
                        logger.warning(f"Parent SMILES: {parent_smiles}")
                        continue
                    # Every good child deserves fudge
                    children.append(child_smiles)
                    spawns += 1  # update our counter

    return children
Example #13
def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
    for _, row in dataset.iterrows():
        res = encoder(row.smiles)
        if not res:
            continue
        res = res.replace("]", "] ").replace(".", "DOT ")
        sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
        for col, val in row.items():
            if isinstance(val, float):
                if val == 1.0:
                    sent.add_label(None, col.replace(" ", "_") + "_P ")
                if val == 0.0:
                    sent.add_label(None, col.replace(" ", "_") + "_N ")
        yield sent
Example #14
def test_encoder_attribution():
    smiles = "C1([O-])C=CC=C1Cl"
    indices = [0, 3, 3, 3, 5, 7, 8, 10, None, None, 12]
    s, am = sf.encoder(smiles, attribute=True)
    # check that Cl lined up
    for i, ta in enumerate(am):
        if ta[1]:
            assert indices[i] == ta[1][0][0], \
                f'found {ta[1]}; should be {indices[i]}'
        if ta[0] == '[Cl]':
            for _, v in ta[1]:
                if v == 'Cl':
                    return
    raise ValueError('Failed to find Cl in attribution map')
Example #15
def test_roundtrip_translation(test_path, dataset_samples):
    """Tests SMILES -> SELFIES -> SMILES translation on various datasets.
    """

    # very relaxed constraints
    constraints = sf.get_preset_constraints("hypervalent")
    constraints.update({"P": 7, "P-1": 8, "P+1": 6, "?": 12})
    sf.set_semantic_constraints(constraints)

    error_path = ERROR_LOG_DIR / "{}.csv".format(test_path.stem)
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")

    error_data = []
    error_found = False

    n_lines = sum(1 for _ in open(test_path)) - 1
    n_keep = dataset_samples if (0 < dataset_samples <= n_lines) else n_lines
    skip = random.sample(range(1, n_lines + 1), n_lines - n_keep)
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    for chunk in reader:

        for in_smiles in chunk["smiles"]:
            in_smiles = in_smiles.strip()

            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_data.append((in_smiles, ""))
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_data.append((in_smiles, out_smiles))

        with open(error_path, "a") as error_log:
            for entry in error_data:
                error_log.write(",".join(entry) + "\n")

        error_found = error_found or error_data
        error_data = []

    sf.set_semantic_constraints()  # restore constraints

    assert not error_found
Example #16
def smiles2string(smiles):
    string = None  # fall back to None for an unrecognized string_type

    if string_type == 'smiles':
        string = smiles

    if string_type == 'selfies':
        try:
            string = encoder(smiles, PrintErrorMessage=False)
        except Exception:
            return None

    if string_type == 'deepsmiles':
        try:
            string = converter.encode(smiles)
        except deepsmiles.DecodeError:
            return None

    return string
Example #17
def batch_mode(input_file, output_file, model_size):

    outfile = open(output_file, "w")

    with open(input_file, "r") as f:
        for i, line in enumerate(f):
            smiles_string = line.strip()
            canonical_smiles = subprocess.check_output([
                'java', '-cp', 'Java_dependencies/cdk-2.1.1.jar:.',
                'SMILEStoCanonicalSMILES', smiles_string
            ])
            iupac_name = translate(
                selfies.encoder(
                    canonical_smiles.decode('utf-8').strip()).replace(
                        "][", "] ["), model_size)
            outfile.write(
                iupac_name.replace(" ", "").replace("<end>", "") + "\n")

    outfile.close()

    return output_file
Example #18
def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to SELFIES.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        selfies: SELFIES string.
    """
    if mol is None:
        return None

    if isinstance(mol, Chem.rdchem.Mol):
        mol = to_smiles(mol)

    selfies = sf.encoder(mol)  # type: ignore

    if selfies == -1:
        return None

    return selfies
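A usage sketch (to_smiles is assumed to be the module's own Mol-to-SMILES helper referenced in the body):

from rdkit import Chem

print(to_selfies('CCO'))  # SELFIES for ethanol
print(to_selfies(Chem.MolFromSmiles('c1ccccc1')))  # also accepts an RDKit Mol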
Example #19
def roundtrip_translation():
    sf.set_semantic_constraints("hypervalent")

    n_entries = 0
    for chunk in make_reader():
        n_entries += len(chunk)
    pbar = tqdm(total=n_entries)

    reader = make_reader()
    error_log = open(ERROR_LOG_DIR / f"{TEST_SET_PATH.stem}.txt", "a+")

    curr_idx = 0
    for chunk_idx, chunk in enumerate(reader):
        for in_smiles in chunk[args.col_name]:
            pbar.update(1)
            curr_idx += 1
            if curr_idx < args.start_from:
                continue

            in_smiles = in_smiles.strip()

            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_log.write(in_smiles + "\n")
                tqdm.write(in_smiles)
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_log.write(in_smiles + "\n")
                tqdm.write(in_smiles)

    error_log.close()
Example #20
def create_tokenized_smiles_json(tokenizer, data_dir, split, config_output_name, max_length, label_filename, idx2selfies, selfies2idx):
    data = {"images" : []}
    start_token_id = tokenizer.encode('<start>').ids[0]
    end_token_id = tokenizer.encode('<end>').ids[0]
    pad_token_id = tokenizer.get_vocab_size()
    print("The <start> token id is:{}\nThe <end> token id is:{}\nThe <pad> token id is {}".format(start_token_id, end_token_id, pad_token_id)) 
    with open(os.path.join(data_dir, label_filename), "r") as f:
        for i, l in enumerate(tqdm(f)):
            try:
                smiles, idx = l.strip().split("\t")
                encoding = tokenizer.encode(smiles)
                selfies = sf.encoder(smiles)
                if selfies is None:
                    selfies = ''
                selfies_cap = [selfies2idx['[start]']] + [selfies2idx[i] for i in sf.split_selfies(selfies)]
                if len(selfies_cap) > max_length-1:
                    selfies_cap = selfies_cap[:max_length-1]
                selfies_cap = selfies_cap + [selfies2idx['[end]']]
                selfies_len = len(selfies_cap)
                selfies_len_orig = selfies_len
                while selfies_len < max_length:
                    selfies_cap = selfies_cap + [selfies2idx['[pad]']]
                    selfies_len += 1
                encodingids = encoding.ids
                encodingids = [start_token_id] + encodingids[:-1] #add <start> token and shorten to 150
                cap_len = max_length
                for j in range(0, len(encodingids)):
                    if encodingids[j] == pad_token_id:
                        cap_len = j+1
                        encodingids[j] = end_token_id
                        break
                current_sample = {"filepath": data_dir, "filename": "{}".format(idx), "imgid": 0, "split": split, "sentences" : [{"tokens": encoding.tokens, "raw": smiles, "ids": encodingids , "length": cap_len, "selfies_raw": selfies, "selfies_ids": selfies_cap, "selfies_length":selfies_len_orig }] } # note if image augmentation ever happens need to introduce a sentence id token. see mscoco json for example
                data["images"].append(current_sample)
            except Exception:
                pass
    pickle.dump(data, open(os.path.join(data_dir, config_output_name),'wb'))
    del data
Example #21
    def __getitem__(self, idx):
        # Returns tuple
        # Smiles has to be in first column of the csv !!

        row = self.df.iloc[idx, :]

        smiles = row.smiles  # needed anyway to build graph
        m = Chem.MolFromSmiles(smiles)

        if self.compute_selfies:
            Chem.Kekulize(m)
            k = Chem.MolToSmiles(m, isomericSmiles=False,
                                 kekuleSmiles=True)  # kekuleSmiles
            selfie = encoder(k)
            """
            if selfie != row.selfies:
                print('new selfie:', selfie)
                print('prev : ', row.selfies)
            """

        else:
            selfie = row.selfies

        # 1 - Graph building
        if m is not None:
            graph = smiles_to_nx(smiles)
        else:
            return None, 0, 0, 0

        one_hot = {
            edge: torch.tensor(self.edge_map[label])
            for edge, label in (
                nx.get_edge_attributes(graph, 'bond_type')).items()
        }
        nx.set_edge_attributes(graph, name='one_hot', values=one_hot)

        try:
            at_type = {
                a: oh_tensor(self.at_map[label], self.num_atom_types)
                for a, label in (
                    nx.get_node_attributes(graph, 'atomic_num')).items()
            }
            nx.set_node_attributes(graph, name='atomic_num', values=at_type)
        except KeyError:
            print('!!!! Atom type to one-hot error for input ', smiles,
                  ' ignored')
            return None, 0, 0, 0

        at_charge = {
            a: oh_tensor(self.charges_map[label], self.num_charges)
            for a, label in (
                nx.get_node_attributes(graph, 'formal_charge')).items()
        }
        nx.set_node_attributes(graph, name='formal_charge', values=at_charge)

        try:
            hydrogens = {
                a: torch.tensor(self.chi_map[label], dtype=torch.float)
                for a, label in (
                    nx.get_node_attributes(graph, 'num_explicit_hs')).items()
            }
            nx.set_node_attributes(graph,
                                   name='num_explicit_hs',
                                   values=hydrogens)
        except KeyError:
            print(
                '!!!! Number of explicit hydrogens to one-hot error for input ',
                smiles, ' ignored')
            return None, 0, 0, 0

        aromatic = {
            a: torch.tensor(self.chi_map[label], dtype=torch.float)
            for a, label in (
                nx.get_node_attributes(graph, 'is_aromatic')).items()
        }
        nx.set_node_attributes(graph, name='is_aromatic', values=aromatic)

        at_chir = {
            a: torch.tensor(self.chi_map[label], dtype=torch.float)
            for a, label in (
                nx.get_node_attributes(graph, 'chiral_tag')).items()
        }
        nx.set_node_attributes(graph, name='chiral_tag', values=at_chir)

        # to dgl
        g_dgl = dgl.DGLGraph()
        node_features = [
            'atomic_num', 'formal_charge', 'num_explicit_hs', 'is_aromatic',
            'chiral_tag'
        ]
        g_dgl.from_networkx(nx_graph=graph,
                            node_attrs=node_features,
                            edge_attrs=['one_hot'])

        N = g_dgl.number_of_nodes()

        g_dgl.ndata['h'] = torch.cat(
            [g_dgl.ndata[f].view(N, -1) for f in node_features], dim=1)

        if self.graph_only:  # give only the graph (to encode in latent space)
            return g_dgl, 0, 0, 0

        # 2 - Smiles / selfies to integer indices array

        if self.language == 'selfies':

            a, valid_flag = self.selfies_to_hot(selfie)
            if valid_flag == 0:  # no one hot encoding for this selfie, ignore
                print('!!! Selfie to one-hot failed with current alphabet')
                return None, 0, 0, 0

        else:
            a = np.zeros(self.max_len)
            idces = [self.char_to_index[c] for c in smiles]
            a[:len(idces)] = idces

        # 3 - Optional props and affinities

        props, targets = 0, 0
        if len(self.props) > 0:
            props = np.array(row[self.props], dtype=np.float32)

        if len(self.targets) > 0 and self.binned_scores:
            targets = np.array(row[self.targets],
                               dtype=np.int64)  # for torch.long class labels
        elif len(self.targets) > 0:
            targets = np.array(row[self.targets],
                               dtype=np.float32)  # for torch.float values

        targets = np.nan_to_num(targets)  # if nan somewhere, change to 0.

        return g_dgl, a, props, targets
Example #22
    def test_selfies_split(self) -> None:
        """Test tokenization by selfies package has not changed."""
        benzene = 'c1ccccc1'
        encoded_selfies = sf.encoder(benzene)
        # '[c][c][c][c][c][c][Ring1][Branch1_1]' v0.2.4
        # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' v1.0.2 (no aromatic)

        # sf.split_selfies returns generator
        symbols_benzene = list(sf.split_selfies(encoded_selfies))
        # before selfies 2.0.0 the last token is [Branch1_2]
        self.assertListEqual(
            symbols_benzene,
            ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'],
        )

        for smiles, ground_truth in [
            (
                'c1cnoc1',
                # before selfies 2.0.0 the 2 last token are [Expl=Ring1] and [Branch1_1]
                ['[C]', '[C]', '[=N]', '[O]', '[C]', '[=Ring1]', '[Branch1]'],
            ),
            (
                '[O-][n+]1ccccc1S',
                # before selfies 2.0.0 it is: [O-expl], [N+expl] and [=Branch1_2]
                [
                    '[O-1]',
                    '[N+1]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[Ring1]',
                    '[=Branch1]',
                    '[S]',
                ],
            ),
            (
                'c1snnc1-c1ccccn1',
                # before selfies 2.0.0 it is: [Expl=Ring1], [Branch1_1] and [Branch1_2]
                [
                    '[C]',
                    '[S]',
                    '[N]',
                    '[=N]',
                    '[C]',
                    '[=Ring1]',
                    '[Branch1]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=N]',
                    '[Ring1]',
                    '[=Branch1]',
                ],
            ),
        ]:
            self.assertListEqual(
                # list-wrapped, since sf.split_selfies returns a generator
                list(sf.split_selfies(sf.encoder(smiles))),
                ground_truth,
            )
Example #23
'''
Written by Jan H. Jensen 2019
'''

from selfies import encoder, decoder
import pandas as pd
from rdkit import Chem

pd.set_option('max_colwidth', 200)
df = pd.read_csv('ZINC_250k.smi', sep=" ", header=None)
df.columns = ["smiles"]

rows = 1000
symbols_list = []
for index, row in df.iterrows():
    smiles = row['smiles']
    mol = Chem.MolFromSmiles(smiles)
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol)
    symbols = encoder(smiles).split('][')
    for symbol in symbols:
        symbol = symbol.replace(']', '').replace('[', '')
        if symbol not in symbols_list:
            symbols_list.append(symbol)

print(symbols_list)
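Newer selfies releases can build the alphabet directly; a sketch of the same extraction, assuming selfies >= 1.0 where get_alphabet_from_selfies returns a set of bracketed symbols:

import selfies as sf

selfies_strings = [sf.encoder(s) for s in df['smiles'].head(rows)]
alphabet = sf.get_alphabet_from_selfies(selfies_strings)
print(sorted(alphabet))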
Example #24
def selfies_substitution(*,
                         parent_smiles: str,
                         n_children: int = 100,
                         mut_rate: float = 0.03,
                         mut_min: int = 1,
                         mut_max: int = 2,
                         max_trials: int = 100
                         ):

    """
    This function takes a parent molecule, and generates a number of children derived from it, via converting
    the parent into SELFIES format and substituting symbols for other symbols, based on primitive chemical rules.
    :param parent_smiles: The smiles of the parent molecule.
    :param n_children: How many children to produce.
    :param mut_rate: How frequent should mutations be? 0.0 to 1.0
    :param mut_min: What is the min number of mutations to allow in a given child, relative to the parent?
    :param mut_max: Same as above but the max.
    :param max_trials: number of attempts to create valid mutations before moving on; useful for pathological SELFIES.
    :return: a list of child SMILES strings (empty if no valid children could be produced).
    """

    # get the parent mol set up properly with defined aromaticity
    parent_mol = Chem.MolFromSmiles(parent_smiles, sanitize=True)
    Chem.rdmolops.Kekulize(parent_mol)
    parent_smiles = Chem.MolToSmiles(parent_mol, isomericSmiles=True, kekuleSmiles=True)
    logger.info(f"Generating children from: {parent_smiles}")

    children = []  # finished children
    spawns = 0  # counter for children produced
    parent_selfies = encoder(parent_smiles)
    symbols = re.findall(r"[^[]*\[([^]]*)\]", parent_selfies)   # get the SELFIES symbols into a list

    while spawns < n_children:  # try to produce the correct number of children
        muts = 0
        mutations = []  # which parts of the SELFIES to remove
        mut_symbols = symbols.copy()  # don't manipulate the original
        mut_positions = list(range(len(symbols)))  # need the index

        t = 0
        while (muts < mut_min) and (t <= max_trials):
            random.shuffle(mut_positions)  # shuffle the order so that mutations will be random
            for pos in mut_positions:  # try to mutate
                if pos not in mutations:
                    if random.random() <= mut_rate:
                        # ignore special symbols, leave them alone
                        if not (symbols[pos] == "epsilon" or "Branch" in symbols[pos] or "Ring" in symbols[pos]):
                            if symbols[pos] in ALLOWED_SUBS:  # if we have rules for how to handle this symbol
                                mutations.append(pos)  # record which symbol this is
                                muts += 1  # record the intention to mutate
                                if muts == mut_max:
                                    # when we're done, stop looking
                                    break
            t += 1
        if t > max_trials:
            logger.warning(f"Failed to produce any selfies after {max_trials} trials. Returning empty list.")
            return list()

        # for each planned mutation, actually execute it, on the non-shuffled original SELFIES list
        for index in sorted(mutations, reverse=True):
            mut_symbols[index] = random.choice(ALLOWED_SUBS[mut_symbols[index]])

        # convert the new child into a SELFIES (rather than a list)
        child_symbols = [f"[{symb}]" for symb in mut_symbols]
        child = "".join(child_symbols)
        child_smiles = decoder(child)  # get the smiles

        # test that smiles is valid
        try:
            # same as parent, have to have explicit aromaticity
            child_mol = Chem.MolFromSmiles(child_smiles, sanitize=True)
            Chem.rdmolops.Kekulize(child_mol)
            child_smiles = Chem.MolToSmiles(child_mol, isomericSmiles=True, kekuleSmiles=True)
            assert child_mol  # if MolToSmiles fails, it will be a None
            if child_smiles == parent_smiles:  # ignore this child if it's the same as the parent
                continue
        except Exception:  # pylint: disable=broad-except
            logger.warning(f"Produced improper SELFIES. Ignoring and trying again. Details below:")
            logger.warning(f"Child SELFIES: {child}")
            logger.warning(f"Parent SELFIES:{parent_selfies}")
            logger.warning(f"Child SMILES: {child_smiles}")
            logger.warning(f"Parent SMILES: {parent_smiles}")
            continue
        # Every good child deserves fudge
        children.append(child_smiles)
        spawns += 1  # update our counter

    return children
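A usage sketch (phenol as a hypothetical parent; ALLOWED_SUBS and logger are assumed to be defined at module level, as the function body requires):

children = selfies_substitution(parent_smiles='Oc1ccccc1',
                                n_children=10,
                                mut_rate=0.05)
print(len(children), 'children generated')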
Example #25
    num_iterations = 1

    results_dir = du.make_clean_results_dir()
    total_time = time.time()
    for i in range(num_iterations):
        for beta in beta_params:
            
            max_fitness_collector = []
            image_dir, saved_models_dir, data_dir = du.make_clean_directories(beta, results_dir, i)
            
            torch.cuda.empty_cache()
            writer = SummaryWriter()   

            smiles_all_counter = train( num_generations            = 100,
                                        generation_size            = 500,
                                        starting_selfies           = [encoder('C')],
                                        max_molecules_len          = 81,
                                        disc_epochs_per_generation = 10,
                                        disc_enc_type              = 'properties_rdkit',
                                        disc_layers                = [100, 10],
                                        training_start_gen         = 0,                         
                                        device                     = 'cuda',
                                        properties_calc_ls         = ['logP', 'SAS', 'RingP', 'QED'], 
                                        num_processors             = multiprocessing.cpu_count(),
                                        beta                       = beta,
                                        max_fitness_collector      = max_fitness_collector,
                                        impose_time_adapted_pen    = True
                                        )
            
    print('Total time: ', (time.time()-total_time)/60, ' mins')
Example #26
                'VAE_dependencies/Saved_models/VAE_decode_epoch_{}'.format(
                    settings['training_VAE']['num_epochs']))
            #plot epoch vs reconstruction loss / quality
            print(recons_quality_valid, recons_quality_train, recons_loss)
            if settings['plot']['plot_quality']:
                line1, = plt.plot(recons_quality_valid, label='Validation set')
                line2, = plt.plot(recons_quality_train, label='Training set')
                plt.xlabel('Epochs')
                plt.ylabel('Reconstruction Quality (%)')
                plt.legend(handles=[line1, line2])
                plt.show()
            if settings['plot']['plot_loss']:
                plt.plot(recons_loss)
                plt.xlabel('Epochs')
                plt.ylabel('Reconstruction Loss')
                plt.show()
        else:
            print('Linear interpolation of 10 steps between ' + test_mol1 +
                  ' and ' + test_mol2 + ':')
            mol_inter = linear_interpolation(encoder(test_mol1),
                                             encoder(test_mol2), 10)
            print(mol_inter)

        with open('COMPLETED', 'w') as content:
            content.write('exit code: 0')

    except AttributeError:
        _, error_message, _ = sys.exc_info()
        print(error_message)
Example #27
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """

    # modify semantic bond constraints
    sf.set_semantic_constraints({
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    })

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir,
                              'error_sets',
                              "errors_{}.csv".format(test_name))

    # create error directory
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    error_list = []

    # add header in error log text file
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_found_flag = False

    # make pandas reader
    N = sum(1 for _ in open(test_path)) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path,
                         chunksize=10000,
                         header=0,
                         skiprows=skip)

    # roundtrip testing
    for chunk in reader:
        for in_smiles in chunk[column_name]:
            # skip any SMILES that RDKit cannot parse, and any
            # wildcard-containing SMILES: all inputs must be valid
            # RDKit Mol objects to be encoded
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode to a SELFIES string
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # decode the encoded SELFIES
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SMILES.
            # if they do not match, record the pair in the error list.
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, str(out_smiles)))

        # open and write all errors to errors_{test_name}.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

    sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag
Example #28
def test_malformed_smiles_encoder():
    """Tests selfies.encoder() terminates on a malformed SMILES."""
    sf.encoder("C(Cl)(Cl)CC[13C")
    assert True
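Under selfies 2.x the same input raises instead of returning silently; a sketch of the equivalent check, mirroring the pytest.raises pattern of Example #11:

import pytest
import selfies as sf

def test_malformed_smiles_encoder_v2():
    with pytest.raises(sf.EncoderError):
        sf.encoder("C(Cl)(Cl)CC[13C")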
Example #29
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """

    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    # check if a previous checkpoint exists to continue tests
    if os.path.exists(ckpt_path):
        with open(ckpt_path, 'r') as ckpt_file:
            checkpoint = int(ckpt_file.readlines()[0])

    # if no path to a checkpoint exists,
    # create a new directory for error logging and checkpoints
    else:
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        os.makedirs(os.path.dirname(error_path), exist_ok=True)

        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        checkpoint = -1

    error_list = []
    error_found_flag = False

    # make pandas reader
    reader = pd.read_csv(EMOL_PATH,
                         chunksize=10000,
                         compression='gzip',
                         delimiter=' ',
                         header=0)

    # roundtrip testing
    for chunk_idx, chunk in enumerate(reader):

        if chunk_idx <= checkpoint:
            continue

        for in_smiles in chunk[COL_NAME]:

            # skip any SMILES that RDKit cannot parse, and any
            # wildcard-containing SMILES: all inputs must be valid
            # RDKit Mol objects to be encoded
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode to a SELFIES string
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # decode the encoded SELFIES
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SMILES.
            # if they do not match, record the pair in the error list.
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_emolecules.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

        # create checkpoint from the current pandas reader chunk,
        # to load from and continue testing.
        with open(ckpt_path, 'w+') as ckpt_file:
            ckpt_file.write(str(chunk_idx))

    sf.set_semantic_constraints()  # restore defaults
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
Example #30
def main(args):
    with open(args.input_file, 'r') as f:
        with open(args.output_file, 'w') as w:
            for l in f:
                l = l.strip()
                w.write('{}\n'.format(sf.encoder(l)))
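A sketch of the argparse driver this main() implies (the flag names are assumptions):

import argparse

import selfies as sf

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', required=True)
    parser.add_argument('--output_file', required=True)
    main(parser.parse_args())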