Example #1
    def process(self):
        data1 = np.load(self.raw_paths[0])
        data2 = np.load(self.raw_paths[1])
        data1_feed_dict = {
            'E': torch.as_tensor(data1['E']),
            'N': torch.as_tensor(data1['N']),
            'R': torch.as_tensor(data1['R_qm'] if self.qm else data1['R_mmff']),
            'D': torch.as_tensor(data1['D_qm'] if self.qm else data1['D_mmff']),
            'Q': torch.as_tensor(data1['Q']),
            'Z': torch.as_tensor(data1['Z'])
        }
        data2_feed_dict = {
            'E': torch.as_tensor(data2['E']),
            'N': torch.as_tensor(data2['N']),
            'R': torch.as_tensor(data2['R_qm'] if self.qm else data2['R_mmff']),
            'D': torch.as_tensor(data2['D_qm'] if self.qm else data2['D_mmff']),
            'Q': torch.as_tensor(data2['Q']),
            'Z': torch.as_tensor(data2['Z'])
        }

        data1_size = data1['E'].shape[0]
        data2_size = data2['E'].shape[0]

        if not self.sep_heavy_atom:
            data_size = data1_size + data2_size
        else:
            in_part1 = (self.num_heavy_atom < 14)
            heavy_atom_data = pd.read_csv(self.raw_paths[2] if in_part1 else self.raw_paths[3])
            num_heavy_atom = torch.as_tensor(heavy_atom_data['numberHA']).long()
            atom_mask = (num_heavy_atom == self.num_heavy_atom)
            atom_mask = atom_mask.view(-1)
            data_dict_used = data1_feed_dict if in_part1 else data2_feed_dict
            for key in data_dict_used.keys():
                data_dict_used[key] = data_dict_used[key][atom_mask]
            '''
            Trick: point data1_feed_dict at the filtered dict so the loop below
            only iterates over data_dict_used.
            '''
            data_size = data_dict_used['E'].shape[0]
            data1_feed_dict = data_dict_used

        data_array = np.empty(data_size, dtype=Data)

        for i in tqdm(range(data_size)):

            data_index = i if i < data1_size else i - data1_size

            if i < data1_size:
                tmp_data = _get_ith_data(data_index, **data1_feed_dict)
            else:
                tmp_data = _get_ith_data(data_index, **data2_feed_dict)
            tmp_data = self.pre_transform(tmp_data, edge_version='cutoff', do_sort_edge=True, cal_efg=False,
                                          cutoff=self.cutoff, boundary_factor=None, use_center=None,
                                          mol=AddHs(MolFromSmiles('C')),
                                          cal_3body_term=self.cal_3body_term, bond_atom_sep=self.bond_atom_sep,
                                          record_long_range=self.record_long_range)
            data_array[i] = tmp_data

        data_list = [data_array[i] for i in range(data_size)]
        print('collating...')
        data1, slices = self.collate(data_list)
        print('saving...')
        torch.save((data1, slices), self.processed_paths[0])
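# _get_ith_data() is called above but not shown in this snippet. A minimal sketch,
# assuming every value in the feed dict is indexed per molecule along its first
# dimension and that torch_geometric's Data class is in scope (both assumptions; the
# real helper may slice atom-level arrays differently):
def _get_ith_data(i, **tensors):
    # Select the i-th entry of every per-molecule tensor and wrap it in a Data object.
    return Data(**{key: value[i] for key, value in tensors.items()})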
Example #2
def get_mol_objects(SMILES):
    if isinstance(SMILES, list):
        return [MolFromSmiles(smi) for smi in SMILES]
    if isinstance(SMILES, str):
        return MolFromSmiles(SMILES)
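# Illustrative usage only (the SMILES strings are examples, not from the original code):
# a single SMILES returns one Mol object, a list returns a list of Mol objects.
single_mol = get_mol_objects("CCO")
mol_list = get_mol_objects(["CCO", "c1ccccc1"])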
Example #3
def structure_standardization(smi: str) -> str:
    """
    Standardization function to clean up smiles with RDKit. First, the input smiles is converted into a mol object.
    Not-readable SMILES are written to the log file. The molecule size is checked by the number of atoms (non-hydrogen).
    If the molecule has more than 100 non-hydrogen atoms, the compound is discarded and written in the log file.
    Molecules with number of non-hydrogen atoms <= 100 are standardized with the MolVS toolkit
    (https://molvs.readthedocs.io/en/latest/index.html) relying on RDKit. Molecules which failed the standardization
    process are saved in the log file. The remaining standardized structures are converted back into their canonical
    SMILES format.
    :param smi: Input SMILES from the given structure data file T4
    :return: smi_clean: Cleaned and standardized canonical SMILES of the given input SMILES.


    Args:
        smi (str): Non-standardized smiles string

    Returns:
        str: standardized smiles string
    """

    # tautomer.TAUTOMER_TRANSFORMS = update_tautomer_rules()
    # importlib.reload(MolVS_standardizer)
    # param = ReadConfig()
    standardization_param = ConfigDict.get_parameters()["standardization"]

    max_num_atoms = standardization_param["max_num_atoms"]
    max_num_tautomers = standardization_param["max_num_tautomers"]
    include_stereoinfo = standardization_param["include_stereoinfo"]

    ## Load new tautomer enumerator/canonicalizer
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(max_num_tautomers)
    tautomerizer.SetRemoveSp3Stereo(
        False)  # Keep stereo information of keto/enol tautomerization

    def isotope_parent(mol: Chem.Mol) -> Chem.Mol:
        """
        Isotope parent from MOLVS
        Return the isotope parent of a given molecule.
        The isotope parent has all atoms replaced with the most abundant isotope for that element.
        Args:
            mol (Chem.Mol): input rdkit mol object

        Returns:
            Chem.Mol: isotope parent rdkit mol object
        """
        mol = copy.deepcopy(mol)
        # Replace isotopes with common weight
        for atom in mol.GetAtoms():
            atom.SetIsotope(0)
        return mol

    def my_standardizer(mol: Chem.Mol) -> Chem.Mol:
        """
        MolVS implementation of standardization

        Args:
            mol (Chem.Mol): non-standardized rdkit mol object

        Returns:
            Chem.Mol: standardized rdkit mol object
        """
        mol = copy.deepcopy(mol)
        Chem.SanitizeMol(mol)
        mol = Chem.RemoveHs(mol)
        disconnector = rdMolStandardize.MetalDisconnector()
        mol = disconnector.Disconnect(mol)
        normalizer = rdMolStandardize.Normalizer()
        mol = normalizer.normalize(mol)
        reionizer = rdMolStandardize.Reionizer()
        mol = reionizer.reionize(mol)
        Chem.AssignStereochemistry(mol, force=True, cleanIt=True)
        # TODO: Check this removes symmetric stereocenters
        return mol

    mol = MolFromSmiles(smi)  # Read SMILES and convert it to an RDKit mol object.
    if mol is not None:  # Check whether the input SMILES was converted into a mol object.
        if mol.GetNumAtoms() <= max_num_atoms:  # Check molecule size by the non-hydrogen atom count.
            try:

                mol = rdMolStandardize.ChargeParent(
                    mol)  # standardize molecules using MolVS and RDKit
                mol = isotope_parent(mol)
                if include_stereoinfo is False:
                    Chem.RemoveStereochemistry(mol)
                mol = tautomerizer.Canonicalize(mol)
                mol_clean = my_standardizer(mol)
                smi_clean = MolToSmiles(mol_clean)  # convert mol object back to SMILES
            except (ValueError, AttributeError) as e:
                smi_clean = np.nan
                logging.error(
                    "Standardization error, " + smi + ", Error Type: " + str(e)
                )  # write failed molecules during standardization to log file

        else:
            smi_clean = np.nan
            logging.error("Molecule too large, " + smi)

    else:
        smi_clean = np.nan
        logging.error("Reading Error, " + smi)

    return smi_clean
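# Illustrative (assumed) usage of structure_standardization(): clean a small list of
# SMILES and keep only the entries that standardized successfully (failures return
# np.nan). Assumes the "standardization" parameters are available via ConfigDict as in
# the function above; the input strings are examples only.
example_smiles = ["CC(=O)Oc1ccccc1C(=O)O", "this-is-not-a-smiles"]
cleaned = [structure_standardization(smi) for smi in example_smiles]
cleaned = [smi for smi in cleaned if isinstance(smi, str)]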
Example #4
    xmin = next_x(lb, ub, 5, 60)
    valid_smiles = []
    scores = []
    for x_new in xmin:
        #model = DGLJTNNVAE(vocab, hidden_size, latent_size, depth)
        #model.load_state_dict(torch.load(opts.model_path))
        #model = cuda(model)
        tree_vec, mol_vec = x_new.chunk(2, 1)
        print(x_new.shape, tree_vec.shape, mol_vec.shape)
        print(x_new)
        s = model.decode(tree_vec, mol_vec)
        if s is not None:
            valid_smiles.append(s)

            current_log_P_value = Descriptors.MolLogP(MolFromSmiles(s))
            current_SA_score = -sascorer.calculateScore(MolFromSmiles(s))
            cycle_list = nx.cycle_basis(
                nx.Graph(rdmolops.GetAdjacencyMatrix(MolFromSmiles(s))))
            if len(cycle_list) == 0:
                cycle_length = 0
            else:
                cycle_length = max([len(j) for j in cycle_list])
            if cycle_length <= 6:
                cycle_length = 0
            else:
                cycle_length = cycle_length - 6

            current_cycle_score = -cycle_length

            current_SA_score_normalized = (
Example #5
import csv
from rdkit.Chem import MolFromSmiles, Draw

labels = []
smiles = []
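# Assumed layout of smiles.csv (illustrative; no header row, one "name,SMILES" pair per line):
#   aspirin,CC(=O)Oc1ccccc1C(=O)O
#   benzene,c1ccccc1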

with open("smiles.csv", "r") as f:
    content = csv.reader(f)
    for row in content:
        name, smile = row
        labels.append(name)
        smiles.append(MolFromSmiles(smile))

img = Draw.MolsToGridImage(smiles,
                           molsPerRow=2,
                           subImgSize=(300, 300),
                           legends=labels)
img.save("happiness.png")
Example #6
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """

    # modify constraints
    constraints = sf.get_hypervalent_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    # check if a previous checkpoint exists to continue tests
    if os.path.exists(ckpt_path):
        with open(ckpt_path, 'r') as ckpt_file:
            checkpoint = int(ckpt_file.readlines()[0])

    # if no path to a checkpoint exists,
    # create a new directory for error logging and checkpoints
    else:
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        os.makedirs(os.path.dirname(error_path), exist_ok=True)

        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        checkpoint = -1

    error_list = []
    error_found_flag = False

    # make pandas reader
    reader = pd.read_csv(EMOL_PATH,
                         chunksize=10000,
                         compression='gzip',
                         delimiter=' ',
                         header=0)

    # roundtrip testing
    for chunk_idx, chunk in enumerate(reader):

        if chunk_idx <= checkpoint:
            continue

        for in_smiles in chunk[COL_NAME]:

            # check if SMILES in chunk is a valid RDKit molecule.
            # if not, skip testing
            # All inputted SMILES must be valid
            # RDKit Mol objects to be encoded.
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode selfies
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take the encoded SELFIES and decode it
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SMILES string.
            # if they do not represent the same molecule, write to the list of errors.
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_emolecule.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

        # create checkpoint from the current pandas reader chunk,
        # to load from and continue testing.
        with open(ckpt_path, 'w+') as ckpt_file:
            ckpt_file.write(str(chunk_idx))

    sf.set_semantic_constraints()  # restore defaults
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
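# is_same_mol() is used in the test above but not defined in this snippet. A minimal
# sketch, assuming it compares the two inputs by canonical SMILES (an assumption, not
# the original implementation) and that MolToSmiles is imported alongside MolFromSmiles:
def is_same_mol(smiles1, smiles2):
    mol1, mol2 = MolFromSmiles(smiles1), MolFromSmiles(smiles2)
    if mol1 is None or mol2 is None:
        # Treat SMILES that RDKit cannot parse as a mismatch.
        return False
    return MolToSmiles(mol1) == MolToSmiles(mol2)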
Example #7
def scorer(smiles, pIC50_weight, QED_weight, logP_weight, SA_weight, cycle_weight, sim_weight):
    smiles_rdkit = []
    for i in range(len(smiles)):
        smiles_rdkit.append(
            MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

    # calculate IC50 of training set using MPNN
    #IC50_scores=calculateScore(smiles_rdkit)

    # read in IC50 of training set from database
    IC50_scores = np.loadtxt('../data/covid/ic50-fulltrain.txt')
    IC50_scores = [x for x in IC50_scores]
    IC50_scores_normalized = (np.array(IC50_scores) - np.mean(IC50_scores)) / np.std(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match similarity
        df_100 = pd.read_csv('../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

        sim_values = []
        for i in range(len(smiles)):
            sim_values.append(
                similarity_search(fps_db, smiles_rdkit[i]))
        sim_values_normalized = (
            np.array(sim_values) - np.mean(sim_values)) / np.std(sim_values)
    else:
        sim_values, sim_values_normalized = [], []
        for i in range(len(smiles)):
            sim_values.append(0)
            sim_values_normalized.append(0)
        sim_values_normalized = np.array(sim_values_normalized)
    
    logP_values = []
    for i in range(len(smiles)):
        logP_values.append(
            Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

    qed_values = []
    for i in range(len(smiles)):
        qed_values.append(
            QED.qed(MolFromSmiles(smiles_rdkit[i])))

    SA_scores = []
    for i in range(len(smiles)):
        SA_scores.append(
            -sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

    cycle_scores = []
    for i in range(len(smiles)):
        cycle_list = nx.cycle_basis(
            nx.Graph(
                rdmolops.GetAdjacencyMatrix(MolFromSmiles(smiles_rdkit[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6
        cycle_scores.append(-cycle_length)

    SA_scores_normalized = (
        np.array(SA_scores) - np.mean(SA_scores)) / np.std(SA_scores)
    qed_values_normalized = (
        np.array(qed_values) - np.mean(qed_values)) / np.std(qed_values)
    cycle_scores_normalized = (
        np.array(cycle_scores) - np.mean(cycle_scores)) / np.std(cycle_scores)
    logP_values_normalized = (
        np.array(logP_values) - np.mean(logP_values)) / np.std(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized + 
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized + 
               sim_weight * sim_values_normalized)
   
    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores, sim_values, targets)
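# The cycle term above follows the usual penalized-logP convention: only rings larger
# than six atoms are penalized, by their excess size. A standalone sketch of that term
# (the helper name is ours, not from the original code); it reuses the networkx (nx)
# and rdmolops imports assumed by scorer().
def cycle_penalty(mol):
    cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    cycle_length = max((len(ring) for ring in cycle_list), default=0)
    return -max(cycle_length - 6, 0)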
Example #8
            print('This mol fails! ' + MolToSmiles(mol))
            continue
        products = rxn.RunReactants((Chem.AddHs(mol), ))
        if products != ():
            for prod in products:
                prod1_list.append(prod[0])
                prod2_list.append(prod[1])
    return prod1_list, prod2_list


df_orig = pd.read_csv('../data/acry_activity.smi')
# df_actives = df_orig[df_orig['activity']==1]
# print('Number of acry actives: {}'.format(len(df_actives)))
smiles_list = df_orig['SMILES'].values
smiles_list = list(
    set([MolToSmiles(MolFromSmiles(smi)) for smi in smiles_list]))
# print(smiles_list)
# print([MolFromSmiles(smi) for smi in smiles_list])
# mol_list = [MolFromSmiles(MolToSmiles(MolFromSmiles(smi))) for smi in smiles_list]
# mol_list = [mol for mol in mol_list if mol]
# print(mol_list)
# print(len(list(set([MolToSmiles(mol) for mol in mol_list]))))
mols = [MolFromSmiles(smi) for smi in smiles_list]
#print('Size of actives: {}'.format(len(canonicalize(mols))))
print('Size of original dataset: {}'.format(len(canonicalize(mols))))
acry_slice = AllChem.ReactionFromSmarts(
    '[c,C:1][C](=[O])[N]([c,C,#1:2])[C]([c,C,#1:3])([c,C,#1:4])[C](=[O])[N]([#1])[c,C:5]>>[*:1][C](=[O])[O][#1].[*:2][N]([#1])[#1].[*:3][C](=[O])[*:4].[*:5][N+]#[C-]'
)
acry_comb = AllChem.ReactionFromSmarts(
    '[c,C:1][C](=[O])[O][#1].[c,C:2][N]([#1])[#1].[c,C,#1:3][C](=[O])[c,C,#1:4].[c,C:5][N+]#[C-]>>[*:1][C](=[O])[N]([*:2])[C]([*:3])([*:4])[C](=[O])[N]([#1])[*:5]'
)
Example #9
def canonicalize_smiles(smiles, isomeric=True, sanitize=True):
    try:
        mol = MolFromSmiles(smiles, sanitize=sanitize)
        return MolToSmiles(mol, isomericSmiles=isomeric)
    except Exception:
        pass
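# Illustrative usage (inputs are examples only): SMILES that RDKit cannot parse fall
# through to the implicit None return, so callers should filter the results.
maybe_canonical = [canonicalize_smiles(smi) for smi in ["C1=CC=CC=C1", "not-a-smiles"]]
canonical = [smi for smi in maybe_canonical if smi is not None]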
Example #10
    def test_topological_fprint_min_path_lesser_than_atoms(self):
        atomic_mols = [
            MolFromSmiles(smiles) for smiles in ['C', 'O', 'N', 'P']
        ]
        diatomic_mols = [
            MolFromSmiles(smiles) for smiles in ['CC', 'CO', 'CN', 'CP']
        ]
        triatomic_mols = [
            MolFromSmiles(smiles) for smiles in ['CCC', 'COO', 'CCN', 'CCP']
        ]
        min_path = 1
        for mol in atomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
        for diatomic_mol in diatomic_mols:
            descriptor = Descriptor()
            try:
                descriptor.make_fingerprint(
                    molecule_graph=diatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
            except InvalidConfigurationError:
                self.fail("Did not expect Descriptor to raise "
                          "InvalidConfigurationError")
        for triatomic_mol in triatomic_mols:
            descriptor = Descriptor()
            try:
                descriptor.make_fingerprint(
                    molecule_graph=triatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
            except InvalidConfigurationError:
                self.fail("Did not expect Descriptor to raise "
                          "InvalidConfigurationError")

        min_path = 2
        for mol in atomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
        for diatomic_mol in diatomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=diatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
        for triatomic_mol in triatomic_mols:
            descriptor = Descriptor()
            try:
                descriptor.make_fingerprint(
                    molecule_graph=triatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
            except InvalidConfigurationError:
                self.fail("Did not expect Descriptor to raise "
                          "InvalidConfigurationError")

        min_path = 3
        for mol in atomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
        for diatomic_mol in diatomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=diatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
        for triatomic_mol in triatomic_mols:
            with self.assertRaises(InvalidConfigurationError):
                descriptor = Descriptor()
                descriptor.make_fingerprint(
                    molecule_graph=triatomic_mol,
                    fingerprint_type='topological_fingerprint',
                    fingerprint_params={'min_path': min_path})
Example #11
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """

    # modify semantic bond constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    sf.set_semantic_constraints(constraints)

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir, 'error_sets',
                              "errors_{}.csv".format(test_name))

    # create error directory
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    error_list = []

    # add header in error log text file
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_found_flag = False

    # make pandas reader
    N = sum(1 for _ in open(test_path)) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    # roundtrip testing
    for chunk in reader:
        for in_smiles in chunk[column_name]:
            # check if SMILES in chunk is a valid RDKit molecule.
            # if not, skip testing
            # All inputted SMILES must be valid
            # RDKit Mol objects to be encoded.
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode SELFIE string
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take the encoded SELFIES and decode it
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SMILES string.
            # if they do not represent the same molecule, write to the list of errors.
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_{test_name}.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

    sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag
Example #12
 def test4MolToInchiKey(self):
     m = MolFromSmiles("CC=C(N)C")
     inchi = MolToInchi(m)
     k1 = InchiToInchiKey(inchi)
     k2 = MolToInchiKey(m)
     self.assertEqual(k1, k2)
Example #13
 def test2InchiOptions(self):
     m = MolFromSmiles("CC=C(N)C")
     inchi1 = MolToInchi(m).split('/', 1)[1]
     inchi2 = MolToInchi(m, "/SUU").split('/', 1)[1]
     self.assertEqual(inchi1 + '/b4-3?', inchi2)
Example #14
    def test1InchiReadPubChem(self):
        for f in self.dataset.values():
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                x = MolToInchi(m)
                y = None
                RDLogger.DisableLog('rdApp.error')
                mol = MolFromInchi(x)
                RDLogger.EnableLog('rdApp.error')
                if mol is not None:
                    y = MolToInchi(
                        MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
                if y is None:
                    # metal involved?
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        _, error = inst.args
                        if 'Metal' in error or \
                                'Charges were rearranged' in error:
                            reasonable += 1
                            continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # RDKit does not like the SMILES? use MolBlock instead
                    inchiMol = MolFromInchi(x)
                    if inchiMol:
                        rdDepictor.Compute2DCoords(inchiMol)
                        z = MolToInchi(MolFromMolBlock(
                            MolToMolBlock(inchiMol)))
                        if x == z:
                            reasonable += 1
                            continue
                    # InChI messed up the radical?
                    unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                    if sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in m.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]) != sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in unsanitizedInchiMol.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]):
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                          cid + '\n' + COLOR_RESET)
                    continue
                if x != y:
                    # if there was warning in the first place, then this is
                    # tolerable
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                        MolFromInchi(x, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        reasonable += 1
                        continue
                    # or if there are big rings
                    SanitizeMol(m)
                    if any(len(r) >= 8 for r in m.GetRingInfo().AtomRings()):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # or if RDKit loses bond stereo
                    s = MolToSmiles(m, True)
                    if MolToSmiles(MolFromSmiles(s), True) != s:
                        reasonable += 1
                        continue
                    # or if it is RDKit SMILES writer unhappy about the mol
                    inchiMol = MolFromInchi(x)
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN +
                          'Molecule mismatch for PubChem Compound ' + cid +
                          COLOR_RESET)
                    print(inchiDiff(x, y))
                    print()
                else:
                    same += 1
            fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            self.assertEqual(same, 684)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 497)
Example #15
def test_kekulize():
    mol = Filters.kekulize(MolFromSmiles('c1ccccc1'))
    assert MolToSmiles(mol) == 'C1=CC=CC=C1'
Example #16
    def construct_feature_matrices(self, smiles):
        """ construct a molecule from the given smiles string and return atom
        and bond classes.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of bonds in the molecule
        'atom' : (n_atom,) length list of atom classes
        'bond' : (n_bond,) list of bond classes
        'connectivity' : (n_bond, 2) array of source atom, target atom pairs.

        """

        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = len(mol.GetAtoms())
        n_bond = 2 * len(mol.GetBonds())

        # If its an isolated atom, add a self-link
        if n_bond == 0:
            n_bond = 1

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        bond_index = 0

        atom_seq = mol.GetAtoms()
        atoms = [atom_seq[i] for i in range(n_atom)]

        for n, atom in enumerate(atoms):

            # Atom Classes
            atom_feature_matrix[n] = self.atom_tokenizer(
                self.atom_features(atom))

            start_index = atom.GetIdx()

            for bond in atom.GetBonds():
                # Is the bond pointing at the target atom
                rev = bond.GetBeginAtomIdx() != start_index

                # Bond Classes (indexed by bond_index, not the atom counter n)
                bond_feature_matrix[bond_index] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))

                # Connectivity
                if not rev:  # Original direction
                    connectivity[bond_index, 0] = bond.GetBeginAtomIdx()
                    connectivity[bond_index, 1] = bond.GetEndAtomIdx()

                else:  # Reversed
                    connectivity[bond_index, 0] = bond.GetEndAtomIdx()
                    connectivity[bond_index, 1] = bond.GetBeginAtomIdx()

                bond_index += 1

        return {
            'n_atom': n_atom,
            'n_bond': n_bond,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }
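# A small self-contained sketch of just the connectivity layout built above: every RDKit
# bond contributes two directed edges, which is why n_bond = 2 * len(mol.GetBonds()).
# The helper name and example SMILES are ours, not part of the original class.
def directed_edges(smiles):
    mol = MolFromSmiles(smiles)
    edges = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edges.append((i, j))  # original direction
        edges.append((j, i))  # reversed direction
    return np.array(edges, dtype='int')

# directed_edges('CCO') -> array of shape (4, 2)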
Example #17
def smiles_reader(smiles, **kwargs):
    kwargs.setdefault('sanitize', True)
    return MolFromSmiles(smiles, **kwargs)
Example #18
batch_size = 32
# hidden_size = int(args.hidden_size)
# latent_size = int(args.latent_size)
# depth = int(opts.depth)

model = JTNNVAE(vocab, args.hidden_size, args.latent_size, args.depthT,
                args.depthG, args.num_layers, args.use_graph_conv)
model.load_state_dict(torch.load(args.model))
model = model.cuda()

smiles_rdkit = []
for i in range(len(smiles)):
    print(i, 'smiles')
    smiles_rdkit.append(
        MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

logP_values = []
for i in range(len(smiles)):
    print(i, 'logP_values')
    logP_values.append(Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

SA_scores = []
for i in range(len(smiles)):
    print(i, 'SA_scores')
    SA_scores.append(-sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

import networkx as nx

cycle_scores = []
for i in range(len(smiles)):
Example #19
 def from_smiles(cls, smiles):
     mol = MolFromSmiles(smiles)
     return cls.from_mol(mol)
Example #20
smiles_list, y = parse_dataset(task, PATHS[task])  # NEED TO FIX MALARIA

dat_size = len(smiles_list)

mpi_comm = MPI.COMM_WORLD
mpi_rank = mpi_comm.Get_rank()
mpi_size = mpi_comm.Get_size()

my_border_low, my_border_high = return_borders(mpi_rank, dat_size, mpi_size)

my_list = smiles_list[my_border_low:my_border_high]
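# tanimoto() is called further below but not defined in this snippet. A minimal sketch,
# assuming it returns the pairwise Tanimoto similarity matrix between the rows of two
# binary fingerprint matrices (an assumption, not necessarily the original helper):
def tanimoto(A, B):
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    intersection = A @ B.T
    union = A.sum(axis=1)[:, None] + B.sum(axis=1)[None, :] - intersection
    return intersection / union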

bit_list = [2048]
for bits in bit_list:        
    my_mols = [MolFromSmiles(smiles) for smiles in my_list]
    X = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=bits) for mol in my_mols]
    X = np.asarray(X)
    
    dbs = mpi_comm.gather(X, root=0) 
    
    if mpi_rank == 0:
        for db in dbs[1:]:
            X = np.vstack([X, db])
        # print(X)
        # np.save('data/'+task+'/'+task+'_ecfp_'+str(bits)+'.npy', X)

        print('Number of bits: {}'.format(bits))
        kernel = 1 - tanimoto(X, X)
        print(kernel)
        print(kernel.shape)
Example #21
#!/usr/bin/python2
# Small harness for timing how long it takes to embed a molecule,
# which seems to be extremely variable even on a single machine.
from __future__ import print_function, division
import sys, time, os
from rdkit.Chem import MolFromSmiles, AddHs, RemoveHs
from rdkit.Chem.AllChem import EmbedMolecule

if __name__ == "__main__":
    dotimestamp = int(os.getenv('MOLEMBED_TIME', '0'))
    doaddh = int(os.getenv('MOLEMBED_ADDH', '0'))
    rseed = int(os.getenv('MOLEMBED_SEED', '0'))
    t0 = time.time()
    for line in sys.stdin.readlines():
        s = line.strip()
        if dotimestamp:
            t1 = time.time()
            dt = (t1 - t0) * 1e3
            print('%.3f' % dt, s)
            t0 = t1
        else:
            print(s)
        m = MolFromSmiles(s)
        if doaddh:
            m2 = AddHs(m)
        else:
            m2 = m
        EmbedMolecule(m2, randomSeed=rseed)
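# Assumed invocation (illustrative; the script file name is hypothetical):
#   MOLEMBED_TIME=1 MOLEMBED_ADDH=1 MOLEMBED_SEED=42 python embed_timing.py < smiles.txt
# With MOLEMBED_TIME=1 each input line is printed together with the milliseconds elapsed
# since the previous line was processed.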
Example #22
def checksmi(smiles):
    return MolFromSmiles(str(smiles))

Example #23
if __name__ == '__main__':

    if TASK in ('e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'):
        X_train, X_test, y_train, y_test, dft_vals = dft_train_test_split(PATH, TASK)
    else:
        raise Exception('Must specify a valid task')

    rdkit_train_mols = [MolFromSmiles(smiles) for smiles in X_train]
    X_train = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512) for mol in rdkit_train_mols]
    X_train = np.asarray(X_train)

    rdkit_test_mols = [MolFromSmiles(smiles) for smiles in X_test]
    X_test = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512) for mol in rdkit_test_mols]
    X_test = np.asarray(X_test)

    X_train, y_train, X_test, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    regr_rf = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=2)
    regr_rf.fit(X_train, y_train)

    # Predict on new data
    y_rf = regr_rf.predict(X_test)
    y_rf = y_scaler.inverse_transform(y_rf)
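# transform_data() is used above but not defined in this snippet. A minimal sketch,
# assuming it only standardizes the regression targets and passes the fingerprint
# matrices through unchanged (an assumption; the original helper may also scale X):
from sklearn.preprocessing import StandardScaler

def transform_data(X_train, y_train, X_test, y_test):
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))
    y_test_scaled = y_scaler.transform(np.asarray(y_test).reshape(-1, 1))
    return X_train, y_train_scaled, X_test, y_test_scaled, y_scaler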
Example #24
def test_remove_isotope():
    mol = Filters.remove_isotope(MolFromSmiles('c1cc[14cH]cc1'))
    assert MolToSmiles(mol) == ('c1ccccc1')
Example #25
def gen_latent_demo(data_path):
    import sys
    sys.path.append('/home/icml18-jtnn')
    import torch
    import torch.nn as nn
    from torch.autograd import Variable
    from optparse import OptionParser

    import rdkit
    from rdkit.Chem import Descriptors
    from rdkit.Chem import MolFromSmiles, MolToSmiles
    from rdkit.Chem import rdmolops
    import sascorer

    import numpy as np
    from jtnn import *

    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)
    #data_path=args.data_path
    vocab_path = '../data/vocab.txt'

    with open(data_path) as f:
        smiles = f.readlines()

    for i in range(len(smiles)):
        smiles[i] = smiles[i].strip()

    vocab = [x.strip("\r\n ") for x in open(vocab_path)]
    vocab = Vocab(vocab)
    batch_size = 1
    hidden_size = 450
    latent_size = 56
    depth = 3

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(
        torch.load('../molvae/MPNVAE-h450-L56-d3-beta0.005/model.iter-4',
                   map_location=lambda storage, loc: storage))

    smiles_rdkit = []
    for i in range(len(smiles)):
        smiles_rdkit.append(
            MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

    logP_values = []
    for i in range(len(smiles)):
        logP_values.append(Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

    SA_scores = []
    for i in range(len(smiles)):
        SA_scores.append(
            -sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

    import networkx as nx

    cycle_scores = []
    for i in range(len(smiles)):
        cycle_list = nx.cycle_basis(
            nx.Graph(
                rdmolops.GetAdjacencyMatrix(MolFromSmiles(smiles_rdkit[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6
        cycle_scores.append(-cycle_length)

    SA_scores_normalized = (np.array(SA_scores) -
                            np.mean(SA_scores)) / np.std(SA_scores)
    logP_values_normalized = (np.array(logP_values) -
                              np.mean(logP_values)) / np.std(logP_values)
    cycle_scores_normalized = (np.array(cycle_scores) -
                               np.mean(cycle_scores)) / np.std(cycle_scores)

    latent_points = []
    for i in range(0, len(smiles), batch_size):
        batch = smiles[i:i + batch_size]
        mol_vec = model.encode_latent_mean(batch)
        latent_points.append(mol_vec.data.cpu().numpy())

    # We store the results
    latent_points = np.vstack(latent_points)
    np.savetxt('latent_features_demo.txt', latent_points)

    targets = SA_scores_normalized + logP_values_normalized + cycle_scores_normalized
    np.savetxt('targets_demo.txt', targets)
    np.savetxt('logP_values_demo.txt', np.array(logP_values))
    np.savetxt('SA_scores_demo.txt', np.array(SA_scores))
    np.savetxt('cycle_scores_demo.txt', np.array(cycle_scores))
Example #26
def test_neutralise_charge():
    mol = Filters.neutralise_charge(MolFromSmiles('CC(C(=O)[O-])O'))
    assert MolToSmiles(mol) == ('CC(O)C(=O)O')
Example #27
    print(len(valid_smiles), " molecules are found")
    valid_smiles = valid_smiles[:50]
    new_features = next_inputs[:50]
    new_features = np.vstack(new_features)
    save_object(valid_smiles,
                args.save_dir + "/valid_smiles{}.dat".format(iteration))

    import sascorer
    import networkx as nx
    from rdkit.Chem import rdmolops

    scores = []
    for i in range(len(valid_smiles)):
        print(i, 'calculating scores')
        current_log_P_value = Descriptors.MolLogP(
            MolFromSmiles(valid_smiles[i]))
        current_SA_score = -sascorer.calculateScore(
            MolFromSmiles(valid_smiles[i]))
        cycle_list = nx.cycle_basis(
            nx.Graph(
                rdmolops.GetAdjacencyMatrix(MolFromSmiles(valid_smiles[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6

        current_cycle_score = -cycle_length
Example #28
def test_add_hydrogen():
    mol = Filters.add_hydrogen(MolFromSmiles('CC(O)C(=O)O'))
    assert MolToSmiles(mol) == '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'
    mol = Filters.add_hydrogen(MolFromSmiles('CC(C(=O)[O-])O'))
    assert MolToSmiles(mol) == '[H]OC([H])(C(=O)[O-])C([H])([H])[H]'
Example #29
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to an existing SDF file, or create a new one if it doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set, it will be created as a CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If set and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must also be set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            commands.append(input_file)
            stdout, stderr, exit_code = common_subprocess(commands)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)
        else:
            raise UserWarning("Input is empty.")

        if dry_run:
            return " ".join(commands)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        if output_file_sdf:
            if sdf_append:
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w", encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        stdout = stdout.split("\n")
        del stdout[-1]
        stderr = [
            x.strip() for x in stderr.split("\n")[1:] if x
        ]  # drop the first line of stderr, which is an OPSIN banner message

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = iter(f.readlines())
        else:
            lines = iter(input.split("\n"))

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        e = 0
        for i, line in enumerate(lines):
            line = line.strip()
            converted = stdout[i].strip()
            mol_output = mol_output_template.copy()

            if converted:
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                if opsin_output_format == "smi":
                    mol = MolFromSmiles(
                        converted,
                        sanitize=False if standardize_mols else True)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(
                        converted,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as exc:  # avoid shadowing the stderr index counter 'e'
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(exc)))

                    for f in output_formats:
                        if f == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif f == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif f == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)

                    if output_file_sdf:
                        writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    self.logger.warning(mol_output["error"])
            else:
                try:
                    error = stderr[e].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                e += 1
            compounds.append(mol_output)

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return
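# Illustrative (assumed) usage of process(); the wrapper class name and its constructor
# are not shown in this snippet, so both are hypothetical here:
#   opsin = OPSIN()
#   result = opsin.process(input=["propan-2-ol", "2-acetyloxybenzoic acid"],
#                          output_formats=["smiles", "inchi"])
#   for record in result["content"]:
#       print(record["iupac"], record["smiles"], record["error"])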
Example #30
    reg_scores = []  # collect scores for objective function
    logP_scores = []  # collect scores for logP term in objective function
    SA_values = []  # collect scores for the synthetic accessibility term in the objective function
    # 30 September - CAREFUL about variable names!!! This was the cause of the NaNs: it
    # conflicted with the variable on line 340 and caused NaNs in the program at runtime.

    # 2 October - changed to SA_values; the variable names should be made consistent
    # between "values" and "scores".

    for i in range(len(valid_smiles_final)):
        to_add = []
        logP = []
        SA = []
        if len(valid_smiles_final[i]) != 0:
            for j in range(0, len(valid_smiles_final[i])):
                current_log_P_value = Descriptors.MolLogP(
                    MolFromSmiles(valid_smiles_final[i][j]))
                current_SA_score = -sascorer.calculateScore(
                    MolFromSmiles(valid_smiles_final[i][j]))
                cycle_list = nx.cycle_basis(
                    nx.Graph(
                        rdmolops.GetAdjacencyMatrix(
                            MolFromSmiles(valid_smiles_final[i][j]))))

                if len(cycle_list) == 0:
                    cycle_length = 0
                else:
                    cycle_length = max([len(j) for j in cycle_list])
                if cycle_length <= 6:
                    cycle_length = 0
                else:
                    cycle_length = cycle_length - 6