Beispiel #1
0
def desalt(mol):
    """Keep only the largest disconnected component of an RDKit molecule.

    The molecule is first round-tripped through InChI and sanitized. If the
    result has a single fragment it is returned unchanged. Otherwise its
    SMILES is split on ``'.'``, every piece is round-tripped through InChI
    (parsed with ``sanitize=False``), and the piece with the most atoms is
    kept; ties go to the piece that appears first.

    Args:
        mol: An RDKit molecule.

    Returns:
        tuple: ``(parent_mol, status)``. ``status`` is True when a largest
        fragment was selected, and False when the molecule had a single
        component or when no fragment could be rebuilt (in which case the
        InChI-round-tripped input molecule is returned).
    """
    # Known input that escapes this routine:
    # InChI=1S/2C6H11NO5.O.V/c2*1-3(5(8)9)7(12)4(2)6(10)11;;/h2*3-4,12H,1-2H3,(H,8,9)(H,10,11);;/q;;;+2/p-2/t2*3-,4-;;/m00../s1
    # gave the error "Molecule must be fully connected by covalent bonds."
    mol = MolFromInchi(MolToInchi(mol))
    SanitizeMol(mol)
    fragments = Chem.rdmolops.GetMolFrags(mol)  # tuples of atom indices
    if len(fragments) == 1:
        return mol, False

    # Fall back to the whole molecule so the return below is always bound,
    # even when every fragment fails to round-trip through InChI.
    parent_mol = mol
    parent_atom_count = 0
    status = False
    for fragment_smiles in Chem.MolToSmiles(mol, True).split('.'):
        # sanitize=True fails for choline sulfate: the radical can't be sanitized.
        little_mol = Chem.MolFromInchi(
            Chem.MolToInchi(Chem.MolFromSmiles(fragment_smiles,
                                               sanitize=False)))
        if little_mol is None:
            continue
        count = little_mol.GetNumAtoms()
        if count > parent_atom_count:
            parent_atom_count = count
            parent_mol = little_mol
            status = True
    return parent_mol, status
Beispiel #2
0
def smi_to_inchi_with_val(smiles, ovalues):
    """Convert SMILES to InChI, dropping entries RDKit cannot round-trip.

    Args:
        smiles: Iterable of SMILES strings.
        ovalues: Iterable of values paired element-wise with ``smiles``.

    Returns:
        tuple: ``(inchis, values)`` lists of equal length, holding only the
        pairs whose SMILES parsed and whose generated InChI could be read
        back by RDKit.
    """
    inchis = []
    values = []

    for smi, val in zip(smiles, ovalues):
        mol = MolFromSmiles(smi)
        if mol is None:
            continue
        try:
            inchi = MolToInchi(mol)
            # ensure rdkit can read an inchi it just wrote...
            round_trip = MolFromInchi(inchi)
        except Exception:
            # Conversion failures drop the entry; KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            continue
        if round_trip is not None:
            inchis.append(inchi)
            values.append(val)
    return inchis, values
Beispiel #3
0
def inchi_from_smiles(smiles: str) -> Union[str, None]:
    """
    Get an InChI descriptor from a SMILES descriptor.
    Uses RDKit for the conversion.

    Args:
        smiles (str): The SMILES descriptor.

    Returns:
        Union[str, None]: The corresponding InChI descriptor, or None when
        the SMILES cannot be parsed or the conversion raises.
    """
    try:
        mol = MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: report it explicitly rather than relying on
            # MolToInchi raising for a None argument.
            return None
        return MolToInchi(mol)
    except Exception:
        # Best-effort conversion; interpreter-exit signals still propagate.
        return None
Beispiel #4
0
def process_cyp(neutralize=False):
    """Build the CYP3A4 classification dataset and pickle it.

    Reads ``CYP3A4.csv`` from the ``cyp`` folder under ``DATA_PATH``, maps the
    ``Class`` column to 1 ("Active") or 0 (anything else), converts SMILES to
    InChI, averages the labels per InChI and writes ``[inchis, values]`` to
    ``data_cyp.pt``.

    Args:
        neutralize: When True, every InChI is passed through
            ``neutralize_atoms`` with ``PATTERN`` before being written.
    """
    cyp_dir = os.path.join(DATA_PATH, "cyp")

    raw = pd.read_csv(os.path.join(cyp_dir, "CYP3A4.csv"), header=0, sep=";")
    raw["Value"] = [int(label == "Active") for label in raw["Class"]]

    kept_inchis, kept_values = smi_to_inchi_with_val(raw["SMILES"],
                                                     raw["Value"])
    converted = pd.DataFrame({"inchi": kept_inchis, "values": kept_values})
    inchis, values = mean_by_key(converted, "inchi", "values")

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    out_path = os.path.join(cyp_dir, "data_cyp.pt")
    with open(out_path, "wb") as handle:
        pickle.dump([inchis, values], handle)
Beispiel #5
0
def process_herg(list_csvs, keep_operators=False, neutralize=False):
    """Build the hERG pIC50 dataset from tab-separated exports and pickle it.

    All files in ``list_csvs`` are concatenated, filtered to IC50 values in
    nM, converted to pIC50, de-duplicated, averaged per SMILES, converted to
    InChI and written as ``[inchis, values]`` to ``data_herg.pt`` in the
    ``herg`` folder under ``DATA_PATH``.

    Args:
        list_csvs: Sequence of paths to tab-separated files; must be non-empty.
        keep_operators: When False, only rows whose ``Relation`` is "=" are kept.
        neutralize: When True, every InChI is passed through
            ``neutralize_atoms`` with ``PATTERN`` before being written.
    """
    df = pd.read_csv(list_csvs[0], sep="\t")
    for extra_csv in list_csvs[1:]:
        df = pd.concat([df, pd.read_csv(extra_csv, sep="\t")])

    # keep IC50 measurements reported in nM (and, unless asked otherwise,
    # only those with an exact "=" relation)
    keep = (df.Value_type == "IC50") & (df.Unit == "nM")
    if not keep_operators:
        keep = keep & (df.Relation == "=")

    df = df.loc[keep, ["Canonical_smiles", "Value"]]

    # nM -> M, then negative log10 gives pIC50
    df.Value = -np.log10(df.Value * 1e-9)

    per_dup, stds = duplicate_analysis(df, "Canonical_smiles", "Value")
    report = "Percentage of duplicates for hERG dataset: {:.3f}, with average std.: {:.3f}, and median std.:{:.3f}".format(
        per_dup, np.mean(stds), np.median(stds))
    print(report)

    df.drop_duplicates(inplace=True)

    # collapse repeated measurements of the same SMILES into their mean
    uq_smiles, uq_values = mean_by_key(df, "Canonical_smiles", "Value")

    # entries RDKit cannot convert are dropped here
    print("Dropping faulty molecules...")
    inchis, values = smi_to_inchi_with_val(uq_smiles, uq_values)

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    out_path = os.path.join(DATA_PATH, "herg", "data_herg.pt")
    with open(out_path, "wb") as handle:
        pickle.dump([inchis, values], handle)
Beispiel #6
0
def process_caco2(neutralize=False):
    """Build the Caco-2 permeability dataset from two sources and pickle it.

    The PeerJ spreadsheet provides InChIs directly; the PLOS ONE table
    provides compound names that are resolved to InChI through the
    ``IUPAC_REST`` web service. Both are normalised through RDKit, merged,
    averaged per InChI and written as ``[inchis, values]`` to
    ``data_caco2.pt`` in the ``caco2`` folder under ``DATA_PATH``.

    Args:
        neutralize: When True, every InChI is passed through
            ``neutralize_atoms`` with ``PATTERN`` before being written.
    """
    caco2_dir = os.path.join(DATA_PATH, "caco2")
    papp_col = "Caco-2 Papp * 10^6 cm/s"

    # ---- PeerJ data ----
    peerj = pd.read_excel(os.path.join(caco2_dir, "peerj-03-1405-s001.xls"))
    peerj = peerj.loc[:, ["InChI", papp_col]]
    peerj.dropna(inplace=True)
    peerj["Value"] = -np.log10(peerj[papp_col] * 1e-6)

    peerj_inchis = []
    peerj_values = []
    for raw_inchi, val in zip(peerj["InChI"], peerj["Value"]):
        mol = MolFromInchi(raw_inchi)
        if mol is None:
            continue
        # re-emit through rdkit so every InChI follows the same specification
        peerj_inchis.append(MolToInchi(mol))
        peerj_values.append(val)

    peerj = pd.DataFrame({"InChI": peerj_inchis, "Value": peerj_values})

    # ---- PLOS ONE data ----
    pone = pd.read_csv(os.path.join(caco2_dir, "caco2perm_pone.csv"))
    pone["Value"] = -np.log10(pone["Papp (Caco-2) [cm/s]"])
    pone = pone.loc[:, ["name", "Value"]]
    pone.dropna(inplace=True)

    print("Querying InchI strings from IUPAC names...")
    merged_inchis = []
    merged_values = []

    for mol_name, val in tqdm(zip(pone["name"], pone["Value"]),
                              total=len(pone)):
        ans = requests.get(IUPAC_REST.format(mol_name))
        if ans.status_code != 200:
            continue
        # re-emit through rdkit so every InChI follows the same specification
        new_mol = MolFromInchi(ans.content.decode("utf8"))
        if new_mol is None:
            continue
        merged_inchis.append(MolToInchi(new_mol))
        merged_values.append(val)

    merged_inchis.extend(peerj["InChI"].tolist())
    merged_values.extend(peerj["Value"].tolist())

    df = pd.DataFrame({"inchi": merged_inchis, "values": merged_values})
    per_dup, stds = duplicate_analysis(df, "inchi", "values")

    report = "Percentage of duplicates for CaCO2 dataset: {:.5f}, with average std.: {:.3f}, and median std.:{:.3f}".format(
        per_dup, np.mean(stds), np.median(stds))
    print(report)

    uq_inchi = pd.unique(df["inchi"]).tolist()

    print("Averaging values and ensuring rdkit readability...")
    inchis = []
    values = []

    # One averaged value per distinct InChI that rdkit can still parse
    for inchi in tqdm(uq_inchi):
        if MolFromInchi(inchi) is None:
            continue
        same_inchi = df.loc[df["inchi"] == inchi]
        inchis.append(inchi)
        values.append(same_inchi["values"].mean())

    if neutralize:
        neutral = []
        for inchi in inchis:
            neutral.append(
                MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN)))
        inchis = neutral

    out_path = os.path.join(caco2_dir, "data_caco2.pt")
    with open(out_path, "wb") as handle:
        pickle.dump([inchis, values], handle)
Beispiel #7
0
def _names_to_inchi_with_val(names, ovalues):
    """Resolve compound names to RDKit-normalised InChIs via ``IUPAC_REST``.

    Args:
        names: Sized iterable of compound names (its ``len`` sizes the
            progress bar).
        ovalues: Iterable of values paired element-wise with ``names``.

    Returns:
        tuple: ``(inchis, values)`` for the names whose lookup returned HTTP
        200 and whose InChI RDKit could parse, re-emit and parse again.
    """
    inchis = []
    values = []
    for mol_name, val in tqdm(zip(names, ovalues), total=len(names)):
        ans = requests.get(IUPAC_REST.format(mol_name))
        if ans.status_code != 200:
            continue
        # the service may not follow the same InChI standard as rdkit...
        mol = MolFromInchi(ans.content.decode("utf8"))
        if mol is None:
            # Unparsable answer: skip it instead of passing None to MolToInchi.
            continue
        # Use same inchi specification as rdkit...
        new_inchi = MolToInchi(mol)
        if MolFromInchi(new_inchi) is not None:
            inchis.append(new_inchi)
            values.append(val)
    return inchis, values


def process_ppb(neutralize=False):
    """Build the plasma-protein-binding (PPB, in %) dataset and pickle it.

    Six sources under the ``ppb`` folder of ``DATA_PATH`` are merged. Four are
    keyed by SMILES and two by compound name; the names are resolved through
    the ``IUPAC_REST`` web service. All structures are stored as InChI as
    emitted by RDKit, values are averaged per InChI, unreadable InChIs are
    dropped and ``[inchis, values]`` is written to ``data_ppb.pt``.

    Args:
        neutralize: When True, every InChI is passed through
            ``neutralize_atoms`` with ``PATTERN`` before being written.
    """
    inchis = []
    values = []

    # first dataset
    xlsxs = glob(
        os.path.join(DATA_PATH, "ppb", "11095_2013_1023_MOESM[2-4]_ESM.xlsx"))
    for idx, xlsx in enumerate(xlsxs):
        # NOTE(review): the glob matches at most three files, so ``idx < 3`` is
        # always true and "Experimental PPB_[%]" is never selected; glob order
        # is also not guaranteed. Confirm the intended per-file column.
        ppb_col = "Experimental_%PPB" if idx < 3 else "Experimental PPB_[%]"
        df1 = pd.read_excel(xlsx)
        df1 = df1.loc[:, ["SMILES", ppb_col]]
        inchis_1, values_1 = smi_to_inchi_with_val(df1["SMILES"], df1[ppb_col])
        inchis.extend(inchis_1)
        values.extend(values_1)

    # second dataset: fraction unbound -> percent bound
    df2 = pd.read_excel(os.path.join(DATA_PATH, "ppb",
                                     "ci6b00291_si_001.xlsx"))
    df2 = df2.loc[:, ["SMILES", "Fub"]]
    df2["Value"] = 100 * (1 - df2["Fub"])
    inchis_2, values_2 = smi_to_inchi_with_val(df2["SMILES"], df2["Value"])
    inchis.extend(inchis_2)
    values.extend(values_2)

    # third dataset: the source column is scaled by 100 to give percent
    df3 = pd.read_excel(
        os.path.join(DATA_PATH, "ppb",
                     "cmdc201700582-sup-0001-misc_information.xlsx"),
        sheet_name=4,
    )
    df3 = df3.loc[:,
                  ["SMILES", "PPB_Traditional_assay(serve as the true value)"]]
    df3["Value"] = 100 * df3["PPB_Traditional_assay(serve as the true value)"]
    inchis_3, values_3 = smi_to_inchi_with_val(df3["SMILES"], df3["Value"])
    inchis.extend(inchis_3)
    values.extend(values_3)

    # fourth dataset: keyed by compound name
    df4 = pd.read_excel(
        os.path.join(DATA_PATH, "ppb", "jm051245vsi20061025_033631.xls"))
    df4 = df4.loc[:, ["NAME (Drug or chemical  name)", "PBexp(%)"]]
    inchis_4, values_4 = _names_to_inchi_with_val(
        df4["NAME (Drug or chemical  name)"], df4["PBexp(%)"])
    inchis.extend(inchis_4)
    values.extend(values_4)

    # fifth dataset: fraction unbound -> percent bound
    df5 = pd.read_excel(os.path.join(DATA_PATH, "ppb",
                                     "mp8b00785_si_002.xlsx"))
    df5 = df5.loc[:, ["canonical_smiles", "fup"]]
    df5["Value"] = 100 * (1 - df5["fup"])
    inchis_5, values_5 = smi_to_inchi_with_val(df5["canonical_smiles"],
                                               df5["Value"])
    inchis.extend(inchis_5)
    values.extend(values_5)

    # sixth dataset: keyed by compound name
    df6 = pd.read_html(os.path.join(DATA_PATH, "ppb", "kratochwil2002.html"),
                       header=0)[0]
    df6 = df6.loc[:, ["Compound", "fb (%)b"]].dropna()
    inchis_6, values_6 = _names_to_inchi_with_val(df6["Compound"],
                                                  df6["fb (%)b"])
    inchis.extend(inchis_6)
    values.extend(values_6)

    # join them all together
    df = pd.DataFrame({"inchi": inchis, "values": values})

    # checking duplicates
    per_dup, stds = duplicate_analysis(df, "inchi", "values")
    print(
        "Percentage of duplicates for PPB dataset: {:.5f}, with average std.: {}, and median std.:{}"
        .format(per_dup, np.mean(stds), np.median(stds)))

    # average values w. equal inchi and check readability
    print("Averaging values and ensuring rdkit readability...")
    inchis, values = mean_by_key(df, "inchi", "values")

    inchis, values = ensure_readability(inchis, values, MolFromInchi)

    if neutralize:
        inchis = [
            MolToInchi(neutralize_atoms(MolFromInchi(inchi), PATTERN))
            for inchi in inchis
        ]

    with open(os.path.join(DATA_PATH, "ppb", "data_ppb.pt"), "wb") as handle:
        pickle.dump([inchis, values], handle)
Beispiel #8
0
def _mol_from_inchi_text(inchi):
    """Parse ``inchi`` with RDKit, prepending "InChI=1S/" when that prefix is absent."""
    if not inchi.startswith("InChI=1S/"):
        inchi = "InChI=1S/" + inchi
    return MolFromInchi(inchi)


def process(init_data, use_cache=True):
    '''Assemble a normalised chemical record from partial identifying data.

    Identifiers are gathered from three places, in decreasing order of trust:
    the caller's ``init_data``, the common-chemistry lookup for its CAS
    number, and a PubChem lookup. The first source that yields a parsable
    structure (mol file, then SMILES, then InChI within each source) is used
    to derive SMILES, InChI, InChIKey, formula and molecular weight; values
    given explicitly in ``init_data`` then override the derived ones.

    Parameters
    ----------
    init_data : dict
        Any of the keys 'CAS', 'mol' (path to a mol file), 'smiles', 'inchi',
        'inchikey', 'pubchem', 'CASRN', 'formula', 'MW', 'name', 'synonyms'.
        The dict is copied and not modified.
    use_cache : bool
        Forwarded to ``find_pubchem_from_ids``.

    Returns
    -------
    dict
        Keys 'cid', 'CAS', 'formula', 'MW', 'smiles', 'inchi', 'inchikey',
        'name' and 'synonyms'. 'cid' is -1 when no PubChem id is known;
        'synonyms' is de-duplicated, excludes the name, and is sorted by
        length and then case-insensitively.

    Raises
    ------
    ValueError
        If no structure, CAS number or name can be determined, or if a
        resulting identifier is '*'.

    Examples
    --------

    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    init_data = init_data.copy()
    cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            (cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms,
             cc_deprecated_CASs) = common_chemistry_data(init_data['CAS'])
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass

    cid = iupac_name = p_MW = p_inchi = p_inchikey = p_smiles = p_formula = p_synonyms = None

    # Parse a user-supplied mol file once. It is reused for the structure
    # lookup below, and a missing or None 'mol' entry is simply skipped.
    file_mol = None
    if init_data.get('mol') is not None:
        file_mol = Chem.MolFromMolFile(init_data['mol'])
        if file_mol is not None:
            # Derive the inchi and inchikey so they can drive the pubchem lookup
            init_data['inchi'] = MolToInchi(file_mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])

    can_search_pubchem = (init_data.get('pubchem') is not None
                          or init_data.get('CASRN', cc_CAS) is not None
                          or init_data.get('inchi', cc_inchi) is not None
                          or init_data.get('inchikey', cc_inchikey) is not None
                          or init_data.get('smiles', cc_smiles) is not None)

    if can_search_pubchem:
        try:
            p = find_pubchem_from_ids(
                pubchem=init_data.get('pubchem'),
                CASRN=init_data.get('CASRN', cc_CAS),
                inchi=init_data.get('inchi', cc_inchi),
                inchikey=init_data.get('inchikey', cc_inchikey),
                smiles=init_data.get('smiles', cc_smiles),
                use_cache=use_cache)
        except Exception as e:
            # Best effort: a failed lookup is reported and then ignored
            p = None
            print(e, 'exception')
        if p is not None:
            cid, iupac_name, p_MW, p_inchi, p_inchikey, p_smiles, p_formula, p_synonyms = p

    # Be aware some smiles descriptions are wrong
    # Start with user overriding
    mol = file_mol
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        mol = _mol_from_inchi_text(init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = _mol_from_inchi_text(cc_inchi)
    # Did we pull up the structure from pubchem??
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = _mol_from_inchi_text(p_inchi)
    if mol is None:
        raise ValueError("No structure found")

    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    formula = CalcMolFormula(mol, True, True)
    formula = serialize_formula(formula)
    MW = molecular_weight(nested_formula_parser(formula))

    # output values
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        cid = -1

    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")

    # Explicit user values override whatever rdkit derived
    if 'formula' in init_data:
        formula = init_data['formula']
    if 'MW' in init_data:
        MW = init_data['MW']
    if 'smiles' in init_data:
        smiles = init_data['smiles']
    if 'inchi' in init_data:
        inchi = init_data['inchi']
    if 'inchikey' in init_data:
        inchikey = init_data['inchikey']

    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")

    # Do we have a name specified in the settings?
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")

    synonyms = []
    if cc_synonyms is not None:
        synonyms += cc_synonyms
    if cc_deprecated_CASs is not None:
        synonyms += cc_deprecated_CASs
    if p_synonyms is not None:
        synonyms += p_synonyms
    if 'synonyms' in init_data:
        synonyms += init_data['synonyms']
    synonyms = list(set(synonyms))
    if name in synonyms:
        synonyms.remove(name)
    # Shortest first, ties broken case-insensitively
    synonyms = sorted(synonyms, key=lambda s: (len(s), s.lower()))

    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms
    }
def inchi_from_smiles(smiles):
    """Return the InChI that RDKit generates for the given SMILES string."""
    return MolToInchi(MolFromSmiles(smiles))
def physnet_to_datalist(self,
                        N,
                        R,
                        E,
                        D,
                        Q,
                        Z,
                        num_mol,
                        mols,
                        efgs_batch,
                        EFG_R,
                        EFG_Z,
                        num_efg,
                        sol_data=None):
    """
    Load data from PhysNet structure to InMemoryDataset structure (more compact).

    One ``Data`` object is built per molecule from row ``i`` of the inputs,
    trimmed to the first ``N[i]`` atoms, and passed through
    ``self.pre_transform``.

    :param N: per-molecule atom counts
    :param R: per-molecule atom positions, viewed as ``(-1, 3)``
    :param E: per-molecule value stored as ``Data.E``
    :param D: per-molecule value stored as ``Data.D``, viewed as ``(-1, 3)``
    :param Q: per-molecule value stored as ``Data.Q``
    :param Z: per-molecule padded atomic numbers
    :param num_mol: number of molecules to convert
    :param mols: per-molecule mol objects; read only when ``self.bond_atom_sep`` is set
    :param efgs_batch: atom-to-EFG assignment; read only when ``self.cal_efg`` is set
    :param EFG_R: EFG positions; read only when ``self.cal_efg`` is set
    :param EFG_Z: EFG types; read only when ``self.cal_efg`` is set
    :param num_efg: per-molecule EFG counts; read only when ``self.cal_efg`` is set
    :param sol_data: optional table with an "InChI" column; when given, only
        molecules with exactly one matching row are kept, the ``sol_keys``
        columns are attached to them, and a 0/1 match mask is saved to
        ``jianing_to_dongdong_map_{n_heavy}.pt``
    :return: list of transformed ``Data`` objects (``None`` results dropped)
    """
    # NOTE(review): inputs are assumed to be torch tensors indexed by molecule
    # (``.view`` is called on their rows) -- confirm against the caller.
    from rdkit.Chem.inchi import MolToInchi

    # Used only to name the saved match mask. Presumably counts non-padding,
    # non-hydrogen atoms of the first molecule (Z == 0 padding, Z == 1 H).
    Z_0 = Z[0, :]
    n_heavy = len(Z_0) - (Z_0 == 0).sum() - (Z_0 == 1).sum()

    data_list = []
    jianing_to_dongdong_map = []

    for i in tqdm(range(num_mol)):
        mol = mols[i] if self.bond_atom_sep else None

        # atomic infos
        _tmp_Data = Data()
        _tmp_Data.N = N[i].view(-1)
        _tmp_Data.R = R[i, :N[i]].view(-1, 3)
        _tmp_Data.E = E[i].view(-1)
        _tmp_Data.D = D[i].view(-1, 3)
        _tmp_Data.Q = Q[i].view(-1)
        _tmp_Data.Z = Z[i, :N[i]].view(-1)

        if self.cal_efg:
            _tmp_Data.atom_to_EFG_batch = efgs_batch[i, :N[i]].view(-1)
            _tmp_Data.EFG_R = EFG_R[i, :num_efg[i]].view(-1, 3)
            _tmp_Data.EFG_Z = EFG_Z[i, :num_efg[i]].view(-1)
            _tmp_Data.EFG_N = num_efg[i].view(-1)

        if sol_data is not None:
            # find molecule from solvation csv file based on InChI, if found, add it
            # NOTE(review): ``mol`` is None when ``self.bond_atom_sep`` is
            # false, so this lookup relies on that flag being set.
            this_sol_data = sol_data.loc[sol_data["InChI"] == MolToInchi(mol)]
            if this_sol_data.shape[0] != 1:
                # no unique match: record the miss and drop this molecule
                jianing_to_dongdong_map.append(0)
                continue
            for key in sol_keys:
                setattr(_tmp_Data, key,
                        torch.as_tensor(this_sol_data.iloc[0][key]).view(-1))
            jianing_to_dongdong_map.append(1)

        _tmp_Data = self.pre_transform(
            data=_tmp_Data,
            edge_version=self.edge_version,
            do_sort_edge=self.sort_edge,
            cal_efg=self.cal_efg,
            cutoff=self.cutoff,
            extended_bond=self.extended_bond,
            boundary_factor=self.boundary_factor,
            type_3_body=self.type_3_body,
            use_center=self.use_center,
            mol=mol,
            cal_3body_term=self.cal_3body_term,
            bond_atom_sep=self.bond_atom_sep,
            record_long_range=self.record_long_range)

        # A None result from pre_transform is dropped from the output
        if _tmp_Data is not None:
            data_list.append(_tmp_Data)

    if sol_data is not None:
        torch.save(torch.as_tensor(jianing_to_dongdong_map),
                   "jianing_to_dongdong_map_{}.pt".format(n_heavy))

    return data_list