Example #1
0
def pipe_sim_filter(stream,
                    query,
                    cutoff=0.8,
                    summary=None,
                    comp_id="pipe_sim_filter"):
    """Yield the records of `stream` that are similar enough to `query`.

    A record is yielded when the similarity between its fingerprint and the
    fingerprint of the `query` Smiles is greater than or equal to `cutoff`.
    Records without a `mol` field are skipped.
    If a record carries the field `FP_b64` (e.g. pre-calculated), that
    base64-encoded pickle is used as its fingerprint; otherwise the
    fingerprint is generated from `mol` on-the-fly (much slower).
    When `summary` is given, `summary[comp_id]` is set to the running number
    of hits each time a record is yielded.
    If `query` cannot be turned into a molecule, an error is printed and
    nothing is yielded."""
    parsed_query = Chem.MolFromSmiles(query)
    if not parsed_query:
        print("* {} ERROR: could not generate query from SMILES.".format(
            comp_id))
        return

    reference_fp = FingerprintMols.FingerprintMol(parsed_query)
    hits = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            candidate_fp = FingerprintMols.FingerprintMol(rec["mol"])
        else:
            # NOTE: unpickling is only safe when the stream is trusted.
            candidate_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        similarity = DataStructs.FingerprintSimilarity(reference_fp,
                                                       candidate_fp)
        if not (similarity >= cutoff):
            continue

        hits += 1
        if summary is not None:
            summary[comp_id] = hits

        yield rec
Example #2
0
def ScreenInDb(details, mol):
  """ Screen `mol` against the fingerprints described by `details`.

  The probe fingerprint is built with the attributes of `details` as keyword
  arguments. For the Tanimoto, Dice and Cosine metrics the similarity is
  computed by the database (SQL functions rd_tanimoto / rd_dice / rd_cosine)
  and the fetched rows are returned; for any other metric the work is
  delegated to ScreenFingerprints. An empty list is returned when the probe
  molecule cannot be fingerprinted.

  """
  try:
    probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
  except Exception:
    import traceback
    FingerprintMols.error('Error: problems fingerprinting molecule.\n')
    traceback.print_exc()
    return []

  # metric -> name of the SQL function that computes it
  sqlFuncByMetric = (
    (DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
    (DataStructs.DiceSimilarity, 'rd_dice'),
    (DataStructs.CosineSimilarity, 'rd_cosine'),
  )
  if details.metric not in [metric for metric, _ in sqlFuncByMetric]:
    return ScreenFingerprints(details, data=GetFingerprints(details), mol=mol)

  conn: DbConnect = _ConnectToDatabase(details)
  for metric, sqlName in sqlFuncByMetric:
    if details.metric == metric:
      func = sqlName
      break

  query = _ConstructSQL(
    details, extraFields=f"{func}({DbModule.placeHolder},{details.fpColName}) as tani")
  if details.doThreshold:
    # the threshold has to be applied in an outer query
    query = f"select * from ({query}) tmp where tani>{details.screenThresh}"
    query += " order by tani desc"
  else:
    query += " order by tani desc"
    if details.topN > 0:
      query += f" limit {details.topN}"

  bitString = probeFp.ToBitString()
  cursor = conn.GetCursor()
  cursor.execute(query, (bitString, ))
  return cursor.fetchall()
Example #3
0
def calc_similarity(compound_one, compound_two):
    """Return the fingerprint similarity of two compounds, cached in `joint_sim`.

    Compound properties are read from the module-level `cg_props` mapping
    (keyed by the lower-cased compound name; the "type" and "smiles" entries
    are used). Results are stored symmetrically in the module-level
    `joint_sim` nested dict, and a cached value is returned when present.

    Compounds whose "type" entries differ get a similarity of 0.0 without
    any fingerprint being computed.
    """
    scores_one = joint_sim.setdefault(compound_one, dict())
    if compound_two in scores_one:
        return scores_one[compound_two]
    scores_two = joint_sim.setdefault(compound_two, dict())

    props_one = cg_props[compound_one.lower()]
    props_two = cg_props[compound_two.lower()]

    # Fixed: the original compared compound_one's type with itself, so this
    # shortcut could never trigger.
    if props_one["type"] != props_two["type"]:
        similarity = 0.0
    else:
        # Imported lazily so the type-mismatch shortcut and cache hits do
        # not require RDKit to be loaded.
        from rdkit import DataStructs
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit import Chem

        mol_one = Chem.MolFromSmiles(str(props_one["smiles"]))
        mol_two = Chem.MolFromSmiles(str(props_two["smiles"]))
        fp_1 = FingerprintMols.FingerprintMol(mol_one)
        fp_2 = FingerprintMols.FingerprintMol(mol_two)
        similarity = DataStructs.FingerprintSimilarity(fp_1, fp_2)

    scores_one[compound_two] = similarity
    scores_two[compound_one] = similarity
    return similarity
Example #4
0
    def __init__(self, moli, molj):
        """
        Initialization function

        Stores both molecules, computes their fingerprints and the
        similarity between the two fingerprints.

        Parameters
        ----------

        moli : RDKit molecule object
            the first molecule used to perform the fingerprint calculation
        molj : RDKit molecule object
            the second molecule used to perform the fingerprint calculation

        Notes
        -----
        ``options`` is not a parameter of this method; the name is resolved
        from the enclosing scope and only its ``verbose`` attribute is read.

        """

        # Configure the logging format and level
        logging.basicConfig(level=logging.INFO,
                            format='%(levelname)s:\t%(message)s')

        # Keep references to the molecules that were passed in
        self.moli, self.molj = moli, molj

        # Silence the RDKit logger unless pedantic verbosity was requested
        if options.verbose != 'pedantic':
            RDLogger.logger().setLevel(RDLogger.CRITICAL)

        self.fps_moli = FingerprintMols.FingerprintMol(self.moli)
        self.fps_molj = FingerprintMols.FingerprintMol(self.molj)
        self.fps_tan = DataStructs.FingerprintSimilarity(self.fps_moli,
                                                         self.fps_molj)
Example #5
0
    def test__init__(self):
        """Check fingerprint similarity with default and custom sizes, and folding."""
        from rdkit.Chem.Fingerprints import FingerprintMols
        mols = [Chem.MolFromSmiles(smi) for smi in ('CCOC', 'CCO', 'COC')]

        # Default fingerprinting arguments
        default_fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]
        self.assertAlmostEqual(
            FingerprintSimilarity(default_fps[0], default_fps[1]),
            0.6,
            places=2)

        # A different fpSize for every molecule: 16, 32, 48
        details = FingerprinterDetails()
        fpArgs = details.__dict__
        sized_fps = []
        for position, mol in enumerate(mols):
            fpArgs['fpSize'] = 16 * (position + 1)
            sized_fps.append(FingerprintMols.FingerprintMol(mol, **fpArgs))
        # The similarity must not depend on the argument order
        for left, right in ((0, 1), (1, 0)):
            self.assertAlmostEqual(
                FingerprintSimilarity(sized_fps[left], sized_fps[right]),
                0.555,
                places=2)

        # Folding driven by the target density
        fpArgs.update(fpSize=1024, tgtDensity=0.8)
        folded = FingerprintMols.FingerprintMol(mols[0], **fpArgs)
        self.assertEqual(len(folded), 64)
        folded = DataStructs.FoldToTargetDensity(folded,
                                                 density=0.1,
                                                 minLength=2)
        self.assertEqual(len(folded), 4)
Example #6
0
def _mols_similarity_base_r0(ms_smiles_mid, ms_smiles_base):
    """
    Print the fingerprint similarity of every base molecule against every
    target ("mid") molecule.

    Input: dictionary type required such as {nick name: smiles code, ...}

    Nothing is returned; for each (base, target) pair the two nick names
    are printed, followed by the similarity value.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    def _named_fingerprints(smiles_by_name):
        # Parse every SMILES first, then fingerprint, keeping each nick
        # name paired with its fingerprint.
        names = list(smiles_by_name)
        mols = [Chem.MolFromSmiles(smiles_by_name[name]) for name in names]
        return list(zip(names,
                        (FingerprintMols.FingerprintMol(m) for m in mols)))

    # processing for mid
    # Fixed: concatenating a str with dict.keys() raised TypeError.
    print("Target: {}".format(list(ms_smiles_mid)))
    fps_mid = _named_fingerprints(ms_smiles_mid)

    # processing for base
    print("Base: {}".format(list(ms_smiles_base)))
    fps_base = _named_fingerprints(ms_smiles_base)

    # Fixed: dict.keys() is not indexable on Python 3; iterate the pairs.
    for base_name, f_b in fps_base:
        for mid_name, f_d in fps_mid:
            print("Base:{0}, Target:{1}".format(base_name, mid_name))
            print(DataStructs.FingerprintSimilarity(f_b, f_d))
Example #7
0
def molecular_similarity(best, parent_candidates, all=False):
    """
    returns a similarity score (0-1) of best with the
    closest molecular relative in parent_candidates

    Parameters
    ----------
    best : object
        Chromosome object, the current
        mutated candidate; its ``Mol`` attribute is fingerprinted
    parent_candidates : array
        parent pool of molecules to compare with best.
        These are represented by SMILES
    all : boolean, optional, default = False
        default behavior is false and the tanimoto
        similarity score is returned. If True
        tanimoto, dice, cosine, sokal, kulczynski,
        and mcconnaughey similarities are returned

    Returns
    ----------
    similarity_score : float
    similarity_index : int
        if all=False the best tanimoto similarity score
        as well as the index of the closest molecular
        relative are returned
        if all=True an array of best scores and indices
        of the closest molecular relative are returned

    Raises
    ----------
    ValueError
        if parent_candidates is empty
    """
    if len(parent_candidates) == 0:
        # max() of an empty sequence raised ValueError before; keep the
        # exception type but make the cause explicit.
        raise ValueError("parent_candidates must not be empty")

    # Fingerprints do not depend on the metric, so compute them once
    # instead of once per (metric, candidate) pair.
    best_fp = FingerprintMols.FingerprintMol(best.Mol)
    parent_fps = [
        FingerprintMols.FingerprintMol(Chem.MolFromSmiles(candidate))
        for candidate in parent_candidates
    ]

    if not all:
        tanimoto = [
            DataStructs.FingerprintSimilarity(best_fp, fp)
            for fp in parent_fps
        ]
        top = max(tanimoto)
        return top, tanimoto.index(top)

    metrics = [
        DataStructs.TanimotoSimilarity, DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity, DataStructs.SokalSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity
    ]
    scores = []
    indices = []
    for metric in metrics:
        per_candidate = [
            DataStructs.FingerprintSimilarity(best_fp, fp, metric=metric)
            for fp in parent_fps
        ]
        top = max(per_candidate)
        scores.append(top)
        indices.append(per_candidate.index(top))
    return scores, indices
Example #8
0
def tanimotoComparison(pred_prod_list, true_prod_list):
    """Return the fingerprint similarity of the predicted and true product.

    Despite the ``_list`` suffix, each argument is passed directly to
    ``Chem.MolFromSmiles``, i.e. it is used as a single SMILES string.
    """
    mols = [
        Chem.MolFromSmiles(smi) for smi in (pred_prod_list, true_prod_list)
    ]
    pred_fps, answer_fps = [FingerprintMols.FingerprintMol(m) for m in mols]
    return DataStructs.FingerprintSimilarity(pred_fps, answer_fps)
Example #9
0
def _load_pdb_mol(pdb_path):
    """Read `pdb_path` and return the RDKit molecule parsed from its PDB block."""
    # The context manager closes the handle; the original leaked every file.
    with open(pdb_path, 'r') as pdb_file:
        return rdmolfiles.MolFromPDBBlock(pdb_file.read())


def build_reactions(perturbations_all_paths, mcs_neighbours):
    """Print the (up to) five molecules most similar to the query molecule.

    `perturbations_all_paths` is a sequence of sequences of PDB file paths.
    Every file is loaded and fingerprinted. The query is the heavier
    (exact molecular weight) of the first two files of the first entry,
    the first one winning ties. All molecules are ranked by their
    fingerprint similarity to the query; only the first molecule of each
    distinct similarity value is kept and the top five are printed.

    `mcs_neighbours` is not used by the code visible here.
    """
    # Parse each PDB once and reuse the molecule for the fingerprint
    # (the original parsed every block twice).
    all_members = [
        _load_pdb_mol(member)
        for member in itertools.chain.from_iterable(perturbations_all_paths)
    ]
    all_members_FPs = [
        FingerprintMols.FingerprintMol(mol) for mol in all_members
    ]

    first_pair = perturbations_all_paths[0]
    member1 = _load_pdb_mol(first_pair[0])
    member2 = _load_pdb_mol(first_pair[1])
    size_member1 = rdMolDescriptors.CalcExactMolWt(member1)
    size_member2 = rdMolDescriptors.CalcExactMolWt(member2)
    query_member = FingerprintMols.FingerprintMol(
        member1 if size_member1 >= size_member2 else member2)

    similarities = [
        AllChem.DataStructs.FingerprintSimilarity(query_member, target_fp)
        for target_fp in all_members_FPs
    ]

    # Most similar first; sorted() is stable, so ties keep input order.
    ranked = sorted(zip(all_members, similarities),
                    key=lambda kv: kv[1],
                    reverse=True)

    # Keep only the first molecule for every distinct similarity value.
    seen_values = set()
    similar_hits = []
    for mol, value in ranked:
        if value not in seen_values:
            seen_values.add(value)
            similar_hits.append(mol)

    neighbours = similar_hits[:5]
    print(neighbours)
Example #10
0
def cal_pairwise_tanimoto(pair):
    """Return the CSV line "i,j,similarity\\n" for one pair of indices.

    `pair` unpacks into two indices into the module-level `smiles`
    sequence; the Tanimoto similarity of the two fingerprints is appended.
    """
    i, j = pair
    mols = [Chem.MolFromSmiles(smiles[index]) for index in (i, j)]
    fps1, fps2 = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    tani = DataStructs.TanimotoSimilarity(fps1, fps2)
    return ",".join(map(str, (i, j, tani))) + "\n"
Example #11
0
def GetFingerprints(details):
    """ returns an iterable sequence of fingerprints
    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

    The fingerprints come from the database when both details.dbName and
    details.tableName are set, otherwise from the pickle file
    details.inFileName. None is returned when neither source is configured.

    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error(
                'Error: Problems establishing connection to database: %s|%s\n'
                % (details.dbName, details.tableName))
            traceback.print_exc()
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()
        if _dataSeq:
            suppl = _dataSeq(curs,
                             cmd,
                             depickle=not details.noPickle,
                             klass=DataStructs.ExplicitBitVect)
            _dataSeq._conn = conn
        else:
            # NOTE(review): `data` is not defined in this function; this
            # branch needs a module-level `data` to work - confirm.
            suppl = DbFpSupplier.ForwardDbFpSupplier(
                data, fpColName=details.fpColName)
    elif details.inFileName:
        # Fixed: the list was created as `supple` but appended to as
        # `suppl`, which raised NameError on the first fingerprint.
        suppl = []
        try:
            inF = open(details.inFileName, 'r')
        except IOError:
            import traceback
            FingerprintMols.error('Error: Problems reading from file %s\n' %
                                  (details.inFileName))
            traceback.print_exc()
        else:
            # Close the file when done (it used to be leaked).
            with inF:
                while True:
                    try:
                        fpId, fp = cPickle.load(inF)
                    except Exception:
                        # Any load failure (normally end of file) ends
                        # the sequence.
                        break
                    fp._fieldsFromDb = [fpId]
                    suppl.append(fp)
    else:
        suppl = None

    return suppl
Example #12
0
def TakeInput(filepath, hmdb_filepath, OR_name):
    """Match ligands against HMDB entries by fingerprint similarity.

    Reads the ligand table from `filepath` (columns 'Smiles' and 'Ligand')
    and the HMDB table from `hmdb_filepath` (columns 'SMILES' and 'NAME'),
    both as ISO-8859-1 CSV. Every de-duplicated ligand is compared with
    every HMDB entry; pairs with a similarity of at least 0.85 are written
    to "Final_test_set_<OR_name>.csv". The distinct ligands that had at
    least one such match are written to
    "Shortlisted_Metabolites<OR_name>.csv" with an empty
    'Activation Status' column.

    "WARNING" is printed for every pair whose HMDB entry cannot be
    fingerprinted or compared. Nothing is returned.
    """
    # The return value was never used; the call is kept in case
    # extractPositiveOnes has side effects.
    extractPositiveOnes(filepath)
    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = data_hmdb[['SMILES', 'NAME']].reset_index(drop=True)
    Cancer_clean_data = (positive_Cancer[['Smiles', 'Ligand']]
                         .drop_duplicates()
                         .reset_index(drop=True))

    # Fingerprint every HMDB entry once instead of once per ligand.
    # None marks an entry that could not be fingerprinted.
    hmdb_fps = []
    for hmdb_smiles in hmdb_data['SMILES']:
        try:
            hmdb_fps.append(
                FingerprintMols.FingerprintMol(
                    Chem.MolFromSmiles(hmdb_smiles)))
        except Exception:
            hmdb_fps.append(None)

    columns = [
        "Cancer_Molecule", "Cancer_SMILES", "HMDB_Molecule", "HMDB_SMILES",
        "TANIMOTO_Similarity_Value"
    ]
    rows = []
    ligands = zip(Cancer_clean_data['Smiles'], Cancer_clean_data['Ligand'])
    for i, (cancer_smiles, ligand) in enumerate(ligands):
        fps1 = FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(cancer_smiles))
        for j, fps2 in enumerate(hmdb_fps):
            if fps2 is None:
                print("WARNING")
                continue
            try:
                sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
            except Exception:
                print("WARNING")
                continue
            if sim_val >= 0.85:  # threshold for similarity value
                rows.append([
                    ligand, cancer_smiles, hmdb_data['NAME'][j],
                    hmdb_data['SMILES'][j], sim_val
                ])
        print("Comparison Done for Ligand :" + str(i))

    df1 = pd.DataFrame(rows, columns=columns)
    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # Fixed: the original read columns that do not exist in df1 and zipped
    # them with an empty list, which could only ever give an empty table.
    # NOTE(review): 'Activation Status' is left empty because no source for
    # it is visible here - confirm the intended values.
    Shortlisted_Metabolites = df1[["Cancer_SMILES", "Cancer_Molecule"]].rename(
        columns={"Cancer_SMILES": "Smiles", "Cancer_Molecule": "Ligand"})
    Shortlisted_Metabolites["Activation Status"] = None
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')

    # Fixed: `OR_Name` was an undefined name; the parameter is `OR_name`.
    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name +
                                   ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name +
          ".csv has been successfully saved!")
Example #13
0
def tanimoto_score(mol1, mol2):
    """Return the reciprocal of the fingerprint similarity of mol1 and mol2.

    Note that the result grows as the molecules become less similar. A tiny
    constant (1e-15) is added to the similarity before dividing, presumably
    to avoid a division by zero for completely dissimilar molecules.
    """

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    fingerprints = [FingerprintMols.FingerprintMol(m) for m in (mol1, mol2)]
    similarity = DataStructs.FingerprintSimilarity(*fingerprints)
    return 1.0 / (similarity + 1e-15)
Example #14
0
def ScreenInDb(details, mol):
    """Screen `mol` against the fingerprints described by `details`.

    The probe fingerprint is generated with the attributes of `details` as
    keyword arguments. For the Tanimoto, Dice and Cosine metrics the
    similarity is evaluated inside the database (SQL functions rd_tanimoto,
    rd_dice, rd_cosine) and the fetched rows are returned. For any other
    metric the fingerprints are loaded with GetFingerprints and screened
    with ScreenFingerprints.

    Returns an empty list if the probe molecule cannot be fingerprinted.
    """
    try:
        # Fixed: the apply() builtin no longer exists in Python 3;
        # argument unpacking is the equivalent call.
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error(
                'Error: Problems establishing connection to database: %s|%s\n'
                % (details.dbName, details.tableName))
            traceback.print_exc()

    if details.metric not in (DataStructs.TanimotoSimilarity,
                              DataStructs.DiceSimilarity,
                              DataStructs.CosineSimilarity):
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if details.metric == DataStructs.TanimotoSimilarity:
        func = 'rd_tanimoto'
    elif details.metric == DataStructs.DiceSimilarity:
        func = 'rd_dice'
    elif details.metric == DataStructs.CosineSimilarity:
        func = 'rd_cosine'
    # The probe is sent as a bit string for all three metrics.
    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder,
                                         details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = "select * from (%s) tmp where tani>%f" % (
            cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN
    # NOTE(review): `conn` is only bound when dbName/tableName are set and
    # the connection succeeded; otherwise this line raises.
    curs = conn.GetCursor()
    curs.execute(cmd, (pkl, ))
    return curs.fetchall()
Example #15
0
def caculate_similarity_fingerprint(smiles_A, smiles_B):
    """Return the fingerprint similarity of two SMILES, rounded to 4 decimals.

    Returns -1 when the similarity cannot be computed (for example when a
    SMILES cannot be turned into a fingerprint).
    """
    try:
        m1 = Chem.MolFromSmiles(smiles_A)
        m2 = Chem.MolFromSmiles(smiles_B)
        f1 = FingerprintMols.FingerprintMol(m1)
        f2 = FingerprintMols.FingerprintMol(m2)
        return round(DataStructs.FingerprintSimilarity(f1, f2), 4)
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return -1
Example #16
0
def combine_via_chemsimilarity(cypfile, cypreact):
    """Merge the DrugBank and ChEMBL compound tables, dropping duplicates.

    `cypfile` (DrugBank) and `cypreact` (ChEMBL) are CSV files whose second
    column holds a SMILES string. A DrugBank row counts as a duplicate when
    its fingerprint similarity to some ChEMBL row equals 1.

    Writes two files in the current directory:
      * "duplicates.csv": for every duplicate a "Duplicates" marker row,
        followed by the ChEMBL row and the DrugBank row, so conflicting
        annotations can be investigated;
      * "final_.csv": all ChEMBL rows followed by the remaining DrugBank
        rows.

    Rows whose SMILES cannot be parsed are never treated as duplicates.
    Returns None.
    """

    def _fingerprint(row):
        # None marks a row whose SMILES RDKit could not parse.
        mol = Chem.MolFromSmiles(row[1])
        return None if mol is None else FingerprintMols.FingerprintMol(mol)

    with open("final_.csv", "w", newline='') as checked_file, \
            open("duplicates.csv", "w", newline='') as checked_file_2:
        csv_writer_checked = csv.writer(checked_file, quoting=csv.QUOTE_ALL)
        # Fixed: this writer used to be bound to checked_file, so the
        # duplicate report ended up in final_.csv.
        csv_writer_checked_2 = csv.writer(checked_file_2,
                                          quoting=csv.QUOTE_ALL)

        # currently only support Drugbank data and ChEMBL data
        with open(cypfile, newline='') as drugbank_csv:
            DRUGBANK = list(csv.reader(drugbank_csv, delimiter=','))
        with open(cypreact, newline='') as ChEMBL_csv:
            CHEMBL = list(csv.reader(ChEMBL_csv, delimiter=','))

        # Fingerprint every DrugBank row once instead of once per pair.
        drugbank_fps = [_fingerprint(db) for db in DRUGBANK]
        # Fixed: rows used to be removed from DRUGBANK while iterating over
        # it, which skips the row after each removal. Flag them instead.
        is_duplicate = [False] * len(DRUGBANK)

        for cl in CHEMBL:
            fps_c = _fingerprint(cl)
            if fps_c is None:
                continue
            for idx, db in enumerate(DRUGBANK):
                fps_d = drugbank_fps[idx]
                if is_duplicate[idx] or fps_d is None:
                    continue
                if DataStructs.FingerprintSimilarity(fps_c, fps_d) == 1:
                    csv_writer_checked_2.writerow(["Duplicates"])
                    csv_writer_checked_2.writerow(cl)
                    csv_writer_checked_2.writerow(db)
                    is_duplicate[idx] = True

        remaining = [
            db for db, duplicate in zip(DRUGBANK, is_duplicate)
            if not duplicate
        ]
        print("remaining compound from drugbank is: " + str(len(remaining)))
        csv_writer_checked.writerows(CHEMBL)
        csv_writer_checked.writerows(remaining)

    print("similarity check done ...")
    return None
Example #17
0
def GetFingerprints(details):
    """ returns an iterable sequence of fingerprints
    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

    Fingerprints are read from the database when both details.dbName and
    details.tableName are set, otherwise from the pickle file
    details.inFileName. None is returned when neither source is configured;
    an empty list is returned when the file cannot be opened.

    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, "dbUser"):
                conn.user = details.dbUser
            if hasattr(details, "dbPassword"):
                conn.password = details.dbPassword
        except Exception:
            import traceback

            FingerprintMols.error(
                "Error: Problems establishing connection to database: %s|%s\n" % (details.dbName, details.tableName)
            )
            traceback.print_exc()
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()
        if _dataSeq:
            suppl = _dataSeq(curs, cmd, depickle=not details.noPickle, klass=DataStructs.ExplicitBitVect)
            _dataSeq._conn = conn
        else:
            # NOTE(review): `data` is not defined in this function; this
            # branch needs a module-level `data` to work - confirm.
            suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
        return suppl

    if not details.inFileName:
        return None

    suppl = []
    try:
        inF = open(details.inFileName, "r")
    except IOError:
        import traceback

        FingerprintMols.error("Error: Problems reading from file %s\n" % (details.inFileName))
        traceback.print_exc()
        # Previously the read loop still ran and ended on the NameError
        # for the unbound file object; return the empty result directly.
        return suppl

    # Close the file when done (the handle used to be leaked).
    with inF:
        while True:
            try:
                ID, fp = cPickle.load(inF)
            except Exception:
                # Any load failure (normally end of file) ends the sequence.
                break
            fp._fieldsFromDb = [ID]
            suppl.append(fp)

    return suppl
Example #18
0
 def compute_single_tanimoto_metric(first_smile, second_smile):
     """Return the Tanimoto similarity of two SMILES strings.

     0. is returned when either SMILES is None or cannot be parsed into a
     molecule.
     """
     smiles_pair = (first_smile, second_smile)
     if any(smile is None for smile in smiles_pair):
         return 0.

     mols = [MolFromSmiles(smile) for smile in smiles_pair]
     if any(mol is None for mol in mols):
         return 0.

     first_fp, second_fp = [FingerprintMols.FingerprintMol(mol) for mol in mols]
     return DataStructs.FingerprintSimilarity(
         first_fp, second_fp, metric=DataStructs.TanimotoSimilarity)
Example #19
0
 def calc_tanimoto(self, reference):
     """
     Determine the tanimoto similarity score based on the rd mol fingerprint

     The fingerprints of ``reference.rd_mol`` and ``self.rd_mol`` are
     compared. Returns the similarity, or None (after logging the
     exception) when the fingerprints or the similarity cannot be computed.
     """
     try:
         fps_ref = FingerprintMols.FingerprintMol(reference.rd_mol)
         fps_self = FingerprintMols.FingerprintMol(self.rd_mol)
         return DataStructs.FingerprintSimilarity(fps_ref, fps_self)
     except Exception:
         # Narrowed from a bare `except:` so that KeyboardInterrupt and
         # SystemExit are no longer swallowed.
         logger.exception('Caught exception attempting to run rdkit FingerprintMols.FingerprintMol or DataStructs.FingerprintSimilarity')
         return None
Example #20
0
def ScreenInDb(details, mol):
    """Screen `mol` against the fingerprints described by `details`.

    The probe fingerprint is generated with the attributes of `details` as
    keyword arguments. For the Tanimoto, Dice and Cosine metrics the
    similarity is evaluated inside the database (SQL functions rd_tanimoto,
    rd_dice, rd_cosine) and the fetched rows are returned. For any other
    metric the fingerprints are loaded with GetFingerprints and screened
    with ScreenFingerprints.

    Returns an empty list if the probe molecule cannot be fingerprinted.
    """
    try:
        # Fixed: the apply() builtin no longer exists in Python 3;
        # argument unpacking is the equivalent call.
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback

        FingerprintMols.error("Error: problems fingerprinting molecule.\n")
        traceback.print_exc()
        return []
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, "dbUser"):
                conn.user = details.dbUser
            if hasattr(details, "dbPassword"):
                conn.password = details.dbPassword
        except Exception:
            import traceback

            FingerprintMols.error(
                "Error: Problems establishing connection to database: %s|%s\n" % (details.dbName, details.tableName)
            )
            traceback.print_exc()

    if details.metric not in (DataStructs.TanimotoSimilarity, DataStructs.DiceSimilarity, DataStructs.CosineSimilarity):
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if details.metric == DataStructs.TanimotoSimilarity:
        func = "rd_tanimoto"
    elif details.metric == DataStructs.DiceSimilarity:
        func = "rd_dice"
    elif details.metric == DataStructs.CosineSimilarity:
        func = "rd_cosine"
    # The probe is sent as a bit string for all three metrics.
    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN
    # NOTE(review): `conn` is only bound when dbName/tableName are set and
    # the connection succeeded; otherwise this line raises.
    curs = conn.GetCursor()
    curs.execute(cmd, (pkl,))
    return curs.fetchall()
def _metal_indices(reagent, list_of_full_metal_names, list_of_metal_atoms):
    """Return the indices of the metals mentioned in `reagent`, or None.

    Full metal names are tried first; atom symbols are only consulted when
    no full name occurs in `reagent`.
    """
    for vocabulary in (list_of_full_metal_names, list_of_metal_atoms):
        hits = [
            vocabulary.index(metal) for metal in vocabulary
            if metal in reagent
        ]
        if hits:
            return hits
    return None


def is_similar_reagent(rgt1, rgt2, list_of_metal_atoms,
                       list_of_full_metal_names):
    """Decide whether two reagent strings denote similar reagents.

    The checks are applied in this order:
      1. identical strings are similar;
      2. if either string contains 'Reaxys ID' they are not similar;
      3. if both mention metals and the resulting index lists are equal
         they are similar (an index may come from either metal list, so
         the two lists are presumably parallel - verify against caller);
      4. if either string contains 'Reaxys' they are not similar;
      5. otherwise both are parsed as SMILES and are similar only when
         neither fingerprint is empty and the fingerprint similarity is
         at least 1.0.

    Returns a bool. 'cannot calculate fp' is printed and False returned
    when a fingerprint cannot be computed.
    """
    if rgt1 == rgt2:
        return True
    if 'Reaxys ID' in rgt1 or 'Reaxys ID' in rgt2:
        return False

    # if both have metal atoms, compare the metals
    rgt1_metal = _metal_indices(rgt1, list_of_full_metal_names,
                                list_of_metal_atoms)
    rgt2_metal = _metal_indices(rgt2, list_of_full_metal_names,
                                list_of_metal_atoms)
    # A reagent without metals (None) never matches on metals.
    if rgt1_metal is not None and rgt1_metal == rgt2_metal:
        return True

    if 'Reaxys' in rgt1 or 'Reaxys' in rgt2:
        return False
    try:
        mol1 = Chem.MolFromSmiles(rgt1)
        mol2 = Chem.MolFromSmiles(rgt2)
        fp1 = FingerprintMols.FingerprintMol(mol1)
        fp2 = FingerprintMols.FingerprintMol(mol2)
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('cannot calculate fp')
        return False
    # An all-zero fingerprint carries no information to compare.
    if not any(fp1) or not any(fp2):
        return False
    return bool(DataStructs.FingerprintSimilarity(fp1, fp2) >= 1.0)
Example #22
0
 def mtansr(self, ):
     """
     Structural-similarity rule: scores the two stored molecules
     (``self.moli`` and ``self.molj``) by the similarity of their
     fingerprints.

     Returns
     -------
     scr_tan : float
         the rule score
     """
     fingerprints = [
         FingerprintMols.FingerprintMol(mol)
         for mol in (self.moli, self.molj)
     ]
     return DataStructs.FingerprintSimilarity(*fingerprints)
Example #23
0
def create_fpmols(smiles: List[str] or str) -> List or str:
    """Fingerprint one SMILES string or a list of SMILES strings.

    A list input gives a list of fingerprints (one per SMILES, same
    order); a single string gives a single fingerprint. Any other input
    type raises ValueError.
    """
    if not isinstance(smiles, (list, str)):
        raise ValueError(f'{type(smiles)} is not supported')

    if isinstance(smiles, str):
        return FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))

    # Parse everything first, then fingerprint.
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    return [FingerprintMols.FingerprintMol(mol) for mol in mols]
Example #24
0
def pipe_sim_filter(stream,
                    query,
                    cutoff=80,
                    summary=None,
                    comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose scaffold is similar to `query`.

    A record is yielded when the similarity (in percent) between its
    fingerprint and the fingerprint of the Murcko scaffold of the `query`
    Smiles is greater than or equal to `cutoff`. The rounded percentage is
    stored in the record's `Sim` field.
    Records without a `mol` field are skipped.
    If a record carries the field `FP_b64` (e.g. pre-calculated), that
    base64-encoded pickle is used as its fingerprint; otherwise the
    fingerprint of the record's Murcko scaffold is generated on-the-fly
    (much slower). The fingerprint type is selected by `USE_FP`.
    When `summary` is given, `summary[comp_id]` is set to the running number
    of hits each time a record is yielded.
    If `query` cannot be turned into a molecule, an error is printed and
    nothing is yielded."""

    def scaffold_fingerprint(mol):
        # Fingerprint of the Murcko scaffold, of the type chosen by USE_FP.
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        if USE_FP == "morgan":
            return Desc.rdMolDescriptors.GetMorganFingerprintAsBitVect(
                scaffold, 2)
        if USE_FP == "avalon":
            return pyAv.GetAvalonFP(scaffold, 1024)
        return FingerprintMols.FingerprintMol(scaffold)

    parsed_query = Chem.MolFromSmiles(query)
    if not parsed_query:
        print("* {} ERROR: could not generate query from SMILES.".format(
            comp_id))
        return

    reference_fp = scaffold_fingerprint(parsed_query)
    hits = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            candidate_fp = scaffold_fingerprint(rec["mol"])
        else:
            # NOTE: unpickling is only safe when the stream is trusted.
            candidate_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        percent = DataStructs.FingerprintSimilarity(reference_fp,
                                                    candidate_fp) * 100
        if not (percent >= cutoff):
            continue

        hits += 1
        rec["Sim"] = np.round(percent, 2)
        if summary is not None:
            summary[comp_id] = hits

        yield rec
Example #25
0
def score(end, start=None):
    """complexity score of a compound
    Should be high when that compound is difficult to synthesize
    and low when it is easy to synthesize

    When `start` is given, the score is the negated fingerprint similarity
    between `end` and `start` (lower scores are better). Without `start`,
    the score is 2 if `Hydrobromination.has_br(end)` is true and 1
    otherwise.
    """
    # Fixed: `start` used to be fingerprinted before this check, so the
    # default start=None failed inside FingerprintMol and the fallback
    # below was unreachable.
    if start is not None:
        f_end = FingerprintMols.FingerprintMol(end)
        f_start = FingerprintMols.FingerprintMol(start)
        sim = DataStructs.FingerprintSimilarity(f_end, f_start)
        return -1 * sim  # lower scores are better

    return 2 if Hydrobromination.has_br(end) else 1
Example #26
0
def _ConnectToDatabase(details) -> DbConnect:
  """Open a `DbConnect` for the database/table named in `details`.

  Returns the connection, or None when `details.dbName` or
  `details.tableName` is empty or when setting up the connection raised
  (the error is reported through `FingerprintMols.error` and the traceback
  is printed).
  """
  if not (details.dbName and details.tableName):
    return None

  try:
    connection = DbConnect(details.dbName, details.tableName)
    # Copy optional credentials across only when the caller supplied them.
    for source, target in (('dbUser', 'user'), ('dbPassword', 'password')):
      if hasattr(details, source):
        setattr(connection, target, getattr(details, source))
    return connection
  except Exception:
    import traceback
    FingerprintMols.error(f'Error: Problems establishing connection to '
                          f'database:{details.dbName}|{details.tableName}\n')
    traceback.print_exc()
  return None
    def __post_init__(self):
        """Derive molecule and fingerprint lists from the SMILES attributes.

        For each of `reactants`, `reagents` and `products`, stores the parsed
        molecules as `mol_<role>` and their fingerprints as `fp_<role>`.
        """
        roles = ("reactants", "reagents", "products")

        # First pass: parse every SMILES list into molecules.
        for role in roles:
            parsed = [Chem.MolFromSmiles(smi) for smi in getattr(self, role)]
            setattr(self, "mol_" + role, parsed)

        # Second pass: fingerprint the molecules produced above.
        for role in roles:
            prints = [
                FingerprintMols.FingerprintMol(mol)
                for mol in getattr(self, "mol_" + role)
            ]
            setattr(self, "fp_" + role, prints)
Example #28
0
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):
    """Summarise the similarity of `predEx` to its `X` most similar training entries.

    Parameters:
        trainSmilesList: items fingerprinted for the training set. They are
            passed straight to the fingerprint functions, so presumably they
            are molecule objects despite the name -- TODO confirm with caller.
        train: training examples, indexed in parallel with `trainSmilesList`;
            `ex[nameAttr].value` is used as the key of each example.
        predEx: example to compare; `predEx[smilesAttrName].value` is parsed
            with `Chem.MolFromSmiles`.
        smilesAttrName: attribute name holding the SMILES of `predEx`.
        nameAttr: attribute name holding the identifier of a training example.
        X: number of top similarities to keep.
        simType: "Topological", "Morgan" or "MACCS".

    Returns:
        Tuple `(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)`,
        each rounded to 3 decimals. The last two are whatever `getRespVar`
        returns for the top `X` and the top `X // 2` similarities.

    Raises:
        ValueError: if `simType` is not one of the supported values. The
            previous version only printed a message and then failed later
            with an unrelated error.
    """
    # Pick the fingerprint generator and similarity metric once, instead of
    # re-testing `simType` for every training example.
    if simType == "Topological":
        makeFP = FingerprintMols.FingerprintMol
        simFunc = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        makeFP = lambda mol: AllChem.GetMorganFingerprint(mol, 2)
        simFunc = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        makeFP = MACCSkeys.GenMACCSKeys
        simFunc = DataStructs.FingerprintSimilarity
    else:
        raise ValueError(
            "This type of sim is not implemented: %s" % (simType, ))

    fpsTrain = [makeFP(x) for x in trainSmilesList]
    fp = makeFP(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        # Index (rather than zip) so a length mismatch between `train` and
        # `trainSmilesList` still surfaces as an IndexError.
        sim = simFunc(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep only the X highest similarities.
    simList = sorted(simList, reverse=True)[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer on Python 2 and 3.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
Example #29
0
def _pathFingerprintsClustering(rdkit_mols):
    """
    Build the Tanimoto distance matrix from path fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        The numpy array containing the tanimoto matrix
    """
    from rdkit.Chem.Fingerprints import FingerprintMols  # computes path fingerprints

    # One fingerprint per molecule, with a progress bar over the input.
    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in tqdm(rdkit_mols)]

    # One TanimotoDistances call per fingerprint against the full set; each
    # result becomes one entry of the returned array.
    executor = ParallelExecutor(n_jobs=-1)
    run_jobs = executor(total=len(fingerprints), desc='PathFingerprints Distance')
    rows = run_jobs(
        delayed(TanimotoDistances)(current, fingerprints) for current in fingerprints)

    return np.array(rows)
Example #30
0
def CalculateDaylightFingerprint(mol):
    """
    Calculate the Daylight-like (topological) fingerprint of a molecule.

    Usage:

        result = CalculateDaylightFingerprint(mol)

    Input: mol is a molecule object.

    Output: a 3-tuple ``(n_bits, on_bits, bit_vector)`` where

        - n_bits is the fixed value 2048 (hard-coded here; it is not read
          back from the bit vector),
        - on_bits is a dict mapping every set bit position to 1,
        - bit_vector is the fingerprint object itself, usable for
          similarity calculations.
    """
    bit_vector = FingerprintMols.FingerprintMol(mol)
    on_bits = {position: 1 for position in bit_vector.GetOnBits()}
    return 2048, on_bits, bit_vector
Example #31
0
    def computeFP(self, typeFP):
        """Compute the requested fingerprint(s) for the cleaned SMILES.

        `typeFP` selects one of "Mol", "MACCS", "pairs", "Torsion" or
        "Morgan"; "All" computes every one of them. The molecule parsed from
        `self.smiclean` is stored in `self.mol` and the resulting dict
        (fingerprint name -> fingerprint) in `self.FP`.

        Returns 0 on success. Returns 1, after appending a message to
        `self.log`, when no `smiclean` attribute has been set on the instance.
        """
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit.Chem import MACCSkeys
        from rdkit.Chem.AtomPairs import Pairs, Torsions
        from rdkit.Chem import AllChem

        if "smiclean" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            return 1

        self.mol = Chem.MolFromSmiles(self.smiclean)

        # Name -> generator, in the order the fingerprints are computed.
        # Lambdas keep each RDKit lookup lazy until that entry is selected.
        builders = (
            ("Mol", lambda m: FingerprintMols.FingerprintMol(m)),
            ("MACCS", lambda m: MACCSkeys.GenMACCSKeys(m)),
            ("pairs", lambda m: Pairs.GetAtomPairFingerprint(m)),
            ("Torsion", lambda m: Torsions.GetTopologicalTorsionFingerprint(m)),
            ("Morgan", lambda m: AllChem.GetMorganFingerprint(m, 2)),
        )

        selected = {}
        for label, build in builders:
            if typeFP == label or typeFP == "All":
                selected[label] = build(self.mol)

        self.FP = selected
        return 0
def get_fp(mols):
    """Fingerprint every truthy molecule in `mols` according to `args.fpType`.

    Supported values of the module-level `args.fpType` are 'ECFP4', 'ECFP6',
    'ECFP12' (Morgan bit vectors with radius 2, 3 and 6), 'MACCS', 'Daylight'
    and 'AP' (hashed atom pairs, 4096 bits). Falsy entries of `mols` are
    skipped. Any other `fpType` yields an empty list without touching `mols`.
    """
    # Label -> generator. Lambdas defer the library lookups until a molecule
    # is actually fingerprinted.
    generators = (
        ('ECFP4', lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2)),
        ('ECFP6', lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3)),
        ('ECFP12', lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 6)),
        ('MACCS', lambda m: Chem.MACCSkeys.GenMACCSKeys(m)),
        ('Daylight', lambda m: FingerprintMols.FingerprintMol(m)),
        ('AP', lambda m: GetHashedAtomPairFingerprintAsBitVect(m, nBits=4096)),
    )

    wanted = args.fpType
    for label, make_fp in generators:
        if wanted == label:
            return [make_fp(mol) for mol in mols if mol]
    return []
Example #33
0
def ScreenFingerprints(details, data, mol=None, probeFp=None):
  """ Returns a list of (id, score) results for `data` screened against a probe

  Arguments:
    - details: settings object. Read here: `doThreshold`, `topN`, `noPickle`,
      `metric`, `screenThresh` and, if present, `stopAfter`. Its `__dict__`
      is also forwarded as keyword arguments when the probe is fingerprinted.
    - data: iterable of candidates. With `details.noPickle` unset each item
      is either an `(id, fingerprint)` tuple/list or a fingerprint carrying
      `_fieldsFromDb`; with it set each item is an `(id, pickle)` pair.
    - mol: probe molecule, only used when `probeFp` is None.
    - probeFp: pre-computed probe fingerprint.

  Returns:
    A list of `(id, score)` tuples. An empty list is returned when the probe
    cannot be fingerprinted or the probe fingerprint is falsy.

  """
  if probeFp is None:
    try:
      # `apply()` no longer exists on Python 3; argument unpacking is the
      # equivalent that works on both major versions.
      probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
      # `Exception` rather than a bare except, so Ctrl-C still propagates.
      import traceback
      FingerprintMols.error('Error: problems fingerprinting molecule.\n')
      traceback.print_exc()
      return []
  if not probeFp:
    return []

  # Keep only the best N hits unless thresholding was requested; an empty
  # list stands in for "no top-N filtering".
  if not details.doThreshold and details.topN > 0:
    topN = TopNContainer(details.topN)
  else:
    topN = []
  res = []
  count = 0
  for pt in data:
    fp1 = probeFp
    if not details.noPickle:
      # isinstance replaces the Python-2-only types.TupleType / ListType.
      if isinstance(pt, (tuple, list)):
        ptId, fp = pt
      else:
        fp = pt
        ptId = pt._fieldsFromDb[0]
      score = DataStructs.FingerprintSimilarity(fp1, fp, details.metric)
    else:
      ptId, pkl = pt
      score = details.metric(fp1, str(pkl))
    if topN:
      topN.Insert(score, ptId)
    elif not details.doThreshold or score >= details.screenThresh:
      res.append((ptId, score))
    count += 1
    if hasattr(details, 'stopAfter') and count >= details.stopAfter:
      break
  # The top-N container yields (score, id); results are reported as (id, score).
  for score, ptId in topN:
    res.append((ptId, score))

  return res
Example #34
0
def ScreenFromDetails(details, mol=None):
  """ Returns a list of results

  The probe molecule is taken from `mol`, else `details.probeMol`, else it
  is built from `details.probeSmiles`. When `details.outFileName` is set the
  results are also written to that file, one comma-separated line per hit.

  Returns None (instead of a list) when no probe molecule could be obtained
  or the output file could not be opened.

  """
  if not mol:
    if not details.probeMol:
      smi = details.probeSmiles
      try:
        mol = Chem.MolFromSmiles(smi)
      except Exception:
        import traceback
        FingerprintMols.error('Error: problems generating molecule for smiles: %s\n' % (smi))
        traceback.print_exc()
        return
    else:
      mol = details.probeMol
  if not mol:
    return

  # Open the output file up front so an unwritable path is reported before
  # the (potentially slow) screen runs.
  outF = None
  if details.outFileName:
    try:
      outF = open(details.outFileName, 'w+')
    except IOError:
      FingerprintMols.error("Error: could not open output file %s for writing\n" %
                            (details.outFileName))
      return None

  try:
    if not hasattr(details, 'useDbSimilarity') or not details.useDbSimilarity:
      data = GetFingerprints(details)
      res = ScreenFingerprints(details, data, mol)
    else:
      res = ScreenInDb(details, mol)
    if outF:
      for pt in res:
        outF.write(','.join([str(x) for x in pt]))
        outF.write('\n')
  finally:
    # The file handle used to be left open; close it even if screening fails.
    if outF:
      outF.close()
  return res
Example #35
0
      Default is *AutoFragmentFP*
      
    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints. Default is *7*.
      
    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.
      
    - -H: include Hs in the fingerprint 
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Script entry point: announce the tool version, install this module's usage
# text on FingerprintMols, parse the command line and run the screen.
if __name__ == '__main__':
  FingerprintMols.message("This is MolSimilarity version %s\n\n"%(__VERSION_STRING))
  # Replace FingerprintMols' usage text with the one defined in this module
  # (presumably so ParseArgs reports these options -- confirm).
  FingerprintMols._usageDoc=_usageDoc
  details = FingerprintMols.ParseArgs()
  # The returned result list is discarded here.
  ScreenFromDetails(details)
Example #36
0
      Default is *AutoFragmentFP*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Script entry point: print a banner, install this module's usage text on
# FingerprintMols, parse the command line and run the screen.
if __name__ == '__main__':
  FingerprintMols.message("This is MolSimilarity\n\n")
  # Replace FingerprintMols' usage text with the one defined in this module
  # (presumably so ParseArgs reports these options -- confirm).
  FingerprintMols._usageDoc = _usageDoc
  details = FingerprintMols.ParseArgs()
  # The returned result list is discarded here.
  ScreenFromDetails(details)