Esempio n. 1
0
def make_matrix(file1, file2):
    """Build a labelled Tanimoto similarity matrix for a compound table plus a query.

    Args:
        file1: Compound table exposing ``canonical_smiles`` and ``name``
            columns (it is also passed through ``pd.DataFrame``).
        file2: SMILES string of the query compound; it is appended as the
            last row/column under the label ``"My_query"``.

    Returns:
        A square ``pd.DataFrame`` whose index and columns are the compound
        names followed by ``"My_query"`` and whose cells are the pairwise
        Tanimoto similarities of the compounds' fingerprints.
    """
    res_df = pd.DataFrame(file1)

    smiles = list(file1.canonical_smiles)
    smiles.append(file2)
    labels = list(res_df.name)
    labels.append("My_query")

    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    # One bulk call per row is enough: the previous second pass per row
    # (distance, then 1 - distance) only rebuilt the same similarities and
    # its result was never used.
    similarities = [DataStructs.BulkTanimotoSimilarity(fp, fps) for fp in fps]

    df_simil = pd.DataFrame(np.array(similarities))
    df_simil.columns = labels
    df_simil.index = labels
    return df_simil
Esempio n. 2
0
def do_sim(i1, i2, intra=False):
    """Score every query fingerprint against a reference pool.

    Args:
        i1: Iterable of ``(index, fingerprint)`` pairs to score.
        i2: Sequence of reference fingerprints.
        intra: When true, the reference at position ``index`` is left out of
            the pool for that query (self-comparison is skipped).

    Returns:
        One value per query: the highest similarity when ``options.topn`` is
        1, otherwise the average of the ``options.topn`` highest similarities.
    """
    scores = []
    for entry in i1:
        if intra:
            # Drop the query's own slot from the reference pool.
            pool = i2[:entry[0]] + i2[entry[0] + 1:]
        else:
            pool = i2
        sims = DataStructs.BulkTanimotoSimilarity(entry[1], pool)
        if options.topn == 1:
            scores.append(max(sims))
        else:
            ranked = sorted(sims, reverse=True)
            scores.append(np.average(ranked[:options.topn]))
    return scores
def dmat_sim(smiles_target, ntopick=10):
    """
    Function to select most dissimilar compounds from a given set
    Adapted from:
        http://rdkit.blogspot.com/2014/08/optimizing-diversity-picking-in-rdkit.html

    Args:
        smiles_target: DataFrame which contains compound-target activity pairs.
        The compounds should be in the smiles strings format and in a column
        named "smiles". The DataFrame passed in is not modified.
        ntopick: The number of dissimilar compounds to pick from the ranked
        list of dissimilarity. Values larger than the number of compounds
        are capped at the number of compounds.

    Returns:
        A DataFrame of compound-target activity pairs that were sampled from
        the input smiles_target DataFrame based on their dissimilarity. Its
        index is the positional (0-based) row number in the input.
    """
    # Re-index a copy instead of resetting the caller's index in place.
    smiles_target = smiles_target.reset_index(drop=True)
    mols = [MolFromSmiles(smi) for smi in smiles_target['smiles']]
    fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

    # Flattened lower-triangle distance matrix, row by row.
    ds = []
    for i in range(1, len(fps)):
        ds.extend(
            DataStructs.BulkTanimotoSimilarity(fps[i],
                                               fps[:i],
                                               returnDistance=True))

    # The picker rejects a pick size larger than the pool, so cap it.
    n_pick = min(ntopick, len(fps))
    mmp = SimDivFilters.MaxMinPicker()
    ids = mmp.Pick(np.array(ds), len(fps), n_pick)
    return smiles_target.iloc[list(ids)]
Esempio n. 4
0
def cluster(smile_keys, fp_type, cutoff=0.15):
    """Cluster SMILES with Butina and export their pairwise distances.

    Args:
        smile_keys: Indexable collection of SMILES keys.
        fp_type: Fingerprint type forwarded to ``fingerprint_smile``.
        cutoff: Butina threshold. The values clustered are
            ``1 - Tanimoto similarity``, so this is a Tanimoto distance.

    Returns:
        ``(clusters, matrix_df)``: the result of
        ``form_cluster_with_algorithm_results`` and the export produced by
        ``create_similarity_export_matrix`` from the key pairs and distances.
    """
    count = len(smile_keys)

    # Fingerprint every key, keeping the input order.
    fingerprints = [
        fingerprint_smile(smile_keys[pos], fp_type) for pos in range(count)
    ]

    # Lower triangle of the distance matrix plus the matching key pairs.
    distances = []
    pairs = []
    for upper in range(1, count):
        row_sims = DataStructs.BulkTanimotoSimilarity(fingerprints[upper],
                                                      fingerprints[:upper])
        distances.extend([1 - sim for sim in row_sims])
        pairs.extend([(smile_keys[lower], smile_keys[upper])
                      for lower in range(upper)])

    # Export table is built before clustering, as both consume the same data.
    matrix_df = create_similarity_export_matrix(pairs, distances)

    butina_result = Butina.ClusterData(distances,
                                       count,
                                       cutoff,
                                       isDistData=True)
    clusters = form_cluster_with_algorithm_results(smile_keys, butina_result)
    return clusters, matrix_df
Esempio n. 5
0
    def select_best(self, scale=0.3, g=None):
        ''' Rank one generation by an adjusted score and retain the top rows.

            The adjusted score multiplies each candidate's "best" value by a
            weight coefficient ``(900 / MolWt) ** scale`` and a similarity
            coefficient ``(1 / max_similarity_to_others) ** scale``.

            Inputs:
            * scale: exponent applied to both coefficients.
            * g: int, optional. Generation to select from; defaults to
              ``self.gen_counter``.

            Side effects: writes the "w_adj", "s_adj" and "adj" columns of
            ``self.index_table`` for that generation and appends the first
            ``self.retain`` smiles (sorted ascending by "adj") to
            ``self.historic_best``.
        '''
        generation = self.gen_counter if g is None else g

        in_generation = self.index_table["gen_ev"] == generation
        candidates = self.index_table[in_generation]

        # Features for every candidate of the generation.
        molecules = [
            Chem.MolFromSmiles(smi) for smi in candidates["smiles"].values
        ]
        fps = [Chem.RDKFingerprint(molecule) for molecule in molecules]

        weight_coeff = [(900 / Descriptors.MolWt(molecule))**scale
                        for molecule in molecules]

        similarity_coeff = []
        for pos, fp in enumerate(fps):
            # Compare against every other candidate, never against itself.
            others = fps[:pos] + fps[pos + 1:]
            nearest = np.max(DataStructs.BulkTanimotoSimilarity(fp, others))
            similarity_coeff.append((1 / nearest)**scale)

        adjusted_coeff = (candidates["best"].values * np.array(weight_coeff) *
                          np.array(similarity_coeff))

        # Store the coefficients and the overall score on the table.
        self.index_table.loc[in_generation, "w_adj"] = weight_coeff
        self.index_table.loc[in_generation, "s_adj"] = similarity_coeff
        self.index_table.loc[in_generation, "adj"] = adjusted_coeff

        # Ascending sort by "adj"; the leading rows go to the historic list.
        ranked = self.index_table[in_generation].sort_values("adj")
        ranked_smiles = ranked["smiles"].values
        self.historic_best += list(ranked_smiles[:self.retain])
Esempio n. 6
0
def cluster_fingerprints(fps, cutoff=0.2):
    """
    Runs Butina clustering over a list of fingerprint bit vectors.

    From RDKit cookbook http://rdkit.org/docs_temp/Cookbook.html.

    Args:
        fps (list of rdkit.ExplicitBitVect): List of fingerprint bit vectors.

        cutoff (float): Distance cutoff handed to the Butina algorithm.

    Returns:
        tuple of tuple: Indices of fingerprints assigned to each cluster.

    """
    count = len(fps)

    # Lower triangle of the (1 - Tanimoto similarity) matrix, flattened.
    lower_triangle = []
    for row in range(1, count):
        row_sims = DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
        for sim in row_sims:
            lower_triangle.append(1 - sim)

    return Butina.ClusterData(lower_triangle, count, cutoff, isDistData=True)
Esempio n. 7
0
def tanimoto_candidates(target, steroidlist):
    '''Rank a list of compounds by Tanimoto similarity and keep the close ones.

    Args:
        target: Currently unused. NOTE(review): the reference compound is the
            FIRST entry of ``steroidlist``, not this argument - confirm that
            callers put the target first.
        steroidlist: SMILES strings of the compounds to compare.

    Returns:
        List of ``(similarity, mol)`` tuples, most similar first, restricted
        to similarities above 0.40. The molecules carry 2D coordinates.
    '''
    steroidmols = [Chem.MolFromSmiles(smi) for smi in steroidlist]

    for mol in steroidmols:
        AllChem.Compute2DCoords(mol)
    steroidlist_fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in steroidmols
    ]

    # Similarity of every compound to the first one in the list.
    sims = DataStructs.BulkTanimotoSimilarity(steroidlist_fps[0],
                                              steroidlist_fps)

    # Sort on the score only: comparing whole tuples would fall back to
    # comparing molecule objects whenever two scores tie, which raises
    # TypeError.
    nbrs = sorted(zip(sims, steroidmols),
                  key=lambda pair: pair[0],
                  reverse=True)

    return [pair for pair in nbrs if pair[0] > .40]
Esempio n. 8
0
def cluster_ligands(ligands, cutoff=0.2):
    """Cluster ligands with Butina and return one representative per cluster.

    Args:
        ligands: Iterable of ligands accepted by ``ccdc_to_rdkit``.
        cutoff: Butina distance cutoff (distance is 1 - Tanimoto similarity
            of 1024-bit Morgan fingerprints with radius 2).

    Returns:
        List with the first member of every cluster, converted back through
        ``rdkit_to_ccdc``. Ligands that fail either conversion are skipped.
    """
    rdkit_ligands = []
    for lig in ligands:
        try:
            rdkit_ligands.append(ccdc_to_rdkit(lig))
        except Exception:
            # Best effort: an unconvertible ligand is left out of clustering.
            continue

    # from RDKit Cookbook
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(lig, 2, 1024)
        for lig in rdkit_ligands
    ]
    # first generate the distance matrix (flattened lower triangle):
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])

    # now cluster the data:
    clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)

    all_ligands = []
    for members in clusters:
        try:
            # The first index of each cluster is used as its representative.
            all_ligands.append(rdkit_to_ccdc(rdkit_ligands[members[0]]))
        except Exception:
            # Best effort: drop representatives that cannot be converted back.
            continue
    return all_ligands
Esempio n. 9
0
def check_mol(option, maps=None, out_put=None, target_id=None, extra=None):
    """Resolve an input string to a protein code.

    The input is first tried as (part of) a protein code. If no molecule
    matches, it is parsed as SMILES and the most similar molecule of the
    target (by Tanimoto similarity of Morgan fingerprints) is looked up.

    Takes a string and a Target id.
    Returns the protein code of the first matching molecule, or the string
    "None molecule" when the input is not a parsable SMILES either."""
    matches = Molecule.objects.filter(prot_id__code__icontains=option)
    target = Target.objects.get(pk=target_id)
    if len(matches) != 0:
        return matches[0].prot_id.code

    query_mol = Chem.MolFromSmiles(str(option))
    if query_mol is None:
        return "None molecule"

    # Candidate compounds of this target, minus the target's own codes and
    # the "DUMMY" placeholder smiles.
    candidate_smiles = Molecule.objects.filter(
        prot_id__target_id=target_id).exclude(
            prot_id__code__startswith=target.title).exclude(
                cmpd_id__smiles="DUMMY").values_list("cmpd_id__smiles",
                                                     flat=True)
    candidates = [Chem.MolFromSmiles(str(smi)) for smi in candidate_smiles]
    candidate_fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        for mol in candidates
    ]
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=1024)
    sims = DataStructs.BulkTanimotoSimilarity(query_fp, candidate_fps)

    # Position of the highest similarity (first one wins on ties).
    best_index = max(enumerate(sims), key=lambda pair: pair[1])[0]
    nearest = candidates[best_index]
    matches = Molecule.objects.filter(
        cmpd_id__smiles=Chem.MolToSmiles(nearest, isomericSmiles=True))
    return matches[0].prot_id.code
Esempio n. 10
0
def cluster_chemicals(
    *,
    rebuild: bool = False,
    chemicals_dict,
):
    """Cluster chemicals based on their similarities.

    Args:
        rebuild: When false and the cached file at
            ``DEFAULT_CLUSTERED_CHEMICALS`` exists, that file is returned
            instead of recomputing.
        chemicals_dict: Mapping of chemical identifier to fingerprint.

    Returns:
        DataFrame with columns ``PubchemID`` and ``Cluster`` (1-based cluster
        number), indexed from 1. The freshly computed table is also written
        to ``DEFAULT_CLUSTERED_CHEMICALS`` as a tab-separated file.
    """
    if not rebuild and os.path.exists(DEFAULT_CLUSTERED_CHEMICALS):
        return pd.read_csv(DEFAULT_CLUSTERED_CHEMICALS,
                           sep="\t",
                           index_col=False,
                           dtype={'PubchemID': str})

    drugs, fps = zip(*chemicals_dict.items())
    nfps = len(fps)

    # Flattened lower triangle of the (1 - Tanimoto similarity) matrix.
    dists = []
    for i in tqdm(range(1, nfps), desc='Calculating distance for clustering'):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)
    cs = Butina.ClusterData(dists, nfps, 0.3, isDistData=True)

    rows = []
    for cluster_id, members in enumerate(cs, start=1):
        for member in members:
            # Cluster members are 0-based positions into ``fps``, which is
            # aligned with ``drugs``; no offset must be applied.
            rows.append((drugs[member], cluster_id))

    # Build the frame in one go; keep the 1-based row index.
    df = pd.DataFrame(rows,
                      columns=['PubchemID', 'Cluster'],
                      index=pd.RangeIndex(1, len(rows) + 1))

    df.to_csv(DEFAULT_CLUSTERED_CHEMICALS, sep='\t', index=False)
    return df
def compare_tanimoto_fingerprints_pairwise(smiles, fingerprints):

    """
    Compute the Tanimoto similarity of every unordered pair of compounds.

    Arguments:

        smiles (List): List of smiles you would like to compare
        fingerprints (List): List of fingerprint RDKit objects for each smiles
            (same length and order as ``smiles``)
    Returns:

        similarity_dataframe (Pandas Dataframe Object): a dataframe with the
            columns 'query', 'target' and 'similarity', one row per pair,
            sorted by descending similarity.
    Raises:

        ValueError: if ``smiles`` and ``fingerprints`` differ in length.

    """

    if len(smiles) != len(fingerprints):
        raise ValueError(
            'smiles and fingerprints must have the same length '
            '({} != {})'.format(len(smiles), len(fingerprints)))

    query, target, similarity = [], [], []

    # The last fingerprint has no later partner, so the loop stops one short.
    for index, fingerprint in enumerate(fingerprints[:-1]):
        partners = smiles[index + 1:]
        scores = DataStructs.BulkTanimotoSimilarity(fingerprint,
                                                    fingerprints[index + 1:])
        for partner, score in zip(partners, scores):
            # Labels come from the smiles list aligned with the fingerprints.
            query.append(smiles[index])
            target.append(partner)
            similarity.append(score)

    # build the dataframe and sort it
    similarity_data = {'query': query, 'target': target, 'similarity': similarity}
    similarity_dataframe = pd.DataFrame(data=similarity_data).sort_values('similarity', ascending=False)

    return similarity_dataframe
Esempio n. 12
0
def get_qnu(sgs_tuples, ref_path):
    """Return the fraction of input tuples that are predicted active, novel and unique.

    Args:
        sgs_tuples: Sequence of 6-tuples ``(ra, mol, x, y, qed, sa)``.
            A tuple counts as predicted active when
            ``topk_func((x, y, qed, sa))`` is truthy.
        ref_path: Path of a comma-separated file with a header line; the
            first field of every following line is a reference active.

    Returns:
        ``len(novel) / len(sgs_tuples)`` where *novel* are the predicted
        actives whose highest Tanimoto similarity to the reference actives is
        below 0.4, counted once per canonical SMILES. Returns 0.0 for an
        empty ``sgs_tuples``.
    """
    # Keep each tuple next to its molecule so that a hit can be mapped back
    # to the right input tuple after filtering.
    pred_tuples = []
    pred_actives = []
    for tup in sgs_tuples:
        ra, mol, x, y, qed, sa = tup
        if topk_func((x, y, qed, sa)):
            pred_tuples.append(tup)
            pred_actives.append(mol)

    with open(ref_path) as f:
        next(f)  # skip the header line
        true_actives = {
            get_canonical_smiles(line.split(',')[0]) for line in f
        }

    print('number of active reference', len(true_actives))

    true_fps = to_fingerprints(true_actives)
    pred_fps = to_fingerprints(pred_actives)

    seen = set()
    novel_tuples = []
    for i, mol in enumerate(pred_actives):
        sims = DataStructs.BulkTanimotoSimilarity(pred_fps[i], true_fps)
        canon_smiles = get_canonical_smiles(mol)
        if canon_smiles not in seen and max(sims) < 0.4:
            seen.add(canon_smiles)
            # Index the filtered list: positions in pred_actives do not line
            # up with positions in sgs_tuples.
            novel_tuples.append(pred_tuples[i])
    print('QNU {} -> {}'.format(len(sgs_tuples), len(novel_tuples)))

    if len(sgs_tuples) == 0:
        return 0.0
    return len(novel_tuples) / len(sgs_tuples)
Esempio n. 13
0
 def distance_matrix(self):
     """Fill ``self.dist`` with the flattened lower-triangle distance matrix.

     Distances are ``1 - Tanimoto similarity`` between the fingerprints in
     ``self.fplist``, stored row by row (row i holds the distances to the
     fingerprints 0..i-1).
     """
     self.dist = []
     fingerprints = self.fplist
     for row in range(1, len(fingerprints)):
         row_sims = DataStructs.BulkTanimotoSimilarity(fingerprints[row],
                                                       fingerprints[:row])
         self.dist.extend(1 - sim for sim in row_sims)
Esempio n. 14
0
def compute_similarity(fp_pred, fp_train, mols_pred, mols_train):
    """Find the most similar training molecule for every predicted molecule.

    Args:
        fp_pred: Fingerprints of the predicted molecules.
        fp_train: Fingerprints of the training molecules.
        mols_pred: Predicted molecules, aligned with ``fp_pred``.
        mols_train: Training molecules, aligned with ``fp_train``.

    Returns:
        List of ``(predicted_mol, nearest_training_mol, similarity)`` tuples,
        one per predicted molecule, in input order.
    """
    matches = []
    for position, query_fp in enumerate(tqdm(fp_pred)):
        sims = DataStructs.BulkTanimotoSimilarity(query_fp, fp_train)
        # argmax gives the first position holding the highest similarity.
        nearest = int(np.argmax(sims))
        match = (mols_pred[position], mols_train[nearest], sims[nearest])
        matches.append(match)
    return matches
Esempio n. 15
0
def dimension(fnames, fp='ECFP', alg='PCA', maximum=int(1e5), ref='GPCR'):
    """Project molecules from several tables onto two dimensions.

    Args:
        fnames: Paths of tab-separated tables with at least the columns
            ``Smiles``, ``VALID`` and ``DESIRE``.
        fp: Feature type. ``'ECFP'`` uses ``Predictor.calc_ecfp``,
            ``'similarity'`` uses the Tanimoto similarity of every molecule
            to the desired molecules of the first table, anything else uses
            ``Predictor.calc_physchem``.
        alg: ``'PCA'`` for PCA, anything else for t-SNE.
        maximum: Tables with more valid rows than this are randomly
            subsampled to this size; ``None`` disables subsampling.
        ref: Tables whose file name does not contain this string are
            restricted to rows with ``DESIRE == True``.

    Returns:
        For PCA a tuple ``(df, ratio)`` with the explained variance ratio of
        the two components, otherwise only ``df``. ``df`` holds all kept rows
        with the added columns ``LABEL`` (position of the source file),
        ``X`` and ``Y``.
    """
    frames = []
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname).dropna(subset=['Smiles'])
        sub = sub[sub.VALID == True]
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        if ref not in fname:
            sub = sub[sub.DESIRE == True]
        sub = sub.drop_duplicates(subset='Smiles')
        sub['LABEL'] = i
        frames.append(sub)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    df = pd.concat(frames) if frames else pd.DataFrame()

    if fp == 'similarity':
        # Reference set: desired molecules of the first table.
        ref_rows = df[(df.LABEL == 0) & (df.DESIRE == True)]
        refs = Predictor.calc_ecfp(ref_rows.Smiles)
        fps = Predictor.calc_ecfp(df.Smiles)
        from rdkit.Chem import DataStructs
        fps = np.array([
            DataStructs.BulkTanimotoSimilarity(row_fp, refs) for row_fp in fps
        ])
    else:
        fp_alg = Predictor.calc_ecfp if fp == 'ECFP' else Predictor.calc_physchem
        fps = fp_alg(df.Smiles)
    fps = Scaler().fit_transform(fps)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]
    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df
Esempio n. 16
0
def _compute_diversity(mol, fps):
    """Return the mean Tanimoto distance between ``mol`` and ``fps``.

    ``mol`` is fingerprinted as a 2048-bit Morgan bit vector of radius 4
    before being compared with the fingerprints in ``fps``.
    """
    query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(
        query_fp, fps, returnDistance=True)
    return np.mean(distances)
Esempio n. 17
0
    def findCluster(self, smiles):
        """Assign a SMILES to an existing cluster or declare a new one.

        Returns a tuple ``(cluster_key, fingerprint, is_new)``:
        * ``("", "", False)`` when the SMILES cannot be parsed;
        * ``(smiles, fp, False)`` when the SMILES already is a cluster key;
        * ``(closest_key, fp, False)`` when the most similar stored
          fingerprint reaches ``self.minsimilarity``;
        * ``(smiles, fp, True)`` otherwise (including an empty store).

        The fingerprint is a Morgan bit vector of ``self.bits`` bits when
        ``self.bits`` is positive, otherwise a Morgan fingerprint without a
        fixed bit length.
        """
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return "", "", False

        if self.bits > 0:
            fp = AllChem.GetMorganFingerprintAsBitVect(
                mol,
                self.radius,
                nBits=self.bits,
                useFeatures=self.useFeatures)
        else:
            fp = AllChem.GetMorganFingerprint(mol,
                                              self.radius,
                                              useFeatures=self.useFeatures)

        if smiles in self.getFingerprints():
            return smiles, fp, False

        known_fps = list(self.getFingerprints().values())
        sims = DataStructs.BulkTanimotoSimilarity(fp, known_fps)
        if len(sims) == 0:
            # Nothing stored yet: this SMILES starts the first cluster.
            return smiles, fp, True

        best = np.argmax(sims)
        if sims[best] >= self.minsimilarity:
            cluster_keys = list(self.getFingerprints().keys())
            return cluster_keys[best], fp, False
        return smiles, fp, True
Esempio n. 18
0
    def findCluster(self, smiles):
        """Assign a SMILES to a scaffold cluster or declare a new one.

        The cluster key is the non-isomeric SMILES of the molecule's Murcko
        scaffold; similarity is computed on atom-pair fingerprints of the
        scaffold.

        Returns a tuple ``(cluster_key, fingerprint, is_new)``:
        * ``("", "", False)`` when the SMILES cannot be parsed or no scaffold
          can be derived;
        * ``(scaffold_smiles, fp, False)`` when the scaffold already is a
          cluster key;
        * ``(closest_key, fp, False)`` when the most similar stored
          fingerprint reaches ``self.minsimilarity``;
        * ``(scaffold_smiles, fp, True)`` otherwise (including an empty store).
        """
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return "", "", False

        try:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        except Exception:
            # Scaffold extraction is best effort; failures mean "no cluster".
            return "", "", False
        if not scaffold:
            return "", "", False
        cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)

        fp = Pairs.GetAtomPairFingerprint(scaffold)  # Change to Tanimoto?
        if cluster in self.getFingerprints():
            return cluster, fp, False

        fps = list(self.getFingerprints().values())
        sims = DataStructs.BulkTanimotoSimilarity(fp, fps)
        if len(sims) == 0:
            return cluster, fp, True

        closest = np.argmax(sims)
        if sims[closest] >= self.minsimilarity:
            return list(self.getFingerprints().keys())[closest], fp, False
        return cluster, fp, True
Esempio n. 19
0
def doSimSearch(model_name):
    """Find, for every query fingerprint, the most similar known active of a model.

    The actives are read from ``actives/<model>.smi.zip`` next to this file,
    where ``<model>`` is the base name of ``model_name`` up to its first dot.
    Each line of the contained ``<model>.smi`` is split on tabs and its
    second field is passed to ``calcFingerprints``; lines that cannot be
    fingerprinted are skipped.

    Returns:
        ``None`` when the archive cannot be opened. Otherwise one list per
        entry of the module-level ``querymatrix``:
        ``[similarity, model] + <fields of the best active> + [query smiles]``.
    """
    mod = model_name.split(os.sep)[-1].split('.')[0]
    archive = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'actives', mod + '.smi.zip')
    try:
        with zipfile.ZipFile(archive, 'r') as zfile:
            with zfile.open(mod + '.smi', 'r') as member:
                # Zip members are read as bytes; decode before splitting on
                # a text tab.
                lines = member.read().decode('utf-8').splitlines()
    except IOError:
        return
    comps = [line.split('\t') for line in lines]

    comps2 = []
    afp = []
    for comp in comps:
        try:
            afp.append(calcFingerprints(comp[1]))
        except Exception:
            # Best effort: skip lines that are malformed or unparsable.
            continue
        comps2.append(comp)

    ret = []
    for i, fp in enumerate(querymatrix):
        sims = DataStructs.BulkTanimotoSimilarity(fp, afp)
        idx = sims.index(max(sims))
        ret.append([sims[idx], mod] + comps2[idx] + [smiles[i]])
    return ret
Esempio n. 20
0
  def test6BulkTversky(self):
    """Bulk Tversky must agree with the pairwise Tversky, Dice and Tanimoto.

    With alpha = beta = 0.5 Tversky is compared against Dice, with
    alpha = beta = 1.0 against Tanimoto (pairwise and bulk).
    """
    import random
    sz = 10
    nToSet = 5
    nVs = 6

    # Random sparse vectors; vs[0] is the probe, the rest are the targets.
    vs = []
    for _ in range(nVs):
      v = ds.IntSparseIntVect(sz)
      for _ in range(nToSet):
        value = random.randint(1, 10)
        slot = random.randint(0, sz - 1)
        v[slot] = value
      vs.append(v)
    others = range(1, nVs)

    baseDs = [ds.TverskySimilarity(vs[0], vs[x], .5, .5) for x in others]
    bulkDs = ds.BulkTverskySimilarity(vs[0], vs[1:], 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(vs[0], vs[x]) for x in others]
    for i, base in enumerate(baseDs):
      self.assertTrue(feq(base, bulkDs[i]))
      self.assertTrue(feq(base, diceDs[i]))

    bulkDs = ds.BulkTverskySimilarity(vs[0], vs[1:], 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(vs[0], vs[x]) for x in others]
    for i, bulk in enumerate(bulkDs):
      self.assertTrue(feq(bulk, taniDs[i]))
    taniDs = ds.BulkTanimotoSimilarity(vs[0], vs[1:])
    for i, bulk in enumerate(bulkDs):
      self.assertTrue(feq(bulk, taniDs[i]))
Esempio n. 21
0
def calculate_pairwise_similarities(smiles_list1: List[str], smiles_list2: List[str]) -> np.array:
    """
    Build the Tanimoto similarity matrix between two SMILES collections.

    Row i holds the similarities of the i-th fingerprint derived from
    ``smiles_list1`` to every fingerprint derived from ``smiles_list2``.
    A warning is logged when either list has more than 10000 entries.

    Returns:
        Pairwise similarity matrix as np.array
    """
    if max(len(smiles_list1), len(smiles_list2)) > 10000:
        logger.warning(f'Calculating similarity between large sets of '
                       f'SMILES strings ({len(smiles_list1)} x {len(smiles_list2)})')

    fps1 = get_fingerprints(get_mols(smiles_list1))
    fps2 = get_fingerprints(get_mols(smiles_list2))

    rows = [DataStructs.BulkTanimotoSimilarity(fp1, fps2) for fp1 in fps1]
    return np.array(rows)
Esempio n. 22
0
def se_cs(fps, distThresh):
    """Cluster fingerprints around leader-picked centroids.

    Centroids are chosen with ``LeaderPicker.LazyBitVectorPick`` using
    ``distThresh``; every other fingerprint is then attached to the centroid
    it has the highest Tanimoto similarity to.

    Args:
        fps: Sequence of fingerprint bit vectors.
        distThresh: Distance threshold forwarded to the leader picker.

    Returns:
        List of clusters; each cluster is a list of indices into ``fps``
        whose first element is the centroid.
    """
    lp = SimDivFilters.rdSimDivPickers.LeaderPicker()
    picks = list(lp.LazyBitVectorPick(fps, len(fps), distThresh))
    # Set lookup keeps the centroid test below O(1) per fingerprint.
    centroids = set(picks)

    cs = defaultdict(list)
    # Assign each centroid as first item in list
    for i, idx in enumerate(picks):
        cs[i].append(idx)

    # One similarity row per centroid, one column per fingerprint.
    sims = np.zeros((len(picks), len(fps)))
    for i, pick in enumerate(picks):
        sims[i, :] = DataStructs.BulkTanimotoSimilarity(fps[pick], fps)
        # Mask the centroid's similarity to itself. The column is the
        # centroid's position in fps (pick), not the row number.
        sims[i, pick] = 0

    # Row (= centroid) with the highest similarity for every fingerprint.
    best = np.argmax(sims, axis=0)
    for i, idx in enumerate(best):
        # Centroids already lead their own cluster.
        if i not in centroids:
            cs[int(idx)].append(i)
    return [cs[k] for k in cs]
Esempio n. 23
0
def variety(mol, setfps):
    """Return the mean Tanimoto distance between ``mol`` and a fingerprint set.

    Args:
        mol: Molecule to score; it is fingerprinted as a 2048-bit Morgan bit
            vector of radius 4.
        setfps: Fingerprints to compare against.

    Returns:
        The mean of the Tanimoto distances to every entry of ``setfps``.
    """
    # NOTE(review): relies on ``Chem`` exposing GetMorganFingerprintAsBitVect
    # (e.g. an alias of rdkit.Chem.AllChem) - confirm against the imports.
    fp = Chem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    dist = DataStructs.BulkTanimotoSimilarity(fp, setfps, returnDistance=True)
    return np.mean(np.array(dist))
Esempio n. 24
0
def _get_tanimoto_distance_matrix(fingerprints):
    """
    Calculate distance matrix for list of fingerprints.
    
    Parameters
    ----------
    fingerprints : list of rdkit.DataStructs.cDataStructs.ExplicitBitVect
        List of fingerprints.
        
    Returns
    -------
    list of floats
        Distance matrix (a triangular distance matrix in the form of a list)
    """

    fingerprints = list(fingerprints)
    distance_matrix = []

    for i in range(1, len(fingerprints)):

        # Calculate Tanimoto similarity between fingerprints
        similarities = DataStructs.BulkTanimotoSimilarity(
            fingerprints[i], fingerprints[:i])

        # Since we need a distance matrix, calculate 1-x for every element in similarity matrix
        distance_matrix.extend([1 - x for x in similarities])

    return distance_matrix
Esempio n. 25
0
def gen_cluster_subset_algButina(fps, cutoff):
    """Cluster fingerprints with the Butina algorithm.

    Args:
        fps: Sequence of fingerprints.
        cutoff: Butina distance cutoff; distance is 1 - Tanimoto similarity.

    Returns:
        Tuple of tuples with sequential numbers of compounds in each cluster.
    """
    dists = []
    # Row 0 has no earlier fingerprints to compare with, so start at 1.
    for i in range(1, len(fps)):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)
    return Butina.ClusterData(dists, len(fps), cutoff, isDistData=True)
Esempio n. 26
0
def bulk_tanimoto_distance(smile, fps):
    """Return the Tanimoto distances from a SMILES to a set of fingerprints.

    The SMILES is parsed and fingerprinted as a 2048-bit Morgan bit vector of
    radius 4; the result holds one distance per entry of ``fps``.
    """
    query_mol = Chem.MolFromSmiles(smile)
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 4, nBits=2048)
    return DataStructs.BulkTanimotoSimilarity(query_fp,
                                              fps,
                                              returnDistance=True)
Esempio n. 27
0
    def get_scores(self, mols, smiles2score=None):
        '''
        Score molecules on every objective in ``self.objectives``.

        * 'nov': 1 - highest Tanimoto similarity to ``self.fps_ref``
        * 'div': 1 - mean Tanimoto similarity to all of ``mols``
          (the molecule itself included)
        * 'adv': second softmax column of ``self.discriminator``
        * anything else: delegated to the module-level ``get_scores``

        @params:
            mols: molecules to estimate score
            smiles2score: forwarded to the module-level ``get_scores``
        @return:
            dicts (list): list of score dictionaries, one per molecule
        '''
        needs_fps = 'nov' in self.objectives or 'div' in self.objectives
        if needs_fps:
            fps_mols = [
                AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048)
                for mol in mols
            ]

        dicts = [{} for _ in mols]

        for obj in self.objectives:
            if obj == 'adv':
                # Scored in a single batched pass after this loop.
                continue
            elif obj == 'nov':
                for entry, fp in zip(dicts, fps_mols):
                    sims = DataStructs.BulkTanimotoSimilarity(fp, self.fps_ref)
                    entry[obj] = 1. - max(sims)
            elif obj == 'div':
                for entry, fp in zip(dicts, fps_mols):
                    sims = DataStructs.BulkTanimotoSimilarity(fp, fps_mols)
                    entry[obj] = 1. - 1. * sum(sims) / len(fps_mols)
            else:
                scores = get_scores(obj, mols, smiles2score)
                for i, entry in enumerate(dicts):
                    entry[obj] = scores[i]

        if 'adv' in self.objectives:
            dataset = GraphDataset([mol_to_dgl(mol) for mol in mols])
            loader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                collate_fn=GraphDataset.collate_fn)

            batch_probs = []
            for batch in loader:
                with torch.no_grad():
                    logits = self.discriminator(batch)  # (batch_size, 2)
                    probs = F.softmax(logits, dim=1)  # (batch_size, 2)
                batch_probs.append(probs[:, 1])  # (batch_size,)
            all_probs = torch.cat(batch_probs, dim=0).tolist()  # (num_mols,)
            for i, prob in enumerate(all_probs):
                dicts[i]['adv'] = prob
        return dicts
Esempio n. 28
0
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints on their Tanimoto distance.

    Args:
        fps: Sequence of fingerprints.
        cutoff: Distance cutoff passed to ``Butina.ClusterData``.

    Returns:
        The clusters as returned by ``Butina.ClusterData``.
    """
    total = len(fps)
    # Flattened lower triangle: row r holds the distances to rows 0..r-1.
    distances = [
        1 - similarity
        for row in range(1, total)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
    ]
    return Butina.ClusterData(distances, total, cutoff, isDistData=True)
Esempio n. 29
0
def tanimoto_1d(fps):
    """Return the lower-triangle Tanimoto distance matrix as a flat list.

    Row i contributes the distances between ``fps[i]`` and ``fps[0..i-1]``;
    fewer than two fingerprints yield an empty list.
    """
    flat = []
    for row in range(1, len(fps)):
        row_distances = DataStructs.BulkTanimotoSimilarity(
            fps[row], fps[:row], returnDistance=True)
        flat += row_distances
    return flat
Esempio n. 30
0
def calc_sims(fps_1, fps_2):
    """Return, for each fingerprint in ``fps_1``, its similarities to ``fps_2``.

    The result is a list with one entry per fingerprint of ``fps_1``; each
    entry is the bulk Tanimoto similarity against all of ``fps_2``. This is
    the two-set variant: nothing is excluded, so comparing a set with itself
    includes every fingerprint's similarity to itself.
    """
    return [
        DataStructs.BulkTanimotoSimilarity(fps_1[position], fps_2)
        for position in range(len(fps_1))
    ]