Example #1
File: tSNE.py Project: SY575/Decoys
def dg_score(active_mols, decoy_mols):
    # Similar to DEKOIS
    # Lower is better (less like actives), higher is worse (more like actives)
    active_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, useFeatures=True)
                  for mol in active_mols]  # Roughly FCFP_6
    decoys_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, useFeatures=True)
                  if mol is not None else None
                  for mol in decoy_mols]  # Roughly FCFP_6

    closest_sims = []
    closest_sims_id = []
    for active_fp in active_fps:
        active_sims = []
        for decoy_fp in decoys_fps:
            active_sims.append(DataStructs.TanimotoSimilarity(active_fp, decoy_fp)
                               if decoy_fp is not None else 0)
        closest_sims.append(max(active_sims))
        closest_sims_id.append(np.argmax(active_sims))
    return np.array(closest_sims), np.array(closest_sims_id)
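A quick way to exercise dg_score (a sketch: the imports are implied by the original module, and the SMILES below are arbitrary placeholders, not data from the project):

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np

actives = [Chem.MolFromSmiles(s) for s in ["c1ccccc1O", "c1ccccc1N"]]
decoys = [Chem.MolFromSmiles(s) for s in ["CCO", "CCN", "c1ccccc1C"]]
sims, ids = dg_score(actives, decoys)
print(sims)  # max Tanimoto similarity of each active to any decoy
print(ids)   # index of the closest decoy for each active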
Example #2
def dg_score_rev(actives, decoys):
    # Similar to DEKOIS
    # Lower is better (less like actives), higher is worse (more like actives)
    active_fps = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi),
                                              3,
                                              useFeatures=True)
        for smi in actives
    ]  # Roughly FCFP_6
    decoys_fps = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi),
                                              3,
                                              useFeatures=True)
        for smi in decoys
    ]  # Roughly FCFP_6

    closest_sims = []
    closest_sims_id = []
    for decoy_fp in decoys_fps:
        active_sims = []
        for active_fp in active_fps:
            active_sims.append(
                DataStructs.TanimotoSimilarity(active_fp, decoy_fp))
        closest_sims.append(max(active_sims))
        closest_sims_id.append(np.argmax(active_sims))

    return closest_sims, closest_sims_id
Example #3
    def tanimoto(self, mol):
        try:
            with Timeout(seconds=1):
                fp = Generate.Gen2DFingerprint(mol, self.sigFactory)
            return DataStructs.TanimotoSimilarity(fp, self.query_fp)

        except TimeoutError:
            logging.debug("SMILES Pharmacophore timeout: %s",
                          Chem.MolToSmiles(mol, isomericSmiles=False))
            return 0
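The Timeout context manager here is project code, not an RDKit or standard-library class; a minimal SIGALRM-based sketch (Unix-only, main thread only) that matches this usage:

import signal

class Timeout:
    def __init__(self, seconds=1):
        self.seconds = seconds

    def _raise_timeout(self, signum, frame):
        raise TimeoutError()

    def __enter__(self):
        signal.signal(signal.SIGALRM, self._raise_timeout)
        signal.alarm(self.seconds)  # deliver SIGALRM after `seconds`

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)  # cancel any pending alarm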
Example #4
    def __call__(self, smiles: List[str]) -> dict:
        mols = [Chem.MolFromSmiles(smile) for smile in smiles]
        valid = [1 if mol is not None else 0 for mol in mols]
        valid_idxs = [idx for idx, boolean in enumerate(valid) if boolean == 1]
        valid_mols = [mols[idx] for idx in valid_idxs]

        fps = [AllChem.GetMorganFingerprint(mol, 2, useCounts=False, useFeatures=True) for mol in valid_mols]

        tanimoto_dist = np.array([DataStructs.TanimotoSimilarity(self.query_fp, fp, returnDistance=True) for fp in fps])
        score = np.full(len(smiles), 0, dtype=np.float32)

        for idx, value in zip(valid_idxs, tanimoto_dist):
            score[idx] = value
        return {"total_score": np.array(score, dtype=np.float32)}
Example #5
def compute_similarity_kernel_matrices(dataset):
    """
    Computes the drug-drug and protein-protein kernel matrices for kernel-based methods (e.g. Kron-RLS)

    :param dataset:
    :return: tuple
    """
    start = time.time()
    print("Computing kernel matrices (KD_dict, KT_dict)")
    all_comps = set()
    all_prots = set()
    for idx, pair in enumerate(dataset.X):
        mol, prot = pair
        all_comps.add(mol)
        all_prots.add(prot)

    # compounds / drugs
    comps_mat = {}
    for c1 in all_comps:
        fp1 = c1.fingerprint
        for c2 in all_comps:
            fp2 = c2.fingerprint
            # Tanimoto coefficient
            score = DataStructs.TanimotoSimilarity(fp1, fp2)
            comps_mat[Pair(c1, c2)] = score

    # proteins / targets
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm
    prots_mat = {}
    for p1 in all_prots:
        seq1 = p1.sequence[1]
        p1_score = aligner.score(seq1, seq1)
        for p2 in all_prots:
            seq2 = p2.sequence[1]
            p2_score = aligner.score(seq2, seq2)
            score = aligner.score(seq1, seq2)
            # Normalized SW score
            prots_mat[Pair(p1, p2)] = score / (sqrt(p1_score) * sqrt(p2_score))

    print("Kernel entities: Drugs={}, Prots={}".format(len(all_comps),
                                                       len(all_prots)))
    duration = time.time() - start
    print("Kernel matrices computation finished in: {:.0f}m {:.0f}s".format(
        duration // 60, duration % 60))
    return comps_mat, prots_mat
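Both kernels are symmetric, so the quadratic loops above do roughly twice the necessary work; assuming Pair is an unordered pair (hashes the same both ways, which this codebase appears to rely on), the compound loop could fill each entry once:

comps = list(all_comps)
for i, c1 in enumerate(comps):
    for c2 in comps[i:]:
        score = DataStructs.TanimotoSimilarity(c1.fingerprint, c2.fingerprint)
        comps_mat[Pair(c1, c2)] = score
        comps_mat[Pair(c2, c1)] = score  # no-op if Pair is unordered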
Example #6
 def generate_fingerprints_and_create_list(self):
     #generate fingerprints of predicted ligands and known ligands:
     gen_mo = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048,
                                                        radius=2)
     predicted_fps = [
         gen_mo.GetFingerprint(mol) for mol in self.predicted['molecules']
     ]
     true_fps = [
         gen_mo.GetFingerprint(mol) for mol in self.true_pos['molecules']
     ]
     similarities = list()
     for pred_fp in predicted_fps:
         tanimoto_values = [
             DataStructs.TanimotoSimilarity(pred_fp, true_fp)
             for true_fp in true_fps
         ]
         index_of_highest = np.argmax(tanimoto_values)
         similarities.append(tanimoto_values[index_of_highest])
     #module code is in: https://github.com/rdkit/rdkit/tree/master/Contrib/SA_Score
     sa_score = [
         sascorer.calculateScore(i)
         for i in list(self.predicted['molecules'])
     ]
     #create a list holding the QED drug-likeness score
     #reference: https://doi.org/10.1038/nchem.1243
     qeds = [qed(mol) for mol in self.predicted['molecules']]
     #create a list holding logp:
     logp = [Descriptors.MolLogP(m) for m in self.predicted['molecules']]
     #filter catalog usage instructions are here: https://github.com/rdkit/rdkit/pull/536
     params = FilterCatalogParams()
     params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
     catalog = FilterCatalog(params)
     self.brenk = np.array(
         [catalog.HasMatch(m) for m in self.predicted['molecules']])
     #add these lists as columns to the 'predicted' pd.DataFrame
     self.predicted['similarities'] = similarities
     self.predicted['sa_score'] = sa_score
     self.predicted['qeds'] = qeds
     self.predicted['logp'] = logp
     # print(self.predicted['logp'] < 6)  # leftover debug output
     shortlist_mask = ((self.predicted['similarities'] < 0.2) &
                       (self.predicted['sa_score'] < 4) &
                       (self.predicted['qeds'] > 0.25) &
                       (self.predicted['logp'] < 6) & (~self.brenk))
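shortlist_mask is built but never applied in this snippet; presumably the method goes on to materialize the shortlist, e.g. (hypothetical attribute name):

     self.shortlist = self.predicted[shortlist_mask]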
Example #7
 def distij(i, j, features=features):
     return 1.0 - DataStructs.TanimotoSimilarity(features[int(i)],
                                                 features[int(j)])
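Because distij turns Tanimoto similarity into a distance, it slots directly into distance-based clustering; a sketch with RDKit's Butina clustering, assuming features is a list of bit-vector fingerprints:

from rdkit.ML.Cluster import Butina

# lower-triangle condensed distance list, in the order Butina expects
dists = [distij(i, j) for i in range(len(features)) for j in range(i)]
clusters = Butina.ClusterData(dists, len(features), 0.4, isDistData=True)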
Example #8
def select_and_evaluate_decoys(f, target, idx, file_loc='./', output_loc='./',
                               dataset='ALL', num_cand_dec_per_act=100, 
                               num_dec_per_act=50, max_idx_cmpd=10000):
    print("Processing: ", f)
    dec_results = [f]
    dec_results.append(dataset)
    # Read data
    data = decoy_utils.read_paired_file(file_loc+f)
    data = [d+[Chem.MolFromSmiles(d[1])] for d in data]
    lads_scores = decoy_utils.lads_score_v2(
        [Chem.MolFromSmiles(smi) for smi in list(set([d[0] for d in data]))], 
        [d[2] for d in data])
    data = [d for idx, d in enumerate(data) if lads_scores[idx]<0.5]
    # Optional (disabled): keep only molecules whose 3D embedding succeeds
    # data = [d for d in data if AllChem.EmbedMolecule(
    #     Chem.AddHs(d[2]), randomSeed=42) != -1]
    data = [d[:2] for d in data]
    # Filter dupes and actives that are too small
    dec_results.append(len(set([d[0] for d in data])))
    seen = set()
    tmp = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data) if tmp[idx] is not None \
            and tmp[idx].GetNumHeavyAtoms()>min_active_size]
    unique_data = [x for x in data if not (tuple(x) in seen or seen.add(tuple(x)))]
    
    in_smis = [d[0] for d in data]
    in_mols = [Chem.MolFromSmiles(smi) for smi in in_smis]
    set_in_smis = list(set(in_smis))
    set_in_mols = [Chem.MolFromSmiles(smi) for smi in set_in_smis]
    gen_smis = [d[1] for d in data]
    gen_mols = [Chem.MolFromSmiles(smi) for smi in gen_smis]
    dec_results.extend([len(set(in_smis)), len(data), len(unique_data)])

    print('Calculate properties of in_smis and gen_mols')
    used = set([])
    in_smis_set = [x for x in in_smis if x not in used and (used.add(x) or True)]
    in_mols_set = [Chem.MolFromSmiles(smi) for smi in in_smis_set]
    if dataset == "dude_ext":
        in_props_temp = decoy_utils.calc_dataset_props_dude_extended(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude_extended(gen_mols, verbose=True)
    elif dataset == "dekois":
        in_props_temp = decoy_utils.calc_dataset_props_dekois(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dekois(gen_mols, verbose=True)
    elif dataset == "MUV":
        in_props_temp = decoy_utils.calc_dataset_props_muv(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_muv(gen_mols, verbose=True)
    elif dataset == "ALL":
        in_props_temp = decoy_utils.calc_dataset_props_all(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_all(gen_mols, verbose=True)
    elif dataset == "dude":
        in_props_temp = decoy_utils.calc_dataset_props_dude(in_mols_set, verbose=True)
        gen_props = decoy_utils.calc_dataset_props_dude(gen_mols, verbose=True)
    else:
        print("Incorrect dataset")
        exit()
    in_mols_temp = list(in_smis_set) # copy
    in_props = []
    for i, smi in enumerate(in_smis):
        in_props.append(in_props_temp[in_mols_temp.index(smi)])

    in_basic_temp = decoy_utils.calc_dataset_props_basic(in_mols_set, verbose=True)
    in_mols_temp = list(in_smis_set) # copy
    in_basic = []
    for i, smi in enumerate(in_smis):
        in_basic.append(in_basic_temp[in_mols_temp.index(smi)])

    gen_basic_props = decoy_utils.calc_dataset_props_basic(gen_mols, verbose=True)

    print('Scale properties based on in_mols props')
    active_props_scaled_all = []
    decoy_props_scaled_all = []

    active_min_all = []
    active_max_all = []
    active_scale_all = []

    active_props = in_props_temp
    print('Exclude errors from min/max calc')
    act_prop = np.array(active_props)

    active_maxes = np.amax(act_prop, axis=0)
    active_mins = np.amin(act_prop, axis=0)

    active_max_all.append(active_maxes)
    active_min_all.append(active_mins)

    scale = []
    for (a_max, a_min) in zip(active_maxes,active_mins):
        if a_max != a_min:
            scale.append(a_max - a_min)
        else:
            scale.append(a_min)
    scale = np.array(scale)
    scale[scale == 0.0] = 1.0
    active_scale_all.append(scale)
    active_props_scaled = (active_props - active_mins) / scale
    active_props_scaled_all.append(active_props_scaled)

    # Calc SA scores
    in_sa_temp = [sascorer.calculateScore(mol) for mol in set_in_mols]
    in_smis_temp = list(set(in_smis))
    in_sa = []
    for i, smi in enumerate(in_smis):
        in_sa.append(in_sa_temp[in_smis_temp.index(smi)])
    gen_sa_props = [sascorer.calculateScore(mol) for mol in gen_mols]

    print('Calc Morgan fingerprints')
    in_fps = []
    for i, mol in enumerate(in_mols):
        in_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))
    gen_fps = []
    for i, mol in enumerate(gen_mols):
        gen_fps.append(
            AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024))

    print('Calc DG scores')
    dg_scores, dg_ids = decoy_utils.dg_score_rev(set_in_mols, gen_mols)

    print('Calc LADS scores')
    lads_scores = decoy_utils.lads_score_v2(set_in_mols, gen_mols)
    
    print('Construct dictionary of results')
    results_dict = {}
    for i in range(len(in_smis)):
        # Get scaling
        in_props_scaled = (in_props[i] - active_min_all) / active_scale_all
        gen_props_scaled = (np.array(gen_props[i]) - active_min_all) / active_scale_all
        prop_diff = np.linalg.norm(np.array(in_props_scaled)-np.array(gen_props_scaled))

        # Get basic props diff
        basic_diff = np.sum(abs(np.array(in_basic[i])-np.array(gen_basic_props[i])))

        if in_smis[i] in results_dict:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]].append(
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff, 
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]), 
                 dg_scores[i], lads_scores[i], gen_mols[i]])
        else:
            sim = DataStructs.TanimotoSimilarity(in_fps[i], gen_fps[i])
            results_dict[in_smis[i]] = [
                [in_smis[i], gen_smis[i], in_props[i], gen_props[i], prop_diff, 
                 sim, basic_diff, abs(gen_sa_props[i]-in_sa[i]), 
                 dg_scores[i], lads_scores[i], gen_mols[i]] ]

    print('Get decoy matches')
    results = []
    results_success_only = []
    sorted_mols_success = []
    for key in results_dict:
        # Set initial criteria - Note most of these are relatively weak
        prop_max_diff = 5
        max_basic_diff = 3
        max_sa_diff = 1.51
        max_dg_score = 0.35
        max_lads_score = 0.2
        while True:
            count_success = sum([i[4]<prop_max_diff \
                                 and i[6]<max_basic_diff and i[7]<max_sa_diff \
                                 and i[8]<max_dg_score and i[9]<max_lads_score \
                                     for i in results_dict[key][0:max_idx_cmpd]])
            # Adjust criteria if not enough successes
            if count_success < num_cand_dec_per_act and max_dg_score<1:
                #print("Widening search", count_success)
                prop_max_diff *= 1.1
                max_basic_diff += 1
                max_sa_diff *= 1.1
                max_dg_score *= 1.1
                max_lads_score *= 1.1
            else:
                #print("Reached threshold", count_success)
                # Sort by sum of LADS and property difference (smaller better)
                sorted_mols_success.append(
                    [(i[0], i[1], i[4], i[9], i[4]+i[9], i[10]) \
                     for i in sorted(results_dict[key][0:max_idx_cmpd], 
                                     key=lambda i: i[4]+i[9], reverse=False)   
                    if i[4]<prop_max_diff \
                        and i[6]<max_basic_diff and i[7]<max_sa_diff \
                            and i[8]<max_dg_score and i[9]<max_lads_score])
                #assert count_success == len(sorted_mols_success[-1])
                break

    print('Choose decoys')
    # active_smis_gen = []
    decoy_smis_gen = set()

    embed_fails = 0
    dupes_wanted = 0
    for act_res in sorted_mols_success:
        count = 0
        # Greedy selection based on sum of LADS score and property difference (smaller better)
        for ent in act_res:
            # Conformer-generation check is disabled in this version; only
            # ensure the SMILES is not already a decoy for another ligand
            if ent[1] not in decoy_smis_gen:
                decoy_smis_gen.update([ent[1]])
                count += 1
                if count >= num_dec_per_act:
                    break
            else:
                dupes_wanted += 1
        # active_smis_gen.append(act_res[0][0])
    decoy_smis_gen = list(decoy_smis_gen)
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    # active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]
    active_mols_gen = set_in_mols
    dataset = 'dude'  # NB: overrides the `dataset` argument for the final property calc
    print('Calc props for chosen decoys')
    if dataset == "dude_ext":
        actives_feat = decoy_utils.calc_dataset_props_dude_extended(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dude_extended(decoy_mols_gen, verbose=True)
    elif dataset == "dekois":
        actives_feat = decoy_utils.calc_dataset_props_dekois(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_dekois(decoy_mols_gen, verbose=True)
    elif dataset == "MUV":
        actives_feat = decoy_utils.calc_dataset_props_muv(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_muv(decoy_mols_gen, verbose=True)
    elif dataset == "ALL":
        actives_feat = decoy_utils.calc_dataset_props_all(active_mols_gen, verbose=True)
        decoys_feat = decoy_utils.calc_dataset_props_all(decoy_mols_gen, verbose=True)
    elif dataset == "dude":
        actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
        decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)
    else:
        print("Incorrect dataset")
        exit()

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])
    
    # Save intermediate performance results in unique file
    #with open(output_loc+'results_'+f+'.csv', 'w') as csvfile:
    #    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    #    writer.writerow(dec_results)

    print('Save decoy mols')
    output_name = output_loc + \
        f'/{target}_{idx}_selected_{num_dec_per_act}_{num_cand_dec_per_act}.smi'
    with open(output_name, 'w') as outfile:
        for i, smi in enumerate(decoy_smis_gen):
            outfile.write(set_in_smis[i//num_dec_per_act] + ' ' + smi + '\n')
    print(dec_results)
    GM = np.mean(dec_results[7+1:7+1+3])
    print(f'GM: {GM:.4f}')
    dec_results.append(GM)
    return dec_results
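An illustrative invocation (the paired-SMILES file name is a placeholder, and decoy_utils plus the module-level min_active_size come from the original project):

results = select_and_evaluate_decoys(
    'cdk2_paired.smi', target='CDK2', idx=0,
    file_loc='./data/', output_loc='./out/',
    dataset='ALL', num_cand_dec_per_act=100, num_dec_per_act=50)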
Example #9
 def calc_similarity(self, mol1, mol2):
     fp_mol1 = rdMolDescriptors.GetMACCSKeysFingerprint(mol1)
     fp_mol2 = rdMolDescriptors.GetMACCSKeysFingerprint(mol2)
     score = DataStructs.TanimotoSimilarity(fp_mol1, fp_mol2)
     return score
Example #10
def calc_tanimoto(m1, m2):
    fp1 = AllChem.GetMorganFingerprintAsBitVect(m1, 2)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(m2, 2)
    tc = DataStructs.TanimotoSimilarity(fp1, fp2)
    return tc
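Sanity check: identical molecules give a Tanimoto coefficient of 1.0.

m = Chem.MolFromSmiles("c1ccccc1")
assert calc_tanimoto(m, m) == 1.0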
Example #11
import pandas as pd
import seaborn as sns  # needed for sns.pairplot below
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetUSRScore, GetUSRCAT
from rdkit.Chem import DataStructs

print(rdBase.rdkitVersion)

mols = [mol for mol in Chem.SDMolSupplier("cdk2.sdf") if mol is not None]
for mol in mols:
    AllChem.EmbedMolecule(mol,
                          useExpTorsionAnglePrefs=True,
                          useBasicKnowledge=True)
usrcats = [GetUSRCAT(mol) for mol in mols]
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]

data = {"tanimoto": [], "usrscore": []}

for i in range(len(usrcats)):
    for j in range(i):
        tc = DataStructs.TanimotoSimilarity(fps[i], fps[j])
        score = GetUSRScore(usrcats[i], usrcats[j])
        data["tanimoto"].append(tc)
        data["usrscore"].append(score)
        print(score, tc)
df = pd.DataFrame(data)

fig = sns.pairplot(df)
fig.savefig('plot.png')
Example #12
mol1 = Chem.MolFromSmiles(smiles1)
mol2 = Chem.MolFromSmiles(smiles2)
mol3 = Chem.MolFromSmiles(smiles3)

fp1 = AllChem.GetMorganFingerprint(
    mol1,
    3,
    #nBits = 1024,
    useFeatures=False,
    useChirality=True,
)

fp2 = AllChem.GetMorganFingerprint(
    mol2,
    3,
    #nBits = 1024,
    useFeatures=False,
    useChirality=True,
)

fp3 = AllChem.GetMorganFingerprint(
    mol3,
    3,
    #nBits = 1024,
    useFeatures=False,
    useChirality=True,
)

print(DataStructs.TanimotoSimilarity(fp1, fp2))
print(DataStructs.TanimotoSimilarity(fp1, fp3))
print(DataStructs.TanimotoSimilarity(fp2, fp3))
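The commented-out nBits arguments hint at an API difference: AllChem.GetMorganFingerprint returns an unfolded, count-based fingerprint and accepts no nBits parameter, while the folded bit-vector variant does:

bv1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 3, nBits=1024,
                                            useChirality=True)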
Example #13
 def __call__(self, molecule):
     molecule_noh = Chem.RemoveHs(molecule)
     fingerprint = AllChem.GetMorganFingerprint(molecule_noh, self.radius)
     similarity = DataStructs.TanimotoSimilarity(self.reference,
                                                 fingerprint)
     return similarity
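self.reference and self.radius come from an __init__ that is not shown; a plausible sketch (names illustrative only):

def __init__(self, reference_molecule, radius=2):
    self.radius = radius
    # pre-compute the reference fingerprint once, without explicit hydrogens
    self.reference = AllChem.GetMorganFingerprint(
        Chem.RemoveHs(reference_molecule), radius)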
Example #14
def compute_simboost_drug_target_features(dataset,
                                          mf_simboost_data_dict,
                                          nbins=10,
                                          sim_threshold=0.5):
    """
    Constructs the type 1,2, and 3 features (with the matrix factorization part) of SimBoost as described in:
    https://jcheminf.biomedcentral.com/articles/10.1186/s13321-017-0209-z
    The Matrix Factorization part is deferred to the mf.py script.

    :param sim_threshold:
    :param nbins:
    :param dataset:
    :return:
    """
    assert isinstance(
        mf_simboost_data_dict,
        dict), "Drug-Target features dictionary must be provided."
    print('SimBoost Drug-Target feature vector computation started')

    print('Processing M matrix')
    all_comps = set()
    all_prots = set()
    pair_to_value_y = {}
    Mgraph = nx.Graph(name='drug_target_network')
    Mrows = defaultdict(list)
    Mcols = defaultdict(list)
    for x, y, w, id in tqdm(dataset.itersamples()):
        mol, prot = x
        all_comps.add(mol)
        all_prots.add(prot)
        pair_to_value_y[Pair(mol, prot)] = y
        Mrows[mol].append(y)
        Mcols[prot].append(y)
        Mgraph.add_edge(mol, prot, weight=y)
    print('Number of compounds = %d' % len(all_comps))
    print('Number of targets = %d' % len(all_prots))

    # compounds / drugs
    print('Processing drug similarity matrix')
    D = {}
    Dgraph = nx.Graph(name='drug_drug_network')
    for c1 in tqdm(all_comps):
        fp1 = c1.fingerprint
        for c2 in all_comps:
            fp2 = c2.fingerprint
            # Tanimoto coefficient
            score = DataStructs.TanimotoSimilarity(fp1, fp2)
            D[Pair(c1, c2)] = score
            Dgraph.add_nodes_from([c1, c2])
            if score >= sim_threshold and c1 != c2:
                Dgraph.add_edge(c1, c2)
    comp_feats = compute_type2_features(
        compute_type1_features(Mrows, all_comps, D, nbins), D, Dgraph)

    # proteins / targets
    print('Processing target similarity matrix')
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm
    T = {}
    Tgraph = nx.Graph(name='target_target_network')
    for p1 in tqdm(all_prots):
        seq1 = p1.sequence[1]
        p1_score = aligner.score(seq1, seq1)
        for p2 in all_prots:
            seq2 = p2.sequence[1]
            p2_score = aligner.score(seq2, seq2)
            score = aligner.score(seq1, seq2)
            # Normalized SW score
            normalized_score = score / (sqrt(p1_score) * sqrt(p2_score))
            T[Pair(p1, p2)] = normalized_score
            Tgraph.add_nodes_from([p1, p2])
            if normalized_score >= sim_threshold and p1 != p2:
                Tgraph.add_edge(p1, p2)
    prot_feats = compute_type2_features(
        compute_type1_features(Mcols, all_prots, T, nbins), T, Tgraph)

    pbar = UnboundedProgressbar()
    pbar.start()

    print('Processing type 3 features')
    # Type 3 features
    btw_cent = nx.betweenness_centrality(Mgraph)
    cls_cent = nx.closeness_centrality(Mgraph)
    # eig_cent = nx.eigenvector_centrality(Mgraph, tol=1e-3, max_iter=500)
    # pagerank = nx.pagerank(Mgraph, tol=1e-3, max_iter=1000)
    drug_target_feats_dict = defaultdict(list)
    vec_lengths = []

    # Retrieve data from the Matrix Factorization stage
    comp_mat = mf_simboost_data_dict['comp_mat']
    prot_mat = mf_simboost_data_dict['prot_mat']
    comp_index = mf_simboost_data_dict['comp_index']
    prot_index = mf_simboost_data_dict['prot_index']
    for pair in tqdm(pair_to_value_y):
        comp, prot = pair.p1, pair.p2
        feat = drug_target_feats_dict[Pair(comp, prot)]
        # mf
        cidx = comp_index[comp]
        pidx = prot_index[prot]
        c_vec = comp_mat[cidx].tolist()
        p_vec = prot_mat[pidx].tolist()
        mf = c_vec + p_vec
        feat.extend(mf)

        # d.t.ave
        d_av_lst = []
        for n in Mgraph.neighbors(prot):
            if Pair(comp, n) in pair_to_value_y:
                d_av_lst.append(pair_to_value_y[Pair(comp, n)])
        if len(d_av_lst) > 0:
            feat.append(np.mean(d_av_lst))

        # t.d.ave
        t_av_lst = []
        for n in Mgraph.neighbors(comp):
            if Pair(n, prot) in pair_to_value_y:
                t_av_lst.append(pair_to_value_y[Pair(n, prot)])
        if len(t_av_lst) > 0:
            feat.append(np.mean(t_av_lst))

        # d.t.bt, d.t.cl, d.t.ev
        feat.append(btw_cent[comp])
        feat.append(btw_cent[prot])
        feat.append(cls_cent[comp])
        feat.append(cls_cent[prot])
        # feat.append(eig_cent[comp])
        # feat.append(eig_cent[prot])

        # d.t.pr
        # feat.append(pagerank[comp])
        # feat.append(pagerank[prot])

        # add type 1 features
        feat.extend(comp_feats[comp])
        feat.extend(prot_feats[prot])

        vec_lengths.append(len(feat))

    # zero-pad all vectors to be of the same dimension
    dim = max(vec_lengths)
    for k in drug_target_feats_dict:
        feat = drug_target_feats_dict[k]
        pvec = [0] * (dim - len(feat))
        feat.extend(pvec)

    pbar.stop()
    pbar.join()
    print(
        'SimBoost Drug-Target feature vector computation finished. Vector dimension={}'
        .format(dim))
    return drug_target_feats_dict
Example #15
def get_scaffold_simi(fp1, fp2):
    return DataStructs.TanimotoSimilarity(fp1, fp2)
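The name suggests the inputs are fingerprints of Bemis-Murcko scaffolds; an illustrative pairing for two RDKit molecules mol1 and mol2 (placeholders):

from rdkit.Chem.Scaffolds import MurckoScaffold

scaf_fps = [AllChem.GetMorganFingerprintAsBitVect(
                MurckoScaffold.GetScaffoldForMol(m), 2, nBits=1024)
            for m in (mol1, mol2)]
print(get_scaffold_simi(scaf_fps[0], scaf_fps[1]))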