Example #1
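# NOTE: the imports below are assumptions inferred from usage; the original
# snippet omits them. 'pd' here is a pair-dictionary helper module providing
# PairDict, not pandas. All module names are hypothetical.
# import pairdict as pd    # PairDict, pd_union_disjoint_vals
# import corum as co       # pairs_from_complexes, load_havug_ints
# import utils as ut       # i0, i1, loadpy, bigd
# import compare as cp     # consv_pairs, triple_venn, unmerged_splits_from_merged_splits
# import features as fe    # arr_add_spsummary, arrfeats_add_ppis, arrfeats_set_gold
# import cv                # gold_label_ppis
# import ppi               # load_training_complexes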
def ppis_gold_standard(ppis, cxs_splits, species):
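    # Label each predicted interaction by whether it appears in the
    # CORUM-derived gold standard, and mark gold pairs that were in the
    # training split as 'train' so they can be excluded from evaluation.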
    pdppis = pd.PairDict([p[:3] for p in ppis])
    print len(pdppis.d), "predicted interactions"
    ppi_cxs,_,all_cxs = ppi.load_training_complexes(species, None,'') #conv doesn't matter
    pdcorum = pd.PairDict([(i[0],i[1],'gold') for i in
                        co.pairs_from_complexes(ut.i1(all_cxs))])
    print len(pdcorum.d), "total gold standard"
    pdcomb = pd.pd_union_disjoint_vals(pdppis, pdcorum)
    unmr_splits = cp.unmerged_splits_from_merged_splits(ppi_cxs,cxs_splits)
    print "unmerged split assignment lengths", [len(s) for s in unmr_splits]
    pdtrainpos = pd.PairDict([(t[0],t[1]) for t in
        co.pairs_from_complexes(unmr_splits[0])])
    print len(pdtrainpos.d), "total train interactions"
    counterrs = 0
    for tpair in pdtrainpos.d:
        cpair = pdcomb.find(tpair)
        #assert cpair is not None, "Gold standard problem--filter_methods changed since run?"
        if cpair is None or pdcomb.d[cpair][1] != 'gold':
            #print 'error: train should be subset', tpair
            counterrs += 1
        else:
            pdcomb.d[cpair][1] = 'train'
    if counterrs: print "number of training not found in gold std:", counterrs
    comblist = [list(k)+list(v) for k,v in pdcomb.d.items()]
    print len([1 for p in comblist if p[2] and p[3]=='gold']), \
            "ppis in gold not train"
    print len([1 for p in comblist if p[2] and p[3]=='train']), "ppis in train"
    # only return those that are predictions
    return [p for p in comblist if p[2]]
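# Example call (hypothetical inputs): 'ppis' holds (id1, id2, score, ...)
# tuples and 'cxs_splits' the merged complex splits used during training:
# gold_labeled = ppis_gold_standard(ppis, cxs_splits, 'Hs')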
def triple_venn_consv(h2d):
    # h2d: an Hs->Dm ortholog mapping dict, assumed to be supplied by the
    # caller (it is used by cp.consv_pairs below but never defined locally).
    hints = co.load_havug_ints()
    ppi_cxs, clean_cxs, corconsv = ppi.load_training_complexes("Hs", "Dm")
    cints = co.pairs_from_complexes(ut.i1(ppi_cxs))  # exclude huge ones
    ints23 = ut.loadpy(ut.bigd("../23_collapsenodes/Hs_filtorth025_withsc_2sp_refilt2sp_cxs_cxppis_clust27_532cxs"))[1]
    ints3 = [cp.consv_pairs(i, h2d) for i in (ints23, hints, cints)]
    cp.triple_venn(ints3, ["map23", "havug", "corum"])
def tested_ppis(gold_cxs, ppis):
    gold_ints = co.pairs_from_complexes(gold_cxs)
    ntest_pos = len(gold_ints)
    pdtrues = pd.PairDict(gold_ints)
    ppis = [(p[0], p[1], p[2], 1 if pdtrues.contains(tuple(p[:2])) else 0)
            for p in ppis]
    return ppis, ntest_pos
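# Example (hypothetical data): each predicted pair gets a trailing 0/1 label
# marking whether it is a gold-standard pair; ntest_pos counts the positives
# available to recover:
# labeled, npos = tested_ppis(gold_cxs, [('a', 'b', 0.9), ('a', 'c', 0.4)])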
def arrfeats_prep_all_data(arrfeats, ppis, sp="Hs", gold_consv="Dm", cutoff=0.5):
    print "Adding species summary."
    arrfeats = fe.arr_add_spsummary(arrfeats, cutoff)
    print "Adding ppis."
    arrfeats = fe.arrfeats_add_ppis(arrfeats, ppis)
    _, _, all_cxs = ppi.load_training_complexes(sp, None, gold_consv)
    pdgold = pd.PairDict(co.pairs_from_complexes(ut.i1(all_cxs)))
    print "Setting trues."
    arrfeats = fe.arrfeats_set_gold(arrfeats, pdgold)
    return arrfeats
def hpa_stats(ppis, locs, max_set_size=None):
    s = attr_to_sets(locs)
    if max_set_size is not None: 
        s = [c for c in s if len(c) < max_set_size]
    plocs = co.pairs_from_complexes(s)
    ppiprots = set(ut.i0(ppis)+ut.i1(ppis))
    anprots = set(ut.i0(locs))
    intprots = set.intersection(ppiprots, anprots)
    print len(ppiprots), len(anprots), len(intprots)
    return ppis_stats(ppis, plocs, intprots)
def _filter_ints(inlist, cxs):
    pairs = co.pairs_from_complexes(cxs)
    pdp = pd.PairDict(pairs)
    return [tup for tup in inlist if pdp.contains((tup[0],tup[1]))]
def clique_score(cx, pdints):
    cx_ints = co.pairs_from_complexes([cx])
    # Fraction of the complex's possible edges supported by pdints; float()
    # avoids Python 2 integer division truncating the score to 0 or 1.
    return len([1 for edge in cx_ints if pdints.contains(edge)])/float(len(cx_ints))
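# Added sketch: a self-contained illustration of the clique_score idea using
# only the stdlib (frozenset pairs stand in for PairDict; the data is
# hypothetical). The score is the fraction of a complex's possible edges
# that are supported by predicted interactions, i.e. a clique density.
def _clique_score_demo():
    from itertools import combinations
    cx = ['a', 'b', 'c']                          # toy 3-member complex
    ints = set([frozenset(('a', 'b')), frozenset(('a', 'c'))])
    pairs = list(combinations(cx, 2))             # the 3 possible edges
    # 2 of the 3 possible edges are supported -> returns 0.666...
    return len([1 for p in pairs if frozenset(p) in ints]) / float(len(pairs))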
def gold_label_ppis(ppis, merged_splits, sp, gold_nsp):
    gold_consv = 'Dm' if gold_nsp>1 else ''
    ppi_cxs,_,_ = ppi.load_training_complexes(sp, '', gold_consv)
    train_cxs = cp.unmerged_splits_from_merged_splits(ppi_cxs, merged_splits)[0]
    ppis = cv.gold_label_ppis(ppis, co.pairs_from_complexes(train_cxs))
    return ppis
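# Example call (hypothetical inputs): label predicted ppis against the
# training split of the gold standard, here for human with a two-species
# conservation requirement:
# labeled = gold_label_ppis(ppis, merged_splits, 'Hs', 2)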