def import_ints(cyto_ppis_fname):
    """Load a cytoscape ppi file into a deduplicated interaction list.

    Skips the header line.  Ids look like 'prefix_GENEID', so the gene id
    is taken after the underscore.  Column 4 holds the corum label:
    'gold'/'train' -> 1, anything else -> 0.
    Returns a list of (gene1, gene2, prob, gold_or_train) rows.
    """
    rows = [line[:4] for line in ut.load_lol(cyto_ppis_fname)[1:]]
    labeled = [(id1.split('_')[1], id2.split('_')[1], prob,
                1 if corum in set(['gold', 'train']) else 0)
               for id1, id2, prob, corum in rows]
    # PairDict collapses duplicate (a,b)/(b,a) pairs; pd_lol turns the
    # result back into a list of lists.
    return pd.pd_lol(pd.PairDict(labeled))
def load_reactome_pairs(fname='reactome/homo_sapiens.interactions.txt'):
    """Return reactome id pairs supported by a direct 'reaction'.

    Columns 1 and 4 of the interactions file hold ids of the form 'DB:ID'
    (empty string when missing); column 6 holds the interaction-type label.
    A pair is kept only when its collected labels include "reaction" and
    include neither "indirect_complex" nor "direct_complex".

    Returns:
        list of (id1, id2) pairs.
    """
    rmlol = ut.load_lol(fname)
    rmall = pd.PairDict([])
    # Collect the set of interaction-type labels seen for each pair.
    # Fixed: replaced the deprecated Python-2-only '<>' operator with '!='.
    pd.pd_set_loi_sets(rmall,
            [[x[1].split(':')[1], x[4].split(':')[1], x[6]]
             for x in rmlol[1:] if x[1] != '' and x[4] != ''])
    rmfilt = [(k, v) for k, v in rmall.d.items()
              if "reaction" in v
              and "indirect_complex" not in v
              and "direct_complex" not in v]
    return [k for k, v in rmfilt]
def multi_identities(input_fname, out_dir): input_list = ut.load_lol(input_fname) for desc, prots_fname, source_fasta, odict, target in input_list: print "%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s" % (desc, prots_fname, source_fasta, odict, target) prots = ut.load_list(prots_fname) sims = all_identities(prots, odict, source_fasta, target) out_fname = os.path.join(out_dir, ut.shortname(target).split('.')[0] + "_" + desc + ".txt") ut.write_tab_file(sims, out_fname, islist=True)
def load_pepcount(f): lol = ut.load_lol(f) print "Omitting header:", lol[0] lol = lol[1:] peps = ut.i0(lol[1:]) samples = lol[0][2:] arr = np.zeros((len(peps), len(samples))) for i,row in enumerate(lol[1:]): arr[i,:] = row[2:] return peps, samples, arr
def exported_diff(cy_basefile, cy_difffile, col_header, diff_ppis=None,
        justids=False):
    """
    Makes a new cy_ file labeling whether that interaction is also found in
    the cy_difffile (or the diff_ppis--pass None for cy_difffile in that
    case).
    """
    def cy_ppi_to_pair(p):
        # Ids look like 'prefix_GENEID'; compare on the part after '_'.
        return (p[0].split('_')[1], p[1].split('_')[1])
    if cy_difffile is not None:
        # NOTE(review): load_lot here vs load_lol elsewhere -- presumably
        # list-of-tuples; confirm in ut.
        pd_diff = pd.PairDict([cy_ppi_to_pair(p)
                               for p in ut.load_lot(cy_difffile)[1:]])
    else:
        pd_diff = pd.PairDict(diff_ppis)
    # Load the base file once instead of twice as before.
    base = ut.load_lol(cy_basefile)
    header, lines = base[0], base[1:]
    if justids:
        lines = [l[:2] for l in lines]
        header = header[:2]
    header += [col_header]
    ut.write_tab_file(
            [r + [pd_diff.contains(cy_ppi_to_pair(r))] for r in lines],
            ut.pre_ext(cy_basefile, col_header), header=header)
def ensg_to_ensp_and_park(ppips,
        fasta_fname='/Users/blakeweb/Dropbox/complex/data/sequences/canon/Hs.fasta',
        park_fname='./orth_similarities/table.Hsapiens/Hsapiens_id.txt'):
    """Map ENSG gene ids to Park ids, going through ENSP protein ids.

    Generalized: the previously hard-coded fasta and Park-id table paths are
    now parameters with the old values as defaults, so existing callers are
    unaffected.

    Args:
        ppips: list of ENSG gene ids.
        fasta_fname: fasta file used to build the protein<->gene mapping.
        park_fname: Park id table; columns are (park_id, ensp, ensg).
    Returns:
        list of Park ids (direct ensp matches first, then ensg fallbacks;
        ids with no match in either table are silently dropped).
    """
    dhpg = seqs.prots2genes(fasta_fname)
    dhgp = ut.dict_inverse(dhpg)
    parkids = ut.load_lol(park_fname)
    ppips_ensp = [dhgp[g] for g in ppips]
    dg2park = dict([(x[2], x[0]) for x in parkids])  # ensg -> park id
    dp2park = dict([(x[1], x[0]) for x in parkids])  # ensp -> park id
    # Prefer the direct ensp->park mapping; fall back to ensg->park for the
    # proteins the first table misses.
    park_ppips_most = [dp2park[p] for p in ppips_ensp if p in dp2park]
    ppips_ensp_rest = [p for p in ppips_ensp if p not in dp2park]
    ppips_ensg_rest = [dhpg[p] for p in ppips_ensp_rest]
    park_ppips_rest = [dg2park[p] for p in ppips_ensg_rest if p in dg2park]
    return park_ppips_most + park_ppips_rest
def load_reactome_pairs_reactions(fname='reactome/homo_sapiens.interactions.txt'):
    """Return a PairDict of reactome pairs restricted to 'reaction' rows.

    Unlike load_reactome_pairs, this filters on column 6 == 'reaction' up
    front and stores the column-7 labels as each pair's value set.
    """
    rmlol = ut.load_lol(fname)
    rmpd = pd.PairDict([])
    # Fixed: replaced the deprecated Python-2-only '<>' operator with '!='.
    pd.pd_set_loi_sets(rmpd,
            [[x[1].split(':')[1], x[4].split(':')[1], x[7]]
             for x in rmlol[1:]
             if x[1] != '' and x[4] != '' and x[6] == 'reaction'])
    return rmpd
def entrez_desc():
    """Build {id: description} from the configured gene_desc_Hs file.

    The id comes from column 4 with its 2-character prefix stripped; the
    description is column 2 truncated at the first '['.
    """
    rows = ut.load_lol(ut.config('gene_desc_Hs'))
    return dict((row[4][2:], row[2].split('[')[0]) for row in rows)
def load_ppis(fname):
    """Load ppi rows from a tab file, typed as (str, str, float, int)."""
    column_types = (str, str, float, int)
    return ut.load_lol(fname, dtypes=column_types)