def multi_clust(tested, score_cutoffs=None, length_cutoffs=None, fracs=[.012,.014], frac_retain=.1, ds=[.1,.25,.3,.35], ms=[.1,.15,.2], penalties=[.1,1], overlaps=[.55], haircuts=[0,.2], max_pval=1, savef=None, runid=None, show_stats=True, pres=None, gold_nspecies=1, gold_splits=None, gold_minlen=3, mdprod_min=.01, **kwargs): runid = runid or random.randrange(1,1000) fracs = (fracs if fracs is not None else [cl.n_thresh(tested, s)/len(tested) for s in score_cutoffs] if score_cutoffs is not None else [le/len(tested) for le in length_cutoffs]) print "random id:", runid clusts = [] params = [fracs, ds, ms, penalties, overlaps, haircuts] products = it.product(*params) for (f,d,m,p,o,h) in products: if d*m >= mdprod_min: cxstruct = cl.filter_clust(ut.list_frac(tested, f), ut.list_frac(tested, frac_retain), merge_cutoff=o, negmult=m, min_density=d, runid=runid, penalty=p, max_pval=max_pval, max_overlap=o, haircut=h, **kwargs) cxstruct.params = ('density=%s,frac=%s,f_retain=%s,negmult=%s,penalty=%s,max_overlap=%s,haircut=%s' % (d,f,frac_retain,m,p,o,h)) clusts.append(cxstruct) if show_stats and len(cxstruct.cxs)>0: if pres is not None and gold_splits is not None: out = cp.select_best(cp.result_stats(pres.species, gold_splits, clusts[-1:], gold_nspecies, min_gold_size=gold_minlen)) else: print "Can't show stats: pres and gold_splits required." if savef and (len(clusts) % 10 == 1): ut.savepy(clusts, ut.pre_ext(savef, "clusts_temp_%s_%s" % (ut.date(), runid))) return clusts, runid
def cluster(tested, negmult, cltype, max_pval=0.2, **kwargs): kwargs = ut.dict_set_defaults(kwargs, clust_defaults) command = c1_command if cltype=='c1' else mcl_command if 'runid' in kwargs: # keep temp files separate runid = str(kwargs['runid']) kwargs['fin'] = ut.pre_ext(kwargs['fin'], runid) kwargs['fout'] = ut.pre_ext(kwargs['fout'], runid) export_cxs(tested, kwargs['fin'],negmult) command = command % kwargs print command shell_call(command) if cltype=='c1': cxs, pvals, cx_details = read_clust_output(kwargs['fout'], max_pval) elif cltype=='mcl': cxs = read_mcl_output(kwargs['fout']) pvals,cx_details=None,None else: assert False, "Wrong cluster type: %s" %cltype return Struct(cxs=cxs, pvals=pvals, cx_details=cx_details)
def fnet_names(fnet_file):
    """
    Return the names of the data columns for a functional-network file,
    read from the companion '<file>_names' file; names containing '=' are
    truncated at the '='. Returns None when the file holds a single data
    column (no names file needed).
    """
    filename = ut.proj_path('fnet_path',fnet_file)
    header = ut.load_tab_file(filename).next()
    if len(header)-2 <= 1:
        return None #means there is only one data column.
    names = []
    for row in ut.load_tab_file(ut.pre_ext(filename,'_names')):
        label = row[0]
        if '=' in label:
            # keep only the part before the '='
            label = label.split('=')[0]
        names.append(label.strip())
    return names
def mq2elut(fname, quant='iBAQ'):
    """
    Convert a MaxQuant proteinGroups-style tab file into an elution-style
    table, keeping only the per-sample columns for the given quantification
    type (eg 'iBAQ'), and write it next to the input as '<fname>_mq_<quant>'.

    Uses the second column's "majority protein" (first ';'-separated entry,
    leading character stripped) as the row id.
    """
    lines = [l for l in ut.load_tab_file(fname)]
    # want eg 'iBAQ WAN...', not 'iBAQ L WAN...'
    # raw string so \s and \w reach the regex engine untouched; compiled
    # once instead of per header cell
    pattern = re.compile(r'^%s\s\w{2}' % quant)
    inds = [i for i,val in enumerate(lines[0]) if pattern.match(val) is not None]
    # for now just using the "majority protein"
    prots = [p.split()[0][1:] for p in [l[1] for l in lines[1:]]]
    output = [[lines[0][0]] + [lines[0][i] for i in inds]] + \
            [[p] + [l[i] for i in inds] for p,l in zip(prots, lines[1:])]
    ut.write_tab_file(output, ut.pre_ext(fname, '_mq_%s' % quant))
def export_idconvert(ppis, dict_cxlabels, fname):
    """
    Write a 4-column id-conversion table ('nodeid', 'ENSGID', 'complexid',
    'ComplexLabel') to '<fname>pfx_convert'. Node ids are '<cxid>_<pid>';
    each complex gets its label from dict_cxlabels on first appearance only
    (subsequent rows get an empty label).
    """
    labeled = set([])
    rows = [['nodeid', 'ENSGID', 'complexid', 'ComplexLabel']]
    for ppi in ppis:
        for node in (ppi[0], ppi[1]):
            pieces = node.split('_')
            cxid = pieces[0]
            pid = '_'.join(pieces[1:]) #in case '_' in id, eg for Sp
            if cxid in labeled:
                label = ''
            else:
                label = dict_cxlabels[cxid]
                labeled.add(cxid)
            rows.append([node, pid, cxid, label])
    ut.write_tab_file(rows, ut.pre_ext(fname,'pfx_convert'))
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """ Keep selected columns, replace 'NA' with '?', remove empty rows.
    Ids (first 2 columns) are kept automatically.
    For column inds, start with 0 for scores.
    Keep the same columns from the fnames file so I have a record of it.
    """
    # Set gives O(1) membership tests; the list form was scanned once per
    # candidate column per row. Iteration order is unchanged (still driven
    # by range()), so output is identical.
    colset = set(column_inds)
    out = []
    default = ['?'] * len(column_inds)
    for l in ut.load_tab_file(fdata):
        ids = list(l[:2])
        # columns are offset by 2 for the two id columns
        newdata = [l[i+2] if l[i+2]!='NA' else '?'
                for i in range(len(l)) if i in colset]
        if newdata != default:  # drop rows that are entirely missing
            out.append(ids + newdata)
    ut.write_tab_file(out, fout)
    names = [l for i,l in enumerate(
        list(ut.load_tab_file(fnames))[first_names:]) if i in colset]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names'))
def exported_diff(cy_basefile, cy_difffile, col_header, diff_ppis=None,
        justids=False):
    """
    Makes a new cy_ file labeling whether that interaction is also found in
    the cy_difffile (or the diff_ppis--pass None for cy_difffile in that
    case).
    """
    def cy_ppi_to_pair(p):
        # cy_ node ids look like '<prefix>_<id>'; compare on the ids
        return (p[0].split('_')[1], p[1].split('_')[1])
    if cy_difffile is not None:
        pd_diff = pd.PairDict([cy_ppi_to_pair(p)
            for p in ut.load_lot(cy_difffile)[1:]])
    else:
        pd_diff = pd.PairDict(diff_ppis)
    # Load the base file once (was loaded and parsed twice).
    base = ut.load_lol(cy_basefile)
    header = base[0]
    lines = base[1:]
    if justids:
        lines = [l[:2] for l in lines]
        header = header[:2]
    header += [col_header]
    ut.write_tab_file([r + [pd_diff.contains(cy_ppi_to_pair(r))]
        for r in lines], ut.pre_ext(cy_basefile, col_header), header=header)
def predict_clust(name, sp, nsp, obs=None, exs=None, savef=None, pres=None,
        pd_spcounts=None, cl_kwargs={}, clusts=None, runid=None,
        count_ext=False, cutoff=0.5, n_cvs=7, accept_clust=False,
        obs_fnames=None, base_splits=None, obs_kwargs={}, kfold=3,
        gold_nspecies=2, do_cluster=True, do_2stage_cluster=True,
        cxs_cxppis=None, do_rescue=True, n_rescue=20000, rescue_fracs=20,
        rescue_score=0.9, clstruct=None, **predict_kwargs):
    """
    - obs/test_kwargs: note obs_kwargs is combined with predict_kwargs to
    enforce consistency.
    - pd_spcounts: supply from ppi.predict_all if nsp > 1.
    - base_splits: supply exs.splits to generate examples from existing
    division of complexes.
    - cxs_cxppis: provide if you want to export, or do the ppi rescue
    clustering--also must set accept_clust=True, do_rescue=True
    """
    # Default save path is derived from the run name.
    savef = savef if savef else ut.bigd(name)+'.pyd'
    print "Will save output to", savef
    runid = runid or random.randrange(0,1000)
    if clusts is None:
        # Stage 1: obtain a prediction result (pres), computing observations
        # and examples first if they weren't supplied.
        if pres is None:
            if obs is None:
                obs, pd_spcounts = ppi.predict_all(sp, obs_fnames,
                        save_fname=savef.replace('.pyd',''), nsp=nsp,
                        **obs_kwargs)
            if exs is None:
                # Merge kwargs so cross-validation uses the same settings as
                # the final prediction.
                cvtest_kwargs = ut.dict_quick_merge(obs_kwargs, predict_kwargs)
                # With explicit base_splits there is nothing to median over.
                n_cvs = 1 if base_splits is not None else n_cvs
                cvs, cvstd = cvstd_via_median(name, sp, nsp, obs_fnames,
                        kfold, base_splits, n_cvs, **cvtest_kwargs)
                if n_cvs > 1:
                    ut.savepy(cvs, ut.pre_ext(savef, '_cvs_%s' % n_cvs))
                ut.savepy(cvstd, ut.pre_ext(savef, '_cvstd'))
                exs=cvstd.exs
            pres = predict(name, sp, obs, exs.arrfeats, nsp, **predict_kwargs)
            pres.exs = exs
            ut.savepy(pres, ut.pre_ext(savef, '_pres'), check_exists=True)
        else:
            # Work on a copy so the caller's pres is not mutated below.
            pres=ut.struct_copy(pres)
            if do_rescue:
                assert obs is not None, "Must supply obs for rescue step"
    merged_splits = pres.exs.splits[1] # splits is (lp_splits, clean_splits)
    if do_cluster:
        # Stage 2: cluster the predicted ppis (unless final complexes or a
        # result-stats structure were already provided).
        if cxs_cxppis is None and clstruct is None:
            if clusts is None and cxs_cxppis is None:
                #if calc_fracs:
                #cl_kwargs['fracs'] = [cp.find_inflection(pres.ppis, merged_splits,
                #pres.species, gold_nspecies)]
                clusts, runid = multi_clust(pres.ppis, savef=savef,
                        runid=runid, pres=pres, gold_splits=merged_splits,
                        gold_nspecies=gold_nspecies, **cl_kwargs)
                ut.savepy(clusts, ut.pre_ext(savef, '_clusts_id%s' % runid))
            if do_2stage_cluster:
                clusts2 = multi_stage2_clust(clusts, pres.ppis, runid=runid,
                        **cl_kwargs)
                clstruct = cp.result_stats(sp, merged_splits, clusts2,
                        gold_nspecies)
                ut.savepy(clstruct, ut.pre_ext(savef,
                        '_clstruct2_id%s' % runid))
            else:
                clstruct = cp.result_stats(sp, merged_splits, clusts, nsp)
                ut.savepy(clstruct, ut.pre_ext(savef,
                        '_clstruct_id%s' % runid))
        if accept_clust:
            # Stage 3: pick (or accept) a final clustering, optionally
            # rescue additional ppis, and export for cytoscape.
            if cxs_cxppis is None:
                pres.cxs, pres.cxppis, pres.ind = cp.select_best(clstruct)
                ut.savepy([pres.cxs,pres.cxppis],
                        ut.pre_ext(savef,'_cxs_cxppis_id%s_ind%s_%scxs'
                            % (runid, pres.ind, len(pres.cxs))))
            else:
                pres.cxs, pres.cxppis = cxs_cxppis
                pres.ind = 0
            if do_rescue:
                # note cl_kwargs aren't passed--would be messy
                pres.cxs, pres.cxppis, pres.ppis_rescue = rescue_ppis(pres,
                        obs, n_rescue, cutoff_fracs=rescue_fracs,
                        cutoff_score=rescue_score)
            cyto_export(pres, merged_splits, name_ext='_clust%s_%scxs'
                    % (pres.ind, len(pres.cxs)), pd_spcounts=pd_spcounts,
                    arrdata=obs, cutoff=cutoff, count_ext=False,
                    arrdata_ppis=None)
            return pres
        else:
            return pres, clstruct
    else:
        return pres