import itertools as it
import random
import re
# Project-local helpers (ut, cl, cp, pd, ppi, Struct, shell_call, export_cxs,
# clust_defaults, ...) are assumed to come from the surrounding package; their
# import lines are not part of this excerpt.
def multi_clust(tested, score_cutoffs=None, length_cutoffs=None,
        fracs=[.012,.014], frac_retain=.1, ds=[.1,.25,.3,.35], ms=[.1,.15,.2],
        penalties=[.1,1], overlaps=[.55], haircuts=[0,.2], max_pval=1,
        savef=None, runid=None, show_stats=True, pres=None, gold_nspecies=1,
        gold_splits=None, gold_minlen=3, mdprod_min=.01, **kwargs):
    runid = runid or random.randrange(1,1000)
    # if explicit fracs aren't given, derive them from score or length cutoffs;
    # cast to float so Python 2 integer division doesn't truncate the fractions
    fracs = (fracs if fracs is not None
        else [cl.n_thresh(tested, s)/float(len(tested)) for s in score_cutoffs] if score_cutoffs is not None
        else [le/float(len(tested)) for le in length_cutoffs])
    print "random id:", runid
    clusts = []
    params = [fracs, ds, ms, penalties, overlaps, haircuts]
    products = it.product(*params)
    for (f,d,m,p,o,h) in products:
        if d*m >= mdprod_min:
            cxstruct = cl.filter_clust(ut.list_frac(tested, f),
                    ut.list_frac(tested, frac_retain), merge_cutoff=o, negmult=m, min_density=d,
                    runid=runid, penalty=p, max_pval=max_pval, max_overlap=o,
                    haircut=h, **kwargs)
            cxstruct.params = ('density=%s,frac=%s,f_retain=%s,negmult=%s,penalty=%s,max_overlap=%s,haircut=%s' % (d,f,frac_retain,m,p,o,h))
            clusts.append(cxstruct)
            if show_stats and len(cxstruct.cxs)>0:
                if pres is not None and gold_splits is not None:
                    out = cp.select_best(cp.result_stats(pres.species, gold_splits,
                        clusts[-1:], gold_nspecies, min_gold_size=gold_minlen))
                else:
                    print "Can't show stats: pres and gold_splits required."
            if savef and (len(clusts) % 10 == 1):
                ut.savepy(clusts, ut.pre_ext(savef, "clusts_temp_%s_%s" % (ut.date(),
                    runid)))
    return clusts, runid
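# Hypothetical usage sketch (not part of the original source): sweep a small
# parameter grid over a list of scored PPIs and keep the run with the most
# complexes. `scored_ppis` is an illustrative placeholder.
def _example_multi_clust(scored_ppis):
    clusts, runid = multi_clust(scored_ppis, ds=[.25,.3], ms=[.1],
            penalties=[1], show_stats=False)
    return max(clusts, key=lambda c: len(c.cxs)), runid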
def cluster(tested, negmult, cltype, max_pval=0.2, **kwargs):
    kwargs = ut.dict_set_defaults(kwargs, clust_defaults)
    command = c1_command if cltype=='c1' else mcl_command
    if 'runid' in kwargs: # keep temp files separate
        runid = str(kwargs['runid']) 
        kwargs['fin'] = ut.pre_ext(kwargs['fin'], runid)
        kwargs['fout'] = ut.pre_ext(kwargs['fout'], runid)
    export_cxs(tested, kwargs['fin'], negmult)
    command = command % kwargs
    print command
    shell_call(command)
    if cltype=='c1':
        cxs, pvals, cx_details = read_clust_output(kwargs['fout'], max_pval)
    elif cltype=='mcl':
        cxs = read_mcl_output(kwargs['fout'])
        pvals,cx_details=None,None
    else:
        assert False, "Unrecognized cluster type: %s" % cltype
    return Struct(cxs=cxs, pvals=pvals, cx_details=cx_details)
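# Hypothetical usage sketch (not part of the original source): run a single
# 'c1' clustering pass over scored PPIs; the input/output filenames come from
# clust_defaults, and runid keeps the temp files separate between runs.
def _example_cluster(scored_ppis):
    result = cluster(scored_ppis, negmult=.1, cltype='c1', max_pval=0.2,
            runid=42)
    return result.cxs, result.pvals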
def fnet_names(fnet_file):
    filename = ut.proj_path('fnet_path',fnet_file)
    first = ut.load_tab_file(filename).next()
    nfields = len(first)-2
    if nfields > 1:
        return [l[0].strip() if l[0].find('=')==-1 else
                l[0].split('=')[0].strip() for l in
                ut.load_tab_file(ut.pre_ext(filename,'_names'))]
    else:
        return None #means there is only one data column.
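# Hypothetical usage sketch (not part of the original source): get the feature
# names for a functional-network file, falling back to a single generic label
# when the file has only one data column.
def _example_fnet_names(fnet_file):
    names = fnet_names(fnet_file)
    return names if names is not None else ['single_data_column']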
def mq2elut(fname, quant='iBAQ'):
    lines = [l for l in ut.load_tab_file(fname)]
    # want eg 'iBAQ WAN...', not 'iBAQ L WAN...'
    inds = [i for i,val in enumerate(lines[0])
            if re.match(r'^%s\s\w{2}' % quant, val) is not None]
    #prots = [[p.split()[0][1:] for p in ps.split(';')] 
            #for ps in [l[0] for l in lines[1:]]]
    # for now just using the "majority protein"
    prots = [p.split()[0][1:] for p in [l[1] for l in lines[1:]]]
    output = [[lines[0][0]] + [lines[0][i] for i in inds]] + \
            [[p] + [l[i] for i in inds] for p,l in zip(prots, lines[1:])]
    ut.write_tab_file(output, ut.pre_ext(fname, '_mq_%s' % quant))
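# Hypothetical usage sketch (not part of the original source): convert a
# MaxQuant proteinGroups table into an elution-style file using its iBAQ
# columns; the filename is an illustrative placeholder.
def _example_mq2elut():
    mq2elut('proteinGroups.txt', quant='iBAQ')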
def export_idconvert(ppis, dict_cxlabels, fname):
    cxs_labeled = set([])
    pfx_convert = []
    for p in ppis:
        for i in (0, 1):
            combid = p[i]
            cxid = combid.split('_')[0]
            pid = '_'.join(combid.split('_')[1:]) #in case '_' in id, eg for Sp
            if cxid not in cxs_labeled:
                cxlabel = dict_cxlabels[cxid]
                cxs_labeled.add(cxid)
            else:
                cxlabel = ''
            pfx_convert.append([combid, pid, cxid, cxlabel])
    pfx_convert = [['nodeid', 'ENSGID', 'complexid', 'ComplexLabel']] \
            + pfx_convert
    ut.write_tab_file(pfx_convert, ut.pre_ext(fname,'pfx_convert'))
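# Hypothetical usage sketch (not part of the original source): write the
# nodeid/ENSGID/complexid mapping for complex-prefixed PPIs; `cxppis` and
# `cx_labels` (a dict of complex id -> label) are illustrative placeholders.
def _example_export_idconvert(cxppis, cx_labels):
    export_idconvert(cxppis, cx_labels, 'network_idconvert.tab')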
def munge_original(fdata, column_inds, fnames, fout, first_names=1):
    """
    Keep selected columns, replace 'NA' with '?', remove empty rows.
    Ids (first 2 columns) are kept automatically.
    For column inds, start with 0 for scores.
    Keep the same columns from the fnames file so I have a record of it.
    """
    out = []
    default = ['?'] * len(column_inds)
    for l in ut.load_tab_file(fdata):
        ids = list(l[:2])
        newdata = [l[i+2] if l[i+2]!='NA' else '?' for i in range(len(l)) if i
            in column_inds]
        if newdata != default:
            out.append(ids + newdata)
    ut.write_tab_file(out, fout)
    names = [l for i,l in enumerate( list( ut.load_tab_file(
        fnames))[first_names:]) if i in column_inds]
    ut.write_tab_file(names, ut.pre_ext(fout, '_names')) 
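# Hypothetical usage sketch (not part of the original source): keep score
# columns 0, 3, and 5 from a feature table, writing both the cleaned data and
# the matching names record; filenames are illustrative placeholders.
def _example_munge_original():
    munge_original('features.tab', [0, 3, 5], 'feature_names.tab',
            'features_subset.tab')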
def exported_diff(cy_basefile, cy_difffile, col_header, diff_ppis=None,
        justids=False):
    """
    Makes a new cy_ file labeling whether that interaction is also found in the
    cy_difffile (or the diff_ppis--pass None for cy_difffile in that case).
    """
    def cy_ppi_to_pair(p):
        return (p[0].split('_')[1], p[1].split('_')[1])
    if cy_difffile is not None:
        pd_diff = pd.PairDict([cy_ppi_to_pair(p) 
            for p in ut.load_lot(cy_difffile)[1:]])
    else:
        pd_diff = pd.PairDict(diff_ppis)
    lol_base = ut.load_lol(cy_basefile)
    header, lines = lol_base[0], lol_base[1:]
    if justids:
        lines = [l[:2] for l in lines]
        header = header[:2]
    header += [col_header]
    ut.write_tab_file([r + [pd_diff.contains(cy_ppi_to_pair(r))] for r in
        lines], ut.pre_ext(cy_basefile, col_header), header=header)
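# Hypothetical usage sketch (not part of the original source): mark which
# interactions in one exported cytoscape-style file also appear in a second
# export; filenames and the new column header are illustrative placeholders.
def _example_exported_diff():
    exported_diff('cy_ppis_base.tab', 'cy_ppis_other.tab', 'in_other')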
def predict_clust(name, sp, nsp, obs=None, exs=None, savef=None, pres=None,
        pd_spcounts=None, cl_kwargs={}, clusts=None, runid=None,
        count_ext=False, cutoff=0.5, n_cvs=7, accept_clust=False,
        obs_fnames=None, base_splits=None, obs_kwargs={}, kfold=3,
        gold_nspecies=2, do_cluster=True, do_2stage_cluster=True,
        cxs_cxppis=None, do_rescue=True, n_rescue=20000, rescue_fracs=20,
        rescue_score=0.9, clstruct=None, **predict_kwargs):
    """
    - obs_kwargs/predict_kwargs: obs_kwargs is merged with predict_kwargs for the
      cross-validation step to keep the two stages consistent.
    - pd_spcounts: supply from ppi.predict_all if nsp > 1.
    - base_splits: supply exs.splits to generate examples from existing
      division of complexes.
    - cxs_cxppis: provide these if you want to export or run the ppi rescue
      clustering; in that case also set accept_clust=True and do_rescue=True.
    """
    savef = savef if savef else ut.bigd(name)+'.pyd'
    print "Will save output to", savef
    runid = runid or random.randrange(0,1000)
    if clusts is None: 
        if pres is None:
            if obs is None:
                obs, pd_spcounts = ppi.predict_all(sp, obs_fnames,
                        save_fname=savef.replace('.pyd',''), nsp=nsp,
                        **obs_kwargs)
            if exs is None:
                cvtest_kwargs = ut.dict_quick_merge(obs_kwargs, predict_kwargs)
                n_cvs = 1 if base_splits is not None else n_cvs
                cvs, cvstd = cvstd_via_median(name, sp, nsp, obs_fnames, kfold,
                        base_splits, n_cvs, **cvtest_kwargs)
                if n_cvs > 1:
                    ut.savepy(cvs, ut.pre_ext(savef, '_cvs_%s' % n_cvs))
                ut.savepy(cvstd, ut.pre_ext(savef, '_cvstd'))
                exs=cvstd.exs
            pres = predict(name, sp, obs, exs.arrfeats, nsp, **predict_kwargs)
            pres.exs = exs
            ut.savepy(pres, ut.pre_ext(savef, '_pres'), check_exists=True) 
        else:
            pres=ut.struct_copy(pres)
            if do_rescue:
                assert obs is not None, "Must supply obs for rescue step"
    merged_splits = pres.exs.splits[1] # splits is (lp_splits, clean_splits)
    if do_cluster:
        if cxs_cxppis is None and clstruct is None:
            if clusts is None and cxs_cxppis is None:
                #if calc_fracs:
                    #cl_kwargs['fracs'] = [cp.find_inflection(pres.ppis, merged_splits,
                        #pres.species, gold_nspecies)]
                clusts, runid = multi_clust(pres.ppis, savef=savef, runid=runid,
                        pres=pres, gold_splits=merged_splits,
                        gold_nspecies=gold_nspecies, **cl_kwargs)
                ut.savepy(clusts, ut.pre_ext(savef, '_clusts_id%s' % runid))
            if do_2stage_cluster:
                clusts2 = multi_stage2_clust(clusts, pres.ppis, runid=runid,
                        **cl_kwargs)
                clstruct = cp.result_stats(sp, merged_splits, clusts2,
                        gold_nspecies) 
                ut.savepy(clstruct, ut.pre_ext(savef, '_clstruct2_id%s' % runid))
            else:
                clstruct = cp.result_stats(sp, merged_splits, clusts, nsp) 
                ut.savepy(clstruct, ut.pre_ext(savef, '_clstruct_id%s' % runid))
        if accept_clust:
            if cxs_cxppis is None:
                pres.cxs, pres.cxppis, pres.ind = cp.select_best(clstruct)
                ut.savepy([pres.cxs,pres.cxppis],
                        ut.pre_ext(savef,'_cxs_cxppis_id%s_ind%s_%scxs'
                            % (runid, pres.ind, len(pres.cxs))))
            else:
                pres.cxs, pres.cxppis = cxs_cxppis
                pres.ind = 0
            if do_rescue:
                # note cl_kwargs aren't passed--would be messy
                pres.cxs, pres.cxppis, pres.ppis_rescue = rescue_ppis(pres,
                        obs, n_rescue, cutoff_fracs=rescue_fracs,
                        cutoff_score=rescue_score)
            cyto_export(pres, merged_splits, name_ext='_clust%s_%scxs' % (pres.ind,
                len(pres.cxs)), pd_spcounts=pd_spcounts, arrdata=obs,
                cutoff=cutoff, count_ext=False, arrdata_ppis=None)
            return pres
        else:
            return pres, clstruct
    else:
        return pres
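# Hypothetical usage sketch (not part of the original source): run the full
# predict-then-cluster pipeline for a two-species analysis, inspecting the
# candidate clusterings before accepting one; the run name, species code, and
# filenames are illustrative placeholders.
def _example_predict_clust(obs_fnames):
    pres, clstruct = predict_clust('Hs_run1', 'Hs', 2, obs_fnames=obs_fnames,
            accept_clust=False, do_rescue=False)
    # after inspecting clstruct, re-run with accept_clust=True (optionally
    # passing cxs_cxppis) to attach the chosen cxs/cxppis to pres.
    return pres, clstruct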