# plot_profiles: for each elution returned by elutions_containing_prots, look
# up ids for `prots` through gt.name2id, find the largest value of e.normarr
# over the rows mapped from those ids, and hand the elution to plot_prots.
# When plot_sums is set, the column totals of e.normarr are computed as well.
# NOTE(review): lines are missing from this definition as it appears here --
# the signature is never closed, the docstring text below has lost its
# quotes, and some calls are cut off part-way through their arguments.
# Restore the missing lines before relying on this function.
def plot_profiles(prots, eluts, sp='Hs', plot_sums=True, shape=None,
    shape: (m,n) = m rows, n columns
    eluts: [el.NormElut(f, sp, norm_rows=False, norm_cols=False) for f in
    import plotting as pl
    gt = seqs.GTrans()
    # NOTE(review): the remaining arguments of this call are missing.
    use_eluts = elutions_containing_prots(eluts, sp, seqs.names2ids(prots),
    # When no shape is given, ask ut.sqrt_shape for one, passing the number of
    # selected elutions plus one.
    shape = shape if shape else ut.sqrt_shape(len(use_eluts)+1)
    fig = pl.figure()
    for i,e in enumerate(use_eluts):
        # First two characters of the file's short name (not used again in the
        # lines visible here).
        sp_target = ut.shortname(e.filename)[:2]
        pids = [gt.name2id[p] for p in prots]
        # Largest value found in any row of e.normarr that e.baseid2inds maps
        # from one of the requested ids; ids absent from the map are skipped.
        protsmax = max([np.max(e.normarr[r]) for p in pids if p in e.baseid2inds for
            r in e.baseid2inds[p]])
        plot_prots(e, pids, e.baseid2inds, protsmax)
        if plot_sums:
            # plot total spectral counts normalized to match biggest peak
            sums = np.sum(e.normarr,axis=0)
            fmax = np.max(sums)
            # NOTE(review): the start of the call that the keyword arguments
            # below belong to is missing.
                    color='k', linestyle='-', linewidth=.5)
    # make legend with all prots
    for p in prots: pl.plot(0,label=p)
# plot_bigprofiles: lay out the selected elutions in stacked subplots,
# eluts_per_plot elutions per subplot, calling plot_big_single for each
# elution and label_xs once per subplot. Returns the number of subplots.
# NOTE(review): lines are missing from this definition as it appears here --
# the docstring text has lost its quotes, one call is cut off, and at least
# one `else:` is gone. Restore them before relying on this function.
def plot_bigprofiles(prots, pids, unnorm_eluts, sp='Hs', min_count=1,
        remove_multi_base=False, gt=None, eluts_per_plot=10,
        do_cluster=True, label_trans=None, do_plot_tree=False,
        rename_fracs=None, colors=None, **kwargs):
    supply EITHER prots OR protids, set other to None
    unnorm_eluts: [el.NormElut(f, sp=sp, norm_cols=False, norm_rows=False) for f in fs]
    import plotting as pl
    if prots is not None:
        # NOTE(review): gt defaults to None, in which case this lookup fails.
        pids = [gt.name2id[p] for p in prots]
    if do_cluster:
        print "clustering"
        # NOTE(review): the remaining arguments of this call are missing.
        pids = cluster_ids(pids, unnorm_eluts, sp, gt=gt, do_plot=do_plot_tree, 
    if gt is not None:
        prots = [gt.id2name[pid] for pid in pids if pid in gt.id2name] #re-order to match
        # NOTE(review): the two lines below presumably belong to a missing
        # `else:`; as written they always replace the names with the ids.
        prots = pids
        print "No gene names provided--labeling with ids."
    if label_trans: 
        print "Translating names for display."
        # Translate displayed names from base ids according to provided dict
        #prots = [gt.id2name[pid] for pid in pids]
        prots = [label_trans.get(p,p) for p in prots]
    # These reversals happen in place, so lists passed in by the caller
    # (pids, colors) are reversed for the caller too.
    prots.reverse(); pids.reverse(); # put them top to bottom
    if colors is not None: colors.reverse()
    print "%s proteins" % len(pids)
    use_eluts = elutions_containing_prots(unnorm_eluts, sp, pids, min_count)
    # NOTE(review): under Python 2 without `from __future__ import division`,
    # int / int floors before np.ceil is applied, which would leave the last
    # partial group of elutions unplotted -- confirm against the file header.
    nplots = int(np.ceil(len(use_eluts) / eluts_per_plot))
    maxfracs = 0
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        # The slice of elutions drawn in this subplot.
        plot_eluts = use_eluts[iplot*eluts_per_plot: (iplot+1)*eluts_per_plot]
        frac_names = [ut.shortname(e.filename) for e in plot_eluts]
        if rename_fracs:
            frac_names = [rename_fracs.get(n,n) for n in frac_names]
        # NOTE(review): nothing visible here ever appends to startcols, so
        # startcols[-1] stays 0; the appending line is presumably missing.
        startcols = [0]
        for i,e in enumerate(plot_eluts):
            freqarr = ut.normalize_fracs(e.normarr, norm_rows=False)
            sp_target = ut.shortname(e.filename)[:2]
            # Largest value in any row of freqarr mapped from a requested id.
            protsmax = max([np.max(freqarr[r]) for p in pids if p in
                e.baseid2inds for r in e.baseid2inds[p]])
            plot_big_single(freqarr, pids, e.baseid2inds, protsmax,
                    startcols[-1], colors=colors)
        label_xs(startcols, frac_names)
        # Track the largest final start column seen across subplots.
        maxfracs = maxfracs if maxfracs > startcols[-1] else startcols[-1]
    # NOTE(review): this second pass only selects each subplot; whatever it
    # did with maxfracs is not visible here.
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
    return nplots
# prot_conservation: pair up protein counts from two species through a
# one-to-one ortholog dict, compute the Spearman correlation of the paired
# counts with scipy.stats.spearmanr, and annotate the current plot with the
# correlation and the number of pairs. Returns (pc1, pc2) when return_data is
# set.
# NOTE(review): lines are missing from this definition as it appears here --
# the signature is never closed, the docstring text has lost its quotes, the
# body of `if sp1==sp2:` is gone and so is an `else:`. Restore them before
# relying on this function.
def prot_conservation(fs,sp1,sp2, gridsize=30, od11=None, return_data=False,
        filter_both=True, use_title=True, extent=[-22,-6,-22,-6], fontsize=18,
    Currently only uses 1 to 1 orthologs, so odict should be a simple flat dict
    of genesa:genesb.
    # NOTE(review): the body of this check is missing.
    if sp1==sp2:
    # Split the files by the first two characters of their short names.
    fs1,fs2 = [[f for f in fs if ut.shortname(f)[:2]==sp] for sp in sp1,sp2]
    # Use the supplied ortholog dict when given, otherwise build one.
    odict = orth.odict_1to1(sp1,sp2) if od11 == None else od11
    pc1_all,pc2_all = [prot_counts(fs) for fs in fs1,fs2]
    if filter_both:
        # Keep only ortholog pairs with a positive count on both sides.
        ps_use = [p for p in odict if (pc1_all[p]>0 and pc2_all[odict[p]]>0)]
        # NOTE(review): the line below presumably belongs to a missing
        # `else:`; as written it always overrides the filter above.
        ps_use = [p for p in pc1_all if p in odict]
    # NOTE(review): zip(*[]) cannot be unpacked into two names, so an empty
    # ps_use raises here.
    pc1,pc2 = zip(*[(pc1_all[p], pc2_all[odict[p]]) for p in ps_use])
    # NOTE(review): the log2 values are not used in the lines visible here.
    logpc1,logpc2 = [np.log2(pc) for pc in pc1,pc2]
    plot(extent[:2],extent[2:],'k:', linewidth=1)
    #if use_title:
        #xlabel('%s log2 unique spectral counts' %sp1)
        #ylabel('%s log2 unique spectral counts' %sp2)
        #title('%s-%s: spearmanR: %0.2f, %s 1-1 nonzero ortholog pairs' %
                #(sp1,sp2, scipy.stats.spearmanr(pc1,pc2)[0], len(pc1)))
    rval = scipy.stats.spearmanr(pc1,pc2)[0]
    annotate('R=%0.2f\nN=%s' % (rval, len(pc1)), xy=(.05,.7),
            xycoords=("axes fraction"), fontsize=fontsize)
    if return_data:
        return pc1,pc2
# maybe_move: look up the folder mapped to a file's short name in file2folder
# and rename the file into that folder with os.rename.
# NOTE(review): lines are missing from this definition as it appears here --
# the docstring text has lost its quotes, and the statements that should
# follow the "File not found", "No mapping", "Creating directory" and
# "File exists" messages (early exits, directory creation, an `else:`) are
# gone. Restore them before relying on this function.
def maybe_move(fpath, file2folder, remove_final_underscore):
    For moving a file into the proper folder based on a text mapping file.
    Format: fbase\tfolder
    Ex: WAN1100427_OT2_Celegans_HCW_P1A04       Ce_1104
    Purpose of remove_final_underscore is for non-exact mappings.  If using
    the mapping file from the raw files, should not be neede.
    if not os.path.exists(fpath):
        print "File not found:", fpath
    basename = ut.shortname(fpath)
    if remove_final_underscore:
        # Keep only the first three underscore-separated fields when the name
        # has more than two of them.
        basename = ('_'.join(basename.split('_')[:3]) 
                if len(basename.split('_'))>2 else basename)
    if not basename in file2folder:
        print "No mapping for file:", fpath, basename
    # NOTE(review): with no early exit above, an unmapped name raises
    # KeyError on the next line.
    folder = file2folder[basename]
    if not os.path.exists(folder):
        # NOTE(review): no directory is actually created in the visible code.
        print "Creating directory", folder
    # Same file name, placed inside the mapped folder.
    newpath = os.path.join(folder, os.path.split(fpath)[1])
    if os.path.exists(newpath):
        print "File exists:", newpath
        # NOTE(review): the two lines below presumably belong to a missing
        # `else:`; as written the rename only runs when the target exists.
        print "Moving to", newpath
        os.rename(fpath, newpath)
 def __init__(self, filename, sp_base="Hs", norm_rows=False, norm_cols=False):
     """
     Load the elution stored at `filename` and keep a normalized copy of its
     matrix along with lookups for its proteins.

     Attributes set:
       prots, filename -- copied from the loaded elution.
       normarr -- ut.normalize_fracs applied to the elution matrix with the
           given norm_rows / norm_cols flags.
       pinv -- ut.list_inv_to_dict of the protein list.
       baseid2inds -- sc.orth_indices for sp_base against the species code
           taken from the first two characters of the file's short name.
     """
     elution = load_elution(filename)
     self.prots = elution.prots
     self.filename = elution.filename
     self.normarr = ut.normalize_fracs(
             elution.mat, norm_rows=norm_rows, norm_cols=norm_cols)
     self.pinv = ut.list_inv_to_dict(elution.prots)
     target_code = ut.shortname(elution.filename)[:2]
     self.baseid2inds = sc.orth_indices(
             sp_base, target_code, elution.prots, False)
def plot_sums(fs, shape=None):
    """
    Plot the column totals of every elution file in `fs`.

    fs: elution file names, each loaded with el.load_elution.
    shape: optional grid shape; when falsy one is requested from
        ut.sqrt_shape(len(fs)).
    """
    import plotting as pl
    if not shape:
        shape = ut.sqrt_shape(len(fs))
    for fname in fs:
        elution = el.load_elution(fname)
        col_totals = np.sum(elution.mat, axis=0)
        # The totals are indexed as a 2-D, single-row result.
        xs = range(col_totals.shape[1])
        pl.plot(xs, col_totals[0,:].T)
def multi_identities(input_fname, out_dir):
    """
    Run all_identities once per row of the list-of-lists file `input_fname`
    and write each result as a tab file inside `out_dir`.

    Every row supplies: desc, prots_fname, source_fasta, odict, target.
    The output is named <target short name up to its first '.'>_<desc>.txt.
    """
    for row in ut.load_lol(input_fname):
        desc, prots_fname, source_fasta, odict, target = row
        print("%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s" % (
            desc, prots_fname, source_fasta, odict, target))
        prots = ut.load_list(prots_fname)
        sims = all_identities(prots, odict, source_fasta, target)
        target_stem = ut.shortname(target).split('.')[0]
        out_name = "".join([target_stem, "_", desc, ".txt"])
        ut.write_tab_file(sims, os.path.join(out_dir, out_name), islist=True)
def all_filtered_pairs(fnames, score_keys, cutoff=0.5, sp_base=None, verbose=True, allow_singles=True):
    """
    Union of the pairs that pass `cutoff` over every combination of score key
    and elution file.

    For each (score key, file) combination the elution is loaded, its passing
    pairs are computed with passing_pairs, translated with translate_pairs
    using sp_base and the file's species code, and merged into the running
    result with pd.pd_union_novals. Returns the merged pair dict.
    """
    merged = pd.PairDict([])
    for skey, fname in it.product(score_keys, fnames):
        if verbose:
            print("%s %s %s" % (skey, cutoff, ut.shortname(fname)))
        passing = passing_pairs(load_elution(fname), skey, cutoff,
                allow_singles)
        translated = translate_pairs(passing, sp_base, file_sp(fname))
        merged = pd.pd_union_novals(merged, translated)
    return merged
# countfs: print, per folder, a count derived from the keys of fmap next to a
# count derived from the short names of `filenames`, then list the folders
# whose two counts are equal under a "finished:" heading.
# NOTE(review): the comprehension that builds `output` is cut off (the
# iterable after `for folder in` is missing). Restore it before relying on
# this function.
def countfs(fmap, filenames):
    fshorts = [ut.shortname(f) for f in filenames]
    # counts[0] comes from every key of fmap, counts[1] from the files given.
    counts = [count_dict_values(fmap, x) for x in [fmap.keys(), fshorts]]
    print "folder original_counts current_counts"
    output = [(folder, counts[0][folder], counts[1][folder]) for folder in
    for x in output: print x[0], x[1], x[2]
    print "\n\nfinished:"
    for x in output: 
        # A folder is reported once both counts match.
        if x[1]==x[2]: print x[0]
# score_whole: for every elution in which all of `gids` appear in
# e.baseid2inds, build a SubsplitScore from distance_set over the rows mapped
# from those ids; returns SplitScores(gids, scores).
# NOTE(review): the SubsplitScore call is cut off and no line visible here
# adds `score` to `scores`. Restore the missing lines before relying on this
# function.
def score_whole(gids, norm_eluts):
    scores = []
    for e in norm_eluts:        
        # min over booleans is True only when every gid is present.
        # NOTE(review): an empty gids makes min() raise ValueError.
        if min([i in e.baseid2inds for i in gids])==True:
            rows = [ind for gid in gids if gid in e.baseid2inds 
                for ind in e.baseid2inds[gid]] 
            score = SubsplitScore(gids, distance_set(e.normarr, rows), None,
    return SplitScores(gids, scores)
def supporting_ppis_separate(ppis, fnames, score_keys, sp_base, cutoff=0.5, verbose=True):
    """
    Collect, separately for each species, the orthologous pairs that support
    each ppi.

    ppis: sequence of interactions; the first two items of each are the ids.
    fnames: elution file names; the species code of each comes from file_sp.
    score_keys: score names passed to passing_pairs.
    sp_base: species the ppi ids belong to, used to build ortholog dicts.
    cutoff: threshold passed to passing_pairs.
    verbose: print the score key and file as each combination is processed.

    Returns one list per ppi: list(ppi) followed by a single list that holds,
    for every species, a one-element list with that species' supporting pair
    keys.
    """
    sps = set([file_sp(f) for f in fnames])
    print("Species: %s" % " ".join(sps))
    ppis_support = [dict([(s, pd.PairDict([])) for s in sps]) for p in ppis]
    eluts = [load_elution(f) for f in fnames]
    for elut, skey in it.product(eluts, score_keys):
        sp = file_sp(elut.filename)
        if verbose:
            print("%s %s" % (skey, ut.shortname(elut.filename)))
        od = orth.odict(sp_base, sp)
        try:
            new_pairs = passing_pairs(elut, skey, cutoff)
        except IOError:
            print("No file for %s %s" % (ut.shortname(elut.filename), skey))
            # Nothing to test for this combination; without skipping,
            # new_pairs would be undefined or left over from the previous one.
            continue
        for p, dsupport in zip(ppis, ppis_support):
            for opair in orth.orth_pairs(p[:2], od):
                opair = tuple(opair)
                if new_pairs.contains(opair):
                    # Only the key matters, so the stored value is None.
                    dsupport[sp].set(opair, None)
    return [list(p) + [[dsupport[sp].d.keys()] for sp in sps] for p, dsupport in zip(ppis, ppis_support)]
def check(fasta, protq, do_convert):
    """
    Report how many ids in the elution file `protq` are protein ids and how
    many are gene ids according to `fasta`, optionally converting the file.

    fasta: passed to seqs.prots2genes to build the protein -> gene mapping.
    protq: elution file whose ids are checked.
    do_convert: when true and not every id is already a gene id, call
        seqs.elut_p2g(protq, p2g).
    """
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    fprots = el.load_elution(protq).prots
    ntotal = len(fprots)
    print("checking %s" % ut.shortname(protq))
    nprotsfound = len([p for p in fprots if p in p2g])
    print("proteins: %s of %s" % (nprotsfound, ntotal))
    ngenesfound = len([p for p in fprots if p in g2p])
    print("genes: %s of %s" % (ngenesfound, ntotal))
    # Convert only when at least one id is not already a known gene id.
    if do_convert and ngenesfound < ntotal:
        print("converting prots to genes: %s" % protq)
        seqs.elut_p2g(protq, p2g)
def supporting_ppis(ppis, fnames, score_keys, sp_base, cutoff=0.5, verbose=True):
    """
    For each ppi, collect the orthologous pairs that pass `cutoff` in any of
    the given elution files under any of the given score keys.

    ppis: sequence of interactions; the first two items of each are the ids.
    fnames: elution file names, each loaded with load_elution.
    score_keys: score names passed to passing_pairs.
    sp_base: species the ppi ids belong to, used to build ortholog dicts.
    cutoff: threshold passed to passing_pairs.
    verbose: print the score key and file as each combination is processed.

    Returns one list per ppi: list(ppi) followed by the keys of its
    supporting pairs.
    """
    ppis_support = [pd.PairDict([]) for p in ppis]
    eluts = [load_elution(f) for f in fnames]
    for elut,skey in it.product(eluts, score_keys):
        if verbose:
            print("%s %s" % (skey, ut.shortname(elut.filename)))
        od = orth.odict(sp_base, file_sp(elut.filename))
        new_pairs = passing_pairs(elut, skey, cutoff)
        for p,pdsupport in zip(ppis,ppis_support):
            for opair in orth.orth_pairs(p[:2], od):
                opair = tuple(opair)
                if new_pairs.contains(opair):
                    # Only the key matters, so the stored value is None.
                    pdsupport.set(opair, None)
    return [list(p) + [s.d.keys()] for p,s in zip(ppis, ppis_support)]
def move(fname, fmap):
    """
    For renaming a file based on a mapping old_fname to new_fname.
    NOT for moving a file to mapped folder.  That's the other script.

    fname: path of the file to rename.
    fmap: dict from a file's short name to its new base name; the original
        directory and extension are kept.

    Prints what was done; a file whose short name is not in fmap is left
    untouched and reported as not found.
    """
    basename = ut.shortname(fname)
    fext = os.path.splitext(fname)[1]
    fdir = os.path.split(fname)[0]
    if basename in fmap:
        newname = os.path.join(fdir, fmap[basename] + fext)
        print("moving %s %s" % (fname, newname))
        os.rename(fname, newname)
    else:
        print("not found %s" % fname)
def supporting_ppis(ppis, fnames, score_keys, sp_base, cutoff=0.5, verbose=True):
    """
    Gather, for every ppi, the orthologous pairs passing `cutoff` in any
    elution file under any score key.

    The first two items of each ppi are mapped through the ortholog dict for
    the file's species; each mapped pair found among the file's passing pairs
    is recorded. Returns one list per ppi: list(ppi) plus the keys of the
    recorded pairs.
    """
    support = [pd.PairDict([]) for _ in ppis]
    loaded = [load_elution(fname) for fname in fnames]
    for elut, skey in it.product(loaded, score_keys):
        if verbose:
            print("%s %s" % (skey, ut.shortname(elut.filename)))
        od = orth.odict(sp_base, file_sp(elut.filename))
        passing = passing_pairs(elut, skey, cutoff)
        for ppi, found in zip(ppis, support):
            for candidate in orth.orth_pairs(ppi[:2], od):
                key = tuple(candidate)
                if not passing.contains(key):
                    continue
                found.set(key, None)
    result = []
    for ppi, found in zip(ppis, support):
        result.append(list(ppi) + [found.d.keys()])
    return result
def msb_filter(proj_dir, msb_out_dir, pq_path):
    """
    Filter the pepquant output by keeping only values with spectral counts in
    the msblender output.

    proj_dir: project directory; its short name selects the msblender file
        <msb_out_dir>/<project name><MSB_EXT>.
    msb_out_dir: directory holding the msblender output.
    pq_path: path of the cleaned pepquant elution to filter.

    Writes the filtered elution to pq_path with PQ_CLEAN replaced by PQ_FILT
    and returns that path. Raises AssertionError when the msblender file does
    not exist.
    """
    proj_name = ut.shortname(proj_dir)
    msb_quant_file = os.path.join(msb_out_dir, proj_name+MSB_EXT)
    assert os.path.exists(msb_quant_file), "No filter elution found: %s" % msb_quant_file
    pq_elut, msb_elut = [el.load_elution(f) for f in (pq_path,
        msb_quant_file)]
    pq_elut.mat = el.filter_matching_elution(pq_elut, msb_elut)
    pq_filt_path = pq_path.replace(PQ_CLEAN, PQ_FILT)
    el.write_elution(pq_elut, pq_filt_path)
    return pq_filt_path
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
        remove_multi_base=False, gidscheme=None, allow_singles=True):
    """
    Apply score_array to `arr` for every combination of elution file and
    score.

    - arr: array handed to score_array for each combination.
    - sp_base: base species used when mapping ids with orth_indices.
    - elut_fs: elution file names; the first two characters of each short
      name are taken as the file's species code.
    - scores: score names, each passed to score_array in turn.
    - cutoff: threshold passed to score_array.
    - verbose: print the score and file before each score_array call.
    - remove_multi_base: This is not the method currently used to filter scores
      in cases of orthogroup fan-outs--this is a stricter earlier version. That
      filter is filter_multi_orths(), applied after scoring.
    - gidscheme: not implemented; must be left empty (None or '').
    - allow_singles: when false, the proteins returned by prots_singles for
      each elution are passed to score_array as `singles`.
    """
    # Accept both the declared default (None) and the empty string.
    assert not gidscheme, "Gidscheme not implemented in scoring."
    current_sp = ''
    if remove_multi_base:
        print("Filtering orths: only single base gene in orthogroups.")
    for e,f in [(el.load_elution(fname),fname) for fname in elut_fs]:
        sp_target = ut.shortname(f)[:2]
        if sp_target != current_sp: # Just for status output
            print("Starting first %s file: %s" % (sp_target, ut.shortname(f)))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, e.prots,
                remove_multi_base)
        # singles based on original spec counts
        singles = set([]) if allow_singles else prots_singles(e)
        for score in scores:
            if verbose:
                print("%s %s" % (score, f))
            score_array(arr, e, f, score, cutoff, baseid2inds, singles, lambda prots:
                    orth_indices(sp_base, sp_target, prots, remove_multi_base))
# score_eluts_in_out: for every elution, turn the two id groups in gid_split
# into row-index groups and, when the first group has at least two rows and
# the second at least one, compute distance_set over the first group and
# distance_exclude against it; returns the `scores` list.
# NOTE(review): the distance_exclude and SubsplitScore calls are cut off and
# no line visible here adds `subscore` to `scores`. Restore the missing lines
# before relying on this function.
def score_eluts_in_out(normed_eluts, gid_split, sp='Hs', **kwargs):
    scores = []
    for e in normed_eluts:
        # One list of row indices per id group; ids absent from
        # e.baseid2inds are skipped.
        rowid_split = [[ind for gid in gids if gid in e.baseid2inds 
            for ind in e.baseid2inds[gid]] 
            for gids in gid_split]
        if len(rowid_split[0])>=2 and len(rowid_split[1])>=1:#skip if any empties
            cx_dist = distance_set(e.normarr, rowid_split[0])
            exclude_dist = distance_exclude(e.normarr, rowid_split[0],
            # The elution's own protein ids for the rows actually used.
            used_ids = [[e.prots[r] for r in rows] for rows in rowid_split]
            subscore = SubsplitScore(used_ids, cx_dist, exclude_dist,
    return scores
 def score_apart(self, gi, gj, cutoff=2):
     """
     Sum self.avg_ev - evidence over the features of every elution file in
     which both gi and gj reach `cutoff`.

     gi, gj: ids looked up as a pair in self.pd_clust_feats and individually
         in each entry of self.elut_max (missing ids count as 0).
     cutoff: minimum self.elut_max value both ids need in a file for that
         file's features to contribute.

     A pair that is not in self.pd_clust_feats is scored with zero evidence
     for every feature. Returns the accumulated sum.
     """
     pair = self.pd_clust_feats.find((gi,gj))
     if pair:
         evs = self.pd_clust_feats.d[pair]
     else:
         evs = [0]*len(self.feat_names)
     cumsum = 0
     for elutf in self.elut_max:
         maxi,maxj = [self.elut_max[elutf].get(x,0) for x in (gi,gj)]
         if maxi >= cutoff and maxj >= cutoff:
             # Features whose fraction name matches this elution file.
             elut_name = ut.shortname(elutf)
             use_inds = [ind for ind,name in enumerate(self.feat_names) if
                     frac_name(name) == elut_name]
             cumsum += sum([self.avg_ev-evs[ind] for ind in use_inds])
         #elif maxi > cutoff or maxj > cutoff:
     return cumsum
# process: pick (or build, through merge) the pepquant file for a project,
# clean it with elut_clean_prots, then filter it with msb_filter.
# NOTE(review): lines are missing from this definition as it appears here --
# the docstring text has lost its quotes, the body of the first `if` is gone,
# and so is the `else:` before the "No merging." message. Restore them before
# relying on this function.
def process(proj_dir, msb_out_dir, dirnames):
    If a single dirname, just process.
    If multiple, merge then process.
    proj_name = ut.shortname(proj_dir)
    # NOTE(review): the body of this check is missing.
    if proj_dir in dirnames and len(dirnames) > 1:
    if dirnames != [proj_dir]:
        pq_path = os.path.join(proj_dir, PQ_OUTFILE)
        merge(proj_dir, dirnames, pq_path)
        # NOTE(review): the two lines below presumably belong to a missing
        # `else:`; as written they override the merged path.
        print "No merging."
        pq_path = os.path.join(proj_dir, PQ_FILE)
    pq_clean_path = os.path.join(proj_dir, proj_name+PQ_CLEAN)
    elut_clean_prots(pq_path, pq_clean_path)
    # NOTE(review): the filtered path is not used or returned in the lines
    # visible here.
    pq_filt_path = msb_filter(proj_dir, msb_out_dir, pq_clean_path)
# merge: load the PQ_FILE elution from every directory in `dirnames` (ordered
# by sort_numbered), fold them together with el.combine_elutions, and write
# the result to pq_new_path. Refuses to run when pq_new_path already exists.
# NOTE(review): lines are missing from this definition as it appears here --
# the docstring text has lost its quotes and its ending, and the body of
# `if not os.path.exists(proj_dir):` is gone. Restore them before relying on
# this function.
def merge(proj_dir, dirnames, pq_new_path):
    Combine pepquant quantitation from project_1 (etc) PQ_FILE into
    if not os.path.exists(proj_dir):
    proj_name = ut.shortname(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    # sort_numbered returns (name, number) pairs; ut.i0 presumably keeps the
    # names -- confirm against ut.
    dirnames = ut.i0(sort_numbered(dirnames))
    #print "Sorted dirnames:", dirnames
    pq_files = [os.path.join(d,PQ_FILE) for d in dirnames]
    for f in pq_files:
        if not os.path.exists(f):
            print "No Elution File:", f
    # Missing files are reported above and skipped here.
    # NOTE(review): reduce raises TypeError when no file exists at all.
    eluts = (el.load_elution(f) for f in pq_files if os.path.exists(f))
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
def name_score(fname, score):
    """Return the short name of `fname` joined to `score` by an underscore."""
    short = ut.shortname(fname)
    return short + '_' + score
def file_sp(filename):
    """Return the first two characters of the short name of `filename`."""
    short = ut.shortname(filename)
    return short[:2]
def sort_numbered(filenames):
    """
    Pair every name with the integer after its last underscore and return the
    (name, number) pairs ordered by that number.

    Names with equal numbers keep their input order. Raises ValueError when
    the text after the last underscore is not an integer.
    """
    def trailing_number(name):
        return int(name.split('_')[-1])

    numbered = [(name, trailing_number(name)) for name in filenames]
    return sorted(numbered, key=lambda pair: pair[1])

if __name__ == '__main__':
    # proj_dir, msb_out_dir and at least one directory are all read below, so
    # anything shorter than four argv entries cannot run.
    if len(sys.argv) < 4:
        sys.exit("usage: python proj_dir msb_out_dir directory(s)")
    proj_dir = sys.argv[1]
    msb_out_dir = sys.argv[2]
    dirnames = sys.argv[3:]
    if not os.path.exists(dirnames[0]):
        # Report and do nothing when the first directory is missing.
        print("First directory doesn't exist: %s" % dirnames[0])
    else:
        print("Project: %s First directory %s" % (ut.shortname(proj_dir),
            dirnames[0]))
        process(proj_dir, msb_out_dir, dirnames)
def load_seqs(fasta_fname):
    """
    Read every record of the FASTA file `fasta_fname` with SeqIO.parse, print
    the number of records and their median sequence length, and return the
    records as a list.
    """
    records = list(SeqIO.parse(fasta_fname, "fasta"))
    lengths = [len(rec.seq) for rec in records]
    medlen = np.median(lengths)
    summary = "%s: %s sequences, median length %s" % (
            ut.shortname(fasta_fname), len(records), medlen)
    print(summary)
    return records
def name_score(fname, score):
