def plot_profiles(prots, eluts, sp='Hs', plot_sums=True, shape=None,
        min_count=1):
    """
    shape: (m,n) = m rows, n columns
    eluts: [el.NormElut(f, sp, norm_rows=False, norm_cols=False) for f in fs]
    """
    import plotting as pl
    gt = seqs.GTrans()
    use_eluts = elutions_containing_prots(eluts, sp, seqs.names2ids(prots),
            min_count)
    # Reserve one extra subplot slot for the legend drawn at the end.
    shape = shape if shape else ut.sqrt_shape(len(use_eluts)+1)
    pl.figure()
    pids = [gt.name2id[p] for p in prots]
    for i, e in enumerate(use_eluts):
        pl.subplot(shape[0], shape[1], i+1)
        pl.title(ut.shortname(e.filename))
        protsmax = max([np.max(e.normarr[r]) for p in pids
                if p in e.baseid2inds for r in e.baseid2inds[p]])
        plot_prots(e, pids, e.baseid2inds, protsmax)
        if plot_sums:
            # Plot total spectral counts, normalized to match the biggest peak.
            sums = np.sum(e.normarr, axis=0)
            fmax = np.max(sums)
            pl.plot(range(sums.shape[1]),
                    np.log2(sums[0,:]).T * np.log2(protsmax) * len(pids)
                        / np.log2(fmax),
                    color='k', linestyle='-', linewidth=.5)
    # Make a legend with all prots in the reserved final subplot
    # (subplot indices are 1-based, so 0 was invalid here).
    pl.subplot(shape[0], shape[1], len(use_eluts)+1)
    for p in prots:
        pl.plot(0, label=p)
    pl.legend()
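# Example (a sketch, not part of the original module): plot profiles for a
# few named proteins across all fractionation experiments. Assumes `fs` is a
# list of elution file paths; the gene names are hypothetical placeholders.
#   eluts = [el.NormElut(f, 'Hs', norm_rows=False, norm_cols=False)
#            for f in fs]
#   plot_profiles(['PSMA1', 'PSMA2'], eluts, sp='Hs', plot_sums=True)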
def plot_bigprofiles(prots, pids, unnorm_eluts, sp='Hs', min_count=1,
        remove_multi_base=False, gt=None, eluts_per_plot=10,
        do_cluster=True, label_trans=None, do_plot_tree=False,
        rename_fracs=None, colors=None, **kwargs):
    """
    Supply EITHER prots OR pids; set the other to None.
    unnorm_eluts: [el.NormElut(f, sp=sp, norm_cols=False, norm_rows=False)
                   for f in fs]
    """
    import plotting as pl
    if prots is not None:
        pids = [gt.name2id[p] for p in prots]
    if do_cluster:
        print "clustering"
        pids = cluster_ids(pids, unnorm_eluts, sp, gt=gt,
                do_plot=do_plot_tree, **kwargs)
    if gt is not None:
        # Re-order names to match the (possibly clustered) id order.
        prots = [gt.id2name[pid] for pid in pids if pid in gt.id2name]
    else:
        prots = pids
        print "No gene names provided--labeling with ids."
    if label_trans:
        print "Translating names for display."
        # Translate displayed names from base ids using the provided dict.
        prots = [label_trans.get(p,p) for p in prots]
    # Reverse so proteins are drawn top to bottom.
    prots.reverse(); pids.reverse()
    if colors is not None:
        colors.reverse()
    print "%s proteins" % len(pids)
    use_eluts = elutions_containing_prots(unnorm_eluts, sp, pids, min_count)
    # Float division here: in python 2, integer division would make ceil()
    # a no-op and drop the final partial plot.
    nplots = int(np.ceil(len(use_eluts) / float(eluts_per_plot)))
    maxfracs = 0
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        plot_eluts = use_eluts[iplot*eluts_per_plot : (iplot+1)*eluts_per_plot]
        frac_names = [ut.shortname(e.filename) for e in plot_eluts]
        if rename_fracs:
            frac_names = [rename_fracs.get(n,n) for n in frac_names]
        startcols = [0]
        for i, e in enumerate(plot_eluts):
            freqarr = ut.normalize_fracs(e.normarr, norm_rows=False)
            protsmax = max([np.max(freqarr[r]) for p in pids
                    if p in e.baseid2inds for r in e.baseid2inds[p]])
            plot_big_single(freqarr, pids, e.baseid2inds, protsmax,
                    startcols[-1], colors=colors)
            startcols.append(startcols[-1] + freqarr.shape[1])
        label_ys(prots)
        label_xs(startcols, frac_names)
        pl.grid(False)
        maxfracs = max(maxfracs, startcols[-1])
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        pl.xlim(0, maxfracs)
    pl.subplots_adjust(hspace=5.0/len(prots))
    return nplots
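# Example (a sketch): plot clustered profiles by gene name, ten fractionation
# experiments per plot row. Assumes `unnorm_eluts` was built as in the
# docstring; the gene names are hypothetical placeholders.
#   gt = seqs.GTrans()
#   plot_bigprofiles(['PSMA1', 'PSMA2', 'PSMA3'], None, unnorm_eluts,
#           sp='Hs', gt=gt, eluts_per_plot=10)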
def prot_conservation(fs, sp1, sp2, gridsize=30, od11=None, return_data=False,
        filter_both=True, use_title=True, extent=[-22,-6,-22,-6],
        fontsize=18, **kwargs):
    """
    Currently only uses 1-to-1 orthologs, so the odict should be a simple
    flat dict of geneA:geneB.
    """
    if sp1 == sp2:
        return
    fs1, fs2 = [[f for f in fs if ut.shortname(f)[:2]==sp]
            for sp in (sp1, sp2)]
    odict = orth.odict_1to1(sp1, sp2) if od11 is None else od11
    pc1_all, pc2_all = [prot_counts(flist) for flist in (fs1, fs2)]
    if filter_both:
        # Keep only ortholog pairs with nonzero counts in both species.
        ps_use = [p for p in odict
                if pc1_all[p] > 0 and pc2_all[odict[p]] > 0]
    else:
        ps_use = [p for p in pc1_all if p in odict]
    pc1, pc2 = zip(*[(pc1_all[p], pc2_all[odict[p]]) for p in ps_use])
    logpc1, logpc2 = [np.log2(pc) for pc in (pc1, pc2)]
    plot(extent[:2], extent[2:], 'k:', linewidth=1)
    hexbin(logpc1, logpc2, gridsize=gridsize, **kwargs)
    #if use_title:
        #xlabel('%s log2 unique spectral counts' % sp1)
        #ylabel('%s log2 unique spectral counts' % sp2)
        #title('%s-%s: spearmanR: %0.2f, %s 1-1 nonzero ortholog pairs' %
                #(sp1, sp2, scipy.stats.spearmanr(pc1,pc2)[0], len(pc1)))
    rval = scipy.stats.spearmanr(pc1, pc2)[0]
    annotate('R=%0.2f\nN=%s' % (rval, len(pc1)), xy=(.05,.7),
            xycoords="axes fraction", fontsize=fontsize)
    if return_data:
        return pc1, pc2
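# Example (a sketch): compare log2 unique spectral counts between two species
# over their 1-to-1 orthologs. Assumes `fs` mixes elution files from both
# species, named with two-letter species prefixes as elsewhere in this module.
#   prot_conservation(fs, 'Hs', 'Mm', gridsize=30)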
def maybe_move(fpath, file2folder, remove_final_underscore):
    """
    Move a file into the proper folder based on a text mapping file.
    Format: fbase\tfolder
    Ex: WAN1100427_OT2_Celegans_HCW_P1A04	Ce_1104
    remove_final_underscore exists for non-exact mappings; if the mapping
    file was built from the raw filenames, it should not be needed.
    """
    if not os.path.exists(fpath):
        print "File not found:", fpath
        return
    basename = ut.shortname(fpath)
    if remove_final_underscore:
        basename = ('_'.join(basename.split('_')[:3])
                if len(basename.split('_')) > 2 else basename)
    if not basename in file2folder:
        print "No mapping for file:", fpath, basename
        return
    folder = file2folder[basename]
    if not os.path.exists(folder):
        print "Creating directory", folder
        os.mkdir(folder)
    newpath = os.path.join(folder, os.path.split(fpath)[1])
    if os.path.exists(newpath):
        print "File exists:", newpath
    else:
        print "Moving to", newpath
        os.rename(fpath, newpath)
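# Example (a sketch, not part of the original script): build the file2folder
# dict from the tab-separated mapping file described above, then try each
# file. ut.load_lol is used as in multi_identities(); the filenames here are
# hypothetical.
#   import glob
#   file2folder = dict(ut.load_lol('file_folder_mapping.txt'))
#   for f in glob.glob('WAN*.mzXML'):
#       maybe_move(f, file2folder, remove_final_underscore=False)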
def __init__(self, filename, sp_base="Hs", norm_rows=False, norm_cols=False): e = load_elution(filename) self.prots = e.prots self.filename = e.filename self.normarr = ut.normalize_fracs(e.mat, norm_rows=norm_rows, norm_cols=norm_cols) self.pinv = ut.list_inv_to_dict(e.prots) sp_target = ut.shortname(e.filename)[:2] self.baseid2inds = sc.orth_indices(sp_base, sp_target, e.prots, False)
def plot_sums(fs, shape=None):
    import plotting as pl
    shape = shape if shape else ut.sqrt_shape(len(fs))
    for i, f in enumerate(fs):
        e = el.load_elution(f)
        pl.subplot(shape[0], shape[1], i+1)
        pl.title(ut.shortname(f))
        sums = np.sum(e.mat, axis=0)
        pl.plot(range(sums.shape[1]), sums[0,:].T)
def multi_identities(input_fname, out_dir):
    input_list = ut.load_lol(input_fname)
    for desc, prots_fname, source_fasta, odict, target in input_list:
        print "%s, proteins: %s\nsource: %s\nodict: %s\ntarget: %s" % (desc,
                prots_fname, source_fasta, odict, target)
        prots = ut.load_list(prots_fname)
        sims = all_identities(prots, odict, source_fasta, target)
        out_fname = os.path.join(out_dir,
                ut.shortname(target).split('.')[0] + "_" + desc + ".txt")
        ut.write_tab_file(sims, out_fname, islist=True)
def all_filtered_pairs(fnames, score_keys, cutoff=0.5, sp_base=None,
        verbose=True, allow_singles=True):
    allpairs = pd.PairDict([])
    for skey, f in it.product(score_keys, fnames):
        if verbose:
            print skey, cutoff, ut.shortname(f)
        elut = load_elution(f)
        newpairs = passing_pairs(elut, skey, cutoff, allow_singles)
        newpairs = translate_pairs(newpairs, sp_base, file_sp(f))
        allpairs = pd.pd_union_novals(allpairs, newpairs)
    return allpairs
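# Example (a sketch): pool all protein pairs passing a 0.5 cutoff on each of
# several per-elution scores, translated into Hs identifiers. The score key
# names are hypothetical--use whichever keys passing_pairs() recognizes.
#   pairs = all_filtered_pairs(fnames, ['euclidean', 'poisson'], cutoff=0.5,
#           sp_base='Hs')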
def countfs(fmap, filenames):
    fshorts = [ut.shortname(f) for f in filenames]
    counts = [count_dict_values(fmap, x) for x in [fmap.keys(), fshorts]]
    print "folder original_counts current_counts"
    output = [(folder, counts[0][folder], counts[1][folder])
            for folder in sorted(set(fmap.values()))]
    for x in output:
        print x[0], x[1], x[2]
    print "\n\nfinished:"
    for x in output:
        if x[1] == x[2]:
            print x[0]
def score_whole(gids, norm_eluts):
    scores = []
    for e in norm_eluts:
        # Only score elutions that contain every gene in the set.
        if all(i in e.baseid2inds for i in gids):
            rows = [ind for gid in gids if gid in e.baseid2inds
                    for ind in e.baseid2inds[gid]]
            score = SubsplitScore(gids, distance_set(e.normarr, rows), None,
                    ut.shortname(e.filename))
            scores.append(score)
    return SplitScores(gids, scores)
def supporting_ppis_separate(ppis, fnames, score_keys, sp_base, cutoff=0.5,
        verbose=True):
    sps = set([file_sp(f) for f in fnames])
    print "Species:", " ".join(sps)
    ppis_support = [dict([(s, pd.PairDict([])) for s in sps]) for p in ppis]
    eluts = [load_elution(f) for f in fnames]
    for elut, skey in it.product(eluts, score_keys):
        sp = file_sp(elut.filename)
        if verbose:
            print skey, ut.shortname(elut.filename)
        od = orth.odict(sp_base, sp)
        try:
            new_pairs = passing_pairs(elut, skey, cutoff)
        except IOError:
            print "No file for %s %s" % (ut.shortname(elut.filename), skey)
            continue
        for p, dsupport in zip(ppis, ppis_support):
            for opair in orth.orth_pairs(p[:2], od):
                opair = tuple(opair)
                if new_pairs.contains(opair):
                    dsupport[sp].set(opair, None)
    # One supporting-pair list per species, matching supporting_ppis() (the
    # original double-wrapped each keys list in an extra [..]).
    return [list(p) + [dsupport[sp].d.keys() for sp in sps]
            for p, dsupport in zip(ppis, ppis_support)]
def check(fasta, protq, do_convert):
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    fprots = el.load_elution(protq).prots
    print "checking", ut.shortname(protq)
    print "proteins: %s of %s" % (len([p for p in fprots if p in p2g]),
            len(fprots))
    ngenesfound = len([p for p in fprots if p in g2p])
    print "genes: %s of %s" % (ngenesfound, len(fprots))
    if do_convert and ngenesfound < len(fprots):
        print "converting prots to genes:", protq
        seqs.elut_p2g(protq, p2g)
def supporting_ppis(ppis, fnames, score_keys, sp_base, cutoff=0.5,
        verbose=True):
    ppis_support = [pd.PairDict([]) for p in ppis]
    eluts = [load_elution(f) for f in fnames]
    for elut, skey in it.product(eluts, score_keys):
        if verbose:
            print skey, ut.shortname(elut.filename)
        od = orth.odict(sp_base, file_sp(elut.filename))
        new_pairs = passing_pairs(elut, skey, cutoff)
        for p, pdsupport in zip(ppis, ppis_support):
            for opair in orth.orth_pairs(p[:2], od):
                opair = tuple(opair)
                if new_pairs.contains(opair):
                    pdsupport.set(opair, None)
    return [list(p) + [s.d.keys()] for p, s in zip(ppis, ppis_support)]
def move(fname, fmap):
    """
    For renaming a file based on a mapping of old_fname to new_fname.
    NOT for moving a file to its mapped folder--that's the other script.
    """
    basename = ut.shortname(fname)
    fext = os.path.splitext(fname)[1]
    fdir = os.path.split(fname)[0]
    if basename in fmap:
        newname = os.path.join(fdir, fmap[basename] + fext)
        print "moving", fname, newname
        os.rename(fname, newname)
    else:
        print "not found", fname
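# Example (a sketch): apply a two-column old_name -> new_name mapping to a
# batch of files; the mapping filename is hypothetical.
#   fmap = dict(ut.load_lol('rename_map.txt'))
#   for f in filenames:
#       move(f, fmap)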
def msb_filter(proj_dir, msb_out_dir, pq_path):
    """
    Filter the pepquant output, keeping only values with spectral counts in
    the msblender output.
    """
    proj_name = ut.shortname(proj_dir)
    msb_quant_file = os.path.join(msb_out_dir, proj_name + MSB_EXT)
    assert os.path.exists(msb_quant_file), \
            "No filter elution found: %s" % msb_quant_file
    pq_elut, msb_elut = [el.load_elution(f)
            for f in (pq_path, msb_quant_file)]
    pq_elut.mat = el.filter_matching_elution(pq_elut, msb_elut)
    pq_filt_path = pq_path.replace(PQ_CLEAN, PQ_FILT)
    el.write_elution(pq_elut, pq_filt_path)
    return pq_filt_path
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
        remove_multi_base=False, gidscheme=None, allow_singles=True):
    """
    remove_multi_base: Not the method currently used to filter scores in
    cases of orthogroup fan-outs--this is a stricter, earlier version. The
    current filter is feature.py:filter_multi_orths(), applied after scoring.
    """
    assert gidscheme == '', "Gidscheme not implemented in scoring."
    current_sp = ''
    if remove_multi_base:
        print "Filtering orths: only single base gene in orthogroups."
    for e, f in [(el.load_elution(f), f) for f in elut_fs]:
        sp_target = ut.shortname(f)[:2]
        if sp_target != current_sp:
            # Just for status output.
            print "Starting first %s file: %s" % (sp_target, ut.shortname(f))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, e.prots,
                remove_multi_base)
        # Singles are based on the original spectral counts.
        singles = set([]) if allow_singles else prots_singles(e)
        for score in scores:
            if verbose:
                print score, f
            score_array(arr, e, f, score, cutoff, baseid2inds, singles,
                    lambda prots: orth_indices(sp_base, sp_target, prots,
                        remove_multi_base))
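# Example (a sketch): score every elution file with two score types into a
# pre-built feature array. `arr` and the score names are assumptions--they
# must match what score_array() expects--and note the assert above requires
# gidscheme=''.
#   score_array_multi(arr, 'Hs', elut_fs, ['euclidean', 'poisson'], 0.5,
#           gidscheme='')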
def score_eluts_in_out(normed_eluts, gid_split, sp='Hs', **kwargs):
    scores = []
    for e in normed_eluts:
        rowid_split = [[ind for gid in gids if gid in e.baseid2inds
                for ind in e.baseid2inds[gid]] for gids in gid_split]
        # Skip this elution if either side of the split is too empty.
        if len(rowid_split[0]) >= 2 and len(rowid_split[1]) >= 1:
            cx_dist = distance_set(e.normarr, rowid_split[0])
            exclude_dist = distance_exclude(e.normarr, rowid_split[0],
                    rowid_split[1])
            used_ids = [[e.prots[r] for r in rows] for rows in rowid_split]
            subscore = SubsplitScore(used_ids, cx_dist, exclude_dist,
                    ut.shortname(e.filename))
            scores.append(subscore)
    return scores
def score_apart(self, gi, gj, cutoff=2):
    pair = self.pd_clust_feats.find((gi, gj))
    if pair:
        evs = self.pd_clust_feats.d[pair]
    else:
        evs = [0] * len(self.feat_names)
    cumsum = 0
    for elutf in self.elut_max:
        maxi, maxj = [self.elut_max[elutf].get(x, 0) for x in (gi, gj)]
        # Only count fractionations where both genes pass the count cutoff.
        if maxi >= cutoff and maxj >= cutoff:
            use_inds = [ind for ind, name in enumerate(self.feat_names)
                    if frac_name(name) == ut.shortname(elutf)]
            cumsum += sum([self.avg_ev - evs[ind] for ind in use_inds])
    return cumsum
def process(proj_dir, msb_out_dir, dirnames):
    """
    If a single dirname, just process. If multiple, merge then process.
    """
    proj_name = ut.shortname(proj_dir)
    if proj_dir in dirnames and len(dirnames) > 1:
        dirnames.remove(proj_dir)
    if dirnames != [proj_dir]:
        pq_path = os.path.join(proj_dir, PQ_OUTFILE)
        merge(proj_dir, dirnames, pq_path)
    else:
        print "No merging."
        pq_path = os.path.join(proj_dir, PQ_FILE)
    pq_clean_path = os.path.join(proj_dir, proj_name + PQ_CLEAN)
    elut_clean_prots(pq_path, pq_clean_path)
    pq_filt_path = msb_filter(proj_dir, msb_out_dir, pq_clean_path)
def merge(proj_dir, dirnames, pq_new_path):
    """
    Combine pepquant quantitation from project_1 (etc.) PQ_FILE into
    project+PQ_NEW.
    """
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    dirnames = ut.i0(sort_numbered(dirnames))
    #print "Sorted dirnames:", dirnames
    pq_files = [os.path.join(d, PQ_FILE) for d in dirnames]
    for f in pq_files:
        if not os.path.exists(f):
            print "No Elution File:", f
    eluts = (el.load_elution(f) for f in pq_files if os.path.exists(f))
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
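# Example (a sketch): merge the numbered run directories for one project,
# writing the combined elution into proj_dir as process() does; the
# directory names are hypothetical.
#   merge('myproj', ['myproj_1', 'myproj_2'],
#           os.path.join('myproj', PQ_OUTFILE))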
def name_score(fname, score): return ut.shortname(fname) + '_' + score
def file_sp(filename): return ut.shortname(filename)[:2]
def sort_numbered(filenames):
    pairs = [(f, int(f.split('_')[-1])) for f in filenames]
    pairs.sort(key=lambda x: x[1])
    return pairs

if __name__ == '__main__':
    # Need at least a project dir, msblender output dir, and one directory.
    if len(sys.argv) < 4:
        sys.exit("usage: python pepquant_post.py proj_dir msb_out_dir "
                "directory(s)")
    proj_dir = sys.argv[1]
    msb_out_dir = sys.argv[2]
    dirnames = sys.argv[3:]
    #print "Directories:", dirnames
    if not os.path.exists(dirnames[0]):
        print "First directory doesn't exist:", dirnames[0]
    else:
        print "Project:", ut.shortname(proj_dir), "First directory", dirnames[0]
        process(proj_dir, msb_out_dir, dirnames)
def load_seqs(fasta_fname):
    records = list(SeqIO.parse(fasta_fname, "fasta"))
    medlen = np.median([len(r.seq) for r in records])
    print "%s: %s sequences, median length %s" % (ut.shortname(fasta_fname),
            len(records), medlen)
    return records