def scorekey_elution(score, elut, recalc_id2inds):
    """
    Build the score matrix for elution `elut` under the score key `score`.

    Recognized keys:
      - 'apex', 'cosine_old', 'cosine': wrap `elut` in ApexScores,
        CosineLazyScores or CosineLazyNew respectively.
      - 'euclidean': pdist_score on elut.mat with row and column
        normalization.
      - 'pq_euc', 'pq_unfilt_euc', 'mq_euc': reload a sibling elution file
        (elut.filename with its extension replaced by a key-specific suffix)
        and score that file with the euclidean metric.
      - 'poisson', 'wcc', 'euc_poisson', 'standard': load precalculated
        scores from elut.filename plus a key-specific suffix.

    - recalc_id2inds: None, or a callable taking a protein list. It is only
      called when the elution is reloaded from a sibling file.

    Return (score_mat, new_id2inds, new_prots). new_prots is the protein
    list of the reloaded elution, and new_id2inds is
    recalc_id2inds(new_prots) when a callable was supplied; both stay None
    for every key that does not reload the elution.

    Raise TypeError for an unrecognized score key.
    """
    # Suffixes of the quantitation-specific elution files that replace elut.
    reload_suffixes = {
        'pq_euc': '_pqmsb_filtmsb.tab',
        'pq_unfilt_euc': '_pqmsb.tab',
        'mq_euc': '.mq_Intensity.tab',
    }
    # Suffixes of the precalculated score files stored next to the elution.
    precalc_suffixes = {
        'poisson': '.corr_poisson',
        'wcc': '.T.wcc_width1',
        'euc_poisson': '.corr_euclidean',
        'standard': '.standard',  # eg elution/testms1
    }
    new_id2inds = None
    new_prots = None
    if score == 'apex':
        score_mat = ApexScores(elut)
    elif score == 'cosine_old':
        score_mat = CosineLazyScores(elut)
    elif score == 'cosine':
        score_mat = CosineLazyNew(elut)
    elif score == 'euclidean':
        score_mat = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                                metric=score)
    elif score in reload_suffixes:
        # Use pepquant specific elution file.
        basename = os.path.splitext(elut.filename)[0]
        elut = el.load_elution(basename + reload_suffixes[score])
        if recalc_id2inds is not None:
            new_id2inds = recalc_id2inds(elut.prots)  # cv framework (arrfeats)
        new_prots = elut.prots
        score_mat = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                                metric='euclidean')
    else:
        base = elut.filename
        if score not in precalc_suffixes:
            # Previously signalled by the accidental str + int TypeError; the
            # exception type is kept, but the message now names the bad key.
            raise TypeError("Unrecognized score key: %r" % (score,))
        score_mat = precalc_scores(base + precalc_suffixes[score])
    return score_mat, new_id2inds, new_prots
def plot_sums(fs, shape=None):
    """
    Draw one subplot per elution file in `fs`, each showing the column sums
    of that file's matrix, titled with the file's short name.

    - shape: (rows, cols) of the subplot grid. When falsy, the grid comes
      from ut.sqrt_shape(len(fs)).

    NOTE(review): the indexing of the summed result (shape[1], [0,:])
    needs a 2-D sum, so e.mat is presumably an np.matrix -- confirm.
    """
    import plotting as pl
    grid = shape or ut.sqrt_shape(len(fs))
    for position, fname in enumerate(fs, 1):
        elut = el.load_elution(fname)
        pl.subplot(grid[0], grid[1], position)
        pl.title(ut.shortname(fname))
        col_totals = np.sum(elut.mat, axis=0)
        xs = range(col_totals.shape[1])
        pl.plot(xs, col_totals[0, :].T)
def elut_gene_maxes(elutfs, geneids):
    """
    For every elution file in `elutfs`, record the maximum matrix value of
    each gene in `geneids` that the file contains.

    Return a nested dict {filename: {geneid: max_value}}. A file in which
    none of the genes are found gets no entry at all.
    """
    maxes = {}
    for fname in elutfs:
        elut = el.load_elution(fname)
        row_of = ut.list_inv_to_dict(elut.prots)
        hits = dict((gid, np.max(elut.mat[row_of[gid]]))
                    for gid in geneids if gid in row_of)
        if hits:
            # setdefault keeps earlier results if a filename is listed twice.
            maxes.setdefault(fname, {}).update(hits)
    return maxes
def check(fasta, protq, do_convert):
    """
    Report how many ids in the elution file `protq` are known as proteins
    and how many as genes, according to the protein->gene map built from
    `fasta`.

    When `do_convert` is true and not every id is already a gene id, the
    elution is handed to seqs.elut_p2g together with the protein->gene map.
    """
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    ids = el.load_elution(protq).prots
    print("checking %s" % (ut.shortname(protq),))
    nprots_found = sum(1 for pid in ids if pid in p2g)
    print("proteins: %s of %s" % (nprots_found, len(ids)))
    ngenes_found = sum(1 for pid in ids if pid in g2p)
    print("genes: %s of %s" % (ngenes_found, len(ids)))
    needs_conversion = ngenes_found < len(ids)
    if do_convert and needs_conversion:
        print("converting prots to genes: %s" % (protq,))
        seqs.elut_p2g(protq, p2g)
def msb_filter(proj_dir, msb_out_dir, pq_path):
    """
    Filter the pepquant elution at `pq_path` against the msblender elution
    of the same project, and write the result next to it.

    The msblender file is looked up as
    msb_out_dir/<short name of proj_dir> + MSB_EXT and must exist
    (checked with an assert). The filtered matrix comes from
    el.filter_matching_elution(pepquant elution, msblender elution).

    Return the output path: `pq_path` with PQ_CLEAN replaced by PQ_FILT.
    If `pq_path` does not contain PQ_CLEAN the path is unchanged, so the
    input file is overwritten.
    """
    msb_quant_file = os.path.join(msb_out_dir,
                                  ut.shortname(proj_dir) + MSB_EXT)
    assert os.path.exists(msb_quant_file), "No filter elution found: %s" % msb_quant_file
    # Load order matches the original: pepquant first, then msblender.
    pq_elut = el.load_elution(pq_path)
    msb_elut = el.load_elution(msb_quant_file)
    pq_elut.mat = el.filter_matching_elution(pq_elut, msb_elut)
    pq_filt_path = pq_path.replace(PQ_CLEAN, PQ_FILT)
    el.write_elution(pq_elut, pq_filt_path)
    return pq_filt_path
def merge(proj_dir, dirnames, pq_new_path):
    """
    Combine pepquant quantitation from project_1 (etc) PQ_FILE into
    project+PQ_NEW.

    - proj_dir: output project directory; created if it does not exist.
    - dirnames: the per-part directories, reordered through
      ut.i0(sort_numbered(dirnames)) before use.
    - pq_new_path: destination file; must not exist yet (checked with an
      assert, after proj_dir has been created).

    Directories without a PQ_FILE are reported on stdout and skipped. The
    remaining elutions are loaded lazily and folded together with
    el.combine_elutions. If only one file exists it is written out without
    combining.

    Raise TypeError (from reduce) when none of the directories holds a
    PQ_FILE, since there is nothing to fold.
    """
    # reduce is a builtin only on Python 2; functools has it on 2.6+ and 3.
    from functools import reduce
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    dirnames = ut.i0(sort_numbered(dirnames))
    pq_files = [os.path.join(d, PQ_FILE) for d in dirnames]
    # Decide once per file whether it exists, so the report and the set of
    # files actually merged cannot disagree.
    found = []
    for pq_file in pq_files:
        if os.path.exists(pq_file):
            found.append(pq_file)
        else:
            print("No Elution File: %s" % (pq_file,))
    eluts = (el.load_elution(pq_file) for pq_file in found)
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
def prot_counts(fs, min_count=2):
    """
    Sum up all the spectral counts for all the proteins in a set of
    fractionations.

    Each file's per-protein row sums are divided by that file's grand total
    and by len(fs) before being accumulated, so every file with counts
    contributes a total of 1/len(fs) before the protein filter is applied.

    Filtered s.t. any returned protein will have at least min_count counts
    in one fraction of one of the fractionations (the protein set comes from
    el.all_prots(fs, min_count=min_count)).

    Files whose counts sum to zero contribute nothing and are skipped.

    Return a collections.defaultdict(float): {prot1:count1, prot2:count2, ...}
    """
    allprots = el.all_prots(fs, min_count=min_count)
    pcounts = collections.defaultdict(float)
    nfiles = len(fs)
    for f in fs:
        e = el.load_elution(f)
        psums = np.sum(np.array(e.mat), axis=1)
        # float() so an integer-typed matrix cannot trigger Python 2 integer
        # division below, which would turn the normalization term into 0.
        frac_sum = float(sum(psums))
        if frac_sum == 0:
            # Nothing to normalize; dividing would yield inf/NaN values or a
            # ZeroDivisionError.
            continue
        norm_term = 1.0 / (frac_sum * nfiles)
        for p, psum in zip(e.prots, psums):
            if p in allprots:
                pcounts[p] += psum * norm_term
    return pcounts
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
                      remove_multi_base=False, gidscheme=None,
                      allow_singles=True):
    """
    Score every elution file in `elut_fs` with every score key in `scores`,
    passing each (file, score) combination to score_array together with
    `arr` and `cutoff`.

    - sp_base: base species, passed to orth_indices together with the
      target species, which is taken to be the first two characters of
      each file's short name.
    - remove_multi_base: This is not the method currently used to filter
      scores in cases of orthogroup fan-outs--this is a stricter earlier
      version. That filter is feature.py: filter_multi_orths(), applied
      after scoring.
    - gidscheme: not implemented in scoring; must be left empty (None or '').
    - allow_singles: when true an empty set is passed as `singles`;
      otherwise prots_singles(elution) is used.
    - verbose: print each (score, file) combination before scoring it.
    """
    # The default is None, so the check must accept None as well as ''.
    assert not gidscheme, "Gidscheme not implemented in scoring."
    current_sp = ''
    if remove_multi_base:
        print("Filtering orths: only single base gene in orthogroups.")
    loaded = [(el.load_elution(fname), fname) for fname in elut_fs]
    for e, f in loaded:
        sp_target = ut.shortname(f)[:2]
        if sp_target != current_sp:  # Just for status output
            print("Starting first %s file: %s" % (sp_target, ut.shortname(f)))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, e.prots,
                                   remove_multi_base)
        # singles based on original spec counts
        singles = set([]) if allow_singles else prots_singles(e)

        # Bind sp_target now so the callback keeps this file's species even
        # if it is invoked after the loop has moved on.
        def recalc_id2inds(prots, sp_target=sp_target):
            return orth_indices(sp_base, sp_target, prots, remove_multi_base)

        for score in scores:
            if verbose:
                print("%s %s" % (score, f))
            score_array(arr, e, f, score, cutoff, baseid2inds, singles,
                        recalc_id2inds)
score_mat, _, new_prots = scorekey_elution(skey, elut, None) if new_prots is not None: arr_prots = np.array(new_prots) rows, cols = np.where(score_mat > thresh) p1s, p2s = [arr_prots[ids] for ids in rows, cols] pairs = ut.zip_exact(p1s, p2s) return pairs if __name__ == '__main__': nargs = len(sys.argv) if nargs < 3: sys.exit("usage: python score.py filename method(poisson|dotproduct|corrcoef|cov) [argument]") fname = sys.argv[1] method = sys.argv[2] methodarg = None if nargs < 4 else int(sys.argv[3]) elut = el.load_elution(fname) if method == 'poisson': corr = traver_corr(elut.mat, repeat=methodarg) if methodarg else \ traver_corr(elut.mat) elif method in ['cosine_poisson','euclidean_poisson']: corr = poisson_repeat(elut.mat, metric=method.split('_')[0], repeat=methodarg) if methodarg else poisson_repeat(elut.mat, metric=method) elif method in ['euclidean']: corr = pdist_score(elut.mat, norm_rows=True, norm_cols=True, metric=method) elif method in ['apex']: corr = apex_scores_toarray_fast(ApexScores(elut)) #elif method == 'dotproduct': #corr = elut.mat * elut.mat.T #elif method == 'corrcoef':
def elut_clean_prots(fin, fout):
    """
    Load the elution at `fin`, remove any leading and trailing '>'
    characters from every protein id, and write the result to `fout`.
    """
    elut = el.load_elution(fin)
    cleaned = []
    for prot in elut.prots:
        cleaned.append(prot.strip('>'))
    elut.prots = cleaned
    el.write_elution(elut, fout)
score_mat, _, new_prots = scorekey_elution(skey, elut, None) if new_prots is not None: arr_prots = np.array(new_prots) rows, cols = np.where(score_mat > thresh) p1s, p2s = [arr_prots[ids] for ids in rows, cols] pairs = ut.zip_exact(p1s, p2s) return pairs if __name__ == '__main__': nargs = len(sys.argv) if nargs < 3: sys.exit("usage: python score.py filename method(poisson|dotproduct|corrcoef|cov) [argument]") fname = sys.argv[1] method = sys.argv[2] methodarg = None if nargs < 4 else int(sys.argv[3]) elut = el.load_elution(fname) if method == 'poisson': corr = traver_corr(elut.mat, repeat=methodarg) if methodarg else \ traver_corr(elut.mat) elif method in ['cosine_poisson','euclidean_poisson']: corr = poisson_repeat(elut.mat, metric=method.split('_')[0], repeat=methodarg) if methodarg else poisson_repeat(elut.mat, metric=method) elif method in ['euclidean']: corr = pdist_score(elut.mat, norm_rows=True, norm_cols=True, metric=method) #elif method == 'dotproduct': #corr = elut.mat * elut.mat.T #elif method == 'corrcoef': #corr = np.corrcoef(elut.mat) #elif method == 'cov':