def msgf2seq_file(filepath, fasta_file, msb_psms):
    """
    Convert a "_best" file to sequest format, keeping only selected rows.

    filepath: input read with ut.load_tab_file.  Its first two lines are
        skipped.  The first field of the third line, up to its first '.',
        becomes the sample name used in the output filename.  The text
        after the last '.' of filepath selects the entry of the
        module-level `searches` mapping.
    fasta_file: passed to seqs.prots2genes; the resulting mapping and its
        ut.dict_inverse are handed to msgfbest2sequest_line for each row.
    msb_psms: set of spectid_peptidesequence

    Writes <dir of filepath>/<sample>.<extension>.sequestformat with
    ut.write_tab_file and returns that path.

    Raises ValueError if the input has fewer than three lines.
    """
    def parse_spec_pep_row(r):
        # get spec_pep from _best file format: the first two '.'-separated
        # pieces of column 0 plus column 4, joined with '_'
        return '_'.join(r[0].split('.')[:2] + [r[4]])

    usedir, fin = os.path.split(filepath)
    # Read the file once: drop the two leading lines, then peek at the
    # first remaining row (the file's third line) to get the sample name.
    rows = it.islice(ut.load_tab_file(filepath), 2, None)
    first = next(rows, None)
    if first is None:
        # Fail with a clear error instead of leaking StopIteration.
        raise ValueError("%s has fewer than 3 lines" % (filepath,))
    sample = first[0].split('.')[0]
    # The peeked row is a data row too, so put it back in front of the rest.
    in_gen = it.chain([first], rows)
    p2g = seqs.prots2genes(fasta_file)
    g2p = ut.dict_inverse(p2g)
    fout = os.path.join(
        usedir, '.'.join([sample, fin.split('.')[-1], 'sequestformat']))
    search = searches[filepath.split('.')[-1]]
    # Single-argument print() gives the same output on Python 2 and 3.
    print("Converting/filtering; Search: %s" % (search,))
    # Lazy: rows are filtered and converted as write_tab_file consumes them.
    output = (msgfbest2sequest_line(r, p2g, g2p, search)
              for r in in_gen
              if parse_spec_pep_row(r) in msb_psms)
    print("Writing %s" % (fout,))
    ut.write_tab_file(output, fout)
    return fout
def ensg_to_ensp_and_park(
        ppips,
        fasta_file='/Users/blakeweb/Dropbox/complex/data/sequences/canon/Hs.fasta',
        park_ids_file='./orth_similarities/table.Hsapiens/Hsapiens_id.txt'):
    """
    Translate the ids in ppips to the ids in column 0 of park_ids_file.

    ppips: iterable of ids; each one is looked up with [] in the inverse
        of seqs.prots2genes(fasta_file), so all must be present there.
        (Presumably these are gene ids and the lookup yields protein
        ids -- confirm against seqs.prots2genes / ut.dict_inverse.)
    fasta_file: fasta handed to seqs.prots2genes.  Defaults to the path
        that used to be hard-coded here.
    park_ids_file: table read with ut.load_lol.  Column 0 of each row is
        the id returned; column 1 is matched against the looked-up id and
        column 2 against the id mapped back through prots2genes.
        Defaults to the path that used to be hard-coded here.

    Returns a list: first the ids matched through column 1, in input
    order, then the ids of the remaining inputs matched through column 2.
    Inputs that match neither column are dropped.
    """
    dhpg = seqs.prots2genes(fasta_file)
    dhgp = ut.dict_inverse(dhpg)
    parkids = ut.load_lol(park_ids_file)
    ppips_ensp = [dhgp[g] for g in ppips]
    dg2park = dict((x[2], x[0]) for x in parkids)
    dp2park = dict((x[1], x[0]) for x in parkids)
    # First choice: match on column 1.
    park_ppips_most = [dp2park[p] for p in ppips_ensp if p in dp2park]
    # Fallback for the unmatched ones: map back and match on column 2.
    ppips_ensp_rest = [p for p in ppips_ensp if p not in dp2park]
    ppips_ensg_rest = [dhpg[p] for p in ppips_ensp_rest]
    park_ppips_rest = [dg2park[g] for g in ppips_ensg_rest if g in dg2park]
    return park_ppips_most + park_ppips_rest
def check(fasta, protq, do_convert):
    """
    Report how many ids of an elution file are known to a fasta mapping.

    fasta: passed to seqs.prots2genes to build the mapping; its
        ut.dict_inverse is built as well.
    protq: elution file loaded with el.load_elution; its .prots are the
        ids that get checked.
    do_convert: if true and fewer than all ids were found in the inverse
        mapping, call seqs.elut_p2g(protq, p2g).

    Prints the number of ids that are keys of the mapping ("proteins")
    and of its inverse ("genes").  Returns None.
    """
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    fprots = el.load_elution(protq).prots
    total = len(fprots)
    # Single-argument print() gives the same output on Python 2 and 3.
    print("checking %s" % (ut.shortname(protq),))
    nprotsfound = sum(1 for p in fprots if p in p2g)
    print("proteins: %s of %s" % (nprotsfound, total))
    ngenesfound = sum(1 for p in fprots if p in g2p)
    print("genes: %s of %s" % (ngenesfound, total))
    if do_convert and ngenesfound < total:
        print("converting prots to genes: %s" % (protq,))
        seqs.elut_p2g(protq, p2g)