def msgf2seq_file(filepath, fasta_file, msb_psms):
    """
    msb_psms: set of spectid_peptidesequence
    """
    def parse_spec_pep_row(r):
        # get spec_pep from _best file format
        parsed = '_'.join(r[0].split('.')[:2] + [r[4]])
        #print parsed
        return parsed
    usedir,fin = os.path.split(filepath)
    # Get the sample filename from the first item of the third line
    fout = next(it.islice(ut.load_tab_file(filepath),2,3))[0].split('.')[0]
    in_gen = ut.load_tab_file(filepath)
    in_gen.next(); in_gen.next() # skip 2 lines
    p2g = seqs.prots2genes(fasta_file)
    g2p = ut.dict_inverse(p2g)
    fout = os.path.join(usedir, '.'.join([fout, fin.split('.')[-1] ,
        'sequestformat']))
    search = searches[filepath.split('.')[-1]]
    print "Converting/filtering; Search:", search
    output = (msgfbest2sequest_line(r,p2g, g2p, search) for r in in_gen 
            if parse_spec_pep_row(r) in msb_psms)
    print "Writing", fout
    ut.write_tab_file(output, fout)
    return fout
def ensg_to_ensp_and_park(ppips,
        fasta_file='/Users/blakeweb/Dropbox/complex/data/sequences/canon/Hs.fasta',
        park_ids_file='./orth_similarities/table.Hsapiens/Hsapiens_id.txt'):
    """
    Translate the gene ids in ppips into ids from the park id table.

    Each gene is first mapped to a protein via the inverse of
    seqs.prots2genes(fasta_file).  Proteins found in column 1 of the id
    table are translated directly; the remaining proteins are mapped back to
    their gene and looked up by column 2 instead.  Ids found by neither
    route are silently dropped.

    ppips: iterable of gene ids; every one must be a key of the inverted
        protein->gene mapping, otherwise a KeyError is raised.
    fasta_file: fasta handed to seqs.prots2genes.  Defaults to the path that
        used to be hard-coded here.
    park_ids_file: id table loaded with ut.load_lol; rows are indexed as
        (park id, protein id, gene id) -- presumably matching the file's
        column order, confirm against the data.  Defaults to the path that
        used to be hard-coded here.

    Returns a list of park ids: all protein-matched ids first (in ppips
    order), followed by the gene-matched ids (in ppips order).
    """
    prot2gene = seqs.prots2genes(fasta_file)
    gene2prot = ut.dict_inverse(prot2gene)
    park_rows = ut.load_lol(park_ids_file)
    prots = [gene2prot[g] for g in ppips]
    # On duplicate keys the last row in the table wins.
    gene2park = dict((row[2], row[0]) for row in park_rows)
    prot2park = dict((row[1], row[0]) for row in park_rows)
    found_by_prot = [prot2park[p] for p in prots if p in prot2park]
    # Fall back to a gene-id lookup for proteins absent from the table.
    leftover_genes = [prot2gene[p] for p in prots if p not in prot2park]
    found_by_gene = [gene2park[g] for g in leftover_genes if g in gene2park]
    return found_by_prot + found_by_gene
def check(fasta, protq, do_convert):
    p2g = seqs.prots2genes(fasta)
    g2p = ut.dict_inverse(p2g)
    fprots = el.load_elution(protq).prots
    print "checking", ut.shortname(protq)
    print "proteins: %s of %s" % (len([p for p in fprots if p in p2g]),
            len(fprots))
    ngenesfound = len([p for p in fprots if p in g2p])
    print "genes: %s of %s" % (ngenesfound,
            len(fprots))
    if do_convert and ngenesfound < len(fprots):
        print "converting prots to genes:",  protq
        seqs.elut_p2g(protq, p2g)