# raw_info['fetchid'] = raw_info['fetchid'].apply(int) # this is an UGLY fix that we'd have to implement here just to save everything ... if args.exp_num: bad_info['enzyme'] = 'T' # # fasta = SeqIO.to_dict(SeqIO.parse(fasta_fname,"fasta"),key_function=lambda _: _.id.split('|')[1]) # 1-BASED NOTATION FOR PROTEINS INDEXING ENFORCED ... # pep_df = pd.read_csv(uniq_pept_fname) # connection between peptide info and spectrum info to be established ... ########################################################################## # unroll that spec table to have 1 deamid per row ... # # # quant_info_unrolled = ms.unroll_by_mfunc(quant_info,['Modifications','Sequence'],(lambda row: ms.extract_deamids(row[0],row[1])),'deamid_info') # now we'd have to determine the type of the 'Prob' column, object,float, or somethgin else ... # a new fix @ August 3 2016 ... if quant_info_unrolled['Prob'].dtype == 'float': quant_info_unrolled['pept_ident_probab'] = quant_info_unrolled['Prob'] elif quant_info_unrolled['Prob'].dtype == 'object': quant_info_unrolled['pept_ident_probab'] = quant_info_unrolled['Prob'].str.strip('%').apply(float) ########################################################## # so far the following merge seems to be 100% sufficient for the desired final output ... # we could add on extra features if needed ... quant_n_raw = quant_info_unrolled[['pept', 'deamid_info', 'pept_with_mod', 'Weight', 'spec_name',
def _percent_to_float(column):
    """Return *column* converted to floats.

    Accepts either a numeric column (pandas already parsed the values) or a
    text column whose entries carry a trailing/leading '%' sign, e.g. '95%'.
    Only the '%' characters are stripped; the values are not rescaled.
    """
    # Numeric dtypes have no .str accessor, so only strip text-like columns.
    if column.dtype.kind not in ('i', 'u', 'f'):
        column = column.str.strip('%')
    return column.apply(float)


# Load the per-spectrum table from the delimited text file.
spec_info = pd.read_csv(spec_fname, sep=separator)
# Fix the peptide sequence right away: an upper-cased copy is stored in
# 'pept', which is the key used for the merge with pep_info below.
spec_info['pept'] = spec_info['Peptide sequence'].str.upper()
pep_info['fetchid'] = pep_info['fetchid'].apply(int)
# This is an UGLY fix that has to be implemented here just to save
# everything: for experiment '1' the enzyme column is forced to 'T'.
if args.exp_num == '1':
    pep_info['enzyme'] = 'T'
#
# NOTE(review): the two statements below are kept disabled, as they appear in
# the source; confirm that nothing downstream expects `fasta` or `pep_df`.
# fasta = SeqIO.to_dict(SeqIO.parse(fasta_fname,"fasta"),key_function=lambda _: _.id.split('|')[1])
# 1-BASED NOTATION FOR PROTEINS INDEXING ENFORCED ...
# pep_df = pd.read_csv(uniq_pept_fname)
# Connection between peptide info and spectrum info to be established ...
##########################################################################
# Unroll the spectrum table to have one deamidation per row (per the original
# author's note; the unrolling itself is done by ms.unroll_by_mfunc).
spec_info_unrolled = ms.unroll_by_mfunc(
    spec_info,
    'Variable modifications identified by spectrum',
    ms.extract_deamids,
    'deamid_info')
# The probability columns may arrive either as text such as '95%' or as
# already-numeric values, depending on the input file; handle both.
spec_info_unrolled['prot_ident_probab'] = _percent_to_float(
    spec_info_unrolled['Protein identification probability'])
spec_info_unrolled['pept_ident_probab'] = _percent_to_float(
    spec_info_unrolled['Peptide identification probability'])
##########################################################
# So far the following merge seems to be 100% sufficient for the desired
# final output; extra features could be added if needed.
# how='right' keeps every pep_info row; columns of pep_info whose names clash
# with the spectrum columns receive the '_x' suffix.
spec_n_pep = spec_info_unrolled[
    ['pept', 'deamid_info', 'prot_ident_probab', 'pept_ident_probab']
].merge(pep_info, how='right', on='pept', suffixes=('', '_x'))


# Now, extract those gsites ...
def dg_func(x):
    """Return ms.deamid_to_gsite's result for one row *x*, wrapped in a Series.

    The row's 'fetchid' is converted to int and then to str to look up the
    record in gbrecs, whose sequence is passed along as a plain string.
    """
    return pd.Series(
        ms.deamid_to_gsite(
            x['deamid_info'],
            x['start_fetched'],
            str(gbrecs[str(int(x['fetchid']))].seq)))


# ... and compute them row by row so they can be added back to the main table.
gs_res = spec_n_pep[['deamid_info', 'start_fetched', 'fetchid']].apply(dg_func, axis=1)