def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '',
        varModSequence = '',
        precursor_filter = False,
        high_res_gauss_dist = 0.05):
    """Decode one peptide-spectrum match with GMTK and return the parsed PSM.

    NOTE: a second definition of ``psm`` appears later in this file and
    rebinds the name; that one additionally passes -DMAX_FRAGMENT_MASS to
    the GMTK preprocessor.

    Inputs:
       p = peptide string
       s0 = observed spectrum (per the original notes, an instance of class
            MS2Spectrum); its ``mz`` and ``intensity`` attributes are
            overwritten in place for plotting
       c = psm charge
       highResMs2 = when true, the Gaussian means are the theoretical
                    fragment ions of ``p``; otherwise they are loaded from
                    ``dripLearnedMeans``
       dripLearnedMeans = file passed to load_drip_means (low-res only)
       dripLearnedCovars = file passed to write_covar_file
       mods = static mods
       ntermMods = static nterm-mods
       ctermMods = static cterm-mods
       varModSequence = string marking which amino acids carry variable
                        mods; must be non-empty when any variable mod is
                        parsed from the mod specifications
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = forwarded to the high-res peak filter and to
                             the structure/covariance file writers

    Returns:
       The first entry parsed for (spectrum id, c) from the gmtkViterbi
       output, after the observed spectrum, DRIP features and b/y sets have
       been attached to it.

    Side effects:
       Creates ./encode, ./encode/obs and ./log under the current working
       directory, writes the GMTK input files there, and runs the external
       programs gmtkDTindex and gmtkViterbi.  Calls exit(-1) if the
       structure, master, triangulation or covariance files cannot be made.
    """
    s = copy.deepcopy(s0)
    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = varMods or ntermVarMods or ctermVarMods
    varModMsg = "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied. Exitting"

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list before the
    # zip below, so original intensities are paired with it by position and
    # the membership test is always true -- confirm this alignment is intended.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                    if mz in mz_vals]

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs')  # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, varModMsg
            # fixed: the original passed the undefined names
            # varNtermMods / varCtermMods here
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, mods,
                                                      ntermMods, ctermMods,
                                                      varMods,
                                                      ntermVarMods,
                                                      ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # ion count is recorded before the filter call below
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy, dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, varModMsg
            # fixed: same undefined-name bug as in the low-res branch
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, mods,
                                               ntermMods, ctermMods,
                                               varMods,
                                               ntermVarMods,
                                               ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices; built after the branch so that
    # it exists for the calc_by_sets call at the end on both paths
    ion_to_index_map = {}
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)

    peptide_obs_file = os.path.join(pfile_dir, 'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir, 'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDbFile = os.path.join(output_dir, 'pepDB.txt')
    pep_num = 0

    # the with-block guarantees both streams are closed even if a writer fails
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        # create iterable dt and peptide pfile
        peptide_sentence_flatascii(pep_dt, p, bNy, pep_num, sid, max_obs_mass,
                                   peptide_obs_file, True, len(bNy))
        # create spectrum pfile
        spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
        pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, p, numBY, c))

    # compile dt using gmtkDTIndex
    # NOTE(review): stdo/stde are not bound inside this function; they must
    # exist at module scope -- confirm.
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (except Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed)
    try:
        create_drip_structure(highResMs2, args.structure_file,
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file,
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file,
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK
    cppCommand = '\'-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file \
        + '\''

    # call gmtkViterbi
    vitStr0 = "gmtkViterbi -strFile " + args.structure_file \
        + " -triFile " + args.structure_file + ".trifile -ni1 0 -nf1 2 -ni2 1 -nf2 0" \
        + " -fdiffact2 rl" \
        + " -inputMasterFile " + args.master_file + " -inputTrainableParameters trained.params -failOnZeroClique F"

    # gmtkViterbi command line
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vitStr = vitStr0 + ' -vitValsFile ' + vitValsFile \
        + ' -of1 ' + spectrum_obs_file \
        + ' -fmt1 flatascii ' \
        + ' -of2 ' + peptide_obs_file \
        + ' -fmt2 flatascii ' \
        + ' -cppCommand ' + cppCommand
    call(shlex.split(vitStr), stdout = stdo, stderr = stde)

    # parse output
    t, d = ppsm.parse_dripExtract(vitValsFile, pepDbFile)
    t = t[sid, c][0]

    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False,
              high_res_gauss_dist = 0.05):
    """Decode every PSM in psmFile, plot each one, and write an HTML index.

    Inputs:
       psmFile = PSM file handed to dripExtractParams (and to psm_var_mods
                 when variable mods are in use)
       spectrumFile = spectrum file handed to dripExtractParams and
                      load_spectra_minMaxMz
       plotList = path of the HTML file that receives one link per plot
       highResMs2 = when true, Gaussian means are the sorted union of the
                    theoretical fragment ions of all decoded peptides;
                    otherwise they are loaded from dripLearnedMeans
       dripLearnedMeans, dripLearnedCovars = forwarded to dripExtractParams
       mods, ntermMods, ctermMods = modification specifications, split into
                                    static and variable parts by
                                    parse_var_mods
       precursor_filter = selects the 'top300TightSequest' normalization
                          instead of 'top300Sequest'
       high_res_gauss_dist = stored on args and forwarded to the high-res
                             peak filter

    Side effects:
       Writes 'gmtk_err' in the current directory (passed as both stdout
       and stderr to runDripExtract), one '<kind>Scan<scan>Charge<charge>
       <peptide>.png' plot per target/decoy PSM via plot_drip_viterbi, and
       the plotList file, with entries ordered by decreasing score.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all',
                             mods, ntermMods, ctermMods,
                             highResMs2,
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = varMods or ntermVarMods or ctermVarMods

    # NOTE(review): args.precursor_filter is forced to False regardless of
    # the precursor_filter argument, which only selects args.normalize --
    # confirm this is deliberate.
    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter:
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; the error log is closed as soon as decoding returns
    # instead of being left open for the rest of the process
    stde = open('gmtk_err', "w")
    stdo = stde
    try:
        t, d, spectra0 = runDripExtract(args, stdo, stde)
    finally:
        stde.close()

    # if variable mods, get variable mod string per PSM
    if hasVarMods:
        varModDict = psm_var_mods(psmFile)
        # fixed: the original omitted the % operator, so a failing assertion
        # raised TypeError ('str' object is not callable) instead of showing
        # this message
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)

    spectra, minMz, maxMz, validCharges = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot
    # NOTE(review): the m/z list is replaced before the zip below, so the
    # membership test is always true and intensities are paired by position;
    # z is the maximum of the *processed* intensities -- confirm intended.
    for sid in spectra0:
        spectra[sid].mz = list(spectra0[sid].mz)
        mz_vals = set(spectra0[sid].mz)
        z = max(spectra0[sid].intensity)
        spectra[sid].intensity = [i/z for mz, i in zip(spectra[sid].mz, spectra[sid].intensity)
                                  if mz in mz_vals]

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # union of theoretical fragment ions over targets, then decoys
        dripMeansSet = set()
        for psms in (t, d):
            for sid, c in psms:
                for p in psms[sid, c]:
                    pep = p.peptide
                    if hasVarMods:
                        varModSequence = varModDict[sid, p.peptide]
                        bNy = interleave_b_y_ions_var_mods(Peptide(pep), c, mods,
                                                           ntermMods, ctermMods,
                                                           varMods,
                                                           ntermVarMods,
                                                           ctermVarMods,
                                                           varModSequence)
                    else:
                        bNy = interleave_b_y_ions(Peptide(pep), c, mods,
                                                  ntermMods, ctermMods)
                    filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                    dripMeansSet |= set(bNy)
        dripMeans = dict(enumerate(sorted(dripMeansSet)))

    # reverse mapping, from ions to indices; built after the branch so it is
    # defined for calc_by_sets on both the low-res and high-res paths
    ion_to_index_map = {}
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    all_psms = []
    varModSequence = ''
    for psms in (t, d):
        for sid, c in psms:
            s = spectra[sid]
            for p in psms[sid, c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                if hasVarMods:
                    varModSequence = varModDict[sid, p.peptide]
                p.calc_by_sets(c, mods,
                               ntermMods, ctermMods, highResMs2,
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               varModSequence)
                all_psms.append(p)

    kindNames = {'t': 'target', 'd': 'decoy'}
    all_psms.sort(key = lambda r: r.score, reverse = True)
    with open(plotList, "w") as fid:
        for p in all_psms:
            # entries that are neither target nor decoy are not plotted
            if p.kind not in kindNames:
                continue
            kind = kindNames[p.kind]
            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'
            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" %
                      (plotName, kind, p.scan, p.charge, p.peptide))
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '',
        varModSequence = '',
        precursor_filter = False,
        high_res_gauss_dist = 0.05):
    """Decode one peptide-spectrum match with GMTK and return the parsed PSM.

    NOTE: an earlier definition of ``psm`` appears in this file; this later
    definition is the one the name ends up bound to.  Unlike the earlier
    one, it passes -DMAX_FRAGMENT_MASS to the GMTK preprocessor.

    Inputs:
       p = peptide string
       s0 = observed spectrum (per the original notes, an instance of class
            MS2Spectrum); its ``mz`` and ``intensity`` attributes are
            overwritten in place for plotting
       c = psm charge
       highResMs2 = when true, the Gaussian means are the theoretical
                    fragment ions of ``p``; otherwise they are loaded from
                    ``dripLearnedMeans``
       dripLearnedMeans = file passed to load_drip_means (low-res only)
       dripLearnedCovars = file passed to write_covar_file
       mods = static mods
       ntermMods = static nterm-mods
       ctermMods = static cterm-mods
       varModSequence = string marking which amino acids carry variable
                        mods; must be non-empty when any variable mod is
                        parsed from the mod specifications
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = forwarded to the high-res peak filter and to
                             the structure/covariance file writers

    Returns:
       The first entry parsed for (spectrum id, c) from the gmtkViterbi
       output, after the observed spectrum, DRIP features and b/y sets have
       been attached to it.

    Side effects:
       Creates ./encode, ./encode/obs and ./log under the current working
       directory, writes the GMTK input files there, and runs the external
       programs gmtkDTindex and gmtkViterbi.  Calls exit(-1) if the
       structure, master, triangulation or covariance files cannot be made.
    """
    s = copy.deepcopy(s0)
    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = varMods or ntermVarMods or ctermVarMods
    varModMsg = "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied. Exitting"

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list before the
    # zip below, so original intensities are paired with it by position and
    # the membership test is always true -- confirm this alignment is intended.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                    if mz in mz_vals]

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs')  # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, varModMsg
            # fixed: the original passed the undefined names
            # varNtermMods / varCtermMods here
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, mods,
                                                      ntermMods, ctermMods,
                                                      varMods,
                                                      ntermVarMods,
                                                      ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # ion count is recorded before the filter call below
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy, dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, varModMsg
            # fixed: same undefined-name bug as in the low-res branch
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, mods,
                                               ntermMods, ctermMods,
                                               varMods,
                                               ntermVarMods,
                                               ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices; built after the branch so that
    # it exists for the calc_by_sets call at the end on both paths
    ion_to_index_map = {}
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)

    peptide_obs_file = os.path.join(pfile_dir, 'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir, 'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDbFile = os.path.join(output_dir, 'pepDB.txt')
    pep_num = 0

    # the with-block guarantees both streams are closed even if a writer fails
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        # create iterable dt and peptide pfile
        peptide_sentence_flatascii(pep_dt, p, bNy, pep_num, sid, max_obs_mass,
                                   peptide_obs_file, True, len(bNy))
        # create spectrum pfile
        spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
        pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, p, numBY, c))

    # compile dt using gmtkDTIndex
    # NOTE(review): stdo/stde are not bound inside this function; they must
    # exist at module scope -- confirm.
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (except Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed)
    try:
        create_drip_structure(highResMs2, args.structure_file,
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file,
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file,
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK
    cppCommand = '\'-DITERABLE_DT=' + dtFile \
        + ' -DMAX_FRAGMENT_MASS=' + str(max_obs_mass) \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file \
        + '\''

    # call gmtkViterbi
    vitStr0 = "gmtkViterbi -strFile " + args.structure_file \
        + " -triFile " + args.structure_file + ".trifile -ni1 0 -nf1 2 -ni2 1 -nf2 0" \
        + " -fdiffact2 rl" \
        + " -inputMasterFile " + args.master_file + " -inputTrainableParameters trained.params -failOnZeroClique F"

    # gmtkViterbi command line
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vitStr = vitStr0 + ' -vitValsFile ' + vitValsFile \
        + ' -of1 ' + spectrum_obs_file \
        + ' -fmt1 flatascii ' \
        + ' -of2 ' + peptide_obs_file \
        + ' -fmt2 flatascii ' \
        + ' -cppCommand ' + cppCommand
    call(shlex.split(vitStr), stdout = stdo, stderr = stde)

    # parse output
    t, d = ppsm.parse_dripExtract(vitValsFile, pepDbFile)
    t = t[sid, c][0]

    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
def make_drip_data_highres(args, spectra, stdo, stde):
    """Generate the high-res GMTK test data (pfiles, decision trees, pepDB).

    All theoretical fragment ions of every candidate peptide are collected
    first so that a single global ion -> index table can be built; the
    per-PSM observation sentences are then written against that table and
    the decision-tree file is compiled with one gmtkDTindex call.

    inputs:
       args - parsed input arguments (struct).  Fields read here:
              mods_spec, nterm_peptide_mods_spec, cterm_peptide_mods_spec,
              append_to_pin, psm_file, output_dir, obs_dir, normalize,
              charges, filt_theo_peaks, per_spectrum_mz_bound, mz_lb,
              mz_ub, max_obs_mass
       spectra - mapping from scan id to spectrum object; the spectra are
                 preprocessed in place
       stdo, stde - streams given to the gmtkDTindex subprocess as
                    stdout / stderr

    outputs:
       spec_dict - scan id -> spectrum, for every scan that was written
       pep_num - number of PSM sentences written (targets plus decoys)

    side effects:
       - writes iterable.dts and pepDB.txt into args.output_dir and the
         pep-lengths / spectrum pfiles into args.output_dir/args.obs_dir
       - calls make_master_parameters(args, ion_dict, ions); per the
         original notes this adjusts args.{mean_file, gauss_file,
         mixture_file, collection_file} (not verifiable from this function)
       - prints a message and calls exit(-1) when variable mods are
         requested but the first target PSM lacks the Var_mod_seq field
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    hasVarMods = varMods or ntermVarMods or ctermVarMods

    varModKey = "Var_mod_seq"

    if not args.append_to_pin:
        target, decoy, num_psms = load_psms(args.psm_file)
    else:
        target, decoy, num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files.  Only the first PSM of the first target key is
    # inspected; an empty target set is skipped instead of raising
    # IndexError/NameError as the original did.
    if hasVarMods and target:
        for firstPsm in target[next(iter(target))]:
            if varModKey not in firstPsm.other:
                print("Variable modifications enzyme options specified,")
                print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
                print("Exitting")
                exit(-1)
            break

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))
    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # fixed: the original bound `preprocess` only when normalize was not
    # 'filter0' but called it unconditionally (NameError); 'filter0' is now
    # treated as "no preprocessing"
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    validcharges = args.charges

    def fragment_ions(entry, charge):
        # theoretical b/y ions of one PSM, honoring variable mods if present
        if hasVarMods:
            return interleave_b_y_ions_var_mods(Peptide(entry.peptide), charge,
                                                mods, ntermMods, ctermMods,
                                                varMods, ntermVarMods, ctermVarMods,
                                                entry.other[varModKey])
        return interleave_b_y_ions(Peptide(entry.peptide), charge, mods,
                                   ntermMods, ctermMods)

    unique_ions = set()  # every fragment ion used by any candidate
    theo_spec_dict = {}
    numBY_dict_per_sid = {}
    # first pass: collect theoretical spectra and the global ion set
    for sid in spectra:
        s = spectra[sid]
        # NOTE(review): the same spectrum object is preprocessed again in the
        # writing loop below -- confirm the pipeline is idempotent.
        if preprocess is not None:
            preprocess(s)

        for charge in validcharges:
            key = (s.spectrum_id, charge)
            # .get keeps this pass consistent with the guarded decoy lookup
            # in the writing loop and tolerates keys present in only one of
            # target / decoy; targets are processed before decoys as before
            candidates = list(target.get(key, ())) + list(decoy.get(key, ()))
            if not candidates:
                continue
            # check if we're filtering theoretical peaks outside observed m/z values
            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            for entry in candidates:
                pep = entry.peptide
                bNy = fragment_ions(entry, charge)
                # ion count is recorded before any filtering
                numBY_dict_per_sid[sid, pep] = len(bNy)
                if args.filt_theo_peaks:
                    filter_theoretical_peaks(bNy, minMz, maxMz)
                theo_spec_dict[s.spectrum_id, pep] = bNy
                unique_ions.update(bNy)

    # global ion -> index table, indices following sorted ion order
    ions = sorted(unique_ions)
    ion_dict = {}
    for i, ion in enumerate(ions):
        ion_dict[ion] = i

    # make collection per spectrum
    make_master_parameters(args, ion_dict, ions)

    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2, 0)

    dtFile = os.path.join(args.output_dir, 'iterable.dts')
    pepDbFile = os.path.join(args.output_dir, 'pepDB.txt')
    spec_dict = {}
    pep_num = 0
    # the with-block closes both streams even if a sentence writer fails
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

        for sid, charge in sid_charges:
            if sid not in spec_dict:
                s = spectra[sid]
                if preprocess is not None:
                    preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            # targets first, then decoys; the flag is forwarded to
            # drip_peptide_sentence (True for targets, False for decoys)
            groups = (('t', True, target.get((sid, charge), ())),
                      ('d', False, decoy.get((sid, charge), ())))
            for kind, isTarget, entries in groups:
                for entry in entries:
                    pep = entry.peptide
                    # translate ions to their indices in the global table
                    bNy = [ion_dict[bOrY]
                           for bOrY in theo_spec_dict[s.spectrum_id, pep]]
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id, args.max_obs_mass,
                                          peptide_pfile, isTarget, len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n" % (kind, sid, pep,
                                                               numBY_dict_per_sid[sid, pep],
                                                               charge))
                    pep_num += 1

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num