def performS2Icorrection(self, correctionfactors):
    """
    @brief performs the s2i correction on the isotope corrected reporter signals.
           For every spectrum that has an s2i value each label signal is reduced by
           (1 - s2i) * correctionfactors[label] * (sum of all label signals of that spectrum);
           results below zero are set to zero.  Spectra without an s2i value are copied unchanged.
           The result is stored in self.s2icorrecteddata and returned.
    @param correctionfactors <dict>: isotopelabel_id -> normalized calculated factor giving the amount of
           interfering signal from other reporter signals to be removed depending on s2i of ms1
    @return mys2icorrecteddata <dict>: spectrum_id -> {isotopelabel_id: corrected signal}
    """
    log = self.cfg.log
    spectrum2s2i = self.spectrumid2s2i
    log.info('we have %s s2i records' % len(spectrum2s2i))

    mys2icorrecteddata = {}
    pBar = progBar.ProgressBar(widgets=progBar.name_widgets,
                               maxval=len(self.isotopecorrecteddata),
                               name='S2I correcting').start()

    for spectrum_id, data in self.isotopecorrecteddata.items():
        log.debug('spectrum id is %s' % spectrum_id)
        corrected = {}

        if spectrum_id in spectrum2s2i:
            # perform correction only if there is actually an S2I value for that spectrum.
            # NOTE(review): the progress bar is only advanced for corrected spectra although maxval counts
            # all spectra - kept as in the original, confirm whether this is intended.
            pBar.nextPrimary()
            s2ivalue = round(spectrum2s2i[spectrum_id], 3)
            log.debug('spectrum_id for s2i correction %s, s2i value %s ' % (spectrum_id, s2ivalue))

            # both terms are constant for the spectrum, so compute them once rather than once per label
            interferingfraction = 1 - s2ivalue
            totalsignal = sum(data.values())

            for isotopelabel_id, val in data.items():
                s2icorr = val - (interferingfraction * correctionfactors[isotopelabel_id] * totalsignal)
                log.debug('calculated s2i corrected value %f from original %f for %s ' %
                          (s2icorr, val, isotopelabel_id))
                if s2icorr < 0:
                    # no signal can be less than zero
                    s2icorr = 0
                corrected[isotopelabel_id] = s2icorr
        else:
            log.debug('no spectrum_id (%s) for s2i correction ' % spectrum_id)
            for isotopelabel_id, val in data.items():
                corrected[isotopelabel_id] = val

        # spectra without any label data get no entry at all (same as before)
        if corrected:
            mys2icorrecteddata[spectrum_id] = corrected

    pBar.finish()
    self.s2icorrecteddata = mys2icorrecteddata
    log.info('done performS2Icorrection')
    return mys2icorrecteddata
def export(self, hcdonly=0):
    """
    @brief creates an mgf file for the MS/MS spectra in the hdf5 file.  The .mgf file is written next to
           the hdf5 file (same stem).  The /rawdata/deconvions table is re-created and receives the ions
           that were written for each exported spectrum.
    @param hcdonly <integer>: flag to switch output to only HCD spectra bypassing the normal export filters
    @raise ExHa.MGFprocessingError: if the "MS Run Time (min)" parameter is missing; errors of this type
           get the id of the spectrum being processed added as context before being re-raised
    """
    if hcdonly:
        remove = ['CID']
        filters = ['none']
    else:
        remove = self.remove
        filters = self.usefilts

    hdf = self.hdf5
    mgfFile = hdf.filePath.parent.joinpath(hdf.filePath.stem + '.mgf')
    mgfOut = open(str(mgfFile), 'w')
    spec = 0

    try:
        mgfOut.write('#Removed spectra = %s, filtering = %s\n' % (remove, filters))

        # read parameters from hdf5 file
        hdf.appendOpen()
        headers = hdf.readTable('/rawdata/msmsheader')
        runTimeEntry = hdf.getDataEqual('/rawdata/parameters', 'parameter', 'MS Run Time (min)')
        if len(runTimeEntry) == 0:
            raise ExHa.MGFprocessingError(
                'MGF Error: Could not find "MS Run Time (min)" parameter in HDF5 file.')
        runtime = runTimeEntry[0]['value']
        units = self.readUnitsOK()

        # add new table for the deconvoluted spectrum data
        hdf.removeTable('/rawdata/deconvions')
        hdf.createTable('rawdata', 'deconvions', 'DeconvIons')

        # find all the frag methods to be used in identification
        ident = [unit['order'] for unit in units[1] if 'I' in unit['use']]

        logger.log.info('Reading %d spectra from %s' % (len(headers), hdf.filePath.name))
        deconv = 'deconv' in filters

        pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(headers),
                                   name='Create .mgf').start()
        for idx, h in enumerate(headers):
            if hcdonly:
                if h['fragmeth'] != 'HCD':
                    continue
            elif h['order'] not in ident:
                continue
            pBar.update(idx)

            # get spectrum data
            spec = h['spec_id']
            spectrum = hdf.getDataEqual('/rawdata/ions', 'spec_id', spec)
            if deconv:
                # need extra column for charge information
                spectrum = self.addChargeColumn(spectrum)

            data = hdf.getDataGeneral('/rawdata/specparams',
                                      '(spec_id == %i) & (parameter == "%s")' % (spec, 'setmass1'))
            setmass = data[0]['value']
            data = hdf.getDataGeneral('/rawdata/specparams',
                                      '(spec_id == %i) & (parameter == "%s")' % (spec, 'frag1'))
            fragMethod = data[0]['value']

            try:
                self.maxint = max(spectrum['inten'])
            except Exception:
                # best effort: a spectrum without usable intensities gets a maximum of 0
                self.maxint = 0

            # construct title values list
            rt = '%.3f' % h['rt']
            use = units[1][h['order'] - 1]['use']
            pretitle = ''
            if use == 'IQ':
                # spec is both ID and Quan so use normal msms ID
                titles = ['msmsid:F%06d' % h['spec_id']]
            elif use == 'I':
                if h['quan_spec'] == 0:
                    # no quant data so use spec_id
                    titles = ['msmsid:F%06d' % h['spec_id']]
                else:
                    # spec is only for ident, find the quan spec
                    titles = ['msmsid:F%06d' % h['quan_spec']]
                    pretitle = '#CID=F%06d\n' % h['id_spec']
            elif use == 'Q':
                titles = ['msmsid:F%06d' % h['quan_spec']]
                pretitle = '#CID=F%06d\n' % h['id_spec']
            else:
                # Any other 'use' value previously left 'titles' unbound (NameError) or re-used and kept
                # extending the list of the previous spectrum.  Fall back to the spectrum's own id.
                # NOTE(review): confirm this default against the allowed 'use' values.
                titles = ['msmsid:F%06d' % h['spec_id']]

            titles.append('rt:' + rt)
            titles.append('survey:S%06d' % h['survey_spec'])
            titles.append('parent:' + setmass)
            titles.append('AnalTime:' + runtime)
            titles.append('Activation:' + fragMethod.upper())
            titleline = 'TITLE=%s\n' % ','.join(titles)

            # pick the first available precursor m/z; spectra without one are skipped
            if h['precmz'] > 0:
                pepmass = h['precmz']
            elif h['precmz_surv'] > 0:
                pepmass = h['precmz_surv']
            else:
                pepmass = h['monomz']
            if pepmass == 0:
                continue

            # filters are only applied while more than 5 ions remain and a filter function is registered
            for filt in filters:
                if len(spectrum) > 5 and self.filters[filt]:
                    spectrum = self.filters[filt](h, spectrum)

            # filter for mascot interference
            ionList = []
            if len(spectrum) > 2:
                mgfOut.write(pretitle)
                mgfOut.write('BEGIN IONS\n')
                mgfOut.write(titleline)
                mgfOut.write('PEPMASS=%f\n' % pepmass)
                mgfOut.write('CHARGE=%d+\n' % h['charge'])
                for pt in spectrum:
                    if pt['inten'] == 0:
                        continue
                    if deconv:
                        mgfOut.write('%f %f %s\n' % (pt['mz'], pt['inten'], pt['charge']))
                        ionList.append(dict(spec_id=pt['spec_id'], mz=pt['mz'], inten=pt['inten'],
                                            charge=pt['charge']))
                    else:
                        mgfOut.write('%f %f\n' % (pt['mz'], pt['inten']))
                        ionList.append(dict(spec_id=pt['spec_id'], mz=pt['mz'], inten=pt['inten']))
                mgfOut.write('END IONS\n\n')

            if ionList:
                hdf.appendRows('/rawdata/deconvions', ionList)

        pBar.finish()

    except ExHa.MGFprocessingError as czEx:
        if spec:
            ExHa.addContext(czEx, 'Raised whist processing spectrum %i' % spec)
        raise
    finally:
        # the mgf file was never closed before; make sure it is flushed and released in every case
        mgfOut.close()
def updateHDF5(self):
    """
    @brief controls the updating of the data to the hdf5 results file.  For every file in self.hdfFiles
           (processed in sorted key order) the spectrum, peptide and quantification records are collated
           and written, then the FDR data, peptide set data and summary statistics are written.
           NOTE(review): hdf5results, quantMethID and quantSource are not defined in this method and are
           expected to exist at module scope - confirm.
    @return finalMessage <string>: constructed from the protein data this is the RESULT stored in the DB
    @raise Exception: any error is re-raised after the current processing context has been added to it
    """
    pep2unique = self.pep2unique
    rootContext = 'updateHDF5: '
    context = 'updateHDF5'

    try:
        # find the peptide sequences that are being imported
        usedPeps = self.setsManager.findUsedPeptides()
        logger.log.info('there are %s usedPeps' % len(usedPeps))

        context = rootContext + 'Retrieving sample IDs'
        # explicit list so that pop() also works where range() is lazy
        sample_ids = list(range(1, len(self.hdfFiles) + 1))

        # create proteinset and proteinhit data
        starting_protein_group_no = 1
        self.setsManager.setProteinGroupNo(starting_protein_group_no)

        logger.log.info('adding protein group data to HDF5')
        logger.log.debug(str(list(self.hdfFiles.keys())))

        spectrum_id = 0
        peptide_id = 0
        for key in sorted(self.hdfFiles.keys()):
            # The context prefix is rebuilt for every file.  It used to be appended to a shared string,
            # so errors in later files were reported with the names of all previously processed files.
            baseContext = rootContext + '%s: ' % key
            logger.log.log(logger.PROCESS,
                           'Integrating Spectrum, Peptide & Quantification data from %s' % key)

            # collect fileData
            hdf = self.hdfFiles[key]
            hdfObj = hdf.hdfObject

            # set the current sample_id from the list of IDs
            # NOTE(review): pop() takes ids from the end of the list, so the first file in sorted order
            # receives the highest sample id - confirm this is intended.
            current_sample_id = sample_ids.pop()

            hdf.acquired_spectra, hdf.mascot_matched_spectra, numIsotopes, runTime = hdfObj.getNumbers()

            # read the Mascot data
            context = baseContext + 'Reading Mascot data'
            peptides, queryDict, headerArray, quanArray = hdfObj.readImporterData(usedPeps, hdf)[:4]
            hdf.spectra_in_qc_proteins = len(peptides)

            logger.log.debug('getting spectrum_ids')
            context = baseContext + 'Retrieving spectrum IDs'
            acqTime, hdf.idAct, hdf.quanAct = hdfObj.getTimeAndActivation()

            # create blank lists to hold data for writing to hdf5 file
            spectrum_list = []
            peptide_list = []
            quant_list = []
            logger.log.info('collating spectrum, peptide & quant data')
            pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(queryDict),
                                       name='collate data').start()
            for q in queryDict:
                # loop round all the required spectra
                pBar.nextPrimary()
                context = baseContext + 'query %i: Setting spectrum data' % q

                # spectrum ids are numbered consecutively across all files
                spectrum_id += 1
                query = queryDict[q]
                spec = int(query['spec_id'])
                context = baseContext + 'spectrum %i: Updating DB with spectrum data' % spec

                # add spectrum data to spectrum_list
                header = self.filterArrayEqual(headerArray, 'spec_id', spec)
                spectrum_list.append(
                    self.makeSpectrumDict(spectrum_id, current_sample_id, query, acqTime, header))

                # find the appropriate peptides
                pepList = peptides[q]
                logger.log.debug('there are %s in peplist %s' % (len(pepList), str(pepList)))
                quantFound = 0

                # this list will hold all peptides returned from makePeptideDictList and then filter
                # those non-rank1 equivalents based on the score of the rank 1 peptide
                tmplist = []
                for pep in pepList:
                    # find the sets that the peptide belongs to and add to the peptide_list
                    pepSets = self.setsManager.peptide2set[pep['peptide']]
                    context = baseContext + 'spectrum %i: Creating peptide data entries for hdf5' % spec
                    tmp, qf = self.makePeptideDictList(spectrum_id, pep, query, pepSets, hdf, pep2unique)
                    tmplist.extend(tmp)
                    peptide_list += tmp
                    quantFound += qf

                # only keep rank1 equivalent peptides (based on score); a query without any peptide
                # entries used to fail here with an IndexError
                if tmplist:
                    tmplist.sort(key=lambda x: x['rank'])
                    toprankscore = tmplist[0]['score']
                    tmplist = [x for x in tmplist if x['score'] == toprankscore]

                if quantMethID and quantFound:
                    # extract quantification data for the spectrum
                    context = baseContext + 'spectrum %i: Creating quantitation data entries for DB' % spec
                    newquant, deltas = self.makeQuantDictLists(spectrum_id, spec, tmplist, header,
                                                               quanArray, hdf)
                    quant_list += newquant

                    if quantSource == 'ms2':
                        context = baseContext + 'spectrum %i: Adding reporter ion delta data' % spec
                        hdf.addReporterDeltas(deltas)
            pBar.finish()

            # calculate statistics
            context = baseContext + 'Calculating statistics'
            hdf.calcReporterStats()

            context = baseContext + 'Updating sample table (%i)' % current_sample_id
            sample_data = hdf.getSampleDataDict(current_sample_id, key, runTime)
            hdf5results.writeSample(sample_data)
            self.importData.combineStatistics(hdf)

            # write data to HDF5
            context = baseContext + 'Updating spectrum table'
            logger.log.info('updating HDF5 with spectrum data')
            hdf5results.writeSpectrum(spectrum_list)

            if quantMethID:
                context = baseContext + 'Updating specquant table'
                logger.log.info('updating HDF5 with quant data')
                hdf5results.writeSpecQuant(quant_list)

            context = baseContext + 'Retrieving peptide IDs'
            logger.log.info('updating HDF5 with peptide data')
            # peptide ids are numbered consecutively across all files
            for pepdata in peptide_list:
                pepdata['peptide_id'] = peptide_id
                peptide_id += 1

            context = baseContext + 'Updating peptide table'
            hdf5results.writePeptide(peptide_list)

        # NOTE(review): the original indentation was lost; index creation is assumed to happen once,
        # after all files have been written - confirm.
        context = rootContext + 'Finalising HDF5 entries'
        hdf5results.createIndexes()

        logger.log.info('finalising HDF5 entries')
        hdf5results.writeFDRdata(self.importData.score2fdr, 'peptide')
        hdf5results.writeFDRdata(self.importData.proteinscore2fdr, 'protein')

        topScoringProteinInfo = self.setsManager.addPeptideSetDBdata(
            hdf5results, self.importData.proteinscore2fdr)
        runtimedata = self.importData.getSummaryStatisticsDict()
        hdf5results.writeStatistics(runtimedata)

        # guard the percentage against runs without any Mascot matched spectra
        matchedSpectra = float(runtimedata['mascot_matched_spectra'])
        if matchedSpectra:
            matchedPercent = (runtimedata['spectra_in_qc_proteins'] / matchedSpectra) * 100
        else:
            matchedPercent = 0.0

        finalMessage = 'queries matched: %i / %s (%.1f%%) ' % (
            runtimedata['spectra_in_qc_proteins'], runtimedata['mascot_matched_spectra'], matchedPercent)
        finalMessage += 'spectra quantified: %i top hit %s (%s) ' % (
            runtimedata['quantified_spectra'], '', '')
        finalMessage += 'with total score %f and %i matched peptides (hook AND non hook)' % \
                        (topScoringProteinInfo[0], topScoringProteinInfo[2])

    except Exception as genEx:
        # add where we were to the error and pass it on
        ExHa.addContext(genEx, context)
        raise

    # the message was documented as the return value but was never actually returned
    return finalMessage
def collectPeptideData(hdfObject, sample2source):
    """
    @brief writes a tab separated text file with one row per entry of the /peptide table, combined with
           data from the /proteinhit, /spectrum, /specquant and /fdrdata tables.  Rows are written per
           protein group (in the order delivered by itersorted('protein_group_no')) and sorted by
           seq_start within each group.  One 'sig_<label>' column is added per isotope label found in
           /specquant; peptides without a value for a label get 'NA'.
           The output file name is derived from the hdf5 file path via renameFile(..., '_peptides').
    @param hdfObject: object whose .hdf attribute gives access to the results hdf5 file
    @param sample2source <dict>: sample_id -> source file name
    """

    def writeGroup(handle, rows):
        """writes all rows of one protein group, ordered by seq_start"""
        for row in sorted(rows, key=lambda r: r['seq_start']):
            line = out_string_template % row
            for iso in usedIsotopes:
                try:
                    line += '\t%f' % row[iso]
                except KeyError:
                    line += '\tNA'
            handle.write(line + '\n')

    logger.log.info('generating Peptide based output')
    hdf = hdfObject.hdf
    outputFile = renameFile(hdf.filePath, '_peptides')

    # extract required protein data: protein_group_no -> sorted list of protein ids
    logger.log.info('loading protein data')
    proteins = {}
    for prot in hdf.readTable('/proteinhit'):
        proteins.setdefault(prot['protein_group_no'], []).append(prot['protein_id'])
    for proteinIds in proteins.values():
        # sort each group once instead of after every append
        proteinIds.sort()

    # extract required spectrum data
    logger.log.info('loading spectrum data')
    specs = {}
    for sp in hdf.readTable('/spectrum'):
        specs[sp['spectrum_id']] = dict(source_file=sample2source[sp['sample_id']],
                                        msms_id=sp['msms_id'],
                                        charge_state=sp['charge_state'],
                                        precursor_mz=sp['precursor_mz'],
                                        peak_intensity=sp['peak_intensity'],
                                        s2i=sp['s2i'],
                                        p2t=sp['p2t'])

    # extract quantification data
    logger.log.info('loading quantification data')
    quant = {}
    usedIsotopes = set()
    for sq in hdf.readTable('/specquant'):
        specId = sq['spectrum_id']
        if specId not in quant:
            quant[specId] = dict(in_quantification_of_protein=sq['in_quantification_of_protein'])
        quant[specId][sq['isotopelabel_id']] = sq['quant_allcorrected']
        usedIsotopes.add(sq['isotopelabel_id'])
    usedIsotopes = sorted(usedIsotopes)

    outString = 'protein_group_no\tprotein_id\tsequence\tmodifications\tmw'
    outString += '\tprecursor_mz\tcharge_state\tppm_error\tscore\tfdr_at_score\trank\tmsms_id\tsource_file'
    outString += '\tpeak_intensity\ts2i\tp2t\tis_unique\tin_quantification_of_protein\tin_protein_inference'
    outString += '\tseq_start\tseq_end'

    if usedIsotopes:
        labelNames = {}
        try:
            quantMethods = quantHandler.QuantMethods()
            method = quantMethods.getMethodByIsotope(usedIsotopes[0])
            for labelId, labelData in method['quantmasses'].items():
                labelNames[labelId] = labelData[0]['name']
        except Exception:
            print('error getting label name data, just using names present in .hdf5 file')

        for iso in usedIsotopes:
            # The fallback names used to be stored under str(id) but looked up by the raw id, so the
            # fallback announced above ended in a KeyError.  Look the name up by either key form and
            # use the id itself when no name is known.
            outString += '\tsig_%s' % labelNames.get(iso, labelNames.get(str(iso), iso))

    out_string_template = '%(protein_group_no)i\t%(proteins)s\t%(sequence)s\t%(modifications)s\t%(mw)f\t'
    out_string_template += '%(precursor_mz)f\t%(charge_state)i\t%(ppm_error)f\t%(score).0f\t%(fdr_at_score).3f\t'
    out_string_template += '%(rank)i\t%(msms_id)i\t%(source_file)s\t%(peak_intensity)f\t%(s2i)f\t%(p2t)f\t'
    out_string_template += '%(is_unique)i\t%(in_quantification_of_protein)i\t'
    out_string_template += '%(in_protein_inference)f\t%(seq_start)s\t%(seq_end)s'

    # open text file output; the with block closes the file even if an error occurs while writing
    with open(str(outputFile), 'w') as f_out:
        f_out.write(outString + '\n')

        # integrate other data with peptide data and output to text file
        logger.log.info('loading peptide data')
        fdr_data = dict((x['score'], x['global_fdr'])
                        for x in hdf.readTable('/fdrdata') if x['data_type'] == 'peptide')

        # get reference to peptide table on disk
        peptidetable = hdf.getTable('/peptide')
        current_pepgroupid = None
        groupRows = []
        pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(peptidetable),
                                   name='load peptides').start()
        for idx, p in enumerate(peptidetable.itersorted('protein_group_no')):
            pBar.update(idx)
            if current_pepgroupid != p['protein_group_no']:
                # a new protein group starts: write out the finished one first
                if groupRows:
                    writeGroup(f_out, groupRows)
                groupRows = []
                current_pepgroupid = p['protein_group_no']
            groupRows.append(preparePeptideData(p, proteins, fdr_data, specs, quant))
        pBar.finish()

        # not to forget the last protein groups data
        if groupRows:
            writeGroup(f_out, groupRows)

    return
def performBootstrapQuant(self, protein_group_nos, reference):
    """
    @brief get all filtered spectra from spec quant table then fetch peptide data (sequence and
           spectrum id) from all protein sets. for every protein group perform fold change calculation
           using bootstrap method
    @param protein_group_nos list of protein group ids
    @param reference id of value used for fold change calculation using bootstrap model
    @return protein2quantdata <dict>: protein_group_no -> {isotopelabel_id: [summed signal, number of
            quantified spectra, number of unique quantified peptides, result tuple]}.
            The result tuple is (1, 0, 0) for the reference label with signal, the value returned by
            makeBootstrap when both query and reference have signal, (0, -1, -1) when only the reference
            has signal and (-1, -1, -1) when the reference has no signal.
    """
    all_proteins_quantdata = self.hdf5quantprot.getAllProteinDatafromSpecQuant()
    peptidedatafromsets = self.hdf5quantprot.getPeptideDataforSets()
    # keeps fold change result,sum ion area, isotopelabel
    protein2quantdata = {}
    usedIsotopes = sorted(set(x['isotopelabel_id'] for x in all_proteins_quantdata))

    pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(protein_group_nos),
                               name='bootstrap quant').start()
    for idx, protein_group_no in enumerate(protein_group_nos):
        pBar.update(idx)
        self.cfg.log.debug('starting for proteingroup %s' % protein_group_no)

        # all quant rows of this protein group
        data = all_proteins_quantdata[all_proteins_quantdata['protein_group_no'] == protein_group_no]
        peptidedataforset = peptidedatafromsets[protein_group_no]
        quantuniquepeps = set(peptidedataforset[spectrum_id] for spectrum_id in data['spectrum_id'])
        qupm = len(quantuniquepeps)
        totalquantevents = len(set(data['spectrum_id']))

        refdata = data[data['isotopelabel_id'] == reference]['quant_allcorrected']
        sumrefdata = refdata.sum()
        refspectraquantified = len(refdata)
        # spectra of the group that have no value for the reference label
        missingrefevents = totalquantevents - refspectraquantified

        if sumrefdata:
            refresult = (1, 0, 0)
        else:
            # if there are no quantified peptides from the reference label then we cannot calculate a
            # fold change
            refresult = (-1, -1, -1)
        datadict = {reference: [sumrefdata, refspectraquantified, qupm, refresult]}

        for isotopelabel_id in usedIsotopes:
            self.cfg.log.debug('assessing isotopelabel %s' % isotopelabel_id)
            if isotopelabel_id == reference:
                continue

            queryvalues = data[data['isotopelabel_id'] == isotopelabel_id]['quant_allcorrected']
            sumqueryvalues = queryvalues.sum()
            qssm = len(queryvalues)
            # spectra of the group that have no value for this label
            missingqueryevents = totalquantevents - qssm
            self.cfg.log.debug('length ref %s, length query %s' % (len(refdata), len(queryvalues)))

            if queryvalues.any() and refdata.any():
                # use the instance configuration like the rest of this method (this was a bare global
                # 'cfg' before, which only exists when the module is run as a script)
                minquantspectra = self.cfg.parameters['general']['minquantspectra']
                # both value lists are padded with zeros for the spectra that lack the label
                result = self.makeBootstrap(queryvalues.tolist() + [0] * missingqueryevents,
                                            refdata.tolist() + [0] * missingrefevents,
                                            minquantspectra)
                self.cfg.log.debug('bootstrap result for %s : %s ' % (protein_group_no, result))
            elif refdata.any():
                self.cfg.log.debug('there are no valid query values for %s: FC will be zero, as reference'
                                   ' is present' % protein_group_no)
                result = (0, -1, -1)
            else:
                result = (-1, -1, -1)
            datadict[isotopelabel_id] = [sumqueryvalues, qssm, qupm, result]

        protein2quantdata[protein_group_no] = datadict

    pBar.finish()
    return protein2quantdata
def performSimpleSumQuant(self, protein_group_nos, reference):
    """
    @brief perform simple sum ratio calculation using valid quant data from all spectra
    @param protein_group_nos list of protein group ids
    @param reference id of the isotope label whose summed signal is the denominator of the ratios
    @return protein2quantdata <dict>: protein_group_no -> {isotopelabel_id: [summed signal, number of
            quantified spectra, number of unique quantified peptides, result tuple]}.
            The result tuple is (1, 0, 0) for the reference label with signal,
            (sum query / sum reference, -1, -1) when both query and reference have signal, (0, -1, -1)
            when only the reference has signal and (-1, -1, -1) when the reference has no signal.
    """
    all_proteins_quantdata = self.hdf5quantprot.getAllProteinDatafromSpecQuant()
    peptidedatafromsets = self.hdf5quantprot.getPeptideDataforSets()
    # keeps fold change result,sum ion area, isotopelabel
    protein2quantdata = {}
    usedIsotopes = sorted(set(x['isotopelabel_id'] for x in all_proteins_quantdata))

    # NOTE(review): the progress bar label still says 'bootstrap quant' (kept unchanged)
    pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(protein_group_nos),
                               name='bootstrap quant').start()

    # scan through each protein group and perform quantification on quant values
    for idx, protein_group_no in enumerate(protein_group_nos):
        pBar.update(idx)
        self.cfg.log.debug('starting for proteingroup %s' % protein_group_no)

        # all quant rows of this protein group
        data = all_proteins_quantdata[all_proteins_quantdata['protein_group_no'] == protein_group_no]
        peptidedataforset = peptidedatafromsets[protein_group_no]
        quantuniquepeps = set(peptidedataforset[spectrum_id] for spectrum_id in data['spectrum_id'])
        qupm = len(quantuniquepeps)

        refdata = data[data['isotopelabel_id'] == reference]['quant_allcorrected']
        sumrefdata = refdata.sum()
        refspectraquantified = len(refdata)

        if sumrefdata:
            refresult = (1, 0, 0)
        else:
            # Without reference signal no ratio can be calculated.  The former early exit ('break') at
            # this point either stopped the whole protein loop - dropping this and all following protein
            # groups from the result - or filled in a single placeholder only.  The label loop below
            # already yields (-1, -1, -1) for every label in this situation, which is also what
            # performBootstrapQuant does, so the group is simply processed normally.
            refresult = (-1, -1, -1)
        datadict = {reference: [sumrefdata, refspectraquantified, qupm, refresult]}

        for isotopelabel_id in usedIsotopes:
            self.cfg.log.debug('assessing isotopelabel %s' % isotopelabel_id)
            if isotopelabel_id == reference:
                continue

            queryvalues = data[data['isotopelabel_id'] == isotopelabel_id]['quant_allcorrected']
            sumqueryvalues = queryvalues.sum()
            qssm = len(queryvalues)
            self.cfg.log.debug('length ref %s, length query %s' % (len(refdata), len(queryvalues)))

            if queryvalues.any() and refdata.any():
                ratio_result = sumqueryvalues / sumrefdata
                result = (ratio_result, -1, -1)
                self.cfg.log.debug('simple ratio result for %s : %s ' % (protein_group_no, result))
            elif refdata.any():
                self.cfg.log.debug('there are no valid query values for %s: FC will be zero, as reference'
                                   ' is present' % protein_group_no)
                result = (0, -1, -1)
            else:
                result = (-1, -1, -1)
            datadict[isotopelabel_id] = [sumqueryvalues, qssm, qupm, result]

        protein2quantdata[protein_group_no] = datadict

    pBar.finish()
    return protein2quantdata