def doEnzyme(self, fileiterator, datfile):
    '''
    @brief reads the enzyme section into a dictionary and passes the data to the datfile object
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    try:
        isbound = self.isBoundary
        self.logs.datlog.info('Loading and Parsing Enzyme')
        while 1:
            allparams = {}
            while not isbound(self.cargo):
                match = rx_enzymeline.search(self.cargo)
                if match:
                    allparams[match.group('key')] = match.group('value')
                elif len(self.cargo) > 2:
                    # lines without a key=value structure are stored under 'Side'
                    allparams['Side'] = self.cargo[:-1]
                self.cargo = fileiterator.next()
            datfile.hdfMascot.writeParameters(allparams, 'enzyme')
            yield ('boundary', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Called from doEnzyme')
        raise
def doETsummary(self, fileiterator, datfile):
    '''
    @brief reads the et_summary section into a dictionary and passes the data to the datfile object
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    try:
        isbound = self.isBoundary
        self.logs.datlog.info('Loading and Parsing et_summary')
        while 1:
            allparams = {}
            while not isbound(self.cargo):
                match = rx_summary.search(self.cargo)
                if match:
                    allparams[match.group('key')] = match.group('value')
                self.cargo = fileiterator.next()
            datfile.addETsummary(allparams)
            yield ('boundary', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Called from doETsummary')
        raise
def doIndex(self, fileiterator, datfile):
    '''
    @brief parses the index section of the dat file
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    try:
        isbound = self.isBoundary
        self.logs.datlog.info('Loading and Parsing index')
        while 1:
            index = []
            while not isbound(self.cargo):
                try:
                    key, value = self.cargo.split('=')
                    index.append((key, int(value)))
                except:
                    # skip lines that are not simple key=value pairs with an integer value
                    pass
                self.cargo = fileiterator.next()
            datfile.addIndex(index)
            yield ('boundary', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Called from doIndex')
        raise
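# For orientation: the index section parsed by doIndex() is assumed to consist of
# plain 'key=value' lines whose values are integers (counts/offsets for the other
# dat-file sections), e.g. (illustrative values only, not taken from a real file):
#
#   parameters=4
#   peptides=1406
#   proteins=25678
#
# Lines that do not split on a single '=' into a key and an integer value are skipped.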
def doQuery(self, fileiterator, datfile):
    '''
    @brief reads the query sections into a dictionary and passes the data to the datfile object
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    query = 'None'
    try:
        isbound = self.isBoundary
        while 1:
            allparams = {}
            match = rx_query.search(self.cargo)
            query = match.group('ID')
            # if match.group('ID') == '1':
            #     self.logs.datlog.info('Loading and Parsing queries')
            allparams['query'] = query
            while not isbound(self.cargo):
                match = rx_paramline.search(self.cargo)
                if match:
                    allparams[match.group('key')] = match.group('value')
                self.cargo = fileiterator.next()
            datfile.addQuerySpectra(allparams)
            qry = int(query)
            datfile.spectra[qry]['spec_id'] = int(datfile.spectra[qry]['msmsid'][1:])
            datfile.spectra[qry]['rt'] = float(datfile.spectra[qry]['start'])
            yield ('postquery', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'doQuery: last query = %s' % query)
        raise
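# NOTE: rx_enzymeline, rx_summary, rx_query, rx_paramline and rx_peptide are
# module-level compiled regular expressions defined elsewhere in this file.
# A minimal sketch of the shape they are assumed to have (hypothetical patterns,
# for illustration only - the real ones may differ):
#
#   rx_query     = re.compile(r'name="query(?P<ID>\d+)"')
#   rx_paramline = re.compile(r'^(?P<key>[^=]+)=(?P<value>.*)$')
#
# i.e. each exposes the named groups ('ID' or 'key'/'value') that the parser
# methods above read via match.group(...).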
def __init__(self, filedic, hdf5, cfg, logs, searchID, quantMethod):
    '''
    @brief initialise the Datfile object
    @param filedic <dictionary>: containing the full path to the dat file and other file data
    @param hdf5 <hdf5 object>: containing all the writing methods for hdf5 output
    @param cfg <cfg object>: containing all the running parameters
    @param logs <logging object>: controlling the logging of events
    @param searchID <integer>: the Mascot search ID
    @param quantMethod <dictionary>: the quantification method used for the search
    '''
    self.filedic = filedic
    self.searchid = searchID
    self.quantMethod = quantMethod
    datfilename = filedic['dat']

    if datfilename == str(None):
        raise ExHa.FileNotFoundException('no resulting datfile found with search')

    self.datfilename = datfilename
    self.cfg = cfg
    self.logs = logs
    self.hdfMascot = hdf5
    self.dataDir = filedic['datpath'].parent

    self.spectra = {}
    self.peptidecounter = 0
    self.failedsequences = 0
    self.sequences = {}
    self.seq2acc = {}
    self.hookpeps = []
    self.hookppm = []
    self.analtimes = dict(first=40, last=0, early=[])
    self.stats = dict(numpeps=0, numfailedpeps=0, numspectra_nopeps=0)
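# For reference, the filedic passed in is assumed to look like the dictionary
# built in jobcontrol() further below, e.g. (illustrative values only):
#
#   dict(dat='F012345.dat', datpath=Path('/data/F012345.dat'),
#        hdf5='F012345.hdf5', hdf5path=Path('/data/F012345.hdf5'))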
def startParsing(self):
    '''
    @brief parses the dat file referenced by the datfile object
    '''
    datPath = str(self.datfileobj.filedic['datpath'])
    self.logs.datlog.info('Starting parsing file %s' % datPath)
    fin = open(datPath, 'r')
    try:
        self.cargo = fin.next()
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Empty dat file.')
        raise
def doPostQuery(self, fileiterator, datfile):
    '''
    @brief performs tasks after all the query data is loaded
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    try:
        datfile.doPostQuery()
        while 1:
            # Used to trigger the QC of the spectrum if required.
            yield ('boundary', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Called from doPostQuery')
        raise
def doETpeptides(self, fileiterator, datfile):
    '''
    @brief reads the et_peptides section into a dictionary and passes the data to the datfile object
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    key = 'None'
    try:
        isbound = self.isBoundary
        self.logs.datlog.info('Loading and Parsing et_peptides')
        collection = {}
        lastquery = ''

        # create the etpeptides table in the HDF5 file
        datfile.createETpeptidesTable()

        while 1:
            while not isbound(self.cargo):
                match = rx_peptide.search(self.cargo)
                if match:
                    value = match.group('value')
                    key = match.group('key')
                    if value == '-1':
                        # no matching peptide
                        datfile.stats['numspectra_nopeps'] += 1
                    else:
                        query = key.split('_')[0]
                        if query != lastquery:
                            lastquery = query
                            if collection:
                                # add the query number to addpeptides
                                datfile.addETpeptides(collection)
                                collection = {}
                        collection[key] = value
                self.cargo = fileiterator.next()

            # add last peptide
            if collection:
                # add the query number to addpeptides
                datfile.addETpeptides(collection)

            yield ('postpeptides', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'doETpeptides, last peptide = %s' % key)
        raise
def doUnimod(self, fileiterator, datfile):
    '''
    @brief reads the unimod section (XML) and passes the data to the datfile object
    @param fileiterator <file object>: linked to the dat file
    @param datfile <datfile object>: containing all the dat file data and processing methods
    '''
    try:
        isbound = self.isBoundary
        self.logs.datlog.info('Loading and Parsing Unimod data')
        while 1:
            xml = []
            while not isbound(self.cargo):
                xml.append(self.cargo)
                self.cargo = fileiterator.next()
            datfile.addUnimod(xml)
            yield ('boundary', self.cargo)
    except Exception, genEx:
        # catch exceptions and add some context data
        ExHa.addContext(genEx, 'Called from doUnimod')
        raise
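# All of the section handlers above rely on self.isBoundary(line).  Mascot dat
# files are MIME multipart documents, so isBoundary() is assumed to test whether
# a line is the MIME boundary separating two sections - roughly (hypothetical
# sketch, the real implementation and attribute name may differ):
#
#   def isBoundary(self, line):
#       # self.boundary would hold the boundary string from the Content-Type header
#       return line.startswith('--' + self.boundary)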
ret = cfg.evaluateCommandLineArgs(sys.argv)

try:
    cfg.scalePpmMda()
    dataDir = cfg.parameters['runtime']['datadir']

    logParam = cfg.parameters['logging']
    logPath = Path(dataDir.joinpath(logParam['logdir']))
    if not logPath.exists():
        logPath.mkdir(parents=True)
    logFile = logPath.joinpath(logParam['logfile'])

    logger = Logger(logFile, logParam['loglevel'], logParam['screenlevel'], False)
    logger.setMascotParserLogs()

    jobcontrol(cfg, logger)

except ExHa.UsageError as useEx:
    ExHa.reformatException(useEx)
    print useEx.context
except Exception as genEx:
    ExHa.reformatException(genEx)
    errorFile = Path(cfg.parameters['runtime']['hdf5file']).stem + '.error'
    ExHa.exportError2File(genEx, cfg.parameters['runtime']['datadir'].joinpath(errorFile))
    if logs:
        logs.datlog.warning(ExHa.oneLineRepr(genEx))
    else:
        print ExHa.multiLineRepr(genEx)
def run(self, jobObj):
    """
    @brief run the analysis
    @param jobObj <object>: containing all the job data
    """
    # needs to catch exceptions here so that LUX doesn't receive an exception
    hdf = ''
    xraw = ''
    tempFile = ''
    config = self.cfg
    logs = self.logs
    try:
        hdf = ''
        maxspec = jobObj.args['maxspec']
        logs.log.info("Starting PyMSSafe runner")

        # assign objects to the file interfaces
        rawpath = jobObj.srcpth.absolute()
        namebase = rawpath.stem
        hdfpath = jobObj.dstpth.absolute().joinpath(namebase + '.hdf5')
        runname = namebase
        tempFile = jobObj.dstpth.absolute().joinpath(str(config.parameters['runtime']['pid']))

        if tempFile.exists():
            tempFile.unlink()

        logs.log.info('started job: %s' % rawpath.name)
        watch = Stopwatch()

        if not rawpath.exists():
            logs.log.info('could not find file: %s' % str(rawpath))
            logs.log.info('Stopped')
            return {'code': 1, 'error': 'could not find file: %s' % str(rawpath)}

        # get quant information if file is quan type
        quantMeth = self.extractQuant(jobObj.srcpth.name.upper())
        if quantMeth == -1:
            raise ExHa.QuantificationMethodError('Unable to find "%s" quantification method' %
                                                 self.cfg.parameters['runtime']['quant'].upper())
        elif quantMeth == -2:
            raise ExHa.QuantificationMethodError('Unable to find valid quantification method in file name (%s)' %
                                                 jobObj.srcpth.name.upper())

        xraw = XRawFile(str(rawpath))
        if config.parameters['general']['skipscanevents']:
            xraw.skipScanEvents = config.parameters['general']['skipscanevents']

        # opens hdf5 file for writing
        hdf = hdf5Base(hdfpath, True, True)
        hdf.appendOpen()
        self.createHDFtables(hdf)

        # save the config entries
        hdf.appendRows('/rawdata/config', config.convertConfig())

        # create datamanager object
        config.rawfile = jobObj.srcpth
        dataman = Datamanager(xraw, config, logs, hdf, quantMeth, str(tempFile))
        dataman.maxspec = maxspec
        dataman.addLUXdata(jobObj, config.parameters['runtime']['pid'])

        # run the analysis in the datamanager
        ok = dataman.processSpectra(quantMeth, watch, runname)
        if ok['code'] != 0:
            raise ExHa.SpectraProcessingError(ok['error'])

        # run the XIC generation
        logs.log.info('Processing XIC data for %d MS/MS events' % len(dataman.specs))
        ok = dataman.processXICs()
        if ok['code'] != 0:
            raise ExHa.XICprocessingError(ok['error'])
        watch.rec('Processing XIC')

        logs.log.info('Writing HDF5 indexes')
        hdf.indexTable('/rawdata/msmsheader', ['spec_id'])
        hdf.close()
        xraw = ''
        hdf = ''

        watch.stop()
        logs.log.info('job took: %s' % watch.oneLineFormat())
        tempFile.unlink()
    except ExHa.MSdataConsistancyError, msg:
        self.shutdown(hdf, xraw, tempFile)
        logs.log.warning('error: pyMSsafe Data error: %s' % msg)
        return {'code': 2, 'error': 'pyMSsafe Data error: %s' % msg}
def jobcontrol(cfg, logs):
    '''
    @brief takes the dat file path, executes the parsing and the import
    @param cfg <ConfigManager>: all the configuration parameters including the sample data
    @param logs <loggingManager>: to control the logging of events
    '''
    try:
        msg = 'Setting files'
        datFileName = cfg.parameters['runtime']['datfile']
        hdf5FileName = cfg.parameters['runtime']['hdf5file']

        logs.setuplog.info('started job: %s' % datFileName)
        watch = Stopwatch()

        filedic = dict(dat=datFileName, datpath=dataDir.joinpath(datFileName),
                       hdf5=hdf5FileName, hdf5path=dataDir.joinpath(hdf5FileName))
        searchID = int(filedic['datpath'].stem[1:])

        hdfMascot = HDF5Mascot(hdfFilePath=filedic['hdf5path'])
        hdfMascot.appendOpen()
        importGroup = hdfMascot.checkDatFilePresence(filedic['dat'])

        # test the quantification method already in the hdf5 file
        msg = 'Setting quantification method'
        quantMethID = hdfMascot.getH5DFQuantMeth()
        quantHandler = QuantMethods()
        quantMethod = quantHandler.getMethodByID(quantMethID)
    except Exception as genEx:
        ExHa.addContext(genEx, 'jobcontrol Error: %s' % msg)
        raise

    try:
        # control the deletion of existing data
        msg = 'Deleting existing HDF5 data'
        hdfMascot.deleteAllMascotImports(0)
        # if overwrite:
        #     # delete all Mascot data whatever the source
        #     hdfMascot.deleteAllMascotImports(0)
        # elif importGroup:
        #     # delete previous version of this data
        #     hdfMascot.deleteMascotImport(importGroup)

        importGroup = filedic['dat'].replace('.', '_')
        hdfMascot.createTables(importGroup, searchID, 0)
        hdfMascot.writeConfig(cfg.convertConfig())

        datfile = Datfile(filedic, hdfMascot, cfg, logs, searchID, quantMethod)
        logs.setuplog.info('searchid: %d, dat file: %s, hdf5 file: %s ' %
                           (searchID, datfile.datfilename, filedic['hdf5']))

        msg = 'Parsing data'
        datparser = DatParser(datfile, cfg, logs)
        datparser.startParsing()
        watch.rec('parser')

        # post parsing processing
        logs.qclog.info('Effective Run-time analysis')
        datfile.doTimeAnalysis()

        msg = 'Find top protein hit'
        logs.qclog.info(msg)
        # tophit = db.getMascotTopHit(searchID)
        if datfile.seq2acc:
            datfile.findBestProtein()
    except Exception as genEx:
        ExHa.addContext(genEx, 'jobcontrol Error: %s' % msg)
        raise
    finally:
        logs.setuplog.info('Closing HDF5')
        hdfMascot.close()

    watch.rec('processing')
    watch.stop()
    logs.setuplog.info('job took %s' % watch.format())
    return
class mgftools:
    def __init__(self, hdf5file):
        """
        @brief initialises the mgftools class
        @param hdf5file <string/path>: path for the hdf5 file to analyse
        """
        self.cfgFilters = cfg.parameters['msmsfilters']
        self.maxint = 0
        self.proton = cfg.parameters['general']['proton']
        self.neutron = cfg.parameters['general']['neutron']
        self.hdf5 = hdf5Base(hdf5file)
        self.hdf5.readOpen()
        self.filters = dict(ten=self.tenpercent, zone=self.zonefilter, repion=self.repionfilter,
                            deconv=self.deconvolute, mascot=self.mascot, neutralloss=self.neutrallossfilter,
                            multi=self.multichargefilt, immonium=self.immoniumfilter, none='')
        self.isos = self.hdf5.readTable('/rawdata/isotopes')

        frags = self.getActivationTypes()
        if len(frags) == 1:
            if frags == ['HCD']:
                # HCD only method
                usefilts = self.cfgFilters['filt_hcd']
                self.remove = self.cfgFilters['rem_hcd']
            else:
                # CID/PQD only method
                usefilts = self.cfgFilters['filt_other']
                self.remove = self.cfgFilters['rem_other']
        else:
            # mixed method
            usefilts = self.cfgFilters['filt_mixed_hcd_other']
            self.remove = self.cfgFilters['rem_mixed_hcd_other']

        if len(self.isos) == 0:
            # no isotope data, so remove the reporter ion filter if it is configured
            repionIdx = -1
            for idx in range(len(usefilts)):
                if usefilts[idx] == 'repion':
                    repionIdx = idx
                    break
            if repionIdx != -1:
                usefilts.pop(repionIdx)

        self.usefilts = usefilts
        self.hdf5.close()

    def close(self):
        self.hdf5.close()

    def getActivationTypes(self):
        data = self.hdf5.getDataEqual('/rawdata/parameters', 'parameter', 'Activation Type')
        if len(data) == 0:
            data = self.hdf5.getDataEqual('/rawdata/parameters', 'parameter', 'activation')

        types = {}
        for act in data.flat:
            activation = act['value'].upper()
            types[activation] = types.get(activation, 0) + 1

        return types.keys()

    def export(self, hcdonly=0):
        """
        @brief creates an mgf file for the MS/MS spectra in the hdf5 file
        @param hcdonly <integer>: flag to switch output to only HCD spectra, bypassing the normal export filters
        """
        if hcdonly:
            remove = ['CID']
            filters = ['none']
        else:
            remove = self.remove
            filters = self.usefilts

        hdf = self.hdf5
        mgfFile = hdf.filePath.parent.joinpath(hdf.filePath.stem + '.mgf')
        # extra = path(hdf.filepath.splitext()[0] + '.txt')
        # self.fextra = extra.open('w')
        # self.fextra.write('spec_id\tmz\tinten\trel_inten\tion\texp_mz\n')

        mgfOut = open(str(mgfFile), 'w')
        mgfOut.write('#Removed spectra = %s, filtering = %s\n' % (remove, filters))
        spec = 0

        # read parameters from hdf5 file
        try:
            hdf.appendOpen()
            headers = hdf.readTable('/rawdata/msmsheader')
            runTimeEntry = hdf.getDataEqual('/rawdata/parameters', 'parameter', 'MS Run Time (min)')
            if len(runTimeEntry) == 0:
                raise ExHa.MGFprocessingError('MGF Error: Could not find "MS Run Time (min)" parameter in HDF5 file.')
            runtime = runTimeEntry[0]['value']
            units = self.readUnitsOK()

            # add new table for the deconvoluted spectrum data
            hdf.removeTable('/rawdata/deconvions')
            hdf.createTable('rawdata', 'deconvions', 'DeconvIons')

            ident = []
            for frag in units[1]:
                # find all the frag methods to be used in identification
                if 'I' in frag['use']:
                    ident.append(frag['order'])

            logger.log.info('Reading %d spectra from %s' % (len(headers), hdf.filePath.name))

            if 'deconv' in filters:
                deconv = 1
            else:
                deconv = 0

            pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(headers),
                                       name='Create .mgf').start()

            for idx, h in enumerate(headers):
                if hcdonly:
                    if h['fragmeth'] != 'HCD':
                        continue
                elif not h['order'] in ident:
                    continue

                pBar.update(idx)

                # get spectrum data
                spec = h['spec_id']
                spectrum = hdf.getDataEqual('/rawdata/ions', 'spec_id', spec)
                if deconv:
                    # need extra column for charge information
                    spectrum = self.addChargeColumn(spectrum)

                data = hdf.getDataGeneral('/rawdata/specparams',
                                          '(spec_id == %i) & (parameter == "%s")' % (spec, 'setmass1'))
                setmass = data[0]['value']
                data = hdf.getDataGeneral('/rawdata/specparams',
                                          '(spec_id == %i) & (parameter == "%s")' % (spec, 'frag1'))
                frag = data[0]['value']
                try:
                    self.maxint = max(spectrum['inten'])
                except:
                    self.maxint = 0

                # construct title values list
                rt = '%.3f' % h['rt']
                use = units[1][h['order'] - 1]['use']
                pretitle = ''
                if use == 'IQ':
                    # spec is both ID and Quan so use the normal msms ID
                    titles = ['msmsid:F%06d' % h['spec_id']]
                elif use == 'I':
                    if h['quan_spec'] == 0:
                        # no quant data so use spec_id
                        titles = ['msmsid:F%06d' % h['spec_id']]
                    else:
                        # spec is only for ident, find the quan spec
                        titles = ['msmsid:F%06d' % h['quan_spec']]
                        pretitle = '#CID=F%06d\n' % h['id_spec']
                elif use == 'Q':
                    titles = ['msmsid:F%06d' % h['quan_spec']]
                    pretitle = '#CID=F%06d\n' % h['id_spec']

                titles.append('rt:' + rt)
                titles.append('survey:S%06d' % h['survey_spec'])
                titles.append('parent:' + setmass)
                titles.append('AnalTime:' + runtime)
                titles.append('Activation:' + frag.upper())
                titleline = 'TITLE=%s\n' % ','.join(titles)

                if h['precmz'] > 0:
                    pepmass = h['precmz']
                elif h['precmz_surv'] > 0:
                    pepmass = h['precmz_surv']
                else:
                    pepmass = h['monomz']
                if pepmass == 0:
                    continue

                for filt in filters:
                    if len(spectrum) > 5 and self.filters[filt]:
                        spectrum = self.filters[filt](h, spectrum)  # filter for mascot interference

                ionList = []
                if len(spectrum) > 2:
                    mgfOut.write(pretitle)
                    mgfOut.write('BEGIN IONS\n')
                    mgfOut.write(titleline)
                    mgfOut.write('PEPMASS=%f\n' % pepmass)
                    mgfOut.write('CHARGE=%d+\n' % h['charge'])
                    if deconv:
                        for pt in spectrum:
                            if pt['inten'] == 0:
                                continue
                            mgfOut.write('%f %f %s\n' % (pt['mz'], pt['inten'], pt['charge']))
                            ionList.append(dict(spec_id=pt['spec_id'], mz=pt['mz'], inten=pt['inten'],
                                                charge=pt['charge']))
                    else:
                        for pt in spectrum:
                            if pt['inten'] == 0:
                                continue
                            mgfOut.write('%f %f\n' % (pt['mz'], pt['inten']))
                            ionList.append(dict(spec_id=pt['spec_id'], mz=pt['mz'], inten=pt['inten']))
                    mgfOut.write('END IONS\n\n')
                    if len(ionList) > 0:
                        hdf.appendRows('/rawdata/deconvions', ionList)
            pBar.finish()
        except ExHa.MGFprocessingError, czEx:
            if spec:
                ExHa.addContext(czEx, 'Raised whilst processing spectrum %i' % spec)
            raise
        except Exception as genEx:
            ExHa.reformatException(genEx)
            if spec:
                ExHa.addContext(genEx, 'Raised whilst processing spectrum %i' % spec)
            raise
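# For reference, each exported spectrum is written by export() as a standard MGF
# record along the following lines (all values illustrative only):
#
#   #CID=F000123
#   BEGIN IONS
#   TITLE=msmsid:F000124,rt:1234.567,survey:S000120,parent:650.3210,AnalTime:90,Activation:HCD
#   PEPMASS=650.321000
#   CHARGE=2+
#   350.123456 1234.500000
#   ...
#   END IONS
#
# The optional leading '#CID=...' line links a quantification spectrum back to its
# identification spectrum, as set up via pretitle above.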
    for f in dataDir.glob(fileFilter):
        if not f.is_file():
            # skip any directories
            continue
        # if f.name[:4] in ['6528', '1814', '2032']: continue

        mgf = mgftools(f)
        logger.log.info('Filename: %s' % f.name)
        if hcdOnly:
            logger.log.info('Export HCD data only')
        else:
            logger.log.info('Using filters: %s' % str(mgf.usefilts))
        rtn = mgf.export(hcdOnly)
        mgf.close()

    if f == 0:
        # f is assumed to keep its initial value (0, set before this excerpt) when no files matched
        raise ExHa.MGFprocessingError('no files found for: %s' % str(dataDir / fileFilter))

except ExHa.UsageError as useEx:
    ExHa.reformatException(useEx)
    logger.log.info(useEx.context)
except Exception, genEx:
    ExHa.reformatException(genEx)
    if f:
        ExHa.exportError2File(genEx, f.parent.joinpath(f.stem + '.error'))
    else:
        ExHa.exportError2File(genEx, dataDir.joinpath('errors.error'))
    logger.log.info(ExHa.multiLineRepr(genEx))

logger.log.info('finished')
def updateHDF5(self):
    """
    @brief controls the updating of the data to the hdf5 results file
    @return finalMessage <string>: constructed from the protein data; this is the RESULT stored in the DB
    """
    pep2unique = self.pep2unique
    baseContext = 'updateHDF5: '
    context = 'updateHDF5'
    try:
        # find the peptide sequences that are being imported
        usedPeps = self.setsManager.findUsedPeptides()
        logger.log.info('there are %s usedPeps' % len(usedPeps))

        context = baseContext + 'Retrieving sample IDs'
        sample_ids = range(1, len(self.hdfFiles) + 1)

        # create proteinset and proteinhit data
        starting_protein_group_no = 1
        self.setsManager.setProteinGroupNo(starting_protein_group_no)

        logger.log.info('adding protein group data to HDF5')
        logger.log.debug(str(self.hdfFiles.keys()))
        spectrum_id = 0
        peptide_id = 0
        hdfFileList = self.hdfFiles.keys()
        hdfFileList.sort()
        for key in hdfFileList:
            baseContext += '%s: ' % key
            logger.log.log(logger.PROCESS,
                           'Integrating Spectrum, Peptide & Quantification data from %s' % key)
            # collect fileData
            hdf = self.hdfFiles[key]
            hdfObj = hdf.hdfObject

            # set the current sample_id from the list of IDs extracted from the DB
            current_sample_id = sample_ids.pop()

            hdf.acquired_spectra, hdf.mascot_matched_spectra, numIsotopes, runTime = hdfObj.getNumbers()

            # read the Mascot data
            context = baseContext + 'Reading Mascot data'
            tmp = hdfObj.readImporterData(usedPeps, hdf)
            peptides = tmp[0]
            queryDict = tmp[1]
            headerArray = tmp[2]
            quanArray = tmp[3]

            hdf.spectra_in_qc_proteins = len(peptides)

            logger.log.debug('getting spectrum_ids')
            context = baseContext + 'Retrieving spectrum IDs'

            acqTime, hdf.idAct, hdf.quanAct = hdfObj.getTimeAndActivation()

            # create blank lists to hold data for writing to hdf5 file
            spectrum_list = []
            peptide_list = []
            quant_list = []

            logger.log.info('collating spectrum, peptide & quant data')
            pBar = progBar.ProgressBar(widgets=progBar.name_widgets, maxval=len(queryDict),
                                       name='collate data').start()
            for idx, q in enumerate(queryDict):
                # loop round all the required spectra
                pBar.nextPrimary()
                context = baseContext + 'query %i: Setting spectrum data' % q
                # extract a spectrum_id from the list
                spectrum_id += 1
                query = queryDict[q]
                spec = int(query['spec_id'])

                context = baseContext + 'spectrum %i: Updating DB with spectrum data' % spec
                # add spectrum data to spectrum_list
                header = self.filterArrayEqual(headerArray, 'spec_id', spec)
                spectrum_list.append(self.makeSpectrumDict(spectrum_id, current_sample_id, query,
                                                           acqTime, header))

                # find the appropriate peptides
                pepList = peptides[q]
                logger.log.debug('there are %s in peplist %s' % (len(pepList), str(pepList)))
                quantFound = 0

                # this list will hold all peptides returned from makePeptideDictList and then filter
                # those non-rank1 equivalents based on the score of the rank 1 peptide
                tmplist = []
                for pep in pepList:
                    # find the sets that the peptide belongs to and add to the peptide_list
                    sets = self.setsManager.peptide2set[pep['peptide']]
                    context = baseContext + 'spectrum %i: Creating peptide data entries for hdf5' % spec
                    tmp, qf = self.makePeptideDictList(spectrum_id, pep, query, sets, hdf, pep2unique)
                    tmplist.extend(tmp)
                    peptide_list += tmp
                    quantFound += qf

                # only keep rank1 equivalent peptides (based on score)
                tmplist.sort(key=lambda x: x['rank'])
                toprankscore = tmplist[0]['score']
                tmplist = [x for x in tmplist if x['score'] == toprankscore]

                if quantMethID and quantFound:
                    # extract quantification data for the spectrum
                    context = baseContext + 'spectrum %i: Creating quantitation data entries for DB' % spec
                    newquant, deltas = self.makeQuantDictLists(spectrum_id, spec, tmplist, header,
                                                               quanArray, hdf)
                    quant_list += newquant

                    if quantSource == 'ms2':
                        context = baseContext + 'spectrum %i: Adding reporter ion delta data' % spec
                        hdf.addReporterDeltas(deltas)
            pBar.finish()

            # calculate statistics
            context = baseContext + 'Calculating statistics'
            hdf.calcReporterStats()
            context = baseContext + 'Calculating delta m/z for fragment ions'

            context = baseContext + 'Updating sample table (%i)' % current_sample_id
            sample_data = hdf.getSampleDataDict(current_sample_id, key, runTime)
            hdf5results.writeSample(sample_data)

            self.importData.combineStatistics(hdf)

            # write data to HDF5
            context = baseContext + 'Updating spectrum table'
            logger.log.info('updating HDF5 with spectrum data')
            hdf5results.writeSpectrum(spectrum_list)

            if quantMethID:
                context = baseContext + 'Updating specquant table'
                logger.log.info('updating HDF5 with quant data')
                hdf5results.writeSpecQuant(quant_list)

            context = baseContext + 'Retrieving peptide IDs'
            logger.log.info('updating HDF5 with peptide data')
            for pepdata in peptide_list:
                pepdata['peptide_id'] = peptide_id
                peptide_id += 1

            context = baseContext + 'Updating peptide table'
            hdf5results.writePeptide(peptide_list)
            hdf5results.createIndexes()

        logger.log.info('finalising HDF5 entries')
        hdf5results.writeFDRdata(self.importData.score2fdr, 'peptide')
        hdf5results.writeFDRdata(self.importData.proteinscore2fdr, 'protein')

        topScoringProteinInfo = self.setsManager.addPeptideSetDBdata(hdf5results,
                                                                     self.importData.proteinscore2fdr)
        runtimedata = self.importData.getSummaryStatisticsDict()
        hdf5results.writeStatistics(runtimedata)

        finalMessage = 'queries matched: %i / %s (%.1f%%) ' % (
            runtimedata['spectra_in_qc_proteins'], runtimedata['mascot_matched_spectra'],
            (runtimedata['spectra_in_qc_proteins'] / float(runtimedata['mascot_matched_spectra'])) * 100)
        finalMessage += 'spectra quantified: %i top hit %s (%s) ' % (runtimedata['quantified_spectra'], '', '')
        finalMessage += 'with total score %f and %i matched peptides (hook AND non hook)' % \
                        (topScoringProteinInfo[0], topScoringProteinInfo[2])

        baseContext = 'updateHDF5: '
        context = baseContext + 'Finalising HDF5 entries'
    except Exception, genEx:
        # make sure that there aren't any permanent changes
        ExHa.addContext(genEx, context)
        finalMessage = 'Error: %s' % ExHa.oneLineRepr(genEx)
        raise
        msg = 'merged sample'
    else:
        isMerged = False
        msg = 'single sample'

    importer = DATimporter(cfg, logger)
    logger.log.info(msg)
    searchDict = {}

    for idx, hdf5file in enumerate(searches):
        searchData = {}
        searchData['hdf5name'] = hdf5file
        searchData['archivepath'] = cfg.parameters['runtime']['datadir']
        fullPath = searchData['archivepath']

        fi = fullPath.joinpath(searchData['hdf5name'])
        if not fi.exists():
            raise ExHa.FileNotFoundException('Missing file: %s' % str(fi))
        logger.log.info('Reading: %s' % str(fi))
        importer.addHDFfile(fi, idx + 1, logger)
    sw.rec('file loading')

    fdrthreshold = cfg.parameters['general']['fdrthreshold']
    peptidescoreatthreshold, score2fdr = importer.calculateFDR(importer.setsManager.FDRdata, fdrthreshold)
    importer.importData.score2fdr = score2fdr

    importer.finaliseProteins(peptidescoreatthreshold)

    proteinscoreatthreshold, score2fdr = importer.calculateFDR(importer.setsManager.proteinFDRdata, fdrthreshold)
    importer.importData.proteinscore2fdr = score2fdr

    finalMessage = importer.updateHDF5()
    allfractionbgratios = {}
    for isotopelabel_id, data in allsumionratiodata.iteritems():
        allfractionbgratios[isotopelabel_id] = np.median(data)

    # second normalization so that the bg-ratios all add to 1; take the total once,
    # before the loop, so later labels are not divided by an already-normalised sum
    # (a small worked example follows after this try/except block)
    total = sum(allfractionbgratios.values())
    for isotopelabel_id, data in allfractionbgratios.iteritems():
        allfractionbgratios[isotopelabel_id] = data / total

    logger.log.debug('allfractionbgratios are %s' % str(allfractionbgratios))

    for corrects2iquantob in corrects2iquantoblist:
        # perform correction for each of the analyzed .hdf5 files.
        s2icorrecteddata = corrects2iquantob.performS2Icorrection(allfractionbgratios)
        corrects2iquantob.hdf5corrects2iquant.updates2ivalues(s2icorrecteddata)
    hdf5corrects2iquant.close()
except ExHa.czException as czEx:
    ExHa.reformatException(czEx)
    ExHa.addContext(czEx, 'Error during corrects2iquant run')
    ExHa.exportError2File(czEx, cfg.parameters['runtime']['datadir'] / Path('errors.error'))
    if logger:
        logger.log.warning(ExHa.oneLineRepr(czEx))
    else:
        print ExHa.multiLineRepr(czEx)
except Exception as genEx:
    ExHa.reformatException(genEx)
    ExHa.addContext(genEx, 'Error during corrects2iquant run')
    ExHa.exportError2File(genEx, cfg.parameters['runtime']['datadir'] / 'errors.error')
    if logger:
        logger.log.warning(ExHa.oneLineRepr(genEx))
    else:
        print ExHa.multiLineRepr(genEx)
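# A small worked example of the background-ratio normalisation above (toy numbers,
# illustrative only): if the per-label medians come out as
#   {126: 0.4, 127: 0.6, 128: 1.0}
# their sum is 2.0, so the normalised background ratios become
#   {126: 0.2, 127: 0.3, 128: 0.5}
# which add up to 1 and are then passed to performS2Icorrection() for each file.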
import sys
from pathlib import Path

# cellzome CommonUtils
sys.path.insert(0, '..')
from CommonUtils.tools import *
from CommonUtils.ConfigManager import pyMSsafeConfigManager
from CommonUtils.LoggingManager import Logger
from CommonUtils.hdf5Base import hdf5Base
from CommonUtils.QuantMethodHandler import QuantMethods
import CommonUtils.ExceptionHandler as ExHa

# pyMSsafe modules
try:
    from xRawFile import XRawFile
except ImportError, ieEx:
    ExHa.reformatException(ieEx)
    ExHa.addContext(ieEx, 'Xcalibur not set up properly')
    configPath = './pymssafe.cfg'
    cfg = pyMSsafeConfigManager(configPath)
    ret = cfg.evaluateCommandLineArgs(sys.argv)
    dataDir = cfg.parameters['runtime']['datadir']
    ExHa.exportError2File(ieEx, dataDir.joinpath('errors.error'))
from datastore import Datamanager


class pymssafe:
    def __init__(self, config):
        """
        @brief initialise the pyMSsafe controller
        @param config <ConfigManager object>: containing all the running parameters