Example 1
    def doEnzyme(self, fileiterator, datfile):
        '''
        @brief reads the enzyme section into a dictionary and passes the data to the datfile object
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        try:
            isbound = self.isBoundary

            self.logs.datlog.info('Loading and Parsing Enzyme')
            while 1:
                allparams = {}
                while not isbound(self.cargo):
                    match = rx_enzymeline.search(self.cargo)
                    if match:
                        allparams[match.group('key')] = match.group('value')
                    elif len(self.cargo) > 2:
                        allparams['Side'] = self.cargo[:-1]
                    self.cargo = fileiterator.next()
                datfile.hdfMascot.writeParameters(allparams, 'enzyme')
                yield ('boundary', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Called from doEnzyme')
            raise
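These do* methods are coroutine-style section handlers: each one consumes lines from the shared file iterator until isBoundary fires, hands the accumulated data to the datfile object, then yields a ('boundary', line) token so a dispatcher can route the next section. The same pattern recurs in doETsummary, doIndex and doUnimod below. A self-contained toy sketch of the idea (the regex and boundary test are stand-ins, not the project's actual rx_enzymeline or isBoundary):

import re

# stand-in for rx_enzymeline; assumption: 'key' and 'value' named groups
rx_kv = re.compile('^(?P<key>[^=]+)=(?P<value>.*)$')

def parse_section(lines, isbound):
    params = {}
    for line in lines:
        if isbound(line):
            # boundary reached: hand the finished section back
            yield params
            params = {}
        else:
            m = rx_kv.search(line)
            if m:
                params[m.group('key')] = m.group('value')

section = iter(['Title=Trypsin/P', 'Cleavage=KR', '--gc0p4Jq0M2Yt08jU534c0p'])
for block in parse_section(section, lambda l: l.startswith('--')):
    print(block)  # {'Title': 'Trypsin/P', 'Cleavage': 'KR'}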
Example 2
    def doETsummary(self, fileiterator, datfile):
        '''
        @brief reads the summary section into a dictionary and passes the data to the datfile object
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        try:
            isbound = self.isBoundary

            self.logs.datlog.info('Loading and Parsing et_summary')
            while 1:
                allparams = {}
                while not isbound(self.cargo):
                    match = rx_summary.search(self.cargo)
                    if match:
                        allparams[match.group('key')] = match.group('value')

                    self.cargo = fileiterator.next()
                datfile.addETsummary(allparams)
                yield ('boundary', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Called from doETsummary')
            raise
Example 3
    def doIndex(self, fileiterator, datfile):
        '''
        @brief parses index section of the dat file
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        try:
            isbound = self.isBoundary

            self.logs.datlog.info('Loading and Parsing index')
            while 1:
                index = []
                while not isbound(self.cargo):
                    try:
                        key, value = self.cargo.split('=')
                        index.append((key, int(value)))
                    except:
                        # skip lines that are not simple 'key=<integer>' pairs
                        pass
                    self.cargo = fileiterator.next()
                datfile.addIndex(index)
                yield ('boundary', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Called from doIndex')
            raise
Example 4
    def doQuery(self, fileiterator, datfile):
        '''
        @brief reads the query sections into dictionaries and passes the data to the datfile object
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        query = 'None'
        try:
            isbound = self.isBoundary
            while 1:
                allparams = {}
                match = rx_query.search(self.cargo)
                query = match.group('ID')
                # if match.group('ID') == '1':
                #     self.logs.datlog.info('Loading and Parsing queries')
                allparams['query'] = query
                while not isbound(self.cargo):
                    match = rx_paramline.search(self.cargo)
                    if match:
                        allparams[match.group('key')] = match.group('value')
                    self.cargo = fileiterator.next()

                datfile.addQuerySpectra(allparams)
                qry = int(query)
                datfile.spectra[qry]['spec_id'] = int(
                    datfile.spectra[qry]['msmsid'][1:])
                datfile.spectra[qry]['rt'] = float(
                    datfile.spectra[qry]['start'])
                yield ('postquery', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'doQuery: last query = %s' % query)
            raise
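doQuery pulls the numeric query ID out of the section header before collecting the key=value lines. The rx_query and rx_paramline patterns live elsewhere in the module; a hypothetical reconstruction, based only on how the named groups are used above (Mascot .dat files are MIME-like, with query sections announced as name="query123"):

import re

# hypothetical patterns; the real ones may differ in detail
rx_query = re.compile(r'name="query(?P<ID>\d+)"')
rx_paramline = re.compile(r'^(?P<key>\w+)=(?P<value>.*)$')

m = rx_query.search('Content-Type: application/x-Mascot; name="query123"')
print(m.group('ID'))  # 123
m = rx_paramline.search('rtinseconds=1274.5')
print('%s -> %s' % (m.group('key'), m.group('value')))  # rtinseconds -> 1274.5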
Example 5
    def __init__(self, filedic, hdf5, cfg, logs, searchID, quantMethod):
        '''
        @brief initialise the Datfile object
        @param filedic <dictionary>: containing the full path to the dat file and other file data
        @param hdf5 <hdf5 object>: containing all the writing methods for hdf5 output
        @param cfg <cfg object>: containing all the running parameters
        @param logs <loggingManager>: to control the logging of events
        @param searchID <integer>: the Mascot search ID
        @param quantMethod: the quantification method, as returned by QuantMethods.getMethodByID
        '''
        self.filedic = filedic
        self.searchid = searchID
        self.quantMethod = quantMethod
        datfilename = filedic['dat']
        if datfilename == str(None):
            raise ExHa.FileNotFoundException('no resulting datfile found with search')
        self.datfilename = datfilename
        self.cfg = cfg
        self.logs = logs
        self.hdfMascot = hdf5
        self.dataDir = filedic['datpath'].parent

        self.spectra = {}
        self.peptidecounter = 0
        self.failedsequences = 0
        self.sequences = {}
        self.seq2acc = {}
        self.hookpeps = []
        self.hookppm = []

        self.analtimes = dict(first=40, last=0, early=[])
        self.stats = dict(numpeps=0, numfailedpeps=0, numspectra_nopeps=0)
Example 6
    def startParsing(self):
        '''
        Parses the dat file referenced by self.datfileobj.filedic['datpath'].
        '''
        datPath = str(self.datfileobj.filedic['datpath'])
        self.logs.datlog.info('Starting parsing file %s' % datPath)
        fin = open(datPath, 'r')

        try:
            self.cargo = fin.next()
        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Empty dat file.')
            raise
Example 7
    def doPostQuery(self, fileiterator, datfile):
        '''
        @brief performs tasks after all the query data is loaded
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        try:
            datfile.doPostQuery()
            while 1:
                # Used to trigger the QC of the spectrum if required.
                yield ('boundary', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Called from doPostQuery')
            raise
Example 8
    def doETpeptides(self, fileiterator, datfile):
        '''
        @brief reads the peptide section into a dictionary and passes the data to the datfile object
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''
        key = 'None'
        try:
            isbound = self.isBoundary

            self.logs.datlog.info('Loading and Parsing et_peptides')
            collection = {}
            lastquery = ''

            # create the etpeptides table in the HDF5 file
            datfile.createETpeptidesTable()
            while 1:
                while not isbound(self.cargo):
                    match = rx_peptide.search(self.cargo)
                    if match:
                        value = match.group('value')
                        key = match.group('key')
                        if value == '-1':
                            # no matching peptide
                            datfile.stats['numspectra_nopeps'] += 1
                        else:
                            query = key.split('_')[0]
                            if query != lastquery:
                                lastquery = query
                                if collection:
                                    # add the query number to addpeptides
                                    datfile.addETpeptides(collection)
                                    collection = {}
                            collection[key] = value
                    self.cargo = fileiterator.next()
                # flush the last peptide collection after the section boundary
                if collection:
                    # add the query number to addpeptides
                    datfile.addETpeptides(collection)
                yield ('postpeptides', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'doETpeptides, last peptide = %s' % key)
            raise
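The core of doETpeptides is a group-by-query pass over keys shaped like q<query>_p<rank>: the collection is flushed to the datfile whenever the query prefix changes, with one final flush at the section boundary. A stripped-down, runnable sketch with made-up keys and values:

def group_by_query(pairs):
    collection, lastquery = {}, ''
    for key, value in pairs:
        query = key.split('_')[0]
        if query != lastquery:
            lastquery = query
            if collection:
                yield collection
                collection = {}
        collection[key] = value
    if collection:
        # flush the final query's data
        yield collection

pairs = [('q1_p1', 'PEPTIDEA'), ('q1_p2', 'PEPTIDEB'), ('q2_p1', 'PEPTIDEC')]
for block in group_by_query(pairs):
    print(block)  # {'q1_p1': ..., 'q1_p2': ...} then {'q2_p1': ...}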
Example 9
    def doUnimod(self, fileiterator, datfile):
        '''
        @brief reads the Unimod XML section and passes the data to the datfile object
        @param fileiterator <file object>: linked to the dat file
        @param datfile <datfile object>: containing all the dat file data and processing methods
        '''

        try:
            isbound = self.isBoundary

            self.logs.datlog.info('Loading and Parsing Unimod data')
            while 1:
                xml = []
                while not isbound(self.cargo):
                    xml.append(self.cargo)
                    self.cargo = fileiterator.next()
                datfile.addUnimod(xml)
                yield ('boundary', self.cargo)

        except Exception, genEx:
            # catch exceptions and add some context data
            ExHa.addContext(genEx, 'Called from doUnimod')
            raise
Example 10
    ret = cfg.evaluateCommandLineArgs(sys.argv)

    try:
        cfg.scalePpmMda()
        dataDir = cfg.parameters['runtime']['datadir']

        logParam = cfg.parameters['logging']
        logPath = Path(dataDir.joinpath(logParam['logdir']))
        if not logPath.exists():
            logPath.mkdir(parents=True)
        logFile = logPath.joinpath(logParam['logfile'])

        logger = Logger(logFile, logParam['loglevel'], logParam['screenlevel'],
                        False)
        logger.setMascotParserLogs()

        jobcontrol(cfg, logger)

    except ExHa.UsageError as useEx:
        ExHa.reformatException(useEx)
        print useEx.context
    except Exception as genEx:
        ExHa.reformatException(genEx)
        errorFile = Path(cfg.parameters['runtime']['hdf5file']).stem + '.error'
        ExHa.exportError2File(
            genEx, cfg.parameters['runtime']['datadir'].joinpath(errorFile))
        if logs:
            logs.datlog.warning(ExHa.oneLineRepr(genEx))
        else:
            print ExHa.multiLineRepr(genEx)
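Every handler follows the same error idiom: catch, annotate via ExHa.addContext, then re-raise with a bare raise so the original traceback survives. A minimal sketch of what such a helper might look like (an assumption; the real ExceptionHandler module is not shown in these excerpts):

def addContext(exc, msg):
    # accumulate context messages on the exception instance itself
    exc.context = getattr(exc, 'context', []) + [msg]

try:
    try:
        1 / 0
    except Exception, genEx:
        addContext(genEx, 'Called from demo')
        raise  # bare raise preserves the original traceback
except Exception, caught:
    print caught.context  # ['Called from demo']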
Example 11
    def run(self, jobObj):
        """
        @brief run the analysis
        @param job <object>: containing all the job data
        """
        # needs to catch exceptions here so that LUX doesn't receive an exception

        hdf = ''
        xraw = ''
        tempFile = ''
        config = self.cfg
        logs = self.logs

        try:
            maxspec = jobObj.args['maxspec']
            logs.log.info("Starting PyMSSafe runner")

            # assign objects to the file interfaces
            rawpath = jobObj.srcpth.absolute()
            namebase = rawpath.stem
            hdfpath = jobObj.dstpth.absolute().joinpath(namebase + '.hdf5')
            runname = namebase
            tempFile = jobObj.dstpth.absolute().joinpath(
                str(config.parameters['runtime']['pid']))
            if tempFile.exists():
                tempFile.unlink()

            logs.log.info('started job: %s' % rawpath.name)
            watch = Stopwatch()

            if not rawpath.exists():
                logs.log.info('could not find file: %s' % str(rawpath))
                logs.log.info('Stopped')
                return {
                    'code': 1,
                    'error': 'could not find file: %s' % str(rawpath)
                }

            # get quant information if file is quan type
            quantMeth = self.extractQuant(jobObj.srcpth.name.upper())

            if quantMeth == -1:
                raise ExHa.QuantificationMethodError(
                    'Unable to find "%s" quantification method' %
                    self.cfg.parameters['runtime']['quant'].upper())
            elif quantMeth == -2:
                raise ExHa.QuantificationMethodError(
                    'Unable to find valid quantification method in file name (%s)'
                    % jobObj.srcpth.name.upper())

            xraw = XRawFile(str(rawpath))
            if config.parameters['general']['skipscanevents']:
                xraw.skipScanEvents = config.parameters['general'][
                    'skipscanevents']

            # opens hdf5 file for writing
            hdf = hdf5Base(hdfpath, True, True)
            hdf.appendOpen()
            self.createHDFtables(hdf)

            # save the config entries
            hdf.appendRows('/rawdata/config', config.convertConfig())

            # create datamanager object
            config.rawfile = jobObj.srcpth
            dataman = Datamanager(xraw, config, logs, hdf, quantMeth,
                                  str(tempFile))
            dataman.maxspec = maxspec
            dataman.addLUXdata(jobObj, config.parameters['runtime']['pid'])

            # run the analysis in the datamanager
            ok = dataman.processSpectra(quantMeth, watch, runname)
            if ok['code'] != 0:
                raise ExHa.SpectraProcessingError(ok['error'])

            # run the XIC generation
            logs.log.info('Processing XIC data for %d MS/MS events' %
                          len(dataman.specs))
            ok = dataman.processXICs()
            if ok['code'] != 0:
                raise ExHa.XICprocessingError(ok['error'])
            watch.rec('Processing XIC')

            logs.log.info('Writing HDF5 indexes')
            hdf.indexTable('/rawdata/msmsheader', ['spec_id'])
            hdf.close()

            xraw = ''
            hdf = ''
            watch.stop()
            logs.log.info('job took: %s' % watch.oneLineFormat())
            tempFile.unlink()
        except ExHa.MSdataConsistancyError, msg:
            self.shutdown(hdf, xraw, tempFile)
            logs.log.warning('error: pyMSsafe Data error: %s' % msg)
            return {'code': 2, 'error': 'pyMSsafe Data error: %s' % msg}
Example 12
def jobcontrol(cfg, logs):
    '''
    @brief takes filepath, executes parse ... import
    @param cfg <ConfigManager>: all the configuration parameters including the sample data
    @param logs <loggingManager>: to control the logging of events
    '''
    try:
        msg = 'Setting files'
        datFileName = cfg.parameters['runtime']['datfile']
        hdf5FileName = cfg.parameters['runtime']['hdf5file']
        # dataDir is needed below to build the file paths (mirrors the main() setup)
        dataDir = cfg.parameters['runtime']['datadir']

        logs.setuplog.info('started job: %s' % datFileName)
        watch = Stopwatch()

        filedic = dict(dat=datFileName,
                       datpath=dataDir.joinpath(datFileName),
                       hdf5=hdf5FileName,
                       hdf5path=dataDir.joinpath(hdf5FileName))
        searchID = int(filedic['datpath'].stem[1:])

        hdfMascot = HDF5Mascot(hdfFilePath=filedic['hdf5path'])
        hdfMascot.appendOpen()
        importGroup = hdfMascot.checkDatFilePresence(filedic['dat'])

        # test the quantification method already in the hdf5 file
        msg = 'Setting quantification method'
        quantMethID = hdfMascot.getH5DFQuantMeth()
        quantHandler = QuantMethods()
        quantMethod = quantHandler.getMethodByID(quantMethID)

    except Exception as genEx:
        ExHa.addContext(genEx, 'jobcontrol Error: %s' % msg)
        raise

    try:
        # control the deletion of existing data
        msg = 'Deleting existing HDF5 data'
        hdfMascot.deleteAllMascotImports(0)
        # if overwrite:
        #     # delte all Mascot data whatever the source
        #     hdfMascot.deleteAllMascotImports(0)
        # elif importGroup:
        #     # delete previous version of this data
        #     hdfMascot.deleteMascotImport(importGroup)

        importGroup = filedic['dat'].replace('.', '_')

        hdfMascot.createTables(importGroup, searchID, 0)

        hdfMascot.writeConfig(cfg.convertConfig())

        datfile = Datfile(filedic, hdfMascot, cfg, logs, searchID, quantMethod)

        logs.setuplog.info('searchid: %d, dat file: %s, hdf5 file: %s ' %
                           (searchID, datfile.datfilename, filedic['hdf5']))

        msg = 'Parsing data'
        datparser = DatParser(datfile, cfg, logs)
        datparser.startParsing()
        watch.rec('parser')

        # post parsing processing
        logs.qclog.info('Effective Run-time analysis')
        datfile.doTimeAnalysis()

        msg = 'Find top protein hit'
        logs.qclog.info(msg)
        # tophit = db.getMascotTopHit(searchID)
        if datfile.seq2acc:
            datfile.findBestProtein()
    except Exception as genEx:
        ExHa.addContext(genEx, 'jobcontrol Error: %s' % msg)
        raise
    finally:
        logs.setuplog.info('Closing HDF5')
        hdfMascot.close()
        watch.rec('processing')

        watch.stop()
        logs.setuplog.info('job took %s' % watch.format())
    return
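jobcontrol derives the Mascot search ID from the dat file name via filedic['datpath'].stem[1:], which assumes the usual F<number>.dat naming convention. An illustration with a made-up path:

from pathlib import Path

datpath = Path('/data/F123456.dat')  # hypothetical file name
print(int(datpath.stem[1:]))         # 123456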
Example 13
File: mgf.py Project: hdinkel/isob
class mgftools:
    def __init__(self, hdf5file):
        """
        @brief initialises the mgftools class
        @param hdf5file <string/path>: path for the hdf5 file to analyse
        """
        self.cfgFilters = cfg.parameters['msmsfilters']
        self.maxint = 0
        self.proton = cfg.parameters['general']['proton']
        self.neutron = cfg.parameters['general']['neutron']

        self.hdf5 = hdf5Base(hdf5file)
        self.hdf5.readOpen()

        self.filters = dict(ten=self.tenpercent,
                            zone=self.zonefilter,
                            repion=self.repionfilter,
                            deconv=self.deconvolute,
                            mascot=self.mascot,
                            neutralloss=self.neutrallossfilter,
                            multi=self.multichargefilt,
                            immonium=self.immoniumfilter,
                            none='')
        self.isos = self.hdf5.readTable('/rawdata/isotopes')

        frags = self.getActivationTypes()

        if len(frags) == 1:
            if frags == ['HCD']:
                # HCD-only method
                usefilts = self.cfgFilters['filt_hcd']
                self.remove = self.cfgFilters['rem_hcd']
            else:
                # CID/PQD only method
                usefilts = self.cfgFilters['filt_other']
                self.remove = self.cfgFilters['rem_other']
        else:
            # mixed method
            usefilts = self.cfgFilters['filt_mixed_hcd_other']
            self.remove = self.cfgFilters['rem_mixed_hcd_other']

        if len(self.isos) == 0:
            # no isotope data in the file, so the reporter-ion filter cannot run
            if 'repion' in usefilts:
                usefilts.remove('repion')

        self.usefilts = usefilts
        self.hdf5.close()

    def close(self):
        self.hdf5.close()

    def getActivationTypes(self):

        data = self.hdf5.getDataEqual('/rawdata/parameters', 'parameter',
                                      'Activation Type')
        if len(data) == 0:
            data = self.hdf5.getDataEqual('/rawdata/parameters', 'parameter',
                                          'activation')

        types = {}
        for act in data.flat:
            activation = act['value'].upper()
            types[activation] = types.get(activation, 0) + 1

        return types.keys()

    def export(self, hcdonly=0):
        """
        @brief creates an mgf file for the MS/MS spectra in the hdf5 file
        @param hcdonly <integer>: flag to switch output to only HCD spectra bypassing the normal export filters
        """
        if hcdonly:
            remove = ['CID']
            filters = ['none']
        else:
            remove = self.remove
            filters = self.usefilts

        hdf = self.hdf5
        mgfFile = hdf.filePath.parent.joinpath(hdf.filePath.stem + '.mgf')
        # extra = path(hdf.filepath.splitext()[0] + '.txt')
        # self.fextra = extra.open('w')
        # self.fextra.write('spec_id\tmz\tinten\trel_inten\tion\texp_mz\n')

        mgfOut = open(str(mgfFile), 'w')
        mgfOut.write('#Removed spectra = %s, filtering = %s\n' %
                     (remove, filters))
        spec = 0

        # read parameters from hdf5 file
        try:
            hdf.appendOpen()
            headers = hdf.readTable('/rawdata/msmsheader')
            runTimeEntry = hdf.getDataEqual('/rawdata/parameters', 'parameter',
                                            'MS Run Time (min)')
            if len(runTimeEntry) == 0:
                raise ExHa.MGFprocessingError(
                    'MGF Error: Could not find "MS Run Time (min)" parameter in HDF5 file.'
                )
            runtime = runTimeEntry[0]['value']
            units = self.readUnitsOK()

            # add new table for the deconvoluted spectrum data
            hdf.removeTable('/rawdata/deconvions')
            hdf.createTable('rawdata', 'deconvions', 'DeconvIons')
            ident = []
            for frag in units[1]:
                # find all the frag methods to be used in identification
                if 'I' in frag['use']:
                    ident.append(frag['order'])

            logger.log.info('Reading %d spectra from %s' %
                            (len(headers), hdf.filePath.name))
            if 'deconv' in filters:
                deconv = 1
            else:
                deconv = 0

            pBar = progBar.ProgressBar(widgets=progBar.name_widgets,
                                       maxval=len(headers),
                                       name='Create .mgf').start()
            for idx, h in enumerate(headers):
                if hcdonly:
                    if h['fragmeth'] != 'HCD':
                        continue
                elif h['order'] not in ident:
                    continue
                pBar.update(idx)

                # get spectrum data
                spec = h['spec_id']
                spectrum = hdf.getDataEqual('/rawdata/ions', 'spec_id', spec)
                if deconv:
                    # need extra column for charge information
                    spectrum = self.addChargeColumn(spectrum)

                data = hdf.getDataGeneral(
                    '/rawdata/specparams',
                    '(spec_id == %i) & (parameter == "%s")' %
                    (spec, 'setmass1'))
                setmass = data[0]['value']
                data = hdf.getDataGeneral(
                    '/rawdata/specparams',
                    '(spec_id == %i) & (parameter == "%s")' % (spec, 'frag1'))
                frag = data[0]['value']
                try:
                    self.maxint = max(spectrum['inten'])
                except:
                    self.maxint = 0

                # construct title values list
                rt = '%.3f' % h['rt']
                use = units[1][h['order'] - 1]['use']
                pretitle = ''
                if use == 'IQ':
                    # spec is both ID and Quan so use the normal MS/MS ID
                    titles = ['msmsid:F%06d' % h['spec_id']]
                elif use == 'I':
                    if h['quan_spec'] == 0:
                        # no quant data so use spec_id
                        titles = ['msmsid:F%06d' % h['spec_id']]
                    else:
                        # spec is only for ident; find the quan spec
                        titles = ['msmsid:F%06d' % h['quan_spec']]
                        pretitle = '#CID=F%06d\n' % h['id_spec']
                elif use == 'Q':
                    titles = ['msmsid:F%06d' % h['quan_spec']]
                    pretitle = '#CID=F%06d\n' % h['id_spec']

                titles.append('rt:' + rt)
                titles.append('survey:S%06d' % h['survey_spec'])
                titles.append('parent:' + setmass)
                titles.append('AnalTime:' + runtime)
                titles.append('Activation:' + frag.upper())

                titleline = 'TITLE=%s\n' % ','.join(titles)

                if h['precmz'] > 0:
                    pepmass = h['precmz']
                elif h['precmz_surv'] > 0:
                    pepmass = h['precmz_surv']
                else:
                    pepmass = h['monomz']

                if pepmass == 0:
                    continue

                for filt in filters:
                    if len(spectrum) > 5 and self.filters[filt]:
                        spectrum = self.filters[filt](h, spectrum)

                # filter for mascot interference
                ionList = []
                if len(spectrum) > 2:
                    mgfOut.write(pretitle)
                    mgfOut.write('BEGIN IONS\n')
                    mgfOut.write(titleline)
                    mgfOut.write('PEPMASS=%f\n' % pepmass)
                    mgfOut.write('CHARGE=%d+\n' % h['charge'])
                    if deconv:
                        for pt in spectrum:
                            if pt['inten'] == 0:
                                continue
                            mgfOut.write('%f  %f  %s\n' %
                                         (pt['mz'], pt['inten'], pt['charge']))
                            ionList.append(
                                dict(spec_id=pt['spec_id'],
                                     mz=pt['mz'],
                                     inten=pt['inten'],
                                     charge=pt['charge']))
                    else:
                        for pt in spectrum:
                            if pt['inten'] == 0:
                                continue
                            mgfOut.write('%f  %f\n' % (pt['mz'], pt['inten']))
                            ionList.append(
                                dict(spec_id=pt['spec_id'],
                                     mz=pt['mz'],
                                     inten=pt['inten']))
                    mgfOut.write('END IONS\n\n')
                if len(ionList) > 0:
                    hdf.appendRows('/rawdata/deconvions', ionList)

            pBar.finish()

        except ExHa.MGFprocessingError, czEx:
            if spec:
                ExHa.addContext(czEx,
                                'Raised whilst processing spectrum %i' % spec)
            raise
        except Exception as genEx:
            ExHa.reformatException(genEx)
            if spec:
                ExHa.addContext(genEx,
                                'Raised whilst processing spectrum %i' % spec)
            raise
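For reference, the shape of one MGF block that export() emits, reconstructed from the write calls above with made-up values (TITLE fields are comma-joined in the order msmsid, rt, survey, parent, AnalTime, Activation; ion lines are "m/z  intensity" pairs):

mgfOut = open('example.mgf', 'w')
mgfOut.write('BEGIN IONS\n')
mgfOut.write('TITLE=msmsid:F000123,rt:1274.500,survey:S000120,'
             'parent:445.12003,AnalTime:90,Activation:HCD\n')
mgfOut.write('PEPMASS=445.120030\n')
mgfOut.write('CHARGE=2+\n')
mgfOut.write('129.102310  1540.250000\n')  # one peak per line
mgfOut.write('END IONS\n\n')
mgfOut.close()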
Example 14
File: mgf.py Project: hdinkel/isob
        f = 0  # stays 0 if the glob matches nothing; tested after the loop
        for f in dataDir.glob(fileFilter):
            if not f.is_file():
                # skip any directories
                continue

            # if f.name[:4] in ['6528', '1814', '2032']: continue
            mgf = mgftools(f)
            logger.log.info('Filename:     %s' % f.name)
            if hcdOnly:
                logger.log.info('Export HCD data only')
            else:
                logger.log.info('Using filters: %s' % str(mgf.usefilts))
            rtn = mgf.export(hcdOnly)
            mgf.close()
        if f == 0:
            raise ExHa.MGFprocessingError('no files found for: %s' %
                                          str(dataDir / fileFilter))

    except ExHa.UsageError as useEx:
        ExHa.reformatException(useEx)
        logger.log.info(useEx.context)
    except Exception, genEx:
        ExHa.reformatException(genEx)
        if f:
            ExHa.exportError2File(genEx, f.parent.joinpath(f.stem + '.error'))
        else:
            ExHa.exportError2File(genEx, dataDir.joinpath('errors.error'))
        logger.log.info(ExHa.multiLineRepr(genEx))

    logger.log.info('finished')
Example 15
    def updateHDF5(self):
        """
        @brief controls the updating of the data to the hdf5 results file

        @return finalMessage <string>: constructed from the protein data; this is the RESULT stored in the DB
        """
        pep2unique = self.pep2unique
        baseContext = 'updateHDF5: '
        context = 'updateHDF5'
        try:
            # find the peptide sequences that are being imported
            usedPeps = self.setsManager.findUsedPeptides()
            logger.log.info('there are %s usedPeps' % len(usedPeps))

            context = baseContext + 'Retrieving sample IDs'

            sample_ids = range(1, len(self.hdfFiles) + 1)
            # create proteinset and proteinhit data
            starting_protein_group_no = 1
            self.setsManager.setProteinGroupNo(starting_protein_group_no)

            logger.log.info('adding protein group data to HDF5')

            logger.log.debug(str(self.hdfFiles.keys()))
            spectrum_id = 0
            peptide_id = 0
            hdfFileList = self.hdfFiles.keys()
            hdfFileList.sort()

            for key in hdfFileList:
                baseContext += '%s: ' % key
                logger.log.log(
                    logger.PROCESS,
                    'Integrating Spectrum, Peptide & Quantification data from %s'
                    % key)
                # collect fileData
                hdf = self.hdfFiles[key]
                hdfObj = hdf.hdfObject

                # set the current sample_id from the list of IDs extracted from the DB
                current_sample_id = sample_ids.pop()

                (hdf.acquired_spectra, hdf.mascot_matched_spectra,
                 numIsotopes, runTime) = hdfObj.getNumbers()

                # read the Mascot data
                context = baseContext + 'Reading Mascot data'
                peptides, queryDict, headerArray, quanArray = \
                    hdfObj.readImporterData(usedPeps, hdf)

                hdf.spectra_in_qc_proteins = len(peptides)

                logger.log.debug('getting spectrum_ids')
                context = baseContext + 'Retrieving spectrum IDs'

                acqTime, hdf.idAct, hdf.quanAct = hdfObj.getTimeAndActivation()
                # create blank lists to hold data for writing to hdf5 file
                spectrum_list = []
                peptide_list = []
                quant_list = []
                logger.log.info('collating spectrum, peptide & quant data')
                pBar = progBar.ProgressBar(widgets=progBar.name_widgets,
                                           maxval=len(queryDict),
                                           name='collate data').start()
                for idx, q in enumerate(queryDict):
                    # loop round all the required spectra
                    pBar.nextPrimary()
                    context = baseContext + 'query %i: Setting spectrum data' % q
                    # extract a spectrum_id from the list
                    spectrum_id += 1
                    query = queryDict[q]
                    spec = int(query['spec_id'])
                    context = baseContext + 'spectrum %i: Updating DB with spectrum data' % spec
                    # add spectrum data to spectrum_list
                    header = self.filterArrayEqual(headerArray, 'spec_id',
                                                   spec)
                    spectrum_list.append(
                        self.makeSpectrumDict(spectrum_id, current_sample_id,
                                              query, acqTime, header))

                    # find the appropriate peptides
                    pepList = peptides[q]
                    logger.log.debug('there are %s in peplist %s' %
                                     (len(pepList), str(pepList)))
                    quantFound = 0

                    # this list holds all peptides returned from makePeptideDictList; it is then
                    # filtered down to rank-1 equivalents, based on the rank 1 peptide's score
                    tmplist = []
                    for pep in pepList:
                        # find the sets that the peptide belongs to and add to the peptide_list
                        sets = self.setsManager.peptide2set[pep['peptide']]
                        context = baseContext + 'spectrum %i: Creating peptide data entries for hdf5' % spec
                        tmp, qf = self.makePeptideDictList(
                            spectrum_id, pep, query, sets, hdf, pep2unique)
                        tmplist.extend(tmp)
                        peptide_list += tmp
                        quantFound += qf

                    # only keep rank1 equivalent peptides (based on score)
                    tmplist.sort(key=lambda x: x['rank'])
                    toprankscore = tmplist[0]['score']
                    tmplist = [
                        x for x in tmplist if x['score'] == toprankscore
                    ]

                    if quantMethID and quantFound:
                        # extract quantification data for the spectrum
                        context = baseContext + 'spectrum %i: Creating quantitation data entries for DB' % spec
                        newquant, deltas = self.makeQuantDictLists(
                            spectrum_id, spec, tmplist, header, quanArray, hdf)

                        quant_list += newquant

                        if quantSource == 'ms2':
                            context = baseContext + 'spectrum %i: Adding reporter ion delta data' % spec
                            hdf.addReporterDeltas(deltas)
                pBar.finish()

                # calculate statistics
                context = baseContext + 'Calculating statistics'
                hdf.calcReporterStats()
                context = baseContext + 'Calculating delta m/z for fragment ions'

                context = baseContext + 'Updating sample table (%i)' % current_sample_id
                sample_data = hdf.getSampleDataDict(current_sample_id, key,
                                                    runTime)

                hdf5results.writeSample(sample_data)

                self.importData.combineStatistics(hdf)

                # write data to HDF5
                context = baseContext + 'Updating spectrum table'
                logger.log.info('updating HDF5 with spectrum data')
                hdf5results.writeSpectrum(spectrum_list)

                if quantMethID:
                    context = baseContext + 'Updating specquant table'
                    logger.log.info('updating HDF5 with quant data')
                    hdf5results.writeSpecQuant(quant_list)

                context = baseContext + 'Retrieving peptide IDs'
                logger.log.info('updating HDF5 with peptide data')
                for pepdata in peptide_list:
                    pepdata['peptide_id'] = peptide_id
                    peptide_id += 1

                context = baseContext + 'Updating peptide table'
                hdf5results.writePeptide(peptide_list)
            hdf5results.createIndexes()

            logger.log.info('finalising HDF5 entries')
            hdf5results.writeFDRdata(self.importData.score2fdr, 'peptide')
            hdf5results.writeFDRdata(self.importData.proteinscore2fdr,
                                     'protein')

            topScoringProteinInfo = self.setsManager.addPeptideSetDBdata(
                hdf5results, self.importData.proteinscore2fdr)
            runtimedata = self.importData.getSummaryStatisticsDict()

            hdf5results.writeStatistics(runtimedata)

            finalMessage = 'queries matched: %i / %s (%.1f%%) ' % (
                runtimedata['spectra_in_qc_proteins'],
                runtimedata['mascot_matched_spectra'],
                (runtimedata['spectra_in_qc_proteins'] /
                 float(runtimedata['mascot_matched_spectra'])) * 100)
            finalMessage += 'spectra quantified: %i top hit %s (%s) ' % (
                runtimedata['quantified_spectra'], '', '')
            finalMessage += 'with total score %f and %i matched peptides (hook AND non hook)' % \
                            (topScoringProteinInfo[0], topScoringProteinInfo[2])

            baseContext = 'updateHDF5: '
            context = baseContext + 'Finalising HDF5 entries'
        except Exception, genEx:
            # make sure that there aren't any permanent changes
            ExHa.addContext(genEx, context)
            finalMessage = 'Error: %s' % ExHa.oneLineRepr(genEx)
            raise
Example 16
            msg = 'merged sample'
        else:
            isMerged = False
            msg = 'single sample'

        importer = DATimporter(cfg, logger)
        logger.log.info(msg)
        searchDict = {}
        for idx, hdf5file in enumerate(searches):
            searchData = {}
            searchData['hdf5name'] = hdf5file
            searchData['archivepath'] = cfg.parameters['runtime']['datadir']
            fullPath = searchData['archivepath']
            fi = fullPath.joinpath(searchData['hdf5name'])
            if not fi.exists():
                raise ExHa.FileNotFoundException('Missing file: %s' % str(fi))
            logger.log.info('Reading: %s' % str(fi))
            importer.addHDFfile(fi, idx + 1, logger)
        sw.rec('file loading')
        fdrthreshold = cfg.parameters['general']['fdrthreshold']
        peptidescoreatthreshold, score2fdr = importer.calculateFDR(
            importer.setsManager.FDRdata, fdrthreshold)
        importer.importData.score2fdr = score2fdr

        importer.finaliseProteins(peptidescoreatthreshold)
        proteinscoreatthreshold, score2fdr = importer.calculateFDR(
            importer.setsManager.proteinFDRdata, fdrthreshold)

        importer.importData.proteinscore2fdr = score2fdr

        finalMessage = importer.updateHDF5()
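calculateFDR itself is not shown in these excerpts; below is a hypothetical sketch of the target/decoy FDR walk it might perform, assuming FDRdata pairs a score with a decoy flag (the real data layout may differ):

def calculateFDR(fdrdata, threshold):
    fdrdata.sort(key=lambda x: -x[0])  # best scores first
    targets = decoys = 0
    score2fdr, scoreatthreshold = {}, None
    for score, is_decoy in fdrdata:
        if is_decoy:
            decoys += 1
        else:
            targets += 1
        fdr = decoys / float(max(targets, 1))
        score2fdr[score] = fdr
        if fdr <= threshold:
            scoreatthreshold = score  # lowest score still under the threshold
    return scoreatthreshold, score2fdr

fdrdata = [(55.2, False), (48.1, False), (40.3, True), (39.0, False)]
print(calculateFDR(fdrdata, 0.05)[0])  # 48.1 on this toy data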
Example 17
        allfractionbgratios = {}
        for isotopelabel_id, data in allsumionratiodata.iteritems():
            allfractionbgratios[isotopelabel_id] = np.median(data)
        # second normalization so that the bg-ratios all add to 1
        # (compute the total once, before the loop, so values already rescaled
        #  in this pass do not skew the denominator)
        total = sum(allfractionbgratios.values())
        for isotopelabel_id, data in allfractionbgratios.iteritems():
            allfractionbgratios[isotopelabel_id] = data / total
        logger.log.debug(('allfractionbgratios are %s' % str(allfractionbgratios)))
        for corrects2iquantob in corrects2iquantoblist:
            # perform correction for each of the analyzed .hdf5 files.
            s2icorrecteddata = corrects2iquantob.performS2Icorrection(allfractionbgratios)
            corrects2iquantob.hdf5corrects2iquant.updates2ivalues(s2icorrecteddata)
            corrects2iquantob.hdf5corrects2iquant.close()

    except ExHa.czException as czEx:
        ExHa.reformatException(czEx)
        ExHa.addContext(czEx, 'Error during corrects2iquant run')
        ExHa.exportError2File(czEx, cfg.parameters['runtime']['datadir'] / Path('errors.error'))
        if logger:
            logger.log.warning(ExHa.oneLineRepr(czEx))
        else:
            print ExHa.multiLineRepr(czEx)

    except Exception as genEx:
        ExHa.reformatException(genEx)
        ExHa.addContext(genEx, 'Error during corrects2iquant run')
        ExHa.exportError2File(genEx, cfg.parameters['runtime']['datadir'] / 'errors.error')
        if logger:
            logger.log.warning(ExHa.oneLineRepr(genEx))
        else:
            print ExHa.multiLineRepr(genEx)
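A worked toy example of the two-step background-ratio normalization above: take the per-label median, then rescale so the ratios sum to 1 (values chosen so the medians come out to 0.2, 0.4, 0.4):

import numpy as np

allsumionratiodata = {1: [0.20, 0.22, 0.18], 2: [0.35, 0.45], 3: [0.40]}
allfractionbgratios = {}
for isotopelabel_id, data in allsumionratiodata.items():
    allfractionbgratios[isotopelabel_id] = np.median(data)
total = sum(allfractionbgratios.values())
for isotopelabel_id, data in allfractionbgratios.items():
    allfractionbgratios[isotopelabel_id] = data / total
print(allfractionbgratios)  # ~{1: 0.2, 2: 0.4, 3: 0.4}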
Example 18
import sys
from pathlib import Path

# cellzome CommonUtils
sys.path.insert(0, '..')
from CommonUtils.tools import *
from CommonUtils.ConfigManager import pyMSsafeConfigManager
from CommonUtils.LoggingManager import Logger
from CommonUtils.hdf5Base import hdf5Base
from CommonUtils.QuantMethodHandler import QuantMethods
import CommonUtils.ExceptionHandler as ExHa

# pyMSsafe modules
try:
    from xRawFile import XRawFile
except ImportError, ieEx:
    ExHa.reformatException(ieEx)
    ExHa.addContext(ieEx, 'Xcalibur not set up properly')
    configPath = './pymssafe.cfg'
    cfg = pyMSsafeConfigManager(configPath)
    ret = cfg.evaluateCommandLineArgs(sys.argv)
    dataDir = cfg.parameters['runtime']['datadir']
    ExHa.exportError2File(ieEx, dataDir.joinpath('errors.error'))

from datastore import Datamanager


class pymssafe:
    def __init__(self, config):
        """
        @brief initialise the pyMSsafe controller
        @param config <ConfigManager>: containing all the running parameters