Beispiel #1
0
            def calculate_precursors(calibrant):
                """
                Collects candidate precursors from the most recent MS1 scan.

                The scan is looked up by lastMS1ScanName from the enclosing
                scope.  For 'raw' data it is read with data.lscan and passed
                through raw_scan_recalibration together with calibrant; for
                other formats a centroided scan is requested, centroiding
                locally if the reader raises NotImplementedError.

                Returns a (precursors, calibrant) pair.  precursors is a list
                of (x[0][0], charge) tuples, one per envelope reported by
                peak_pick (presumably the m/z of each envelope's first peak --
                confirm against peak_pick).  calibrant is the value returned
                by raw_scan_recalibration, or the argument unchanged for
                non-raw data.
                """
                if data.format == 'raw':
                    ms1_scan, calibrant = raw_scan_recalibration(
                        data.lscan(lastMS1ScanName), calibrant)
                else:
                    try:
                        ms1_scan = data.scan(lastMS1ScanName, centroid=True)
                    except NotImplementedError:
                        # Reader cannot centroid by itself; do it here.
                        ms1_scan = centroid_func(data.scan(lastMS1ScanName))

                # Only the first element of peak_pick's result is used here.
                envelopes = peak_pick(ms1_scan,
                                      tolerance=0.01,
                                      min_peaks=2,
                                      enforce_isotopic_ratios=True)[0]

                precursors = []
                for charge, charge_envelopes in envelopes.items():
                    for envelope in charge_envelopes:
                        precursors.append((envelope[0][0], charge))
                return precursors, calibrant
Beispiel #2
0
 def calculate_precursors(self):
     """
     Fills self.possible_precursors from the most recent MS1 scan.

     The scan named by self.lastMS1ScanName is loaded (for 'raw' data via
     get_long_MS1 or data.lscan, followed by raw_scan_recalibration; for
     other formats as a centroided scan, centroiding locally when the
     reader raises NotImplementedError).  peak_pick is then run with
     self.deisoreduce_MS1_args, and every envelope it reports contributes
     one (x[0][0], charge) tuple to self.possible_precursors.
     """
     scan_name = self.lastMS1ScanName

     if self.data.format != 'raw':
         try:
             ms1 = self.data.scan(scan_name, centroid = True)
         except NotImplementedError:
             # Reader cannot centroid by itself; do it here.
             ms1 = centroid_func(self.data.scan(scan_name))
     else:
         if self.long_ms1:
             ms1 = self.get_long_MS1(scan_name)
             # Is calibration valid in this case?
         else:
             ms1 = self.data.lscan(scan_name)
         ms1 = self.raw_scan_recalibration(ms1)

     # Only the first element of peak_pick's result is used here.
     envelopes = peak_pick(ms1, **self.deisoreduce_MS1_args)[0]

     precursors = []
     for charge, group in envelopes.items():
         precursors.extend((envelope[0][0], charge) for envelope in group)
     self.possible_precursors = precursors
Beispiel #3
0
    def get_long_MS1_byProfile(self, scanNum):
        """
        Merges the MS1 scans neighbouring scanNum into one centroided scan.

        Up to three entries of self.ms1_list are used: the one at the
        bisect_left position of scanNum and the entries directly before and
        after it.  Each is read with centroid = False and kept in
        self.MS1_scan_batch so later calls can reuse it.  Intensities are
        summed onto the union of the scans' m/z values (rounded to four
        decimals) and the result is passed through centroid_func.
        """
        # This seems to take a really long time.  Profile-mode scans are heavy!
        center = bisect.bisect_left(self.ms1_list, scanNum)
        ms1_count = len(self.ms1_list)
        scannumbers = [self.ms1_list[pos]
                       for pos in (center - 1, center, center + 1)
                       if 0 <= pos < ms1_count]

        cache = self.MS1_scan_batch
        ms1s = []
        for number in scannumbers:
            if number not in cache:
                cache[number] = self.data.scan(number, centroid = False)
            ms1s.append(cache[number])

        # Evict everything at or before the first scan of this window
        # (presumably callers walk the MS1 list in increasing order, so
        # those scans are not needed again -- confirm against callers).
        stale_keys = [key for key in cache if key <= scannumbers[0]]
        for key in stale_keys:
            del cache[key]

        # MS1 profile scans aren't consistent across the entire MZ range,
        # apparently dependent upon whether there is signal in a particular
        # location; we don't want to cover the entire range evenly, since
        # this would expand the data and slow things down, but we also don't
        # want to miss/mis-assign signal that occurs in only one scan of the
        # batch.
        long_mzs = set(round(point[0], 4)
                       for scan in ms1s
                       for point in scan)

        # One read cursor per scan; each grid m/z collects the intensity of
        # every not-yet-consumed point lying strictly below it.
        # NOTE(review): points at or above the largest grid m/z are never
        # summed because of the strict '<' comparison -- confirm intended.
        merged = []
        cursors = [0] * len(ms1s)
        for grid_mz in sorted(long_mzs):
            total = 0
            for j, scan in enumerate(ms1s):
                pos = cursors[j]
                while pos < len(scan) and scan[pos][0] < grid_mz:
                    total += scan[pos][1]
                    pos += 1
                cursors[j] = pos
            merged.append((grid_mz, total))

        return centroid_func(merged)
Beispiel #4
0
def extract(datafile,
            outputfile=None,
            default_charge=2,
            centroid=True,
            scan_type=None,
            deisotope_and_reduce_charge=True,
            deisotope_and_reduce_args={},
            min_mz=140,
            precursor_tolerance=0.005,
            isobaric_labels=None,
            label_tolerance=0.01):
    """
    Converts a mzAPI-compatible data file to MGF.

    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.

    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.

    Arguments:
    datafile -- path of the input file, opened with mzFile.
    outputfile -- path of the MGF to write; defaults to datafile + '.mgf'.
        An already existing target must end in 'mgf'.
    default_charge -- charge written when no precursor charge is found.
    centroid -- whether scans are centroided before being written.
    scan_type -- optional 'CID', 'HCD', 'ETD' or 'ETDSA' (case-insensitive);
        only allowed for .raw files, where it restricts the output to scans
        whose filter string contains '@<scan_type>'.
    deisotope_and_reduce_args -- keyword arguments for deisotope_reduce_scan.
        A missing or false 'tolerance' is filled in from precursor_tolerance.
        The dictionary passed in is not modified.
    min_mz -- only peaks with m/z strictly above this value are written.
    precursor_tolerance -- default 'tolerance' for deisotope_reduce_scan.
    isobaric_labels -- None, or 4/6/8/10 (or '4plex' ... '10plex') to add
        reporter-ion intensities to each spectrum title.
    label_tolerance -- maximum m/z distance between a reporter mass and the
        peak taken for it; must be below 0.005 for 10-plex.

    Returns the path of the MGF file that was written.

    Raises AssertionError for inconsistent arguments and NotImplementedError
    for unrecognized label sets or unsupported data formats.
    """
    # Currently doesn't compensate for injection time! Would be required in
    # order to deal with iTRAQ/TMT labels.

    import warnings

    from multiplierz.spectral_process import deisotope_reduce_scan, peak_pick
    from multiplierz.spectral_process import centroid as centroid_func  # Distinct from 'centroid' argument.

    # Work on a private copy: the default {} object is shared between calls,
    # so storing 'tolerance' in it would leak one call's precursor_tolerance
    # into every later call (and would also modify a caller's dictionary).
    deisotope_and_reduce_args = dict(deisotope_and_reduce_args or {})
    if not deisotope_and_reduce_args.get('tolerance'):
        deisotope_and_reduce_args['tolerance'] = precursor_tolerance

    def _get_precursor(mz, possible_prec, charge):
        # Closest (mz, charge) candidate, optionally restricted to one
        # charge; (None, None) when nothing qualifies.
        candidates = [x for x in possible_prec
                      if charge is None or x[1] == charge]
        if not candidates:
            return None, None
        return min(candidates, key=lambda x: abs(x[0] - mz))

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    data = mzFile(datafile)
    scanInfo = data.scan_info()

    # Coerce that scanInfo be in order of time, so that for .WIFF files
    # we can still use the previous-MS1 method to look up precursor charges.
    scanInfo.sort(key=lambda x: x[0])

    if datafile.lower().endswith('.raw'):  # May also exist for WIFF?
        filters = dict(data.filters())

        # For RAW files only, there's the option to filter by a given
        # scan type.  (It would be more efficient in many cases to
        # actually split files in a single run, though.)
        if scan_type:
            assert scan_type.lower() in ['cid', 'hcd', 'etd', 'etdsa'], (
                "Invalid scan type %s, must be one "
                "of (CID, HCD, ETD, ETDSA)." % scan_type)
            typestr = "@%s" % scan_type.lower()

            scanInfo = [
                x for x in scanInfo
                if x[3] == 'MS1' or typestr in filters[x[0]]
            ]
    else:
        filters = None
        assert not scan_type, "Scan type filtering only enabled with .RAW format files."

    if isobaric_labels:
        assert centroid, "Isobaric tags can only be read from centroided data; set 'centroid' to True."

    # 'labels' is indexed and iterated repeatedly below, so it has to be a
    # real list rather than a one-shot zip iterator.
    if not isobaric_labels:
        labels = []
    elif isobaric_labels == 4 or isobaric_labels == '4plex':
        labels = list(zip([114, 115, 116, 117],
                          [114.11, 115.11, 116.11, 117.12]))
    elif isobaric_labels == 6 or isobaric_labels == '6plex':
        labels = list(zip(
            [126, 127, 128, 129, 130, 131],
            [126.127, 127.131, 128.134, 129.138, 130.141, 131.138]))
    elif isobaric_labels == 8 or isobaric_labels == '8plex':
        labels = list(zip(
            [113, 114, 115, 116, 117, 118, 119, 121],
            [113.11, 114.11, 115.11, 116.11, 117.12, 118.12, 119.12, 121.12]))
    elif isobaric_labels == 10 or isobaric_labels == '10plex':
        labels = list(zip([
            '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N',
            '130C', '131'
        ], [
            126.127726, 127.124761, 127.131081, 128.128116, 128.134436,
            129.131471, 129.137790, 130.134825, 130.141145, 131.138180
        ]))

        assert label_tolerance < 0.005, (
            "label_tolerance must be lower "
            "than 0.005 for 10-plex experiments! (Currently %s)" %
            label_tolerance)
    else:
        raise NotImplementedError(
            "Labels of type %s not recognized.\n"
            "Should be one of [4,6,8,10] or None." % (isobaric_labels,))

    def read_labels(scan):
        # Maps each label name to the intensity (as a '%.3f' string) of the
        # peak nearest its reporter mass, or '0' when no peak lies within
        # label_tolerance.
        partscan = [x for x in scan if x[0] < labels[-1][1] + 3]
        if not partscan:
            return dict((str(label), '0') for label, _ in labels)

        # This should probably actually sum all points within
        # the tolerance range.
        scan_values = {}
        for label, mz in labels:
            nearpt = min(partscan, key=lambda x: abs(x[0] - mz))
            if abs(nearpt[0] - mz) < label_tolerance:
                scan_values[str(label)] = '%.3f' % nearpt[1]
            else:
                scan_values[str(label)] = '0'  # Report noise value?

        return scan_values

    def calculate_precursors(calibrant):
        # Candidate (mz, charge) precursors from the latest MS1 scan;
        # lastMS1ScanName is read from the enclosing scope at call time.
        if data.format == 'raw':
            lastMS1 = data.lscan(lastMS1ScanName)
            lastMS1, calibrant = raw_scan_recalibration(lastMS1, calibrant)
        else:
            try:
                lastMS1 = data.scan(lastMS1ScanName, centroid=True)
            except NotImplementedError:
                lastMS1 = centroid_func(data.scan(lastMS1ScanName))

        envelopes = peak_pick(lastMS1,
                              tolerance=0.01,
                              min_peaks=2,
                              enforce_isotopic_ratios=True)[0]
        return [(x[0][0], c)
                for c, xs in envelopes.items()
                for x in xs], calibrant

    inconsistent_precursors = 0
    scans_written = 0

    lastMS1ScanName = None
    possible_precursors = None
    calibrant = RAW_CAL_MASS

    # The writer is opened only after all argument checks have passed, and is
    # always closed again, even if a scan fails to convert.
    writer = MGF_Writer(outputfile)
    try:
        for time, mz, scanNum, scanLevel, scanMode in scanInfo:
            scanName = scanNum if isinstance(scanNum, int) else time

            if scanLevel == 'MS1':
                # Precursors are computed lazily, on the first MS2 scan that
                # needs them.
                lastMS1ScanName = scanName
                possible_precursors = None
                continue
            elif scanLevel == 'MS3':
                continue
            elif lastMS1ScanName is None:
                continue

            # Each file type handles centroiding differently (or not at all.)
            if data.format == 'raw':
                scan = data.scan(scanName, centroid=centroid)

                scan, calibrant = raw_scan_recalibration(scan, calibrant)
            elif data.format == 'wiff':
                # explicit_numbering, of course, can't be active here.
                scan = data.scan(scanName)
                if centroid:
                    scan = centroid_func(scan)
            elif data.format == 'd':
                scan = data.scan(scanName, centroid=centroid)
                if centroid and not scan:
                    # mzAPI.D returns empty if centroid data is not present in
                    # the file, but that can be corrected by external centroiding.
                    scan = centroid_func(data.scan(scanName, centroid=False))
            else:
                raise NotImplementedError(
                    "Extractor does not handle type %s" % data.format)

            if filters and not mz:
                mz = float(filters[time].split('@')[0].split(' ')[-1])

            mzP = None
            chargeP = None
            if "scanPrecursor" in dir(data):
                assert isinstance(scanName, int)
                mzP, chargeP = data.scanPrecursor(scanName)

            if not mzP:  # .scanPrecursor sometimes returns charge and not mzP.
                if possible_precursors is None:
                    possible_precursors, calibrant = calculate_precursors(
                        calibrant)

                mzP, chargeP = _get_precursor(mz, possible_precursors, chargeP)
                if not mzP:
                    # Release presumed charge possibly obtained from scanPrecursor.
                    mzP, chargeP = _get_precursor(mz, possible_precursors, None)
                    if mz and chargeP:
                        inconsistent_precursors += 1

            # 'not mz' is tested first so a missing mz never reaches the
            # subtraction.
            if mzP and (not mz or abs(mz - mzP) < 2):
                mz = mzP
                charge = chargeP
            else:
                charge = default_charge

            if not charge:
                charge = default_charge

            if not mz:
                errmgf = os.path.abspath(datafile)
                warnings.warn('Unable to recover all precursor masses from %s' %
                              errmgf)
            else:
                if labels:
                    scan_labels = read_labels(scan)
                else:
                    scan_labels = {}

                title = standard_title_write(datafile,
                                             rt=time,
                                             mz=mz,
                                             mode=scanMode,
                                             scan=scanNum,
                                             **scan_labels)

                # Should expand extract() call to include arguments to this.
                if deisotope_and_reduce_charge and centroid:
                    scan = deisotope_reduce_scan(scan,
                                                 **deisotope_and_reduce_args)
                scan = [x for x in scan if x[0] > min_mz]
                assert charge, title
                writer.write(scan, title, mass=mz, charge=charge)
                scans_written += 1
    finally:
        writer.close()

    if inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s" %
               (inconsistent_precursors, scans_written))

    return outputfile
Beispiel #5
0
 def run(self):
     """
     Generator producing the MGF entries for the scans in self.scanInfo.

     MS1 scans only update self.lastMS1ScanName and reset
     self.possible_precursors; MS3 scans, and anything seen before the
     first MS1, are skipped.  For every other scan the spectrum is read
     (centroided according to self.centroid), its precursor m/z and charge
     are taken from data.scanPrecursor and/or self.get_precursor depending
     on self.derive_precursor_via, and a (scan, title, mz, charge) tuple is
     yielded.  Scans without a recoverable precursor m/z only produce a
     warning; scans above self.maximum_mass are dropped.

     self.inconsistent_precursors and self.scans_written are reset when
     iteration starts and updated as scans are produced.

     Raises NotImplementedError for data formats other than 'raw', 'wiff'
     and 'd'.
     """
     import warnings

     self.inconsistent_precursors = 0
     self.scans_written = 0

     self.lastMS1ScanName = None
     self.possible_precursors = None
     for time, mz, scanNum, scanLevel, scanMode in self.scanInfo:
         scanName = scanNum if isinstance(scanNum, int) else time

         if scanLevel == 'MS1':
             self.lastMS1ScanName = scanName
             self.possible_precursors = None
             continue
         elif scanLevel == 'MS3':
             continue
         elif self.lastMS1ScanName is None:
             continue

         # Each file type handles centroiding differently (or not at all.)
         if self.data.format == 'raw':
             scan = self.data.scan(scanName, centroid = self.centroid)
             scan = self.raw_scan_recalibration(scan)
         elif self.data.format == 'wiff':
             # explicit_numbering, of course, can't be active here.
             scan = self.data.scan(scanName)
             if self.centroid:
                 scan = centroid_func(scan)
         elif self.data.format == 'd':
             scan = self.data.scan(scanName, centroid = self.centroid)
             if self.centroid and not scan:
                 # mzAPI.D returns empty if centroid data is not present in
                 # the file, but that can be corrected by external centroiding.
                 scan = centroid_func(self.data.scan(scanName, centroid = False))
         else:
             # Call form of raise: valid on both Python 2 and Python 3.
             raise NotImplementedError("Extractor does not handle type %s"
                                       % self.data.format)

         if self.filters and not mz:
             mz = float(self.filters[time].split('@')[0].split(' ')[-1])

         mzP = None
         chargeP = None
         if ("scanPrecursor" in dir(self.data) and
             self.derive_precursor_via in ['All', 'Thermo']):
             assert isinstance(scanName, int)
             mzP, chargeP = self.data.scanPrecursor(scanName)

         if (not mzP) or self.derive_precursor_via in ['Direct']: # 'and derive_precursor_via not in ['Thermo']', except why would you?
             # .scanPrecursor sometimes returns charge and not mzP.
             mzP, chargeP = self.get_precursor(mz, chargeP)
             if not mzP:
                 # Release presumed charge possibly obtained from scanPrecursor.
                 mzP, chargeP = self.get_precursor(mz, None)
                 if mz and chargeP:
                     self.inconsistent_precursors += 1

         # 'not mz' is tested first so a missing mz never reaches the
         # subtraction.
         if mzP and (not mz or abs(mz - mzP) < 2):
             mz = mzP
             charge = chargeP
         else:
             charge = self.default_charge

         if not charge:
             charge = self.default_charge

         if not mz:
             errmgf = os.path.abspath(self.filename)
             warnings.warn('Unable to recover all precursor masses from %s' % errmgf)
         else:
             if (self.maximum_mass and
                 remove_protons(mz, charge) > self.maximum_mass):
                 continue

             if self.labels:
                 scan_labels = self.read_labels(scan)
             else:
                 scan_labels = {}

             title = standard_title_write(self.filename, rt = time, mz = mz,
                                          mode = scanMode, scan = scanNum,
                                          **scan_labels)

             if self.deisoreduce and self.centroid:
                 scan = deisotope_reduce_scan(scan, **self.deisoreduce_MS2_args)
             scan = [x for x in scan if x[0] > self.min_mz]
             assert charge, title

             yield scan, title, mz, charge
             # Counted once the consumer asks for the next scan.
             self.scans_written += 1