Esempio n. 1
0
def psm_intersection(directory, mode_subdirs):
    """
    Restrict each condition's result files to the peptides seen in every
    condition.

    To give a more accurate depiction of the relative elution profile of each
    label state, the final results only consider peptides that appear in the
    results for all of the given states. This determines the overlapping
    peptide repertoire detected across every entry of mode_subdirs, and
    produces subset result files that only include these peptides.

    directory    -- base directory that contains the result subdirectories.
    mode_subdirs -- sequence of (mode, subdir, par) tuples. It is iterated
                    twice, so it must not be a one-shot iterator.

    Only the 'xlsx' files reported by typeInDir whose name contains 'FDR'
    are considered. For each entry a '<subdir>_intersection_sheets'
    directory is created under `directory` (an existing one is reused) and
    receives the filtered copies; within one output file each peptide key
    is written at most once (the first PSM encountered wins).

    Returns a list of (mode, new_subdir) tuples, one per entry of
    mode_subdirs; an empty list when mode_subdirs is empty.
    """

    def _fdr_files(subdir):
        # Both passes look at exactly the same set of files.
        return [f for f in typeInDir(os.path.join(directory, subdir), 'xlsx')
                if 'FDR' in f]

    # Pass 1: the set of peptide keys observed in each condition.
    peptideSets = []
    for mode, subdir, par in mode_subdirs:
        conditionPSMs = []
        for resultfile in _fdr_files(subdir):
            conditionPSMs += list(reader(resultfile))
        byPeptide = collectByCriterion(conditionPSMs, peptideKey)
        peptideSets.append(set(byPeptide.keys()))

    if not peptideSets:
        # Nothing to intersect; there are no subdirectories to produce.
        return []

    consistentPSMs = set.intersection(*peptideSets)

    # Pass 2: write the filtered copy of every result file.
    newSubdirs = []
    for mode, subdir, par in mode_subdirs:
        newSubdir = subdir + '_intersection_sheets'
        newSubdirs.append((mode, newSubdir))

        outdir = os.path.join(directory, newSubdir)
        try:
            os.mkdir(outdir)
        except OSError:
            # An already-existing output directory is fine; anything else
            # (missing parent, permissions, ...) is a real error.
            if not os.path.isdir(outdir):
                raise

        for filename in _fdr_files(subdir):
            psms = reader(filename)
            try:
                filterfile = writer(os.path.join(outdir,
                                                 os.path.basename(filename)),
                                    columns=psms.columns)
                try:
                    alreadySeenPeptides = set()
                    for psm in psms:
                        pepKey = peptideKey(psm)
                        if (pepKey in consistentPSMs
                                and pepKey not in alreadySeenPeptides):
                            alreadySeenPeptides.add(pepKey)
                            filterfile.write(psm)
                finally:
                    filterfile.close()
            finally:
                psms.close()

    return newSubdirs
    def openResultFile(self, event):
        """
        Load the PSM report named in self.resultCtrl and index it for display.

        Rows are grouped first by protein and then by peptide. The protein
        key is the 'gene_symbol' column when the first row has one, otherwise
        'Accession Number'; a row listing several ';'-separated proteins is
        filed under each of them. The result is stored in self.psms as
        {protein: collectByCriterion(rows, peptide tag)}, where the peptide
        tag is 'sequence|variable modifications|charge'. Finally the display
        is refreshed via self.updatePSMDisplay(None).

        event -- the triggering GUI event; not used.
        """
        self.set_status("Opening PSM file...", 0)

        reportfile = self.resultCtrl.GetValue()
        results = list(reader(reportfile))

        # An empty report has no first row to inspect; fall back to the
        # accession column instead of failing on results[0].
        if results and 'gene_symbol' in results[0]:
            proteinLabel = 'gene_symbol'
        else:
            proteinLabel = 'Accession Number'

        proteins = defaultdict(list)
        for psm in results:
            for accession in psm[proteinLabel].split(';'):
                proteins[accession.strip()].append(psm)

        def _peptideTag(row):
            # The modifications cell may be None for an unmodified peptide;
            # treat that like an empty string so the join does not fail.
            return '|'.join([row['Peptide Sequence'],
                             row['Variable Modifications'] or '',
                             str(row['Charge'])])

        self.psms = {}
        for protein, psms in proteins.items():
            self.psms[protein] = collectByCriterion(psms, _peptideTag)

        self.updatePSMDisplay(None)
    def openFile(self, event):
        """
        Load the ';'-separated report files named in self.fileChooser.

        Rows whose 'Peptide Score' is below self.scoreCutoff are ignored.
        The remaining rows are indexed in self.psmsByAccession (one entry per
        ';'-separated accession of the row), and the list of
        (accession string, protein description) pairs, one per kept row, is
        passed to self.updateProtList.

        event -- the triggering GUI event; not used.
        """
        print("openFile")
        filenames = [x.strip() for x in self.fileChooser.GetValue().split(';')]
        # A trailing or doubled ';' (or an empty field) yields blank names;
        # drop them rather than handing '' to reader().
        filenames = [x for x in filenames if x]
        self.filenames = filenames

        reportData = []
        for resultFile in filenames:
            reportData += list(reader(resultFile))

        self.psmsByAccession = defaultdict(list)
        protList = []
        for psm in reportData:
            if float(psm['Peptide Score']) < self.scoreCutoff:
                continue

            protList.append(
                (psm['Accession Number'], psm['Protein Description']))
            for accession in psm['Accession Number'].split(';'):
                self.psmsByAccession[accession].append(psm)

        self.updateProtList(protList)
Esempio n. 4
0
    def openFile(self, event):
        """
        Read the chosen report files and rebuild the protein list.

        File names come from self.fileChooser (';'-separated) and the sheet
        name from self.sheetSelect, defaulting to 'Data'. A file for which
        reader() raises IOError is reported on stdout and skipped. Rows
        scoring below self.scoreCutoff are dropped, as are rows matching
        neither accession nor description when self.searchTerm is set
        (case-insensitive substring match). Kept rows are indexed by
        accession in self.psmsByAccession, and the distinct
        (accession string, description) pairs go to self.updateProtList.
        """
        self.filenames = [name.strip()
                          for name in self.fileChooser.GetValue().split(';')]
        self.sheetname = self.sheetSelect.GetValue() or 'Data'

        rows = []
        for path in self.filenames:
            try:
                report = reader(path, sheet_name = self.sheetname, autotypecast = False)
            except IOError:
                print("%s has no sheet %s" % (path, self.sheetname))
                continue
            rows.extend(report)

        self.psmsByAccession = defaultdict(list)
        proteinsSeen = set()
        for psm in rows:
            if float(psm['Peptide Score']) < self.scoreCutoff:
                continue

            term = self.searchTerm
            if term:
                term = term.lower()
                if (term not in psm['Accession Number'].lower() and
                    term not in psm['Protein Description'].lower()):
                    continue

            proteinsSeen.add((psm['Accession Number'], psm['Protein Description']))
            for accession in psm['Accession Number'].split(';'):
                self.psmsByAccession[accession].append(psm)

        self.updateProtList(list(proteinsSeen))
Esempio n. 5
0
    def loadFromFiles(self, event):
        """
        (Re)load everything referenced by the rows of self.fileDisplay.

        Columns 1, 2 and 3 of each row are looked up in self.files to get the
        data file, result file and feature file respectively. In order, this
        fills:
          self.dataPtrs -- basename -> mzFileMapped (given the matching entry
                           of self.curves when self.curves is non-empty)
          self.psms     -- basename -> (result file, list of PSM rows)
          self.features -- basename -> FeatureInterface
          self.ms1s     -- basename -> scan_info entries whose item 3 is 'MS1'
          self.proteins -- accession -> collectByCriterion(rows, peptide tag)
        Every PSM row also gets a 'Datafile' key holding its basename.
        self.render(None) is called at the end when `event` is truthy.
        """
        def rowPaths(*cols):
            # One tuple per display row: the self.files entry for each column.
            return [tuple(self.files[self.fileDisplay.GetItemText(row, col)]
                          for col in cols)
                    for row in range(self.fileDisplay.GetItemCount())]

        def peptideTag(row):
            return renderPeptideTag((row['Peptide Sequence'],
                                     row['Variable Modifications'],
                                     row['Charge']))

        self.set_status("Opening data files...", 0)
        dataRows = rowPaths(1)
        self.dataPtrs = {}
        for (datafile,) in dataRows:
            name = os.path.basename(datafile)
            if self.curves:
                pointer = mzFileMapped(datafile, self.curves[name])
            else:
                pointer = mzFileMapped(datafile)
            self.dataPtrs[name] = pointer

        self.set_status("Loading PSMs...", 0)
        resultRows = rowPaths(1, 2)
        self.psms = {}
        for datafile, resultfile in resultRows:
            rows = list(reader(resultfile))
            self.psms[os.path.basename(datafile)] = resultfile, rows

        featureRows = rowPaths(1, 3)
        self.features = {}
        for datafile, featurefile in featureRows:
            self.features[os.path.basename(datafile)] = FeatureInterface(featurefile)

        self.set_status("Loading MS1 info...", 0)
        ms1s = {}
        for name, data in self.dataPtrs.items():
            # presumably 0..9999999 is meant to span the whole run -- confirm
            ms1s[name] = [s for s in data.scan_info(0, 9999999) if s[3] == 'MS1']
        self.ms1s = ms1s

        self.set_status("Collecting peptides...", 0)
        self.proteins = defaultdict(list)
        for datafile, (resultfile, rows) in self.psms.items():
            for psm in rows:
                psm['Datafile'] = datafile
            byProtein = collectByCriterion(rows, lambda x: x['Accession Number'])
            for acc, group in byProtein.items():
                self.proteins[acc].extend(group)

        # Replace each protein's flat row list by its per-peptide grouping.
        for acc, rows in list(self.proteins.items()):
            self.proteins[acc] = collectByCriterion(rows, peptideTag)

        self.set_status("...", 0)
        if event:
            self.render(None)
Esempio n. 6
0
def annotateFileWithCoverageImages(resultfile, fastafile):
    """
    Write a copy of a PSM report with a sequence-coverage image on each row.

    A CoveragePanel is set up on `fastafile` and `resultfile`; for every PSM
    the panel is pointed at the first accession listed in the row's
    'Accession Number' column ('; '-separated), its sequence grid is rendered
    to a bitmap, and the row is written to
    '<resultfile>.coverage_annotated.xls' with a 'Coverage' column carrying
    that image as cell metadata.

    Intermediate image files live in a temporary directory that is removed
    when the function finishes, whether it succeeds or raises.
    """
    from multiplierz.mzReport.mzSpreadsheetClassic import XLSheetWriter as classic_writer
    import tempfile
    import os
    import shutil

    # The wx.App is created before any other wx object; the reference is
    # held for the duration of the call.
    app = wx.App(0)

    coverpanel = CoveragePanel(wx.Frame(None))
    coverpanel.fastaChooser.SetSelection(0)
    coverpanel.fastaChooser.SetString(0, fastafile)
    coverpanel.resizeSequence()
    coverpanel.fileChooser.AppendText(resultfile)
    coverpanel.openFile(None)

    psms = reader(resultfile)
    output = classic_writer(resultfile + '.coverage_annotated.xls',
                            columns = psms.columns + ['Coverage'])

    tempdir = tempfile.mkdtemp()
    try:
        for index, psm in enumerate(psms):
            # Only the first listed accession is charted for each PSM.
            accession = psm['Accession Number'].split('; ')[0]
            coverpanel.accessions = [accession]
            coverpanel.subAccession = 0
            coverpanel.displayAccession()
            coverpanel.chartProtein()

            grid = coverpanel.sequenceDisplay
            seqHeight = grid.CellToRect(0, 0).height * grid.GetNumberRows()
            seqWidth = grid.CellToRect(0, 0).width * grid.GetNumberCols()

            bitmap = wx.EmptyBitmap(seqWidth, seqHeight)
            imageDC = wx.MemoryDC()
            imageDC.SelectObject(bitmap)
            grid.RenderToDC(imageDC, (0, 0))

            # NOTE(review): the file is named .png but saved with
            # wx.BITMAP_TYPE_BMP -- confirm the spreadsheet writer does not
            # rely on the extension.
            img = os.path.join(tempdir, str(index) + '.png')
            bitmap.SaveFile(img, wx.BITMAP_TYPE_BMP)

            psm['Coverage'] = '#'
            output.write(psm, metadata = [('Coverage', ('image', seqHeight, seqWidth), img)])

        # The image files must still exist when the workbook is closed.
        output.close()
    finally:
        shutil.rmtree(tempdir, ignore_errors = True)
Esempio n. 7
0
def combine_peptides(reportfile, isobaric=None, outputfile=None):
    """
    Collapse a PSM report to one row per peptide.

    PSMs are grouped by peptide sequence and set of variable modifications
    (charge is ignored). For each group the row with the highest
    'Peptide Score' is written, with a 'PSMs' column giving the group size.

    reportfile -- path of the PSM report to read.
    isobaric   -- None, or the plex of the isobaric label set (4, 6, 8 or
                  10). When given, reporter intensities are parsed from each
                  row's 'Spectrum Description' and two sets of columns are
                  filled: 'Max<label>' (the reporters of the PSM with the
                  largest reporter total) and 'Sum<label>' (each reporter
                  summed over the group).
    outputfile -- path to write; defaults to reportfile with a
                  'peptide_combined' tag inserted.

    Returns the path of the file written.
    """
    from multiplierz.mzReport import reader, writer
    from multiplierz.mgf import standard_title_parse

    isobaric_labels = {
        None: [],
        4: ['114', '115', '116', '117'],
        6: ['126', '127', '128', '129', '130', '131'],
        8: ['113', '114', '115', '116', '117', '118', '119', '121'],
        10: [
            '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N',
            '130C', '131'
        ]
    }

    assert isobaric in isobaric_labels, \
        "isobaric must be one of %r" % (list(isobaric_labels.keys()),)
    labels = isobaric_labels[isobaric]

    def _byPeptide(row):
        # Not counting charge. The modifications cell may be None for an
        # unmodified peptide; group those together with the empty string.
        mods = row['Variable Modifications'] or ''
        varmodset = frozenset([x.strip() for x in mods.split(';')])
        return row['Peptide Sequence'], varmodset

    def _getReporters(row):
        attrib = standard_title_parse(row['Spectrum Description'])
        return [float(attrib[x.lower()]) for x in labels]

    report = reader(reportfile)
    try:
        columns = report.columns
        rowsByPeptide = collectByCriterion(report, _byPeptide)
    finally:
        report.close()

    sum_cols = ['Sum%s' % x for x in labels]
    top_cols = ['Max%s' % x for x in labels]
    if not outputfile:
        outputfile = insert_tag(reportfile, 'peptide_combined')
    output = writer(outputfile,
                    columns=(columns + sum_cols + top_cols + ['PSMs']))

    for pep, psms in rowsByPeptide.items():
        outrow = max(psms, key=lambda x: x['Peptide Score'])
        outrow['PSMs'] = len(psms)

        if isobaric:
            repsets = [_getReporters(x) for x in psms]
            toprepset = max(repsets, key=sum)
            sumrepset = [sum(x) for x in zip(*repsets)]

            for rep, col in zip(toprepset, top_cols):
                outrow[col] = rep
            for rep, col in zip(sumrepset, sum_cols):
                outrow[col] = rep

        output.write(outrow)

    output.close()

    return outputfile
Esempio n. 8
0
def combineFiles(files, outputFile, ext):
    """
    Concatenate several report files into one, tagging rows by origin.

    files      -- sequence of input report paths; the first one supplies the
                  column list of the output.
    outputFile -- output path. A relative path is placed (by basename) in
                  the directory of the first input file.
    ext        -- extension, including the dot, appended to outputFile
                  unless it already ends with it.

    Each output row gets a leading 'Source' column holding the basename of
    the file it came from.
    """
    if not os.path.isabs(outputFile):
        outputFile = os.path.join(os.path.dirname(files[0]),
                                  os.path.basename(outputFile))

    if not outputFile.endswith(ext):
        outputFile += ext

    # Wrap in a 1-tuple so that a tuple of files is printed as a whole
    # instead of being unpacked as format arguments.
    print("Merging %s" % (files,))

    first = reader(files[0])
    try:
        columns = first.columns
    finally:
        first.close()
    output = writer(outputFile, columns = ['Source'] + columns)

    for filename in files:
        source = os.path.basename(filename)
        report = reader(filename)
        try:
            for row in report:
                row['Source'] = source
                output.write(row)
        finally:
            report.close()

    output.close()
    print("Wrote %s !" % outputFile)
Esempio n. 9
0
def combine_accessions(reportfile, outputfile = None):
    """
    Given a Mascot-style PSM report, this combines all protein hypotheses for a given
    MS2 spectrum into a single PSM.

    Rows are grouped by 'Spectrum Description'. For each group the row with
    the highest 'Peptide Score' is kept, and, where the input has the
    corresponding column:
      'Accession Number'    becomes the '; '-joined accessions of the group,
      'Protein Description' becomes the '; '-joined descriptions,
      'Protein Masses'      is filled with the '; '-joined 'Protein Mass'
                            values.
    'Protein Redundancy' is always set to the number of rows in the group.

    outputfile may be safely specified to be the same as the input file, in
    order to overwrite the original file; the input is fully read and closed
    before the output is opened. It defaults to reportfile with a
    'combined_accessions' tag inserted.

    Returns the path of the file written.
    """
    from multiplierz.mzReport import reader, writer

    report = reader(reportfile)
    try:
        columns = report.columns
        molecules = defaultdict(list)
        for row in report:
            molecules[row['Spectrum Description']].append(row)
    finally:
        report.close()

    hasAccession = 'Accession Number' in columns
    hasDescription = 'Protein Description' in columns
    # The per-protein input column is the singular 'Protein Mass'; the
    # combined output column is the plural 'Protein Masses'.
    hasMass = 'Protein Mass' in columns

    outputData = []
    for rows in molecules.values():
        newRow = max(rows, key = lambda x: x['Peptide Score'])

        if hasAccession:
            newRow['Accession Number'] = '; '.join([x['Accession Number'] for x in rows])
        if hasDescription:
            newRow['Protein Description'] = '; '.join([x['Protein Description'] for x in rows])
        if hasMass:
            newRow['Protein Masses'] = '; '.join([str(x['Protein Mass']) for x in rows])
        newRow['Protein Redundancy'] = len(rows)
        outputData.append(newRow)

    outcolumns = list(columns)
    if outputData:
        # Keep only columns actually present on an output row, adding the
        # combined-mass column when it was produced. With no rows at all the
        # column list is arbitrary and is left as read.
        lastRow = outputData[-1]
        if 'Protein Masses' not in outcolumns:
            outcolumns.append('Protein Masses')
        outcolumns = [x for x in outcolumns if x in lastRow]
    if 'Protein Redundancy' not in outcolumns:
        outcolumns.append('Protein Redundancy')

    if not outputfile:
        outputfile = insert_tag(reportfile, 'combined_accessions')

    output = writer(outputfile, columns = outcolumns)
    for row in outputData:
        output.write(row)
    output.close()

    return outputfile
Esempio n. 10
0
    def dispatchModes(self, event):
        """
        Run the report-combination mode currently selected in self.modeCtrl.

        Opens a reader for every file in self.fileList (stored with its name
        in self.inputfiles), works out the output path and columns, opens
        self.output, and hands over to the method implementing the mode:
        concatenate, cross_report_key, unique_by_file or entries_in_common.

        Output path: the value of self.outputCtrl, or 'combined_output_file'
        when empty; '.xlsx' is appended unless the name already ends in
        xls/xlsx/csv/mzd; a relative name is placed next to the first input.

        Output columns: for 'Cross-Report Key', 'Key' plus one column per
        input file name; for the other modes, 'Source' plus the columns
        common to all inputs in the order of the first input. For the two
        report modes every checked criterion must be one of those columns.

        The run button is disabled while this runs and is re-enabled
        afterwards even if the run fails.
        """
        self.runButton.Enable(False)
        try:
            mode = self.modeCtrl.GetString(self.modeCtrl.GetSelection())
            self.criteria = self.fieldsCtrl.GetCheckedStrings()
            self.inputfiles = [(x, reader(x)) for x in self.fileList.GetStrings()]

            outputfile = self.outputCtrl.GetValue()
            if not outputfile:
                outputfile = 'combined_output_file'
            if outputfile.split('.')[-1].lower() not in ('xls', 'xlsx', 'csv',
                                                         'mzd'):
                outputfile += '.xlsx'
            if not os.path.isabs(outputfile):
                outdir = os.path.dirname(self.inputfiles[0][0])
                outputfile = os.path.join(outdir, outputfile)

            sharedColumnModes = ('Concatenate All', 'Unique-by-File Report',
                                 'Entries-in-Common Report')
            if mode in sharedColumnModes:
                columnsets = [x[1].columns for x in self.inputfiles]
                columnIntersection = set(columnsets[0]).intersection(*columnsets)
                self.outcolumns = ['Source'] + [
                    x for x in columnsets[0] if x in columnIntersection
                ]

                if mode != 'Concatenate All':
                    missing = [x for x in self.criteria
                               if x not in self.outcolumns]
                    assert not missing, (
                        "Criteria not present in every input file: %r"
                        % (missing,))
            elif mode == 'Cross-Report Key':
                self.outcolumns = ['Key'] + [x[0] for x in self.inputfiles]
            else:
                raise Exception("Unrecognized mode: %r" % (mode,))

            self.output = writer(outputfile, columns=self.outcolumns)

            if mode == 'Concatenate All':
                self.concatenate()
            elif mode == 'Cross-Report Key':
                self.cross_report_key()
            elif mode == 'Unique-by-File Report':
                self.unique_by_file()
            else:
                # Only 'Entries-in-Common Report' is left: unknown modes were
                # rejected above.
                self.entries_in_common()

            self.output.close()
            print("Wrote %s" % outputfile)
        finally:
            # Never leave the GUI with a permanently disabled run button.
            self.runButton.Enable(True)
Esempio n. 11
0
    def addColumnsMenu(self, event):
        """
        Let the user add match columns taken from the listed input files.

        Collects the union of the column names of every file in
        self.fileList and offers them in a multi-choice dialog (sorted, so
        the order is stable between invocations). On OK, the chosen columns
        that are not yet in self.fieldsCtrl are appended to it, and all
        chosen columns end up checked in addition to those already checked.
        If no columns could be collected, a message dialog is shown instead.

        event -- the triggering GUI event; not used.
        """
        filenames = [
            self.fileList.GetString(x)
            for x in range(self.fileList.GetCount())
        ]

        columns = set()
        for filename in filenames:
            read = reader(filename)
            try:
                columns.update(read.columns)
            finally:
                # The reader is only needed for its header.
                read.close()

        if not columns:
            columnAlert = wx.MessageDialog(
                None, "Could not get additional columns; no files selected.")
            try:
                columnAlert.ShowModal()
            finally:
                columnAlert.Destroy()
            return

        columns = sorted(columns)
        columnDialog = wx.MultiChoiceDialog(None,
                                            "Choose Columns To Match:",
                                            "More Fields",
                                            choices=columns)
        try:
            if columnDialog.ShowModal() != wx.ID_OK:
                return
            newCheckedColumns = [
                columns[i] for i in columnDialog.GetSelections()
            ]
        finally:
            columnDialog.Destroy()

        oldColumns = self.fieldsCtrl.GetStrings()
        checkedColumns = self.fieldsCtrl.GetCheckedStrings()

        # The checked entries are captured before SetItems and re-applied
        # after it (presumably because SetItems resets the check state).
        self.fieldsCtrl.SetItems(
            oldColumns +
            [x for x in newCheckedColumns if x not in oldColumns])
        self.fieldsCtrl.SetCheckedStrings(
            list(checkedColumns) +
            [x for x in newCheckedColumns if x not in checkedColumns])
def featureToPSM(resultFile, featureData, groupSILAC=False):
    """
    Map feature indices to the PSMs annotated with them.

    resultFile  -- path of a feature-annotated PSM report (it must have a
                   'Feature' column).
    featureData -- not used by this function.
    groupSILAC  -- when false, each PSM is filed under its 'Feature' value.
                   When true, the PSM's label state is derived from its
                   'Variable Modifications' (heavy if it contains heavyK or
                   heavyR, else medium if it contains mediumK or mediumR,
                   else light) and the PSM is filed under every
                   ';'-separated entry of the matching
                   'Light Features' / 'Medium Features' / 'Heavy Features'
                   column.

    Feature values are converted with int(float(value)); PSMs whose feature
    value is empty, None or otherwise not numeric are skipped.

    Returns a plain dict {feature index: [psm, ...]}.
    Raises IOError if the report has no 'Feature' column.
    """
    def _featureIndex(value):
        # Returns None when the cell does not hold a usable number.
        try:
            return int(float(value))
        except (ValueError, TypeError, OverflowError):
            return None

    results = reader(resultFile)
    try:
        if 'Feature' not in results.columns:
            raise IOError("Not a feature-annotated file!")

        featureToPSMs = defaultdict(list)
        for psm in results:
            if groupSILAC:
                mods = psm['Variable Modifications']
                if mods is None:
                    mods = []

                isHeavy = heavyK in mods or heavyR in mods
                isMedium = (mediumK in mods or mediumR in mods) and not isHeavy

                if isHeavy:
                    column = 'Heavy Features'
                elif isMedium:
                    column = 'Medium Features'
                else:
                    column = 'Light Features'

                if not psm[column]:
                    continue
                values = str(psm[column]).split(';')
            else:
                values = [psm['Feature']]

            for value in values:
                feature = _featureIndex(value)
                if feature is not None:
                    featureToPSMs[feature].append(psm)
    finally:
        results.close()

    return dict(featureToPSMs)
Esempio n. 13
0
    def on_convert(self, event):
        """
        Convert every file in the file list to the selected report format.

        The input-format index selects the parser (0 Mascot CSV, 1 Mascot
        DAT, 2 mzIdentML, 3 Protein Pilot, 4 OMSSA, 5 X!Tandem XML, 6 other
        mzReport file) and the output-format index selects the extension
        (.xls, .xlsx, .csv or .mzd). When the combine box is checked the
        inputs are instead merged into one file via combineFiles, which is
        only offered for input formats 0 and 6.

        A busy cursor is shown while working and is always restored,
        including on the "cannot merge" early return and when a conversion
        raises.

        event -- the triggering GUI event; not used.
        """
        if not self.file_list.GetStrings():
            wx.MessageBox('No files selected', 'Error')
            return

        #show hourglass
        wx.BeginBusyCursor(wx.HOURGLASS_CURSOR)
        try:
            files = self.file_list.GetStrings()
            input_format = self.input_format.GetSelection()
            output_format = self.output_format.GetSelection()
            output_ext = { 0:'.xls', 1:'.xlsx', 2:'.csv', 3:'.mzd' }[output_format]

            #update statusbar
            self.set_status("Converting...", 0)
            self.set_status("", 1)

            if self.combineCheck.GetValue():
                if input_format not in [0, 6]:
                    wx.MessageBox("Only tabular/Excel files can currently be merged.")
                    self.set_status("Ready", 0)
                    return
                combineFiles(files, self.combineCtrl.GetValue(), output_ext)

            elif input_format == 0: # Mascot CSV
                mascot_converter = mascot.mascot(version=settings.mascot_version)

                for file_name in files:
                    self.set_status(file_name, 1)

                    # The raw export is first cleaned into a temporary
                    # '<name>_clean<ext>' file, which is then re-written in
                    # the requested format and removed.
                    clean_csv_file = '_clean'.join(os.path.splitext(file_name))

                    rep_file = os.path.splitext(clean_csv_file)[0] + output_ext
                    if os.path.exists(rep_file):
                        os.remove(rep_file)

                    mascot_converter.clean_csv(file_name, export_file=clean_csv_file, ion_list=False)

                    repreader = mzReport.reader(clean_csv_file)
                    repwriter = mzReport.writer(rep_file, columns=repreader.columns)

                    for row in repreader:
                        repwriter.write(row)

                    repreader.close()
                    repwriter.close()

                    os.remove(clean_csv_file)

            elif input_format == 1: # Mascot DAT
                mascot_reporter = mzTools.MascotReport()

                _mascot_options = dict(max_hits=1000, ion_cutoff=20, bold_red=True,
                                       unassigned_queries=False, show_query_data=True,
                                       show_same_set=False, show_sub_set=False, quant=False)

                for file_name in files:
                    self.set_status(file_name, 1)

                    mascot_dat_file = mascot.MascotDatFile(file_name, **_mascot_options)
                    mascot_header = mascot_dat_file.mascot_header()

                    # Name the report after the header's entry 7 when it is
                    # set, otherwise after the .dat file itself.
                    ms_file_name = mascot_header[7][1] or (os.path.splitext(os.path.basename(file_name))[0])
                    report_file = os.path.join(os.path.dirname(file_name),
                                               os.path.basename(ms_file_name) + output_ext)

                    if os.path.exists(report_file):
                        os.remove(report_file)

                    if output_ext in ('.xls', '.xlsx', '.mzd'):
                        mascot_reporter.mascot_header(report_file, mascot_header)

                    # NOTE(review): this is a string comparison, so e.g.
                    # '2.10' would sort before '2.3' -- confirm the format
                    # getMascotVer() returns.
                    if mascot_dat_file.res_file.getMascotVer() >= '2.3':
                        report = mzReport.writer(report_file,
                                                 columns=(mzReport.default_columns[:1]
                                                          + ['Protein Database']
                                                          + mzReport.default_columns[1:]))
                    else:
                        report = mzReport.writer(report_file, default_columns=True)

                    for row in mascot_dat_file.peptide_report():
                        report.write(row)

                    mascot_dat_file.close()
                    report.close()

            elif input_format == 2: # Mascot mzIdentML
                for file_name in files:
                    mzid = mzIdentML(file_name)
                    data = mzid.peptideSummary()
                    header = list(data[0].keys())

                    report_file = os.path.splitext(file_name)[0] + output_ext

                    if os.path.exists(report_file):
                        os.remove(report_file)

                    report = mzReport.writer(report_file, columns = header)

                    for row in data:
                        writeRow = []
                        for column in header:
                            thing = row[column]
                            # List-valued fields are flattened to one cell.
                            if isinstance(thing, list):
                                thing = "; ".join(thing)
                            writeRow.append(thing)
                        report.write(writeRow)

                    report.close()

            elif input_format == 3: # Protein Pilot
                for file_name in files:
                    self.set_status(file_name, 1)
                    pilot = ProteinPilot(file_name)
                    pilot.format(str(os.path.splitext(file_name)[0] + output_ext))

            elif input_format == 4: # OMSSA
                for file_name in files:
                    self.set_status(file_name, 1)
                    omssa = OMSSA_CSV(file_name)
                    omssa.format(str(os.path.splitext(file_name)[0] + output_ext))

            elif input_format == 5: # X!Tandem XML
                for file_name in files:
                    report_file = os.path.splitext(file_name)[0] + output_ext

                    format_XML(file_name, report_file)

            elif input_format == 6: # other mzReport
                for file_name in files:
                    self.set_status(file_name, 1)

                    rdr = reader(file_name)
                    # splitext only looks at the final path component, so
                    # dotted directory names and extension-less files are
                    # handled correctly.
                    outputname = os.path.splitext(file_name)[0] + output_ext
                    wtr = writer(outputname, columns = rdr.columns)

                    for row in rdr:
                        wtr.write(row)
                    wtr.close()
                    rdr.close()
        finally:
            #hide hourglass
            wx.EndBusyCursor()

        self.set_status("Ready", 0)
        self.set_status("Done", 1)
Esempio n. 14
0
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but
    for the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred. For each fraction in which a given peptide appeared, we take
    XICs over the m/z values for the set of observed charges and compare
    their total intensity; the fraction with the most intense XIC(s) is
    assigned that peptide for the final count.

    directory -- directory holding the raw files (names ending in 'raw') and
                 the result subdirectories. A raw file is matched to a result
                 file by the part of the file name before the first '.'.
    subdirs   -- result subdirectories to process; each is handled
                 independently.

    For every result file that wins at least one peptide, a
    '<name>.XIC_localized.xlsx' file is written next to it containing one
    PSM (the first one from that file) per peptide assigned to it. Existing
    'XIC_localized' files are not used as input.
    """

    tolerance = 0.1
    time_tolerance = 15

    rawfiles = dict([(x.split('.')[0], mzFile(os.path.join(directory, x)))
                     for x in os.listdir(directory)
                     if x.lower().endswith('raw')])
    columns = None

    for subdir in subdirs:
        resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
        resultfiles = [x for x in resultfiles if 'XIC_localized' not in x]

        # peptide -> {result file -> PSMs of that peptide in that file}
        peptidesForFile = defaultdict(dict)
        for resultfile in resultfiles:
            rdr = reader(resultfile)
            try:
                columns = rdr.columns
                rows = list(rdr)
            finally:
                rdr.close()
            psmsByPeptide = collectByCriterion(
                rows, lambda x:
                (x['Peptide Sequence'], x['Variable Modifications']))
            for peptide, psms in psmsByPeptide.items():
                peptidesForFile[peptide][resultfile] = psms

        outputByFile = defaultdict(list)
        for peptide, psmsByFile in peptidesForFile.items():
            xicsByFile = []

            allPSMs = sum(psmsByFile.values(), [])
            mass = allPSMs[0]['Predicted mr']
            assert len(set(x['Predicted mr'] for x in allPSMs)) == 1

            charges = set(x['Charge'] for x in allPSMs)
            # The first two '.'-separated fields of the spectrum description
            # are used as (raw file key, scan name).
            allScans = set([
                tuple(x['Spectrum Description'].split('.')[:2])
                for x in allPSMs
            ])
            allRTs = set(rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                         for x in allScans)
            minRT, maxRT = min(allRTs), max(allRTs)

            for resultfile, psms in psmsByFile.items():
                # Take the basename first: splitting the whole path on '.'
                # would pick up a dot in a directory name.
                rawkey = os.path.basename(resultfile).split('.')[0]
                rawfile = rawfiles[rawkey]
                xicInt = 0
                for charge in charges:
                    mz = (mass + (1.0072764 * charge)) / charge
                    xic = rawfile.xic(minRT - time_tolerance,
                                      maxRT + time_tolerance, mz - tolerance,
                                      mz + tolerance)
                    # Sum the second item of each XIC point; an empty XIC
                    # contributes nothing.
                    xicInt += sum(point[1] for point in xic)

                xicsByFile.append((xicInt, resultfile))

            highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
            outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

        for resultfile, psms in outputByFile.items():
            # resultfile ends in '.xlsx'; strip those five characters.
            outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
            output = writer(outputfile, columns=columns)
            for psm in psms:
                output.write(psm)
            output.close()
Esempio n. 15
0
def multimode_fractionation_plot(mode_fractions, outputfile = None,
                                 count_to_size = (lambda x: x/100),
                                 fig_size = None,
                                 color_sequence = None):
    """
    mode_fractions must be a list of (<mode title>, <fractions>) tuples where
    <fractions> is a list of (<organic fraction>, <salt fraction>,
    <filename>) tuples as in the input for fractionation_plot.  Output is a plot
    of each fraction in organic/salt space, where each point is a pie chart
    showing the relative magnitude of each mode (via chart slice sizes.)

    mode_fractions -- see above; it is not modified.
    outputfile     -- path to save the figure to; when omitted the figure is
                      shown interactively instead.
    count_to_size  -- currently unused: every pie is drawn with radius 1.
    fig_size       -- optional (width, height) in inches for the figure.
    color_sequence -- passed to the pie charts as their `colors`.

    The organic and salt values are converted to float and mapped to
    consecutive integer grid positions (starting at 1) in sorted order; the
    count for a fraction is the number of rows in its file.
    """

    pyt.cla()

    # Normalise into a fresh list so the caller's data is left untouched.
    mode_fractions = [(entry[0],
                       [(float(o), float(s), f) for o, s, f in entry[1]])
                      for entry in mode_fractions]

    modes = [mode for mode, fractions in mode_fractions]
    organics = sorted(set(o for mode, fractions in mode_fractions
                          for o, s, f in fractions))
    salts = sorted(set(s for mode, fractions in mode_fractions
                       for o, s, f in fractions))

    orgCoords = dict([(o, i) for i, o in enumerate(organics, start = 1)])
    saltCoords = dict([(s, i) for i, s in enumerate(salts, start = 1)])

    # (organic coord, salt coord) -> {mode: row count}
    grid = defaultdict(dict)
    for mode, fractions in mode_fractions:
        for organic, salt, filename in fractions:
            rdr = reader(filename)
            try:
                count = len(list(rdr))
            finally:
                rdr.close()
            grid[orgCoords[organic], saltCoords[salt]][mode] = count

    # Size the figure that is actually drawn on (a figure created before
    # this point would not be the one plotted and saved).
    fig = pyt.figure()
    if fig_size:
        fig.set_size_inches(*fig_size)
    ax = fig.gca()

    for (orgCoord, saltCoord), modecounts in grid.items():
        countForAllModes = [modecounts.get(m, 0) for m in modes]
        ax.pie(countForAllModes, center = (orgCoord, saltCoord),
               radius = 1, frame = True,
               colors = color_sequence)

    ax.set_xlim(min(orgCoords.values()) - 0.5, max(orgCoords.values()) + 0.5)
    ax.set_ylim(min(saltCoords.values()) - 0.5, max(saltCoords.values()) + 0.5)
    ax.set_aspect('equal')

    ax.legend(modes, loc = 'upper left')

    if not outputfile:
        pyt.show()
    else:
        pyt.savefig(outputfile)
    pyt.cla()
        
        
    
Esempio n. 16
0
def fractionation_plot(fractions, outputfile = None, fig_size = None, **kwargs):
    """
    Takes a list of 3-tuples (<organic fraction>, <salt fraction>, <source>)
    describing the PSM output of a multi-fraction MS experiment; draws a
    plot where each fraction is represented as a point scaled by its PSM
    count in organic/salt space, labelled with that count.

    fractions -- iterable of (organic, salt, source). organic and salt are
        converted with float(); source is either an int (the PSM count
        itself) or a string (a report file whose rows are counted).
    outputfile -- if given, the figure is saved there; otherwise it is shown.
    fig_size -- optional (width, height) in inches. When omitted, the figure
        is only enlarged if either axis has more than 8 distinct values.
    **kwargs -- forwarded unchanged to pyplot.scatter.

    Raises ValueError if no fractions are given, and Exception if a source
    is neither an int nor a string.
    """
    from multiplierz.mzReport import reader
    import matplotlib.pyplot as pyt
    pyt.cla()

    # 'basestring' only exists on Python 2; fall back to 'str' on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    fractions = [(float(o), float(s), f) for o, s, f in fractions]
    if not fractions:
        raise ValueError("No fractions given.")

    organics = sorted(set(o for o, _, _ in fractions))
    salts = sorted(set(s for _, s, _ in fractions))

    # Distinct values are laid out on an evenly spaced 1..N grid rather than
    # at their numeric positions.
    orgCoords = dict((o, i) for i, o in enumerate(organics, start = 1))
    saltCoords = dict((s, i) for i, s in enumerate(salts, start = 1))

    if fig_size:
        pyt.gcf().set_size_inches(*fig_size)
    elif len(orgCoords) > 8 or len(saltCoords) > 8:
        fig = pyt.gcf()
        width, height = fig.get_size_inches()
        if len(orgCoords) > 8:
            width = max(width, len(orgCoords) * 0.9)
        if len(saltCoords) > 8:
            height = max(height, len(saltCoords) * 0.9)
        fig.set_size_inches(width, height)

    scatterPts = []
    for organic, salt, psms in fractions:
        orgcoord = orgCoords[organic]
        saltcoord = saltCoords[salt]
        if isinstance(psms, int): # Can just pass the count.
            count = psms
        elif isinstance(psms, string_types): # Else the file
            rdr = reader(psms)
            try:
                try:
                    count = rdr.get_row_count()
                except (IOError, AttributeError):
                    # Reader can't report its size; count rows the slow way.
                    count = len(list(rdr))
            finally:
                rdr.close()
        else:
            raise Exception("Must specify PSM count or file.")

        scatterPts.append((orgcoord, saltcoord, count))
        pyt.text(orgcoord, saltcoord, str(count),
                 verticalalignment = 'center',
                 horizontalalignment = 'center')

    orgRange = max(orgCoords.values()) - min(orgCoords.values())
    saltRange = max(saltCoords.values()) - min(saltCoords.values())
    orgMargin = orgRange / 15.0
    saltMargin = saltRange / 15.0

    # gca() rather than axes(): the labels above were drawn on the current
    # axes, and axes() may create a second, overlapping axes object.
    ax = pyt.gca()
    ax.set_xlim(min(orgCoords.values()) - orgMargin, max(orgCoords.values()) + orgMargin)
    ax.set_ylim(min(saltCoords.values()) - saltMargin, max(saltCoords.values()) + saltMargin)

    orgpts, saltpts, counts = zip(*scatterPts)
    # Third positional argument of scatter is the marker size.
    pyt.scatter(orgpts, saltpts, counts,
                alpha = 0.2,
                **kwargs)

    # Tick positions are grid coordinates; tick labels are the real values.
    pyt.xticks([orgCoords[o] for o in organics], organics)
    pyt.yticks([saltCoords[s] for s in salts], salts)
    pyt.xlabel('Organic')
    pyt.ylabel('Salt')

    if not outputfile:
        pyt.show()
    else:
        pyt.savefig(outputfile)

    pyt.cla()
Esempio n. 17
0
def _detect_matches(file_names, fields, tol_field, tolerance=0.0):
    """
    Generator that groups rows from several report files by the values of
    `fields` and reports which files each group was detected in.

    The first item yielded is a header row (a list): the field names, the
    base name of every input file and a final "Detections" column. In the
    non-.mzd branch the header also contains `tol_field` when it is given.
    Every later item is a tuple: the group's field values (plus the
    tolerance value when `tol_field` is given), one 0/1 flag per input file,
    and the sum of those flags.

    file_names -- paths of the report files to compare.
    fields -- column names whose values define a group.
    tol_field -- optional column compared with a tolerance instead of
        exactly; falsy to disable.
    tolerance -- maximum absolute difference for two `tol_field` values to
        be merged.

    When every file name ends in '.mzd' the files are attached to an
    in-memory SQLite connection and grouped with a query; otherwise each
    file is read through mzReport.reader.
    """

    all_mzd = all(
        (os.path.splitext(f)[1].lower() == '.mzd') for f in file_names)

    # a separate set code for when all of the files are SQLite databases.
    # might as well take advantage of the ability to do a real query
    if all_mzd:
        # create a SQLite database in memory
        conn = sqlite3.connect(':memory:')  # connection object

        # short-hand names for each file
        table_names = [('mzd%d' % i) for i, f in enumerate(file_names)]

        for f, t in zip(file_names, table_names):
            # attach each mzResults file to the database
            conn.execute('attach database (?) as (?)', (f, t))

        # add 'comma-join' function to aggregate file names
        conn.create_aggregate("cjoin", 1, CommaJoin)

        #if tol_field:
        #fields.append(tol_field)

        field_list = ','.join('"%s"' % f for f in fields)

        # Each sub-query tags its rows with the short table name, so the
        # aggregated "File Name" value lists table names, not file paths.
        sub_query = ('select \'%s\' as "File Name",' + field_list +
                     ' from %s.PeptideData')

        union_query = ' union '.join((sub_query % (t, t)) for t in table_names)

        query = ('select distinct cjoin("File Name"),%s' +
                 ' from (%s) group by %s') % (field_list, union_query,
                                              field_list)

        try:
            # Header row. Unlike the reader branch below, tol_field is not
            # part of this header.
            yield (fields + [os.path.basename(f)
                             for f in file_names] + ["Detections"])

            # if tolerance is specified, need to do a second grouping step
            if tol_field:
                # tol_dict[key][tolerance value] -> set of table names
                tol_dict = defaultdict(lambda: defaultdict(set))

                for row in conn.execute(query):
                    file_set = set(row[0].split(','))
                    # NOTE(review): the last selected column is parsed as
                    # the tolerance value, but tol_field itself is never
                    # added to the query (the append above is commented
                    # out) -- confirm callers pass it as the last field.
                    t = tuple(row[1:-1])
                    tol_val = float(row[-1])

                    # Closest tolerance value already recorded for this
                    # key, or tol_val itself when none exists yet.
                    k = min(tol_dict[t] or [tol_val],
                            key=lambda k: abs(k - tol_val))
                    if abs(k - tol_val) <= tolerance:
                        # Within tolerance: share the file sets both ways.
                        tol_dict[t][k].update(file_set)
                        tol_dict[t][tol_val].update(tol_dict[t][k])
                    else:
                        tol_dict[t][tol_val].update(file_set)

                for t in tol_dict:
                    for v in tol_dict[t]:
                        file_row = [
                            int(f in tol_dict[t][v]) for f in table_names
                        ]
                        file_row.append(sum(file_row))

                        yield (t + (v, ) + tuple(file_row))
            else:
                for row in conn.execute(query):
                    file_set = set(row[0].split(','))
                    file_row = [int(f in file_set) for f in table_names]
                    file_row.append(sum(file_row))

                    yield (row[1:] + tuple(file_row))
        finally:
            # Runs when the generator finishes, fails, or is closed early.
            conn.close()
    else:
        # With a tolerance field: rows[key][tolerance value] -> file names.
        # Without one: rows[key] -> file names.
        if tol_field:
            rows = defaultdict(lambda: defaultdict(set))
        else:
            rows = defaultdict(set)

        for name in file_names:
            report = mzReport.reader(name)

            #mzTools.logger_message(10, name)

            for row in report:
                # Columns are looked up by lower-cased field name.
                t = tuple(row.get(field.lower()) for field in fields)

                if tol_field:
                    tol_val = row.get(tol_field.lower())

                    rows[t][tol_val].add(name)

                    # Merge with every recorded value within tolerance
                    # (including tol_val itself, which was just added).
                    for k in rows[t].keys():
                        if abs(k - tol_val) <= tolerance:
                            rows[t][tol_val].update(rows[t][k])
                            rows[t][k].add(name)
                else:
                    rows[t].add(name)

            report.close()

        # Nothing is yielded from this branch until every file is read.
        if tol_field:
            yield (fields + [tol_field] +
                   [os.path.basename(f) for f in file_names] + ["Detections"])

            for t in rows:
                for v in rows[t]:
                    file_row = [int(f in rows[t][v]) for f in file_names]
                    file_row.append(sum(file_row))

                    yield (t + (v, ) + tuple(file_row))

        else:
            yield (fields + [os.path.basename(f)
                             for f in file_names] + ["Detections"])

            for t in rows:
                file_row = [int(f in rows[t]) for f in file_names]
                file_row.append(sum(file_row))

                yield (t + tuple(file_row))
Esempio n. 18
0
def filterJoin(filenames,
               matchColumns,
               returnMode,
               outputKeyFile,
               combinedOutputFile=None,
               outputFileType='.xlsx',
               outputTag=None,
               tolerance=None,
               toleranceColumn=None):
    """
    Produces joined output, filtering out either repeat or unique PSMs.

    A 'PSM group' is the set of PSMs, across all input files, that share the
    same values in matchColumns (and, when toleranceColumn is given, whose
    toleranceColumn values all lie within `tolerance` of each other).

    If returnMode is 'matched', the output contains one instance of each
    PSM group that has more than one member;
    if returnMode is 'unmatched', the output contains every PSM that is the
    only member of its group;
    if 'both', both kinds of PSM are included.

    filenames -- report files to read; all must have identical columns.
    matchColumns -- column names used to build the group signature.
    outputKeyFile -- if truthy, a file written with one row per group giving
        the number of member PSMs contributed by each input file.
    combinedOutputFile -- if given, all selected PSMs are written there.
    outputTag -- if given, one output file per input file is written, named
        <input base>.<outputTag>.<outputFileType>.
    outputFileType -- extension of the per-input outputs; a leading dot is
        optional.

    Returns the list of output file names written (per-input files first,
    then the combined file); the key file is not included.
    """

    assert returnMode in ['matched', 'unmatched', 'both']

    data = []
    columnLists = []
    for filename in filenames:
        inputfile = reader(filename)
        try:
            columnLists.append(inputfile.columns)
            subdata = []
            for psm in inputfile:
                psm['Source'] = filename
                subdata.append(psm)
        finally:
            inputfile.close()
        data.append(subdata)

    assert all(columnLists[0] == x
               for x in columnLists), "Heterogeneous data columns!"

    datadict = defaultdict(list)
    for subdata in data:
        for psm in subdata:
            signature = tuple(psm[x] for x in matchColumns)
            datadict[signature].append(psm)

    if toleranceColumn:
        # Split every exact-match group into sub-groups whose members are
        # all within `tolerance` of one another.
        toldatadict = {}
        for signature, sigGroup in datadict.items():
            subGroups = []
            for psm in sigGroup:
                for subGroup in subGroups:
                    if all(
                            abs(psm[toleranceColumn] - subpsm[toleranceColumn])
                            < tolerance for subpsm in subGroup):
                        subGroup.append(psm)
                        break
                else:
                    # No existing sub-group accepted this PSM.
                    subGroups.append([psm])

            for index, subGroup in enumerate(subGroups):
                # The sub-group index keeps the new keys distinct. (This
                # used to add an int to a list, which raised TypeError.)
                toldatadict[signature + (index, )] = subGroup

        datadict = toldatadict

    if outputKeyFile:
        keyfile = writer(outputKeyFile, columns=['PSM Key'] + filenames)
        for signature, psmGroup in datadict.items():
            line = {}
            line['PSM Key'] = '|'.join([str(x) for x in signature])
            line.update([(x, len([y for y in psmGroup if y['Source'] == x]))
                         for x in filenames])
            keyfile.write(line)

        keyfile.close()

    outputpsms = []
    if returnMode == 'matched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) > 1:
                exemplar = psmGroup[0]
                sourceFiles = '; '.join(set([x['Source'] for x in psmGroup]))
                # NOTE(review): this writes the lower-case key 'source',
                # while everything else uses 'Source'. Whether they are the
                # same column depends on the row type -- confirm.
                exemplar['source'] = sourceFiles
                outputpsms.append(exemplar)

    if returnMode == 'unmatched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) == 1:
                outputpsms.append(psmGroup[0])

    outputs = []
    if outputTag:
        # Drop a leading dot so the default '.xlsx' doesn't give 'x.tag..xlsx'.
        extension = outputFileType.lstrip('.')
        outputfiles = [
            (x, '.'.join(x.split('.')[:-1] + [outputTag, extension]))
            for x in filenames
        ]
        for filename, outputfile in outputfiles:
            output = writer(outputfile, columns=['Source'] + columnLists[0])
            for psm in [x for x in outputpsms if x['Source'] == filename]:
                output.write(psm)
            output.close()

        outputs = [x[1] for x in outputfiles]

    if combinedOutputFile:
        output = writer(combinedOutputFile,
                        columns=['Source'] + columnLists[0])
        for psm in outputpsms:
            output.write(psm)
        output.close()
        outputs.append(combinedOutputFile)

    return outputs
Esempio n. 19
0
def _filter_join(file_names,
                 key_source_file,
                 exclude,
                 save_file_suffix='_filtered'):
    """
    Generator that filters each file in `file_names` against the rows of
    `key_source_file`.

    The first item yielded is a list holding one tuple of column names per
    input file. After that, one 3-tuple is yielded per input file:
    (file name, columns, rows). Every row carries an extra 'Filter Key'
    entry: the '|'-joined values of the columns it shares with the key file.

    file_names -- report files to filter.
    key_source_file -- report whose rows define the filter keys.
    exclude -- if true, keep rows that do NOT match a key; otherwise keep
        only rows that do.
    save_file_suffix -- not used by this function.

    If the key file and every input file end in '.mzd', matching is done by
    SQLite on all shared columns and `rows` is a lazy generator of dicts
    that must be consumed before the generator is advanced. Otherwise files
    are read through mzReport.reader, only key rows with 'Detections' > 1
    are used, and `rows` is a list.
    """

    all_mzd = (os.path.splitext(key_source_file)[1].lower() == '.mzd' and all(
        (os.path.splitext(f)[1].lower() == '.mzd') for f in file_names))

    if all_mzd:
        # create a SQLite database in memory
        conn = sqlite3.connect(':memory:')  # connection object
        # The try starts here so a failed attach also closes the connection.
        try:
            conn.execute('attach database (?) as key_table',
                         (key_source_file, ))

            cols = [
                d[0] for d in conn.execute(
                    'select * from key_table.PeptideData limit 1').description
            ]
            col_set = set(cols)

            # short-hand names for each file
            table_names = [('mzd%d' % i) for i, f in enumerate(file_names)]

            col_list = []
            for f, t in zip(file_names, table_names):
                # attach each mzResults file to the database
                conn.execute('attach database (?) as (?)', (f, t))
                col_list.append(
                    tuple(d[0] for d in conn.execute(
                        'select * from %s.PeptideData limit 1' %
                        t).description))

            yield col_list

            for f, t, t_cols in zip(file_names, table_names, col_list):
                t_set = set(t_cols)
                # Equal on every shared column, treating NULL = NULL as a
                # match.
                u_str = ' and '.join(
                    '(A."%s" = B."%s" or ifnull(A."%s", B."%s") is NULL)' %
                    (c, c, c, c) for c in t_cols if c in col_set)

                query = 'select A.* from %s.PeptideData as A, key_table.PeptideData as B where %s' % (
                    t, u_str)

                if exclude:
                    query = 'select * from %s.PeptideData except %s' % (t,
                                                                        query)

                cur = conn.execute(query)

                res_cols = [d[0] for d in cur.description]
                res_dict = dict((c, i) for i, c in enumerate(res_cols))

                # list(zip(...)) so the concatenation also works where zip
                # returns an iterator (Python 3).
                yield (f, t_cols,
                       (dict([('Filter Key', '|'.join(
                           str(row[res_dict[c]])
                           for c in cols if c in t_set))] +
                             list(zip(t_cols, (v for i, v in enumerate(row)
                                               if res_cols[i] in t_cols))))
                        for row in cur))

                conn.execute('detach database %s' % t)
        finally:
            conn.close()
    else:
        source_report = mzReport.reader(key_source_file)
        try:
            col_list = []
            for name in file_names:
                rdr = mzReport.reader(name)
                col_list.append(tuple(rdr.columns))
                rdr.close()

            yield col_list

            # Key rows are read once on first use, so later files don't
            # depend on the key report being iterable a second time.
            detected_rows = None

            for name in file_names:
                this_rep = mzReport.reader(name)
                # Closing per file in an inner finally avoids touching an
                # unbound 'this_rep' when file_names is empty.
                try:
                    key_cols = [
                        col for col in source_report.columns
                        if col in this_rep.columns
                    ]

                    if detected_rows is None:
                        detected_rows = [
                            row for row in source_report
                            if int(row['Detections']) > 1
                        ]

                    filter_keys = set(
                        "|".join(str(row[col]) for col in key_cols)
                        for row in detected_rows)

                    filtered_data = []
                    for row in this_rep:
                        filter_key = "|".join(
                            str(row[col]) for col in key_cols)
                        # Keep matches normally, non-matches when excluding.
                        if (filter_key in filter_keys) != bool(exclude):
                            row['Filter Key'] = filter_key
                            filtered_data.append(row)

                    yield (name, this_rep.columns, filtered_data)
                finally:
                    this_rep.close()
        finally:
            source_report.close()
Esempio n. 20
0
def calculate_FDR(reportfile, outputfile = None, threshold = 0.01,
                  decoyString = 'rev_', includeStatisticsSheet = True,
                  includeDuplicates = True, separateDuplicateSheet = True,
                  includeFailedSheet = True, includeReverseSheet = True,
                  single_cutoff = True):
    """
    Performs Forward/Reverse database filtering on the target file, giving back
    the true PSMs over the specified statistical threshold as well as removed decoy
    and below-threshold PSMs, in respective sheets.

    All entries in the decoy (reverse) database must have accessions that begin with
    some uniform prefix; by default, "rev_" (so that gi|198292342|X7823_EXTRA becomes
    rev_gi|198292342|X7823_EXTRA.) The prefix is matched case-insensitively.

    Rows are processed in order of descending 'Peptide Score'. Each row's
    FDR is (reverse hits so far) / (forward hits so far), stored in a new
    'FDR' column. A row whose 'Spectrum Description' was already seen is a
    duplicate and inherits the FDR of the first row for that spectrum.

    reportfile -- PSM report to filter.
    outputfile -- destination; by default derived from reportfile with an
        'FDR_filtered' tag (output format must support sheets).
    threshold -- forward rows with FDR below this value pass.
    includeDuplicates / separateDuplicateSheet -- duplicates are merged into
        the passed/failed rows only when includeDuplicates is true and
        separateDuplicateSheet is false; otherwise they are kept apart and
        written only if separateDuplicateSheet is true.
    includeFailedSheet, includeReverseSheet, includeStatisticsSheet --
        toggle the corresponding output sheets.
    single_cutoff -- if true, failed rows scoring above the lowest passing
        score are moved to the passed rows.

    outputfile may be safely specified to be the same as the input file, in
    order to overwrite the original file.

    Returns the output file name.
    """

    from multiplierz.mzReport import reader, writer

    def score(row):
        # Scores are always compared numerically, even if stored as text.
        return float(row['Peptide Score'])

    reportReader = reader(reportfile)
    try:
        reportRows = list(reportReader)
        columns = reportReader.columns + ['FDR']
    finally:
        reportReader.close()

    reportRows.sort(key = score, reverse = True)

    # Accessions are lower-cased before matching, so the tag must be too.
    decoyTag = decoyString.lower()

    seenSpectra = {}

    passedRows = []
    failedRows = []
    duplicateRows = []
    reverseRows = []

    reverses = 0.0
    forwards = 0.0
    duplicates = 0
    passed = 0
    failed = 0
    lowPass = 999999999
    highRev = 0
    for row in reportRows:
        specDesc = row['Spectrum Description']
        if specDesc in seenSpectra:
            duplicates += 1
            fdr = seenSpectra[specDesc]
            row['FDR'] = fdr
            if includeDuplicates and not separateDuplicateSheet:
                if fdr < threshold:
                    passedRows.append(row)
                else:
                    failedRows.append(row)
            else:
                duplicateRows.append(row)
            continue

        # A row only counts as a decoy hit if EVERY accession is a decoy;
        # checking for any decoy accession produced awful results, since
        # high-scoring peptides could just happen to be duplicated in the
        # reverse database.
        accessions = row['Accession Number'].split(';')
        if all(decoyTag in x.lower() for x in accessions):
            reverses += 1
            if forwards:
                fdr = reverses / forwards
            else:
                fdr = 100
            row['FDR'] = fdr

            highRev = max(highRev, score(row))

            seenSpectra[specDesc] = fdr
            reverseRows.append(row)
        else:
            forwards += 1
            fdr = reverses / forwards
            row['FDR'] = fdr

            seenSpectra[specDesc] = fdr
            if fdr < threshold:
                passed += 1
                passedRows.append(row)
                lowPass = min(lowPass, score(row))
            else:
                failed += 1
                failedRows.append(row)

    if single_cutoff:
        recovered = [x for x in failedRows if score(x) > lowPass]
        failedRows = [x for x in failedRows if score(x) <= lowPass]
        passedRows += recovered

    if not outputfile:
        # Output format must support sheets.
        if reportfile.lower().endswith(('xlsx', 'xls')):
            outputfile = insert_tag(reportfile, 'FDR_filtered')
        else:
            outputfile = '.'.join(reportfile.split('.')[:-1] + ['FDR_filtered.xlsx'])

    percentage = round(threshold * 100)

    def write_sheet(sheet_name, rows, sheet_columns = columns):
        # Writes one sheet of the output workbook and always closes it.
        sheet = writer(outputfile, columns = sheet_columns,
                       sheet_name = sheet_name)
        try:
            for row in rows:
                sheet.write(row)
        finally:
            sheet.close()

    if includeFailedSheet:
        write_sheet("Failed %s%% FDR" % percentage, failedRows)

    if separateDuplicateSheet:
        write_sheet("Duplicate Rows", duplicateRows)

    if includeReverseSheet:
        write_sheet('Reverse Hits', reverseRows)

    if includeStatisticsSheet:
        # 'passed' counts rows that passed on their own FDR; rows recovered
        # by single_cutoff and merged duplicates are not included.
        write_sheet("FDR Statistics",
                    [['', ''],
                     ['Total Spectra', str(len(reportRows))],
                     ['Passed %s%% FDR' % percentage, str(passed)],
                     ['Lowest Passing Score', str(lowPass)],
                     ['Reverse Hits', str(reverses)],
                     ['Highest Scoring Reverse Hit', str(highRev)],
                     ['Number of Duplicates', str(duplicates)]],
                    sheet_columns = ['FDR Calculation Statistics', '--------------'])

    # The passed rows go last, into the sheet named "Data".
    write_sheet("Data", passedRows)

    return outputfile
Esempio n. 21
0
def feature_analysis(datafile,
                     resultFiles,
                     featureFile=None,
                     tolerance=None,
                     mzRegex=None,
                     scanRegex=None,
                     **constants):
    """
    Performs feature-detection analysis on the given .RAW file and PSM
    reports. The output files group the given PSMs by feature, with the
    addition of source feature extent and intensity information.

    datafile -- path to the .raw data file; must exist.
    resultFiles -- PSM report files to annotate; each must exist. May be
        empty, in which case only feature detection is performed.
    featureFile -- existing feature data file; if omitted, one is produced
        by detect_features(datafile, tolerance=tolerance, **constants).
    mzRegex -- if given, REPLACES the module-level spectrumDescriptionToMZ
        parser with one returning float() of the first regex match.
    scanRegex -- if given, REPLACES the module-level
        spectrumDescriptionToScanNumber parser with one returning int() of
        the first regex match.

    Both replacements are global and persist after this call returns.

    Returns (featureFile, outputfiles), where outputfiles holds one
    '<result base>.featureDetect.xlsx' path per result file.

    Raises AssertionError if an input file is missing or datafile is not a
    .raw file.
    """

    import os
    import re

    global spectrumDescriptionToMZ, spectrumDescriptionToScanNumber

    if mzRegex:
        mzRegCompiled = re.compile(mzRegex)

        def mzParser(description):
            return float(mzRegCompiled.search(description).group())

        spectrumDescriptionToMZ = mzParser

    if scanRegex:
        scanRegCompiled = re.compile(scanRegex)

        def scanParser(description):
            return int(scanRegCompiled.search(description).group())

        spectrumDescriptionToScanNumber = scanParser

    assert os.path.exists(datafile), "%s not found!" % datafile
    for resultfile in resultFiles:
        assert os.path.exists(resultfile), "%s not found!" % resultfile
    assert datafile.lower().endswith(
        '.raw'), "Only .raw files are currently supported."

    if featureFile:
        assert os.path.exists(
            featureFile
        ), "Specified feature data file %s not found!" % featureFile
    else:
        featureFile = detect_features(datafile,
                                      tolerance=tolerance,
                                      **constants)
    features = FeatureInterface(featureFile)

    outputfiles = []
    if resultFiles:
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print(resultFiles)
        print("Categorizing search results by file.")
        for resultfile in resultFiles:
            resultfile = os.path.abspath(resultfile)
            outputfile = '.'.join(
                resultfile.split('.')[:-1] + ['featureDetect', 'xlsx'])
            outputfiles.append(outputfile)

            inputResults = mzReport.reader(resultfile)
            try:
                resultsByFeature = binByFullFeature(datafile, features,
                                                    inputResults)

                output = mzReport.writer(
                    outputfile,
                    columns=inputResults.columns + [
                        'Feature', 'feature error', 'feature start scan',
                        'feature end scan', 'feature start time',
                        'feature end time', 'feature intensity',
                        'feature kurtosis', 'feature skewness'
                    ])
                try:
                    for result in resultsByFeature:
                        output.write(result)
                finally:
                    output.close()
            finally:
                # Closed only after writing, in case resultsByFeature reads
                # from the reader lazily.
                inputResults.close()

            print("Output saved to %s ." % outputfile)
    else:
        print("No PSM data given; skipping annotation step.")

    return featureFile, outputfiles
Esempio n. 22
0
def add_gene_ids(target_files,
                 p2g_database,
                 target_sheet=None,
                 outputfile=None,
                 inPlace=False,
                 leucine_equals_isoleucine=True,
                 legacy_columns=True):
    """
    Annotates PSM report(s) with the proteins and genes each peptide maps
    to, using a pickled Pep2Gene database.

    target_files -- one report path (str) or a sequence of report paths.
    p2g_database -- path to the pickled database: either one 6-tuple
        (k-mer length, sequence lookup, k-mer lookup, gene lookup, I/L
        sequence lookup, I/L k-mer lookup) or the legacy layout of
        consecutive pickles.
    target_sheet -- sheet to read, for readers that accept sheet_name.
    outputfile -- output path, honoured only when a single path string was
        given; otherwise each output is '<input base>.GENES.<ext>'.
    inPlace -- not used by this function.
    leucine_equals_isoleucine -- also add columns for matches that treat
        I and L as equivalent; requires I/L data in the database.
    legacy_columns -- use the older output column names.

    Rows whose peptide (letters only) is no longer than K are dropped from
    the output.

    Returns the list of output paths if a sequence of targets was given,
    otherwise the single output path.

    Raises Exception if the database is a tuple of unexpected length, and
    AssertionError if I/L data is required but missing or if the database
    k-mer length differs from K.
    """
    from functools import reduce

    # time.clock() was removed in Python 3.8; use perf_counter where it
    # exists, falling back to time.time on interpreters without it.
    now = getattr(time, 'perf_counter', time.time)
    starttime = now()

    if isinstance(target_files, str):
        return_list = False
        target_files = [target_files]
    else:
        return_list = True

    # SECURITY NOTE: pickle.load can execute arbitrary code; only load
    # Pep2Gene databases from trusted sources.
    with open(p2g_database, 'rb') as dataRdr:
        data = pickle.load(dataRdr)
        k_len = None
        if isinstance(data, tuple) and len(data) == 6:
            k_len, seqLookup, fmerLookup, geneLookup, isoSeqLookup, isoFmerLookup = data
        elif isinstance(data, tuple):
            raise Exception(str(len(data)))
        else:
            print('Legacy mode P2G database detected!')
            seqLookup = data
            fmerLookup = pickle.load(dataRdr)
            geneLookup = pickle.load(dataRdr)
            try:
                isoSeqLookup = pickle.load(dataRdr)
                isoFmerLookup = pickle.load(dataRdr)
            except EOFError:
                # Older databases carry no I/L ambiguity data.
                isoSeqLookup = None
                isoFmerLookup = None

    if isinstance(list(geneLookup.values())[0], tuple):
        print("Legacy mode gene names detected.")
        oldTupleInstance = list(geneLookup.values())[0]
        # Pick whichever tuple slot holds text containing letters.
        nameIndex = 0 if oldTupleInstance[0] and any(
            x.isalpha() for x in oldTupleInstance[0]) else 1
        for k, v in list(geneLookup.items()):
            geneLookup[k] = v[nameIndex]

    if leucine_equals_isoleucine:
        assert isoFmerLookup, (
            "Pep2Gene database does not contain leucine-isoleucine "
            "ambiguity data; re-compile database or "
            "select leucine_equals_isoleucine = False .")
    if k_len:
        assert k_len == K, "Pep2Gene database created with kmers of length %s, not %s" % (
            k_len, K)

    print("P2G database loaded: %.2f\n\n" % (now() - starttime))
    prevtime = now()

    # Output column names depend only on the flags, not on the target file.
    add_legacy_cols = [
        "pro_count",
        "pro_list",
        "gene_count",
        "gene_symbols",
    ]
    add_cols = ["Protein Count", "Proteins", "Gene Count", "Gene Symbols"]
    iso_legacy_cols = [
        'IL Ambiguity pro_count', 'IL Ambiguity pro_list',
        "IL Ambiguity gene_count", "IL Ambiguity gene_symbols"
    ]
    iso_cols = [
        'I<->L Protein Count', 'I<->L Proteins', 'I<->L Gene Count',
        'I<->L Gene Symbols'
    ]

    new_cols = list(add_legacy_cols if legacy_columns else add_cols)
    colname = dict(zip(add_cols, new_cols))
    if leucine_equals_isoleucine:
        used_iso_cols = iso_legacy_cols if legacy_columns else iso_cols
        new_cols += used_iso_cols
        colname.update(zip(iso_cols, used_iso_cols))

    def find_proteins(peptide, kmerLookup, sequenceLookup):
        # Narrow the candidates via shared k-mers, then confirm each with a
        # regex for the peptide as a tryptic fragment of the sequence.
        # NOTE(review): the range stops before the final k-mer; that only
        # loosens the pre-filter, the regex still decides.
        candidates = reduce(set.intersection,
                            (kmerLookup[peptide[x:x + K]]
                             for x in range(len(peptide) - K)))
        # pep_find could be replaced by giving the p2g database a pre-made
        # set of hashes of all tryptic peptides in a protein, and seeing if
        # the hash of the peptide is present in the set.
        pep_find = re.compile(
            '((^M?)|[KR](?=[^P]))%s(((?<=[KR])[^P])|$)' % peptide)
        return set(prot for prot in candidates
                   if pep_find.search(sequenceLookup[prot]))

    outputfiles = []
    for target_file in target_files:
        try:
            rdr = reader(target_file, sheet_name=target_sheet)
        except TypeError:
            rdr = reader(target_file)  # Not an Excel file.

        if (not outputfile) or return_list:
            ext = target_file.split('.')[-1]
            outputfile = '.'.join(target_file.split('.')[:-1] + ['GENES', ext])
        output = writer(outputfile, columns=rdr.columns + new_cols)

        try:
            # Per-file caches, so each distinct peptide is looked up once.
            pepToProts = {}
            isoPepToProts = {}
            for counter, row in enumerate(rdr):
                if counter % 1000 == 0:
                    print_progress(counter)
                try:
                    pep = row['Peptide Sequence'].upper()
                except KeyError:
                    pep = row['Peptide'].upper()

                # Strip modification markup and other non-letters.
                pep = ''.join([x for x in pep if x.isalpha()])

                if len(pep) <= K:
                    continue  # No 4-mers in a 3-mer!

                isoPep = pep.replace('I', 'L')
                if pep not in pepToProts:
                    pepToProts[pep] = find_proteins(pep, fmerLookup,
                                                    seqLookup)

                    if leucine_equals_isoleucine and isoPep not in isoPepToProts:
                        isoPepToProts[isoPep] = find_proteins(
                            isoPep, isoFmerLookup, isoSeqLookup)

                prots = pepToProts[pep]
                geneList = set(geneLookup[x] for x in prots
                               if x in geneLookup)

                row[colname['Protein Count']] = len(prots)
                row[colname['Proteins']] = '; '.join(prots)
                row[colname['Gene Count']] = len(geneList)
                row[colname['Gene Symbols']] = '; '.join(geneList)

                if leucine_equals_isoleucine:
                    isoProts = isoPepToProts[isoPep]
                    isoGeneList = set(geneLookup[x] for x in isoProts
                                      if x in geneLookup)

                    row[colname['I<->L Protein Count']] = len(isoProts)
                    row[colname['I<->L Proteins']] = '; '.join(isoProts)
                    row[colname['I<->L Gene Count']] = len(isoGeneList)
                    row[colname['I<->L Gene Symbols']] = '; '.join(isoGeneList)

                output.write(row)

            print("\nGene lookup completed: %.2f" % (now() - prevtime))
            prevtime = now()
        finally:
            rdr.close()
            output.close()
        print("Output written: %.2f" % (now() - prevtime))
        outputfiles.append(outputfile)
    if return_list:
        return outputfiles
    else:
        return outputfile
def evaluateMascotFile(resultfile, datafile = None, featurefile = None, outputfile = None):
    """
    Evaluates labelling in a Mascot result file by pairing each PSM with the
    intensity of an MS1 feature, then dispatching to the SILAC or TMT/iTRAQ
    evaluation routine.

    resultfile  -- Mascot report readable by reader(); its 'Mascot_Header'
                   sheet is consulted for the 'Quantitation method' and
                   'Variable modifications' entries.
    datafile    -- Raw data file. It is always opened through mzFile() to
                   build the MS2-scan -> preceding-MS1-scan map, and is also
                   handed to detectFeatures() when no featurefile is given.
    featurefile -- Optional existing feature file; when omitted, features
                   are detected from datafile (signalToNoiseThreshold = 15).
    outputfile  -- Optional output path; defaults to the resultfile name
                   with its extension replaced by '_LABEL_EVALUATION.xlsx'.

    Returns a tuple of (value returned by evaluateSILAC or
    evaluateTMTiTRAQ, outputfile).

    Raises AssertionError if neither 'SILAC' appears in the quantitation
    method nor 'plex' in the variable modifications.
    """
    #assert datafile or featurefile, "Either raw data or feature data must be given!"

    header = [list(x.values()) for x in list(reader(resultfile, sheet_name = 'Mascot_Header'))]

    def retrieveHeaderValue(key):
        # Finds the first header row containing the key and returns the
        # first other cell of that row; '' when no such row/cell exists.
        try:
            return [[x for x in xs if x != key] for xs in header if key in xs][0][0]
        except IndexError:
            return ''
    quant = retrieveHeaderValue('Quantitation method')
    varmods = retrieveHeaderValue('Variable modifications')

    assert ('SILAC' in quant) or ('plex' in varmods), "Label method not recognized!"

    if not featurefile:
        featurefile = detectFeatures(datafile, signalToNoiseThreshold = 15)
    features = FeatureInterface(featurefile)

    print("Matching features to PSMs...")
    results = reader(resultfile)
    columns = results.columns
    results = list(results)

    # Map every MS2 scan to the MS1 scan that most recently preceded it.
    # MS2 scans seen before any MS1 scan are mapped to None.
    data = mzFile(datafile)
    try:
        ms1map = {}
        ms2s = []
        ms1 = None
        for _, _, scan, level, _ in data.scan_info(0, 999999):
            if level == 'MS1':
                for ms2 in ms2s:
                    ms1map[ms2] = ms1
                ms1 = scan
                ms2s = []
            elif level == 'MS2':
                ms2s.append(scan)
        # Flush the MS2 scans that follow the final MS1 scan; no later MS1
        # scan arrives to trigger the assignment inside the loop, so without
        # this the PSMs of the last cycle would have no entry in the map.
        for ms2 in ms2s:
            ms1map[ms2] = ms1
    finally:
        data.close()

    # For each PSM, record the intensity of the first feature (within
    # +/- 1 m/z of the precursor) that contains the precursor point.
    featureIntMap = {}
    for psm in results:
        mz = psm['Experimental mz']
        # The scan number is read from the second '.'-separated field of
        # the spectrum description.
        scan = int(psm['Spectrum Description'].split('.')[1])
        charge = int(psm['Charge'])
        for _, feature in features.mz_range(mz - 1, mz + 1):
            if feature.containsPoint(mz, ms1map[scan], charge):
                featureIntMap[scan] = feature.c12Intensity()
                break

    # Drop the local reference to the feature data before evaluation
    # (presumably to release memory -- TODO confirm).
    del features

    if not outputfile:
        outputfile = '.'.join(resultfile.split('.')[:-1]) + '_LABEL_EVALUATION.xlsx'

    if 'SILAC' in quant:
        return evaluateSILAC(outputfile, columns, results, featureIntMap), outputfile
    elif 'plex' in varmods:
        return evaluateTMTiTRAQ(outputfile, columns, results, featureIntMap), outputfile