def psm_intersection(directory, mode_subdirs):
    """
    To give a more accurate depiction of the relative elution profile of each
    label state, the final results only consider peptides that appear in the
    results of every condition.  This determines the peptide repertoire
    shared by all sub-directories named in mode_subdirs and writes subset
    result files that contain only those peptides.

    directory    -- base directory that holds one sub-directory per condition.
    mode_subdirs -- sequence of (mode, subdir, par) tuples; only mode and
                    subdir are used here.

    For every file returned by typeInDir(<subdir>, 'xlsx') whose path
    contains 'FDR', a filtered copy is written into
    '<subdir>_intersection_sheets' under directory.  Only the first PSM seen
    for each shared peptide key is kept in a given file.

    Returns a list of (mode, new_subdir) tuples, one per entry of
    mode_subdirs (an empty list when mode_subdirs is empty).
    """
    # The sequence is walked twice, so pin it down in case it is an iterator.
    mode_subdirs = list(mode_subdirs)

    # Pass 1: collect the PSMs of each condition, grouped by peptide key.
    psmByCondition = {}
    for mode, subdir, par in mode_subdirs:
        conditionPSMs = []
        for resultfile in typeInDir(os.path.join(directory, subdir), 'xlsx'):
            if 'FDR' not in resultfile:
                continue
            rdr = reader(resultfile)
            try:
                conditionPSMs.extend(rdr)
            finally:
                rdr.close()
        psmByCondition[subdir] = collectByCriterion(conditionPSMs, peptideKey)

    # Peptide keys present in every condition.
    keySets = [set(x.keys()) for x in psmByCondition.values()]
    if keySets:
        consistentPSMs = set.intersection(*keySets)
    else:
        consistentPSMs = set()

    # Pass 2: write the filtered copies.
    newSubdirs = []
    for mode, subdir, par in mode_subdirs:
        newSubdir = subdir + '_intersection_sheets'
        newSubdirs.append((mode, newSubdir))
        targetDir = os.path.join(directory, newSubdir)
        try:
            os.mkdir(targetDir)
        except OSError:
            # An already existing directory is fine; anything else (e.g.
            # missing permissions) would only fail later, less clearly.
            if not os.path.isdir(targetDir):
                raise

        for filename in typeInDir(os.path.join(directory, subdir), 'xlsx'):
            if 'FDR' not in filename:
                continue
            alreadySeenPeptides = set()
            psms = reader(filename)
            try:
                filterfile = writer(os.path.join(targetDir,
                                                 os.path.basename(filename)),
                                    columns=psms.columns)
                for psm in psms:
                    pepKey = peptideKey(psm)
                    if (pepKey in consistentPSMs
                            and pepKey not in alreadySeenPeptides):
                        alreadySeenPeptides.add(pepKey)
                        filterfile.write(psm)
                filterfile.close()
            finally:
                psms.close()

    return newSubdirs
def openResultFile(self, event):
    """
    Load the PSM report named in self.resultCtrl and index it for display.

    Rows are grouped first by protein (the 'gene_symbol' column when the
    first row has one, otherwise 'Accession Number'; multi-valued entries are
    split on ';') and then, within each protein, by a
    'sequence|modifications|charge' tag.  The nested mapping is stored in
    self.psms and the PSM display is refreshed.
    """
    self.set_status("Opening PSM file...", 0)
    rows = list(reader(self.resultCtrl.GetValue()))

    if 'gene_symbol' in rows[0]:
        proteinLabel = 'gene_symbol'
    else:
        proteinLabel = 'Accession Number'

    # A PSM listed under several proteins is filed under each of them.
    rowsByProtein = defaultdict(list)
    for row in rows:
        names = [part.strip() for part in row[proteinLabel].split(';')]
        for name in names:
            rowsByProtein[name].append(row)

    def peptideTag(row):
        return '|'.join([row['Peptide Sequence'],
                         row['Variable Modifications'],
                         str(row['Charge'])])

    self.psms = {}
    for protein, members in rowsByProtein.items():
        self.psms[protein] = collectByCriterion(members, peptideTag)

    self.updatePSMDisplay(None)
def openFile(self, event):
    """
    Read the report files listed (semicolon-separated) in self.fileChooser.

    PSMs scoring below self.scoreCutoff are dropped.  The remaining PSMs are
    indexed by each of their ';'-separated accessions in
    self.psmsByAccession, and one (accession, description) pair per PSM is
    handed to self.updateProtList.
    """
    print("openFile")
    self.filenames = [name.strip()
                      for name in self.fileChooser.GetValue().split(';')]

    allPSMs = []
    for resultFile in self.filenames:
        allPSMs.extend(reader(resultFile))

    self.psmsByAccession = defaultdict(list)
    proteinPairs = []
    for psm in allPSMs:
        if float(psm['Peptide Score']) < self.scoreCutoff:
            continue
        proteinPairs.append((psm['Accession Number'],
                             psm['Protein Description']))
        for accession in psm['Accession Number'].split(';'):
            self.psmsByAccession[accession].append(psm)

    self.updateProtList(proteinPairs)
def openFile(self, event):
    """
    Read one sheet from each report file listed in self.fileChooser.

    The sheet name comes from self.sheetSelect ('Data' when left empty);
    files for which the reader raises IOError are reported on stdout and
    skipped.  PSMs below self.scoreCutoff are dropped, as are PSMs whose
    accession and description both fail to contain self.searchTerm
    (case-insensitive) when a search term is set.  Surviving PSMs are indexed
    by accession in self.psmsByAccession and the distinct
    (accession, description) pairs are passed to self.updateProtList.
    """
    self.filenames = [part.strip()
                      for part in self.fileChooser.GetValue().split(';')]
    self.sheetname = self.sheetSelect.GetValue() or 'Data'

    allPSMs = []
    for resultFile in self.filenames:
        try:
            report = reader(resultFile, sheet_name = self.sheetname,
                            autotypecast = False)
        except IOError:
            print("%s has no sheet %s" % (resultFile, self.sheetname))
            continue
        allPSMs += list(report)

    self.psmsByAccession = defaultdict(list)
    proteinPairs = set()
    for psm in allPSMs:
        if float(psm['Peptide Score']) < self.scoreCutoff:
            continue

        if self.searchTerm:
            wanted = self.searchTerm.lower()
            if (wanted not in psm['Accession Number'].lower()
                    and wanted not in psm['Protein Description'].lower()):
                continue

        proteinPairs.add((psm['Accession Number'],
                          psm['Protein Description']))
        for accession in psm['Accession Number'].split(';'):
            self.psmsByAccession[accession].append(psm)

    self.updateProtList(list(proteinPairs))
def loadFromFiles(self, event):
    """
    Load everything referenced by the rows of self.fileDisplay.

    Columns 1, 2 and 3 of each row are looked up in self.files and used as
    the data file, PSM result file and feature file respectively.  Fills

      self.dataPtrs -- basename -> mzFileMapped (with the matching entry of
                       self.curves when self.curves is set)
      self.psms     -- basename -> (result file, list of PSMs)
      self.features -- basename -> FeatureInterface
      self.ms1s     -- basename -> scan_info entries whose 4th field is 'MS1'
      self.proteins -- accession -> PSMs grouped by rendered peptide tag

    and finally calls self.render(None) when event is truthy.
    """
    def rowPaths(*columns):
        # One tuple of resolved paths per list row, for the given columns.
        return [tuple(self.files[self.fileDisplay.GetItemText(row, col)]
                      for col in columns)
                for row in range(0, self.fileDisplay.GetItemCount())]

    self.set_status("Opening data files...", 0)
    self.dataPtrs = {}
    for (datafile,) in rowPaths(1):
        name = os.path.basename(datafile)
        if self.curves:
            self.dataPtrs[name] = mzFileMapped(datafile, self.curves[name])
        else:
            self.dataPtrs[name] = mzFileMapped(datafile)

    self.set_status("Loading PSMs...", 0)
    resultPairs = rowPaths(1, 2)
    self.psms = {}
    for datafile, resultfile in resultPairs:
        loaded = list(reader(resultfile))
        self.psms[os.path.basename(datafile)] = resultfile, loaded

    featurePairs = rowPaths(1, 3)
    self.features = {}
    for datafile, featurefile in featurePairs:
        self.features[os.path.basename(datafile)] = FeatureInterface(featurefile)

    self.set_status("Loading MS1 info...", 0)
    ms1s = {}
    for name, data in self.dataPtrs.items():
        ms1s[name] = [scan for scan in data.scan_info(0, 9999999)
                      if scan[3] == 'MS1']
    self.ms1s = ms1s

    self.set_status("Collecting peptides...", 0)

    def accessionOf(psm):
        return psm['Accession Number']

    def peptideTagOf(psm):
        return renderPeptideTag((psm['Peptide Sequence'],
                                 psm['Variable Modifications'],
                                 psm['Charge']))

    self.proteins = defaultdict(list)
    for datafile, (resultfile, filePSMs) in self.psms.items():
        # Remember which data file each PSM came from before pooling them.
        for psm in filePSMs:
            psm['Datafile'] = datafile
        for acc, members in collectByCriterion(filePSMs, accessionOf).items():
            self.proteins[acc].extend(members)

    for acc in list(self.proteins):
        self.proteins[acc] = collectByCriterion(self.proteins[acc],
                                                peptideTagOf)

    self.set_status("...", 0)
    if event:
        self.render(None)
def annotateFileWithCoverageImages(resultfile, fastafile):
    """
    Write a copy of resultfile with a sequence-coverage image on every row.

    A CoveragePanel is driven off-screen: fastafile is set as its FASTA
    source and resultfile is loaded through its openFile().  For each PSM the
    panel is pointed at the first entry of the PSM's '; '-separated
    'Accession Number', the sequence grid is rendered into a bitmap, and the
    row is written to '<resultfile>.coverage_annotated.xls' with the image
    attached to a new 'Coverage' column.

    Images are staged in a temporary directory, which is removed when the
    function finishes, whether or not an error occurred.

    NOTE(review): in the original (whitespace-collapsed) source the scope of
    the trailing `break` is ambiguous; it is read here as ending the
    per-accession loop, i.e. one output row per PSM -- confirm.
    """
    from multiplierz.mzReport.mzSpreadsheetClassic import XLSheetWriter as classic_writer
    import os
    import shutil
    import tempfile

    # The App object must stay referenced for as long as wx widgets are used.
    app = wx.App(0)
    coverpanel = CoveragePanel(wx.Frame(None))
    coverpanel.fastaChooser.SetSelection(0)
    coverpanel.fastaChooser.SetString(0, fastafile)
    coverpanel.resizeSequence()
    coverpanel.fileChooser.AppendText(resultfile)
    coverpanel.openFile(None)

    psms = reader(resultfile)
    output = classic_writer(resultfile + '.coverage_annotated.xls',
                            columns = psms.columns + ['Coverage'])

    tempdir = tempfile.mkdtemp()
    try:
        for i, psm in enumerate(psms):
            # str.split always returns at least one element.
            accession = psm['Accession Number'].split('; ')[0]

            coverpanel.accessions = [accession]
            coverpanel.subAccession = 0
            coverpanel.displayAccession()
            coverpanel.chartProtein()

            img = os.path.join(tempdir, str(i) + '.png')

            grid = coverpanel.sequenceDisplay
            seqHeight = grid.CellToRect(0,0).height * grid.GetNumberRows()
            seqWidth = grid.CellToRect(0,0).width * grid.GetNumberCols()

            bitmap = wx.EmptyBitmap(seqWidth, seqHeight)
            imageDC = wx.MemoryDC()
            imageDC.SelectObject(bitmap)
            grid.RenderToDC(imageDC, (0, 0))
            # Saved as BMP data despite the '.png' file name, exactly as the
            # original did.
            bitmap.SaveFile(img, wx.BITMAP_TYPE_BMP)

            psm['Coverage'] = '#'
            output.write(psm, metadata = [('Coverage',
                                           ('image', seqHeight, seqWidth),
                                           img)])

        # Close before the images are deleted, as the original did.
        output.close()
    finally:
        shutil.rmtree(tempdir)
def combine_peptides(reportfile, isobaric=None, outputfile=None):
    """
    Collapse a PSM report to one row per peptide (charge is ignored).

    PSMs are grouped by peptide sequence plus the set of variable
    modifications.  For each group the PSM with the highest 'Peptide Score'
    is written, with a 'PSMs' column giving the group size.  When isobaric is
    4, 6, 8 or 10, reporter values are parsed from each PSM's
    'Spectrum Description' and two sets of columns are added: 'Sum<label>'
    (per-label sum over the group) and 'Max<label>' (the reporter set with
    the largest total).

    outputfile defaults to reportfile tagged with 'peptide_combined'.
    Returns the output file name.
    """
    from multiplierz.mzReport import reader, writer
    from multiplierz.mgf import standard_title_parse

    isobaric_labels = {
        None: [],
        4: ['114', '115', '116', '117'],
        6: ['126', '127', '128', '129', '130', '131'],
        8: ['113', '114', '115', '116', '117', '118', '119', '121'],
        10: ['126', '127N', '127C', '128N', '128C',
             '129N', '129C', '130N', '130C', '131'],
    }

    def peptide_key(row):
        # Not counting charge.
        mods = row['Variable Modifications'].split(';')
        return row['Peptide Sequence'], frozenset([m.strip() for m in mods])

    def reporter_values(row):
        parsed = standard_title_parse(row['Spectrum Description'])
        return [float(parsed[label.lower()])
                for label in isobaric_labels[isobaric]]

    def score_of(row):
        return row['Peptide Score']

    assert isobaric in isobaric_labels

    report = reader(reportfile)
    groups = collectByCriterion(report, peptide_key)

    sum_cols = ['Sum%s' % label for label in isobaric_labels[isobaric]]
    top_cols = ['Max%s' % label for label in isobaric_labels[isobaric]]

    if not outputfile:
        outputfile = insert_tag(reportfile, 'peptide_combined')

    output = writer(outputfile,
                    columns=(report.columns + sum_cols + top_cols + ['PSMs']))

    for pep, members in groups.items():
        best = max(members, key=score_of)
        best['PSMs'] = len(members)

        if isobaric:
            reporter_sets = [reporter_values(m) for m in members]
            strongest = max(reporter_sets, key=sum)
            totals = [sum(column) for column in zip(*reporter_sets)]

            for value, col in zip(strongest, top_cols):
                best[col] = value
            for value, col in zip(totals, sum_cols):
                best[col] = value

        output.write(best)

    output.close()
    return outputfile
def combineFiles(files, outputFile, ext):
    """
    Concatenate several report files into one.

    The output has the columns of the first input file preceded by a
    'Source' column holding the base name of the file each row came from.  A
    relative outputFile is reduced to its base name and placed next to the
    first input file; ext is appended unless the name already ends with it.
    """
    if not os.path.isabs(outputFile):
        outputFile = os.path.join(os.path.dirname(files[0]),
                                  os.path.basename(outputFile))
    if not outputFile.endswith(ext):
        outputFile += ext

    print("Merging %s" % files)

    merged = writer(outputFile,
                    columns = ['Source'] + reader(files[0]).columns)
    for filename in files:
        for row in reader(filename):
            row['Source'] = os.path.basename(filename)
            merged.write(row)
    merged.close()

    print("Wrote %s !" % outputFile)
def combine_accessions(reportfile, outputfile = None):
    """
    Given a Mascot-style PSM report, this combines all protein hypotheses for
    a given MS2 spectrum into a single PSM.

    Rows are grouped by 'Spectrum Description'.  For each group the row with
    the highest 'Peptide Score' is kept; its 'Accession Number' and
    'Protein Description' become the '; '-joined values of the whole group,
    a 'Protein Masses' column holds the joined 'Protein Mass' values (when
    the input has that column) and 'Protein Redundancy' holds the group size.

    The input is read completely and closed before the output is opened, so
    outputfile may be the same as reportfile in order to overwrite it.  When
    outputfile is not given, reportfile tagged with 'combined_accessions' is
    used.  Returns the output file name.
    """
    from multiplierz.mzReport import reader, writer

    report = reader(reportfile)
    columns = list(report.columns)

    molecules = defaultdict(list)
    try:
        for row in report:
            molecules[row['Spectrum Description']].append(row)
    finally:
        report.close()

    outputData = []
    for rows in molecules.values():
        newRow = max(rows, key = lambda x: x['Peptide Score'])

        if 'Accession Number' in columns:
            newRow['Accession Number'] = '; '.join(
                [x['Accession Number'] for x in rows])
        if 'Protein Description' in columns:
            newRow['Protein Description'] = '; '.join(
                [x['Protein Description'] for x in rows])
        # The per-row input column is 'Protein Mass'; the combined output
        # column is 'Protein Masses'.
        if 'Protein Mass' in columns:
            newRow['Protein Masses'] = '; '.join(
                [str(x['Protein Mass']) for x in rows])
        newRow['Protein Redundancy'] = len(rows)

        outputData.append(newRow)

    if outputData:
        # Offer 'Protein Masses' only once and keep it only if it was filled.
        candidates = columns
        if 'Protein Masses' not in candidates:
            candidates = candidates + ['Protein Masses']
        columns = [x for x in candidates if x in outputData[-1]]
    if 'Protein Redundancy' not in columns:
        columns = columns + ['Protein Redundancy']

    if not outputfile:
        outputfile = insert_tag(reportfile, 'combined_accessions')

    output = writer(outputfile, columns = columns)
    for row in outputData:
        output.write(row)
    output.close()

    return outputfile
def dispatchModes(self, event):
    """
    Run the operation selected in self.modeCtrl over the listed files.

    Opens a reader for every file in self.fileList, works out the output
    file name (default 'combined_output_file', '.xlsx' appended when the
    name has no recognised extension, placed next to the first input file
    when relative), prepares self.outcolumns and self.output, and then calls
    the handler for the mode:

      'Concatenate All'          -> self.concatenate()
      'Cross-Report Key'         -> self.cross_report_key()
      'Unique-by-File Report'    -> self.unique_by_file()
      'Entries-in-Common Report' -> self.entries_in_common()

    The run button is disabled for the duration and is re-enabled even when
    the operation fails.  Raises Exception for an unrecognised mode.
    """
    self.runButton.Enable(False)
    try:
        mode = self.modeCtrl.GetString(self.modeCtrl.GetSelection())
        self.criteria = self.fieldsCtrl.GetCheckedStrings()
        self.inputfiles = [(x, reader(x)) for x in self.fileList.GetStrings()]

        outputfile = self.outputCtrl.GetValue()
        if not outputfile:
            outputfile = 'combined_output_file'
        if outputfile.split('.')[-1].lower() not in ('xls', 'xlsx',
                                                     'csv', 'mzd'):
            outputfile += '.xlsx'
        if not os.path.isabs(outputfile):
            outdir = os.path.dirname(self.inputfiles[0][0])
            outputfile = os.path.join(outdir, outputfile)

        if mode in ('Concatenate All', 'Unique-by-File Report',
                    'Entries-in-Common Report'):
            # Only columns present in every input can be written; keep the
            # order of the first file.
            columnsets = [x[1].columns for x in self.inputfiles]
            shared = set(columnsets[0]).intersection(*columnsets)
            self.outcolumns = ['Source'] + [x for x in columnsets[0]
                                            if x in shared]
            if mode != 'Concatenate All':
                assert all([x in self.outcolumns for x in self.criteria]), \
                    "Match columns must be present in every input file."
        elif mode == 'Cross-Report Key':
            self.outcolumns = ['Key'] + [x[0] for x in self.inputfiles]
        else:
            raise Exception("Unrecognized mode: %r" % (mode,))

        self.output = writer(outputfile, columns=self.outcolumns)

        if mode == 'Concatenate All':
            self.concatenate()
        elif mode == 'Cross-Report Key':
            self.cross_report_key()
        elif mode == 'Unique-by-File Report':
            self.unique_by_file()
        else:
            # Only 'Entries-in-Common Report' can reach this point.
            self.entries_in_common()

        self.output.close()
        print("Wrote %s" % outputfile)
    finally:
        # Without this, any failure above left the button disabled.
        self.runButton.Enable(True)
def addColumnsMenu(self, event):
    """
    Let the user add match columns taken from the listed files.

    The union of the columns of every file in self.fileList is offered in a
    multi-choice dialog.  On OK, the chosen columns that are not yet in
    self.fieldsCtrl are appended to it, and all chosen columns end up
    checked.  When no columns could be gathered a message dialog is shown
    instead.
    """
    filenames = [self.fileList.GetString(index)
                 for index in range(0, self.fileList.GetCount())]

    available = set()
    for filename in filenames:
        available.update(reader(filename).columns)
    available = list(available)

    if not available:
        columnAlert = wx.MessageDialog(
            None, "Could not get additional columns; no files selected.")
        columnAlert.ShowModal()
        return

    columnDialog = wx.MultiChoiceDialog(None,
                                        "Choose Columns To Match:",
                                        "More Fields",
                                        choices=available)
    if columnDialog.ShowModal() != wx.ID_OK:
        return

    chosen = [available[i] for i in columnDialog.GetSelections()]

    # Read both lists before SetItems replaces the control's contents.
    existing = self.fieldsCtrl.GetStrings()
    checked = self.fieldsCtrl.GetCheckedStrings()

    self.fieldsCtrl.SetItems(
        existing + [col for col in chosen if col not in existing])
    self.fieldsCtrl.SetCheckedStrings(
        list(checked) + [col for col in chosen if col not in checked])
def featureToPSM(resultFile, featureData, groupSILAC=False):
    """
    Map feature indices to the PSMs annotated with them.

    resultFile must be a feature-annotated report (it needs a 'Feature'
    column, otherwise IOError is raised).  featureData is accepted but not
    used.

    Without groupSILAC each PSM is filed under int(float(psm['Feature']));
    PSMs whose value cannot be parsed are skipped.  With groupSILAC the
    label state is derived from 'Variable Modifications' (heavy markers take
    precedence over medium ones, anything else is light) and the PSM is filed
    under every ';'-separated index in the matching 'Light Features' /
    'Medium Features' / 'Heavy Features' column; PSMs with an empty column
    are skipped.

    Returns a plain dict: feature index -> list of PSMs.
    """
    results = reader(resultFile)
    if 'Feature' not in results.columns:
        raise IOError("Not a feature-annotated file!")

    psmsByFeature = defaultdict(list)

    if not groupSILAC:
        for psm in results:
            try:
                index = int(float(psm['Feature']))
            except ValueError:
                continue
            psmsByFeature[index].append(psm)
        return dict(psmsByFeature)

    for psm in results:
        mods = psm['Variable Modifications']
        if mods is None:
            mods = []

        if heavyK in mods or heavyR in mods:
            column = 'Heavy Features'
        elif mediumK in mods or mediumR in mods:
            column = 'Medium Features'
        else:
            column = 'Light Features'

        if not psm[column]:
            continue

        for feature in str(psm[column]).split(';'):
            psmsByFeature[int(float(feature))].append(psm)

    return dict(psmsByFeature)
def on_convert(self, event):
    """
    Convert (or merge) the files in self.file_list to the chosen format.

    The input format selection picks the converter:
      0 Mascot CSV, 1 Mascot DAT, 2 Mascot mzIdentML, 3 Protein Pilot,
      4 OMSSA, 5 X!Tandem XML, 6 other mzReport file.
    The output format selection picks the extension:
      0 '.xls', 1 '.xlsx', 2 '.csv', 3 '.mzd'.

    When self.combineCheck is set, the files are merged with combineFiles()
    instead; only input formats 0 and 6 can be merged.

    A busy cursor is shown while working and is always ended again, including
    on the "cannot merge" early return and when a converter raises.
    """
    if not self.file_list.GetStrings():
        wx.MessageBox('No files selected', 'Error')
        return

    # show hourglass
    wx.BeginBusyCursor(wx.HOURGLASS_CURSOR)
    try:
        files = self.file_list.GetStrings()
        input_format = self.input_format.GetSelection()
        output_format = self.output_format.GetSelection()
        output_ext = {0: '.xls',
                      1: '.xlsx',
                      2: '.csv',
                      3: '.mzd'}[output_format]

        # update statusbar
        self.set_status("Converting...", 0)
        self.set_status("", 1)

        if self.combineCheck.GetValue():
            if input_format not in [0, 6]:
                wx.MessageBox("Only tabular/Excel files can currently be merged.")
                self.set_status("Ready", 0)
                return
            combineFiles(self.file_list.GetStrings(),
                         self.combineCtrl.GetValue(),
                         output_ext)

        elif input_format == 0:  # Mascot CSV
            mascot_converter = mascot.mascot(version=settings.mascot_version)
            for file_name in files:
                self.set_status(file_name, 1)

                # Clean the CSV into a temporary '<name>_clean<ext>' file,
                # then copy it row by row into the requested format.
                clean_csv_file = '_clean'.join(os.path.splitext(file_name))
                rep_file = os.path.splitext(clean_csv_file)[0] + output_ext
                if os.path.exists(rep_file):
                    os.remove(rep_file)

                mascot_converter.clean_csv(file_name,
                                           export_file=clean_csv_file,
                                           ion_list=False)

                repreader = mzReport.reader(clean_csv_file)
                repwriter = mzReport.writer(rep_file,
                                            columns=repreader.columns)
                for row in repreader:
                    repwriter.write(row)
                repreader.close()
                repwriter.close()

                os.remove(clean_csv_file)

        elif input_format == 1:  # Mascot DAT
            mascot_reporter = mzTools.MascotReport()
            _mascot_options = dict(max_hits=1000, ion_cutoff=20,
                                   bold_red=True, unassigned_queries=False,
                                   show_query_data=True, show_same_set=False,
                                   show_sub_set=False, quant=False)
            for file_name in files:
                self.set_status(file_name, 1)

                mascot_dat_file = mascot.MascotDatFile(file_name,
                                                       **_mascot_options)
                mascot_header = mascot_dat_file.mascot_header()

                # Name the report after the MS data file recorded in the
                # header, falling back to the .dat file's own name.
                ms_file_name = (mascot_header[7][1]
                                or os.path.splitext(os.path.basename(file_name))[0])
                report_file = os.path.join(os.path.dirname(file_name),
                                           os.path.basename(ms_file_name) + output_ext)
                if os.path.exists(report_file):
                    os.remove(report_file)

                if output_ext in ('.xls', '.xlsx', '.mzd'):
                    mascot_reporter.mascot_header(report_file, mascot_header)

                # NOTE(review): string comparison of version numbers, kept
                # from the original; '2.10' would sort before '2.3'.
                if mascot_dat_file.res_file.getMascotVer() >= '2.3':
                    report = mzReport.writer(
                        report_file,
                        columns=(mzReport.default_columns[:1]
                                 + ['Protein Database']
                                 + mzReport.default_columns[1:]))
                else:
                    report = mzReport.writer(report_file,
                                             default_columns=True)

                for row in mascot_dat_file.peptide_report():
                    report.write(row)

                mascot_dat_file.close()
                report.close()

        elif input_format == 2:  # Mascot mzIdentML
            for file_name in files:
                self.set_status(file_name, 1)

                mzid = mzIdentML(file_name)
                data = mzid.peptideSummary()
                header = list(data[0].keys())

                report_file = os.path.splitext(file_name)[0] + output_ext
                if os.path.exists(report_file):
                    os.remove(report_file)

                report = mzReport.writer(report_file, columns = header)
                for row in data:
                    writeRow = []
                    for column in header:
                        thing = row[column]
                        # List-valued fields are flattened to one cell.
                        if isinstance(thing, list):
                            thing = "; ".join(thing)
                        writeRow.append(thing)
                    report.write(writeRow)
                report.close()

        elif input_format == 3:  # Protein Pilot
            for file_name in files:
                self.set_status(file_name, 1)
                pilot = ProteinPilot(file_name)
                pilot.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 4:  # OMSSA
            for file_name in files:
                self.set_status(file_name, 1)
                omssa = OMSSA_CSV(file_name)
                omssa.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 5:  # X!Tandem XML
            for file_name in files:
                self.set_status(file_name, 1)
                report_file = os.path.splitext(file_name)[0] + output_ext
                format_XML(file_name, report_file)

        elif input_format == 6:  # other mzReport
            for file_name in files:
                self.set_status(file_name, 1)

                rdr = reader(file_name)
                outputname = os.path.splitext(file_name)[0] + output_ext
                wtr = writer(outputname, columns = rdr.columns)
                for row in rdr:
                    wtr.write(row)
                wtr.close()
                rdr.close()
    finally:
        # hide hourglass -- every BeginBusyCursor needs a matching End.
        wx.EndBusyCursor()

    self.set_status("Ready", 0)
    self.set_status("Done", 1)
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but
    for the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred.  For each fraction in which a given peptide appeared, we take
    XICs over the m/z values for the set of observed charges and compare
    their total intensity; the fraction with the most intense XIC(s) is
    assigned that peptide for the final count.

    directory -- folder holding the raw files (names ending in 'raw') and
                 the result sub-directories.
    subdirs   -- names of the sub-directories whose 'xlsx' result files are
                 processed (files with 'XIC_localized' in their path are
                 skipped).

    For every result file that wins at least one peptide, a
    '<name>.XIC_localized.xlsx' file is written next to it, containing the
    first PSM of each peptide assigned to that file.  An empty XIC counts as
    zero intensity.

    NOTE(review): raw files are matched to result files and to spectrum
    descriptions by the part of the name before the first '.'; presumably
    'Spectrum Description' starts with '<raw file stem>.<scan>' -- confirm.
    """
    tolerance = 0.1
    time_tolerance = 15

    rawfiles = dict([(x.split('.')[0], mzFile(os.path.join(directory, x)))
                     for x in os.listdir(directory)
                     if x.lower().endswith('raw')])

    columns = None
    for subdir in subdirs:
        resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
        resultfiles = [x for x in resultfiles if 'XIC_localized' not in x]

        # peptide -> {result file -> PSMs of that peptide in that file}
        peptidesForFile = defaultdict(dict)
        for resultfile in resultfiles:
            rdr = reader(resultfile)
            try:
                columns = rdr.columns
                allRows = list(rdr)
            finally:
                rdr.close()
            psmsByPeptide = collectByCriterion(
                allRows,
                lambda x: (x['Peptide Sequence'], x['Variable Modifications']))
            for peptide, psms in psmsByPeptide.items():
                peptidesForFile[peptide][resultfile] = psms

        outputByFile = defaultdict(list)
        for peptide, psmsByFile in peptidesForFile.items():
            allPSMs = sum(psmsByFile.values(), [])
            mass = allPSMs[0]['Predicted mr']
            assert len(set(x['Predicted mr'] for x in allPSMs)) == 1
            charges = set(x['Charge'] for x in allPSMs)

            # Retention-time window spanning every identification of this
            # peptide, in any fraction.
            allScans = set([tuple(x['Spectrum Description'].split('.')[:2])
                            for x in allPSMs])
            allRTs = set(rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                         for x in allScans)
            minRT, maxRT = min(allRTs), max(allRTs)

            xicsByFile = []
            for resultfile, psms in psmsByFile.items():
                # Use the file name only, so dots in directory names cannot
                # corrupt the key.
                rawfile = rawfiles[os.path.basename(resultfile).split('.')[0]]

                xicInt = 0
                for charge in charges:
                    mz = (mass + (1.0072764 * charge)) / charge
                    xic = rawfile.xic(minRT - time_tolerance,
                                      maxRT + time_tolerance,
                                      mz - tolerance,
                                      mz + tolerance)
                    # Second component of every XIC point; 0 when empty.
                    xicInt += sum(point[1] for point in xic)

                xicsByFile.append((xicInt, resultfile))

            highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
            outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

        for resultfile, psms in outputByFile.items():
            outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
            output = writer(outputfile, columns=columns)
            for psm in psms:
                output.write(psm)
            output.close()
def multimode_fractionation_plot(mode_fractions, outputfile = None,
                                 count_to_size = (lambda x: x/100),
                                 fig_size = None, color_sequence = None):
    """
    mode_fractions must be a list of (<mode title>, <fractions>) tuples where
    <fractions> is a list of (<organic fraction>, <salt fraction>, <filename>)
    tuples as in the input for fractionation_plot.

    Output is a plot of each fraction in organic/salt space, where each point
    is a pie chart whose slices show the relative PSM count of each mode in
    that fraction.  All pies are currently drawn with radius 1;
    count_to_size is accepted for compatibility but is not used.

    fig_size, when given, is the (width, height) in inches of the figure
    that is drawn.  color_sequence is passed to ax.pie() as `colors`.  The
    plot is shown interactively unless outputfile is given, in which case it
    is saved there.  mode_fractions itself is not modified.
    """
    pyt.cla()

    # Work on a normalised copy so the caller's list is left untouched.
    fractionsByMode = [(entry[0],
                        [(float(o), float(s), f) for o, s, f in entry[1]])
                       for entry in mode_fractions]

    modes = [mode for mode, _ in fractionsByMode]
    organics = sorted(set(o for _, fractions in fractionsByMode
                          for o, _, _ in fractions))
    salts = sorted(set(s for _, fractions in fractionsByMode
                       for _, s, _ in fractions))

    # Fractions are laid out on an evenly spaced 1..N grid per axis.
    orgCoords = dict([(o, i) for i, o in enumerate(organics, start = 1)])
    saltCoords = dict([(s, i) for i, s in enumerate(salts, start = 1)])

    # (organic coord, salt coord) -> {mode: PSM count}
    grid = defaultdict(dict)
    for mode, fractions in fractionsByMode:
        for organic, salt, filename in fractions:
            rdr = reader(filename)
            count = len(list(rdr))
            rdr.close()
            grid[orgCoords[organic], saltCoords[salt]][mode] = count

    fig = pyt.figure()
    if fig_size:
        # Size the figure that is actually drawn; sizing pyt.gcf() before
        # pyt.figure() only affected the previously current figure.
        fig.set_size_inches(*fig_size)
    ax = fig.gca()

    for (orgCoord, saltCoord), modecounts in grid.items():
        countForAllModes = [modecounts.get(m, 0) for m in modes]
        ax.pie(countForAllModes,
               center = (orgCoord, saltCoord),
               radius = 1,
               frame = True,
               colors = color_sequence)

    ax.set_xlim(min(orgCoords.values()) - 0.5, max(orgCoords.values()) + 0.5)
    ax.set_ylim(min(saltCoords.values()) - 0.5, max(saltCoords.values()) + 0.5)
    ax.set_aspect('equal')
    ax.legend(modes, loc = 'upper left')

    if not outputfile:
        pyt.show()
    else:
        pyt.savefig(outputfile)
    pyt.cla()
def fractionation_plot(fractions, outputfile = None, fig_size = None, **kwargs):
    """
    Takes a list of 3-tuples (<organic fraction>, <salt fraction>, <psms>)
    describing the PSM output of a multi-fraction MS experiment, where <psms>
    is either a report file name or the PSM count itself.  Draws a plot in
    which each fraction is a point in organic/salt space, sized by its PSM
    count and labelled with it.

    fig_size is an optional (width, height) in inches; without it the current
    figure is enlarged when either axis has more than 8 distinct values.
    Extra keyword arguments are passed to pyplot.scatter().  The plot is
    shown unless outputfile is given, in which case it is saved there.
    """
    from multiplierz.mzReport import reader
    import matplotlib.pyplot as pyt

    pyt.cla()

    fractions = [(float(o), float(s), f) for o, s, f in fractions]
    columnsOf = list(zip(*fractions))
    organics = sorted(set(columnsOf[0]))
    salts = sorted(set(columnsOf[1]))

    # Distinct values are spread evenly over a 1..N grid on each axis.
    orgCoords = dict([(o, i) for i, o in enumerate(organics, start = 1)])
    saltCoords = dict([(s, i) for i, s in enumerate(salts, start = 1)])

    if fig_size:
        pyt.gcf().set_size_inches(*fig_size)
    elif len(orgCoords) > 8 or len(saltCoords) > 8:
        fig = pyt.gcf()
        current = fig.get_size_inches()
        width, height = current[0], current[1]
        if len(orgCoords) > 8:
            width = max(width, len(orgCoords) * 0.9)
        if len(saltCoords) > 8:
            height = max(height, len(saltCoords) * 0.9)
        fig.set_size_inches(width, height)

    scatterPts = []
    for organic, salt, psms in fractions:
        x = orgCoords[organic]
        y = saltCoords[salt]

        if isinstance(psms, int):
            # Can just pass the count.
            count = psms
        elif isinstance(psms, basestring):
            # Else the file.
            rdr = reader(psms)
            try:
                count = rdr.get_row_count()
            except (IOError, AttributeError):
                count = len(list(rdr))
            rdr.close()
        else:
            raise Exception("Must specify PSM count or file.")

        scatterPts.append((x, y, count))
        pyt.text(x, y, str(count),
                 verticalalignment = 'center',
                 horizontalalignment = 'center')

    orgLow, orgHigh = min(orgCoords.values()), max(orgCoords.values())
    saltLow, saltHigh = min(saltCoords.values()), max(saltCoords.values())
    orgMargin = (orgHigh - orgLow) / 15.0
    saltMargin = (saltHigh - saltLow) / 15.0

    ax = pyt.axes()
    ax.set_xlim(min(orgCoords.values()) - orgMargin,
                max(orgCoords.values()) + orgMargin)
    ax.set_ylim(min(saltCoords.values()) - saltMargin,
                max(saltCoords.values()) + saltMargin)

    orgpts, saltpts, counts = zip(*scatterPts)
    pyt.scatter(orgpts, saltpts, counts, alpha = 0.2, **kwargs)

    # Label the grid positions with the original fraction values.
    pyt.xticks(tuple(orgCoords.values()), tuple(orgCoords.keys()))
    pyt.yticks(tuple(saltCoords.values()), tuple(saltCoords.keys()))
    pyt.xlabel('Organic')
    pyt.ylabel('Salt')

    if outputfile:
        pyt.savefig(outputfile)
    else:
        pyt.show()

    pyt.cla()
def _detect_matches(file_names, fields, tol_field, tolerance=0.0):
    """
    Generator reporting which files contain which field combinations.

    The first item yielded is a header row: fields, then (in the non-mzd
    path only) tol_field when one is given, then one column per file (base
    name) and "Detections".  Every following item is a data row: the field
    values, the tolerance value when tol_field is given, a 1/0 flag per file
    and the number of files flagged.

    When every file has an '.mzd' extension the files are attached to an
    in-memory SQLite database and grouped with a query; otherwise they are
    read through mzReport.reader.  With tol_field set, values within
    `tolerance` of each other share their file sets.
    """
    def detection_flags(members, names):
        # 1/0 per name, followed by the number of hits.
        flags = [int(name in members) for name in names]
        flags.append(sum(flags))
        return tuple(flags)

    base_names = [os.path.basename(f) for f in file_names]

    all_mzd = all(
        (os.path.splitext(f)[1].lower() == '.mzd') for f in file_names)

    # a separate set of code for when all of the files are SQLite databases.
    # might as well take advantage of the ability to do a real query
    if all_mzd:
        # create a SQLite database in memory
        conn = sqlite3.connect(':memory:')

        # short-hand names for each file
        table_names = [('mzd%d' % i) for i, f in enumerate(file_names)]

        for path, table in zip(file_names, table_names):
            # attach each mzResults file to the database
            conn.execute('attach database (?) as (?)', (path, table))

        # add 'comma-join' function to aggregate file names
        conn.create_aggregate("cjoin", 1, CommaJoin)

        field_list = ','.join('"%s"' % f for f in fields)
        sub_query = ('select \'%s\' as "File Name",' + field_list
                     + ' from %s.PeptideData')
        union_query = ' union '.join(
            (sub_query % (table, table)) for table in table_names)
        query = ('select distinct cjoin("File Name"),%s'
                 + ' from (%s) group by %s') % (field_list, union_query,
                                               field_list)

        try:
            yield (fields + base_names + ["Detections"])

            if tol_field:
                # if tolerance is specified, need to do a second grouping
                # step: key tuple -> {tolerance value -> set of tables}
                groups = defaultdict(lambda: defaultdict(set))
                for row in conn.execute(query):
                    sources = set(row[0].split(','))
                    key = tuple(row[1:-1])
                    value = float(row[-1])

                    bucket = groups[key]
                    nearest = min(bucket or [value],
                                  key=lambda known: abs(known - value))
                    if abs(nearest - value) <= tolerance:
                        bucket[nearest].update(sources)
                        bucket[value].update(bucket[nearest])
                    else:
                        bucket[value].update(sources)

                for key in groups:
                    bucket = groups[key]
                    for value in bucket:
                        yield (key + (value, )
                               + detection_flags(bucket[value], table_names))
            else:
                for row in conn.execute(query):
                    sources = set(row[0].split(','))
                    yield (row[1:] + detection_flags(sources, table_names))
        finally:
            conn.close()
        return

    if tol_field:
        rows = defaultdict(lambda: defaultdict(set))
    else:
        rows = defaultdict(set)

    for name in file_names:
        report = mzReport.reader(name)
        for row in report:
            key = tuple(row.get(field.lower()) for field in fields)
            if tol_field:
                value = row.get(tol_field.lower())
                bucket = rows[key]
                bucket[value].add(name)
                # Share membership with every value within tolerance.
                for known in list(bucket.keys()):
                    if abs(known - value) <= tolerance:
                        bucket[value].update(bucket[known])
                        bucket[known].add(name)
            else:
                rows[key].add(name)
        report.close()

    if tol_field:
        yield (fields + [tol_field] + base_names + ["Detections"])
        for key in rows:
            bucket = rows[key]
            for value in bucket:
                yield (key + (value, )
                       + detection_flags(bucket[value], file_names))
    else:
        yield (fields + base_names + ["Detections"])
        for key in rows:
            yield (key + detection_flags(rows[key], file_names))
def filterJoin(filenames, matchColumns, returnMode, outputKeyFile,
               combinedOutputFile=None, outputFileType='.xlsx',
               outputTag=None, tolerance=None, toleranceColumn=None):
    """
    Produces a joined file, filtering out either repeat or unique PSMs.

    PSMs from all files are grouped by their values in matchColumns; when
    toleranceColumn is given, each group is further split so that all
    members of a sub-group are within `tolerance` of each other in that
    column.

    If returnMode is 'matched', the output contains one instance of each PSM
    group with more than one member; if 'unmatched', it contains every PSM
    that is alone in its group; if 'both', the two kinds are written
    together (matched first).

    outputKeyFile      -- if given, a key file listing each group and how
                          many of its PSMs came from each input file.
    outputTag          -- if given, one '<input stem>.<tag>.<type>' file is
                          written per input file with the selected PSMs
                          whose 'Source' equals that file.
    combinedOutputFile -- if given, all selected PSMs are written there.
    outputFileType     -- extension of the per-file outputs, with or without
                          the leading dot.

    All input files must have identical columns.  Returns the list of output
    file names written (per-file outputs first, then the combined file).
    """
    assert returnMode in ['matched', 'unmatched', 'both']

    data = []
    columnLists = []
    for filename in filenames:
        inputfile = reader(filename)
        columnLists.append(inputfile.columns)
        subdata = []
        for psm in inputfile:
            psm['Source'] = filename
            subdata.append(psm)
        data.append(subdata)
        inputfile.close()

    assert all([columnLists[0] == x for x in columnLists]), "Heterogeneous data columns!"

    datadict = defaultdict(list)
    for subdata in data:
        for psm in subdata:
            signature = tuple([psm[x] for x in matchColumns])
            datadict[signature].append(psm)

    if toleranceColumn:
        toldatadict = {}
        for signature, sigGroup in datadict.items():
            subGroups = []
            for psm in sigGroup:
                for subGroup in subGroups:
                    if all([abs(psm[toleranceColumn] - subpsm[toleranceColumn]) < tolerance
                            for subpsm in subGroup]):
                        subGroup.append(psm)
                        break
                else:
                    # No existing sub-group accepted this PSM.
                    subGroups.append([psm])

            for index, subGroup in enumerate(subGroups):
                # The sub-group index keeps the extended signatures unique.
                toldatadict[tuple(signature) + (index,)] = subGroup

        datadict = toldatadict

    if outputKeyFile:
        keyfile = writer(outputKeyFile, columns=['PSM Key'] + filenames)
        for signature, psmGroup in datadict.items():
            line = {}
            line['PSM Key'] = '|'.join([str(x) for x in signature])
            line.update([(x, len([y for y in psmGroup if y['Source'] == x]))
                         for x in filenames])
            keyfile.write(line)
        keyfile.close()

    outputpsms = []
    if returnMode == 'matched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) > 1:
                exemplar = psmGroup[0]
                # Sorted so the joined list is the same on every run.
                sourceFiles = '; '.join(sorted(set([x['Source'] for x in psmGroup])))
                # NOTE(review): written under the lower-case key 'source', as
                # in the original; whether that is the 'Source' column
                # depends on how the row type treats key case -- confirm.
                exemplar['source'] = sourceFiles
                outputpsms.append(exemplar)

    if returnMode == 'unmatched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) == 1:
                outputpsms.append(psmGroup[0])

    outputs = []
    if outputTag:
        # Drop the leading dot so the joined name has no empty component
        # ('name.tag.xlsx', not 'name.tag..xlsx').
        extension = outputFileType.lstrip('.')
        outputfiles = [(x, '.'.join([x.rsplit('.', 1)[0], outputTag, extension]))
                       for x in filenames]
        for filename, outputfile in outputfiles:
            output = writer(outputfile, columns=['Source'] + columnLists[0])
            for psm in [x for x in outputpsms if x['Source'] == filename]:
                output.write(psm)
            output.close()
        outputs = [x[1] for x in outputfiles]

    if combinedOutputFile:
        output = writer(combinedOutputFile,
                        columns=['Source'] + columnLists[0])
        for psm in outputpsms:
            output.write(psm)
        output.close()
        outputs.append(combinedOutputFile)

    return outputs
def _filter_join(file_names, key_source_file, exclude, save_file_suffix='_filtered'): all_mzd = (os.path.splitext(key_source_file)[1].lower() == '.mzd' and all( (os.path.splitext(f)[1].lower() == '.mzd') for f in file_names)) if all_mzd: # create a SQLite database in memory conn = sqlite3.connect(':memory:') # connection object conn.execute('attach database (?) as key_table', (key_source_file, )) cols = [ d[0] for d in conn.execute( 'select * from key_table.PeptideData limit 1').description ] col_set = set(cols) # short-hand names for each file table_names = [('mzd%d' % i) for i, f in enumerate(file_names)] col_list = [] for f, t in zip(file_names, table_names): # attach each mzResults file to the database conn.execute('attach database (?) as (?)', (f, t)) col_list.append( tuple(d[0] for d in conn.execute( 'select * from %s.PeptideData limit 1' % t).description)) yield col_list try: for f, t, t_cols in zip(file_names, table_names, col_list): #mzTools.logger_message(20, f) t_set = set(t_cols) u_str = ' and '.join( '(A."%s" = B."%s" or ifnull(A."%s", B."%s") is NULL)' % (c, c, c, c) for c in t_cols if c in col_set) query = 'select A.* from %s.PeptideData as A, key_table.PeptideData as B where %s' % ( t, u_str) if exclude: query = 'select * from %s.PeptideData except %s' % (t, query) cur = conn.execute(query) res_cols = [d[0] for d in cur.description] res_dict = dict((c, i) for i, c in enumerate(res_cols)) yield (f, t_cols, (dict([('Filter Key', '|'.join( str(row[res_dict[c]]) for c in cols if c in t_set))] + zip(t_cols, (v for i, v in enumerate(row) if res_cols[i] in t_cols))) for row in cur)) conn.execute('detach database %s' % t) finally: conn.close() else: source_report = mzReport.reader(key_source_file) col_list = [] for name in file_names: rdr = mzReport.reader(name) col_list.append(tuple(rdr.columns)) rdr.close() yield col_list try: for name in file_names: #mzTools.logger_message(20, name) this_rep = mzReport.reader(name) key_cols = [ col for col in 
source_report.columns if col in this_rep.columns ] filter_keys = set() for row in source_report: if int(row['Detections']) > 1: filter_keys.add("|".join( str(row[col]) for col in key_cols)) if exclude: filtered_data = [] for row in this_rep: filter_key = "|".join( str(row[col]) for col in key_cols) if filter_key not in filter_keys: row['Filter Key'] = filter_key filtered_data.append(row) else: filtered_data = [] for row in this_rep: filter_key = "|".join( str(row[col]) for col in key_cols) if filter_key in filter_keys: row['Filter Key'] = filter_key filtered_data.append(row) yield (name, this_rep.columns, filtered_data) this_rep.close() finally: this_rep.close()
def calculate_FDR(reportfile, outputfile = None, threshold = 0.01,
                  decoyString = 'rev_', includeStatisticsSheet = True,
                  includeDuplicates = True, separateDuplicateSheet = True,
                  includeFailedSheet = True, includeReverseSheet = True,
                  single_cutoff = True):
    """
    Performs Forward/Reverse database filtering on the target file, giving back
    the true PSMs over the specified statistical threshold as well as removed
    decoy and below-threshold PSMs, in respective sheets.

    All entries in the decoy (reverse) database must have accessions that begin
    with some uniform prefix; by default, "rev_" (so that
    gi|198292342|X7823_EXTRA becomes rev_gi|198292342|X7823_EXTRA.)  The prefix
    is matched case-insensitively.

    outputfile may be safely specified to be the same as the input file, in
    order to overwrite the original file.

    reportfile -- PSM report to filter; rows need 'Peptide Score',
                  'Spectrum Description' and 'Accession Number' columns.
    outputfile -- destination; if omitted, derived from reportfile with an
                  'FDR_filtered' tag (forced to .xlsx for non-Excel inputs).
    threshold -- a forward PSM passes while reverses/forwards is below this.
    includeDuplicates / separateDuplicateSheet -- repeated spectrum
                  descriptions are filed with passed/failed rows only when
                  includeDuplicates is true and separateDuplicateSheet is
                  false; otherwise they are collected separately and written
                  to a "Duplicate Rows" sheet if separateDuplicateSheet is true.
    includeFailedSheet, includeReverseSheet, includeStatisticsSheet --
                  toggle the corresponding output sheets.
    single_cutoff -- if true, failed rows scoring strictly above the lowest
                  passing forward score are moved to the passed rows.

    Returns the name of the output file.
    """

    from multiplierz.mzReport import reader, writer

    def scoreOf(row):
        # Scores may come back from the report as strings; always compare
        # numerically so sorting and cutoffs agree with each other.
        return float(row['Peptide Score'])

    reportReader = reader(reportfile)
    try:
        reportRows = list(reportReader)
        columns = list(reportReader.columns)
    finally:
        reportReader.close()
    # Re-filtering an already-filtered report must not duplicate the column.
    if 'FDR' not in columns:
        columns.append('FDR')

    reportRows.sort(key = scoreOf, reverse = True)

    decoyPrefix = decoyString.lower()

    seenSpectra = {}
    passedRows = []
    failedRows = []
    duplicateRows = []
    reverseRows = []
    reverses = 0.0
    forwards = 0.0
    duplicates = 0
    passed = 0
    failed = 0
    lowPass = 999999999
    highRev = 0
    for row in reportRows:
        specDesc = row['Spectrum Description']
        if specDesc in seenSpectra:
            # A lower-scoring repeat inherits the FDR of the best hit.
            duplicates += 1
            fdr = seenSpectra[specDesc]
            row['FDR'] = fdr
            if includeDuplicates and not separateDuplicateSheet:
                if fdr < threshold:
                    passedRows.append(row)
                else:
                    failedRows.append(row)
            else:
                duplicateRows.append(row)
            continue

        # A PSM only counts as a decoy hit when *every* accession is a decoy;
        # high-scoring peptides can happen to be duplicated in the reverse
        # database, and treating those as decoys produced awful results.
        accessions = row['Accession Number'].split(';')
        if all(decoyPrefix in x.lower() for x in accessions):
            reverses += 1
            if forwards:
                fdr = reverses / forwards
            else:
                fdr = 100
            row['FDR'] = fdr

            highRev = max(highRev, scoreOf(row))

            seenSpectra[specDesc] = fdr
            reverseRows.append(row)
        else:
            forwards += 1
            fdr = reverses / forwards
            row['FDR'] = fdr
            seenSpectra[specDesc] = fdr
            if fdr < threshold:
                passed += 1
                passedRows.append(row)
                lowPass = min(lowPass, scoreOf(row))
            else:
                failed += 1
                failedRows.append(row)

    if single_cutoff:
        recovered = [x for x in failedRows if scoreOf(x) > lowPass]
        failedRows = [x for x in failedRows if scoreOf(x) <= lowPass]
        passedRows += recovered

    if not outputfile:
        # Output format must support sheets.
        if reportfile.lower().endswith('xlsx') or reportfile.lower().endswith('xls'):
            outputfile = insert_tag(reportfile, 'FDR_filtered')
        else:
            outputfile = '.'.join(reportfile.split('.')[:-1] + ['FDR_filtered.xlsx'])

    def writeSheet(sheetName, rows):
        sheet = writer(outputfile, columns = columns, sheet_name = sheetName)
        for row in rows:
            sheet.write(row)
        sheet.close()

    percentage = round(threshold * 100)

    if includeFailedSheet:
        writeSheet("Failed %s%% FDR" % percentage, failedRows)

    if separateDuplicateSheet:
        writeSheet("Duplicate Rows", duplicateRows)

    if includeReverseSheet:
        writeSheet('Reverse Hits', reverseRows)

    if includeStatisticsSheet:
        statOutput = writer(outputfile,
                            columns = ['FDR Calculation Statistics', '--------------'],
                            sheet_name = "FDR Statistics")
        statOutput.write(['', ''])
        statOutput.write(['Total Spectra', str(len(reportRows))])
        statOutput.write(['Passed %s%% FDR' % percentage, str(passed)])
        statOutput.write(['Lowest Passing Score', str(lowPass)])
        statOutput.write(['Reverse Hits', str(reverses)])
        statOutput.write(['Highest Scoring Reverse Hit', str(highRev)])
        statOutput.write(['Number of Duplicates', str(duplicates)])
        statOutput.close()

    # The passing PSMs are written last, to the "Data" sheet.
    writeSheet("Data", passedRows)

    return outputfile
def feature_analysis(datafile, resultFiles, featureFile=None, tolerance=None,
                     mzRegex=None, scanRegex=None, **constants):
    """
    Performs feature-detection analysis on the given .RAW file and PSM
    reports. The output files group the given PSMs by feature, with the
    addition of source feature extent and intensity information.

    datafile -- path to an existing .raw file.
    resultFiles -- iterable of PSM report paths; may be empty or None, in
                   which case only feature detection is performed.
    featureFile -- existing feature data file; if omitted, one is produced
                   by detect_features(datafile, tolerance=..., **constants).
    mzRegex / scanRegex -- if given, replace the module-level
                   spectrumDescriptionToMZ / spectrumDescriptionToScanNumber
                   parsers with ones that return float / int of the regex
                   match found in a spectrum description.

    Returns (featureFile, list of output file names); each output is named
    after its result file with a '.featureDetect.xlsx' ending.
    Raises AssertionError if an input file is missing or datafile is not a
    .raw file.
    """

    global spectrumDescriptionToMZ, spectrumDescriptionToScanNumber

    import os

    if mzRegex:
        import re
        mzRegCompiled = re.compile(mzRegex)

        def mzParser(description):
            return float(mzRegCompiled.search(description).group())

        spectrumDescriptionToMZ = mzParser

    if scanRegex:
        import re
        scanRegCompiled = re.compile(scanRegex)

        def scanParser(description):
            return int(scanRegCompiled.search(description).group())

        spectrumDescriptionToScanNumber = scanParser

    assert os.path.exists(datafile), "%s not found!" % datafile
    # resultFiles may legitimately be None/empty (annotation is then skipped),
    # so the validation loop must tolerate that.
    for resultfile in resultFiles or ():
        assert os.path.exists(resultfile), "%s not found!" % resultfile
    assert datafile.lower().endswith(
        '.raw'), "Only .raw files are currently supported."

    if featureFile:
        assert os.path.exists(
            featureFile
        ), "Specified feature data file %s not found!" % featureFile
    else:
        featureFile = detect_features(datafile, tolerance=tolerance, **constants)
    features = FeatureInterface(featureFile)

    featureColumns = [
        'Feature', 'feature error', 'feature start scan',
        'feature end scan', 'feature start time', 'feature end time',
        'feature intensity', 'feature kurtosis', 'feature skewness'
    ]

    outputfiles = []
    if resultFiles:
        print(resultFiles)
        print("Categorizing search results by file.")
        for resultfile in resultFiles:
            resultfile = os.path.abspath(resultfile)
            inputResults = mzReport.reader(resultfile)
            outputfile = '.'.join(
                resultfile.split('.')[:-1] + ['featureDetect', 'xlsx'])
            outputfiles.append(outputfile)

            resultsByFeature = binByFullFeature(datafile, features, inputResults)

            output = mzReport.writer(
                outputfile, columns=inputResults.columns + featureColumns)

            for result in resultsByFeature:
                output.write(result)

            output.close()
            inputResults.close()

            print("Output saved to %s ." % outputfile)
    else:
        print("No PSM data given; skipping annotation step.")

    return featureFile, outputfiles
def add_gene_ids(target_files, p2g_database, target_sheet=None, outputfile=None,
                 inPlace=False, leucine_equals_isoleucine=True,
                 legacy_columns=True):
    """
    Annotates PSM report(s) with the proteins and genes each peptide maps to,
    using a pickled Pep2Gene (P2G) database.

    target_files -- a single report path or a list of them.
    p2g_database -- path to the pickled P2G database; either one 6-tuple
                    (kmer length, sequence lookup, kmer lookup, gene lookup,
                    I/L sequence lookup, I/L kmer lookup) or the legacy
                    layout of consecutively pickled lookups.
    target_sheet -- sheet name passed to the reader where it is accepted.
    outputfile -- output path, honoured only when a single path was given;
                  otherwise each output is named <input>.GENES.<ext>.
    inPlace -- accepted for interface compatibility; not used here.
    leucine_equals_isoleucine -- also report matches with I treated as L.
    legacy_columns -- use the old column names (pro_count, pro_list, ...).

    Rows whose peptide is not longer than K residues are skipped and do not
    appear in the output.

    Returns the output file name, or a list of names if a list was given.
    """

    # reduce is not a builtin on Python 3; time.clock was removed in 3.8.
    from functools import reduce
    timer = getattr(time, 'perf_counter', time.time)

    starttime = timer()

    if isinstance(target_files, str):
        return_list = False
        target_files = [target_files]
    else:
        return_list = True

    # NOTE(security): pickle.load can execute arbitrary code; only load P2G
    # databases that come from a trusted source.
    with open(p2g_database, 'rb') as dataRdr:
        data = pickle.load(dataRdr)
        k_len = None
        if isinstance(data, tuple) and len(data) == 6:
            (k_len, seqLookup, fmerLookup, geneLookup,
             isoSeqLookup, isoFmerLookup) = data
        elif isinstance(data, tuple):
            raise Exception(
                "Unrecognized P2G database format: tuple of length %s"
                % len(data))
        else:
            print('Legacy mode P2G database detected!')
            seqLookup = data
            fmerLookup = pickle.load(dataRdr)
            geneLookup = pickle.load(dataRdr)
            try:
                isoSeqLookup = pickle.load(dataRdr)
                isoFmerLookup = pickle.load(dataRdr)
            except EOFError:
                # Older databases carry no leucine/isoleucine data.
                isoSeqLookup = None
                isoFmerLookup = None

    # Old databases store gene entries as tuples; reduce them to the name.
    oldTupleInstance = next(iter(geneLookup.values()), None)
    if isinstance(oldTupleInstance, tuple):
        print("Legacy mode gene names detected.")
        nameIndex = 0 if oldTupleInstance[0] and any(
            x.isalpha() for x in oldTupleInstance[0]) else 1
        for k, v in list(geneLookup.items()):
            geneLookup[k] = v[nameIndex]

    if leucine_equals_isoleucine:
        assert isoFmerLookup, (
            "Pep2Gene database does not contain leucine-isoleucine "
            "ambiguity data; re-compile database or "
            "select leucine_equals_isoleucine = False .")

    if k_len:
        assert k_len == K, "Pep2Gene database created with kmers of length %s, not %s" % (
            k_len, K)

    print("P2G database loaded: %.2f\n\n" % (timer() - starttime))
    prevtime = timer()

    # Map the canonical column labels used below onto the names actually
    # written to the output (legacy or current).
    canonical_cols = ["Protein Count", "Proteins", "Gene Count", "Gene Symbols"]
    legacy_cols = ["pro_count", "pro_list", "gene_count", "gene_symbols"]
    if leucine_equals_isoleucine:
        canonical_cols = canonical_cols + [
            'I<->L Protein Count', 'I<->L Proteins',
            'I<->L Gene Count', 'I<->L Gene Symbols'
        ]
        legacy_cols = legacy_cols + [
            'IL Ambiguity pro_count', 'IL Ambiguity pro_list',
            "IL Ambiguity gene_count", "IL Ambiguity gene_symbols"
        ]
    new_cols = legacy_cols if legacy_columns else canonical_cols
    colname = dict(zip(canonical_cols, new_cols))

    # pep_find could be replaced by giving the p2g database a pre-made set of
    # hashes of all tryptic peptides in a protein, and seeing if the hash of
    # the pep is present in the set.
    tryptic_template = '((^M?)|[KR](?=[^P]))%s(((?<=[KR])[^P])|$)'

    outputfiles = []
    for target_file in target_files:
        try:
            rdr = reader(target_file, sheet_name=target_sheet)
        except TypeError:
            rdr = reader(target_file)  # Not an Excel file.

        if (not outputfile) or return_list:
            ext = target_file.split('.')[-1]
            outputfile = '.'.join(target_file.split('.')[:-1] + ['GENES', ext])

        output = writer(outputfile, columns=rdr.columns + new_cols)

        # Per-file caches: peptide -> set of matching proteins.
        pepToProts = {}
        isoPepToProts = {}
        for counter, row in enumerate(rdr):
            if counter % 1000 == 0:
                print_progress(counter)

            try:
                pep = row['Peptide Sequence'].upper()
            except KeyError:
                pep = row['Peptide'].upper()

            pep = ''.join([x for x in pep if x.isalpha()])
            if len(pep) <= K:
                continue  # No 4-mers in a 3-mer!

            isoPep = pep.replace('I', 'L')

            if pep not in pepToProts:
                # Proteins sharing every kmer are candidates; the regex then
                # confirms the peptide occurs in tryptic context.
                candidate_prots = reduce(set.intersection,
                                         (fmerLookup[pep[x:x + K]]
                                          for x in range(len(pep) - K)))
                pep_find = re.compile(tryptic_template % pep)
                pepToProts[pep] = set(prot for prot in candidate_prots
                                      if pep_find.search(seqLookup[prot]))

            if leucine_equals_isoleucine and isoPep not in isoPepToProts:
                iso_candidate_prots = reduce(
                    set.intersection,
                    (isoFmerLookup[isoPep[x:x + K]]
                     for x in range(len(isoPep) - K)))
                pep_find = re.compile(tryptic_template % isoPep)
                isoPepToProts[isoPep] = set(
                    prot for prot in iso_candidate_prots
                    if pep_find.search(isoSeqLookup[prot]))

            matched = pepToProts[pep]
            geneList = set(geneLookup[x] for x in matched if x in geneLookup)

            row[colname['Protein Count']] = len(matched)
            row[colname['Proteins']] = '; '.join(matched)
            row[colname['Gene Count']] = len(geneList)
            row[colname['Gene Symbols']] = '; '.join(geneList)

            if leucine_equals_isoleucine:
                isoMatched = isoPepToProts[isoPep]
                isoGeneList = set(geneLookup[x] for x in isoMatched
                                  if x in geneLookup)

                row[colname['I<->L Protein Count']] = len(isoMatched)
                row[colname['I<->L Proteins']] = '; '.join(isoMatched)
                row[colname['I<->L Gene Count']] = len(isoGeneList)
                row[colname['I<->L Gene Symbols']] = '; '.join(isoGeneList)

            output.write(row)

        print("\nGene lookup completed: %.2f" % (timer() - prevtime))
        prevtime = timer()

        rdr.close()
        output.close()

        print("Output written: %.2f" % (timer() - prevtime))
        outputfiles.append(outputfile)

    if return_list:
        return outputfiles
    else:
        return outputfile
def evaluateMascotFile(resultfile, datafile = None, featurefile = None, outputfile = None):
    """
    Evaluates labelling in a Mascot result file by matching each PSM to a
    detected MS1 feature and handing the feature intensities to the
    SILAC or TMT/iTRAQ evaluation routine.

    resultfile -- report with a 'Mascot_Header' sheet; the label method is
                  taken from its 'Quantitation method' (must contain 'SILAC')
                  or 'Variable modifications' (must contain 'plex') entry.
    datafile -- raw data file; opened with mzFile to map MS2 scans onto their
                preceding MS1 scan, and used for feature detection when no
                featurefile is given.
                NOTE(review): the default is None but the file is always
                opened, so it appears to be required in practice -- confirm.
    featurefile -- existing feature file; detected from datafile if omitted.
    outputfile -- destination; defaults to <resultfile>_LABEL_EVALUATION.xlsx.

    Returns (evaluation result, outputfile).
    Raises AssertionError if the label method is not recognized.
    """

    #assert datafile or featurefile, "Either raw data or feature data must be given!"

    headerReader = reader(resultfile, sheet_name = 'Mascot_Header')
    header = [list(x.values()) for x in headerReader]
    headerReader.close()

    def retrieveHeaderValue(key):
        # The value is the first other entry in the first header row that
        # contains the key; '' when no such row/value exists.
        try:
            return [[x for x in xs if x != key] for xs in header if key in xs][0][0]
        except IndexError:
            return ''

    quant = retrieveHeaderValue('Quantitation method')
    varmods = retrieveHeaderValue('Variable modifications')

    assert ('SILAC' in quant) or ('plex' in varmods), "Label method not recognized!"

    if not featurefile:
        featurefile = detectFeatures(datafile, signalToNoiseThreshold = 15)
    features = FeatureInterface(featurefile)

    print("Matching features to PSMs...")
    resultReader = reader(resultfile)
    columns = resultReader.columns
    results = list(resultReader)
    resultReader.close()

    # Build MS2 scan -> preceding MS1 scan.
    data = mzFile(datafile)
    try:
        ms1map = {}
        ms2s = []
        ms1 = None
        for _, _, scan, level, _ in data.scan_info(0, 999999):
            if level == 'MS1':
                for ms2 in ms2s:
                    ms1map[ms2] = ms1
                ms1 = scan
                ms2s = []
            elif level == 'MS2':
                ms2s.append(scan)
        # The MS2 scans after the final MS1 are never followed by another
        # MS1, so they have to be flushed here.
        for ms2 in ms2s:
            ms1map[ms2] = ms1
    finally:
        data.close()

    featureIntMap = {}
    for psm in results:
        mz = psm['Experimental mz']
        scan = int(psm['Spectrum Description'].split('.')[1])
        charge = int(psm['Charge'])

        # Take the first feature near the precursor m/z that contains it.
        for index, feature in features.mz_range(mz - 1, mz + 1):
            if feature.containsPoint(mz, ms1map[scan], charge):
                featureIntMap[scan] = feature.c12Intensity()
                break

    del features

    if not outputfile:
        outputfile = '.'.join(resultfile.split('.')[:-1]) + '_LABEL_EVALUATION.xlsx'

    if 'SILAC' in quant:
        return evaluateSILAC(outputfile, columns, results, featureIntMap), outputfile
    elif 'plex' in varmods:
        return evaluateTMTiTRAQ(outputfile, columns, results, featureIntMap), outputfile