def __init__(self, alignment_fdr_threshold=0.0001, smoother="lowess",
             external_r_tmpdir=None, maxdata=-1, experiment=None):
    """Store the alignment settings and start with an empty transformation collection.

    Args:
        alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
        smoother: kept as ``smoother``.
        external_r_tmpdir: kept as ``tmpdir_``.
        maxdata: kept as ``max_data_``.
        experiment: kept as ``_experiment``.
    """
    # Plain settings; these assignments are independent of each other.
    self.smoother = smoother
    self.tmpdir_ = external_r_tmpdir
    self.max_data_ = maxdata
    self.alignment_fdr_threshold_ = alignment_fdr_threshold

    # The cache slot starts out empty.
    self._experiment = experiment
    self._cacher = None

    # Fresh container for the transformations produced later on.
    self.transformation_collection = TransformationCollection()
def __init__(self, alignment_fdr_threshold=0.0001, smoother="lowess",
             external_r_tmpdir=None, maxdata=-1, experiment=None):
    """Store the alignment settings and try to set up the Cython data cacher.

    Args:
        alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
        smoother: kept as ``smoother``.
        external_r_tmpdir: kept as ``tmpdir_``.
        maxdata: kept as ``max_data_``.
        experiment: kept as ``_experiment``.
    """
    # Plain settings; these assignments are independent of each other.
    self.smoother = smoother
    self.tmpdir_ = external_r_tmpdir
    self.max_data_ = maxdata
    self.alignment_fdr_threshold_ = alignment_fdr_threshold

    # Both cache slots start out empty.
    self._experiment = experiment
    self._cacher = None
    self._cy_cacher = None

    # Fresh container for the transformations produced later on.
    self.transformation_collection = TransformationCollection()

    # The Cython cacher is optional: when it cannot be imported,
    # ``_cy_cacher`` stays None and a warning is printed.
    try:
        from msproteomicstoolslib.algorithms.alignment.DataCacher import CyDataCacher
        self._cy_cacher = CyDataCacher()
    except ImportError:
        print("WARNING: cannot import CyDataCacher, will use Python version (slower).")
def __init__(self, alignment_fdr_threshold=0.0001, smoother="lowess",
             external_r_tmpdir=None, maxdata=-1, experiment=None):
    """Store the alignment settings and try to set up the Cython data cacher.

    Args:
        alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
        smoother: kept as ``smoother``.
        external_r_tmpdir: kept as ``tmpdir_``.
        maxdata: kept as ``max_data_``.
        experiment: kept as ``_experiment``.
    """
    # Plain settings; these assignments are independent of each other.
    self.smoother = smoother
    self.tmpdir_ = external_r_tmpdir
    self.max_data_ = maxdata
    self.alignment_fdr_threshold_ = alignment_fdr_threshold

    # Both cache slots start out empty.
    self._experiment = experiment
    self._cacher = None
    self._cy_cacher = None

    # Fresh container for the transformations produced later on.
    self.transformation_collection = TransformationCollection()

    # The Cython cacher is optional: when it cannot be imported,
    # ``_cy_cacher`` stays None and a warning is printed.
    try:
        from msproteomicstoolslib.algorithms.alignment.DataCacher import CyDataCacher
        self._cy_cacher = CyDataCacher()
    except ImportError:
        print("WARNING: cannot import CyDataCacher, will use Python version (slower).")
def __init__(self, alignment_fdr_threshold=0.0001, smoother="lowess",
             external_r_tmpdir=None, maxdata=-1):
    """Store the alignment settings and start with an empty transformation collection.

    Args:
        alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
        smoother: kept as ``smoother``.
        external_r_tmpdir: kept as ``tmpdir_``.
        maxdata: kept as ``max_data_``.
    """
    # Plain settings; these assignments are independent of each other.
    self.smoother = smoother
    self.tmpdir_ = external_r_tmpdir
    self.max_data_ = maxdata
    self.alignment_fdr_threshold_ = alignment_fdr_threshold

    # Fresh container for the transformations produced later on.
    self.transformation_collection = TransformationCollection()
class SplineAligner():
    """
    Use the datasmoothing part of msproteomicstoolslib to align two runs in
    retention times using splines.

    >>> spl_aligner = SplineAligner()
    >>> transformations = spl_aligner.rt_align_all_runs(this_exp, multipeptides)
    """

    def __init__(self, alignment_fdr_threshold = 0.0001, smoother="lowess", external_r_tmpdir=None, maxdata=-1):
        """Store the alignment settings.

        Args:
            alignment_fdr_threshold: only peakgroups whose fdr score is
                strictly below this value are used as alignment anchors.
            smoother: passed on to ``smoothing.getSmoothingObj``.
            external_r_tmpdir: passed on to ``smoothing.getSmoothingObj`` as
                ``tmpdir``.
            maxdata: maximal number of anchor points used per alignment
                (-1 means use all of them).
        """
        self.transformation_collection = TransformationCollection()
        self.alignment_fdr_threshold_ = alignment_fdr_threshold
        self.smoother = smoother
        self.tmpdir_ = external_r_tmpdir
        self.max_data_ = maxdata
        # Created here (and reset by rt_align_all_runs) so that
        # getTransformationError() and _spline_align_runs() do not fail with
        # an AttributeError when used before rt_align_all_runs().
        self.transformation_error = TransformationError()

    def _determine_best_run(self, experiment):
        """Return the run with the most non-decoy best peakgroups below the threshold.

        Ties are resolved in favour of the run that comes first in
        ``experiment.runs``.

        Raises:
            IndexError: if the experiment contains no runs.
        """
        maxcount = -1
        bestrun = None
        for run in experiment.runs:
            cnt = 0
            for prgroup in run:
                for peptide in prgroup:
                    if peptide.get_decoy():
                        continue
                    pg = peptide.get_best_peakgroup()
                    if pg.get_fdr_score() < self.alignment_fdr_threshold_:
                        cnt += 1
            if cnt > maxcount:
                maxcount = cnt
                bestrun = run

        if bestrun is None:
            # Same exception type as the former ``[...][0]`` lookup, but with
            # a message that explains what went wrong.
            raise IndexError("Cannot determine the best run: the experiment contains no runs.")

        print("Found best run", bestrun.get_id(), "with %s features above the cutoff of %s%%" % (maxcount, self.alignment_fdr_threshold_))
        return bestrun

    def _getRTData(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run

        Only multipeptides with exactly one peakgroup below the threshold in
        both runs (and no decoy among the two) contribute a data point.

        Returns:
            tuple ``(data1, data2)``: data1 holds the retention times of the
            reference run (master), data2 those of the run to be aligned
            (slave). Points are ordered by the fdr score of the reference
            peakgroup and truncated to ``max_data_`` entries.
        """
        threshold = self.alignment_fdr_threshold_
        run_id = run.get_id()
        ref_id = bestrun.get_id()

        data_tmp = []
        cnt_multiple = 0
        for m in multipeptides:
            try:
                ali_group = m.getPrecursorGroup(run_id)
                len_ali = sum(1 for pg in ali_group.getAllPeakgroups() if pg.get_fdr_score() < threshold)
                ref_group = m.getPrecursorGroup(ref_id)
                len_ref = sum(1 for pg in ref_group.getAllPeakgroups() if pg.get_fdr_score() < threshold)

                # Do not consider peakgroups that are missing in one run
                # Do not consider peakgroups that have more than one good peakgroup
                if len_ali != 1 or len_ref != 1:
                    if len_ali > 1 or len_ref > 1:
                        cnt_multiple += 1
                    continue

                ref_pep = ref_group.getOverallBestPeakgroup()
                align_pep = ali_group.getOverallBestPeakgroup()
            except KeyError:
                # it is possible that for some, no peak group exists in this run
                continue

            # Do not use decoy peptides
            if ref_pep.peptide.get_decoy() or align_pep.peptide.get_decoy():
                continue

            if ref_pep.get_fdr_score() < threshold and align_pep.get_fdr_score() < threshold:
                data_tmp.append((ref_pep.get_fdr_score(),
                                 ref_pep.get_normalized_retentiontime(),
                                 align_pep.get_normalized_retentiontime()))

        if cnt_multiple > len(multipeptides) * 0.8 :
            print ("")
            print (" Warning: Most of your data has more than one peakgroup with a score better than %s." % self.alignment_fdr_threshold_)
            print (" This may be a problem for the alignment, please consider adjusting the --alignment_score option." )

        maxdata = self.max_data_
        if maxdata == -1:
            # -1 means take all data
            maxdata = len(data_tmp)

        # Best (lowest fdr) anchors first, then split into the two RT lists.
        selected = sorted(data_tmp)[:maxdata]
        data1 = [ref_rt for _fdr, ref_rt, _run_rt in selected]
        data2 = [run_rt for _fdr, _ref_rt, run_rt in selected]
        return data1,data2

    def _spline_align_runs(self, bestrun, run, multipeptides):
        """Will align run against bestrun

        Fits the smoother on the anchor points of both runs, records the
        transformation and its error, and writes the predicted retention
        times back into the peakgroups of ``run``.

        Raises:
            Exception: if fewer than 2 anchor points are available.
        """

        sm = smoothing.getSmoothingObj(smoother = self.smoother, tmpdir = self.tmpdir_)

        # get those peptides we want to use for alignment => for this use the mapping
        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data1,data2 = self._getRTData(bestrun, run, multipeptides)

        if len(data2) < 2:
            print("No common identifications between %s and %s. Only found %s features below a cutoff of %s" % (
                run.get_id(), bestrun.get_id(), len(data1), self.alignment_fdr_threshold_) )
            print("If you ran the feature_alignment.py script, try to skip the re-alignment step (e.g. remove the --realign_runs option)." )
            raise Exception("Not enough datapoints (less than 2 datapoints).")

        # Since we want to predict how to convert from slave to master, slave
        # is first and master is second.
        sm.initialize(data2, data1)
        data2_aligned = sm.predict(data2)

        # Store transformation in collection (from run to bestrun)
        self.transformation_collection.addTransformationData([data2, data1], run.get_id(), bestrun.get_id() )
        self.transformation_collection.addTransformedData(data2_aligned, run.get_id(), bestrun.get_id() )

        # Residuals between the reference RT and the aligned RT of the anchors
        residuals = numpy.array(data1) - numpy.array(data2_aligned)
        stdev = numpy.std(residuals)
        median = numpy.median(residuals)
        print("Will align run %s against %s, using %s features" % (run.get_id(), bestrun.get_id(), len(data1)) )
        print(" Computed stdev", stdev, "and median", median )

        # Store error for later
        d = self.transformation_error.transformations.get(run.get_id(), {})
        d[bestrun.get_id()] = [stdev, median]
        self.transformation_error.transformations[ run.get_id() ] = d

        # Now predict on _all_ data and write this back to the data
        rt_eval = []
        for prgr in run:
            for pep in prgr:
                rt_eval.extend(pg.get_normalized_retentiontime() for pg in pep.get_all_peakgroups())
        aligned_result = sm.predict(rt_eval)

        # NOTE(review): the write-back walks ``pep.peakgroups_`` while the
        # prediction input came from ``pep.get_all_peakgroups()``; this relies
        # on both yielding the same peakgroups in the same order - confirm.
        i = 0
        for prgr in run:
            for pep in prgr:
                # TODO hack -> direct access to the internal peakgroups object
                updated = []
                for pg in pep.peakgroups_:
                    fields = list(pg)
                    # slot 2 of the tuple receives the aligned retention time
                    fields[2] = aligned_result[i]
                    i += 1
                    updated.append(tuple(fields))
                pep.peakgroups_ = updated

    def rt_align_all_runs(self, experiment, multipeptides):
        """ Align all runs contained in an MRExperiment

        Args:
            experiment(MRExperiment): a collection of runs
            multipeptides(list(multipeptides)): a list of Multipeptide derived from the above expriment

        Returns:
            the transformation collection holding one transformation per
            non-reference run
        """

        print("Will re-align runs" )

        # get the best run (e.g. the one with the most ids below threshold)
        bestrun = self._determine_best_run(experiment)

        ## spl_aligner.transformation_collection = experiment.transformation_collection
        self.transformation_collection.setReferenceRunID( bestrun.get_id() )
        # Start from a clean error record for this alignment round
        self.transformation_error = TransformationError()

        # go through all runs and align two runs at a time
        for run in experiment.runs:
            if run.get_id() == bestrun.get_id():
                continue # do not align reference run itself
            self._spline_align_runs(bestrun, run, multipeptides)

        return self.transformation_collection

    def getTransformationError(self):
        """
        Get the error of the transformation

        Returns:
            transformation_error(:class:`.TransformationError`) : the error of the transformation
        """
        return self.transformation_error
def __init__(self):
    """Run the parent initialiser, then attach an empty TransformationCollection."""
    parent = super(Experiment, self)
    parent.__init__()

    # Container for the transformations belonging to this experiment.
    self.transformation_collection = TransformationCollection()
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.
    """

    def __init__(self):
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """Estimate the FDR among the selected peakgroups from the decoy counts.

        Only multipeptides for which ``more_than_fraction_selected`` holds
        are taken into account.

        Returns:
            an object with the attributes ``nr_decoys``, ``nr_targets``,
            ``decoy_pcnt`` and ``est_real_fdr``. The last two stay 0.0 when
            ``self.estimated_decoy_pcnt`` is None or nothing was selected.
        """

        class DecoyStats(object):
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0

        d = DecoyStats()
        precursors_to_be_used = [m for m in multipeptides
                                 if m.more_than_fraction_selected(fraction_needed_selected)]

        # count the selected peakgroups of decoys and targets
        for prec in precursors_to_be_used:
            nr_selected = len(prec.get_selected_peakgroups())
            if prec.find_best_peptide_pg().peptide.get_decoy():
                d.nr_decoys += nr_selected
            else:
                d.nr_targets += nr_selected

        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ration obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease realtive to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None:
            return d
        if (d.nr_targets + d.nr_decoys) == 0:
            return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys))
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, fdr_cutoff, fraction_present, min_nrruns):
        """Print a summary of the alignment (quantified precursors, peptides, proteins, decoys).

        The print calls use a single pre-formatted string each, a form that
        is accepted by both Python 2 and Python 3 and yields the same text as
        the former comma-separated print statements.
        """
        alignment = AlignmentStatistics()
        alignment.count(multipeptides, fdr_cutoff)

        # Count presence in all runs (before alignment)
        precursors_in_all_runs_wo_align = len(
            [1 for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy()])
        proteins_in_all_runs_wo_align_target = len(set(
            [m.find_best_peptide_pg().peptide.protein_name for m in multipeptides
             if m.all_above_cutoff(fdr_cutoff) and not m.find_best_peptide_pg().peptide.get_decoy()]))
        peptides_in_all_runs_wo_align_target = len(set(
            [m.find_best_peptide_pg().peptide.sequence for m in multipeptides
             if m.all_above_cutoff(fdr_cutoff) and not m.find_best_peptide_pg().peptide.get_decoy()]))

        # Count presence in all runs (after alignment)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        nr_peptides_target = len(set(
            [prec.find_best_peptide_pg().peptide.sequence for prec in precursors_in_all_runs
             if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_proteins_target = len(set(
            [prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_in_all_runs
             if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_precursors_in_all = len(
            [1 for m in multipeptides if m.all_selected() and not m.get_decoy()])
        max_pg = alignment.nr_good_precursors * len(self.runs)
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        # Get single/multiple hits stats
        from itertools import groupby
        precursors_quantified = [m for m in multipeptides if len(m.get_selected_peakgroups()) > 0]
        target_quant_protein_list = [prec.find_best_peptide_pg().peptide.protein_name
                                     for prec in precursors_quantified
                                     if not prec.find_best_peptide_pg().peptide.get_decoy()]
        target_quant_protein_list.sort()
        # Number of quantified target precursors per protein (list is sorted,
        # so groupby yields exactly one group per protein name).
        hits_per_protein = [len(list(group)) for _key, group in groupby(target_quant_protein_list)]
        nr_sh_target_proteins = sum(1 for nr in hits_per_protein if nr == 1)
        nr_mh_target_proteins = sum(1 for nr in hits_per_protein if nr > 1)

        #
        ###########################################################################
        #
        print("=" * 75)
        print("=" * 75)
        print("Total we have %s runs with %s peakgroups quantified in at least %s run(s) below m_score (q-value) %0.4f %%, giving maximally nr peakgroups %s" % (
            len(self.runs), alignment.nr_good_precursors, min_nrruns, fdr_cutoff * 100, max_pg))
        print("We were able to quantify %s / %s peakgroups of which we aligned %s" % (
            alignment.nr_quantified, max_pg, alignment.nr_aligned))
        print(" The order of %s peakgroups was changed, %s could not be aligned and %s were removed. Ambigous cases: %s, multiple suitable peakgroups: %s" % (
            alignment.nr_changed, max_pg - alignment.nr_quantified,
            alignment.nr_removed, self.nr_ambiguous, self.nr_multiple_align))
        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
            alignment.nr_quant_precursors, alignment.nr_good_precursors, min_nrruns,
            nr_precursors_in_all, precursors_in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_peptides), len(alignment.good_peptides), min_nrruns,
            nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_proteins), len(alignment.good_proteins), min_nrruns,
            nr_proteins_target, proteins_in_all_runs_wo_align_target))
        print(" Of these %s proteins, %s were multiple hits and %s were single hits." % (
            len(alignment.quant_proteins), nr_mh_target_proteins, nr_sh_target_proteins))

        # Get decoy estimates
        decoy_precursors = len([1 for m in multipeptides
                                if len(m.get_selected_peakgroups()) > 0
                                and m.find_best_peptide_pg().peptide.get_decoy()])
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys,
                dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr * 100))

        print("Decoy percentage of peakgroups that are partially aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
            dstats.decoy_pcnt, dstats.nr_decoys,
            dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr * 100))

        # Guard against a division by zero when nothing at all was quantified
        total_precursors = alignment.nr_quant_precursors + decoy_precursors
        decoy_precursor_pcnt = 0.0
        if total_precursors > 0:
            decoy_precursor_pcnt = decoy_precursors * 100.0 / total_precursors
        print("There were %s decoy precursors identified out of %s precursors which is %0.4f %%" % (
            decoy_precursors, total_precursors, decoy_precursor_pcnt))

    def _getTrafoFilename(self, current_run, ref_id):
        """Return the path of the .tr file for ``current_run``.

        The file is placed next to the run's original input file and is named
        ``<input basename without extension>-<run id>-<reference id>.tr``.
        """
        current_id = current_run.get_id()
        input_basename = os.path.basename(current_run.orig_filename)
        fn = os.path.splitext(input_basename)[0]
        dirname = os.path.dirname(current_run.orig_filename)
        filename = os.path.join(dirname, "%s-%s-%s.tr" % (fn, current_id, ref_id))
        return filename

    def _write_trafo_files(self):
        """Write one transformation (.tr) file per run and read each one back.

        Returns:
            list of the file names that were written (formerly collected but
            never returned).
        """
        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._getTrafoFilename(current_run, ref_id)
            trafo_fnames.append(filename)
            self.transformation_collection.writeTransformationData(filename, current_id, ref_id)
            self.transformation_collection.readTransformationData(filename)
        return trafo_fnames

    def write_to_file(self, multipeptides, options, writeTrafoFiles=True):
        """Write the alignment result to the output files requested in ``options``.

        Reads ``infiles``, ``outfile``, ``matrix_outfile``, ``yaml_outfile``,
        ``ids_outfile``, ``min_frac_selected``, ``file_format``,
        ``readmethod``, ``matrix_output_method``, ``fdr_cutoff`` and
        ``aligned_fdr_cutoff`` from ``options``. An output whose file name is
        empty is skipped.

        Args:
            multipeptides: the aligned multipeptides
            options: the options object described above
            writeTrafoFiles: whether to also write the .tr transformation files
        """
        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        # 1. Collect ids of selected features
        selected_pgs = []
        for m in multipeptides:
            selected_peakgroups = m.get_selected_peakgroups()
            if (len(selected_peakgroups) * 1.0 / len(self.runs)) < fraction_needed_selected:
                continue
            for p in m.getAllPeptides():
                selected_pgs.extend(p.getClusteredPeakgroups())
        selected_ids_dict = dict([(pg.get_feature_id(), pg) for pg in selected_pgs])

        # 2. Write out the (selected) ids
        if len(ids_outfile) > 0:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in selected_pgs:
                    id_writer.writerow([pg.get_feature_id()])

        # 3. Write out the matrix outfile
        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides,
                                  fraction_needed_selected,
                                  style=options.matrix_output_method,
                                  aligner_mscore_treshold=options.fdr_cutoff)

        # 4. Write out the full outfile
        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            # Work on a copy: extending the header in place would also change
            # the header stored on the first run.
            header_to_write = list(header_first) + ["align_runid", "align_origfilename"]
            # Column holding the original run id (if the input has one); this
            # lookup replaces an undefined ``header_dict`` that made this
            # branch fail with a NameError.
            run_id_col = None
            if "run_id" in header_first:
                run_id_col = list(header_first).index("run_id")

            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_to_write)
                for m in multipeptides:
                    selected_peakgroups = m.get_selected_peakgroups()
                    if (len(selected_peakgroups) * 1.0 / len(self.runs)) < fraction_needed_selected:
                        continue
                    # NOTE(review): step 1 iterates m.getAllPeptides() while this
                    # branch uses m.get_peptides() - confirm both exist.
                    for p in m.get_peptides():
                        selected_pg = p.get_selected_peakgroup()
                        if selected_pg is None:
                            continue
                        aligned_run_id = selected_pg.run.get_id()
                        # Copy the row so the stored peakgroup row is not extended
                        row_to_write = list(selected_pg.row) + [aligned_run_id, selected_pg.run.orig_filename]
                        # Replace run_id with the aligned id (align_runid) ->
                        # otherwise the run_id is not guaranteed to be unique
                        if run_id_col is not None:
                            row_to_write[run_id_col] = aligned_run_id
                        writer.writerow(row_to_write)

        elif len(outfile) > 0 and file_format in ["openswath", "peakview_preprocess"]:

            name_of_id_col_map = {"openswath": "id", "peakview_preprocess": "preprocess_id"}
            name_of_trgr_col_map = {"openswath": "transition_group_id", "peakview_preprocess": "Pep Index"}
            name_of_id_col = name_of_id_col_map[file_format]
            name_of_trgr_col = name_of_trgr_col_map[file_format]

            # Only in openswath we have the ID and can go back to the original file.
            # We can write out the complete original files.
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            # Copy for the same reason as above (do not modify the run's header)
            header_to_write = list(header_first) + ["align_runid", "align_origfilename", "align_clusterid"]

            # csv needs text; gzip only delivers text in 'rt' mode on Python 3
            gzip_mode = 'rb' if sys.version_info[0] < 3 else 'rt'

            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_to_write)

                for f in infiles:
                    if f.endswith('.gz'):
                        import gzip
                        filehandler = gzip.open(f, gzip_mode)
                    else:
                        filehandler = open(f)
                    try:
                        reader = csv.reader(filehandler, delimiter="\t")
                        header = next(reader)
                        header_dict = {}
                        for i, n in enumerate(header):
                            header_dict[n] = i

                        for row in reader:
                            f_id = row[header_dict[name_of_id_col]]
                            if f_id not in selected_ids_dict:
                                continue
                            selected = selected_ids_dict[f_id]
                            # Check the "id" and "transition_group_id" field.
                            # Unfortunately the id can be non-unique, there we check both.
                            trgroup_id = selected.peptide.get_id()
                            unique_peptide_id = row[header_dict[name_of_trgr_col]]
                            if unique_peptide_id != trgroup_id:
                                continue
                            row_to_write = row
                            row_to_write += [selected.peptide.run.get_id(), f, selected.get_cluster_id()]
                            # Replace run_id with the aligned id (align_runid) ->
                            # otherwise the run_id is not guaranteed to be unique
                            if file_format == "openswath":
                                row_to_write[header_dict["run_id"]] = selected.peptide.run.get_id()
                            writer.writerow(row_to_write)
                    finally:
                        filehandler.close()

        # 5. Write out the .tr transformation files
        if writeTrafoFiles:
            self._write_trafo_files()

        # 6. Write out the YAML file
        if len(yaml_outfile) > 0:
            import yaml
            myYaml = {
                "Commandline": sys.argv,
                "RawData": [],
                "PeakGroupData": [outfile],
                "ReferenceRun": self.transformation_collection.getReferenceRunID(),
                "FeatureAlignment": {
                    "RawInputParameters": options.__dict__,
                    "Parameters": {}
                },
                "Parameters": {}
            }
            myYaml["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff)  # deprecated
            myYaml["FeatureAlignment"]["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff)
            myYaml["FeatureAlignment"]["Parameters"]["fdr_cutoff"] = float(options.fdr_cutoff)
            myYaml["FeatureAlignment"]["Parameters"]["aligned_fdr_cutoff"] = float(options.aligned_fdr_cutoff)
            for current_run in self.runs:
                current_id = current_run.get_id()
                ref_id = self.transformation_collection.getReferenceRunID()
                filename = self._getTrafoFilename(current_run, ref_id)
                dirpath = os.path.dirname(current_run.orig_filename)
                ### Use real path (not very useful when moving data from one computer to another)
                ### filename = os.path.realpath(filename)
                ### dirpath = os.path.realpath(dirpath)
                this = {"id": current_id, "directory": dirpath, "trafo_file": filename}
                myYaml["RawData"].append(this)
            with open(yaml_outfile, 'w') as yaml_fh:
                yaml_fh.write(yaml.dump({"AlignedSwathRuns": myYaml}))
def _read_trafo(self, trafo_filenames):
    """Read the transformation files into a new TransformationCollection.

    Args:
        trafo_filenames: iterable of mappings, each providing the path of a
            transformation file under the key "trafo_file".

    Returns:
        the populated collection, after ``initialize_from_data(reverse=True)``
        has been called on it. (The collection used to be built and then
        dropped because the method returned nothing.)
    """
    # Read the transformations
    transformation_collection_ = TransformationCollection()
    for filename in [d["trafo_file"] for d in trafo_filenames]:
        transformation_collection_.readTransformationData(filename)
    transformation_collection_.initialize_from_data(reverse=True)
    return transformation_collection_
class SplineAligner():
    """
    Use the datasmoothing part of msproteomicstoolslib to align two runs in
    retention times using splines.

    >>> spl_aligner = SplineAligner()
    >>> transformations = spl_aligner.rt_align_all_runs(this_exp, multipeptides)
    """

    def __init__(self, alignment_fdr_threshold = 0.0001, smoother="lowess", external_r_tmpdir=None, maxdata=-1, experiment=None):
        """Store the alignment settings.

        Args:
            alignment_fdr_threshold: only peakgroups whose fdr score is
                strictly below this value are used as alignment anchors.
            smoother: passed on to ``smoothing.getSmoothingObj``.
            external_r_tmpdir: passed on to ``smoothing.getSmoothingObj`` as
                ``tmpdir``.
            maxdata: maximal number of anchor points used per alignment
                (-1 means use all of them).
            experiment: when given, the anchor points of all its runs are
                cached once and reused for every pairwise alignment;
                otherwise they are collected again for every pair.
        """
        self.transformation_collection = TransformationCollection()
        self.alignment_fdr_threshold_ = alignment_fdr_threshold
        self.smoother = smoother
        self.tmpdir_ = external_r_tmpdir
        self.max_data_ = maxdata
        self._cacher = None
        self._experiment = experiment
        # Created here (and reset by rt_align_all_runs) so that
        # getTransformationError() and _spline_align_runs() do not fail with
        # an AttributeError when used before rt_align_all_runs().
        self.transformation_error = TransformationError()

    def _determine_best_run(self, experiment):
        """Return the run with the most non-decoy best peakgroups below the threshold.

        Ties are resolved in favour of the run that comes first in
        ``experiment.runs``.

        Raises:
            IndexError: if the experiment contains no runs.
        """
        maxcount = -1
        bestrun = None
        for run in experiment.runs:
            cnt = 0
            for prgroup in run:
                for peptide in prgroup:
                    if peptide.get_decoy():
                        continue
                    pg = peptide.get_best_peakgroup()
                    if pg.get_fdr_score() < self.alignment_fdr_threshold_:
                        cnt += 1
            if cnt > maxcount:
                maxcount = cnt
                bestrun = run

        if bestrun is None:
            # Same exception type as the former ``[...][0]`` lookup, but with
            # a message that explains what went wrong.
            raise IndexError("Cannot determine the best run: the experiment contains no runs.")

        print("Found best run", bestrun.get_id(), "with %s features above the cutoff of %s%%" % (maxcount, self.alignment_fdr_threshold_))
        return bestrun

    def _getRTData(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run

        Uses the cached implementation when an experiment was handed to the
        constructor and the legacy implementation otherwise.
        """
        if self._experiment is not None:
            return self._getRTData_cached(bestrun, run, multipeptides)
        return self._getRTData_legacy(bestrun, run, multipeptides)

    def _select_rt_pairs(self, data_tmp):
        """Sort ``(fdr, reference_rt, run_rt)`` tuples, keep at most ``max_data_`` and split them.

        Returns:
            tuple ``(data1, data2)`` with the reference and the run
            retention times of the retained tuples.
        """
        maxdata = self.max_data_
        if maxdata == -1:
            # -1 means take all data
            maxdata = len(data_tmp)
        selected = sorted(data_tmp)[:maxdata]
        data1 = [ref_rt for _fdr, ref_rt, _run_rt in selected]
        data2 = [run_rt for _fdr, _ref_rt, run_rt in selected]
        return data1,data2

    def _cache_RT_data(self, bestrun, run, multipeptides):
        """Fill ``self._cacher`` with one entry per usable multipeptide.

        Every entry is a list with one slot per run of ``self._experiment``
        (same order as ``self._experiment.runs``). A slot holds
        ``(fdr_score, normalized_retentiontime)`` if the run has exactly one
        peakgroup below the threshold and its best peakgroup is not a decoy,
        else None. Entries with fewer than two filled slots are not stored.
        ``bestrun`` and ``run`` are not used.
        """
        threshold = self.alignment_fdr_threshold_
        cache = []
        for m in multipeptides:
            cached_vals = []
            for r in self._experiment.runs:
                val = None
                run_id = r.get_id()
                if m.hasPrecursorGroup(run_id):
                    group = m.getPrecursorGroup(run_id)
                    al_pg = [pg for pg in group.getAllPeakgroups() if pg.get_fdr_score() < threshold]
                    # We need to have a single, good peak group below the threshold (not a decoy)
                    if len(al_pg) == 1:
                        pep = group.getOverallBestPeakgroup()
                        if not pep.peptide.get_decoy() and pep.get_fdr_score() < threshold:
                            val = (pep.get_fdr_score(), pep.get_normalized_retentiontime())
                cached_vals.append(val)

            # only append with at least 2 values ...
            if len([v for v in cached_vals if v is not None]) > 1:
                cache.append(cached_vals)

        # Publish the cache only once it is complete
        self._cacher = cache

    def _getRTData_cached(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run

        The cache is built on first use from ``multipeptides`` and reused
        afterwards. Data points are ordered by the lower of the two fdr
        scores of a pair.

        Raises:
            IndexError: if ``run`` or ``bestrun`` is not part of the experiment.
        """
        if self._cacher is None:
            self._cache_RT_data(bestrun, run, multipeptides)

        # Position of both runs inside the cached per-run slots
        runs = self._experiment.runs
        run_nr = [k for k, r in enumerate(runs) if r.get_id() == run.get_id()][0]
        bestrun_nr = [k for k, r in enumerate(runs) if r.get_id() == bestrun.get_id()][0]

        data_tmp = []
        for m in self._cacher:
            rund = m[run_nr]
            bestrund = m[bestrun_nr]
            # Skip empty entries
            if rund is None or bestrund is None:
                continue
            data_tmp.append((min(rund[0], bestrund[0]), bestrund[1], rund[1]))

        return self._select_rt_pairs(data_tmp)

    def _getRTData_legacy(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run

        Only multipeptides with exactly one peakgroup below the threshold in
        both runs (and no decoy among the two) contribute a data point. Data
        points are ordered by the fdr score of the reference peakgroup.
        """
        threshold = self.alignment_fdr_threshold_
        run_id = run.get_id()
        ref_id = bestrun.get_id()

        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data_tmp = []
        cnt_multiple = 0
        for m in multipeptides:
            try:
                ali_group = m.getPrecursorGroup(run_id)
                len_ali = sum(1 for pg in ali_group.getAllPeakgroups() if pg.get_fdr_score() < threshold)
                ref_group = m.getPrecursorGroup(ref_id)
                len_ref = sum(1 for pg in ref_group.getAllPeakgroups() if pg.get_fdr_score() < threshold)

                # Do not consider peakgroups that are missing in one run
                # Do not consider peakgroups that have more than one good peakgroup
                if len_ali != 1 or len_ref != 1:
                    if len_ali > 1 or len_ref > 1:
                        cnt_multiple += 1
                    continue

                ref_pep = ref_group.getOverallBestPeakgroup()
                align_pep = ali_group.getOverallBestPeakgroup()
            except KeyError:
                # it is possible that for some, no peak group exists in this run
                continue

            # Do not use decoy peptides
            if ref_pep.peptide.get_decoy() or align_pep.peptide.get_decoy():
                continue

            if ref_pep.get_fdr_score() < threshold and align_pep.get_fdr_score() < threshold:
                data_tmp.append((ref_pep.get_fdr_score(),
                                 ref_pep.get_normalized_retentiontime(),
                                 align_pep.get_normalized_retentiontime()))

        if cnt_multiple > len(multipeptides) * 0.8 :
            print ("")
            print (" Warning: Most of your data has more than one peakgroup with a score better than %s." % self.alignment_fdr_threshold_)
            print (" This may be a problem for the alignment, please consider adjusting the --alignment_score option." )

        return self._select_rt_pairs(data_tmp)

    def _spline_align_runs(self, bestrun, run, multipeptides):
        """Will align run against bestrun

        Fits the smoother on the anchor points of both runs, records the
        transformation and its error, and writes the predicted retention
        times back into the peakgroups of ``run``.

        Raises:
            Exception: if fewer than 2 anchor points are available.
        """

        sm = smoothing.getSmoothingObj(smoother = self.smoother, tmpdir = self.tmpdir_)

        # get those peptides we want to use for alignment => for this use the mapping
        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data1,data2 = self._getRTData(bestrun, run, multipeptides)

        if len(data2) < 2:
            print("No common identifications between %s and %s. Only found %s features below a cutoff of %s" % (
                run.get_id(), bestrun.get_id(), len(data1), self.alignment_fdr_threshold_) )
            print("If you ran the feature_alignment.py script, try to skip the re-alignment step (e.g. remove the --realign_runs option)." )
            raise Exception("Not enough datapoints (less than 2 datapoints).")

        # Since we want to predict how to convert from slave to master, slave
        # is first and master is second.
        sm.initialize(data2, data1)
        data2_aligned = sm.predict(data2)

        # Store transformation in collection (from run to bestrun)
        self.transformation_collection.addTransformationData([data2, data1], run.get_id(), bestrun.get_id() )
        self.transformation_collection.addTransformedData(data2_aligned, run.get_id(), bestrun.get_id() )

        # Residuals between the reference RT and the aligned RT of the anchors
        residuals = numpy.array(data1) - numpy.array(data2_aligned)
        stdev = numpy.std(residuals)
        median = numpy.median(residuals)
        print("Will align run %s against %s, using %s features" % (run.get_id(), bestrun.get_id(), len(data1)) )
        print(" Computed stdev", stdev, "and median", median )

        # Store error for later
        d = self.transformation_error.transformations.get(run.get_id(), {})
        d[bestrun.get_id()] = [stdev, median]
        self.transformation_error.transformations[ run.get_id() ] = d

        # Now predict on _all_ data and write this back to the data
        rt_eval = []
        for prgr in run:
            for pep in prgr:
                rt_eval.extend(pg.get_normalized_retentiontime() for pg in pep.get_all_peakgroups())
        aligned_result = sm.predict(rt_eval)

        # NOTE(review): the write-back walks ``pep.peakgroups_`` while the
        # prediction input came from ``pep.get_all_peakgroups()``; this relies
        # on both yielding the same peakgroups in the same order - confirm.
        i = 0
        for prgr in run:
            for pep in prgr:
                # TODO hack -> direct access to the internal peakgroups object
                updated = []
                for pg in pep.peakgroups_:
                    fields = list(pg)
                    # slot 2 of the tuple receives the aligned retention time
                    fields[2] = aligned_result[i]
                    i += 1
                    updated.append(tuple(fields))
                pep.peakgroups_ = updated

    def rt_align_all_runs(self, experiment, multipeptides):
        """ Align all runs contained in an MRExperiment

        Args:
            experiment(MRExperiment): a collection of runs
            multipeptides(list(multipeptides)): a list of Multipeptide derived from the above expriment

        Returns:
            the transformation collection holding one transformation per
            non-reference run
        """

        print("Will re-align runs" )

        # get the best run (e.g. the one with the most ids below threshold)
        bestrun = self._determine_best_run(experiment)

        ## spl_aligner.transformation_collection = experiment.transformation_collection
        self.transformation_collection.setReferenceRunID( bestrun.get_id() )
        # Start from a clean error record for this alignment round
        self.transformation_error = TransformationError()

        # go through all runs and align two runs at a time
        for run in experiment.runs:
            if run.get_id() == bestrun.get_id():
                continue # do not align reference run itself
            self._spline_align_runs(bestrun, run, multipeptides)

        return self.transformation_collection

    def getTransformationError(self):
        """
        Get the error of the transformation

        Returns:
            transformation_error(:class:`.TransformationError`) : the error of the transformation
        """
        return self.transformation_error
def __init__(self):
    """Run the parent initialiser, then attach an empty TransformationCollection."""
    parent = super(Experiment, self)
    parent.__init__()

    # Container for the transformations belonging to this experiment.
    self.transformation_collection = TransformationCollection()
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of
    which may contain the same precursors.

    In addition to what MRExperiment provides, this class holds the
    TransformationCollection of the alignment, prints alignment statistics
    and writes the result files (feature ids, matrix, aligned peakgroups,
    .tr transformation files and a YAML summary).

    NOTE(review): the methods read self.runs, self.estimated_decoy_pcnt,
    self.initial_fdr_cutoff, self.nr_ambiguous and self.nr_multiple_align,
    none of which are assigned in this class - presumably they are provided
    by MRExperiment or set by the caller; confirm.
    """

    def __init__(self):
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """
        Estimate the real FDR from the decoys among the selected peakgroups.

        Only multipeptides for which
        more_than_fraction_selected(fraction_needed_selected) is true are
        counted.

        Args:
            multipeptides(list): the multipeptides to evaluate
            fraction_needed_selected(float): passed to
                more_than_fraction_selected of each multipeptide

        Returns:
            An object with the attributes nr_decoys, nr_targets, decoy_pcnt
            and est_real_fdr. The last two remain 0.0 if
            self.estimated_decoy_pcnt is None or if no peakgroup was counted.
        """
        class DecoyStats(object):
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0

        d = DecoyStats()
        precursors_to_be_used = [m for m in multipeptides
                                 if m.more_than_fraction_selected(fraction_needed_selected)]

        # count the decoys (and targets): every selected peakgroup of a
        # precursor is attributed to the decoy status of its best peptide
        for prec in precursors_to_be_used:
            nr_selected = len(prec.get_selected_peakgroups())
            if prec.find_best_peptide_pg().peptide.get_decoy():
                d.nr_decoys += nr_selected
            else:
                d.nr_targets += nr_selected

        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ratio obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease relative to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None:
            return d
        if (d.nr_targets + d.nr_decoys) == 0:
            return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys))
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, fdr_cutoff, fraction_present, min_nrruns):
        """
        Compute the alignment statistics and print a human readable summary.

        Args:
            multipeptides(list): the multipeptides after alignment
            fdr_cutoff(float): cutoff handed to AlignmentStatistics.count and
                to all_above_cutoff; printed as a percentage
            fraction_present(float): fraction used for the "partially
                aligned" decoy estimate
            min_nrruns: only used in the printed messages

        Returns:
            The AlignmentStatistics object, extended with the counters that
            are needed later for the YAML output.
        """
        from collections import Counter

        def best_peptide(m):
            return m.find_best_peptide_pg().peptide

        def is_target(m):
            return not best_peptide(m).get_decoy()

        alignment = AlignmentStatistics()
        alignment.count(multipeptides, fdr_cutoff, self.runs)

        # Count presence in all runs (before alignment)
        above_cutoff = [m for m in multipeptides if m.all_above_cutoff(fdr_cutoff)]
        precursors_in_all_runs_wo_align = len([m for m in above_cutoff if not m.get_decoy()])
        proteins_in_all_runs_wo_align_target = len(set(
            best_peptide(m).protein_name for m in above_cutoff if is_target(m)))
        peptides_in_all_runs_wo_align_target = len(set(
            best_peptide(m).sequence for m in above_cutoff if is_target(m)))

        # Count presence in all runs (after alignment)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        targets_in_all_runs = [m for m in precursors_in_all_runs if is_target(m)]
        nr_peptides_target = len(set(best_peptide(m).sequence for m in targets_in_all_runs))
        nr_proteins_target = len(set(best_peptide(m).protein_name for m in targets_in_all_runs))
        nr_precursors_in_all = len([m for m in precursors_in_all_runs if not m.get_decoy()])
        max_pg = alignment.nr_good_precursors * len(self.runs)
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        # Get single/multiple hits stats: a protein is a "single hit" if
        # exactly one quantified target precursor maps to it
        precursors_quantified = [m for m in multipeptides if len(m.get_selected_peakgroups()) > 0]
        hits_per_protein = Counter(
            best_peptide(m).protein_name for m in precursors_quantified if is_target(m))
        nr_sh_target_proteins = sum(1 for cnt in hits_per_protein.values() if cnt == 1)
        nr_mh_target_proteins = sum(1 for cnt in hits_per_protein.values() if cnt > 1)

        ### Store for later (yaml output)
        alignment.nr_ambiguous = self.nr_ambiguous
        alignment.nr_multiple_align = self.nr_multiple_align
        alignment.precursors_in_all_runs_wo_align = precursors_in_all_runs_wo_align
        alignment.peptides_in_all_runs_wo_align_target = peptides_in_all_runs_wo_align_target
        alignment.proteins_in_all_runs_wo_align_target = proteins_in_all_runs_wo_align_target
        alignment.nr_precursors_in_all = nr_precursors_in_all
        alignment.nr_peptides_target = nr_peptides_target
        alignment.nr_proteins_target = nr_proteins_target

        print("="*75)
        print("="*75)
        print("Total we have", len(self.runs), "runs with", alignment.nr_good_precursors,
              "peakgroups quantified in at least %s run(s) below m_score (q-value) %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " +
              "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg,
              "peakgroups of which we aligned", alignment.nr_aligned)
        print(" The order of", alignment.nr_changed, "peakgroups was changed,", max_pg - alignment.nr_quantified,
              "could not be aligned and %s were removed. Ambigous cases: %s, multiple suitable peakgroups: %s" % (
                  alignment.nr_removed, self.nr_ambiguous, self.nr_multiple_align))
        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
            alignment.nr_quant_precursors, alignment.nr_good_precursors, min_nrruns,
            nr_precursors_in_all, precursors_in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_peptides), len(alignment.good_peptides), min_nrruns,
            nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_proteins), len(alignment.good_proteins), min_nrruns,
            nr_proteins_target, proteins_in_all_runs_wo_align_target))
        print("Of these %s proteins, %s were multiple hits and %s were single hits." % (
            len(alignment.quant_proteins), nr_mh_target_proteins, nr_sh_target_proteins))

        # Get decoy estimates
        decoy_precursors = len([m for m in precursors_quantified if best_peptide(m).get_decoy()])
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys,
                dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr*100))
            print("Decoy percentage of peakgroups that are partially aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys,
                dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr*100))
            # Guard the percentage so that an empty result cannot raise a
            # ZeroDivisionError while only printing statistics.
            nr_all_quantified = alignment.nr_quant_precursors + decoy_precursors
            decoy_precursor_pcnt = 0.0
            if nr_all_quantified > 0:
                decoy_precursor_pcnt = decoy_precursors * 100.0 / nr_all_quantified
            print("There were", decoy_precursors, "decoy precursors identified out of",
                  nr_all_quantified, "precursors which is %0.4f %%" % (decoy_precursor_pcnt))
        return alignment

    def _getTrafoFilename(self, current_run, ref_id):
        """
        Return the path of the .tr file for current_run aligned against ref_id.

        The file is placed next to the original input file of the run and is
        named "<input basename without extension>-<run id>-<ref id>.tr".
        """
        current_id = current_run.get_id()
        input_basename = os.path.basename(current_run.orig_filename)
        fn = os.path.splitext(input_basename)[0]
        dirname = os.path.dirname(current_run.orig_filename)
        return os.path.join(dirname, "%s-%s-%s.tr" % (fn, current_id, ref_id))

    def _write_trafo_files(self):
        """
        Write one .tr transformation file per run and read it back in.

        Returns:
            list(str): the names of the written files (earlier versions of
            this method built the list but returned None).
        """
        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
            current_id = current_run.get_id()
            # looked up per run on purpose: the collection is modified by
            # readTransformationData inside this loop
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._getTrafoFilename(current_run, ref_id)
            trafo_fnames.append(filename)
            self.transformation_collection.writeTransformationData(filename, current_id, ref_id)
            self.transformation_collection.readTransformationData(filename)
        return trafo_fnames

    def _enough_runs_selected(self, m, fraction_needed_selected):
        """Return False if too small a fraction of the runs has a selected peakgroup for m."""
        selected_peakgroups = m.get_selected_peakgroups()
        return not ((len(selected_peakgroups)*1.0 / len(self.runs)) < fraction_needed_selected)

    def _output_header(self, extra_columns):
        """
        Return a copy of the header shared by all runs, extended by extra_columns.

        A copy is used so that the header stored on the run objects is not
        modified by writing an output file.
        """
        header_first = list(self.runs[0].header)
        for run in self.runs:
            assert header_first == run.header
        return header_first + list(extra_columns)

    @staticmethod
    def _open_text(path):
        """Open a (possibly gzipped) input file in text mode, as csv.reader requires."""
        if path.endswith('.gz'):
            import gzip
            return gzip.open(path, 'rt')
        return open(path)

    def _write_full_outfile(self, outfile, multipeptides, fraction_needed_selected):
        """Write the stored original row of every selected peakgroup (readmethod "full")."""
        header_first = self._output_header(["align_runid", "align_origfilename"])
        header_dict = dict((name, i) for i, name in enumerate(header_first))
        run_id_col = header_dict.get("run_id")

        with open(outfile, "w") as out_fh:
            writer = csv.writer(out_fh, delimiter="\t")
            writer.writerow(header_first)
            for m in multipeptides:
                if not self._enough_runs_selected(m, fraction_needed_selected):
                    continue
                for p in m.get_peptides():
                    selected_pg = p.get_selected_peakgroup()
                    if selected_pg is None:
                        continue
                    aligned_run_id = selected_pg.run.get_id()
                    # copy the row, the peakgroup keeps its original data
                    row_to_write = list(selected_pg.row)
                    row_to_write += [aligned_run_id, selected_pg.run.orig_filename]
                    # Replace run_id with the aligned id (align_runid) ->
                    # otherwise the run_id is not guaranteed to be unique
                    if run_id_col is not None:
                        row_to_write[run_id_col] = aligned_run_id
                    writer.writerow(row_to_write)

    def _write_filtered_infiles(self, outfile, infiles, selected_ids_dict, file_format):
        """Copy those rows of the input files to outfile whose feature id was selected."""
        name_of_id_col_map = {"openswath": "id", "peakview_preprocess": "preprocess_id"}
        name_of_trgr_col_map = {"openswath": "transition_group_id", "peakview_preprocess": "Pep Index"}
        name_of_id_col = name_of_id_col_map[file_format]
        name_of_trgr_col = name_of_trgr_col_map[file_format]

        # Only in openswath we have the ID and can go back to the original file.
        # We can write out the complete original files.
        header_first = self._output_header(["align_runid", "align_origfilename", "align_clusterid"])
        with open(outfile, "w") as out_fh:
            writer = csv.writer(out_fh, delimiter="\t")
            writer.writerow(header_first)
            for f in infiles:
                with self._open_text(f) as filehandler:
                    reader = csv.reader(filehandler, delimiter="\t")
                    header = next(reader)
                    header_dict = dict((name, i) for i, name in enumerate(header))
                    for row in reader:
                        f_id = row[header_dict[name_of_id_col]]
                        if f_id not in selected_ids_dict:
                            continue
                        pg = selected_ids_dict[f_id]
                        # Check the "id" and "transition_group_id" field.
                        # Unfortunately the id can be non-unique, therefore we check both.
                        trgroup_id = pg.peptide.get_id()
                        unique_peptide_id = row[header_dict[name_of_trgr_col]]
                        if unique_peptide_id != trgroup_id:
                            continue
                        aligned_run_id = pg.peptide.run.get_id()
                        row_to_write = row
                        row_to_write += [aligned_run_id, f, pg.get_cluster_id()]
                        # Replace run_id with the aligned id (align_runid) ->
                        # otherwise the run_id is not guaranteed to be unique
                        if file_format == "openswath":
                            row_to_write[header_dict["run_id"]] = aligned_run_id
                        writer.writerow(row_to_write)

    def _write_yaml(self, yaml_outfile, outfile, options, alignment, tree):
        """Write the YAML summary (parameters, tree, quantification, trafo files)."""
        import yaml
        myYaml = {"Commandline": sys.argv,
                  "RawData": [],
                  "PeakGroupData": [outfile],
                  "ReferenceRun": self.transformation_collection.getReferenceRunID(),
                  "FeatureAlignment": {
                      "RawInputParameters": options.__dict__,
                      "Parameters": {}
                  },
                  "Parameters": {}
                  }
        myYaml["Output"] = {}
        myYaml["Output"]["Tree"] = {}
        if tree is not None:
            # tree holds pairs of indices into self.runs
            myYaml["Output"]["Tree"]["Raw"] = [list(t) for t in tree]
            myYaml["Output"]["Tree"]["Mapped"] = [
                [self.runs[a].get_id(), self.runs[b].get_id()] for a, b in tree]
            myYaml["Output"]["Tree"]["MappedFile"] = [
                [self.runs[a].get_openswath_filename(), self.runs[b].get_openswath_filename()] for a, b in tree]
            myYaml["Output"]["Tree"]["MappedFileInput"] = [
                [self.runs[a].get_original_filename(), self.runs[b].get_original_filename()] for a, b in tree]
        myYaml["Output"]["Quantification"] = alignment.to_yaml()
        myYaml["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff)  # deprecated
        myYaml["FeatureAlignment"]["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff)
        myYaml["FeatureAlignment"]["Parameters"]["fdr_cutoff"] = float(options.fdr_cutoff)
        myYaml["FeatureAlignment"]["Parameters"]["aligned_fdr_cutoff"] = float(options.aligned_fdr_cutoff)
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._getTrafoFilename(current_run, ref_id)
            dirpath = os.path.dirname(current_run.orig_filename)
            ### Use real path (not very useful when moving data from one computer to another)
            ### filename = os.path.realpath(filename)
            ### dirpath = os.path.realpath(dirpath)
            this = {"id": current_id, "directory": dirpath, "trafo_file": filename}
            myYaml["RawData"].append(this)
        with open(yaml_outfile, 'w') as yaml_fh:
            yaml_fh.write(yaml.dump({"AlignedSwathRuns": myYaml}))

    def write_to_file(self, multipeptides, options, alignment, tree=None, writeTrafoFiles=True):
        """
        Write all requested output files of the alignment.

        Args:
            multipeptides(list): the aligned multipeptides
            options: object with the attributes infiles, outfile,
                matrix_outfile, yaml_outfile, ids_outfile, min_frac_selected,
                file_format, readmethod, matrix_output_method, fdr_cutoff and
                aligned_fdr_cutoff; an output is skipped if its filename is
                empty
            alignment: statistics object, its to_yaml() result is stored in
                the YAML file
            tree: optional list of index pairs into self.runs (stored in the
                YAML file)
            writeTrafoFiles(bool): whether to write the .tr files
        """
        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        # 1. Collect ids of selected features
        selected_pgs = []
        for m in multipeptides:
            if not self._enough_runs_selected(m, fraction_needed_selected):
                continue
            for p in m.getAllPeptides():
                selected_pgs.extend(p.getClusteredPeakgroups())
        selected_ids_dict = dict((pg.get_feature_id(), pg) for pg in selected_pgs)

        # 2. Write out the (selected) ids
        if len(ids_outfile) > 0:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in sorted(selected_pgs):
                    id_writer.writerow([pg.get_feature_id()])

        # 3. Write out the matrix outfile
        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides,
                                  fraction_needed_selected,
                                  style=options.matrix_output_method,
                                  aligner_mscore_treshold=options.fdr_cutoff)

        # 4. Write out the full outfile
        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files
            self._write_full_outfile(outfile, multipeptides, fraction_needed_selected)
        elif len(outfile) > 0 and file_format in ["openswath", "peakview_preprocess"]:
            self._write_filtered_infiles(outfile, infiles, selected_ids_dict, file_format)

        # 5. Write out the .tr transformation files
        if writeTrafoFiles:
            self._write_trafo_files()

        # 6. Write out the YAML file
        if len(yaml_outfile) > 0:
            self._write_yaml(yaml_outfile, outfile, options, alignment, tree)
def _read_trafo(self, trafo_filenames):
    """
    Read the transformation files listed in trafo_filenames.

    Args:
        trafo_filenames(list(dict)): one dictionary per file, the file name
            is taken from its "trafo_file" entry
    """
    # NOTE(review): the collection built here is neither returned nor stored
    # in the visible code - confirm whether callers expect a result.
    collection = TransformationCollection()
    # Resolve all file names before the first file is read
    names = [entry["trafo_file"] for entry in trafo_filenames]
    for name in names:
        collection.readTransformationData(name)
    collection.initialize_from_data(reverse=True)
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of
    which may contain the same precursors.

    In addition to what MRExperiment provides, this class holds the
    TransformationCollection of the alignment, prints alignment statistics
    and writes the result files (feature ids, matrix, aligned peakgroups,
    .tr transformation files and a YAML summary).

    NOTE(review): the methods read self.runs,
    self.union_transition_groups_set, self.estimated_decoy_pcnt and
    self.initial_fdr_cutoff, none of which are assigned in this class -
    presumably they are provided by MRExperiment or set by the caller;
    confirm.
    """

    def __init__(self):
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def get_max_pg(self):
        """Return the maximal number of peakgroups (number of runs times number of transition groups)."""
        return len(self.runs)*len(self.union_transition_groups_set)

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """
        Estimate the real FDR from the decoys among the selected peakgroups.

        Only multipeptides for which
        more_than_fraction_selected(fraction_needed_selected) is true are
        counted.

        Args:
            multipeptides(list): the multipeptides to evaluate
            fraction_needed_selected(float): passed to
                more_than_fraction_selected of each multipeptide

        Returns:
            An object with the attributes nr_decoys, nr_targets, decoy_pcnt
            and est_real_fdr. The last two remain 0.0 if
            self.estimated_decoy_pcnt is None or if no peakgroup was counted.
        """
        class DecoyStats(object):
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0

        d = DecoyStats()
        precursors_to_be_used = [m for m in multipeptides
                                 if m.more_than_fraction_selected(fraction_needed_selected)]

        # count the decoys (and targets): every selected peakgroup of a
        # precursor is attributed to the decoy status of its best peptide
        for prec in precursors_to_be_used:
            nr_selected = len(prec.get_selected_peakgroups())
            if prec.find_best_peptide_pg().peptide.get_decoy():
                d.nr_decoys += nr_selected
            else:
                d.nr_targets += nr_selected

        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ratio obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease relative to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None:
            return d
        if (d.nr_targets + d.nr_decoys) == 0:
            return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys))
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, alignment, outlier_detection, fdr_cutoff, fraction_present, min_nrruns):
        """
        Print a human readable summary of the alignment.

        Args:
            multipeptides(list): the multipeptides after alignment
            alignment: object providing nr_quantified, nr_aligned, nr_changed
                and could_not_align
            outlier_detection: object providing nr_outliers, outlier_pg and
                outliers_changed, or None to skip the outlier line
            fdr_cutoff(float): cutoff handed to all_above_cutoff; printed as
                a percentage
            fraction_present(float): fraction used for the "partially
                aligned" decoy estimate
            min_nrruns: only used in the printed messages
        """
        def best_peptide(m):
            return m.find_best_peptide_pg().peptide

        def is_target(m):
            return not best_peptide(m).get_decoy()

        nr_precursors_total = len(self.union_transition_groups_set)

        # Do statistics and print out
        above_cutoff = [m for m in multipeptides if m.all_above_cutoff(fdr_cutoff)]
        in_all_runs_wo_align = len([m for m in above_cutoff if not m.get_decoy()])
        proteins_in_all_runs_wo_align_target = len(set(
            best_peptide(m).protein_name for m in above_cutoff if is_target(m)))
        nr_all_proteins = len(set(
            best_peptide(m).protein_name for m in multipeptides if is_target(m)))
        nr_all_peptides = len(set(
            best_peptide(m).sequence for m in multipeptides if is_target(m)))
        peptides_in_all_runs_wo_align_target = len(set(
            best_peptide(m).sequence for m in above_cutoff if is_target(m)))

        print("Present targets in all runs", in_all_runs_wo_align)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        precursors_quantified = [m for m in multipeptides if len(m.get_selected_peakgroups()) > 0]

        decoy_precursors = len([m for m in precursors_quantified if best_peptide(m).get_decoy()])

        targets_in_all_runs = [m for m in precursors_in_all_runs if is_target(m)]
        nr_peptides_target = len(set(best_peptide(m).sequence for m in targets_in_all_runs))
        nr_proteins_target = len(set(best_peptide(m).protein_name for m in targets_in_all_runs))

        targets_quantified = [m for m in precursors_quantified if is_target(m)]
        nr_precursors_to_quant = len(set(targets_quantified))
        nr_proteins_to_quant = len(set(best_peptide(m).protein_name for m in targets_quantified))
        nr_peptides_to_quant = len(set(best_peptide(m).sequence for m in targets_quantified))

        nr_precursors_in_all = len([m for m in precursors_in_all_runs if not m.get_decoy()])
        max_pg = self.get_max_pg()
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        print("="*75)
        print("="*75)
        print("Total we have", len(self.runs), "runs with", len(self.union_transition_groups_set),
              "peakgroups quantified in at least %s run(s) above FDR %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " +
              "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg, "peakgroups of which we aligned",
              alignment.nr_aligned, "and changed order of", alignment.nr_changed,
              "and could not align", alignment.could_not_align)
        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_precursors_to_quant, nr_precursors_total, min_nrruns, nr_precursors_in_all, in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_peptides_to_quant, nr_all_peptides, min_nrruns, nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_proteins_to_quant, nr_all_proteins, min_nrruns, nr_proteins_target, proteins_in_all_runs_wo_align_target))

        # Get decoy estimates
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys,
                dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr*100))
            print("Decoy percentage of peakgroups that are partially aligned %1.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys,
                dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr*100))
            # Guard the percentage so that an empty result cannot raise a
            # ZeroDivisionError while only printing statistics.
            nr_all_quantified = nr_precursors_to_quant + decoy_precursors
            decoy_precursor_pcnt = 0.0
            if nr_all_quantified > 0:
                decoy_precursor_pcnt = decoy_precursors * 100.0 / nr_all_quantified
            print("There were", decoy_precursors, "decoy precursors identified out of",
                  nr_all_quantified, "precursors which is %0.4f %%" % (decoy_precursor_pcnt))

        if outlier_detection is not None:
            print("Outliers:", outlier_detection.nr_outliers, "outliers in", len(multipeptides),
                  "peptides or", outlier_detection.outlier_pg, "peakgroups out of",
                  alignment.nr_quantified, "changed", outlier_detection.outliers_changed)

    def _enough_runs_selected(self, m, fraction_needed_selected):
        """Return False if too small a fraction of the runs has a selected peakgroup for m."""
        selected_peakgroups = m.get_selected_peakgroups()
        return not (len(selected_peakgroups)*1.0 / len(self.runs) < fraction_needed_selected)

    def _output_header(self, extra_columns):
        """
        Return a copy of the header shared by all runs, extended by extra_columns.

        A copy is used so that the header stored on the run objects is not
        modified by writing an output file.
        """
        header_first = list(self.runs[0].header)
        for run in self.runs:
            assert header_first == run.header
        return header_first + list(extra_columns)

    def _transformation_filename(self, current_run, ref_id):
        """Return the path of the .tr file of current_run, located next to its original input file."""
        return os.path.join(os.path.dirname(current_run.orig_filename),
                            "transformation-%s-%s.tr" % (current_run.get_id(), ref_id))

    def write_to_file(self, multipeptides, options):
        """
        Write all requested output files of the alignment.

        Args:
            multipeptides(list): the aligned multipeptides
            options: object with the attributes infiles, outfile,
                matrix_outfile, yaml_outfile, ids_outfile, min_frac_selected,
                file_format and readmethod; the ids, matrix, peakgroup and
                YAML outputs are skipped if their filename is empty

        Returns:
            list(str): the names of the written .tr transformation files
        """
        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        # Collect the selected peakgroup of every peptide that passes the
        # minimal-fraction filter
        selected_pgs = []
        for m in multipeptides:
            if not self._enough_runs_selected(m, fraction_needed_selected):
                continue
            for p in m.get_peptides():
                selected_pg = p.get_selected_peakgroup()
                if selected_pg is None:
                    continue
                selected_pgs.append(selected_pg)
        selected_ids_dict = dict((pg.get_feature_id(), pg) for pg in selected_pgs)

        if len(ids_outfile) > 0:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in selected_pgs:
                    id_writer.writerow([pg.get_feature_id()])

        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides, fraction_needed_selected)

        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files
            header_first = self._output_header(["align_runid", "align_origfilename"])
            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_first)
                for pg in selected_pgs:
                    # copy the row, the peakgroup keeps its original data
                    row_to_write = list(pg.row)
                    row_to_write += [pg.run.get_id(), pg.run.orig_filename]
                    writer.writerow(row_to_write)
        elif len(outfile) > 0 and file_format == "openswath":
            # only in openswath we have the ID and can go back to the original file ...
            # write out the complete original files
            header_first = self._output_header(["align_runid", "align_origfilename"])
            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_first)
                for f in infiles:
                    with open(f) as in_fh:
                        reader = csv.reader(in_fh, delimiter="\t")
                        header = next(reader)
                        header_dict = dict((name, i) for i, name in enumerate(header))
                        for row in reader:
                            f_id = row[header_dict["id"]]
                            if f_id not in selected_ids_dict:
                                continue
                            pg = selected_ids_dict[f_id]
                            # Check the "id" and "transition_group_id" field.
                            # Unfortunately the id can be non-unique, therefore we check both.
                            trgroup_id = pg.peptide.get_id()
                            unique_peptide_id = row[header_dict["transition_group_id"]]
                            if unique_peptide_id == trgroup_id:
                                row_to_write = row
                                row_to_write += [pg.peptide.run.get_id(), f]
                                writer.writerow(row_to_write)

        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
            current_id = current_run.get_id()
            # looked up per run on purpose: the collection is modified by
            # readTransformationData inside this loop
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._transformation_filename(current_run, ref_id)
            trafo_fnames.append(filename)
            self.transformation_collection.writeTransformationData(filename, current_id, ref_id)
            self.transformation_collection.readTransformationData(filename)

        if len(yaml_outfile) > 0:
            import yaml
            myYaml = {"RawData": [],
                      "PeakGroupData": [outfile],
                      "ReferenceRun": self.transformation_collection.getReferenceRunID()}
            for current_run in self.runs:
                current_id = current_run.get_id()
                ref_id = self.transformation_collection.getReferenceRunID()
                filename = self._transformation_filename(current_run, ref_id)
                dirpath = os.path.realpath(os.path.dirname(current_run.orig_filename))
                this = {"id": current_id, "directory": dirpath,
                        "trafo_file": os.path.realpath(filename)}
                myYaml["RawData"].append(this)
            with open(yaml_outfile, 'w') as yaml_fh:
                yaml_fh.write(yaml.dump({"AlignedSwathRuns": myYaml}))

        return trafo_fnames
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.
    """

    def __init__(self):
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def get_max_pg(self):
        """Return the number of runs multiplied by the size of the union transition group set."""
        return len(self.runs) * len(self.union_transition_groups_set)

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """Estimate the FDR among the selected peakgroups from the decoy counts.

        Only precursors for which ``more_than_fraction_selected`` holds for
        ``fraction_needed_selected`` are considered.

        Returns an object with the attributes ``nr_decoys``, ``nr_targets``,
        ``decoy_pcnt`` and ``est_real_fdr``.  The last two stay at 0.0 when
        ``self.estimated_decoy_pcnt`` is None or when no peakgroup was counted.

        NOTE(review): ``estimated_decoy_pcnt`` and ``initial_fdr_cutoff`` are
        not set in this class - presumably provided by the base class or by
        the caller; confirm.
        """

        class DecoyStats(object):
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0

        d = DecoyStats()

        # Count selected peakgroups separately for decoy and target
        # precursors (decided by the best peptide of each precursor).
        for prec in multipeptides:
            if not prec.more_than_fraction_selected(fraction_needed_selected):
                continue
            nr_selected = len(prec.get_selected_peakgroups())
            if prec.find_best_peptide_pg().peptide.get_decoy():
                d.nr_decoys += nr_selected
            else:
                d.nr_targets += nr_selected

        # Estimate the real FDR by calculating the decoy ratio and dividing it
        # by the decoy ratio obtained at the initial FDR cutoff, which gives
        # the decoy in-/decrease relative to that cutoff. To get the absolute
        # value, multiply by the cutoff again (it was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None:
            return d
        nr_total = d.nr_targets + d.nr_decoys
        if nr_total == 0:
            return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / nr_total)
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, alignment, outlier_detection, fdr_cutoff, fraction_present, min_nrruns):
        """Print summary statistics of the alignment to stdout.

        ``alignment`` must provide ``nr_quantified``, ``nr_aligned``,
        ``nr_changed`` and ``could_not_align``.  ``outlier_detection`` may be
        None; otherwise it must provide ``nr_outliers``, ``outlier_pg`` and
        ``outliers_changed``.
        """
        nr_precursors_total = len(self.union_transition_groups_set)

        # Resolve the best peptide of every precursor once instead of once
        # per statistic.
        best_peptides = [(m, m.find_best_peptide_pg().peptide) for m in multipeptides]
        targets = [(m, pep) for m, pep in best_peptides if not pep.get_decoy()]

        # Statistics before alignment (everything above the cutoff in all runs)
        targets_wo_align = [(m, pep) for m, pep in targets if m.all_above_cutoff(fdr_cutoff)]
        in_all_runs_wo_align = sum(
            1 for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy())
        proteins_in_all_runs_wo_align_target = len({pep.protein_name for _, pep in targets_wo_align})
        peptides_in_all_runs_wo_align_target = len({pep.sequence for _, pep in targets_wo_align})
        nr_all_proteins = len({pep.protein_name for _, pep in targets})
        nr_all_peptides = len({pep.sequence for _, pep in targets})

        print("Present targets in all runs", in_all_runs_wo_align)

        # Statistics after alignment
        precursors_in_all_runs = [(m, pep) for m, pep in best_peptides if m.all_selected()]
        precursors_quantified = [
            (m, pep) for m, pep in best_peptides if len(m.get_selected_peakgroups()) > 0]
        decoy_precursors = sum(1 for _, pep in precursors_quantified if pep.get_decoy())

        targets_in_all_runs = [pep for _, pep in precursors_in_all_runs if not pep.get_decoy()]
        nr_peptides_target = len({pep.sequence for pep in targets_in_all_runs})
        nr_proteins_target = len({pep.protein_name for pep in targets_in_all_runs})

        targets_quantified = [(m, pep) for m, pep in precursors_quantified if not pep.get_decoy()]
        nr_precursors_to_quant = len({m for m, _ in targets_quantified})
        nr_proteins_to_quant = len({pep.protein_name for _, pep in targets_quantified})
        nr_peptides_to_quant = len({pep.sequence for _, pep in targets_quantified})

        nr_precursors_in_all = sum(
            1 for m in multipeptides if m.all_selected() and not m.get_decoy())

        max_pg = self.get_max_pg()
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        print("=" * 75)
        print("=" * 75)
        print("Total we have", len(self.runs), "runs with", len(self.union_transition_groups_set),
              "peakgroups quantified in at least %s run(s) above FDR %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " +
              "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg, "peakgroups of which we aligned",
              alignment.nr_aligned, "and changed order of", alignment.nr_changed, "and could not align", alignment.could_not_align)

        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_precursors_to_quant, nr_precursors_total, min_nrruns, nr_precursors_in_all, in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_peptides_to_quant, nr_all_peptides, min_nrruns, nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
            nr_proteins_to_quant, nr_all_proteins, min_nrruns, nr_proteins_target, proteins_in_all_runs_wo_align_target))

        # Get decoy estimates
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys, dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr * 100))

            print("Decoy percentage of peakgroups that are partially aligned %1.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys, dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr * 100))

            print("There were", decoy_precursors, "decoy precursors identified out of", nr_precursors_to_quant + decoy_precursors,
                  "precursors which is %0.4f %%" % (decoy_precursors * 100.0 / (nr_precursors_to_quant + decoy_precursors)))

        if outlier_detection is not None:
            print("Outliers:", outlier_detection.nr_outliers, "outliers in", len(multipeptides), "peptides or",
                  outlier_detection.outlier_pg, "peakgroups out of", alignment.nr_quantified,
                  "changed", outlier_detection.outliers_changed)

    def _collect_selected_peakgroups(self, multipeptides, fraction_needed_selected):
        """Return the selected peakgroups of all precursors that were selected in a sufficient fraction of runs."""
        selected_pgs = []
        for m in multipeptides:
            selected_peakgroups = m.get_selected_peakgroups()
            if (len(selected_peakgroups) * 1.0 / len(self.runs) < fraction_needed_selected):
                continue
            for p in m.get_peptides():
                selected_pg = p.get_selected_peakgroup()
                if selected_pg is not None:
                    selected_pgs.append(selected_pg)
        return selected_pgs

    def _write_output_header(self, writer):
        """Check that all runs share one header and write it, extended by the two alignment columns."""
        header_first = self.runs[0].header
        for run in self.runs:
            assert header_first == run.header
        # Build a new list: extending in place would modify the header stored
        # on the first run (and break the check above on a second call).
        writer.writerow(list(header_first) + ["align_runid", "align_origfilename"])

    @staticmethod
    def _transformation_filename(current_run, ref_id):
        """Return the path of the transformation file of a run, located next to the run's original file."""
        return os.path.join(os.path.dirname(current_run.orig_filename),
                            "transformation-%s-%s.tr" % (current_run.get_id(), ref_id))

    def write_to_file(self, multipeptides, options):
        """Write the alignment results to the files requested in ``options``.

        Reads ``infiles``, ``outfile``, ``matrix_outfile``, ``yaml_outfile``,
        ``ids_outfile``, ``min_frac_selected`` and ``file_format`` from
        ``options`` (plus ``readmethod`` when ``outfile`` is set).  Depending
        on which are non-empty this writes the ids of the selected peakgroups,
        the matrix file, the aligned peakgroup table and a YAML summary.  One
        transformation file per run is always written.

        Returns the list of transformation file names.
        """
        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        selected_pgs = self._collect_selected_peakgroups(multipeptides, fraction_needed_selected)
        selected_ids_dict = {pg.get_feature_id(): pg for pg in selected_pgs}

        if ids_outfile:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in selected_pgs:
                    id_writer.writerow([pg.get_feature_id()])

        if matrix_outfile:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides, fraction_needed_selected)

        if outfile and options.readmethod == "full":
            # The complete rows are available in memory, write them directly.
            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                self._write_output_header(writer)
                for selected_pg in selected_pgs:
                    # Concatenate into a new list so the stored row is not modified.
                    writer.writerow(list(selected_pg.row) +
                                    [selected_pg.run.get_id(), selected_pg.run.orig_filename])

        elif outfile and file_format == "openswath":
            # Only in openswath we have the ID and can go back to the original file:
            # copy the selected rows out of the complete original files.
            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                self._write_output_header(writer)
                for f in infiles:
                    with open(f) as in_fh:
                        reader = csv.reader(in_fh, delimiter="\t")
                        header = next(reader)
                        header_dict = {name: idx for idx, name in enumerate(header)}
                        for row in reader:
                            f_id = row[header_dict["id"]]
                            if f_id not in selected_ids_dict:
                                continue
                            # Check the "id" and "transition_group_id" field.
                            # Unfortunately the id can be non-unique, therefore we check both.
                            peptide = selected_ids_dict[f_id].peptide
                            if row[header_dict["transition_group_id"]] == peptide.get_id():
                                writer.writerow(row + [peptide.run.get_id(), f])

        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._transformation_filename(current_run, ref_id)
            trafo_fnames.append(filename)
            self.transformation_collection.writeTransformationData(filename, current_id, ref_id)
            self.transformation_collection.readTransformationData(filename)

        if yaml_outfile:
            import yaml
            myYaml = {"RawData": [], "PeakGroupData": [outfile],
                      "ReferenceRun": self.transformation_collection.getReferenceRunID()}
            for current_run in self.runs:
                ref_id = self.transformation_collection.getReferenceRunID()
                filename = self._transformation_filename(current_run, ref_id)
                dirpath = os.path.realpath(os.path.dirname(current_run.orig_filename))
                myYaml["RawData"].append({"id": current_run.get_id(),
                                          "directory": dirpath,
                                          "trafo_file": os.path.realpath(filename)})
            with open(yaml_outfile, 'w') as yaml_fh:
                yaml_fh.write(yaml.dump({"AlignedSwathRuns": myYaml}))

        return trafo_fnames