Example #1
0
 def __init__(self,
              alignment_fdr_threshold=0.0001,
              smoother="lowess",
              external_r_tmpdir=None,
              maxdata=-1,
              experiment=None):
   """Record the alignment settings on the instance.

   Args:
       alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
       smoother: kept as ``smoother``.
       external_r_tmpdir: kept as ``tmpdir_``.
       maxdata: kept as ``max_data_``.
       experiment: kept as ``_experiment``.
   """
   # Settings are stored verbatim; no validation happens here.
   self.alignment_fdr_threshold_ = alignment_fdr_threshold
   self.smoother = smoother
   self.tmpdir_ = external_r_tmpdir
   self.max_data_ = maxdata
   self._experiment = experiment

   # State that starts out empty.
   self._cacher = None
   self.transformation_collection = TransformationCollection()
Example #2
0
    def __init__(self, alignment_fdr_threshold=0.0001, smoother="lowess",
                 external_r_tmpdir=None, maxdata=-1, experiment=None):
        """Record the alignment settings and try to set up the Cython cacher.

        Args:
            alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
            smoother: kept as ``smoother``.
            external_r_tmpdir: kept as ``tmpdir_``.
            maxdata: kept as ``max_data_``.
            experiment: kept as ``_experiment``.
        """
        # Settings are stored verbatim; no validation happens here.
        self.alignment_fdr_threshold_ = alignment_fdr_threshold
        self.smoother = smoother
        self.tmpdir_ = external_r_tmpdir
        self.max_data_ = maxdata
        self._experiment = experiment

        # State that starts out empty.
        self.transformation_collection = TransformationCollection()
        self._cacher = None
        self._cy_cacher = None

        # The Cython cacher is optional: when it cannot be imported,
        # _cy_cacher simply stays None and a warning is printed.
        try:
            from msproteomicstoolslib.algorithms.alignment.DataCacher import CyDataCacher
            self._cy_cacher = CyDataCacher()
        except ImportError:
            warning = "WARNING: cannot import CyDataCacher, will use Python version (slower)."
            print(warning)
Example #3
0
    def __init__(self,
                 alignment_fdr_threshold=0.0001,
                 smoother="lowess",
                 external_r_tmpdir=None,
                 maxdata=-1,
                 experiment=None):
      """Record the alignment settings and try to set up the Cython cacher.

      Args:
          alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
          smoother: kept as ``smoother``.
          external_r_tmpdir: kept as ``tmpdir_``.
          maxdata: kept as ``max_data_``.
          experiment: kept as ``_experiment``.
      """
      # Settings are stored verbatim; no validation happens here.
      self.alignment_fdr_threshold_ = alignment_fdr_threshold
      self.smoother = smoother
      self.tmpdir_ = external_r_tmpdir
      self.max_data_ = maxdata
      self._experiment = experiment

      # State that starts out empty.
      self.transformation_collection = TransformationCollection()
      self._cacher = None
      self._cy_cacher = None

      # The Cython cacher is optional: when it cannot be imported,
      # _cy_cacher simply stays None and a warning is printed.
      try:
          from msproteomicstoolslib.algorithms.alignment.DataCacher import CyDataCacher
          self._cy_cacher = CyDataCacher()
      except ImportError:
          warning = "WARNING: cannot import CyDataCacher, will use Python version (slower)."
          print(warning)
 def __init__(self,
              alignment_fdr_threshold=0.0001,
              smoother="lowess",
              external_r_tmpdir=None,
              maxdata=-1):
   """Record the alignment settings on the instance.

   Args:
       alignment_fdr_threshold: kept as ``alignment_fdr_threshold_``.
       smoother: kept as ``smoother``.
       external_r_tmpdir: kept as ``tmpdir_``.
       maxdata: kept as ``max_data_``.
   """
   # Settings are stored verbatim; no validation happens here.
   self.alignment_fdr_threshold_ = alignment_fdr_threshold
   self.smoother = smoother
   self.tmpdir_ = external_r_tmpdir
   self.max_data_ = maxdata

   # Starts out as a freshly constructed, empty collection.
   self.transformation_collection = TransformationCollection()
class SplineAligner():
    """
    Use the datasmoothing part of msproteomicstoolslib to align two runs in
    retention times using splines.

    >>> spl_aligner = SplineAligner()
    >>> transformations = spl_aligner.rt_align_all_runs(this_exp, multipeptides)
    """
    def __init__(self, alignment_fdr_threshold = 0.0001, smoother="lowess", external_r_tmpdir=None, maxdata=-1):
        """Store the alignment settings.

        Args:
            alignment_fdr_threshold: score cutoff; only peakgroups whose
                ``get_fdr_score()`` is strictly below it are used.
            smoother: passed to ``smoothing.getSmoothingObj`` as ``smoother``.
            external_r_tmpdir: passed to ``smoothing.getSmoothingObj`` as
                ``tmpdir``.
            maxdata: maximal number of data points used per alignment
                (the best-scoring ones are kept); -1 means use all of them.
        """
        self.transformation_collection = TransformationCollection()
        self.alignment_fdr_threshold_ = alignment_fdr_threshold
        self.smoother = smoother
        self.tmpdir_ = external_r_tmpdir
        self.max_data_ = maxdata
        # Filled by rt_align_all_runs(); defined up front so that
        # getTransformationError() can be called before any alignment was done.
        self.transformation_error = None

    def _determine_best_run(self, experiment):
        """Return the run with the most non-decoy peptides passing the cutoff.

        A peptide counts when the score of its best peakgroup is strictly
        below ``alignment_fdr_threshold_``. On ties the first such run in
        ``experiment.runs`` wins.

        Raises:
            IndexError: if ``experiment.runs`` is empty.
        """
        maxcount = -1
        bestrun = -1
        for run in experiment.runs:
            cnt = 0
            for prgroup in run:
                for peptide in prgroup:
                    if peptide.get_decoy(): continue
                    pg = peptide.get_best_peakgroup()
                    if pg.get_fdr_score() < self.alignment_fdr_threshold_:
                        cnt += 1
            if cnt > maxcount:
                maxcount = cnt
                bestrun = run.get_id()
        # NOTE: the message says "above" although the test is "score < cutoff";
        # the text is left untouched in case log output is parsed somewhere.
        print("Found best run", bestrun, "with %s features above the cutoff of %s%%" % (maxcount, self.alignment_fdr_threshold_))
        return [r for r in experiment.runs if r.get_id() == bestrun][0]

    def _getRTData(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run

        Only multipeptides that have exactly one peakgroup below the cutoff
        in both runs (and that are not decoys) contribute a data point.

        Returns:
            (data1, data2): two equally long lists of normalized retention
            times, data1 for the reference run (master) and data2 for the run
            to be aligned (slave), ordered by the reference peakgroup score
            and truncated to ``max_data_`` entries.
        """
        threshold = self.alignment_fdr_threshold_
        # The ids do not change while looping, so fetch them only once.
        run_id = run.get_id()
        ref_id = bestrun.get_id()

        data_tmp = []
        cnt_multiple = 0
        for m in multipeptides:

            try:
                prgr_ali = m.getPrecursorGroup(run_id)
                prgr_ref = m.getPrecursorGroup(ref_id)

                len_ali = sum(1 for pg in prgr_ali.getAllPeakgroups()
                              if pg.get_fdr_score() < threshold)
                len_ref = sum(1 for pg in prgr_ref.getAllPeakgroups()
                              if pg.get_fdr_score() < threshold)

                # Do not consider peakgroups that are missing in one run
                # Do not consider peakgroups that have more than one good peakgroup
                if len_ali != 1 or len_ref != 1:
                    if len_ali > 1 or len_ref > 1:
                        cnt_multiple += 1
                    continue

                ref_pep = prgr_ref.getOverallBestPeakgroup()
                align_pep = prgr_ali.getOverallBestPeakgroup()
            except KeyError:
                # it is possible that for some, no peak group exists in this run
                continue

            # Do not use decoy peptides
            if ref_pep.peptide.get_decoy() or align_pep.peptide.get_decoy():
                continue

            if ref_pep.get_fdr_score() < threshold and \
               align_pep.get_fdr_score() < threshold:
                data_tmp.append( (
                    ref_pep.get_fdr_score(),
                    ref_pep.get_normalized_retentiontime(),
                    align_pep.get_normalized_retentiontime()
                ) )

        if cnt_multiple > len(multipeptides) * 0.8 :
            print("")
            print("  Warning: Most of your data has more than one peakgroup with a score better than %s."  % self.alignment_fdr_threshold_)
            print("  This may be a problem for the alignment, please consider adjusting the --alignment_score option."  )

        maxdata = self.max_data_
        if maxdata == -1:
            # -1 means take all data
            maxdata = len(data_tmp)

        # Tuples sort by score first, so the slice keeps the best-scoring points.
        selected = sorted(data_tmp)[:maxdata]
        data1 = [d1 for _fdr, d1, _d2 in selected]
        data2 = [d2 for _fdr, _d1, d2 in selected]

        return data1,data2

    def _spline_align_runs(self, bestrun, run, multipeptides):
        """Will align run against bestrun

        Fits the smoother on the shared data points, records the
        transformation and its error (stdev and median of the residuals) and
        overwrites the retention time of every peakgroup in ``run`` with the
        predicted, aligned value.

        Raises:
            Exception: if fewer than two shared data points are available.
        """

        sm = smoothing.getSmoothingObj(smoother = self.smoother, tmpdir = self.tmpdir_)

        # get those peptides we want to use for alignment => for this use the mapping
        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data1,data2 = self._getRTData(bestrun, run, multipeptides)

        if len(data2) < 2:
            print("No common identifications between %s and %s. Only found %s features below a cutoff of %s" % ( 
                run.get_id(), bestrun.get_id(), len(data1), self.alignment_fdr_threshold_) )
            print("If you ran the feature_alignment.py script, try to skip the re-alignment step (e.g. remove the --realign_runs option)." )
            raise Exception("Not enough datapoints (less than 2 datapoints).")

        # Since we want to predict how to convert from slave to master, slave
        # is first and master is second.
        sm.initialize(data2, data1)
        data2_aligned = sm.predict(data2)

        # Store transformation in collection (from run to bestrun)
        self.transformation_collection.addTransformationData([data2, data1], run.get_id(), bestrun.get_id() )
        self.transformation_collection.addTransformedData(data2_aligned, run.get_id(), bestrun.get_id() )

        # Residuals of the fit, computed once and used for both statistics.
        residuals = numpy.array(data1) - numpy.array(data2_aligned)
        stdev = numpy.std(residuals)
        median = numpy.median(residuals)
        print("Will align run %s against %s, using %s features" % (run.get_id(), bestrun.get_id(), len(data1)) )
        print("  Computed stdev", stdev, "and median", median )

        # Store error for later. The container normally comes from
        # rt_align_all_runs(); create it here when this method is used directly.
        if self.transformation_error is None:
            self.transformation_error = TransformationError()
        d = self.transformation_error.transformations.get(run.get_id(), {})
        d[bestrun.get_id()] = [stdev, median]
        self.transformation_error.transformations[ run.get_id() ] = d

        # Now predict on _all_ data and write this back to the data
        rt_eval = []
        for prgr in run:
            for pep in prgr:
                rt_eval.extend(pg.get_normalized_retentiontime() for pg in pep.get_all_peakgroups())
        aligned_result = sm.predict(rt_eval)

        # Walk the peakgroups again in the same order and consume the
        # predictions one by one.
        # NOTE(review): this relies on pep.peakgroups_ having the same order and
        # length as pep.get_all_peakgroups(), and on slot 2 of each entry being
        # the retention time - confirm against the peptide class.
        i = 0
        for prgr in run:
            for pep in prgr:
                # TODO hack -> direct access to the internal peakgroups object
                mutable = [list(pg) for pg in pep.peakgroups_]
                for entry in mutable:
                    entry[2] = aligned_result[i]
                    i += 1
                pep.peakgroups_ = [ tuple(m) for m in mutable]

    def rt_align_all_runs(self, experiment, multipeptides):
        """ Align all runs contained in an MRExperiment

        Args:
            experiment(MRExperiment): a collection of runs
            multipeptides(list(multipeptides)): a list of Multipeptide derived from the above experiment

        Returns:
            The transformation collection holding one transformation per
            aligned run (every run except the reference run).
        """

        print("Will re-align runs" )

        # get the best run (e.g. the one with the most ids below threshold)
        bestrun = self._determine_best_run(experiment)

        ## spl_aligner.transformation_collection = experiment.transformation_collection
        self.transformation_collection.setReferenceRunID( bestrun.get_id() )
        # Start with a fresh error container for every full alignment.
        self.transformation_error = TransformationError()

        # go through all runs and align two runs at a time
        for run in experiment.runs:
            if run.get_id() == bestrun.get_id(): continue # do not align reference run itself
            self._spline_align_runs(bestrun, run, multipeptides)

        return self.transformation_collection

    def getTransformationError(self):
        """
        Get the error of the transformation

        Returns:
            transformation_error(:class:`.TransformationError`) : the error of
            the transformation, or None if no alignment has been performed yet
        """
        return self.transformation_error
Example #6
0
 def __init__(self):
     """Run the parent initialiser, then attach an empty TransformationCollection."""
     parent = super(Experiment, self)
     parent.__init__()
     # Starts out as a freshly constructed, empty collection.
     collection = TransformationCollection()
     self.transformation_collection = collection
Example #7
0
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.

    Besides what MRExperiment provides, it keeps a TransformationCollection
    and offers reporting (print_stats) and result writing (write_to_file).

    NOTE(review): the methods below read ``estimated_decoy_pcnt``,
    ``initial_fdr_cutoff``, ``nr_ambiguous`` and ``nr_multiple_align`` from
    the instance; they are not set in this class and are presumably assigned
    by the caller or the base class - confirm.
    """
    def __init__(self):
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """Estimate the FDR among the selected peakgroups from the decoy counts.

        Args:
            multipeptides: the multipeptides to evaluate.
            fraction_needed_selected: only multipeptides for which
                ``more_than_fraction_selected`` is true for this value are used.

        Returns:
            An object with the attributes ``nr_decoys``, ``nr_targets``,
            ``decoy_pcnt`` and ``est_real_fdr``. The last two stay at 0.0 when
            ``self.estimated_decoy_pcnt`` is None or nothing was selected.
        """
        class DecoyStats():
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0

        d = DecoyStats()
        precursors_to_be_used = [
            m for m in multipeptides
            if m.more_than_fraction_selected(fraction_needed_selected)
        ]

        # count the decoys
        d.nr_decoys = sum(
            len(prec.get_selected_peakgroups())
            for prec in precursors_to_be_used
            if prec.find_best_peptide_pg().peptide.get_decoy()
        )
        d.nr_targets = sum(
            len(prec.get_selected_peakgroups())
            for prec in precursors_to_be_used
            if not prec.find_best_peptide_pg().peptide.get_decoy()
        )
        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ratio obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease relative to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None: return d
        if (d.nr_targets + d.nr_decoys) == 0: return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys))
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, fdr_cutoff, fraction_present,
                    min_nrruns):
        """Print a summary of the alignment to stdout.

        Every message is emitted through a single-argument ``print(...)`` so
        that the output is the same under Python 2 and Python 3.

        Args:
            multipeptides: the aligned multipeptides.
            fdr_cutoff: score cutoff used for the "before alignment" counts.
            fraction_present: fraction passed to estimate_real_fdr for the
                "partially aligned" decoy statistics.
            min_nrruns: only used in the printed text.
        """
        from itertools import groupby

        alignment = AlignmentStatistics()
        alignment.count(multipeptides, fdr_cutoff)

        # Count presence in all runs (before alignment)
        precursors_in_all_runs_wo_align = len([
            1 for m in multipeptides
            if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy()
        ])
        proteins_in_all_runs_wo_align_target = len(
            set([
                m.find_best_peptide_pg().peptide.protein_name
                for m in multipeptides if m.all_above_cutoff(fdr_cutoff)
                and not m.find_best_peptide_pg().peptide.get_decoy()
            ]))
        peptides_in_all_runs_wo_align_target = len(
            set([
                m.find_best_peptide_pg().peptide.sequence
                for m in multipeptides if m.all_above_cutoff(fdr_cutoff)
                and not m.find_best_peptide_pg().peptide.get_decoy()
            ]))

        # Count presence in all runs (after alignment)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        nr_peptides_target = len(
            set([
                prec.find_best_peptide_pg().peptide.sequence
                for prec in precursors_in_all_runs
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_proteins_target = len(
            set([
                prec.find_best_peptide_pg().peptide.protein_name
                for prec in precursors_in_all_runs
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))

        nr_precursors_in_all = len([
            1 for m in multipeptides if m.all_selected() and not m.get_decoy()
        ])
        max_pg = alignment.nr_good_precursors * len(self.runs)
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        # Get single/multiple hits stats
        precursors_quantified = [
            m for m in multipeptides if len(m.get_selected_peakgroups()) > 0
        ]
        target_quant_protein_list = [
            prec.find_best_peptide_pg().peptide.protein_name
            for prec in precursors_quantified
            if not prec.find_best_peptide_pg().peptide.get_decoy()
        ]
        # groupby only merges adjacent equal items, hence the sort
        target_quant_protein_list.sort()
        hits_per_protein = [
            len(list(group))
            for _key, group in groupby(target_quant_protein_list)
        ]
        nr_sh_target_proteins = sum(1 for hits in hits_per_protein if hits == 1)
        nr_mh_target_proteins = sum(1 for hits in hits_per_protein if hits > 1)

        #
        ###########################################################################
        #
        print("=" * 75)
        print("=" * 75)
        print("Total we have %s runs with %s peakgroups quantified in at least %s run(s) below m_score (q-value) %0.4f %%, giving maximally nr peakgroups %s" % (
            len(self.runs), alignment.nr_good_precursors, min_nrruns,
            fdr_cutoff * 100, max_pg))
        print("We were able to quantify %s / %s peakgroups of which we aligned %s" % (
            alignment.nr_quantified, max_pg, alignment.nr_aligned))
        print("  The order of %s peakgroups was changed, %s could not be aligned and %s were removed. Ambigous cases: %s, multiple suitable peakgroups: %s" % (
            alignment.nr_changed, max_pg - alignment.nr_quantified,
            alignment.nr_removed, self.nr_ambiguous, self.nr_multiple_align))
        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
            alignment.nr_quant_precursors, alignment.nr_good_precursors,
            min_nrruns, nr_precursors_in_all, precursors_in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_peptides), len(alignment.good_peptides),
            min_nrruns, nr_peptides_target,
            peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
            len(alignment.quant_proteins), len(alignment.good_proteins),
            min_nrruns, nr_proteins_target,
            proteins_in_all_runs_wo_align_target))
        print("  Of these %s proteins, %s were multiple hits and %s were single hits." % (
            len(alignment.quant_proteins), nr_mh_target_proteins,
            nr_sh_target_proteins))

        # Get decoy estimates
        decoy_precursors = len([
            1 for m in multipeptides if len(m.get_selected_peakgroups()) > 0
            and m.find_best_peptide_pg().peptide.get_decoy()
        ])
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys,
                dstats_all.nr_decoys + dstats_all.nr_targets,
                dstats_all.est_real_fdr * 100))

            print("Decoy percentage of peakgroups that are partially aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys,
                dstats.nr_decoys + dstats.nr_targets,
                dstats.est_real_fdr * 100))

            nr_all_precursors = alignment.nr_quant_precursors + decoy_precursors
            print("There were %s decoy precursors identified out of %s precursors which is %0.4f %%" % (
                decoy_precursors, nr_all_precursors,
                decoy_precursors * 100.0 / nr_all_precursors))

    def _getTrafoFilename(self, current_run, ref_id):
        """Return the path of the .tr file for current_run.

        The file lives next to the run's original input file and is named
        ``<input basename without extension>-<run id>-<reference id>.tr``.
        """
        current_id = current_run.get_id()
        input_basename = os.path.basename(current_run.orig_filename)
        fn = os.path.splitext(input_basename)[0]
        dirname = os.path.dirname(current_run.orig_filename)
        filename = os.path.join(dirname,
                                "%s-%s-%s.tr" % (fn, current_id, ref_id))
        return filename

    def _write_trafo_files(self):
        """Write one .tr transformation file per run (see _getTrafoFilename)."""
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._getTrafoFilename(current_run, ref_id)
            self.transformation_collection.writeTransformationData(
                filename, current_id, ref_id)
            # The file is read back right after writing, as before.
            # NOTE(review): presumably a round-trip check - confirm.
            self.transformation_collection.readTransformationData(filename)

    def _has_enough_selected(self, multipeptide, fraction_needed_selected):
        """Return True unless too small a fraction of the runs has a selected peakgroup."""
        nr_selected = len(multipeptide.get_selected_peakgroups())
        return not ((nr_selected * 1.0 / len(self.runs)) < fraction_needed_selected)

    def _extended_header(self, extra_columns):
        """Return a copy of the header shared by all runs with extra_columns appended.

        A copy is returned on purpose: extending ``run.header`` in place would
        leave the extra columns on the run after the call.
        """
        header_first = self.runs[0].header
        for run in self.runs:
            assert header_first == run.header
        return list(header_first) + list(extra_columns)

    def _write_full_outfile(self, outfile, multipeptides, fraction_needed_selected):
        """Write the stored rows of all selected peakgroups (readmethod "full")."""
        header = self._extended_header(["align_runid", "align_origfilename"])
        header_dict = dict((name, i) for i, name in enumerate(header))

        with open(outfile, "w") as fh:
            writer = csv.writer(fh, delimiter="\t")
            writer.writerow(header)

            for m in multipeptides:
                if not self._has_enough_selected(m, fraction_needed_selected):
                    continue

                for p in m.get_peptides():
                    selected_pg = p.get_selected_peakgroup()
                    if selected_pg is None:
                        continue

                    aligned_run_id = selected_pg.run.get_id()
                    # Work on a copy so the row stored on the peakgroup is
                    # not extended as a side effect.
                    row_to_write = list(selected_pg.row)
                    row_to_write += [aligned_run_id, selected_pg.run.orig_filename]
                    # Replace run_id with the aligned id (align_runid) ->
                    # otherwise the run_id is not guaranteed to be unique
                    if "run_id" in header_dict:
                        row_to_write[header_dict["run_id"]] = aligned_run_id
                    writer.writerow(row_to_write)

    def _write_selected_input_rows(self, outfile, infiles, file_format, selected_ids_dict):
        """Copy the rows of the selected peakgroups from the input files to outfile."""
        name_of_id_col_map = {
            "openswath": "id",
            "peakview_preprocess": "preprocess_id"
        }
        name_of_trgr_col_map = {
            "openswath": "transition_group_id",
            "peakview_preprocess": "Pep Index"
        }
        name_of_id_col = name_of_id_col_map[file_format]
        name_of_trgr_col = name_of_trgr_col_map[file_format]

        # Only in openswath we have the ID and can go back to the original file.
        # We can write out the complete original files.
        header = self._extended_header(
            ["align_runid", "align_origfilename", "align_clusterid"])

        with open(outfile, "w") as out_fh:
            writer = csv.writer(out_fh, delimiter="\t")
            writer.writerow(header)

            for f in infiles:
                if f.endswith('.gz'):
                    import gzip
                    filehandler = gzip.open(f, 'rb')
                else:
                    filehandler = open(f)

                try:
                    reader = csv.reader(filehandler, delimiter="\t")
                    header_dict = dict(
                        (name, i) for i, name in enumerate(next(reader)))

                    for row in reader:
                        f_id = row[header_dict[name_of_id_col]]
                        if f_id not in selected_ids_dict:
                            continue
                        selected = selected_ids_dict[f_id]
                        # Check the "id" and "transition_group_id" field.
                        # Unfortunately the id can be non-unique, there we check both.
                        trgroup_id = selected.peptide.get_id()
                        unique_peptide_id = row[header_dict[name_of_trgr_col]]
                        if unique_peptide_id != trgroup_id:
                            continue
                        aligned_run_id = selected.peptide.run.get_id()
                        row_to_write = row + [
                            aligned_run_id, f, selected.get_cluster_id()
                        ]
                        # Replace run_id with the aligned id (align_runid) ->
                        # otherwise the run_id is not guaranteed to be unique
                        if file_format == "openswath":
                            row_to_write[header_dict["run_id"]] = aligned_run_id
                        writer.writerow(row_to_write)
                finally:
                    filehandler.close()

    def _write_yaml(self, yaml_outfile, outfile, options):
        """Write the YAML description of the aligned runs."""
        import yaml
        myYaml = {
            "Commandline": sys.argv,
            "RawData": [],
            "PeakGroupData": [outfile],
            "ReferenceRun":
            self.transformation_collection.getReferenceRunID(),
            "FeatureAlignment": {
                "RawInputParameters": options.__dict__,
                "Parameters": {}
            },
            "Parameters": {}
        }
        myYaml["Parameters"]["m_score_cutoff"] = float(
            options.fdr_cutoff)  # deprecated
        myYaml["FeatureAlignment"]["Parameters"]["m_score_cutoff"] = float(
            options.fdr_cutoff)
        myYaml["FeatureAlignment"]["Parameters"]["fdr_cutoff"] = float(
            options.fdr_cutoff)
        myYaml["FeatureAlignment"]["Parameters"][
            "aligned_fdr_cutoff"] = float(options.aligned_fdr_cutoff)
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = self._getTrafoFilename(current_run, ref_id)
            dirpath = os.path.dirname(current_run.orig_filename)
            ### Use real path (not very useful when moving data from one computer to another)
            ### filename = os.path.realpath(filename)
            ### dirpath = os.path.realpath(dirpath)
            this = {
                "id": current_id,
                "directory": dirpath,
                "trafo_file": filename
            }
            myYaml["RawData"].append(this)
        with open(yaml_outfile, 'w') as fh:
            fh.write(yaml.dump({"AlignedSwathRuns": myYaml}))

    def write_to_file(self, multipeptides, options, writeTrafoFiles=True):
        """Write the alignment result to the files configured in options.

        Steps (each one is skipped when its output file name is empty):
        feature id list, matrix file, full result file, .tr transformation
        files and the YAML description.

        Args:
            multipeptides: the aligned multipeptides.
            options: object providing ``infiles``, ``outfile``,
                ``matrix_outfile``, ``yaml_outfile``, ``ids_outfile``,
                ``min_frac_selected``, ``file_format``, ``readmethod``,
                ``matrix_output_method``, ``fdr_cutoff`` and
                ``aligned_fdr_cutoff``.
            writeTrafoFiles: whether to write the .tr files.
        """
        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        # 1. Collect ids of selected features
        selected_pgs = []
        for m in multipeptides:
            if not self._has_enough_selected(m, fraction_needed_selected):
                continue

            for p in m.getAllPeptides():
                selected_pgs.extend(p.getClusteredPeakgroups())

        selected_ids_dict = dict((pg.get_feature_id(), pg)
                                 for pg in selected_pgs)

        # 2. Write out the (selected) ids
        if ids_outfile:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in selected_pgs:
                    id_writer.writerow([pg.get_feature_id()])

        # 3. Write out the matrix outfile
        if matrix_outfile:
            write_out_matrix_file(matrix_outfile,
                                  self.runs,
                                  multipeptides,
                                  fraction_needed_selected,
                                  style=options.matrix_output_method,
                                  aligner_mscore_treshold=options.fdr_cutoff)

        # 4. Write out the full outfile
        if outfile and options.readmethod == "full":
            # write out the rows that were kept in memory
            self._write_full_outfile(outfile, multipeptides,
                                     fraction_needed_selected)
        elif outfile and file_format in ["openswath", "peakview_preprocess"]:
            # go back to the input files and copy the selected rows
            self._write_selected_input_rows(outfile, infiles, file_format,
                                            selected_ids_dict)

        # 5. Write out the .tr transformation files
        if writeTrafoFiles:
            self._write_trafo_files()

        # 6. Write out the YAML file
        if yaml_outfile:
            self._write_yaml(yaml_outfile, outfile, options)
Example #8
0
 def _read_trafo(self, trafo_filenames):
     """Read transformation files into a new TransformationCollection.

     Args:
         trafo_filenames: iterable of dicts, each with a "trafo_file" entry
             holding the path of one transformation file.

     Returns:
         The collection after all files were read and
         ``initialize_from_data(reverse=True)`` was called on it.

     NOTE(review): the collection used to be built and then dropped (the
     method returned None); returning it is assumed to be the intent -
     confirm against the callers.
     """
     # Collect all paths first, so a dict without "trafo_file" fails before
     # any file is read.
     filenames = [d["trafo_file"] for d in trafo_filenames]

     # Read the transformations
     transformation_collection_ = TransformationCollection()
     for filename in filenames:
         transformation_collection_.readTransformationData(filename)
     transformation_collection_.initialize_from_data(reverse=True)
     return transformation_collection_
Example #9
0
class SplineAligner():
    """
    Use the datasmoothing part of msproteomicstoolslib to align two runs in
    retention times using splines.

    >>> spl_aligner = SplineAligner()
    >>> transformations = spl_aligner.rt_align_all_runs(this_exp, multipeptides, options.alignment_score, options.use_scikit)
    """
    def __init__(self, alignment_fdr_threshold = 0.0001, smoother="lowess", external_r_tmpdir=None, maxdata=-1, experiment=None):
        """Set up the aligner.

        Args:
            alignment_fdr_threshold: a peakgroup is only used as an alignment
                anchor when its FDR score is below this value.
            smoother: smoother name handed to ``smoothing.getSmoothingObj``.
            external_r_tmpdir: temporary directory handed to
                ``smoothing.getSmoothingObj`` as ``tmpdir``.
            maxdata: maximal number of anchor points used per run pair;
                ``-1`` means use all of them.
            experiment: when not None, retention time data is taken from a
                cache built once over ``experiment.runs``; otherwise it is
                recomputed for every run pair.
        """
        self.transformation_collection = TransformationCollection()
        # Define the attribute up front so that getTransformationError() on a
        # fresh aligner returns None instead of raising AttributeError.
        self.transformation_error = None
        self.alignment_fdr_threshold_ = alignment_fdr_threshold
        self.smoother = smoother
        self.tmpdir_ = external_r_tmpdir
        self.max_data_ = maxdata
        self._cacher = None
        self._experiment = experiment

    def _determine_best_run(self, experiment):

        maxcount = -1
        bestrun = -1
        for run in experiment.runs:
            cnt = 0
            for prgroup in run:
                for peptide in prgroup:
                    if peptide.get_decoy(): continue
                    pg = peptide.get_best_peakgroup()
                    if pg.get_fdr_score() < self.alignment_fdr_threshold_:
                        cnt += 1
            if cnt > maxcount:
                maxcount = cnt
                bestrun = run.get_id()
        print("Found best run", bestrun, "with %s features above the cutoff of %s%%" % (maxcount, self.alignment_fdr_threshold_))
        return [r for r in experiment.runs if r.get_id() == bestrun][0]

    def _getRTData(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run """

        if self._experiment is not None:
            return self._getRTData_cached(bestrun, run, multipeptides)
        else:
            return self._getRTData_legacy(bestrun, run, multipeptides)

    def _cache_RT_data(self, bestrun, run, multipeptides):

        self._cacher = []
        for m in multipeptides:
            cached_vals = []
            is_decoy = False
            for r in self._experiment.runs:
                val = None
                if m.hasPrecursorGroup(r.get_id()):

                    al_pg = [pg for pg in m.getPrecursorGroup(r.get_id()).getAllPeakgroups()
                                   if pg.get_fdr_score() < self.alignment_fdr_threshold_]

                    # We need to have a single, good peak group below the threshold (not a decoy)
                    if len(al_pg) == 1:
                        pep = m.getPrecursorGroup(r.get_id()).getOverallBestPeakgroup()
                        if not pep.peptide.get_decoy() and pep.get_fdr_score() < self.alignment_fdr_threshold_:
                            val = (pep.get_fdr_score(), pep.get_normalized_retentiontime())

                cached_vals.append(val)

            # only append with at least 2 values ...
            if len([v for v in cached_vals if not v is None] ) > 1:
                self._cacher.append(cached_vals)

    def _getRTData_cached(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run """

        if self._cacher is None:
            self._cache_RT_data(bestrun, run, multipeptides)

        run_nr = [k for k,r in enumerate(self._experiment.runs) if r.get_id() == run.get_id() ][0]
        bestrun_nr = [k for k,r in enumerate(self._experiment.runs) if r.get_id() == bestrun.get_id() ][0]

        data_tmp = []
        for m in self._cacher:

            rund = m[ run_nr ]
            bestrund = m[ bestrun_nr ]

            # Skip empty entries
            if rund is None or bestrund is None:
                continue

            data_tmp.append( (
                min( rund[0], bestrund[0]),
                bestrund[1], rund[1]) )

        maxdata = self.max_data_
        if maxdata == -1:
            # -1 means take all data
            maxdata = len(data_tmp)

        data1 = []
        data2 = []
        for fdr, d1, d2 in sorted(data_tmp)[:maxdata]:
            data1.append(d1)
            data2.append(d2)

        return data1,data2

    def _getRTData_legacy(self, bestrun, run, multipeptides):
        """ Return retention time data for reference and slave run """

        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data1 = []
        data2 = []

        data_tmp = []
        cnt_multiple = 0

        for m in multipeptides:

            try:
                len_ali = len([pg for pg in m.getPrecursorGroup(run.get_id()).getAllPeakgroups()
                               if pg.get_fdr_score() < self.alignment_fdr_threshold_])
                len_ref = len([pg for pg in m.getPrecursorGroup(bestrun.get_id()).getAllPeakgroups()
                               if pg.get_fdr_score() < self.alignment_fdr_threshold_])

                # Do not consider peakgroups that are missing in one run
                # Do not consider peakgroups that have more than one good peakgroup
                if len_ali != 1 or len_ref != 1:
                    if len_ali > 1 or len_ref > 1:
                        cnt_multiple += 1
                    continue

                ref_pep = m.getPrecursorGroup(bestrun.get_id()).getOverallBestPeakgroup()
                align_pep = m.getPrecursorGroup(run.get_id()).getOverallBestPeakgroup()
            except KeyError:
                # it is possible that for some, no peak group exists in this run
                continue

            # Do not use decoy peptides
            if ref_pep.peptide.get_decoy() or align_pep.peptide.get_decoy():
                continue

            if ref_pep.get_fdr_score() < self.alignment_fdr_threshold_ and \
               align_pep.get_fdr_score() < self.alignment_fdr_threshold_:

                # data1.append(ref_pep.get_normalized_retentiontime())
                # data2.append(align_pep.get_normalized_retentiontime())
                data_tmp.append( (
                    ref_pep.get_fdr_score(),
                    ref_pep.get_normalized_retentiontime(),
                    align_pep.get_normalized_retentiontime()
                ) )

        if cnt_multiple > len(multipeptides) * 0.8 :
            print ("")
            print ("  Warning: Most of your data has more than one peakgroup with a score better than %s."  % self.alignment_fdr_threshold_)
            print ("  This may be a problem for the alignment, please consider adjusting the --alignment_score option."  )

        maxdata = self.max_data_
        if maxdata == -1:
            # -1 means take all data
            maxdata = len(data_tmp)

        for fdr, d1, d2 in sorted(data_tmp)[:maxdata]:
            data1.append(d1)
            data2.append(d2)

        return data1,data2

    def _spline_align_runs(self, bestrun, run, multipeptides):
        """Align the retention times of ``run`` against ``bestrun``.

        A smoothing object is fitted on the paired retention times returned
        by ``_getRTData`` (slave -> master), the fit and the transformed
        anchor points are stored in ``self.transformation_collection``, the
        standard deviation and median of the residuals are stored in
        ``self.transformation_error`` and finally the fit is applied to all
        peakgroups of ``run`` in place.

        ``self.transformation_error`` must already hold an object with a
        ``transformations`` dict; ``rt_align_all_runs`` assigns it before
        calling this method.

        Raises:
            Exception: if fewer than two paired data points are available.
        """

        sm = smoothing.getSmoothingObj(smoother = self.smoother, tmpdir = self.tmpdir_)

        # get those peptides we want to use for alignment => for this use the mapping
        # data1 = reference data (master)
        # data2 = data to be aligned (slave)
        data1,data2 = self._getRTData(bestrun, run, multipeptides)

        if len(data2) < 2:
            print("No common identifications between %s and %s. Only found %s features below a cutoff of %s" % ( 
                run.get_id(), bestrun.get_id(), len(data1), self.alignment_fdr_threshold_) )
            print("If you ran the feature_alignment.py script, try to skip the re-alignment step (e.g. remove the --realign_runs option)." )
            raise Exception("Not enough datapoints (less than 2 datapoints).")

        # Since we want to predict how to convert from slave to master, slave
        # is first and master is second.
        sm.initialize(data2, data1)
        data2_aligned = sm.predict(data2)

        # Store transformation in collection (from run to bestrun)
        self.transformation_collection.addTransformationData([data2, data1], run.get_id(), bestrun.get_id() )
        self.transformation_collection.addTransformedData(data2_aligned, run.get_id(), bestrun.get_id() )

        # Residuals between the reference RTs and the aligned slave RTs
        stdev = numpy.std(numpy.array(data1) - numpy.array(data2_aligned))
        median = numpy.median(numpy.array(data1) - numpy.array(data2_aligned))
        print("Will align run %s against %s, using %s features" % (run.get_id(), bestrun.get_id(), len(data1)) )
        print("  Computed stdev", stdev, "and median", median )

        # Store error for later (keyed by run id, then by reference run id)
        d = self.transformation_error.transformations.get(run.get_id(), {})
        d[bestrun.get_id()] = [stdev, median]
        self.transformation_error.transformations[ run.get_id() ] = d

        # Now predict on _all_ data and write this back to the data
        i = 0
        all_pg = []
        for prgr in run:
            for pep in prgr:
                all_pg.extend( [ (pg.get_normalized_retentiontime(), pg.get_feature_id()) for pg in pep.get_all_peakgroups()] )
        rt_eval = [ pg[0] for pg in all_pg]
        aligned_result = sm.predict(rt_eval)
        # NOTE(review): the running index i pairs aligned_result with the
        # entries of pep.peakgroups_; this assumes pep.get_all_peakgroups()
        # above yields the peakgroups in the same order and number as
        # pep.peakgroups_ -- confirm.
        for prgr in run:
            for pep in prgr:
                # TODO hack -> direct access to the internal peakgroups object
                mutable = [list(pg) for pg in pep.peakgroups_]
                for k in range(len(mutable)):
                    # presumably slot 2 of the tuple is the normalized
                    # retention time -- verify against the peptide class
                    mutable[k][2] = aligned_result[i]
                    i += 1
                pep.peakgroups_ = [ tuple(m) for m in mutable]

    def rt_align_all_runs(self, experiment, multipeptides):
        """Align all runs of an experiment against its best run.

        The run returned by ``_determine_best_run`` becomes the reference;
        every other run is aligned against it with ``_spline_align_runs``.
        A new TransformationError object is created for this alignment.

        Args:
            experiment(MRExperiment): a collection of runs
            multipeptides(list(multipeptides)): a list of Multipeptide derived from the above experiment

        Returns:
            ``self.transformation_collection``, with the reference run id set.
        """
        print("Will re-align runs" )

        # The reference is the run with the most ids below the threshold
        reference = self._determine_best_run(experiment)
        self.transformation_collection.setReferenceRunID(reference.get_id())
        self.transformation_error = TransformationError()

        # Pairwise alignment; the reference run itself is left untouched
        for candidate in experiment.runs:
            if candidate.get_id() != reference.get_id():
                self._spline_align_runs(reference, candidate, multipeptides)

        return self.transformation_collection

    def getTransformationError(self):
        """
        Get the error of the transformation

        The error object is created by :meth:`rt_align_all_runs`, so call
        that method first.

        Returns:
            transformation_error(:class:`.TransformationError`) : the error of the transformation
        """
        return self.transformation_error
 def __init__(self):
     """Run the parent initializer and attach a newly created TransformationCollection."""
     super(Experiment, self).__init__()
     self.transformation_collection = TransformationCollection()
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.
    """

    def __init__(self):
        """Run the MRExperiment initializer and attach a newly created TransformationCollection."""
        super(Experiment, self).__init__()
        self.transformation_collection = TransformationCollection()

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        """Estimate the FDR of the selected peakgroups from the decoy counts.

        Only multipeptides for which
        ``more_than_fraction_selected(fraction_needed_selected)`` holds are
        considered. Their selected peakgroups are counted as decoys or
        targets depending on the decoy flag of the best peptide peakgroup.

        Returns:
            An object with the attributes ``nr_decoys``, ``nr_targets``,
            ``decoy_pcnt`` and ``est_real_fdr``. The last two stay ``0.0``
            when ``self.estimated_decoy_pcnt`` is None or nothing was counted.
        """
        class DecoyStats(object):
            def __init__(self):
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0
                self.est_real_fdr = 0.0

        stats = DecoyStats()

        # count the selected peakgroups, split into decoys and targets
        decoy_hits = 0
        target_hits = 0
        for prec in multipeptides:
            if not prec.more_than_fraction_selected(fraction_needed_selected):
                continue
            nr_selected = len(prec.get_selected_peakgroups())
            if prec.find_best_peptide_pg().peptide.get_decoy():
                decoy_hits += nr_selected
            else:
                target_hits += nr_selected
        stats.nr_decoys = decoy_hits
        stats.nr_targets = target_hits

        total = decoy_hits + target_hits
        if self.estimated_decoy_pcnt is None or total == 0:
            return stats

        # The decoy percentage found here, relative to the decoy percentage
        # that was estimated at the initial fdr cutoff, gives the relative
        # change of the decoy rate. Multiplying by the initial fdr cutoff
        # turns that ratio back into an absolute value.
        stats.decoy_pcnt = decoy_hits * 100.0 / total
        stats.est_real_fdr = stats.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return stats

    def print_stats(self, multipeptides, fdr_cutoff, fraction_present, min_nrruns):
        """Print a summary of the alignment to stdout and return the statistics.

        An AlignmentStatistics object is created and filled through its
        ``count`` method; additional counters computed here (precursors,
        peptides and proteins found in all runs before and after alignment,
        ``self.nr_ambiguous``, ``self.nr_multiple_align``) are stored on it.

        Args:
            multipeptides: the multipeptides to summarize.
            fdr_cutoff: cutoff handed to ``all_above_cutoff`` and to
                ``AlignmentStatistics.count``.
            fraction_present: fraction handed to ``estimate_real_fdr`` for
                the "partially aligned" decoy estimate.
            min_nrruns: only used in the printed text.

        Returns:
            The filled AlignmentStatistics object.
        """

        alignment = AlignmentStatistics()
        alignment.count(multipeptides, fdr_cutoff, self.runs)

        # Count presence in all runs (before alignment)
        precursors_in_all_runs_wo_align = len([1 for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy()])
        proteins_in_all_runs_wo_align_target = len(set([m.find_best_peptide_pg().peptide.protein_name for m in multipeptides 
                                                        if m.all_above_cutoff(fdr_cutoff) and 
                                                        not m.find_best_peptide_pg().peptide.get_decoy()]))
        peptides_in_all_runs_wo_align_target = len(set([m.find_best_peptide_pg().peptide.sequence for m in multipeptides 
                                                        if m.all_above_cutoff(fdr_cutoff) and 
                                                        not m.find_best_peptide_pg().peptide.get_decoy()]))

        # Count presence in all runs (after alignment, i.e. selected in all runs)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        nr_peptides_target = len(set([prec.find_best_peptide_pg().peptide.sequence for prec in precursors_in_all_runs 
                                      if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_proteins_target = len(set([prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_in_all_runs 
                                      if not prec.find_best_peptide_pg().peptide.get_decoy()]))

        nr_precursors_in_all = len([1 for m in multipeptides if m.all_selected() and not m.get_decoy()])
        max_pg = alignment.nr_good_precursors * len(self.runs)
        # Decoy based estimates: once for the requested fraction, once for
        # precursors selected in all runs (fraction 1.0)
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)

        # Get single/multiple hits stats
        from itertools import groupby
        precursors_quantified = [m for m in multipeptides if len(m.get_selected_peakgroups()) > 0]
        target_quant_protein_list = [ prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_quantified 
                                     if not prec.find_best_peptide_pg().peptide.get_decoy()]
        # groupby only merges adjacent equal items, hence the sort
        target_quant_protein_list.sort()
        nr_sh_target_proteins = sum( [len(list(group)) == 1 for key, group in groupby(target_quant_protein_list)] )
        nr_mh_target_proteins = sum( [len(list(group)) > 1 for key, group in groupby(target_quant_protein_list)] )

        ### Store for later (yaml output)
        alignment.nr_ambiguous = self.nr_ambiguous
        alignment.nr_multiple_align = self.nr_multiple_align
        alignment.precursors_in_all_runs_wo_align = precursors_in_all_runs_wo_align
        alignment.peptides_in_all_runs_wo_align_target = peptides_in_all_runs_wo_align_target
        alignment.proteins_in_all_runs_wo_align_target = proteins_in_all_runs_wo_align_target

        alignment.nr_precursors_in_all = nr_precursors_in_all
        alignment.nr_peptides_target = nr_peptides_target
        alignment.nr_proteins_target = nr_proteins_target

        #
        ###########################################################################
        #
        print("="*75)
        print("="*75)
        print("Total we have", len(self.runs), "runs with", alignment.nr_good_precursors,
              "peakgroups quantified in at least %s run(s) below m_score (q-value) %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " +
              "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg, "peakgroups of which we aligned",
              alignment.nr_aligned)
        print("  The order of", alignment.nr_changed, "peakgroups was changed,", max_pg - alignment.nr_quantified,
              "could not be aligned and %s were removed. Ambigous cases: %s, multiple suitable peakgroups: %s" % (
              alignment.nr_removed, self.nr_ambiguous, self.nr_multiple_align))
        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
          alignment.nr_quant_precursors, alignment.nr_good_precursors, min_nrruns, nr_precursors_in_all, precursors_in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
              len(alignment.quant_peptides), len(alignment.good_peptides), min_nrruns, nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
              len(alignment.quant_proteins), len(alignment.good_proteins), min_nrruns, nr_proteins_target, proteins_in_all_runs_wo_align_target))
        print("Of these %s proteins, %s were multiple hits and %s were single hits." % (len(alignment.quant_proteins), nr_mh_target_proteins, nr_sh_target_proteins))

        # Get decoy estimates
        decoy_precursors = len([1 for m in multipeptides if len(m.get_selected_peakgroups()) > 0 and m.find_best_peptide_pg().peptide.get_decoy()])
        # The decoy report is only printed when at least one multipeptide is
        # selected in all runs
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys, dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr*100))

            print("Decoy percentage of peakgroups that are partially aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys, dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr*100))

            print("There were", decoy_precursors, "decoy precursors identified out of", \
                  alignment.nr_quant_precursors + decoy_precursors, "precursors which is %0.4f %%" % (
                      decoy_precursors * 100.0 / (alignment.nr_quant_precursors + decoy_precursors)))

        return alignment

    def _getTrafoFilename(self, current_run, ref_id):
        """Return the path of the ``.tr`` transformation file of a run.

        The file lives next to ``current_run.orig_filename`` and is named
        ``<input name without extension>-<run id>-<ref_id>.tr``.
        """
        run_id = current_run.get_id()
        directory, basename = os.path.split(current_run.orig_filename)
        stem = os.path.splitext(basename)[0]
        return os.path.join(directory, "%s-%s-%s.tr" % (stem, run_id, ref_id))

    def _write_trafo_files(self):
        """Write the transformation file of every run and read it back.

        For each run in ``self.runs`` the transformation towards the
        reference run is written to the path given by ``_getTrafoFilename``
        and immediately re-read into ``self.transformation_collection``.
        """
        for run in self.runs:
            run_id = run.get_id()
            reference_id = self.transformation_collection.getReferenceRunID()
            path = self._getTrafoFilename(run, reference_id)
            self.transformation_collection.writeTransformationData(path, run_id, reference_id)
            self.transformation_collection.readTransformationData(path)

    def write_to_file(self, multipeptides, options, alignment, tree=None, writeTrafoFiles=True):
        """Write the alignment result to the output files named in ``options``.

        The following outputs are produced; each file output is skipped when
        its file name in ``options`` is empty:

          1. (in memory) the clustered peakgroups of all multipeptides that
             are selected in at least ``options.min_frac_selected`` of the runs
          2. ``options.ids_outfile``: one feature id per line
          3. ``options.matrix_outfile``: written by ``write_out_matrix_file``
          4. ``options.outfile``: either the rows stored on the selected
             peakgroups (``options.readmethod == "full"``) or the matching
             rows re-read from ``options.infiles`` (file formats
             "openswath" and "peakview_preprocess")
          5. the ``.tr`` transformation files (when ``writeTrafoFiles``)
          6. ``options.yaml_outfile``: a YAML description of the alignment

        Args:
            multipeptides: the aligned multipeptides.
            options: object carrying the attributes read below (``infiles``,
                ``outfile``, ``matrix_outfile``, ``yaml_outfile``,
                ``ids_outfile``, ``min_frac_selected``, ``file_format``,
                ``readmethod``, ``matrix_output_method``, ``fdr_cutoff``,
                ``aligned_fdr_cutoff``).
            alignment: statistics object; its ``to_yaml()`` result is stored
                in the YAML output.
            tree: optional iterable of ``(a, b)`` index pairs into
                ``self.runs``; written to the YAML output when not None.
            writeTrafoFiles: whether to write the ``.tr`` files.
        """

        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        # 1. Collect ids of selected features
        selected_pgs = []
        for m in multipeptides:

            selected_peakgroups = m.get_selected_peakgroups()
            if (len(selected_peakgroups)*1.0 / len(self.runs)) < fraction_needed_selected:
                continue

            for p in m.getAllPeptides():
                selected_pgs.extend(p.getClusteredPeakgroups())

        selected_ids_dict = dict( [ (pg.get_feature_id(), pg) for pg in selected_pgs] )

        # 2. Write out the (selected) ids
        if len(ids_outfile) > 0:
            with open(ids_outfile, "w") as fh:
                id_writer = csv.writer(fh, delimiter="\t")
                for pg in sorted(selected_pgs):
                    id_writer.writerow([pg.get_feature_id()])

        # 3. Write out the matrix outfile
        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides,
                                  fraction_needed_selected,
                                  style=options.matrix_output_method,
                                  aligner_mscore_treshold=options.fdr_cutoff)

        # 4. Write out the full outfile
        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            # Work on a copy: extending the header in place would alter the
            # header stored on the run (and grow it on every call).
            header_out = list(header_first) + ["align_runid", "align_origfilename"]
            header_dict = dict( [ (name, idx) for idx, name in enumerate(header_first)] )

            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_out)

                for m in multipeptides:

                    selected_peakgroups = m.get_selected_peakgroups()
                    if (len(selected_peakgroups)*1.0 / len(self.runs)) < fraction_needed_selected:
                        continue

                    for p in m.get_peptides():
                        selected_pg = p.get_selected_peakgroup()
                        if selected_pg is None:
                            continue

                        aligned_run_id = selected_pg.run.get_id()
                        # Copy the row so the peakgroup's own data stays untouched
                        row_to_write = list(selected_pg.row)
                        row_to_write += [aligned_run_id, selected_pg.run.orig_filename]
                        # Replace run_id with the aligned id (align_runid) ->
                        # otherwise the run_id is not guaranteed to be unique
                        if "run_id" in header_dict:
                            row_to_write[ header_dict["run_id"]] = aligned_run_id
                        writer.writerow(row_to_write)

        elif len(outfile) > 0 and file_format in ["openswath", "peakview_preprocess"]:

            name_of_id_col_map = { "openswath" : "id" , "peakview_preprocess" : "preprocess_id"}
            name_of_trgr_col_map = { "openswath" : "transition_group_id" , "peakview_preprocess" : "Pep Index"}
            name_of_id_col = name_of_id_col_map[file_format]
            name_of_trgr_col = name_of_trgr_col_map[file_format]

            # Only in openswath we have the ID and can go back to the original file.
            # We can write out the complete original files.

            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            # Copy instead of extending the run's header in place
            header_out = list(header_first) + ["align_runid", "align_origfilename", "align_clusterid"]

            with open(outfile, "w") as out_fh:
                writer = csv.writer(out_fh, delimiter="\t")
                writer.writerow(header_out)

                for f in infiles:
                    if f.endswith('.gz'):
                        import gzip
                        # csv.reader needs text on Python 3; Python 2 keeps
                        # the binary mode it always used.
                        gz_mode = 'rt' if sys.version_info[0] >= 3 else 'rb'
                        filehandler = gzip.open(f, gz_mode)
                    else:
                        filehandler = open(f)

                    with filehandler:
                        reader = csv.reader(filehandler, delimiter="\t")
                        header = next(reader)
                        header_dict = dict( [ (name, idx) for idx, name in enumerate(header)] )

                        for row in reader:
                            f_id = row[ header_dict[name_of_id_col]]
                            if f_id not in selected_ids_dict:
                                continue
                            # Check the "id" and "transition_group_id" field.
                            # Unfortunately the id can be non-unique, there we check both.
                            pg = selected_ids_dict[f_id]
                            if row[ header_dict[name_of_trgr_col]] != pg.peptide.get_id():
                                continue

                            aligned_run_id = pg.peptide.run.get_id()
                            row_to_write = row + [aligned_run_id, f, pg.get_cluster_id()]
                            # Replace run_id with the aligned id (align_runid) ->
                            # otherwise the run_id is not guaranteed to be unique
                            if file_format == "openswath":
                                row_to_write[ header_dict["run_id"]] = aligned_run_id
                            writer.writerow(row_to_write)

        # 5. Write out the .tr transformation files
        if writeTrafoFiles:
            self._write_trafo_files()

        # 6. Write out the YAML file
        if len(yaml_outfile) > 0:
            import yaml
            ref_id = self.transformation_collection.getReferenceRunID()
            myYaml = {"Commandline" : sys.argv,
                      "RawData" : [], "PeakGroupData" : [ outfile ],
                      "ReferenceRun" : ref_id,
                      "FeatureAlignment" :
                      {
                        "RawInputParameters" : options.__dict__,
                        "Parameters" : {}
                      },
                      "Parameters" : {}
                     }

            myYaml["Output"] = {}
            myYaml["Output"]["Tree"] = {}
            if tree is not None:
                # The tree holds indices into self.runs; also store it mapped
                # to run ids and to file names.
                myYaml["Output"]["Tree"]["Raw"] = [list(t) for t in tree]
                myYaml["Output"]["Tree"]["Mapped"] = [
                    [self.runs[a].get_id(), self.runs[b].get_id()] for a,b in tree]
                myYaml["Output"]["Tree"]["MappedFile"] = [
                    [self.runs[a].get_openswath_filename(), self.runs[b].get_openswath_filename()] for a,b in tree]
                myYaml["Output"]["Tree"]["MappedFileInput"] = [
                    [self.runs[a].get_original_filename(), self.runs[b].get_original_filename()] for a,b in tree]

            myYaml["Output"]["Quantification"] = alignment.to_yaml()
            myYaml["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff) # deprecated
            myYaml["FeatureAlignment"]["Parameters"]["m_score_cutoff"] = float(options.fdr_cutoff)
            myYaml["FeatureAlignment"]["Parameters"]["fdr_cutoff"] = float(options.fdr_cutoff)
            myYaml["FeatureAlignment"]["Parameters"]["aligned_fdr_cutoff"] = float(options.aligned_fdr_cutoff)
            for current_run in self.runs:
                # Paths are stored as given (not as real paths) so the data
                # can be moved from one computer to another.
                myYaml["RawData"].append({
                    "id" : current_run.get_id(),
                    "directory" : os.path.dirname(current_run.orig_filename),
                    "trafo_file" : self._getTrafoFilename(current_run, ref_id)
                })
            with open(yaml_outfile, 'w') as yaml_fh:
                yaml_fh.write(yaml.dump({"AlignedSwathRuns" : myYaml}))
Example #12
0
 def _read_trafo(self, trafo_filenames):
     """Read every transformation file referenced by ``trafo_filenames``.

     Each element of ``trafo_filenames`` is indexed with the key
     ``"trafo_file"``; the resulting file names are read into a newly
     created TransformationCollection, which is afterwards initialized
     with ``reverse=True``.
     """
     collection = TransformationCollection()
     # Resolve all file names before reading the first file.
     paths = [entry["trafo_file"] for entry in trafo_filenames]
     for path in paths:
         collection.readTransformationData(path)
     collection.initialize_from_data(reverse=True)
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.
    """

    def __init__(self):
        """Initialize the base experiment and attach an empty transformation collection."""
        super(Experiment, self).__init__()
        # Queried for the reference run id and used to write the per-run
        # transformation files in write_to_file
        self.transformation_collection = TransformationCollection()

    def get_max_pg(self):
      return len(self.runs)*len(self.union_transition_groups_set)

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        class DecoyStats(object): 
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0
                self.est_real_fdr = 0.0

        d = DecoyStats()
        precursors_to_be_used = [m for m in multipeptides if m.more_than_fraction_selected(fraction_needed_selected)]

        # count the decoys
        d.nr_decoys = sum([len(prec.get_selected_peakgroups()) for prec in precursors_to_be_used 
                          if prec.find_best_peptide_pg().peptide.get_decoy()])
        d.nr_targets = sum([len(prec.get_selected_peakgroups()) for prec in precursors_to_be_used 
                          if not prec.find_best_peptide_pg().peptide.get_decoy()])
        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ration obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease realtive to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None: return d
        if (d.nr_targets + d.nr_decoys) == 0: return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys) )
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff 
        return d

    def print_stats(self, multipeptides, alignment, outlier_detection, fdr_cutoff, fraction_present, min_nrruns):
        nr_precursors_total = len(self.union_transition_groups_set)

        # Do statistics and print out
        in_all_runs_wo_align = len([1 for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy()])
        proteins_in_all_runs_wo_align = len(set([m.find_best_peptide_pg().peptide.protein_name for m in multipeptides if m.all_above_cutoff(fdr_cutoff)]))
        proteins_in_all_runs_wo_align_target = len(set([m.find_best_peptide_pg().peptide.protein_name for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.find_best_peptide_pg().peptide.get_decoy()]))
        nr_all_proteins = len(set([m.find_best_peptide_pg().peptide.protein_name for m in multipeptides if not m.find_best_peptide_pg().peptide.get_decoy()]))
        nr_all_peptides = len(set([m.find_best_peptide_pg().peptide.sequence for m in multipeptides if not m.find_best_peptide_pg().peptide.get_decoy()]))
        peptides_in_all_runs_wo_align_target = len(set([m.find_best_peptide_pg().peptide.sequence for m in multipeptides if m.all_above_cutoff(fdr_cutoff) and not m.find_best_peptide_pg().peptide.get_decoy()]))

        print("Present targets in all runs", in_all_runs_wo_align)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        precursors_quantified = [m for m in multipeptides if len(m.get_selected_peakgroups()) > 0]

        # precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        # precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        nr_decoys = len([1 for prec in precursors_in_all_runs if prec.find_best_peptide_pg().peptide.get_decoy()])

        decoy_precursors = len([1 for m in multipeptides if len(m.get_selected_peakgroups()) > 0 and m.find_best_peptide_pg().peptide.get_decoy()])

        nr_peptides = len(set([prec.find_best_peptide_pg().peptide.sequence for prec in precursors_in_all_runs]))
        nr_proteins = len(set([prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_in_all_runs]))
        nr_peptides_target = len(set([prec.find_best_peptide_pg().peptide.sequence for prec in precursors_in_all_runs if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_proteins_target = len(set([prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_in_all_runs if not prec.find_best_peptide_pg().peptide.get_decoy()]))

        nr_precursors_to_quant = len(set([ prec for prec in precursors_quantified if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_proteins_to_quant = len(set([ prec.find_best_peptide_pg().peptide.protein_name for prec in precursors_quantified if not prec.find_best_peptide_pg().peptide.get_decoy()]))
        nr_peptides_to_quant = len(set([ prec.find_best_peptide_pg().peptide.sequence for prec in precursors_quantified if not prec.find_best_peptide_pg().peptide.get_decoy()]))

        nr_precursors_in_all = len([1 for m in multipeptides if m.all_selected() and not m.get_decoy()])
        max_pg = self.get_max_pg()
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)
        print("="*75)
        print("="*75)
        print("Total we have", len(self.runs), "runs with", len(self.union_transition_groups_set),\
                "peakgroups quantified in at least %s run(s) above FDR %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " + \
                "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg, "peakgroups of which we aligned", \
                alignment.nr_aligned, "and changed order of", alignment.nr_changed, "and could not align", alignment.could_not_align)

        print("We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)" % (
          nr_precursors_to_quant, nr_precursors_total, min_nrruns, nr_precursors_in_all, in_all_runs_wo_align))
        print("We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)" % (
          nr_peptides_to_quant, nr_all_peptides, min_nrruns, nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print("We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)" % (
          nr_proteins_to_quant, nr_all_proteins, min_nrruns, nr_proteins_target, proteins_in_all_runs_wo_align_target))

        # print "quant proteins", nr_proteins_to_quant

        # Get decoy estimates
        if len(precursors_in_all_runs) > 0:
            print("Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats_all.decoy_pcnt, dstats_all.nr_decoys, dstats_all.nr_decoys + dstats_all.nr_targets, dstats_all.est_real_fdr*100))

            print("Decoy percentage of peakgroups that are partially aligned %1.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%" % (
                dstats.decoy_pcnt, dstats.nr_decoys, dstats.nr_decoys + dstats.nr_targets, dstats.est_real_fdr*100))

            print("There were", decoy_precursors, "decoy precursors identified out of", nr_precursors_to_quant + decoy_precursors, "precursors which is %0.4f %%" % (decoy_precursors *100.0 / (nr_precursors_to_quant + decoy_precursors)))


        if outlier_detection is not None: 
            print("Outliers:", outlier_detection.nr_outliers, "outliers in", len(multipeptides), "peptides or", outlier_detection.outlier_pg, "peakgroups out of", alignment.nr_quantified, "changed", outlier_detection.outliers_changed)

    def write_to_file(self, multipeptides, options):

        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        matrix_excelfile = options.matix_excel
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        selected_pgs = []
        for m in multipeptides:
            selected_peakgroups = m.get_selected_peakgroups()
            if (len(selected_peakgroups)*1.0 / len(self.runs) < fraction_needed_selected) : continue
            for p in m.get_peptides():
                selected_pg = p.get_selected_peakgroup()
                if selected_pg is None: continue
                selected_pgs.append(selected_pg)
        selected_ids_dict = dict( [ (pg.get_feature_id(), pg) for pg in selected_pgs] )

        if len(ids_outfile) > 0:
            fh = open(ids_outfile, "w")
            id_writer = csv.writer(fh, delimiter="\t")
            for pg in selected_pgs:
                id_writer.writerow([pg.get_feature_id()])
            fh.close()
            del id_writer

        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides, fraction_needed_selected)


        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files 
            writer = csv.writer(open(outfile, "w"), delimiter="\t")
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            header_first += ["align_runid", "align_origfilename"]
            writer.writerow(header_first)

            for m in multipeptides:
                selected_peakgroups = m.get_selected_peakgroups()
                if (len(selected_peakgroups)*1.0 / len(self.runs) < fraction_needed_selected) : continue
                for p in m.get_peptides():
                    selected_pg = p.get_selected_peakgroup()
                    if selected_pg is None: continue
                    row_to_write = selected_pg.row
                    row_to_write += [selected_pg.run.get_id(), selected_pg.run.orig_filename]
                    writer.writerow(row_to_write)
        elif len(outfile) > 0 and file_format == "openswath":
            # only in openswath we have the ID and can go back to the original file ... 
            # write out the complete original files 
            writer = csv.writer(open(outfile, "w"), delimiter="\t")
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            header_first += ["align_runid", "align_origfilename"]
            writer.writerow(header_first)

            for file_nr, f in enumerate(infiles):
              header_dict = {}
              reader = csv.reader(open(f), delimiter="\t")
              header = next(reader)
              for i,n in enumerate(header):
                header_dict[n] = i
              for row in reader:
                  f_id = row[ header_dict["id"]]
                  if f_id in selected_ids_dict:
                      # Check the "id" and "transition_group_id" field. 
                      # Unfortunately the id can be non-unique, there we check both.
                      trgroup_id = selected_ids_dict[f_id].peptide.get_id()
                      unique_peptide_id = row[ header_dict["transition_group_id"]]
                      if unique_peptide_id == trgroup_id:
                          row_to_write = row
                          row_to_write += [selected_ids_dict[f_id].peptide.run.get_id(), f]
                          writer.writerow(row_to_write)
 
        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
          current_id = current_run.get_id()
          ref_id = self.transformation_collection.getReferenceRunID() 
          filename = os.path.join(os.path.dirname(current_run.orig_filename), "transformation-%s-%s.tr" % (current_id, ref_id) )
          trafo_fnames.append(filename)
          self.transformation_collection.writeTransformationData(filename, current_id, ref_id)
          self.transformation_collection.readTransformationData(filename)

        if len(yaml_outfile) > 0:
            import yaml
            myYaml = {"RawData" : [], "PeakGroupData" : [ outfile ],
                      "ReferenceRun" : self.transformation_collection.getReferenceRunID() }
            for current_run in self.runs:
                current_id = current_run.get_id()
                ref_id = self.transformation_collection.getReferenceRunID() 
                filename = os.path.join(os.path.dirname(current_run.orig_filename), "transformation-%s-%s.tr" % (current_id, ref_id) )
                dirpath = os.path.realpath(os.path.dirname(current_run.orig_filename))
                this = {"id" : current_id, "directory" : dirpath, "trafo_file" : os.path.realpath(filename)}
                myYaml["RawData"].append(this)
            open(yaml_outfile, 'w').write(yaml.dump({"AlignedSwathRuns" : myYaml}))

        return trafo_fnames
class Experiment(MRExperiment):
    """
    An Experiment is a container for multiple experimental runs - some of which may contain the same precursors.
    """
    def __init__(self):
        """Initialize the base experiment and attach an empty transformation collection."""
        super(Experiment, self).__init__()
        # Queried for the reference run id and used to write the per-run
        # transformation files in write_to_file
        self.transformation_collection = TransformationCollection()

    def get_max_pg(self):
        return len(self.runs) * len(self.union_transition_groups_set)

    def estimate_real_fdr(self, multipeptides, fraction_needed_selected):
        class DecoyStats(object):
            def __init__(self):
                self.est_real_fdr = 0.0
                self.nr_decoys = 0
                self.nr_targets = 0
                self.decoy_pcnt = 0.0
                self.est_real_fdr = 0.0

        d = DecoyStats()
        precursors_to_be_used = [
            m for m in multipeptides
            if m.more_than_fraction_selected(fraction_needed_selected)
        ]

        # count the decoys
        d.nr_decoys = sum([
            len(prec.get_selected_peakgroups())
            for prec in precursors_to_be_used
            if prec.find_best_peptide_pg().peptide.get_decoy()
        ])
        d.nr_targets = sum([
            len(prec.get_selected_peakgroups())
            for prec in precursors_to_be_used
            if not prec.find_best_peptide_pg().peptide.get_decoy()
        ])
        # estimate the real fdr by calculating the decoy ratio and dividing it
        # by the decoy ration obtained at @fdr_cutoff => which gives us the
        # decoy in/decrease realtive to fdr_cutoff. To calculate the absolute
        # value, we multiply by fdr_cutoff again (which was used to obtain the
        # original estimated decoy percentage).
        if self.estimated_decoy_pcnt is None: return d
        if (d.nr_targets + d.nr_decoys) == 0: return d
        d.decoy_pcnt = (d.nr_decoys * 100.0 / (d.nr_targets + d.nr_decoys))
        d.est_real_fdr = d.decoy_pcnt / self.estimated_decoy_pcnt * self.initial_fdr_cutoff
        return d

    def print_stats(self, multipeptides, alignment, outlier_detection,
                    fdr_cutoff, fraction_present, min_nrruns):
        nr_precursors_total = len(self.union_transition_groups_set)

        # Do statistics and print out
        in_all_runs_wo_align = len([
            1 for m in multipeptides
            if m.all_above_cutoff(fdr_cutoff) and not m.get_decoy()
        ])
        proteins_in_all_runs_wo_align = len(
            set([
                m.find_best_peptide_pg().peptide.protein_name
                for m in multipeptides if m.all_above_cutoff(fdr_cutoff)
            ]))
        proteins_in_all_runs_wo_align_target = len(
            set([
                m.find_best_peptide_pg().peptide.protein_name
                for m in multipeptides if m.all_above_cutoff(fdr_cutoff)
                and not m.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_all_proteins = len(
            set([
                m.find_best_peptide_pg().peptide.protein_name
                for m in multipeptides
                if not m.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_all_peptides = len(
            set([
                m.find_best_peptide_pg().peptide.sequence
                for m in multipeptides
                if not m.find_best_peptide_pg().peptide.get_decoy()
            ]))
        peptides_in_all_runs_wo_align_target = len(
            set([
                m.find_best_peptide_pg().peptide.sequence
                for m in multipeptides if m.all_above_cutoff(fdr_cutoff)
                and not m.find_best_peptide_pg().peptide.get_decoy()
            ]))

        print("Present targets in all runs", in_all_runs_wo_align)
        precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        precursors_quantified = [
            m for m in multipeptides if len(m.get_selected_peakgroups()) > 0
        ]

        # precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        # precursors_in_all_runs = [m for m in multipeptides if m.all_selected()]
        nr_decoys = len([
            1 for prec in precursors_in_all_runs
            if prec.find_best_peptide_pg().peptide.get_decoy()
        ])

        decoy_precursors = len([
            1 for m in multipeptides if len(m.get_selected_peakgroups()) > 0
            and m.find_best_peptide_pg().peptide.get_decoy()
        ])

        nr_peptides = len(
            set([
                prec.find_best_peptide_pg().peptide.sequence
                for prec in precursors_in_all_runs
            ]))
        nr_proteins = len(
            set([
                prec.find_best_peptide_pg().peptide.protein_name
                for prec in precursors_in_all_runs
            ]))
        nr_peptides_target = len(
            set([
                prec.find_best_peptide_pg().peptide.sequence
                for prec in precursors_in_all_runs
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_proteins_target = len(
            set([
                prec.find_best_peptide_pg().peptide.protein_name
                for prec in precursors_in_all_runs
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))

        nr_precursors_to_quant = len(
            set([
                prec for prec in precursors_quantified
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_proteins_to_quant = len(
            set([
                prec.find_best_peptide_pg().peptide.protein_name
                for prec in precursors_quantified
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))
        nr_peptides_to_quant = len(
            set([
                prec.find_best_peptide_pg().peptide.sequence
                for prec in precursors_quantified
                if not prec.find_best_peptide_pg().peptide.get_decoy()
            ]))

        nr_precursors_in_all = len([
            1 for m in multipeptides if m.all_selected() and not m.get_decoy()
        ])
        max_pg = self.get_max_pg()
        dstats = self.estimate_real_fdr(multipeptides, fraction_present)
        dstats_all = self.estimate_real_fdr(multipeptides, 1.0)
        print("=" * 75)
        print("=" * 75)
        print("Total we have", len(self.runs), "runs with", len(self.union_transition_groups_set),\
                "peakgroups quantified in at least %s run(s) above FDR %0.4f %%" % (min_nrruns, fdr_cutoff*100) + ", " + \
                "giving maximally nr peakgroups", max_pg)
        print("We were able to quantify", alignment.nr_quantified, "/", max_pg, "peakgroups of which we aligned", \
                alignment.nr_aligned, "and changed order of", alignment.nr_changed, "and could not align", alignment.could_not_align)

        print(
            "We were able to quantify %s / %s precursors in %s runs, and %s in all runs (up from %s before alignment)"
            % (nr_precursors_to_quant, nr_precursors_total, min_nrruns,
               nr_precursors_in_all, in_all_runs_wo_align))
        print(
            "We were able to quantify %s / %s peptides in %s runs, and %s in all runs (up from %s before alignment)"
            % (nr_peptides_to_quant, nr_all_peptides, min_nrruns,
               nr_peptides_target, peptides_in_all_runs_wo_align_target))
        print(
            "We were able to quantify %s / %s proteins in %s runs, and %s in all runs (up from %s before alignment)"
            % (nr_proteins_to_quant, nr_all_proteins, min_nrruns,
               nr_proteins_target, proteins_in_all_runs_wo_align_target))

        # print "quant proteins", nr_proteins_to_quant

        # Get decoy estimates
        if len(precursors_in_all_runs) > 0:
            print(
                "Decoy percentage of peakgroups that are fully aligned %0.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%"
                % (dstats_all.decoy_pcnt, dstats_all.nr_decoys,
                   dstats_all.nr_decoys + dstats_all.nr_targets,
                   dstats_all.est_real_fdr * 100))

            print(
                "Decoy percentage of peakgroups that are partially aligned %1.4f %% (%s out of %s) which roughly corresponds to a peakgroup FDR of %s %%"
                % (dstats.decoy_pcnt, dstats.nr_decoys, dstats.nr_decoys +
                   dstats.nr_targets, dstats.est_real_fdr * 100))

            print(
                "There were", decoy_precursors,
                "decoy precursors identified out of",
                nr_precursors_to_quant + decoy_precursors,
                "precursors which is %0.4f %%" %
                (decoy_precursors * 100.0 /
                 (nr_precursors_to_quant + decoy_precursors)))

        if outlier_detection is not None:
            print("Outliers:", outlier_detection.nr_outliers, "outliers in",
                  len(multipeptides), "peptides or",
                  outlier_detection.outlier_pg, "peakgroups out of",
                  alignment.nr_quantified, "changed",
                  outlier_detection.outliers_changed)

    def write_to_file(self, multipeptides, options):

        infiles = options.infiles
        outfile = options.outfile
        matrix_outfile = options.matrix_outfile
        matrix_excelfile = options.matix_excel
        yaml_outfile = options.yaml_outfile
        ids_outfile = options.ids_outfile
        fraction_needed_selected = options.min_frac_selected
        file_format = options.file_format

        selected_pgs = []
        for m in multipeptides:
            selected_peakgroups = m.get_selected_peakgroups()
            if (len(selected_peakgroups) * 1.0 / len(self.runs) <
                    fraction_needed_selected):
                continue
            for p in m.get_peptides():
                selected_pg = p.get_selected_peakgroup()
                if selected_pg is None: continue
                selected_pgs.append(selected_pg)
        selected_ids_dict = dict([(pg.get_feature_id(), pg)
                                  for pg in selected_pgs])

        if len(ids_outfile) > 0:
            fh = open(ids_outfile, "w")
            id_writer = csv.writer(fh, delimiter="\t")
            for pg in selected_pgs:
                id_writer.writerow([pg.get_feature_id()])
            fh.close()
            del id_writer

        if len(matrix_outfile) > 0:
            write_out_matrix_file(matrix_outfile, self.runs, multipeptides,
                                  fraction_needed_selected)

        if len(outfile) > 0 and options.readmethod == "full":
            # write out the complete original files
            writer = csv.writer(open(outfile, "w"), delimiter="\t")
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            header_first += ["align_runid", "align_origfilename"]
            writer.writerow(header_first)

            for m in multipeptides:
                selected_peakgroups = m.get_selected_peakgroups()
                if (len(selected_peakgroups) * 1.0 / len(self.runs) <
                        fraction_needed_selected):
                    continue
                for p in m.get_peptides():
                    selected_pg = p.get_selected_peakgroup()
                    if selected_pg is None: continue
                    row_to_write = selected_pg.row
                    row_to_write += [
                        selected_pg.run.get_id(), selected_pg.run.orig_filename
                    ]
                    writer.writerow(row_to_write)
        elif len(outfile) > 0 and file_format == "openswath":
            # only in openswath we have the ID and can go back to the original file ...
            # write out the complete original files
            writer = csv.writer(open(outfile, "w"), delimiter="\t")
            header_first = self.runs[0].header
            for run in self.runs:
                assert header_first == run.header
            header_first += ["align_runid", "align_origfilename"]
            writer.writerow(header_first)

            for file_nr, f in enumerate(infiles):
                header_dict = {}
                reader = csv.reader(open(f), delimiter="\t")
                header = next(reader)
                for i, n in enumerate(header):
                    header_dict[n] = i
                for row in reader:
                    f_id = row[header_dict["id"]]
                    if f_id in selected_ids_dict:
                        # Check the "id" and "transition_group_id" field.
                        # Unfortunately the id can be non-unique, there we check both.
                        trgroup_id = selected_ids_dict[f_id].peptide.get_id()
                        unique_peptide_id = row[
                            header_dict["transition_group_id"]]
                        if unique_peptide_id == trgroup_id:
                            row_to_write = row
                            row_to_write += [
                                selected_ids_dict[f_id].peptide.run.get_id(), f
                            ]
                            writer.writerow(row_to_write)

        # Print out trafo data
        trafo_fnames = []
        for current_run in self.runs:
            current_id = current_run.get_id()
            ref_id = self.transformation_collection.getReferenceRunID()
            filename = os.path.join(
                os.path.dirname(current_run.orig_filename),
                "transformation-%s-%s.tr" % (current_id, ref_id))
            trafo_fnames.append(filename)
            self.transformation_collection.writeTransformationData(
                filename, current_id, ref_id)
            self.transformation_collection.readTransformationData(filename)

        if len(yaml_outfile) > 0:
            import yaml
            myYaml = {
                "RawData": [],
                "PeakGroupData": [outfile],
                "ReferenceRun":
                self.transformation_collection.getReferenceRunID()
            }
            for current_run in self.runs:
                current_id = current_run.get_id()
                ref_id = self.transformation_collection.getReferenceRunID()
                filename = os.path.join(
                    os.path.dirname(current_run.orig_filename),
                    "transformation-%s-%s.tr" % (current_id, ref_id))
                dirpath = os.path.realpath(
                    os.path.dirname(current_run.orig_filename))
                this = {
                    "id": current_id,
                    "directory": dirpath,
                    "trafo_file": os.path.realpath(filename)
                }
                myYaml["RawData"].append(this)
            open(yaml_outfile,
                 'w').write(yaml.dump({"AlignedSwathRuns": myYaml}))

        return trafo_fnames