def test_reference_2(self):
    """Check reference-based integration borders for run 0_1 with reference 0_0.

    The borders returned by ``integrationBorderReference`` (option
    "median") are compared against the chained transformation
    0_2 -> 0_0 -> 0_1 of the RT values 240.0 and 260.0, and against
    hard-coded expected values.
    """

    rid = "0_1"
    self.tr_data.reference = "0_0"  # set reference run to 0_0

    # NOTE(review): the tree is computed as in the sibling tests but is not
    # passed to integrationBorderReference; kept so the same code paths run.
    spl_aligner = SplineAligner(self.initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
    tree_mapped = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id())
                   for a, b in tree]

    # Select peakgroups, compute left/right border
    selected_pg = [pg
                   for p in self.current_mpep1.getAllPeptides()
                   for pg in p.get_all_peakgroups()
                   if pg.get_cluster_id() == 1]
    border_l, border_r = integrationBorderReference(
        self.new_exp, selected_pg, rid, self.tr_data, "median")

    # Reference 0_0 means that we transformed from 0_2 to 0_0 and then to 0_1.
    # predict() takes and returns a sequence, so compare against its first
    # element (as test_reference_1 does) instead of the sequence itself.
    self.assertAlmostEqual(border_l,
                           self.tr_data.getTrafo("0_0", "0_1").predict(
                               self.tr_data.getTrafo("0_2", "0_0").predict([240.0])
                           )[0])
    self.assertAlmostEqual(border_r,
                           self.tr_data.getTrafo("0_0", "0_1").predict(
                               self.tr_data.getTrafo("0_2", "0_0").predict([260.0])
                           )[0])

    self.assertAlmostEqual(border_l, 187.18146718146681)
    self.assertAlmostEqual(border_r, 202.00772200772167)
Exemple #2
0
    def test_alignAllCluster_1(self):
        """Run alignAllCluster with RT correction and check the selection.

        The alignment is configured with max_rt_diff=6 and
        correctRT_using_pg=True. Scenario as described by the original
        author of the fixture (peakgroup RT per run, search RT in brackets):

          - Run1 : 100s     [threadRT = 100s]
          - Run2 : 112s     [threadRT = 106s]
          - Run3 : 120s     [threadRT = 118s]
          - Run4 : xxx      [threadRT = 126s]  (should be around 130s)
          - Run5 : 139s     [threadRT = 133s]
        """

        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        mst = MinimumSpanningTree(distances)

        # Translate the index-based tree edges into run-id based edges
        edges_by_id = []
        for src, dst in mst:
            edges_by_id.append(
                (self.exp.runs[src].get_id(), self.exp.runs[dst].get_id()))

        settings = dict(max_rt_diff=6,
                        fdr_cutoff=0.1,
                        aligned_fdr_cutoff=0.25,
                        correctRT_using_pg=True,
                        verbose=True)
        consensus = algo.TreeConsensusAlignment(**settings)
        consensus.alignAllCluster(self.multipeptides, edges_by_id, self.tr_data)

        precursor = self.mpep

        # Four peakgroups are expected to be selected ...
        self.assertEqual(len(precursor.get_selected_peakgroups()), 4)

        # ... namely peakgroups 1, 2, 4 and 8
        expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
        found_ids = {pg.get_feature_id()
                     for pg in precursor.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
Exemple #3
0
    def test_alignBestCluster_1(self):
        """Run the legacy best-cluster alignment without RT correction.

        With correctRT_using_pg=False the alignment thread is not corrected
        by the peakgroup that was found. Per the original author's
        description of the fixture: after the second peakgroup is found at
        112 s, the search RT for run 2 stays at 106 s, which maps to 112 s
        in run 3; the next peakgroup there is at 120 s, which is outside
        the tolerance. Hence only two peakgroups end up selected.
        """

        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        mst = MinimumSpanningTree(distances)

        # Translate the index-based tree edges into run-id based edges
        edges_by_id = []
        for src, dst in mst:
            edges_by_id.append(
                (self.exp.runs[src].get_id(), self.exp.runs[dst].get_id()))

        settings = dict(max_rt_diff=6,
                        fdr_cutoff=0.1,
                        aligned_fdr_cutoff=0.25,
                        correctRT_using_pg=False)
        consensus = algo.TreeConsensusAlignment(**settings)
        consensus.alignBestCluster_legacy(self.multipeptides, edges_by_id,
                                          self.tr_data)

        precursor = self.mpep

        # Only two peakgroups should be selected now ...
        self.assertEqual(len(precursor.get_selected_peakgroups()), 2)

        # ... namely peakgroups 1 and 2
        expected_ids = {'peakgroup2', 'peakgroup1'}
        found_ids = {pg.get_feature_id()
                     for pg in precursor.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
    def test_reference_1(self):
        """Check reference-based integration borders for run 0_0 with reference 0_2.

        The borders are computed with the options "median", "mean" and
        "max_width"; all three are expected to yield the same values, which
        for "median" are additionally compared against the direct
        transformation 0_2 -> 0_0 of 240.0 and 260.0. An unknown option
        ("dummy") must raise.
        """

        rid = "0_0"
        self.tr_data.reference = "0_2"  # set reference run to 0_2

        aligner = SplineAligner(self.initial_alignment_cutoff)
        mst = MinimumSpanningTree(
            getDistanceMatrix(self.new_exp, self.multipeptides, aligner))
        mst_by_id = []
        for src, dst in mst:
            mst_by_id.append(
                (self.new_exp.runs[src].get_id(), self.new_exp.runs[dst].get_id()))

        # Collect all peakgroups of cluster 1, then compute left/right border
        cluster_one = []
        for peptide in self.current_mpep1.getAllPeptides():
            for peakgroup in peptide.get_all_peakgroups():
                if peakgroup.get_cluster_id() == 1:
                    cluster_one.append(peakgroup)

        left, right = integrationBorderReference(
            self.new_exp, cluster_one, rid, self.tr_data, "median")

        # Direct transformation from 0_2 to 0_0
        predicted_left = self.tr_data.getTrafo("0_2", "0_0").predict([240.0])[0]
        self.assertAlmostEqual(left, predicted_left)
        predicted_right = self.tr_data.getTrafo("0_2", "0_0").predict([260.0])[0]
        self.assertAlmostEqual(right, predicted_right)

        expected_left = 77.992277992277934
        expected_right = 84.1698841699
        self.assertAlmostEqual(left, expected_left)
        self.assertAlmostEqual(right, expected_right)

        # The remaining border options must give the same result
        for border_option in ("mean", "max_width"):
            left, right = integrationBorderReference(
                self.new_exp, cluster_one, rid, self.tr_data, border_option)
            self.assertAlmostEqual(left, expected_left)
            self.assertAlmostEqual(right, expected_right)

        # An unknown border option is rejected
        with self.assertRaises(Exception):
            integrationBorderReference(
                self.new_exp, cluster_one, rid, self.tr_data, "dummy")
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force):
    """
    Minimum Spanning Tree (MST) based local alignment

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and runs a TreeConsensusAlignment over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are set on it as a side effect.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" runs alignBestCluster, "LocalMSTAllCluster" runs
            alignAllCluster; for any other value no alignment is performed.
        use_RT_correction: passed as ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.
        mst_use_ref: if true, do not compute an MST but connect every run
            directly to the best run determined by the spline aligner.
        force: passed to addDataToTrafo.

    Returns:
        The tree as a list of (run index, run index) edges.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # force reference-based alignment: determine the best run only once
        # and link every other run directly to it
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs)) if i != refrun_id]
    else:
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))

    print("Computed Tree:", tree)

    # Get alignments
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data,
                       exp.runs[edge[0]],
                       exp.runs[edge[1]],
                       spl_aligner,
                       multipeptides,
                       smoothing_method,
                       max_rt_diff,
                       force=force)

    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff,
                                fdr_cutoff,
                                aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree
    def test_shortestPath_2(self):
        """Check shortest-path integration borders for run 0_1.

        The borders returned by ``integrationBorderShortestPath`` are
        compared against the transformation 0_2 -> 0_1 of the RT values
        240.0 and 260.0, and against hard-coded expected values.
        """

        rid = "0_1"
        spl_aligner = SplineAligner(self.initial_alignment_cutoff)
        tree = MinimumSpanningTree(
            getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
        tree_mapped = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id())
                       for a, b in tree]

        # Select peakgroups, compute left/right border
        selected_pg = [pg
                       for p in self.current_mpep1.getAllPeptides()
                       for pg in p.get_all_peakgroups()
                       if pg.get_cluster_id() == 1]
        border_l, border_r = integrationBorderShortestPath(
            selected_pg, rid, self.tr_data, tree_mapped)

        # Shortest path means that we transformed from 0_2 to 0_1.
        # predict() is given a one-element sequence; compare against the first
        # element of its result (as test_reference_1 does) rather than the
        # result sequence itself.
        self.assertAlmostEqual(
            border_l, self.tr_data.getTrafo("0_2", "0_1").predict([240.0])[0])
        self.assertAlmostEqual(
            border_r, self.tr_data.getTrafo("0_2", "0_1").predict([260.0])[0])

        self.assertAlmostEqual(border_l, 168.03088803088787)
        self.assertAlmostEqual(border_r, 183.32046332046318)
Exemple #7
0
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev):
    """
    Minimum Spanning Tree (MST) based local alignment

    Computes an MST over the runs of ``exp``, one transformation per tree
    edge, and runs a TreeConsensusAlignment over ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are set on it as a side effect.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" runs alignBestCluster, "LocalMSTAllCluster" runs
            alignAllCluster; for any other value no alignment is performed.
        use_RT_correction: passed as ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.

    Returns:
        None; results are stored on ``exp`` and in the multipeptides.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(exp, multipeptides, spl_aligner))
    # Single pre-formatted argument: prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Computed Tree: %s" % (tree,))

    # Get alignments
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff,
                                fdr_cutoff,
                                aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align
Exemple #8
0
    def test_alignBestCluster_2(self):
        """Run the legacy best-cluster alignment without RT correction, wide window.

        With correctRT_using_pg=False the alignment thread is not corrected
        by the peakgroup that was found. Scenario as described by the
        original author of the fixture (peakgroup RT per run, search RT in
        brackets):

          - Run1 : 100s     [threadRT = 100s]
          - Run2 : 112s     [threadRT = 106s]
          - Run3 : 120s     [threadRT = 112s]
          - Run4 : xxx      [threadRT = 118s]
          - Run5 : 139s     [threadRT = 124s]

        The larger tolerance (max_rt_diff=15) is expected to be enough to
        still pick up all the correct peakgroups.
        """

        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        mst = MinimumSpanningTree(distances)

        # Translate the index-based tree edges into run-id based edges
        edges_by_id = []
        for src, dst in mst:
            edges_by_id.append(
                (self.exp.runs[src].get_id(), self.exp.runs[dst].get_id()))

        settings = dict(max_rt_diff=15,
                        fdr_cutoff=0.1,
                        aligned_fdr_cutoff=0.25,
                        correctRT_using_pg=False)
        consensus = algo.TreeConsensusAlignment(**settings)
        consensus.alignBestCluster_legacy(self.multipeptides, edges_by_id,
                                          self.tr_data)

        precursor = self.mpep

        # All four peakgroups should be selected ...
        self.assertEqual(len(precursor.get_selected_peakgroups()), 4)

        # ... namely peakgroups 1, 2, 4 and 8
        expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
        found_ids = {pg.get_feature_id()
                     for pg in precursor.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
Exemple #9
0
 def test_prepare(self):
     """Check the MST computed from the fixture's distance matrix."""
     aligner = SplineAligner(self.initial_alignment_cutoff)
     distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
     computed_tree = MinimumSpanningTree(distances)
     # Expected edges, given as pairs of run indices
     expected_edges = [(3, 4), (2, 3), (1, 2), (0, 1)]
     self.assertEqual(computed_tree, expected_edges)
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        options: option object; this function reads ``file_format``,
            ``disable_isotopic_grouping``, ``realign_method``,
            ``border_option`` and ``disable_isotopic_transfer`` from it.
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): "singleClosestRun" (transformations from every run to
            the run of the chromatogram file, using a single-row distance
            matrix) or "singleShortestPath" (transformations along the edges
            of a minimum spanning tree).
        is_test: forwarded unchanged to analyze_multipeptides.
    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides
    Raises:
        Exception: if ``method`` is not one of the two supported values.

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no values is currently present,
    reading the raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader([peakgroups_file],
                                          options.file_format,
                                          readmethod="complete",
                                          enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start))

    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping: first element of each value -> key.
    # ``items()`` (instead of ``iteritems()``) works on Python 2 and 3.
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start))
    assert len(swath_chromatograms.getRunIDs()) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1  # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        # One transformation from every run to the chromatogram's run
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner, multipeptides,
                                  options.realign_method, max_rt_diff,
                                  sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
        dist_matrix = None

        tree = MinimumSpanningTree(getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id())
                       for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start))

        start = time.time()
        # One transformation per edge of the minimum spanning tree
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                                  new_exp.runs[edge[1]], spl_aligner, multipeptides,
                                  options.realign_method, max_rt_diff,
                                  sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start))
    start = time.time()
    multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
        tr_data, options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start))

    return new_exp, multipeptides
Exemple #11
0
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force,
                   optimized_cython):
    """
    Minimum Spanning Tree (MST) based local alignment

    Builds a tree over the runs of ``exp``, computes one transformation per
    tree edge and runs a TreeConsensusAlignment over ``multipeptides``.
    Timing information is printed for the individual steps.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are set on it as a side effect.
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: passed to addDataToTrafo and TreeConsensusAlignment.
        rt_diff_isotope: passed to TreeConsensusAlignment.
        initial_alignment_cutoff: passed to SplineAligner.
        fdr_cutoff: passed to TreeConsensusAlignment.
        aligned_fdr_cutoff: passed to TreeConsensusAlignment.
        smoothing_method: passed to addDataToTrafo.
        method: "LocalMST" runs alignBestCluster (or alignBestCluster_legacy
            when ``optimized_cython`` is false), "LocalMSTAllCluster" runs
            alignAllCluster; for any other value no alignment is performed.
        use_RT_correction: passed as ``correctRT_using_pg``.
        stdev_max_rt_per_run: passed to TreeConsensusAlignment.
        use_local_stdev: passed to TreeConsensusAlignment.
        mst_use_ref: if true, do not compute an MST but connect every run
            directly to the best run determined by the spline aligner.
        force: passed to addDataToTrafo.
        optimized_cython: if true, use CyLightTransformationData (when it
            can be imported) and the non-legacy best-cluster alignment.

    Returns:
        The tree as a list of (run index, run index) edges.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # force reference-based alignment: determine the best run only once
        # and link every other run directly to it
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs)) if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start))

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    try:
        from msproteomicstoolslib.cython._optimized import CyLightTransformationData
        if optimized_cython:
            tr_data = CyLightTransformationData()
        else:
            tr_data = LightTransformationData()
    except ImportError:
        # Fall back to the pure Python container if the extension is missing
        print(
            "WARNING: cannot import CyLightTransformationData, will use Python version (slower)."
        )
        tr_data = LightTransformationData()

    for edge in tree:
        addDataToTrafo(tr_data,
                       exp.runs[edge[0]],
                       exp.runs[edge[1]],
                       spl_aligner,
                       multipeptides,
                       smoothing_method,
                       max_rt_diff,
                       force=force)

    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" %
          (time.time() - start))

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff,
                                fdr_cutoff,
                                aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print(
                "WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower)."
            )
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree