def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope, initial_alignment_cutoff,
                   fdr_cutoff, aligned_fdr_cutoff, smoothing_method, method,
                   use_RT_correction, stdev_max_rt_per_run, use_local_stdev, mst_use_ref):
    """
    Minimum Spanning Tree (MST) based local alignment.

    A tree over the runs of ``exp`` is built, a transformation is computed
    for every edge of that tree and the peakgroups are then aligned along the
    tree using ``TreeConsensusAlignment``.

    Args:
        exp: experiment object; ``exp.runs`` is indexed by the tree edges and
            ``exp.nr_ambiguous`` / ``exp.nr_multiple_align`` are set on it.
        multipeptides: passed through to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope: forwarded to ``TreeConsensusAlignment``.
        initial_alignment_cutoff: argument for the ``SplineAligner``.
        fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        aligned_fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        smoothing_method: forwarded to ``addDataToTrafo``.
        method: ``"LocalMST"`` aligns the best cluster, ``"LocalMSTAllCluster"``
            aligns all clusters; any other value performs no alignment.
        use_RT_correction: forwarded as ``correctRT_using_pg``.
        stdev_max_rt_per_run: forwarded to ``TreeConsensusAlignment``.
        use_local_stdev: forwarded to ``TreeConsensusAlignment``.
        mst_use_ref: if true, do not compute an MST but connect every run
            directly to the best run reported by the spline aligner.

    Returns:
        The tree that was used, as pairs of indices into ``exp.runs``.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if mst_use_ref:
        # Force reference-based alignment: every run gets a single edge to
        # the reference run. The best run is determined only once.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs) if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs)) if i != refrun_id]
    else:
        tree = MinimumSpanningTree(getDistanceMatrix(exp, multipeptides, spl_aligner))

    print("Computed Tree:", tree)

    # Get alignments: one transformation per tree edge
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id()) for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree
    def setUp(self):
        """Load the imputeValues test data and pre-compute all pairwise transformations.

        After this method ran, the fixture provides ``new_exp``,
        ``multipeptides``, ``tr_data`` (one transformation for every ordered
        pair of runs, including a run with itself) and the two multipeptides
        ``current_mpep1`` / ``current_mpep2``.
        """
        # Directory layout, resolved relative to this test file
        here = os.path.dirname(os.path.abspath(__file__))
        self.dirname = here
        self.topdir = os.path.join(here, "..", "..")
        self.datadir = os.path.join(self.topdir, "test", "data")
        self.scriptdir = os.path.join(self.topdir, "analysis")

        # Input files (the mzML path is not referenced again in this method)
        peakgroups_file = os.path.join(self.datadir, "imputeValues/imputeValues_5_input.csv")
        mzml_file = os.path.join(self.datadir, "imputeValues/r004_small/split_olgas_otherfile.chrom.mzML")

        # Parameters
        self.initial_alignment_cutoff = 0.0001
        fdr_cutoff_all_pg = 1.0
        max_rt_diff = 30

        # Parse the peakgroups into an experiment
        reader = SWATHScoringReader.newReader([peakgroups_file], "openswath", readmethod="complete")
        experiment = MRExperiment()
        experiment.runs = reader.parse_files()
        self.new_exp = experiment
        self.multipeptides = experiment.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)

        # Align all against all
        self.tr_data = transformations.LightTransformationData()
        aligner = SplineAligner(self.initial_alignment_cutoff)
        for source_run in self.new_exp.runs:
            for target_run in self.new_exp.runs:
                helper.addDataToTrafo(self.tr_data, source_run, target_run,
                                      aligner, self.multipeptides, "linear", max_rt_diff)

        # Select two interesting peptides by the id of their first peptide
        def first_multipeptide(pepname):
            return [m for m in self.multipeptides
                    if m.getAllPeptides()[0].get_id() == pepname][0]

        self.current_mpep1 = first_multipeptide("21517_C[160]NVVISGGTGSGK/2_run0 0 0")
        self.current_mpep2 = first_multipeptide("26471_GYEDPPAALFR/2_run0 0 0")
Exemple #3
0
    def test_alignAllCluster_1(self):
        """Test the all-cluster alignment along the MST with RT correction.

        The threading RT is corrected using the peakgroup found in each run
        (correctRT_using_pg=True) and a tight tolerance of max_rt_diff=6 is
        passed (the original description of this test quotes 7 seconds).
        Scenario as described by the original test:

          - Run1 : 100s     [threadRT = 100s]
          - Run2 : 112s     [threadRT = 106s]
          - Run3 : 120s     [threadRT = 118s]
          - Run4 : xxx      [threadRT = 126s]  (should be around 130s)
          - Run5 : 139s     [threadRT = 133s]
        """

        # Build the tree on run indices and translate it to run ids
        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        tree = MinimumSpanningTree(distances)
        tree_mapped = []
        for a, b in tree:
            tree_mapped.append((self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

        consensus = algo.TreeConsensusAlignment(
            max_rt_diff=6,
            fdr_cutoff=0.1,
            aligned_fdr_cutoff=0.25,
            correctRT_using_pg=True,
            verbose=True)
        consensus.alignAllCluster(self.multipeptides, tree_mapped, self.tr_data)

        # We should have 4 peakgroups
        prec1 = self.mpep
        self.assertEqual(len(prec1.get_selected_peakgroups()), 4)

        # Check that we have all the correct ones (1,2,4,8)
        expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
        found_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
    def test_reference_1(self):
        """Integration borders in run 0_0 when run 0_2 is the reference.

        The borders must equal the direct 0_2 -> 0_0 transformation of 240.0
        and 260.0 for all supported border options; an unknown border option
        must raise.
        """

        rid = "0_0"
        self.tr_data.reference = "0_2" # set reference run to 0_2

        # Tree on run ids (not referenced by the assertions below)
        aligner = SplineAligner(self.initial_alignment_cutoff)
        tree = MinimumSpanningTree(getDistanceMatrix(self.new_exp, self.multipeptides, aligner))
        tree_mapped = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id()) for a,b in tree]

        # Select peakgroups of cluster 1, compute left/right border
        selected_pg = []
        for pep in self.current_mpep1.getAllPeptides():
            for pg in pep.get_all_peakgroups():
                if pg.get_cluster_id() == 1:
                    selected_pg.append(pg)
        border_l, border_r = integrationBorderReference(
            self.new_exp, selected_pg, rid, self.tr_data, "median")

        # Direct transformation from 0_2 to 0_0
        for observed, source_rt in ((border_l, 240.0), (border_r, 260.0)):
            predicted = self.tr_data.getTrafo("0_2", "0_0").predict([ source_rt ])[0]
            self.assertAlmostEqual(observed, predicted)

        expected_l = 77.992277992277934
        expected_r = 84.1698841699
        self.assertAlmostEqual(border_l, expected_l)
        self.assertAlmostEqual(border_r, expected_r)

        # The other border options are expected to yield the same values
        for border_option in ("mean", "max_width"):
            border_l, border_r = integrationBorderReference(
                self.new_exp, selected_pg, rid, self.tr_data, border_option)
            self.assertAlmostEqual(border_l, expected_l)
            self.assertAlmostEqual(border_r, expected_r)

        # An unknown border option is rejected
        self.assertRaises(Exception, integrationBorderReference,
                          self.new_exp, selected_pg, rid, self.tr_data, "dummy")
    def test_reference_2(self):
        """Integration borders in run 0_1 when run 0_0 is the reference.

        The borders must equal 240.0 / 260.0 transformed first from 0_2 to the
        reference 0_0 and from there to 0_1.
        """

        rid = "0_1"
        self.tr_data.reference = "0_0" # set reference run to 0_0

        # Tree on run ids (not referenced by the assertions below)
        spl_aligner = SplineAligner(self.initial_alignment_cutoff)
        tree = MinimumSpanningTree(getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
        tree_mapped = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id()) for a,b in tree]

        # Select peakgroups, compute left/right border
        selected_pg = [pg for p in self.current_mpep1.getAllPeptides() for pg in p.get_all_peakgroups() if pg.get_cluster_id() == 1]
        border_l, border_r = integrationBorderReference(self.new_exp, selected_pg, 
            rid, self.tr_data, "median")

        # Reference 0_0 means that we transformed from 0_2 to 0_0 and then to 0_1.
        # predict() returns a sequence, so take its single element before
        # comparing against the scalar border.
        to_reference = self.tr_data.getTrafo("0_2", "0_0")
        to_target = self.tr_data.getTrafo("0_0", "0_1")
        self.assertAlmostEqual(border_l,
                               to_target.predict(to_reference.predict([ 240.0 ]))[0])
        self.assertAlmostEqual(border_r,
                               to_target.predict(to_reference.predict([ 260.0 ]))[0])

        self.assertAlmostEqual(border_l, 187.18146718146681)
        self.assertAlmostEqual(border_r, 202.00772200772167)
Exemple #6
0
    def test_alignBestCluster_1(self):
        """Test the (legacy) best cluster alignment without RT correction.

        The alignment thread is not corrected using the found peakgroup
        (correctRT_using_pg=False). As described by the original test: after
        finding the second peakgroup at 112 s, the search RT for run 2 is
        still at 106 seconds which gets mapped to 112 seconds in run 3, but
        the next peakgroup is at 120 s and thus outside of the tolerance
        (max_rt_diff=6 is passed; the original description quotes 7 seconds).
        """

        # Build the tree on run indices and translate it to run ids
        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        tree = MinimumSpanningTree(distances)
        tree_mapped = []
        for a, b in tree:
            tree_mapped.append((self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

        consensus = algo.TreeConsensusAlignment(
            max_rt_diff=6,
            fdr_cutoff=0.1,
            aligned_fdr_cutoff=0.25,
            correctRT_using_pg=False)
        consensus.alignBestCluster_legacy(self.multipeptides, tree_mapped, self.tr_data)

        # Now only 2 peakgroups should be selected
        prec1 = self.mpep
        self.assertEqual(len(prec1.get_selected_peakgroups()), 2)

        # Check that we have all the correct ones (only 1,2)
        expected_ids = {'peakgroup2', 'peakgroup1'}
        found_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
Exemple #7
0
def doReferenceAlignment(options, this_exp, multipeptides):
    """Re-align the runs against a reference run and align the peakgroups.

    Args:
        options: object carrying the attributes ``realign_method``,
            ``alignment_score``, ``tmpdir``, ``aligned_fdr_cutoff``,
            ``rt_diff_cutoff``, ``fdr_cutoff`` and ``method``.
            ``aligned_fdr_cutoff`` and ``rt_diff_cutoff`` are overwritten in
            place with the resolved numeric values:

            - ``aligned_fdr_cutoff`` is either a number or a comma separated
              list of numeric literals used as arguments for
              ``numpy.arange`` (e.g. ``"0.01,0.1,0.01"``), in which case the
              cutoff is estimated with ``estimate_aligned_fdr_cutoff``.
            - ``rt_diff_cutoff`` is either a number or one of
              ``auto_2medianstdev``, ``auto_3medianstdev``,
              ``auto_4medianstdev``, ``auto_maxstdev``.
        this_exp: experiment; unless ``realign_method`` is ``"diRT"`` its
            ``transformation_collection`` attribute is set.
        multipeptides: passed to the run alignment and the feature alignment.

    Raises:
        Exception: if ``rt_diff_cutoff`` is neither numeric nor a known
            keyword, or if a keyword is given but no transformation error is
            available (no run alignment was performed).
    """
    import ast

    # Performing re-alignment using a reference run
    trafoError = None
    if options.realign_method != "diRT":
        start = time.time()
        spl_aligner = SplineAligner(alignment_fdr_threshold = options.alignment_score, 
                                   smoother=options.realign_method,
                                   external_r_tmpdir = options.tmpdir, 
                                   experiment=this_exp)
        this_exp.transformation_collection = spl_aligner.rt_align_all_runs(this_exp, multipeptides)
        trafoError = spl_aligner.getTransformationError()
        print("Aligning the runs took %0.2fs" % (time.time() - start) )

    try:
        options.aligned_fdr_cutoff = float(options.aligned_fdr_cutoff)
    except ValueError:
        # We have a range of values to step through. Only numeric literals
        # are accepted: the arguments are parsed instead of executed, since
        # exec() cannot create a function local on Python 3 and would run
        # arbitrary option text.
        range_args = ast.literal_eval("[%s]" % options.aligned_fdr_cutoff)
        fdr_range = numpy.arange(*range_args)
        options.aligned_fdr_cutoff = estimate_aligned_fdr_cutoff(options, this_exp, multipeptides, fdr_range)

    try:
        options.rt_diff_cutoff = float(options.rt_diff_cutoff)
    except ValueError:
        # Keyword -> multiplier of the median standard deviation
        median_multiplier = {
            "auto_2medianstdev": 2,
            "auto_3medianstdev": 3,
            "auto_4medianstdev": 4,
        }
        keyword = options.rt_diff_cutoff
        if keyword not in median_multiplier and keyword != "auto_maxstdev":
            raise Exception("max_rt_diff either needs to be a value in seconds or "
                            "one of ('auto_2medianstdev', 'auto_3medianstdev', "
                            "'auto_4medianstdev', 'auto_maxstdev'). Found instead: '%s'" % keyword)
        if trafoError is None:
            # The automatic cutoffs are derived from the error of the run
            # alignment, which is only computed in the branch above.
            raise Exception("Cannot determine rt_diff_cutoff '%s' automatically: no "
                            "transformation error is available (realign_method is '%s')."
                            % (keyword, options.realign_method))

        stdevs = list(trafoError.getStdev())
        if keyword == "auto_maxstdev":
            options.rt_diff_cutoff = max(stdevs)
        else:
            options.rt_diff_cutoff = median_multiplier[keyword] * numpy.median(stdevs)

    print("Will calculate with aligned_fdr cutoff of", options.aligned_fdr_cutoff, "and an RT difference of", options.rt_diff_cutoff)
    start = time.time()
    AlignmentAlgorithm().align_features(multipeptides, 
                    options.rt_diff_cutoff, options.fdr_cutoff,
                    options.aligned_fdr_cutoff, options.method)
    print("Re-aligning peak groups took %0.2fs" % (time.time() - start) )
def doReferenceAlignment(options, this_exp, multipeptides):
    """Re-align the runs against a reference run and align the peakgroups.

    Args:
        options: object carrying the attributes ``realign_method``,
            ``alignment_score``, ``tmpdir``, ``aligned_fdr_cutoff``,
            ``rt_diff_cutoff``, ``fdr_cutoff`` and ``method``.
            ``aligned_fdr_cutoff`` and ``rt_diff_cutoff`` are overwritten in
            place with the resolved numeric values:

            - ``aligned_fdr_cutoff`` is either a number or a comma separated
              list of numeric literals used as arguments for
              ``numpy.arange`` (e.g. ``"0.01,0.1,0.01"``), in which case the
              cutoff is estimated with ``estimate_aligned_fdr_cutoff``.
            - ``rt_diff_cutoff`` is either a number or one of
              ``auto_2medianstdev``, ``auto_3medianstdev``,
              ``auto_4medianstdev``, ``auto_maxstdev``.
        this_exp: experiment; unless ``realign_method`` is ``"diRT"`` its
            ``transformation_collection`` attribute is set.
        multipeptides: passed to the run alignment and the feature alignment.

    Raises:
        Exception: if ``rt_diff_cutoff`` is neither numeric nor a known
            keyword, or if a keyword is given but no transformation error is
            available (no run alignment was performed).
    """
    import ast

    # Performing re-alignment using a reference run
    trafoError = None
    if options.realign_method != "diRT":
        start = time.time()
        spl_aligner = SplineAligner(alignment_fdr_threshold = options.alignment_score, 
                                   smoother=options.realign_method,
                                   external_r_tmpdir = options.tmpdir, 
                                   experiment=this_exp)
        this_exp.transformation_collection = spl_aligner.rt_align_all_runs(this_exp, multipeptides)
        trafoError = spl_aligner.getTransformationError()
        print("Aligning the runs took %0.2fs" % (time.time() - start) )

    try:
        options.aligned_fdr_cutoff = float(options.aligned_fdr_cutoff)
    except ValueError:
        # We have a range of values to step through. Only numeric literals
        # are accepted: the arguments are parsed instead of executed, since
        # exec() cannot create a function local on Python 3 and would run
        # arbitrary option text.
        range_args = ast.literal_eval("[%s]" % options.aligned_fdr_cutoff)
        fdr_range = numpy.arange(*range_args)
        options.aligned_fdr_cutoff = estimate_aligned_fdr_cutoff(options, this_exp, multipeptides, fdr_range)

    try:
        options.rt_diff_cutoff = float(options.rt_diff_cutoff)
    except ValueError:
        # Keyword -> multiplier of the median standard deviation
        median_multiplier = {
            "auto_2medianstdev": 2,
            "auto_3medianstdev": 3,
            "auto_4medianstdev": 4,
        }
        keyword = options.rt_diff_cutoff
        if keyword not in median_multiplier and keyword != "auto_maxstdev":
            raise Exception("max_rt_diff either needs to be a value in seconds or "
                            "one of ('auto_2medianstdev', 'auto_3medianstdev', "
                            "'auto_4medianstdev', 'auto_maxstdev'). Found instead: '%s'" % keyword)
        if trafoError is None:
            # The automatic cutoffs are derived from the error of the run
            # alignment, which is only computed in the branch above.
            raise Exception("Cannot determine rt_diff_cutoff '%s' automatically: no "
                            "transformation error is available (realign_method is '%s')."
                            % (keyword, options.realign_method))

        stdevs = list(trafoError.getStdev())
        if keyword == "auto_maxstdev":
            options.rt_diff_cutoff = max(stdevs)
        else:
            options.rt_diff_cutoff = median_multiplier[keyword] * numpy.median(stdevs)

    print("Will calculate with aligned_fdr cutoff of", options.aligned_fdr_cutoff, "and an RT difference of", options.rt_diff_cutoff)
    start = time.time()
    AlignmentAlgorithm().align_features(multipeptides, 
                    options.rt_diff_cutoff, options.fdr_cutoff,
                    options.aligned_fdr_cutoff, options.method)
    print("Re-aligning peak groups took %0.2fs" % (time.time() - start) )
    def test_shortestPath_2(self):
        """Integration borders in run 0_1 computed along the shortest tree path.

        The borders must equal 240.0 / 260.0 transformed directly from 0_2 to
        0_1.
        """

        rid = "0_1"
        spl_aligner = SplineAligner(self.initial_alignment_cutoff)
        tree = MinimumSpanningTree(getDistanceMatrix(self.new_exp, self.multipeptides, spl_aligner))
        tree_mapped = [(self.new_exp.runs[a].get_id(), self.new_exp.runs[b].get_id()) for a,b in tree]

        # Select peakgroups, compute left/right border
        selected_pg = [pg for p in self.current_mpep1.getAllPeptides() for pg in p.get_all_peakgroups() if pg.get_cluster_id() == 1]
        border_l, border_r = integrationBorderShortestPath(selected_pg, 
            rid, self.tr_data, tree_mapped)

        # Shortest path means that we transformed from 0_2 to 0_1.
        # predict() returns a sequence, so take its single element before
        # comparing against the scalar border.
        self.assertAlmostEqual(border_l, self.tr_data.getTrafo("0_2", "0_1").predict( [ 240.0 ] )[0])
        self.assertAlmostEqual(border_r, self.tr_data.getTrafo("0_2", "0_1").predict( [ 260.0 ] )[0])

        self.assertAlmostEqual(border_l, 168.03088803088787)
        self.assertAlmostEqual(border_r, 183.32046332046318)
    def test_shortestDistance_2(self):
        """Integration borders in run 0_1 computed via the shortest distance.

        The borders must equal 240.0 / 260.0 transformed directly from 0_2 to
        0_1.
        """

        rid = "0_1"
        aligner = SplineAligner(self.initial_alignment_cutoff)
        dist_matrix = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)

        # Select peakgroups of cluster 1
        selected_pg = []
        for pep in self.current_mpep1.getAllPeptides():
            for pg in pep.get_all_peakgroups():
                if pg.get_cluster_id() == 1:
                    selected_pg.append(pg)

        # Map each run id to its index in the experiment, compute left/right border
        rmap = {run.get_id(): idx for idx, run in enumerate(self.new_exp.runs)}
        border_l, border_r = integrationBorderShortestDistance(
            selected_pg, rid, self.tr_data, dist_matrix, rmap)

        # Shortest distance means that we transformed directly from 0_2 to 0_1
        for observed, source_rt in ((border_l, 240.0), (border_r, 260.0)):
            predicted = self.tr_data.getTrafo("0_2", "0_1").predict([ source_rt ])[0]
            self.assertAlmostEqual(observed, predicted)

        self.assertAlmostEqual(border_l, 168.03088803088787)
        self.assertAlmostEqual(border_r, 183.32046332)
    def test_shortestDistance_1(self):
        """Integration borders in run 0_0 computed via the shortest distance.

        The borders must equal 240.0 / 260.0 transformed directly from 0_2 to
        0_0.
        """

        rid = "0_0"
        aligner = SplineAligner(self.initial_alignment_cutoff)
        dist_matrix = getDistanceMatrix(self.new_exp, self.multipeptides, aligner)

        # Select peakgroups of cluster 1
        selected_pg = []
        for pep in self.current_mpep1.getAllPeptides():
            for pg in pep.get_all_peakgroups():
                if pg.get_cluster_id() == 1:
                    selected_pg.append(pg)

        # Map each run id to its index in the experiment, compute left/right border
        rmap = {run.get_id(): idx for idx, run in enumerate(self.new_exp.runs)}
        border_l, border_r = integrationBorderShortestDistance(
            selected_pg, rid, self.tr_data, dist_matrix, rmap)

        # Direct transformation from 0_2 to 0_0
        for observed, source_rt in ((border_l, 240.0), (border_r, 260.0)):
            predicted = self.tr_data.getTrafo("0_2", "0_0").predict([ source_rt ])[0]
            self.assertAlmostEqual(observed, predicted)

        self.assertAlmostEqual(border_l, 77.992277992277934)
        self.assertAlmostEqual(border_r, 84.1698841699)
Exemple #12
0
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev):
    """
    Minimum Spanning Tree (MST) based local alignment.

    A minimum spanning tree over the runs of ``exp`` is computed, a
    transformation is computed for every edge of that tree and the peakgroups
    are then aligned along the tree using ``TreeConsensusAlignment``.

    Args:
        exp: experiment object; ``exp.runs`` is indexed by the tree edges and
            ``exp.nr_ambiguous`` / ``exp.nr_multiple_align`` are set on it.
        multipeptides: passed through to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope: forwarded to ``TreeConsensusAlignment``.
        initial_alignment_cutoff: argument for the ``SplineAligner``.
        fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        aligned_fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        smoothing_method: forwarded to ``addDataToTrafo``.
        method: ``"LocalMST"`` aligns the best cluster, ``"LocalMSTAllCluster"``
            aligns all clusters; any other value performs no alignment.
        use_RT_correction: forwarded as ``correctRT_using_pg``.
        stdev_max_rt_per_run: forwarded to ``TreeConsensusAlignment``.
        use_local_stdev: forwarded to ``TreeConsensusAlignment``.

    Returns:
        The tree that was used, as pairs of indices into ``exp.runs``.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff)
    tree = MinimumSpanningTree(
        getDistanceMatrix(exp, multipeptides, spl_aligner))
    # Single pre-formatted argument: prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print("Computed Tree: %s" % (tree,))

    # Get alignments: one transformation per tree edge
    tr_data = LightTransformationData()
    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff,
                                fdr_cutoff,
                                aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        al.alignBestCluster(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree
Exemple #13
0
    def test_alignBestCluster_2(self):
        """Test the (legacy) best cluster alignment with a wide RT tolerance.

        The alignment thread is not corrected using the found peakgroup
        (correctRT_using_pg=False). Scenario as described by the original
        test:

          - Run1 : 100s     [threadRT = 100s]
          - Run2 : 112s     [threadRT = 106s]
          - Run3 : 120s     [threadRT = 112s]
          - Run4 : xxx      [threadRT = 118s]
          - Run5 : 139s     [threadRT = 124s]

        By using a larger tolerance of 15s, all the correct peakgroups are
        still expected to be found.
        """

        # Build the tree on run indices and translate it to run ids
        aligner = SplineAligner(self.initial_alignment_cutoff)
        distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
        tree = MinimumSpanningTree(distances)
        tree_mapped = []
        for a, b in tree:
            tree_mapped.append((self.exp.runs[a].get_id(), self.exp.runs[b].get_id()))

        consensus = algo.TreeConsensusAlignment(
            max_rt_diff=15,
            fdr_cutoff=0.1,
            aligned_fdr_cutoff=0.25,
            correctRT_using_pg=False)
        consensus.alignBestCluster_legacy(self.multipeptides, tree_mapped, self.tr_data)

        # With the wider tolerance, 4 peakgroups should be selected
        prec1 = self.mpep
        self.assertEqual(len(prec1.get_selected_peakgroups()), 4)

        # Check that we have all the correct ones (1,2,4,8)
        expected_ids = {'peakgroup8', 'peakgroup2', 'peakgroup4', 'peakgroup1'}
        found_ids = {pg.get_feature_id() for pg in prec1.get_selected_peakgroups()}
        self.assertEqual(expected_ids, found_ids)
Exemple #14
0
def main(options):
    """Align peakgroups across runs based on their chromatograms and write a matrix.

    Steps performed:
      1. Read the feature files and map them to the chromatogram files.
      2. Build the experiment and its multipeptides.
      3. Pick a reference run for every precursor.
      4. For every precursor, align the smoothed XICs of each other run to
         the reference run and update the retention times of that run.
      5. Align the features, print statistics and write the output matrix.

    Args:
        options: object providing ``feature_files``, ``chromatogram_files``,
            ``aligned_fdr_cutoff``, ``fdr_cutoff``, ``method``,
            ``matrix_outfile``, ``min_frac_selected`` and
            ``matrix_output_method``.
    """
    feature_files = options.feature_files
    chrom_files = options.chromatogram_files

    # Read the peakgroups
    reader = SWATHScoringReader.newReader(
        feature_files,
        'openswath',
        "minimal",
        ReadFilter(),
        enable_isotopic_grouping=False,
        read_cluster_id=False)
    reader.map_infiles_chromfiles(chrom_files)
    runs = reader.parse_files()
    MStoFeature = MSfileRunMapping(chrom_files, runs)
    precursor_to_transitionID, precursor_sequence = getPrecursorTransitionMapping(
        feature_files[0])
    MZs = mzml_accessors(runs, MStoFeature)
    MZs.set_precursor_to_chromID(precursor_to_transitionID)

    # Build the experiment
    this_exp = Experiment()
    this_exp.set_runs(runs)
    start = time.time()
    fdr_cutoff = options.aligned_fdr_cutoff
    multipeptides = this_exp.get_all_multipeptides(
        fdr_cutoff, verbose=False, verbosity=10)
    print("Mapping the precursors took %0.2fs" % (time.time() - start))

    # Reference based alignment: one reference run per precursor
    reference_picker = referenceForPrecursor(
        refType="precursor_specific",
        alignment_fdr_threshold=options.fdr_cutoff)
    reference_run = reference_picker.get_reference_for_precursors(multipeptides)
    # Pairwise global alignment
    spl_aligner = SplineAligner(alignment_fdr_threshold=fdr_cutoff,
                                smoother="lowess",
                                experiment=this_exp)
    tr_data = initialize_transformation()
    # Initialize XIC smoothing function
    chrom_smoother = chromSmoother(smoother="sgolay", kernelLen=11, polyOrd=4)

    # Settings shared by every pairwise XIC alignment
    xic_alignment_settings = dict(
        RSEdistFactor=4,
        alignType=b"hybrid",
        normalization=b"mean",
        simType=b"dotProductMasked",
        goFactor=0.125,
        geFactor=40,
        cosAngleThresh=0.3,
        OverlapAlignment=True,
        dotProdThresh=0.96,
        gapQuantile=0.5,
        hardConstrain=False,
        samples4gradient=100)

    # Calculate the aligned retention time for each precursor across all runs
    for prec_id in list(precursor_to_transitionID.keys()):
        refrun = reference_run.get(prec_id)
        if not refrun:
            print(
                "The precursor {} doesn't have any associated reference run. Skipping!"
                .format(prec_id))
            continue
        other_runs = list(set(runs) - {refrun})
        # Extract XICs from reference run and smooth it.
        XICs_ref = MZs.extractXIC_group(refrun, prec_id)
        if not XICs_ref:
            continue
        XICs_ref_sm = chrom_smoother.smoothXICs(XICs_ref)
        # For each precursor, we need peptide_group_label and trgr_id
        peptide_group_label = precursor_sequence[prec_id][0]
        # Iterate through all other runs and align them to the reference run
        for eXprun in other_runs:
            # Extract XICs from experiment run and smooth it.
            XICs_eXp = MZs.extractXIC_group(eXprun, prec_id)
            if not XICs_eXp:
                continue
            XICs_eXp_sm = chrom_smoother.smoothXICs(XICs_eXp)
            t_ref_aligned, t_eXp_aligned = RTofAlignedXICs(
                XICs_ref_sm,
                XICs_eXp_sm,
                tr_data,
                spl_aligner,
                eXprun,
                refrun,
                multipeptides,
                **xic_alignment_settings)
            # Update retention time of all peak-groups to reference peak-group
            updateRetentionTime(eXprun, peptide_group_label, prec_id,
                                t_ref_aligned, t_eXp_aligned)

    # Select the aligned peakgroups and report
    AlignmentAlgorithm().align_features(
        multipeptides,
        rt_diff_cutoff=40,
        fdr_cutoff=0.01,
        aligned_fdr_cutoff=options.aligned_fdr_cutoff,
        method=options.method)
    this_exp.print_stats(multipeptides, 0.05, 0.1, 1)
    write_out_matrix_file(options.matrix_outfile, runs, multipeptides,
                          options.min_frac_selected,
                          options.matrix_output_method, True, 0.05,
                          precursor_sequence)
Exemple #15
0
 def test_prepare(self):
     """The MST computed for the test experiment has the expected edges."""
     aligner = SplineAligner(self.initial_alignment_cutoff)
     distances = algo.getDistanceMatrix(self.exp, self.multipeptides, aligner)
     tree = MinimumSpanningTree(distances)
     # Edges are pairs of run indices
     expected_edges = [(3, 4), (2, 3), (1, 2), (0, 1)]
     self.assertEqual(tree, expected_edges)
Exemple #16
0
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope,
                   initial_alignment_cutoff, fdr_cutoff, aligned_fdr_cutoff,
                   smoothing_method, method, use_RT_correction,
                   stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force,
                   optimized_cython):
    """
    Minimum Spanning Tree (MST) based local alignment.

    A tree over the runs of ``exp`` is built, a transformation is computed
    for every edge of that tree and the peakgroups are then aligned along the
    tree using ``TreeConsensusAlignment``.

    Args:
        exp: experiment object; ``exp.runs`` is indexed by the tree edges and
            ``exp.nr_ambiguous`` / ``exp.nr_multiple_align`` are set on it.
        multipeptides: passed through to the distance matrix, the
            transformation computation and the alignment.
        max_rt_diff: forwarded to ``addDataToTrafo`` and
            ``TreeConsensusAlignment``.
        rt_diff_isotope: forwarded to ``TreeConsensusAlignment``.
        initial_alignment_cutoff: argument for the ``SplineAligner``.
        fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        aligned_fdr_cutoff: forwarded to ``TreeConsensusAlignment``.
        smoothing_method: forwarded to ``addDataToTrafo``.
        method: ``"LocalMST"`` aligns the best cluster, ``"LocalMSTAllCluster"``
            aligns all clusters; any other value performs no alignment.
        use_RT_correction: forwarded as ``correctRT_using_pg``.
        stdev_max_rt_per_run: forwarded to ``TreeConsensusAlignment``.
        use_local_stdev: forwarded to ``TreeConsensusAlignment``.
        mst_use_ref: if true, do not compute an MST but connect every run
            directly to the best run reported by the spline aligner.
        force: forwarded to ``addDataToTrafo``.
        optimized_cython: if true, use ``CyLightTransformationData`` (when it
            can be imported) and ``alignBestCluster``; otherwise use the
            Python transformation data and ``alignBestCluster_legacy``.

    Returns:
        The tree that was used, as pairs of indices into ``exp.runs``.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # Force reference-based alignment: every run gets a single edge to
        # the reference run. The best run is determined only once.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs)
                     if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs)) if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(
            getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start))

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    tr_data = None
    if optimized_cython:
        # Only attempt the import when the optimized container was requested,
        # so that no import warning is printed for a pure Python run.
        try:
            from msproteomicstoolslib.cython._optimized import CyLightTransformationData
        except ImportError:
            print(
                "WARNING: cannot import CyLightTransformationData, will use Python version (slower)."
            )
        else:
            tr_data = CyLightTransformationData()
    if tr_data is None:
        tr_data = LightTransformationData()

    for edge in tree:
        addDataToTrafo(tr_data,
                       exp.runs[edge[0]],
                       exp.runs[edge[1]],
                       spl_aligner,
                       multipeptides,
                       smoothing_method,
                       max_rt_diff,
                       force=force)

    # The alignment works on run ids rather than on indices into exp.runs
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id())
                   for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" %
          (time.time() - start))

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff,
                                fdr_cutoff,
                                aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print(
                "WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower)."
            )
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        options: options object; this function reads its ``file_format``,
            ``disable_isotopic_grouping``, ``realign_method``,
            ``border_option`` and ``disable_isotopic_transfer`` attributes
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
        method(str): "singleClosestRun" (align every run directly against the
            run found in the chromatogram file) or "singleShortestPath"
            (align along the edges of a minimum spanning tree)
        is_test: passed through unchanged to analyze_multipeptides
    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides
    Raises:
        Exception: if ``method`` is neither of the two supported values
        AssertionError: if the chromatogram file does not yield exactly one run id

    This function will read the csv file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no values is currently present,
    reading the raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader(
        [peakgroups_file],
        options.file_format,
        readmethod="complete",
        enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start) )

    # inferMapping fills the dictionaries passed to it; only `mapping` is
    # consumed afterwards.
    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([mzML_file], [peakgroups_file], mapping, precursors_mapping,
                 sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping, keyed on the first element of each value.
    # .items() and print(...) work on both Python 2 and Python 3.
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Do only a single run : read only one single file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([mzML_file], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start) )
    run_ids = swath_chromatograms.getRunIDs()
    assert len(run_ids) == 1
    rid = run_ids[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001
    max_rt_diff = 30
    sd_data = -1 # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    # Exactly one of tree_mapped / dist_matrix is populated, depending on the
    # method; the other is handed to analyze_multipeptides as None.
    if method == "singleClosestRun":
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner,
                                        singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start) )

        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner, multipeptides,
                options.realign_method, max_rt_diff, sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
        dist_matrix = None

        tree = MinimumSpanningTree(getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id()) for a, b in tree]
        print("Distance matrix took %ss" % (time.time() - start) )

        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]],
                new_exp.runs[edge[1]], spl_aligner, multipeptides,
                options.realign_method, max_rt_diff, sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start) )
    start = time.time()
    multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
        tr_data, options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start) )

    return new_exp, multipeptides
# Example #18
# 0
    def setUp(self):
        """Build the five-run alignment fixture shared by the tests.

        Creates five MockRun objects ("0_1" .. "0_5"), three multipeptides
        that each hold one high-confidence anchor precursor per run, and a
        fourth multipeptide ("precursor_1") with zero, one or two candidate
        peakgroups per run. Afterwards every run is aligned against every run
        (itself included) with the "linear" smoothing method and a maximal RT
        difference of 30.

        Sets ``self.initial_alignment_cutoff``, ``self.mpep``, ``self.exp``,
        ``self.multipeptides`` and ``self.tr_data``.
        """

        import msproteomicstoolslib.data_structures.Precursor as precursor
        import msproteomicstoolslib.data_structures.PrecursorGroup as precursor_group
        import msproteomicstoolslib.format.TransformationCollection as transformations
        from msproteomicstoolslib.algorithms.alignment.SplineAligner import SplineAligner
        import msproteomicstoolslib.algorithms.alignment.AlignmentHelper as helper

        def insert_precursor(mpep, run, name, peakgroups):
            # Create one precursor carrying `peakgroups`, wrap it in its own
            # precursor group and register that group under the run's id.
            prec = precursor.Precursor(name, run)
            for pg_tuple in peakgroups:
                prec.add_peakgroup_tpl(pg_tuple, name, -1)
            group = precursor_group.PrecursorGroup(prec.get_id(), run)
            group.addPrecursor(prec)
            mpep.insert(run.get_id(), group)

        # Layout of a peakgroup tuple:
        # 0. id
        # 1. quality score (FDR)
        # 2. retention time (normalized)
        # 3. intensity

        mpeps = [Multipeptide() for _ in range(3)]
        for mpep in mpeps:
            mpep.set_nr_runs(5)

        # Parameters
        self.initial_alignment_cutoff = 0.001

        runs = [MockRun("0_%s" % (i + 1)) for i in range(5)]

        # (name, RT in the first run, RT increment per subsequent run).
        # The first two are the alignment peptides, the third is the noise
        # peptide; mpeps[k] collects anchors[k] across all runs.
        anchors = [
            ("anchorpeptide_1", 100, 10),
            ("anchorpeptide_2", 1000, 100),
            ("anchorpeptide_3", 500, 40),
        ]
        ids = 0  # running counter so every anchor peakgroup gets a unique id
        for i, run in enumerate(runs):
            for mpep, (name, rt_first, rt_step) in zip(mpeps, anchors):
                pg_tuple = ("id_%s" % ids, 0.0001, rt_first + i * rt_step, 10000)
                insert_precursor(mpep, run, name, [pg_tuple])
                ids += 1

        m = Multipeptide()
        m.set_nr_runs(5)

        # Candidate peakgroups of "precursor_1", one list per run. The
        # "[correct]" tags are the fixture author's annotation of the
        # candidate the alignment is meant to select.
        precursor_1_peakgroups = [
            # Run 1: peakgroup1 (RT 100) [correct]
            [("peakgroup1", 0.01, 100, 10000)],
            # Run 2: peakgroup2 (RT 112) [correct], peakgroup3 (RT 130)
            [("peakgroup2", 0.2, 112, 10000),
             ("peakgroup3", 0.18, 130, 10000)],
            # Run 3: peakgroup4 (RT 120) [correct], peakgroup5 (RT 130)
            [("peakgroup4", 0.2, 120, 10000),
             ("peakgroup5", 0.17, 130, 10000)],
            # Run 4: the [correct] peakgroup is missing; only peakgroup7
            # (RT 145) is present
            [("peakgroup7", 0.18, 145, 10000)],
            # Run 5: peakgroup8 (RT 139) [correct]; the second candidate is
            # missing
            [("peakgroup8", 0.1, 139, 10000)],
        ]
        for run, peakgroups in zip(runs, precursor_1_peakgroups):
            insert_precursor(m, run, "precursor_1", peakgroups)

        self.mpep = m
        self.exp = Dummy()
        self.exp.runs = runs

        mpeps.append(m)
        self.multipeptides = mpeps

        # Align all against all
        self.tr_data = transformations.LightTransformationData()
        spl_aligner = SplineAligner(self.initial_alignment_cutoff)
        for run_0 in self.exp.runs:
            for run_1 in self.exp.runs:
                helper.addDataToTrafo(self.tr_data, run_0, run_1, spl_aligner,
                                      self.multipeptides, "linear", 30)
def doMSTAlignment(exp, multipeptides, max_rt_diff, rt_diff_isotope, initial_alignment_cutoff,
                   fdr_cutoff, aligned_fdr_cutoff, smoothing_method, method,
                   use_RT_correction, stdev_max_rt_per_run, use_local_stdev, mst_use_ref, force, optimized_cython):
    """
    Minimum Spanning Tree (MST) based local alignment

    Builds a tree over the runs of ``exp``, computes a pairwise
    transformation for every tree edge and then runs the tree consensus
    alignment on ``multipeptides``.

    Args:
        exp: experiment whose ``runs`` are aligned; ``nr_ambiguous`` and
            ``nr_multiple_align`` are stored on it before returning
        multipeptides: multipeptides handed to the distance matrix, the
            transformation computation and the consensus alignment
        max_rt_diff: forwarded to addDataToTrafo and TreeConsensusAlignment
        rt_diff_isotope: forwarded to TreeConsensusAlignment
        initial_alignment_cutoff: cutoff used to construct the SplineAligner
        fdr_cutoff: forwarded to TreeConsensusAlignment
        aligned_fdr_cutoff: forwarded to TreeConsensusAlignment
        smoothing_method: forwarded to addDataToTrafo
        method(str): "LocalMST" or "LocalMSTAllCluster"; any other value
            skips the alignment step without raising
        use_RT_correction: forwarded as ``correctRT_using_pg``
        stdev_max_rt_per_run: forwarded to TreeConsensusAlignment
        use_local_stdev: forwarded to TreeConsensusAlignment
        mst_use_ref(bool): if true, connect every run directly to the best
            run instead of computing a minimum spanning tree
        force: forwarded to addDataToTrafo
        optimized_cython(bool): use the Cython transformation container (if
            importable) and the non-legacy "LocalMST" implementation

    Returns:
        The tree as a collection of ``(i, j)`` index pairs into ``exp.runs``.
    """

    spl_aligner = SplineAligner(initial_alignment_cutoff, experiment=exp)

    if mst_use_ref:
        # force reference-based alignment: a star with the best run at its
        # centre. The best run is determined once; the original code ran the
        # search twice and discarded the first result.
        ref = spl_aligner._determine_best_run(exp).get_id()
        refrun_id = [i for i, run in enumerate(exp.runs) if run.get_id() == ref][0]
        tree = [(i, refrun_id) for i in range(len(exp.runs)) if i != refrun_id]
    else:
        start = time.time()
        tree = MinimumSpanningTree(getDistanceMatrix(exp, multipeptides, spl_aligner))
        print("Computing tree took %0.2fs" % (time.time() - start) )

    print("Computed Tree:", tree)

    # Get alignments
    start = time.time()
    try:
        from msproteomicstoolslib.cython._optimized import CyLightTransformationData
    except ImportError:
        print("WARNING: cannot import CyLightTransformationData, will use Python version (slower).")
        tr_data = LightTransformationData()
    else:
        if optimized_cython:
            tr_data = CyLightTransformationData()
        else:
            tr_data = LightTransformationData()

    for edge in tree:
        addDataToTrafo(tr_data, exp.runs[edge[0]], exp.runs[edge[1]],
                       spl_aligner, multipeptides, smoothing_method,
                       max_rt_diff, force=force)

    # The consensus alignment addresses runs by id rather than by index.
    tree_mapped = [(exp.runs[a].get_id(), exp.runs[b].get_id()) for a, b in tree]
    print("Computing transformations for all edges took %0.2fs" % (time.time() - start) )

    # Perform work
    al = TreeConsensusAlignment(max_rt_diff, fdr_cutoff, aligned_fdr_cutoff,
                                rt_diff_isotope=rt_diff_isotope,
                                correctRT_using_pg=use_RT_correction,
                                stdev_max_rt_per_run=stdev_max_rt_per_run,
                                use_local_stdev=use_local_stdev)

    if method == "LocalMST":
        if optimized_cython:
            al.alignBestCluster(multipeptides, tree_mapped, tr_data)
        else:
            print("WARNING: cannot utilize optimized MST alignment (needs readmethod = cminimal), will use Python version (slower).")
            al.alignBestCluster_legacy(multipeptides, tree_mapped, tr_data)
    elif method == "LocalMSTAllCluster":
        al.alignAllCluster(multipeptides, tree_mapped, tr_data)

    # Store number of ambiguous cases (e.g. where more than one peakgroup below
    # the strict quality cutoff was found in the RT window) and the number of
    # cases where multiple possibilities were found.
    exp.nr_ambiguous = al.nr_ambiguous
    exp.nr_multiple_align = al.nr_multiple_align

    return tree