def runImputeValues(options, peakgroups_file, trafo_fnames, is_test):
    """Impute values across chromatograms

    Args:
        peakgroups_file(filename): CSV file containing all peakgroups
        trafo_fnames(list(filename)): A list of .tr filenames (the
            corresponding chromatogram mzML files are assumed to reside in
            the same directory)
    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    This function will read the CSV file with all peakgroups as well as the
    transformation files (.tr) and the corresponding raw chromatograms, which
    need to be in the same folder. It will then try to impute missing values
    for those peakgroups where no value is currently present by reading the
    raw chromatograms.
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader([peakgroups_file],
                                          options.file_format,
                                          readmethod="complete",
                                          enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
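    # Group the peakgroups of the same analyte across runs into multipeptides
    # (no FDR filtering is applied here, see fdr_cutoff_all_pg above)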
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start) )

    start = time.time()
    transformation_collection_ = transformations.TransformationCollection()
    for filename in trafo_fnames:
        transformation_collection_.readTransformationData(filename)

    # Read the datapoints and perform the smoothing
    print("Reading the trafo file took %ss" % (time.time() - start) )
    start = time.time()
    transformation_collection_.initialize_from_data(reverse=True, smoother=options.realign_method)
    print("Initializing the trafo file took %ss" % (time.time() - start) )

    if options.do_single_run and not options.dry_run:
        # Process only a single run: read only one file
        start = time.time()
        swath_chromatograms = SwathChromatogramCollection()
        swath_chromatograms.parseFromTrafoFiles([ options.do_single_run ])
        print("Reading the chromatogram files took %ss" % (time.time() - start) )
        assert len(swath_chromatograms.getRunIDs() ) == 1
        rid = swath_chromatograms.getRunIDs()[0]
        start = time.time()
        multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
            transformation_collection_, options.border_option, 
            onlyExtractFromRun=rid, disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
        print("Analyzing the runs took %ss" % (time.time() - start) )
        return new_exp, multipeptides

    swath_chromatograms = SwathChromatogramCollection()
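    # Read the chromatogram mzML files that accompany each .tr file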
    swath_chromatograms.parseFromTrafoFiles(trafo_fnames)
    print("Reading the chromatogram files took %ss" % (time.time() - start) )

    if options.dry_run:
        print("Dry Run only")
        print("Found multipeptides: %s" % len(multipeptides))
        print("Found swath chromatograms: %s" % swath_chromatograms)
        return [], []

    start = time.time()
    if options.cache_in_memory:
        run_ids = [r.get_id() for r in new_exp.runs]
        for rid in run_ids:
            # Create the cache for run "rid" and then only extract peakgroups from this run
            swath_chromatograms.createRunCache(rid)
            multipeptides = analyze_multipeptides(new_exp, multipeptides, 
                swath_chromatograms, transformation_collection_, options.border_option, 
                onlyExtractFromRun=rid, disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
    else:
        multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
            transformation_collection_, options.border_option, disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start) )
    return new_exp, multipeptides
def runSingleFileImputation(options, peakgroups_file, mzML_file, method, is_test):
    """Impute values across chromatograms

    Args:
        peakgroups_file(filename): CSV file containing all peakgroups
        mzML_file(filename): mzML file containing chromatograms
    Returns:
        A tuple of:
            new_exp(AlignmentExperiment): experiment containing the aligned peakgroups
            multipeptides(list(AlignmentHelper.Multipeptide)): list of multipeptides

    This function will read the CSV file with all peakgroups as well as the
    provided chromatogram file (.chrom.mzML). It will then try to impute
    missing values for those peakgroups where no value is currently present
    by reading the raw chromatograms.
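
    Example (a sketch; the filenames are placeholders, method must be
    "singleClosestRun" or "singleShortestPath", and the options object is
    expected to provide file_format, disable_isotopic_grouping,
    realign_method, border_option and disable_isotopic_transfer):

        new_exp, multipeptides = runSingleFileImputation(
            options, "aligned_peakgroups.csv", "run0.chrom.mzML",
            "singleShortestPath", is_test=False)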
    """

    # We do not want to exclude any peakgroups for noiseIntegration (we assume
    # that alignment has already happened)
    fdr_cutoff_all_pg = 1.0

    start = time.time()
    reader = SWATHScoringReader.newReader([peakgroups_file],
                                          options.file_format,
                                          readmethod="complete",
                                          enable_isotopic_grouping=not options.disable_isotopic_grouping)
    new_exp = Experiment()
    new_exp.runs = reader.parse_files()
    multipeptides = new_exp.get_all_multipeptides(fdr_cutoff_all_pg, verbose=False)
    print("Parsing the peakgroups file took %ss" % (time.time() - start) )

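    # Infer which chromatograms in the mzML file belong to which run in the
    # peakgroups file; inferMapping fills the four dictionaries in place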
    mapping = {}
    precursors_mapping = {}
    sequences_mapping = {}
    protein_mapping = {}
    inferMapping([ mzML_file ], [ peakgroups_file ], mapping, precursors_mapping, sequences_mapping, protein_mapping, verbose=False)
    # Invert the mapping (run id -> chromatogram files) so that a chromatogram
    # file name can be looked up to yield its run id
    mapping_inv = {v[0]: k for k, v in mapping.items()}
    if VERBOSE:
        print(mapping)

    # Process only a single run: read only this one chromatogram file
    start = time.time()
    swath_chromatograms = SwathChromatogramCollection()
    swath_chromatograms.parseFromMzML([ mzML_file ], mapping_inv)
    print("Reading the chromatogram files took %ss" % (time.time() - start) )
    assert len(swath_chromatograms.getRunIDs() ) == 1
    rid = swath_chromatograms.getRunIDs()[0]

    start = time.time()
    initial_alignment_cutoff = 0.0001  # FDR cutoff used for the initial spline alignment
    max_rt_diff = 30  # maximal retention-time difference allowed when adding alignment data
    sd_data = -1 # We do not use the standard deviation data in this algorithm
    tr_data = transformations.LightTransformationData()
    spl_aligner = SplineAligner(initial_alignment_cutoff)

    if method == "singleClosestRun":
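        # Align every run directly against the target run; the (single-row)
        # distance matrix is used downstream to pick the closest source run
        # for each transfer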
        tree_mapped = None

        run_1 = [r for r in new_exp.runs if r.get_id() == rid][0]
        dist_matrix = getDistanceMatrix(new_exp, multipeptides, spl_aligner, singleRowId=run_1.get_id())
        print("Distance matrix took %ss" % (time.time() - start) )

        start = time.time()
        for run_0 in new_exp.runs:
            helper.addDataToTrafo(tr_data, run_0, run_1, spl_aligner, multipeptides,
                options.realign_method, max_rt_diff, sd_max_data_length=sd_data)

    elif method == "singleShortestPath":
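        # Connect all runs through a minimum spanning tree of the distance
        # matrix; values are transferred along the tree path that links a
        # source run to the target run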
        dist_matrix = None

        tree = MinimumSpanningTree(getDistanceMatrix(new_exp, multipeptides, spl_aligner))
        tree_mapped = [(new_exp.runs[a].get_id(), new_exp.runs[b].get_id()) for a,b in tree]
        print("Distance matrix took %ss" % (time.time() - start) )

        start = time.time()
        for edge in tree:
            helper.addDataToTrafo(tr_data, new_exp.runs[edge[0]], 
                new_exp.runs[edge[1]], spl_aligner, multipeptides, 
                options.realign_method, max_rt_diff, sd_max_data_length=sd_data)

    else:
        raise Exception("Unknown method: " + method)

    print("Alignment took %ss" % (time.time() - start) )
    start = time.time()
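    # The actual imputation step: read the raw chromatograms and integrate
    # the signal at the inferred positions, using the MST (tree) or the
    # distance matrix to choose the source run for each transfer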
    multipeptides = analyze_multipeptides(new_exp, multipeptides, swath_chromatograms,
        tr_data, options.border_option, rid, tree=tree_mapped, mat=dist_matrix,
        disable_isotopic_transfer=options.disable_isotopic_transfer, is_test=is_test)
    print("Analyzing the runs took %ss" % (time.time() - start) )

    return new_exp, multipeptides