Ejemplo n.º 1
0
def process(args):
    """
    Invalidate weakly supported loci then recompute the samples status.

    For each sample of the input reports, every locus result stored under
    ``args.method_name`` that contains data but whose ``getCount()`` is lower
    than ``args.min_distrib_support`` is tagged as undetermined and loses its
    score. The sample status is then re-evaluated with the consensus method
    selected in ``args.consensus_method`` ("majority", "ratio" or "count"),
    the sample score is updated and all the samples are written in
    ``args.output_reports``.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    samples = MSIReport.parse(args.input_reports)
    for sample in samples:
        # Invalidate loci results without enough support
        for locus in sample.loci.values():
            locus_res = locus.results[args.method_name]
            if len(locus_res.data) == 0:
                continue  # Results without data are left untouched
            if locus_res.getCount() < args.min_distrib_support:
                locus_res.status = Status.undetermined
                locus_res.score = None
        # Re-process sample status
        consensus = args.consensus_method
        if consensus == "majority":
            sample.setStatusByMajority(args.method_name, args.min_voting_loci)
        elif consensus == "ratio":
            sample.setStatusByInstabilityRatio(
                args.method_name,
                args.min_voting_loci,
                args.instability_ratio
            )
        elif consensus == "count":
            sample.setStatusByInstabilityCount(
                args.method_name,
                args.min_voting_loci,
                args.instability_count
            )
        sample.setScore(
            args.method_name,
            args.undetermined_weight,
            args.locus_weight_is_score
        )
    # Write report
    MSIReport.write(samples, args.output_reports)
Ejemplo n.º 2
0
def process(args):
    """
    Merge mSINGS report and analysis data in one serialised MSISample.

    The first sample of the mSINGS report is completed with the records of
    the analysis file. When the position of a record is already known in the
    sample, the locus name and the data of its "MSINGS" result are updated
    (the result is created if it is missing). Otherwise the record is added
    as a new locus. The sample is then written in ``args.output_report``.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    method = "MSINGS"
    # Only the first sample of the report is used
    sample = list(MSINGSReport(args.input_report).samples.values())[0]
    with MSINGSAnalysis(args.input_analysis) as reader:
        for record in reader:
            if record.position not in sample.loci:
                sample.addLocus(record)
                continue
            locus = sample.loci[record.position]
            locus.name = record.name
            if method in locus.results:
                # Existing result: only its class tag is changed
                locus.results[method]._class = "LocusResDistrib"
            else:
                locus.results[method] = LocusResDistrib(Status.none)
            locus.results[method].data = record.results[method].data
    # Write report
    MSIReport.write([sample], args.output_report)
Ejemplo n.º 3
0
def launchAddClf(args, models_path, reports_path):
    """
    Launch second classification with a list of classifiers and update reports file.

    For each classifier of ``args.add_classifiers`` the reports file is
    rewritten in place: the data of ``args.default_classifier`` are first
    copied in a new method named after the classifier, then the
    classification is submitted with the reports file used both as input and
    output.

    A classifier named ``RandomForest:<N>`` is submitted as ``RandomForest``
    with the parameters ``{"n_estimators": <N>}`` while the method keeps the
    full ``RandomForest:<N>`` name.

    :param args: Arguments of the script.
    :type args: argparse.NameSpace
    :param models_path: Path to the models file (format: MSIReport).
    :type models_path: str
    :param reports_path: Path to the report file obtained with first classification (format: MSIReport). This file is updated in place.
    :type reports_path: str
    """
    for clf_name in args.add_classifiers:
        method_name = clf_name
        clf_params = None
        if clf_name.startswith("RandomForest:"):
            n_estimators = clf_name.split(":")[1]
            clf_name = "RandomForest"
            clf_params = '{"n_estimators": ' + n_estimators + '}'
        # Copy combination produced by MIAmS in data of the new method.
        # The reports_path parameter is used here: the previous code read an
        # undefined local name and only worked through a module-level global.
        reports = MSIReport.parse(reports_path)
        lociInitData(reports, args.default_classifier, method_name)
        MSIReport.write(reports, reports_path)
        # Submit classification (same file as input and output)
        submitAddClf(models_path, reports_path, reports_path, args,
                     method_name, clf_name, clf_params)
Ejemplo n.º 4
0
def process(args):
    """
    Add loci results coming from an annotation file in the samples of a report.

    The annotations are loaded with ``getLocusAnnotDict`` and indexed by
    sample name. Each sample of the input report receives the annotations
    registered under its own name through ``addLociResToSpl``. The updated
    samples are written in ``args.output_report``.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    annot_by_spl = getLocusAnnotDict(args.input_loci_annotations)
    samples = MSIReport.parse(args.input_report)
    for sample in samples:
        # Every sample of the report must have an entry in the annotations
        spl_annot = annot_by_spl[sample.name]
        addLociResToSpl(sample, spl_annot)
    MSIReport.write(samples, args.output_report)
Ejemplo n.º 5
0
def getHigherPeakByLocus(models, min_support_reads):
    """
    Return by locus the length of the tallest peak of each usable model.

    A model is used for a locus when its "model" result is stable and its
    number of fragments is greater than half of ``min_support_reads``. When
    several lengths share the tallest count, the last one met in
    "nb_by_length" is retained. Every locus seen in the models has an entry
    in the returned dictionary, even when no model is usable for it.

    :param models: The models in MSIReport format (status known and stored in "model" result). The value is given to MSIReport.parse.
    :type models: presumably a file path - to confirm against MSIReport.parse
    :param min_support_reads: The minimum number of reads on locus to use the stability status of the current model.
    :type min_support_reads: int
    :return: By locus the list of tallest peak lengths.
    :rtype: dict
    """
    peaks_by_locus = {}
    for sample in MSIReport.parse(models):
        for locus_id, locus in sample.loci.items():
            locus_peaks = peaks_by_locus.setdefault(locus_id, [])
            if "model" not in locus.results:
                continue
            model_res = locus.results["model"]
            is_usable = (
                model_res.status == Status.stable and
                model_res.getNbFrag() > (min_support_reads / 2)
            )
            if not is_usable:
                continue
            tallest_length = None
            tallest_count = -1
            for length, count in model_res.data["nb_by_length"].items():
                if count < tallest_count:
                    continue
                # Equal counts also replace the peak: the last tallest wins
                tallest_count = count
                tallest_length = int(length)
            locus_peaks.append(tallest_length)
    return peaks_by_locus
Ejemplo n.º 6
0
def process(args):
    """
    Tag stability for loci and sample from length distribution on loci.

    The sample is built from the list of loci metrics files: each record of
    ``args.input_combined_list`` points to a JSON file providing the lengths
    distribution and the number of pairs of one locus. The status of each
    locus is then processed with PairsCombiProcessor against the models, the
    sample status is set for the method "PairsCombi" and the sample is
    written in ``args.output_report``.

    When ``args.sample_name`` is None, the sample name is the start of the
    output file name up to the first "." without the "_report" suffix.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    # Get sample name
    name = args.sample_name
    if name is None:
        name = os.path.basename(args.output_report).split(".")[0]
        if name.endswith("_report"):
            name = name[:-len("_report")]
    sample = MSISample(name)
    # Parse lengths metrics by loci
    with HashedSVIO(args.input_combined_list) as loci_reader:
        for record in loci_reader:
            with open(record["Filepath"]) as metrics_handle:
                metrics = json.load(metrics_handle)
            locus_name = record["Locus_name"]
            locus_position = record["Locus_position"]
            nb_by_length = metrics["nb_by_length"]
            nb_pairs = metrics["nb_uncombined_pairs"] + metrics["nb_combined_pairs"]
            pairs_combi_res = {
                "_class": "LocusResPairsCombi",
                "status": Status.none,
                "data": {
                    "nb_by_length": nb_by_length,
                    "nb_pairs_aligned": nb_pairs
                }
            }
            locus = MSILocus.fromDict({
                "name": locus_name,
                "position": locus_position,
                "results": {"PairsCombi": pairs_combi_res}
            })
            sample.addLocus(locus)
    # Process status
    models = MSIReport.parse(args.input_models)
    for locus_id in sample.loci:
        PairsCombiProcessor(
            locus_id, models, [sample], args.min_support
        ).setLocusStatus()
    sample.setStatus("PairsCombi")
    # Write report
    MSIReport.write([sample], args.output_report)
Ejemplo n.º 7
0
def process(args):
    """
    Create MSISample from loci metrics.

    The loci data are loaded by ``addLociDataFromFiles`` from the files
    listed in ``args.input_loci_metrics_list`` and the resulting sample is
    written in ``args.output_report``.

    When ``args.sample_name`` is None, the sample name is the start of the
    output file name up to the first "." without the "_report" suffix.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    # Get sample name
    name = args.sample_name
    if name is None:
        name = os.path.basename(args.output_report).split(".")[0]
        if name.endswith("_report"):
            name = name[:-len("_report")]
    sample = MSISample(name)
    # Add result data by loci
    addLociDataFromFiles(
        sample,
        args.input_loci_metrics_list,
        args.method_name,
        args.result_keys,
        args.method_class_name
    )
    # Write report
    MSIReport.write([sample], args.output_report)
Ejemplo n.º 8
0
def process(args):
    """
    Create training data for MSI classifiers. These references are stored in
    MSIReport format.

    Steps: the unique method identifier is read from the loci annotations,
    the reference loci are built from the targets file, the samples of all
    the input reports are aggregated and annotated, the loci results are
    populated and pruned, the status metrics are written in
    ``args.output_info`` and the samples in ``args.output_references``.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    :raises ValueError: When the annotation file does not contain exactly one value for method_id.
    """
    # Get method name from annotations file
    method_names = set()
    for record in MSIAnnot(args.input_loci_annot):
        method_names.add(record["method_id"])
    if len(method_names) != 1:
        # The message reports the annotation file that was actually checked
        raise ValueError(
            'The annotation file must contain only one value for method_id. The file "{}" contains {}.'
            .format(args.input_loci_annot, method_names))
    result_id = list(method_names)[0]
    # Get reference loci from targets file
    ref_loci = []
    with BEDIO(args.input_targets) as FH_in:
        for record in FH_in:
            # Locus identifier is "chrom:start-end" with start shifted by -1
            # (presumably to a 0-based start - to confirm against BEDIO)
            ref_loci.append(
                MSILocus(
                    "{}:{}-{}".format(record.chrom, record.start - 1,
                                      record.end), record.name))
    # Aggregate samples
    msi_samples = getAggregatedSpl(args.inputs_report)
    # Add locus result info
    data_by_spl = getLocusAnnotDict(args.input_loci_annot)
    for curr_spl in msi_samples:
        addLociResToSpl(curr_spl, data_by_spl[curr_spl.name],
                        LocusResPairsCombi)
    # Filter locus results
    populateLoci(msi_samples, ref_loci)
    pruneResults(msi_samples, result_id, args.min_support_fragments)
    # Display metrics
    writeStatusMetrics(msi_samples, result_id, args.output_info)
    # Write output
    MSIReport.write(msi_samples, args.output_references)
Ejemplo n.º 9
0
def process(args):
    """
    Predict classification (status and score) for all samples loci.

    The loci are taken from the first sample of the references. For each of
    them, the evaluated samples with less fragments than
    ``args.min_support_fragments`` are tagged undetermined without score and
    the others are classified by a MIAmSClassifier fitted on the references.
    The status and the score of each evaluated sample are then processed
    with the consensus method selected in ``args.consensus_method`` and the
    samples are written in ``args.output_report``.

    :param args: The namespace extracted from the script arguments.
    :type args: Namespace
    """
    references = MSIReport.parse(args.input_references)
    evaluated = MSIReport.parse(args.input_evaluated)

    # Classification by locus
    for locus_id in sorted(references[0].loci.keys()):
        # Keep only the samples with enough fragments to classify the distribution
        classifiable = []
        for sample in evaluated:
            locus_res = sample.loci[locus_id].results[args.method_name]
            if locus_res.getNbFrag() < args.min_support_fragments:
                locus_res.status = Status.undetermined
                locus_res.score = None
                continue
            classifiable.append(sample)
        # Classify
        if classifiable:
            classifier = MIAmSClassifier(
                locus_id,
                args.method_name,
                "model",
                args.classifier,
                args.classifier_params
            )
            classifier.fit(references)
            classifier.set_status(classifiable)

    # Classification by sample
    for sample in evaluated:
        consensus = args.consensus_method
        if consensus == "majority":
            sample.setStatusByMajority(args.method_name, args.min_voting_loci)
        elif consensus == "ratio":
            sample.setStatusByInstabilityRatio(
                args.method_name, args.min_voting_loci, args.instability_ratio
            )
        elif consensus == "count":
            sample.setStatusByInstabilityCount(
                args.method_name, args.min_voting_loci, args.instability_count
            )
        sample.setScore(
            args.method_name,
            args.undetermined_weight,
            args.locus_weight_is_score
        )

    MSIReport.write(evaluated, args.output_report)
Ejemplo n.º 10
0
def getAggregatedSpl(in_reports):
    """
    Return one list of MSISample from several MSIReport.

    The samples are kept in the order of the reports then in the order
    returned by MSIReport.parse for each report.

    :param in_reports: Paths to the MSIReport files.
    :type in_reports: list
    :return: List of MSISample.
    :rtype: list
    """
    samples = []
    for report_path in in_reports:
        samples.extend(MSIReport.parse(report_path))
    return samples
Ejemplo n.º 11
0
     spl_name
     for idx, spl_name in enumerate(ordered_spl_names)
     if idx in test_idx
 }
 train_samples = [
     lib for lib in librairies if lib["spl_name"] in train_names
 ]  # Select all libraries corresponding to the train samples
 test_samples = [
     lib for lib in librairies if lib["spl_name"] in test_names
 ]  # Select all libraries corresponding to the test samples
 # Process learn and tag
 train(train_samples, annotation_path, design_folder, baseline_path,
       models_path, learn_log_path, args)
 predict(test_samples, design_folder, baseline_path, models_path,
         out_folder, args)
 models = MSIReport.parse(models_path)
 reports = getMSISamples(os.path.join(out_folder, "data"))
 if len(args.add_classifiers) > 0:
     log.info(
         "Process {} additionnal classifiers on dataset {}/{} ({})."
         .format(len(args.add_classifiers), dataset_id,
                 args.nb_tests - 1, dataset_md5))
     MSIReport.write(reports, out_reports_path)
     launchAddClf(args, models_path, out_reports_path)
     reports = MSIReport.parse(out_reports_path)
 # Write results and dataset
 use_header = False
 out_mode = "a"
 if dataset_id == 0:
     use_header = True
     out_mode = "w"
Ejemplo n.º 12
0
    # Get status by locus
    status_by_spl = {}
    with HashedSVIO(args.input_status, title_starter="") as FH_in:
        for record in FH_in:
            status_by_spl[record["sample"]] = {
                locus: status
                for locus, status in record.items()
                if locus not in ["sample", "sample_status"]
            }

    # Get min and max amplicon size by locus
    range_by_locus = {}
    for filename in os.listdir(args.input_data):
        filepath = os.path.join(args.input_data, filename)
        report = MSIReport.parse(filepath)
        for spl in report:
            for locus_id, locus in spl.loci.items():
                if locus_id not in range_by_locus:
                    range_by_locus[locus_id] = {"min": 300, "max": 0}
                range_by_locus[locus_id]["min"] = min(
                    locus.results[args.reference_method].getMinLength(),
                    range_by_locus[locus_id]["min"])
                range_by_locus[locus_id]["max"] = max(
                    locus.results[args.reference_method].getMaxLength(),
                    range_by_locus[locus_id]["max"])

    # Write lengths distributions
    for filename in os.listdir(args.input_data):
        filepath = os.path.join(args.input_data, filename)
        report = MSIReport.parse(filepath)