Exemple #1
0
    def test_mixed_MTBC_NTM(self):
        """Check that MTBC and NTM are both reported for a mixed sample.

        The predictor is built with empty coverage inputs and its
        ``out_json`` is filled in by hand with one NTM and one MTBC
        phylo-group entry; both presence checks must pass and exactly two
        phylo groups must be reported as present.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)
        ntm_covg = {"percent_coverage": 58.71542975006994, "median_depth": 36}
        mtbc_covg = {"percent_coverage": 62.81850563578579, "median_depth": 2}
        predictor.out_json["phylogenetics"] = {
            "phylo_group": {
                "Non_tuberculosis_mycobacterium_complex": ntm_covg,
                "Mycobacterium_tuberculosis_complex": mtbc_covg,
            },
        }

        assert predictor.is_mtbc_present()
        assert predictor.is_ntm_present()

        phylo_groups = predictor.out_json["phylogenetics"]["phylo_group"]
        present_groups = predictor._get_present_phylo_groups(phylo_groups)
        assert len(present_groups) == 2
Exemple #2
0
    def test_mixed_chimera(self):
        """Check ``choose_best`` on two high-coverage species plus a weak one.

        Mycobacterium_chimaera and Mycobacterium_intracellulare (both ~99%
        coverage) are expected in the chosen species, while
        Mycobacterium_bovis (~10% coverage) is expected to be left out.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)

        sub_complexes = {
            "Mycobacterium_avium_complex": {
                "percent_coverage": 98.346,
                "median_depth": 54.0
            }
        }
        phylo_groups = {
            "Non_tuberculosis_mycobacterium_complex": {
                "percent_coverage": 82.846,
                "median_depth": 49
            }
        }
        species = {
            "Mycobacterium_chimaera": {
                "percent_coverage": 99.162,
                "median_depth": 39
            },
            "Mycobacterium_intracellulare": {
                "percent_coverage": 98.662,
                "median_depth": 45
            },
            "Mycobacterium_bovis": {
                "percent_coverage": 9.894,
                "median_depth": 12.0
            }
        }
        predictor.out_json["phylogenetics"] = {
            "sub_complex": sub_complexes,
            "phylo_group": phylo_groups,
            "species": species,
        }

        chosen = predictor.choose_best(predictor.out_json["phylogenetics"])

        assert "Mycobacterium_chimaera" in chosen["species"]
        assert "Mycobacterium_intracellulare" in chosen["species"]
        assert "Mycobacterium_bovis" not in chosen["species"]
    def test_mixed_chimera(self):
        """Exercise ``choose_best`` with a chimaera/intracellulare mixture.

        The hand-built phylogenetics record contains two species with
        near-complete coverage and one (Mycobacterium_bovis) with under
        10% coverage; only the first two are expected in the result.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)

        phylogenetics = {
            "sub_complex": {
                "Mycobacterium_avium_complex": {
                    "percent_coverage": 98.346,
                    "median_depth": 54.0,
                },
            },
            "phylo_group": {
                "Non_tuberculosis_mycobacterium_complex": {
                    "percent_coverage": 82.846,
                    "median_depth": 49,
                },
            },
            "species": {
                "Mycobacterium_chimaera": {
                    "percent_coverage": 99.162,
                    "median_depth": 39,
                },
                "Mycobacterium_intracellulare": {
                    "percent_coverage": 98.662,
                    "median_depth": 45,
                },
                "Mycobacterium_bovis": {
                    "percent_coverage": 9.894,
                    "median_depth": 12.0,
                },
            },
        }
        predictor.out_json["phylogenetics"] = phylogenetics

        best = predictor.choose_best(predictor.out_json["phylogenetics"])
        best_species = best["species"]

        assert "Mycobacterium_chimaera" in best_species
        assert "Mycobacterium_intracellulare" in best_species
        assert "Mycobacterium_bovis" not in best_species
    def test_get_best_coverage_dict(self):
        """Check that ``_get_best_coverage_dict`` keeps only the top entry.

        Of the three species supplied, Mycobacterium_chimaera has the
        highest percent coverage and is expected to be the only key left.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)

        coverages = {
            "Mycobacterium_chimaera": {
                "percent_coverage": 99.162,
                "median_depth": 39
            },
            "Mycobacterium_intracellulare": {
                "percent_coverage": 98.662,
                "median_depth": 45
            },
            "Mycobacterium_bovis": {
                "percent_coverage": 9.894,
                "median_depth": 12.0
            },
        }
        best = predictor._get_best_coverage_dict(coverages)
        assert list(best.keys()) == ["Mycobacterium_chimaera"]
Exemple #5
0
    def test_get_best_coverage_dict(self):
        """Expect a single best species from ``_get_best_coverage_dict``.

        Three candidate species are passed in; the resulting dict's keys
        must be exactly ``["Mycobacterium_chimaera"]``.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)

        chimaera = {"percent_coverage": 99.162, "median_depth": 39}
        intracellulare = {"percent_coverage": 98.662, "median_depth": 45}
        bovis = {"percent_coverage": 9.894, "median_depth": 12.0}

        best_coverage = predictor._get_best_coverage_dict({
            "Mycobacterium_chimaera": chimaera,
            "Mycobacterium_intracellulare": intracellulare,
            "Mycobacterium_bovis": bovis,
        })
        best_names = list(best_coverage.keys())
        assert best_names == ["Mycobacterium_chimaera"]
Exemple #6
0
def detect_species_and_get_depths(cov_parser, hierarchy_json,
                                  wanted_phylo_group):
    """Run species detection and look up the depth of one phylo group.

    Args:
        cov_parser: Object exposing a ``covgs`` mapping; its "complex" (or
            "phylo_group"), "sub-complex", "species" and "sub-species"
            entries are handed to ``AMRSpeciesPredictor``.
        hierarchy_json: Passed through as ``hierarchy_json_file``.
        wanted_phylo_group: Name of the phylo group whose median depth is
            wanted, or ``None`` to skip detection entirely.

    Returns:
        A ``(phylogenetics, depths)`` tuple. ``phylogenetics`` is the value
        returned by the predictor's ``run()`` (``{}`` when
        ``wanted_phylo_group`` is ``None``). ``depths`` is a one-element
        list holding the wanted group's ``median_depth`` when that group
        appears in the predictor output, otherwise an empty list.
    """
    if wanted_phylo_group is None:
        return {}, []

    covgs = cov_parser.covgs
    predictor = AMRSpeciesPredictor(
        phylo_group_covgs=covgs.get("complex", covgs.get("phylo_group", {})),
        sub_complex_covgs=covgs.get("sub-complex", {}),
        species_covgs=covgs["species"],
        lineage_covgs=covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json,
    )
    phylogenetics = predictor.run()

    detected_groups = predictor.out_json["phylogenetics"]["phylo_group"]
    if wanted_phylo_group not in detected_groups:
        return phylogenetics, []
    return phylogenetics, [detected_groups[wanted_phylo_group]["median_depth"]]
    def test_mixed_MTBC_NTM(self):
        """Verify detection of a sample containing both MTBC and NTM.

        ``out_json`` is populated directly with coverage records for the
        two phylo groups; the predictor must report each as present and
        ``_get_present_phylo_groups`` must return two entries.
        """
        predictor = AMRSpeciesPredictor(
            phylo_group_covgs={},
            sub_complex_covgs={},
            species_covgs={},
            lineage_covgs={},
            hierarchy_json_file=self.hierarchy_json_file)

        phylo_group_covgs = {
            "Non_tuberculosis_mycobacterium_complex": {
                "percent_coverage": 58.71542975006994,
                "median_depth": 36,
            },
            "Mycobacterium_tuberculosis_complex": {
                "percent_coverage": 62.81850563578579,
                "median_depth": 2,
            },
        }
        predictor.out_json["phylogenetics"] = {}
        predictor.out_json["phylogenetics"]["phylo_group"] = phylo_group_covgs

        assert predictor.is_mtbc_present()
        assert predictor.is_ntm_present()

        present = predictor._get_present_phylo_groups(
            predictor.out_json["phylogenetics"]["phylo_group"])
        assert len(present) == 2
Exemple #8
0
def run(parser, args):
    """Run AMR prediction for one sample and print the result as JSON.

    Steps: choose the probe panels from ``args.panel`` / ``args.species``,
    run ``CoverageParser`` on them, detect the species with
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` when an expected
    depth could be read from the detected species, run the matching
    resistance predictor, and print the ``MykrobePredictorResult`` dict
    (keyed by sample name) to stdout.

    Args:
        parser: Argument parser; ``parse_args()`` is called on it and the
            result replaces ``args``.
        args: Namespace read only for ``args.sample`` before it is replaced
            by the freshly parsed arguments.

    Raises:
        ValueError: If ``args.species`` is set but is not one of "staph",
            "tb" or "gn".
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    # NOTE(review): TB_PANELS is a local that is only bound for the two
    # panel names below; with any other args.panel the "tb" and no-species
    # branches further down raise UnboundLocalError. Presumably the CLI
    # restricts/defaults --panel -- confirm against the argument parser.
    if args.panel is not None:
        if args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz"]
        elif args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz"]

    if not args.species:
        # No species requested: probe against every panel set.
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
    elif args.species == "gn":
        panels = GN_PANELS
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``panels`` a few lines further down.
        raise ValueError("Unrecognised species %s" % args.species)
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels (relative to this module's parent directory)
    panels = [
        os.path.realpath(
            os.path.join(
                os.path.dirname(__file__),
                "..",
                f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(
                os.path.dirname(__file__),
                "..",
                hierarchy_json_file))
    if args.ont:
        # --ont overrides the expected error rate and the genotype filters.
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)" %
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex",
            cp.covgs.get(
                "phylo_group",
                {})),
        sub_complex_covgs=cp.covgs.get(
            "sub-complex",
            {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get(
            "sub-species",
            {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # The detected organism decides both the expected depth handed to the
    # genotyper and which resistance predictor class is used.
    depths = []
    Predictor = None
    if species_predictor.is_saureus_present():
        depths = [species_predictor.out_json["phylogenetics"]
                  ["phylo_group"]["Staphaureus"]["median_depth"]]
        Predictor = StaphPredictor
    elif species_predictor.is_mtbc_present():
        depths = [species_predictor.out_json["phylogenetics"]["phylo_group"][
            "Mycobacterium_tuberculosis_complex"]["median_depth"]]
        Predictor = TBPredictor
    elif species_predictor.is_gram_neg_present():
        Predictor = GramNegPredictor
        # Prefer the Klebsiella depth; fall back to E. coli when absent.
        try:
            depths = [species_predictor.out_json["phylogenetics"][
                "species"]["Klebsiella_pneumoniae"]["median_depth"]]
        except KeyError:
            depths = [species_predictor.out_json["phylogenetics"]
                      ["species"]["Escherichia_coli"]["median_depth"]]
    # Genotype
    # args.quiet is forced on for the genotyping step and restored after.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if depths:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       expected_error_rate=args.expected_error_rate,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       filters=args.filters,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf
                       )
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        # No species-specific depth was found; ``gt`` stays undefined here,
        # which is safe because Predictor is also None on this path.
        depths = cp.estimate_depth()
    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if Predictor is not None and max(depths) > args.min_depth:
        predictor = Predictor(variant_calls=gt.variant_calls,
                              called_genes=gt.sequence_calls_dict,
                              base_json=base_json[args.sample],
                              depth_threshold=args.min_depth,
                              ignore_filtered=True,
                              ignore_minor_calls=args.ont)
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()
    print(json.dumps(base_json, indent=4))
Exemple #9
0
def run(parser, args):
    """Run AMR prediction for one sample and emit the result.

    Steps: choose the probe panels (and, for some panels, a
    variant-to-resistance JSON) from ``args.panel`` / ``args.species``,
    run ``CoverageParser``, detect the species with
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` (re-running it
    with a simulated confidence threshold when
    ``args.conf_percent_cutoff`` is below 100), run the resistance
    predictor, and write the result as JSON or CSV to ``args.output`` or
    stdout.

    Args:
        parser: Argument parser; ``parse_args()`` is called on it and the
            result replaces ``args``.
        args: Namespace read only for ``args.sample`` before it is replaced
            by the freshly parsed arguments.

    Raises:
        ValueError: If the "custom" panel is selected without
            ``args.custom_probe_set_path``, or if ``args.species`` is set
            but is neither "staph" nor "tb".
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    # Bound up front so the predictor call at the end never sees an unbound
    # name when args.panel is None.
    variant_to_resistance_json_fp = None
    # NOTE(review): TB_PANELS is a local that is only bound by the branches
    # below; if none matches, the "tb" and no-species branches further down
    # raise UnboundLocalError. Presumably the CLI restricts/defaults
    # --panel -- confirm against the argument parser.
    if args.panel is not None:
        if args.species == "tb" and args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json")
        elif args.species == "tb" and args.panel == "atlas":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            TB_PANELS = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json
    Predictor = None
    if not args.species:
        # No species requested: probe against every panel set. Predictor
        # stays None, so no susceptibility prediction is made below.
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        # Staph never uses a variant-to-resistance JSON, even if a custom
        # panel supplied one above.
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``panels`` a few lines further down.
        raise ValueError("Unrecognised species %s" % args.species)
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels (relative to this module's parent directory)
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # Expected depth comes from the detected organism's phylo group.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    # args.quiet is forced on for the genotyping step and restored after.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        # --force genotypes even without a detected species; use a
        # placeholder expected depth of 1.
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = (
            gt.
            estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov())
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            # Genotype again, this time with the estimated error rate and
            # the simulated confidence threshold.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.
                min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    # Predictor is None when no species was requested; skip prediction then
    # rather than calling None.
    if (gt is not None and Predictor is not None
            and (max(depths) > args.min_depth or args.force)):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to file if specified by user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
Exemple #10
0
def run(parser, args):
    """Run AMR prediction for one sample and print the result as JSON.

    Steps: choose the probe panels from ``args.panel`` / ``args.species``,
    run ``CoverageParser`` on them, detect the species with
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` when an expected
    depth could be read from the detected species, run the matching
    resistance predictor, and print the ``MykrobePredictorResult`` dict
    (keyed by sample name) to stdout.

    Args:
        parser: Argument parser; ``parse_args()`` is called on it and the
            result replaces ``args``.
        args: Namespace read only for ``args.sample`` before it is replaced
            by the freshly parsed arguments.

    Raises:
        ValueError: If ``args.species`` is set but is not one of "staph",
            "tb" or "gn".
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    # NOTE(review): TB_PANELS is a local that is only bound for the two
    # panel names below; with any other args.panel the "tb" and no-species
    # branches further down raise UnboundLocalError. Presumably the CLI
    # restricts/defaults --panel -- confirm against the argument parser.
    if args.panel is not None:
        if args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-160330.fasta.gz",
                "data/panels/tb-amr-bradley_2015.fasta.gz"
            ]
        elif args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-160330.fasta.gz",
                "data/panels/tb-amr-walker_2015.fasta.gz"
            ]

    if not args.species:
        # No species requested: probe against every panel set.
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
    elif args.species == "gn":
        panels = GN_PANELS
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``panels`` a few lines further down.
        raise ValueError("Unrecognised species %s" % args.species)
    logging.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels (relative to this module's parent directory)
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()
    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # The detected organism decides both the expected depth handed to the
    # genotyper and which resistance predictor class is used.
    depths = []
    Predictor = None
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
        Predictor = StaphPredictor
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
        Predictor = TBPredictor
    elif species_predictor.is_gram_neg_present():
        Predictor = GramNegPredictor
        # Prefer the Klebsiella depth; fall back to E. coli when absent.
        try:
            depths = [
                species_predictor.out_json["phylogenetics"]["species"]
                ["Klebsiella_pneumoniae"]["median_depth"]
            ]
        except KeyError:
            depths = [
                species_predictor.out_json["phylogenetics"]["species"]
                ["Escherichia_coli"]["median_depth"]
            ]
    # Genotype
    # args.quiet is forced on for the genotyping step and restored after.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if depths:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf)
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        # No species-specific depth was found; ``gt`` stays undefined here,
        # which is safe because Predictor is also None on this path.
        depths = cp.estimate_depth()
    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if Predictor is not None and max(depths) > args.min_depth:
        predictor = Predictor(variant_calls=gt.variant_calls,
                              called_genes=gt.sequence_calls_dict,
                              base_json=base_json[args.sample],
                              depth_threshold=args.min_depth,
                              ignore_filtered=True)
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()
    print(json.dumps(base_json, indent=4))
Exemple #11
0
def run(parser, args):
    """Run species detection, genotyping and AMR prediction for one sample.

    The pipeline is: resolve the probe panels for the requested species,
    compute coverage with ``CoverageParser``, detect the species with
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` (optionally a second
    time with a simulated confidence threshold), run the species-specific
    ``Predictor`` and finally emit the result as JSON and/or CSV.

    Args:
        parser: Object exposing ``parse_args()``; its result replaces ``args``.
        args: Namespace; only ``args.sample`` is read from the value passed
            in, everything else comes from ``parser.parse_args()``.

    Raises:
        ValueError: If a custom panel is requested without
            ``custom_probe_set_path``, if the species is not recognised, or
            if ``output_format`` is not one of csv, json, json_and_csv.
        FileNotFoundError: If the custom probe set, or a supplied custom
            variant-to-resistance JSON, does not exist.
    """
    base_json = {args.sample: {}}
    # From here on ``args`` is whatever the parser returns, not the argument.
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None
    species = Species(args.species)
    # Only TB offers named panels; every other species uses "default" unless
    # a custom panel was requested.
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panels = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panels.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")
    if panels.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")

        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panels.add_path(args.custom_probe_set_path)

        # The custom JSON is treated as optional: ``os.path.exists(None)``
        # would raise an opaque TypeError, so only validate a path that was
        # actually supplied. When it is absent the predictor receives None.
        # NOTE(review): assumes Predictor accepts None here - confirm.
        custom_json = args.custom_variant_to_resistance_json
        if custom_json is not None:
            if not os.path.exists(custom_json):
                raise FileNotFoundError(
                    ("Custom variant to resistance json "
                     f"{custom_json} does not exist!"))
            variant_to_resistance_json_fp = custom_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" %
                ", ".join(panels.paths))
    version = dict()
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels (``panels`` becomes a plain list of paths)
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # Expected depth is taken from the detected phylo group's median depth.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    # ``args.quiet`` is forced on while genotyping and restored afterwards.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # With --force and no detected species, genotype anyway at depth 1.
    if args.force and not depths:
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov(
        )
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        # These ONT overrides happen after the first genotyping pass, so they
        # only influence the optional second pass and the predictor below.
        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            # Second pass: same inputs, but with the estimated error rate and
            # the simulated confidence threshold.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.
                min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    # Start from an empty result; it is replaced only if prediction runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}

    # The CSV is rendered before the call dictionaries are stripped from the
    # JSON below, so it always sees the full result.
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if not outputs:
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    for output_type, output in outputs.items():
        # write to file is specified by user, otherwise send to stdout
        if args.output:
            # Two outputs cannot share one file, so suffix each with its type.
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
Exemple #12
0
def run(parser, args):
    """Run species detection, genotyping and AMR prediction for one sample.

    Resolves the probe panels from ``args.panel`` / ``args.species``, computes
    coverage with ``CoverageParser``, detects the species with
    ``AMRSpeciesPredictor``, genotypes with ``Genotyper`` and, when a
    species-specific predictor is available, predicts susceptibility. The
    result is written as JSON to ``args.output`` or printed to stdout.

    Args:
        parser: Object exposing ``parse_args()``; its result replaces ``args``.
        args: Namespace; only ``args.sample`` is read from the value passed
            in, everything else comes from ``parser.parse_args()``.

    Raises:
        ValueError: If the custom panel is requested without
            ``custom_probe_set_path``, if TB panels are needed but
            ``args.panel`` is missing or unrecognised, or if ``args.species``
            is not recognised.
    """
    base_json = {args.sample: {}}
    # From here on ``args`` is whatever the parser returns, not the argument.
    args = parser.parse_args()
    hierarchy_json_file = None
    # Stays None when no recognised panel was given; checked before use so a
    # bad panel produces a clear error instead of an UnboundLocalError.
    tb_panels = None
    if args.panel is not None:
        if args.panel == "bradley-2015":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz"
            ]
        elif args.panel == "walker-2015":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz"
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            tb_panels = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz"
            ]
    # TB panels are used both for TB and for the "no species given" case.
    needs_tb_panels = not args.species or args.species == "tb"
    if needs_tb_panels and tb_panels is None:
        raise ValueError(
            "Unrecognised or missing TB panel %r; expected one of: "
            "bradley-2015, walker-2015, custom" % (args.panel,))
    Predictor = None
    if not args.species:
        # No species: screen against every panel, but no predictor is set.
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError("Unrecognised species %s" % (args.species,))
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)" %
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"
    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # Expected depth is taken from the detected phylo group's median depth.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    # ``args.quiet`` is forced on while genotyping and restored afterwards.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # With --force and no detected species, genotype anyway at depth 1.
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       expected_error_rate=args.expected_error_rate,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       filters=args.filters,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf,
                       model=args.model)
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    # Start from an empty result; it is replaced only if prediction runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    # ``Predictor`` is None when no species was requested; skip prediction in
    # that case rather than calling None.
    if (Predictor is not None and gt is not None
            and (max(depths) > args.min_depth or args.force)):
        predictor = Predictor(variant_calls=gt.variant_calls,
                              called_genes=gt.sequence_calls_dict,
                              base_json=base_json[args.sample],
                              depth_threshold=args.min_depth,
                              ignore_filtered=True,
                              ignore_minor_calls=args.ont,
                              variant_to_resistance_json_fp=args.
                              custom_variant_to_resistance_json)
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # write to file is specified by user, otherwise send to stdout
    if args.output:
        with open(args.output, 'w') as outfile:
            json.dump(base_json, outfile, indent=4)
    else:
        print(json.dumps(base_json, indent=4))