def test_mixed_MTBC_NTM(self):
    """MTBC and NTM are both reported when both phylo groups are in the output."""
    predictor = AMRSpeciesPredictor(
        phylo_group_covgs={},
        sub_complex_covgs={},
        species_covgs={},
        lineage_covgs={},
        hierarchy_json_file=self.hierarchy_json_file,
    )
    # Inject the phylo-group results directly instead of running the predictor.
    predictor.out_json["phylogenetics"] = {
        "phylo_group": {
            "Non_tuberculosis_mycobacterium_complex": {
                "percent_coverage": 58.71542975006994,
                "median_depth": 36,
            },
            "Mycobacterium_tuberculosis_complex": {
                "percent_coverage": 62.81850563578579,
                "median_depth": 2,
            },
        }
    }

    assert predictor.is_mtbc_present()
    assert predictor.is_ntm_present()

    injected_groups = predictor.out_json["phylogenetics"]["phylo_group"]
    present_groups = predictor._get_present_phylo_groups(injected_groups)
    assert len(present_groups) == 2
def test_mixed_chimera(self):
    """choose_best keeps both high-coverage species and drops the low one."""
    predictor = AMRSpeciesPredictor(
        phylo_group_covgs={},
        sub_complex_covgs={},
        species_covgs={},
        lineage_covgs={},
        hierarchy_json_file=self.hierarchy_json_file,
    )

    sub_complex = {
        "Mycobacterium_avium_complex": {
            "percent_coverage": 98.346,
            "median_depth": 54.0,
        }
    }
    phylo_group = {
        "Non_tuberculosis_mycobacterium_complex": {
            "percent_coverage": 82.846,
            "median_depth": 49,
        }
    }
    species = {
        "Mycobacterium_chimaera": {
            "percent_coverage": 99.162,
            "median_depth": 39,
        },
        "Mycobacterium_intracellulare": {
            "percent_coverage": 98.662,
            "median_depth": 45,
        },
        "Mycobacterium_bovis": {
            "percent_coverage": 9.894,
            "median_depth": 12.0,
        },
    }
    predictor.out_json["phylogenetics"] = {
        "sub_complex": sub_complex,
        "phylo_group": phylo_group,
        "species": species,
    }

    chosen = predictor.choose_best(predictor.out_json["phylogenetics"])
    chosen_species = chosen["species"]

    assert "Mycobacterium_chimaera" in chosen_species
    assert "Mycobacterium_intracellulare" in chosen_species
    assert "Mycobacterium_bovis" not in chosen_species
def test_get_best_coverage_dict(self):
    """_get_best_coverage_dict returns only the chimaera entry for this input."""
    predictor = AMRSpeciesPredictor(
        phylo_group_covgs={},
        sub_complex_covgs={},
        species_covgs={},
        lineage_covgs={},
        hierarchy_json_file=self.hierarchy_json_file,
    )
    candidate_covgs = {
        "Mycobacterium_chimaera": {
            "percent_coverage": 99.162,
            "median_depth": 39,
        },
        "Mycobacterium_intracellulare": {
            "percent_coverage": 98.662,
            "median_depth": 45,
        },
        "Mycobacterium_bovis": {
            "percent_coverage": 9.894,
            "median_depth": 12.0,
        },
    }

    best = predictor._get_best_coverage_dict(candidate_covgs)
    best_species = best.keys()

    assert list(best_species) == ["Mycobacterium_chimaera"]
def detect_species_and_get_depths(cov_parser, hierarchy_json, wanted_phylo_group):
    """Run species prediction and look up the depth of one phylo group.

    Args:
        cov_parser: Object exposing a ``covgs`` mapping. The "species" key is
            required; "complex"/"phylo_group", "sub-complex" and "sub-species"
            are optional and default to empty dicts.
        hierarchy_json: Passed through to ``AMRSpeciesPredictor`` as
            ``hierarchy_json_file``.
        wanted_phylo_group: Phylo group whose median depth is wanted, or
            ``None`` to skip species detection altogether.

    Returns:
        A ``(phylogenetics, depths)`` tuple. ``phylogenetics`` is the result of
        the predictor's ``run()`` (``{}`` when detection is skipped) and
        ``depths`` is a one-element list holding the wanted group's
        ``median_depth``, or an empty list when the group was not reported.
    """
    if wanted_phylo_group is None:
        return {}, []

    # "complex" takes precedence; "phylo_group" is the fallback key.
    phylo_group_covgs = cov_parser.covgs.get(
        "complex", cov_parser.covgs.get("phylo_group", {})
    )
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=phylo_group_covgs,
        sub_complex_covgs=cov_parser.covgs.get("sub-complex", {}),
        species_covgs=cov_parser.covgs["species"],
        lineage_covgs=cov_parser.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json,
    )
    phylogenetics = species_predictor.run()

    reported_groups = species_predictor.out_json["phylogenetics"]["phylo_group"]
    if wanted_phylo_group not in reported_groups:
        return phylogenetics, []

    wanted_depth = species_predictor.out_json["phylogenetics"]["phylo_group"][
        wanted_phylo_group
    ]["median_depth"]
    return phylogenetics, [wanted_depth]
def test_mixed_MTBC_NTM(self):
    """A sample with both MTBC and NTM phylo groups reports two groups present."""
    empty_covgs = dict(
        phylo_group_covgs={},
        sub_complex_covgs={},
        species_covgs={},
        lineage_covgs={},
    )
    species_predictor = AMRSpeciesPredictor(
        hierarchy_json_file=self.hierarchy_json_file, **empty_covgs
    )

    ntm_covg = {"percent_coverage": 58.71542975006994, "median_depth": 36}
    mtbc_covg = {"percent_coverage": 62.81850563578579, "median_depth": 2}
    phylogenetics = {}
    species_predictor.out_json["phylogenetics"] = phylogenetics
    phylogenetics["phylo_group"] = {
        "Non_tuberculosis_mycobacterium_complex": ntm_covg,
        "Mycobacterium_tuberculosis_complex": mtbc_covg,
    }

    assert species_predictor.is_mtbc_present()
    assert species_predictor.is_ntm_present()
    groups_found = species_predictor._get_present_phylo_groups(
        species_predictor.out_json["phylogenetics"]["phylo_group"]
    )
    assert len(groups_found) == 2
def run(parser, args):
    """Detect species, genotype the sample, predict AMR and print JSON.

    Steps, in order: choose probe panels from ``args.panel`` and
    ``args.species``, run ``CoverageParser``, run ``AMRSpeciesPredictor``,
    genotype with ``Genotyper`` when an expected depth is available, run the
    predictor matching the detected species, and print the result to stdout.

    Args:
        parser: Argument parser. ``parser.parse_args()`` is called and its
            result replaces ``args`` for everything after the first statement.
        args: Namespace used only to read ``sample`` for the initial result
            skeleton.

    Raises:
        ValueError: If TB probes are needed but ``args.panel`` names no known
            TB panel, or if ``args.species`` is not recognised. These cases
            used to surface as ``UnboundLocalError``.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None

    # Always bind the TB panel list so an unknown panel gives a clear error
    # rather than an unbound local further down.
    tb_panels = None
    if args.panel == "bradley-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz",
        ]
    elif args.panel == "walker-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz",
        ]
    if tb_panels is None and (not args.species or args.species == "tb"):
        raise ValueError(
            "No TB probe set is defined for panel %r" % (args.panel,))

    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
    elif args.species == "gn":
        panels = GN_PANELS
    else:
        raise ValueError("Unrecognised species %r" % (args.species,))
    logger.info("Running AMR prediction with panels %s", ", ".join(panels))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Get real paths for panels (relative to the parent of this module's dir)
    here = os.path.dirname(__file__)
    panels = [os.path.realpath(os.path.join(here, "..", f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(here, "..", hierarchy_json_file))

    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)",
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    # The predictor class and the expected depth are set together, so
    # ``Predictor is not None`` implies ``depths`` is a one-element list.
    depths = []
    Predictor = None
    if species_predictor.is_saureus_present():
        depths = [species_predictor.out_json["phylogenetics"]
                  ["phylo_group"]["Staphaureus"]["median_depth"]]
        Predictor = StaphPredictor
    elif species_predictor.is_mtbc_present():
        depths = [species_predictor.out_json["phylogenetics"]["phylo_group"][
            "Mycobacterium_tuberculosis_complex"]["median_depth"]]
        Predictor = TBPredictor
    elif species_predictor.is_gram_neg_present():
        Predictor = GramNegPredictor
        # Prefer the Klebsiella depth; fall back to E. coli when absent.
        try:
            depths = [species_predictor.out_json["phylogenetics"][
                "species"]["Klebsiella_pneumoniae"]["median_depth"]]
        except KeyError:
            depths = [species_predictor.out_json["phylogenetics"]
                      ["species"]["Escherichia_coli"]["median_depth"]]

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if depths:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       expected_error_rate=args.expected_error_rate,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       filters=args.filters,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf)
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        # No species-specific depth: Predictor is None here, so this value is
        # not consulted by the depth check below.
        depths = cp.estimate_depth()
    args.quiet = q

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    if Predictor is not None and max(depths) > args.min_depth:
        predictor = Predictor(variant_calls=gt.variant_calls,
                              called_genes=gt.sequence_calls_dict,
                              base_json=base_json[args.sample],
                              depth_threshold=args.min_depth,
                              ignore_filtered=True,
                              ignore_minor_calls=args.ont)
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()
    print(json.dumps(base_json, indent=4))
def run(parser, args):
    """Detect species, genotype, predict AMR and emit JSON or CSV.

    Steps, in order: choose probe panels and the predictor class from
    ``args.panel`` / ``args.species``, run ``CoverageParser`` and
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` (re-genotyping with a
    simulated confidence threshold when ``conf_percent_cutoff`` is below 100),
    run the predictor, then write the result to ``args.output`` or stdout.

    Args:
        parser: Argument parser. ``parser.parse_args()`` is called and its
            result replaces ``args`` for everything after the first statement.
        args: Namespace used only to read ``sample`` for the initial result
            skeleton.

    Raises:
        ValueError: If the custom panel is chosen without
            ``custom_probe_set_path``; if TB probes are needed but no TB panel
            matches ``args.panel``; or if ``args.species`` is not recognised.
            The last two used to surface as ``UnboundLocalError``.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None

    # Bind these unconditionally so every later read is well defined.
    variant_to_resistance_json_fp = None
    tb_panels = None
    if args.species == "tb" and args.panel == "bradley-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
        ]
    elif args.species == "tb" and args.panel == "walker-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
        ]
    elif args.species == "tb" and args.panel == "201901":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
        ]
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")
    elif args.species == "tb" and args.panel == "atlas":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
        ]
    elif args.panel == "custom":
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        tb_panels = [
            args.custom_probe_set_path,
            "data/panels/tb-species-170421.fasta.gz",
        ]
        variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    if tb_panels is None and (not args.species or args.species == "tb"):
        raise ValueError(
            "No TB probe set is defined for panel %r" % (args.panel,))

    Predictor = None
    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError("Unrecognised species %r" % (args.species,))
    logger.info("Running AMR prediction with panels %s", ", ".join(panels))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Get real paths for panels (relative to the parent of this module's dir)
    here = os.path.dirname(__file__)
    panels = [os.path.realpath(os.path.join(here, "..", f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(here, "..", hierarchy_json_file))

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # --force genotypes even when no target species depth was found.
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = (
            gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov())
        logger.info("Estimated error rate for kmer count model: "
                    + str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)",
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer,
                incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff "
                        + str(args.conf_percent_cutoff) + "%): "
                        + str(conf_threshold))
            # Genotype again with the estimated error rate and the simulated
            # confidence threshold; model and ploidy may have changed above.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=(
                    args.min_proportion_expected_depth),
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    can_predict = gt is not None and (
        max(depths) > args.min_depth or args.force)
    if can_predict and Predictor is None:
        # No species was given, so no predictor class was selected; calling
        # None would raise TypeError, so report an empty result instead.
        logger.warning(
            "No species specified; skipping susceptibility prediction")
    elif can_predict:
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to a file if specified by the user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
def run(parser, args):
    """Detect species, genotype the sample, predict AMR and print JSON.

    Steps, in order: choose probe panels from ``args.panel`` and
    ``args.species``, run ``CoverageParser``, run ``AMRSpeciesPredictor``,
    genotype with ``Genotyper`` when an expected depth is available, run the
    predictor matching the detected species, and print the result to stdout.

    Args:
        parser: Argument parser. ``parser.parse_args()`` is called and its
            result replaces ``args`` for everything after the first statement.
        args: Namespace used only to read ``sample`` for the initial result
            skeleton.

    Raises:
        ValueError: If TB probes are needed but ``args.panel`` names no known
            TB panel, or if ``args.species`` is not recognised. These cases
            used to surface as ``UnboundLocalError``.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None

    # Always bind the TB panel list so an unknown panel gives a clear error
    # rather than an unbound local further down.
    tb_panels = None
    if args.panel == "bradley-2015":
        tb_panels = [
            "data/panels/tb-species-160330.fasta.gz",
            "data/panels/tb-amr-bradley_2015.fasta.gz",
        ]
    elif args.panel == "walker-2015":
        tb_panels = [
            "data/panels/tb-species-160330.fasta.gz",
            "data/panels/tb-amr-walker_2015.fasta.gz",
        ]
    if tb_panels is None and (not args.species or args.species == "tb"):
        raise ValueError(
            "No TB probe set is defined for panel %r" % (args.panel,))

    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
    elif args.species == "gn":
        panels = GN_PANELS
    else:
        raise ValueError("Unrecognised species %r" % (args.species,))
    logging.info("Running AMR prediction with panels %s", ", ".join(panels))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Get real paths for panels (relative to the parent of this module's dir)
    here = os.path.dirname(__file__)
    panels = [os.path.realpath(os.path.join(here, "..", f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(here, "..", hierarchy_json_file))

    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    # The predictor class and the expected depth are set together, so
    # ``Predictor is not None`` implies ``depths`` is a one-element list.
    depths = []
    Predictor = None
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
        Predictor = StaphPredictor
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
        Predictor = TBPredictor
    elif species_predictor.is_gram_neg_present():
        Predictor = GramNegPredictor
        # Prefer the Klebsiella depth; fall back to E. coli when absent.
        try:
            depths = [
                species_predictor.out_json["phylogenetics"]["species"]
                ["Klebsiella_pneumoniae"]["median_depth"]
            ]
        except KeyError:
            depths = [
                species_predictor.out_json["phylogenetics"]["species"]
                ["Escherichia_coli"]["median_depth"]
            ]

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if depths:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf)
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        # No species-specific depth: Predictor is None here, so this value is
        # not consulted by the depth check below.
        depths = cp.estimate_depth()
    args.quiet = q

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    if Predictor is not None and max(depths) > args.min_depth:
        predictor = Predictor(variant_calls=gt.variant_calls,
                              called_genes=gt.sequence_calls_dict,
                              base_json=base_json[args.sample],
                              depth_threshold=args.min_depth,
                              ignore_filtered=True)
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()
    print(json.dumps(base_json, indent=4))
def run(parser, args):
    """Detect species, genotype, predict AMR and emit JSON and/or CSV.

    Steps, in order: resolve the species and probe panel, validate custom
    panel inputs and the output format, run ``CoverageParser`` and
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` (re-genotyping with a
    simulated confidence threshold when ``conf_percent_cutoff`` is below 100),
    run the predictor, then write each requested output to ``args.output`` or
    stdout.

    Args:
        parser: Argument parser. ``parser.parse_args()`` is called and its
            result replaces ``args`` for everything after the first statement.
        args: Namespace used only to read ``sample`` for the initial result
            skeleton.

    Raises:
        ValueError: If a custom panel is chosen without
            ``custom_probe_set_path`` or without
            ``custom_variant_to_resistance_json`` (the latter used to surface
            as ``TypeError`` from ``os.path.exists(None)``); if the species is
            neither staph nor TB; or if ``args.output_format`` is not one of
            csv, json, json_and_csv.
        FileNotFoundError: If a custom probe set or custom
            variant-to-resistance JSON path does not exist.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None

    species = Species(args.species)
    # Only TB offers named panels; everything else uses "default" or "custom".
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panels = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panels.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")

    if panels.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panels.add_path(args.custom_probe_set_path)

        # os.path.exists(None) raises TypeError, so reject a missing option
        # explicitly before checking the path.
        if not args.custom_variant_to_resistance_json:
            raise ValueError(
                "Custom panel requires custom_variant_to_resistance_json")
        if not os.path.exists(args.custom_variant_to_resistance_json):
            raise FileNotFoundError(
                ("Custom variant to resistance json "
                 f"{args.custom_variant_to_resistance_json} does not exist!"))
        variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    # Reject an unsupported output format before doing any expensive work,
    # rather than after the whole pipeline has already run.
    if args.output_format not in ("csv", "json", "json_and_csv"):
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    logger.info("Running AMR prediction with panels %s",
                ", ".join(panels.paths))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Get real paths for panels. From here on ``panels`` is a list of paths,
    # no longer the Panel object.
    here = os.path.dirname(__file__)
    panels = [
        os.path.realpath(os.path.join(here, "..", f)) for f in panels.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(here, "..", hierarchy_json_file))

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # --force genotypes even when no target species depth was found.
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info("Estimated error rate for kmer count model: "
                    + str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)",
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer,
                incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff "
                        + str(args.conf_percent_cutoff) + "%): "
                        + str(conf_threshold))
            # Genotype again with the estimated error rate and the simulated
            # confidence threshold; model and ploidy may have changed above.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=(
                    args.min_proportion_expected_depth),
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}
    # The CSV is rendered first, before the call details are dropped from
    # base_json for the non-verbose JSON output below.
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    for output_type, output in outputs.items():
        # Write to a file if specified by the user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
def run(parser, args):
    """Detect species, genotype the sample, predict AMR and emit JSON.

    Steps, in order: choose probe panels and the predictor class from
    ``args.panel`` / ``args.species``, run ``CoverageParser`` and
    ``AMRSpeciesPredictor``, genotype with ``Genotyper`` when a depth is
    available or ``--force`` is set, run the predictor, then write the JSON
    result to ``args.output`` or stdout.

    Args:
        parser: Argument parser. ``parser.parse_args()`` is called and its
            result replaces ``args`` for everything after the first statement.
        args: Namespace used only to read ``sample`` for the initial result
            skeleton.

    Raises:
        ValueError: If the custom panel is chosen without
            ``custom_probe_set_path``; if TB probes are needed but no TB panel
            matches ``args.panel``; or if ``args.species`` is not recognised.
            The last two used to surface as ``UnboundLocalError``.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None

    # Always bind the TB panel list so an unknown panel gives a clear error
    # rather than an unbound local further down.
    tb_panels = None
    if args.panel == "bradley-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz",
        ]
    elif args.panel == "walker-2015":
        tb_panels = [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz",
        ]
    elif args.panel == "custom":
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        tb_panels = [
            args.custom_probe_set_path,
            "data/panels/tb-species-170421.fasta.gz",
        ]
    if tb_panels is None and (not args.species or args.species == "tb"):
        raise ValueError(
            "No TB probe set is defined for panel %r" % (args.panel,))

    Predictor = None
    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError("Unrecognised species %r" % (args.species,))
    logger.info("Running AMR prediction with panels %s", ", ".join(panels))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Get real paths for panels (relative to the parent of this module's dir)
    here = os.path.dirname(__file__)
    panels = [os.path.realpath(os.path.join(here, "..", f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(here, "..", hierarchy_json_file))

    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)",
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"

    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get(
            "complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # --force genotypes even when no target species depth was found.
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(sample=args.sample,
                       expected_depths=depths,
                       expected_error_rate=args.expected_error_rate,
                       variant_covgs=cp.variant_covgs,
                       gene_presence_covgs=cp.covgs["presence"],
                       base_json=base_json,
                       contamination_depths=[],
                       report_all_calls=True,
                       ignore_filtered=True,
                       filters=args.filters,
                       variant_confidence_threshold=args.min_variant_conf,
                       sequence_confidence_threshold=args.min_gene_conf,
                       model=args.model)
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    can_predict = gt is not None and (
        max(depths) > args.min_depth or args.force)
    if can_predict and Predictor is None:
        # No species was given, so no predictor class was selected; calling
        # None would raise TypeError, so report an empty result instead.
        logger.warning(
            "No species specified; skipping susceptibility prediction")
    elif can_predict:
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ont,
            variant_to_resistance_json_fp=(
                args.custom_variant_to_resistance_json))
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to a file if specified by the user, otherwise send to stdout
    if args.output:
        with open(args.output, 'w') as outfile:
            json.dump(base_json, outfile, indent=4)
    else:
        print(json.dumps(base_json, indent=4))