def write_outputs(args, base_json):
    outputs = {}
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
            del base_json[args.sample]["lineage_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if len(outputs) == 0:
        raise ValueError(
            f"Output format must be one of: csv,json,json_and_csv. Got "
            f"'{args.output_format}'"
        )

    for output_type, output in outputs.items():
        # Write to file if specified by the user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
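# Usage sketch (illustrative only, not part of the original module): shows how
# write_outputs could be driven from a hand-built argparse.Namespace. The
# attribute names mirror the CLI options read above; the sample name and the
# result payload are placeholders. output_format="json" is used so the sketch
# does not depend on json_to_csv, which expects the full mykrobe result schema.
def _demo_write_outputs():
    import argparse

    demo_args = argparse.Namespace(
        sample="sample1",
        output_format="json",   # one of: csv, json, json_and_csv
        report_all_calls=True,  # keep per-call detail in the JSON output
        output=None,            # falsy -> print to stdout instead of a file
    )
    demo_json = {
        "sample1": {
            "susceptibility": {},
            "variant_calls": {},
            "sequence_calls": {},
            "lineage_calls": {},
        }
    }
    write_outputs(demo_args, demo_json)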
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    if args.panel is not None:
        variant_to_resistance_json_fp = None
        if args.species == "tb" and args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/")
            )
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json"
            )
        elif args.species == "tb" and args.panel == "atlas":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            TB_PANELS = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    Predictor = None
    if not args.species:
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor

    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version

    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file)
        )

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]["Staphaureus"][
                "median_depth"
            ]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Mycobacterium_tuberculosis_complex"
            ]["median_depth"]
        ]
    # pprint(species_predictor.out_json["phylogenetics"]["species"])

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = (
            gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        )
        logger.info(
            "Estimated error rate for kmer count model: "
            + str(round(100 * kmer_count_error_rate, 2))
            + "%"
        )
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning("Guess sequence method is on, and we've guessed ONT")
            args.ont = True
        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning(
                "Setting expected error rate to %s (--ont)" % args.expected_error_rate
            )
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer, incorrect_kmer_to_pc_cov
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff
            )
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info(
                "Confidence cutoff (using percent cutoff "
                + str(args.conf_percent_cutoff)
                + "%): "
                + str(conf_threshold)
            )
            # Re-genotype using the simulated confidence threshold
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                # expected_error_rate=args.expected_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]

    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()

    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to file if specified by the user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None

    species = Species(args.species)
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panels = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panels.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/")
        )
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json"
        )

    if panels.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panels.add_path(args.custom_probe_set_path)
        if not os.path.exists(args.custom_variant_to_resistance_json):
            raise FileNotFoundError(
                "Custom variant to resistance json "
                f"{args.custom_variant_to_resistance_json} does not exist!"
            )
        variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" % ", ".join(panels.paths))
    version = dict()
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version

    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file)
        )

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]["Staphaureus"][
                "median_depth"
            ]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Mycobacterium_tuberculosis_complex"
            ]["median_depth"]
        ]
    # pprint(species_predictor.out_json["phylogenetics"]["species"])

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info(
            "Estimated error rate for kmer count model: "
            + str(round(100 * kmer_count_error_rate, 2))
            + "%"
        )
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning("Guess sequence method is on, and we've guessed ONT")
            args.ont = True
        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning(
                "Setting expected error rate to %s (--ont)" % args.expected_error_rate
            )
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer, incorrect_kmer_to_pc_cov
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff
            )
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info(
                "Confidence cutoff (using percent cutoff "
                + str(args.conf_percent_cutoff)
                + "%): "
                + str(conf_threshold)
            )
            # Re-genotype using the simulated confidence threshold
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]

    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()

    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if len(outputs) == 0:
        raise ValueError(
            f"Output format must be one of: csv,json,json_and_csv. Got "
            f"'{args.output_format}'"
        )

    for output_type, output in outputs.items():
        # Write to file if specified by the user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
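# Interface sketch (an assumption, not the project's code): the refactored
# run() above depends on Species, Panel, TbPanel and StaphPanel helpers that
# are defined elsewhere in the package. The classes below only illustrate the
# shape that run() appears to rely on -- Species(value),
# Panel.from_species_and_name(species, name), plus .name, .paths and
# .add_path() on the returned panel. Names carry a "Sketch" suffix to avoid
# clashing with the real helpers, enum values and file paths are placeholders,
# and the panel-name mapping is deliberately oversimplified.
from enum import Enum


class SpeciesSketch(Enum):
    TB = "tb"
    STAPH = "staph"


class TbPanelSketch(Enum):
    NEJM_WALKER = "nejm-walker"  # placeholder value
    CUSTOM = "custom"


class StaphPanelSketch(Enum):
    DEFAULT = "default"
    CUSTOM = "custom"


class PanelSketch:
    """Bundles a panel name (an enum member) with its probe-set FASTA paths."""

    def __init__(self, name, paths):
        self.name = name
        self.paths = list(paths)

    def add_path(self, path):
        # run() uses this to append a custom probe set to the panel.
        self.paths.append(path)

    @classmethod
    def from_species_and_name(cls, species, name):
        # Placeholder mapping: every non-custom TB name resolves to NEJM_WALKER
        # here purely for illustration; a real implementation would resolve the
        # CLI panel name to the probe-set files bundled for that species.
        if species is SpeciesSketch.TB:
            panel = TbPanelSketch.CUSTOM if name == "custom" else TbPanelSketch.NEJM_WALKER
            return cls(panel, ["data/panels/tb-species-170421.fasta.gz"])
        panel = StaphPanelSketch.CUSTOM if name == "custom" else StaphPanelSketch.DEFAULT
        return cls(panel, ["data/panels/staph-placeholder.fasta.gz"])  # placeholder path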