def main_load_library(args):
    """Install a library into ``<sys.base_prefix>/share/covidprofiler``.

    Copies the MSA, the MSA metadata and every per-library file that shares
    ``args.prefix`` into the share folder, indexes the reference with samtools
    and bwa, and removes a ``.dict`` file left next to it, if any.

    Args:
        args: Namespace providing ``prefix`` (path prefix of the library
            files), ``msa`` and ``meta`` (paths of the files to copy).
    """
    share_dir = sys.base_prefix + "/share/covidprofiler"
    # Only the last path component of the prefix names the installed library.
    library_name = args.prefix.split("/")[-1]
    installed_prefix = share_dir + "/" + library_name

    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "non_coding_bed": ".non_coding.bed",
    }

    if pp.nofolder(share_dir):
        pp.run_cmd("mkdir %s " % (share_dir + "/"))

    pp.run_cmd("cp %s %s" % (args.msa, installed_prefix + ".msa.fa"))
    pp.run_cmd("cp %s %s" % (args.meta, installed_prefix + ".msa.meta.csv"))

    for suffix in suffixes.values():
        pp.run_cmd("cp %s %s" % (args.prefix + suffix, installed_prefix + suffix))

    reference = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % reference)
    pp.run_cmd("bwa index %s" % reference)

    sequence_dict = installed_prefix + ".dict"
    if os.path.isfile(sequence_dict):
        pp.run_cmd("rm %s" % sequence_dict)

    pp.log("Sucessfully imported library")
def main_load_library(args):
    """Install a library into ``<sys.base_prefix>/share/covidprofiler``.

    Copies every per-library file that shares ``args.prefix`` into the share
    folder, indexes the reference with samtools and bwa, removes an existing
    ``.dict`` file and regenerates it with ``gatk CreateSequenceDictionary``.

    Args:
        args: Namespace providing ``prefix``, the path prefix of the library
            files to install.
    """
    share_dir = sys.base_prefix + "/share/covidprofiler"
    # Only the last path component of the prefix names the installed library.
    library_name = args.prefix.split("/")[-1]
    installed_prefix = share_dir + "/" + library_name

    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
    }

    if pp.nofolder(share_dir):
        pp.run_cmd("mkdir %s " % (share_dir + "/"))

    for suffix in suffixes.values():
        pp.run_cmd("cp %s %s" % (args.prefix + suffix, installed_prefix + suffix))

    reference = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % reference)
    pp.run_cmd("bwa index %s" % reference)

    # Drop the old dictionary first so the gatk call below starts clean.
    sequence_dict = installed_prefix + ".dict"
    if os.path.isfile(sequence_dict):
        pp.run_cmd("rm %s" % sequence_dict)
    pp.run_cmd("gatk CreateSequenceDictionary -R %s" % reference)

    pp.log("Sucessfully imported library")
def get_summary(json_results, conf, columns=None, drug_order=None, reporting_af=0.0):
    """Return a copy of *json_results* extended with report tables.

    Args:
        json_results: Result dictionary. ``dr_variants`` (a list of dicts with
            ``drug``, ``gene``, ``change`` and ``freq``) and ``pipeline``
            (``mapper`` and ``variant_caller``) are read.
        conf: Configuration dictionary; ``conf["bed"]`` is the path of a
            whitespace-separated file whose sixth column holds a
            comma-separated list of drug names.
        columns: Optional extra annotation keys to copy from each variant
            into the drug table.
        drug_order: Optional explicit list of drugs (and their order) for the
            table. Without it the drugs of the BED file are listed
            alphabetically so the report is reproducible between runs.
        reporting_af: Variants whose ``freq`` is below this value are skipped.

    Returns:
        A shallow copy of *json_results* with two added keys: ``drug_table``
        (one row per drug) and ``pipline_table`` (key spelling kept because
        other code reads it).
    """
    if not columns:
        columns = []

    bed_drugs = set()
    with open(conf["bed"]) as bed:
        for line in bed:
            fields = line.rstrip().split()
            bed_drugs.update(fields[5].split(","))
    drugs = drug_order if drug_order else sorted(bed_drugs)

    dr_variants = json_results["dr_variants"]
    # The requested columns can only be validated against an existing variant;
    # with no resistance variants every extra column is simply left empty.
    if dr_variants:
        for key in columns:
            if key not in dr_variants[0]:
                pp.log(
                    "%s not found in variant annotation, is this a valid column in the database CSV file? Exiting!"
                    % key, True)

    results = {}
    annotation = {}
    for variant in dr_variants:
        if float(variant["freq"]) < reporting_af:
            continue
        drug = variant["drug"]
        results.setdefault(drug, []).append(
            "%s %s (%.2f)" % (variant["gene"], variant["change"], variant["freq"]))
        drug_annotation = annotation.setdefault(drug, {key: [] for key in columns})
        for key in columns:
            drug_annotation[key].append(variant[key])

    drug_table = []
    for drug in drugs:
        # The collected lists are joined into fresh strings rather than
        # overwritten, so a drug repeated in drug_order renders identically.
        calls = results.get(drug, [])
        dictline = {
            "Drug": drug.capitalize(),
            "Genotypic Resistance": "R" if calls else "",
            "Mutations": ", ".join(calls),
        }
        for key in columns:
            dictline[key] = ", ".join(annotation[drug][key]) if drug in annotation else ""
        drug_table.append(dictline)

    pipeline_tbl = [
        {"Analysis": "Mapping", "Program": json_results["pipeline"]["mapper"]},
        {"Analysis": "Variant Calling", "Program": json_results["pipeline"]["variant_caller"]},
    ]

    new_json = json_results.copy()
    new_json["drug_table"] = drug_table
    new_json["pipline_table"] = pipeline_tbl
    return new_json
def write_html(json_results, conf, outfile, columns=None, drug_order=None):
    """Render the profiling results as an HTML report written to *outfile*.

    Args:
        json_results: Result dictionary; passed through ``get_summary`` and
            then read for ``id``, ``sublin``, ``drtype``, ``lineage``,
            ``other_variants`` and ``tbprofiler_version``.
        conf: Configuration dictionary forwarded to ``get_summary``.
        outfile: Path of the HTML file to create.
        columns: Optional extra annotation columns appended to the drug table.
        drug_order: Optional explicit drug ordering forwarded to
            ``get_summary``.
    """
    # The default of None cannot be concatenated to the base column list.
    columns = list(columns) if columns else []
    json_results = get_summary(json_results, conf, columns=columns, drug_order=drug_order)

    html_strings = {
        "id": json_results["id"],
        "date": time.ctime(),
        "strain": json_results["sublin"],
        "drtype": json_results["drtype"],
        "dr_report": dict_list2html(
            json_results["drug_table"],
            ["Drug", "Genotypic Resistance", "Mutations"] + columns,
            {
                "Drug": "Drug<sup>1</sup>",
                "Genotypic Resistance": "Resistance",
                "Mutations": "Supporting Mutations (frequency)",
            },
        ),
        "lineage_report": dict_list2html(
            json_results["lineage"],
            ["lin", "family", "spoligotype", "rd"],
            {
                "lin": "Lineage<sup>2</sup>",
                "frac": "Estimated fraction",
                "family": "Family",
                "spoligotype": "Main Spoligotype",
                "rd": "RDS",
            },
        ),
        "other_var_report": dict_list2html(
            json_results["other_variants"],
            ["gene", "genome_pos", "change", "freq"],
            {
                "gene": "Gene",
                "genome_pos": "Chromosome Position",
                "change": "Mutation",
                "freq": "Estimated fraction",
            },
        ),
        "pipeline": dict_list2html(json_results["pipline_table"], ["Analysis", "Program"]),
        "version": json_results["tbprofiler_version"],
    }

    # The context manager closes the file even if rendering fails.
    with open(outfile, "w") as out:
        pp.log("Writing results to %s" % outfile)
        out.write(load_html(html_strings))
def main(args):
    """Write GEMMA phenotype files and run (or only prepare) the association jobs.

    For every phenotype listed in ``args.phenos`` a ``<pheno>.pheno`` file is
    written with one value per VCF sample ("NA" when the sample has no value),
    and a line chaining three ``gemma`` invocations is appended to a command
    file. Unless ``args.preprocess`` is set, the command file is executed with
    GNU parallel using ``args.threads`` jobs.

    Args:
        args: Namespace providing ``vcf``, ``genes``, ``csv`` (metadata with an
            ``id`` column), ``phenos`` (one phenotype name per line),
            ``preprocess`` and ``threads``.
    """
    vcf = vcf_class(args.vcf)
    if args.genes:
        vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"

    meta = {s: {} for s in vcf.samples}
    with open(args.csv) as csv_fh:
        reader = csv.DictReader(csv_fh)
        # Phenotypes are validated against the header, so the check also
        # works when the CSV has no data rows.
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            for pheno in row.keys():
                if pheno == "id":
                    continue
                if row["id"] not in meta:
                    continue
                meta[row["id"]][pheno] = row[pheno]

    with open(args.phenos) as phenos_fh:
        phenos = [line.rstrip() for line in phenos_fh]

    cmd_file = pp.get_random_file()
    with open(cmd_file, "w") as cmd_fh:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                pp.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_fh:
                pheno_fh.write("\n".join([
                    meta[s][pheno] if pheno in meta[s] else "NA"
                    for s in vcf.samples
                ]))
            cmd_fh.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s.genesum -notsnp\n"
                % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
                   pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        pp.log("Preprocessing finished\n", True)
    else:
        pp.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def _tally(results, counts, category, sample, dst_call, gst_call):
    """Add *sample* to the confusion-matrix cell selected by the two calls.

    ``dst_call`` is the phenotypic call and ``gst_call`` the genotypic one.
    Only the four "0"/"1" combinations are counted; anything else (for example
    a phenotype of "NA") leaves the tallies untouched.
    """
    cell = {
        ("1", "1"): "tp",
        ("0", "1"): "fp",
        ("1", "0"): "fn",
        ("0", "0"): "tn",
    }.get((dst_call, gst_call))
    if cell is None:
        return
    results[category][cell].append(sample)
    counts[category][cell] += 1


def calculate(args):
    """Compare genotypic predictions with phenotypic DST and print accuracy.

    For every sample in ``args.samples`` the ``<sample>.results.json`` file is
    loaded and each drug, plus the composite categories ``flq``, ``mdr``,
    ``xdr`` and ``sus``, is classified as tp/fp/fn/tn. Sample lists are written
    to ``results.json``, counts to ``counts.json``, samples without a result
    file to ``samples_not_found.txt``, and a tab-separated
    sensitivity/specificity table is printed.

    Args:
        args: Namespace providing ``samples``, ``dst``, ``bed``, ``dir``
            (folder with the result files; the current directory when empty),
            ``miss`` (missing-region threshold above which a locus' drugs are
            set to "NA") and ``drugs`` (optional file restricting the printed
            drugs).
    """
    dst = load_dst(args.dst)
    drug_loci = pp.load_bed(args.bed, [6], 4)  # {'Rv0668': ('rifampicin')}
    with open(args.samples) as samples_fh:
        samples = [line.rstrip() for line in samples_fh]
    ext = ".results.json"
    drugs = [d.lower() for d in dst[samples[0]].keys()]
    categories = drugs + ["flq", "mdr", "xdr", "sus"]
    results = {c: {"tp": [], "tn": [], "fp": [], "fn": []} for c in categories}
    counts = {c: {"tp": 0, "tn": 0, "fp": 0, "fn": 0} for c in categories}
    pre = args.dir if args.dir else ""

    with open("samples_not_found.txt", "w") as missing_fh:
        for s in tqdm(samples):
            # Without a directory the file is looked up relative to the
            # working directory instead of at the filesystem root.
            res_file = "%s/%s%s" % (pre, s, ext) if pre else "%s%s" % (s, ext)
            if pp.nofile(res_file):
                pp.log("Warning: %s does not exist!" % res_file)
                missing_fh.write("%s\n" % s)
                continue
            with open(res_file) as res_fh:
                res = json.load(res_fh)

            # Drugs whose loci are insufficiently covered cannot be assessed.
            na_drugs = set()
            for locus in drug_loci:
                if res["missing_regions"][locus] > args.miss:
                    na_drugs.update(drug_loci[locus][0].split(","))

            resistant_drugs = {v["drug"].lower() for v in res["dr_variants"]}
            sample_dst = dst[s]
            for d in drugs:
                if d in na_drugs:
                    sample_dst[d] = "NA"

            for d in drugs:
                _tally(results, counts, d, s, sample_dst[d],
                       "1" if d in resistant_drugs else "0")

            #### Fluoroquinolones ####
            flq_calls = [sample_dst[d] for d in fluoroquinolones if d in sample_dst]
            flq_resistant = "1" in flq_calls
            flq_missing = all(call == "NA" for call in flq_calls)
            # Discordant calls within the class are treated as unknown.
            if flq_missing or (flq_resistant and "0" in flq_calls):
                dst_flq = "NA"
            else:
                dst_flq = "1" if flq_resistant else "0"
            gst_flq = "1" if any(d in resistant_drugs for d in fluoroquinolones) else "0"
            _tally(results, counts, "flq", s, dst_flq, gst_flq)

            #### MDR & XDR ####
            rif, inh = sample_dst["rifampicin"], sample_dst["isoniazid"]
            if rif == "NA" or inh == "NA":
                dst_mdr = "NA"
            else:
                dst_mdr = "1" if rif == "1" and inh == "1" else "0"

            amg_calls = [sample_dst[d] for d in aminoglycosides if d in sample_dst]
            amg_missing = all(call == "NA" for call in amg_calls)
            if dst_mdr == "NA" or flq_missing or amg_missing:
                dst_xdr = "NA"
            elif dst_mdr == "1" and flq_resistant and "1" in amg_calls:
                dst_xdr = "1"
            else:
                dst_xdr = "0"

            #### Profiling results #####
            gst_mdr = "1" if "rifampicin" in resistant_drugs and "isoniazid" in resistant_drugs else "0"
            gst_amg = any(d in resistant_drugs for d in aminoglycosides)
            gst_xdr = "1" if gst_mdr == "1" and gst_flq == "1" and gst_amg else "0"
            _tally(results, counts, "mdr", s, dst_mdr, gst_mdr)
            _tally(results, counts, "xdr", s, dst_xdr, gst_xdr)

            ### susceptibility ("1" means susceptible for both calls here)
            if "NA" not in [sample_dst[d] for d in first_line]:
                dst_sus = "1" if "1" not in [sample_dst[d] for d in drugs] else "0"
                gst_sus = "1" if all(x not in resistant_drugs for x in first_line) else "0"
                _tally(results, counts, "sus", s, dst_sus, gst_sus)

    with open("results.json", "w") as out:
        json.dump(results, out)
    with open("counts.json", "w") as out:
        json.dump(counts, out)
    with open("counts.json") as counts_fh:
        counts = json.load(counts_fh)

    if args.drugs:
        with open(args.drugs) as drugs_fh:
            drugs = [line.rstrip().lower() for line in drugs_fh]
    else:
        drugs = list(counts.keys())

    print("Drug\tNum\tSusceptible\tResistant\tSensitivity\tSpecificity")
    for d in drugs:
        if d not in counts:
            continue
        tally = counts[d]
        res = tally["tp"] + tally["fn"]
        suc = tally["tn"] + tally["fp"]
        # Both rates need a non-empty denominator.
        if res == 0 or suc == 0:
            continue
        sensitivity = tally["tp"] / res
        specificity = tally["tn"] / suc
        total = tally["tp"] + tally["fp"] + tally["tn"] + tally["fn"]
        print("%s\t%s\t%s\t%s\t%s\t%s" %
              (d.capitalize(), total, suc, res, sensitivity, specificity))
def main_profile(args):
    """Run the profiling pipeline for one sample and write the reports.

    Steps: resolve the database configuration, create the output folders, map
    the reads (unless a BAM file is supplied), run ``pp.bam_profiler``,
    reformat the results with ``tbp.reformat``, write the JSON result plus the
    optional PDF/text/CSV reports, and move the intermediate files into the
    ``bam`` and ``vcf`` sub-folders of ``args.dir``.

    Args:
        args: Parsed command-line namespace. For the nanopore platform
            ``mapper``, ``caller`` and ``no_trim`` are overwritten in place.
    """
    #### Setup conf dictionary ###
    if (args.db == "tbdb" and not args.external_db
            and pp.nofile(sys.base_prefix + "/share/tbprofiler/tbdb.fasta")):
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix, ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        run_delly = not args.no_delly

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if not args.read1:
            sys.exit("\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        if args.no_trim:
            # Paired or unpaired, no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read2:
            # Paired + trimming
            fastq_obj = pp.fastq(args.read1, args.read2).trim(
                files_prefix, threads=args.threads)
        else:
            # Unpaired + trimming
            fastq_obj = pp.fastq(args.read1).trim(
                files_prefix, threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    # NOTE(review): looks like leftover debug output; kept so stdout is unchanged.
    print(args.delly_bcf_file)

    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf,
        bam_file=bam_file,
        prefix=files_prefix,
        platform=args.platform,
        caller=args.caller,
        threads=args.threads,
        no_flagstat=args.no_flagstat,
        run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    with open(args.prefix + ".tmp_results.json", "w") as tmp_out:
        json.dump(results, tmp_out)

    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller,
    }

    report_prefix = args.dir + "/results/" + args.prefix
    json_output = report_prefix + ".results.json"
    tex_output = report_prefix + ".results.tex"
    text_output = report_prefix + ".results.txt"
    csv_output = report_prefix + ".results.csv"
    with open(json_output, "w") as json_out:
        json.dump(results, json_out)

    extra_columns = ([x.lower() for x in args.add_columns.split(",")]
                     if args.add_columns else [])
    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output,
            args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log",
        ])
    if args.txt:
        tbp.write_text(results, conf, text_output, extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" % vars(args))

    ### Add meta data to results
    # NOTE(review): this runs after every report has been written, so the
    # meta_* keys never reach an output file - confirm whether that is intended.
    if args.meta:
        with open(args.meta) as meta_fh:
            for row in csv.DictReader(meta_fh):
                if row["id"] == results["id"]:
                    for col in row:
                        results["meta_" + col] = row[col]

    pp.log("Profiling finished sucessfully!")
def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def main(args):
    """Print a CSV report of how often the listed mutations occur.

    For every ``gene mutation`` pair in ``args.mutations`` the report gives
    its frequency among the analysed samples and, per associated drug, its
    frequency among the isolates marked resistant ("1") in ``args.meta``.

    Args:
        args: Namespace providing ``db``, ``mutations`` (whitespace-separated
            gene/locus tag and mutation per line), ``meta`` (CSV with an
            ``id`` column and one column per drug), ``dir`` (folder holding
            the ``<sample>.results.json`` files), ``samples`` (optional file
            listing the samples to use) and ``drugs`` (optional
            comma-separated filter, lower-cased in place).
    """
    if args.drugs:
        args.drugs = [x.lower() for x in args.drugs.split(",")]
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    drug2genes = defaultdict(set)
    gene2drugs = defaultdict(set)
    gene2lt = {}
    lt2gene = {}
    with open(conf["bed"]) as bed_fh:
        for line in bed_fh:
            row = line.rstrip().split()
            locus_tag, gene_name = row[3], row[4]
            for d in row[5].split(","):
                drug2genes[d].add(locus_tag)
                gene2drugs[locus_tag].add(d)
            # Mutations may be listed by locus tag or by gene name.
            gene2lt[locus_tag] = locus_tag
            gene2lt[gene_name] = locus_tag
            lt2gene[locus_tag] = gene_name

    # Keyed by (locus_tag, mutation); doubles as the membership index below.
    variants = {}
    with open(args.mutations) as mutations_fh:
        for line in mutations_fh:
            row = line.strip().split()
            if not row:
                continue
            variants.setdefault((gene2lt[row[0]], row[1]), set())

    meta = {}
    with open(args.meta) as meta_fh:
        reader = csv.DictReader(meta_fh)
        fieldnames = reader.fieldnames or []
        drug_resistant_isolates = {
            d: set() for d in drug2genes if d in fieldnames
        }
        for row in reader:
            meta[row["id"]] = row
            for drug in drug_resistant_isolates:
                if row[drug] == "1":
                    drug_resistant_isolates[drug].add(row["id"])
    pp.log(f"Analysing {len(drug_resistant_isolates)} drugs")

    suffix = ".results.json"
    if args.samples:
        with open(args.samples) as samples_fh:
            samples = [
                line.rstrip() for line in samples_fh if line.rstrip() in meta
            ]
    else:
        # List the same folder the result files are loaded from below.
        samples = [
            name[:-len(suffix)] for name in os.listdir(args.dir)
            if name.endswith(suffix) and name[:-len(suffix)] in meta
        ]

    variant_drug_associations = defaultdict(set)
    for s in tqdm(samples):
        with open(f"{args.dir}/{s}.results.json") as res_fh:
            sample_results = json.load(res_fh)
        for var in sample_results["dr_variants"] + sample_results["other_variants"]:
            key = (var["locus_tag"], var["change"])
            if key not in variants:
                continue
            variants[key].add(s)
            if "drug" in var:
                variant_drug_associations[key].add(var["drug"])

    total_sample_n = len(samples)
    pp.log(f"Found {total_sample_n} samples in meta list with result files")
    pp.log("-" * 40)
    pp.log(variant_drug_associations)

    print(
        "Gene,Mutation,Drug resistance association,Total frequency (percentage),Associated drugs,Drug resitant frequency (percentage)"
    )
    for (gene, mut), carriers in variants.items():
        total_freq = len(carriers)
        total_pct = _percentage(total_freq, total_sample_n)

        if (gene, mut) in variant_drug_associations:
            drugs = variant_drug_associations[(gene, mut)]
            dr_associated = "Associated"
        else:
            drugs = gene2drugs.get(gene, set())
            dr_associated = "Not associated"
        if args.drugs:
            drugs = [d for d in drugs if d in args.drugs]
        # Sorted so the drug column has the same order on every run.
        drugs = sorted(drugs)

        per_drug = []
        for drug in drugs:
            # A drug without a column in the metadata has no known resistant
            # isolates and is reported as 0 (0.00).
            resistant = drug_resistant_isolates.get(drug, set())
            dr_freq = len(carriers.intersection(resistant))
            per_drug.append("%s (%.2f)" % (dr_freq, _percentage(dr_freq, len(resistant))))

        print("%s,%s,%s,%s (%.2f),%s,%s" %
              (lt2gene[gene], mut, dr_associated, total_freq, total_pct,
               ';'.join(drugs), ';'.join(per_drug)))
def main(args):
    """Call variants in the target regions of a BAM file and dump them as JSON.

    The small variants returned by ``load_csq`` are reformatted with
    ``pp.reformat_mutations`` (the original notation is kept under
    ``_internal_change``). Unless ``args.no_delly`` is set, deletions reported
    by delly that overlap the BED regions are appended. The list is written to
    ``<out_dir>/<prefix>.pp-results.json`` and the intermediate files are
    removed.

    Args:
        args: Namespace providing ``out_dir``, ``prefix``, ``bam``,
            ``platform``, ``ref``/``gff``/``bed``/``ann`` (or ``conf``, a JSON
            file holding those four keys), ``call_method``, ``threads``,
            ``min_depth``, ``af``, ``caller`` and ``no_delly``.
    """
    if pp.nofolder(args.out_dir):
        pp.run_cmd("mkdir %s" % args.out_dir)

    conf = {
        "ref": args.ref,
        "gff": args.gff,
        "bed": args.bed,
        "ann": args.ann,
    }
    if args.conf:
        with open(args.conf) as conf_fh:
            conf = json.load(conf_fh)
    for x in ["ref", "gff", "bed", "ann"]:
        # A key missing from a JSON conf is reported like an unset argument.
        if conf.get(x) is None:
            pp.log("%s variable is not defined" % x, True)

    bam_obj = pp.bam(args.bam, args.prefix, conf["ref"], platform=args.platform)
    bcf_obj = bam_obj.call_variants(
        prefix=args.prefix + ".targets",
        call_method=args.call_method,
        gff_file=conf["gff"],
        bed_file=conf["bed"],
        mixed_as_missing=args.platform != "Illumina",
        threads=args.threads,
        min_dp=args.min_depth,
        af=args.af,
        caller=args.caller)
    csq = bcf_obj.load_csq(ann_file=conf["ann"])

    # Second column -> fourth column of the annotation file, both as integers.
    chr2gene_pos = {}
    with open(conf["ann"]) as ann_fh:
        for line in ann_fh:
            row = line.rstrip().split()
            chr2gene_pos[int(row[1])] = int(row[3])

    variants = []
    # Only the first entry of csq is used - presumably one sample per run.
    for var in list(csq.values())[0]:
        var["_internal_change"] = var["change"]
        var["change"] = pp.reformat_mutations(var["change"], var["type"],
                                              var["gene_id"], chr2gene_pos)
        variants.append(var)

    if not args.no_delly:
        delly_bcf = bam_obj.run_delly()
        deletions = delly_bcf.overlap_bed(conf["bed"])
        for deletion in deletions:
            # Use the deletion's own type and region, not those of whichever
            # small variant was processed last in the loop above.
            # NOTE(review): assumes reformat_mutations accepts the same type
            # string that is stored on the record - confirm.
            tmp_change = pp.reformat_mutations(
                "%(chr)s_%(start)s_%(end)s" % deletion, "large_deletion",
                deletion["region"], chr2gene_pos)
            variants.append({
                "genome_pos": deletion["start"],
                "gene_id": deletion["region"],
                "chr": deletion["chr"],
                "freq": 1,
                "type": "large_deletion",
                "change": tmp_change,
            })

    with open("%s/%s.pp-results.json" % (args.out_dir, args.prefix), "w") as out:
        json.dump(variants, out)

    for x in [
            ".targets.bcf", ".targets.csq.bcf", ".targets.csq.bcf.csi",
            ".targets.delly.bcf", ".targets.delly.bcf.csi",
            ".targets.del_pos.bed", ".targets.gvcf.gz",
            ".targets.gvcf.gz.csi", ".targets.missing.bcf"
    ]:
        # The delly files only exist when delly was run.
        if args.no_delly and "delly" in x:
            continue
        pp.run_cmd("rm %s%s" % (args.prefix, x))