def main_load_library(args):
    """Install a reference library into the covidprofiler share folder.

    Copies the library files that share the ``args.prefix`` stem into
    ``<sys.base_prefix>/share/covidprofiler/`` and then indexes the copied
    reference FASTA with ``samtools faidx``, ``bwa index`` and
    ``gatk CreateSequenceDictionary``.

    Args:
        args: Parsed command-line namespace. Only ``args.prefix`` is read:
            the path stem of the library files, to which ``.gff``,
            ``.fasta``, ``.barcode.bed`` and ``.version.json`` are appended.
    """
    lib_prefix = args.prefix.split("/")[-1]
    files = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
    }

    share_dir = sys.base_prefix + "/share/covidprofiler"
    if pp.nofolder(share_dir):
        # makedirs also creates a missing ``share`` parent, which a plain
        # ``mkdir`` would fail on.
        os.makedirs(share_dir, exist_ok=True)

    # Every installed file is ``<share_dir>/<lib_prefix><suffix>``; build the
    # common stem once instead of re-concatenating it for each command.
    installed_prefix = share_dir + "/" + lib_prefix
    for suffix in files.values():
        pp.run_cmd("cp %s %s" % (args.prefix + suffix, installed_prefix + suffix))

    ref_fasta = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % ref_fasta)
    pp.run_cmd("bwa index %s" % ref_fasta)

    # Remove a dictionary left by a previous import before regenerating it.
    # NOTE(review): presumably needed because the GATK tool will not
    # overwrite an existing output file - confirm.
    seq_dict = installed_prefix + ".dict"
    if os.path.isfile(seq_dict):
        pp.run_cmd("rm %s" % seq_dict)
    pp.run_cmd("gatk CreateSequenceDictionary -R %s" % ref_fasta)

    pp.log("Sucessfully imported library")
def main_load_library(args):
    """Install a reference library into the covidprofiler share folder.

    Copies the multiple-sequence alignment, its metadata table and the
    library files that share the ``args.prefix`` stem into
    ``<sys.base_prefix>/share/covidprofiler/``, then indexes the copied
    reference FASTA with ``samtools faidx`` and ``bwa index``.

    Args:
        args: Parsed command-line namespace. Attributes read:
            ``prefix`` (path stem of the library files), ``msa`` (copied to
            ``<lib>.msa.fa``) and ``meta`` (copied to ``<lib>.msa.meta.csv``).
    """
    lib_prefix = args.prefix.split("/")[-1]
    files = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "non_coding_bed": ".non_coding.bed",
    }

    share_dir = sys.base_prefix + "/share/covidprofiler"
    if pp.nofolder(share_dir):
        # makedirs also creates a missing ``share`` parent, which a plain
        # ``mkdir`` would fail on.
        os.makedirs(share_dir, exist_ok=True)

    # Every installed file is ``<share_dir>/<lib_prefix><suffix>``; build the
    # common stem once instead of re-concatenating it for each command.
    installed_prefix = share_dir + "/" + lib_prefix
    pp.run_cmd("cp %s %s" % (args.msa, installed_prefix + ".msa.fa"))
    pp.run_cmd("cp %s %s" % (args.meta, installed_prefix + ".msa.meta.csv"))
    for suffix in files.values():
        pp.run_cmd("cp %s %s" % (args.prefix + suffix, installed_prefix + suffix))

    ref_fasta = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % ref_fasta)
    pp.run_cmd("bwa index %s" % ref_fasta)

    # A sequence dictionary left by an earlier import would describe the old
    # reference, so it is removed; this function does not regenerate it.
    seq_dict = installed_prefix + ".dict"
    if os.path.isfile(seq_dict):
        pp.run_cmd("rm %s" % seq_dict)

    pp.log("Sucessfully imported library")
def main_profile(args):
    """Profile one sample against a tbprofiler database and write its reports.

    Steps, in order: resolve the database configuration, create the output
    folders, map the reads to the reference (skipped when ``args.bam`` is
    given), run ``pp.bam_profiler``, reformat the results with
    ``tbp.reformat``, merge optional per-sample metadata, write
    ``<dir>/results/<prefix>.results.json`` plus the optional PDF/text/CSV
    reports, and finally move the BAM/VCF/BCF files into the ``bam`` and
    ``vcf`` sub-folders of ``args.dir``.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``db``,
            ``external_db``, ``dir``, ``prefix``, ``platform``, ``no_delly``,
            ``bam``, ``read1``, ``read2``, ``no_trim``, ``threads``,
            ``mapper``, ``caller``, ``no_flagstat``, ``calling_params``,
            ``coverage_fraction_threshold``, ``missing_cov_threshold``,
            ``delly_bcf_file``, ``reporting_af``, ``add_columns``, ``pdf``,
            ``txt``, ``csv`` and ``meta``. When ``platform`` is
            ``"nanopore"`` the ``mapper``, ``caller`` and ``no_trim``
            attributes are overwritten in place.

    Raises:
        SystemExit: If neither a BAM file nor ``read1`` was supplied.
    """
    ### Setup conf dictionary ###
    if (args.db == "tbdb" and not args.external_db
            and pp.nofile(sys.base_prefix + "/share/tbprofiler/tbdb.fasta")):
        # NOTE(review): ext=True presumably makes pp.log terminate the
        # program - confirm against pathogenprofiler.
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix,
            ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    for subfolder in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + subfolder):
            os.mkdir(args.dir + "/" + subfolder)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        run_delly = not args.no_delly

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)
        else:
            sys.exit("\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        bam_obj = fastq_obj.map_to_ref(
            ref_file=conf["ref"], prefix=files_prefix, sample_name=args.prefix,
            aligner=args.mapper, platform=args.platform, threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf, bam_file=bam_file, prefix=files_prefix,
        platform=args.platform, caller=args.caller, threads=args.threads,
        no_flagstat=args.no_flagstat, run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    # The raw profiler output is kept in the current working directory.
    with open(args.prefix + ".tmp_results.json", "w") as tmp_handle:
        json.dump(results, tmp_handle)

    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller,
    }

    ### Add meta data to results ###
    # This has to happen before any report is written: merging it after the
    # writers have run would leave the metadata out of every output file.
    if args.meta:
        with open(args.meta, newline="") as meta_handle:
            for row in csv.DictReader(meta_handle):
                if row["id"] == results["id"]:
                    for col in row:
                        results["meta_" + col] = row[col]

    ### Write the reports ###
    results_stem = args.dir + "/results/" + args.prefix
    json_output = results_stem + ".results.json"
    tex_output = results_stem + ".results.tex"
    text_output = results_stem + ".results.txt"
    csv_output = results_stem + ".results.csv"
    with open(json_output, "w") as json_handle:
        json.dump(results, json_handle)

    if args.add_columns:
        extra_columns = [x.lower() for x in args.add_columns.split(",")]
    else:
        extra_columns = []

    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output,
            args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log",
        ])
    if args.txt:
        tbp.write_text(results, conf, text_output, extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        # fastq_obj only exists when the reads were mapped in this run, so
        # the trimmed-read cleanup must stay inside this branch.
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" % vars(args))

    pp.log("Profiling finished sucessfully!")
def main_profile(args):
    """Assign a clade and annotate variants for one sample.

    Variants are obtained either from an assembly (``args.fasta``) via
    ``get_ref_variants`` or from reads (``args.read1``/``args.read2``), which
    are optionally trimmed, mapped to the reference and variant-called; in
    the read path a consensus FASTA is also written. The clade string is
    printed to stdout as ``<prefix>\\t<clade>`` and the clade plus the variant
    annotation are written to ``<dir>/<prefix>.results.json``.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``dir``,
            ``db``, ``prefix``, ``fasta``, ``read1``, ``read2``, ``no_trim``,
            ``threads``, ``mapper``, ``platform`` and ``caller``.

    Raises:
        SystemExit: With status 1 if both an assembly and reads are given,
            or if neither is given.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            # Exit non-zero so calling scripts can detect the usage error.
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"], prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            # Exit non-zero so calling scripts can detect the usage error.
            sys.exit(1)

        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)

        bam_obj = fastq_obj.map_to_ref(
            ref_file=conf["ref"], prefix=files_prefix, sample_name=args.prefix,
            aligner=args.mapper, platform=args.platform, threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"], args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        # Trimmed reads are intermediate files; the untrimmed inputs are kept.
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))

    results = {}

    # The clade is the sorted, ';'-joined set of barcode annotations.
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    with open("%s.results.json" % files_prefix, "w") as json_handle:
        json.dump(results, json_handle)
def main(args):
    """Call variants in target regions of a BAM file and dump them as JSON.

    Small variants are called with ``call_variants`` and annotated through
    ``load_csq``; unless ``args.no_delly`` is set, large deletions found by
    ``run_delly`` that overlap the BED regions are appended. The combined
    list is written to ``<out_dir>/<prefix>.pp-results.json`` and the
    intermediate ``<prefix>.targets.*`` files are removed.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``out_dir``, ``ref``, ``gff``, ``bed``, ``ann``, ``conf`` (path
            to a JSON file that replaces the four previous values), ``bam``,
            ``prefix``, ``platform``, ``call_method``, ``threads``,
            ``min_depth``, ``af``, ``caller`` and ``no_delly``.
    """
    if pp.nofolder(args.out_dir):
        pp.run_cmd("mkdir %s" % args.out_dir)

    conf = {
        "ref": args.ref,
        "gff": args.gff,
        "bed": args.bed,
        "ann": args.ann,
    }
    if args.conf:
        with open(args.conf) as conf_handle:
            conf = json.load(conf_handle)
    for key in ["ref", "gff", "bed", "ann"]:
        # .get() so that a key missing from a JSON conf is reported through
        # the same message instead of raising KeyError.
        if conf.get(key) is None:
            # NOTE(review): the second positional argument is presumably
            # the "exit" flag of pp.log - confirm.
            pp.log("%s variable is not defined" % key, True)

    bam_obj = pp.bam(args.bam, args.prefix, conf["ref"], platform=args.platform)
    bcf_obj = bam_obj.call_variants(
        prefix=args.prefix + ".targets",
        call_method=args.call_method,
        gff_file=conf["gff"],
        bed_file=conf["bed"],
        mixed_as_missing=args.platform != "Illumina",
        threads=args.threads,
        min_dp=args.min_depth,
        af=args.af,
        caller=args.caller)
    csq = bcf_obj.load_csq(ann_file=conf["ann"])

    # Map the integer in column 2 of the annotation file to the integer in
    # column 4 (whitespace separated).
    # NOTE(review): presumably chromosome position -> gene position, going
    # by the variable name - confirm.
    chr2gene_pos = {}
    with open(conf["ann"]) as ann_handle:
        for line in ann_handle:
            row = line.rstrip().split()
            if not row:
                continue  # tolerate blank lines
            chr2gene_pos[int(row[1])] = int(row[3])

    variants = []
    # Only the first entry of csq is used.
    # NOTE(review): presumably csq is keyed by sample and holds a single
    # sample - confirm.
    for var in list(csq.values())[0]:
        var["_internal_change"] = var["change"]
        var["change"] = pp.reformat_mutations(
            var["change"], var["type"], var["gene_id"], chr2gene_pos)
        variants.append(var)

    if not args.no_delly:
        delly_bcf = bam_obj.run_delly()
        deletions = delly_bcf.overlap_bed(conf["bed"])
        for deletion in deletions:
            # Describe the deletion with its own type and region. Reusing
            # the loop variable of the small-variant loop would label it
            # with the last small variant's type/gene and raise NameError
            # when no small variant was called.
            tmp_change = pp.reformat_mutations(
                "%(chr)s_%(start)s_%(end)s" % deletion,
                "large_deletion", deletion["region"], chr2gene_pos)
            variants.append({
                "genome_pos": deletion["start"],
                "gene_id": deletion["region"],
                "chr": deletion["chr"],
                "freq": 1,
                "type": "large_deletion",
                "change": tmp_change,
            })

    with open("%s/%s.pp-results.json" % (args.out_dir, args.prefix), "w") as out_handle:
        json.dump(variants, out_handle)

    # Remove intermediate files; the delly outputs only exist if delly ran.
    for suffix in [
            ".targets.bcf", ".targets.csq.bcf", ".targets.csq.bcf.csi",
            ".targets.delly.bcf", ".targets.delly.bcf.csi",
            ".targets.del_pos.bed", ".targets.gvcf.gz",
            ".targets.gvcf.gz.csi", ".targets.missing.bcf"
    ]:
        if args.no_delly and "delly" in suffix:
            continue
        pp.run_cmd("rm %s%s" % (args.prefix, suffix))