def main_lineage(args):
    """Infer the lineage of a sample from a BCF file and optionally write it out.

    Reads ``<tbp._ROOT>/../<args.db>.config.json``, extracts the genotypes at
    the barcode positions listed in ``conf["barcode"]`` from ``args.bcf``,
    builds the barcode and passes the results dict to ``tbp.barcode2lineage``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``db``, ``bcf``, ``prefix`` and ``outfmt``.

    Side effects:
        When ``args.prefix`` is truthy, ``<prefix>.lineage.<outfmt>`` is
        created. Content is written only for ``outfmt`` of ``"json"`` or
        ``"txt"``; any other value leaves the file empty.
    """
    conf_file = pp.filecheck(tbp._ROOT + "/../" + args.db + ".config.json")
    # Context manager so the config handle is closed even if parsing fails.
    with open(conf_file) as conf_handle:
        conf = json.load(conf_handle)

    pp.filecheck(args.bcf)
    bcf = pp.bcf(args.bcf)
    mutations = bcf.get_bed_gt(conf["barcode"], conf["ref"])

    results = {"barcode": pp.barcode(mutations, conf["barcode"])}
    # NOTE(review): barcode2lineage presumably adds results["lineage"] in
    # place (its return value is ignored here) -- confirm against tbp.
    tbp.barcode2lineage(results)

    if args.prefix:
        outfile = "%s.lineage.%s" % (args.prefix, args.outfmt)
        # The file is opened before the format check, matching the previous
        # behaviour of always creating the output file.
        with open(outfile, "w") as out_handle:
            if args.outfmt == "json":
                json.dump(results["lineage"], out_handle)
            elif args.outfmt == "txt":
                out_handle.write(tbp.text.lineagejson2text(results["lineage"]))
def main_profile(args):
    """Profile one sample from an assembly or from sequencing reads.

    Variants are obtained either from ``args.fasta`` (compared against the
    reference) or by mapping ``args.read1``/``args.read2`` and calling
    variants. The clade is the ``;``-joined, sorted list of barcode
    annotations. It is printed to stdout as ``<prefix>\\t<clade>`` and stored,
    together with the variant data, in ``<dir>/<prefix>.results.json``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``dir``, ``db``, ``prefix``, ``fasta``, ``read1``, ``read2``,
            ``no_trim``, ``threads``, ``mapper``, ``platform`` and ``caller``.

    Raises:
        SystemExit: With status 1 when both ``--fasta`` and read files are
            given, or when neither is given.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    # Common prefix for every file produced for this sample.
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            # sys.exit rather than the site-injected quit(): always available
            # and reports the failure through a non-zero exit status.
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            sys.exit(1)

        # Build the fastq object. The single-argument form is used only for
        # single-end reads that are going to be trimmed, as before.
        if args.read2 or args.no_trim:
            # Paired (trimmed or not), or unpaired + no trimming.
            fastq_obj = pp.fastq(args.read1, args.read2)
        else:
            # Unpaired + trimming.
            fastq_obj = pp.fastq(args.read1)
        if not args.no_trim:
            fastq_obj = fastq_obj.trim(files_prefix, threads=args.threads)

        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # The trimmed reads are intermediate files; remove them once
            # mapping and variant calling are done.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))

    # NOTE(review): neither value is used further in this function; kept
    # because loading the reference may act as a validity check -- confirm.
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted(d["annotation"] for d in barcode))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    results["variants"] = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                              conf["gff"], conf["proteins"])

    # Context manager guarantees the JSON is flushed and the handle closed.
    with open("%s.results.json" % files_prefix, "w") as out_handle:
        json.dump(results, out_handle)
def profile_vcf(filename, conf):
    """Profile a sample directly from a VCF file.

    The VCF is normalised with ``bcftools view -a`` into a temporary file. If
    its header has no ``FORMAT/AD`` field, a placeholder ``AD`` of ``0,100``
    is annotated onto every record first. Consequences are then called with
    ``bcftools csq`` on the regions in ``conf["bed"]``, barcode genotypes are
    extracted, and everything is passed to ``pp.db_compare``.

    Args:
        filename: Path to the input VCF.
        conf: Configuration mapping. The keys read here are ``bed``, ``ref``,
            ``gff``, ``ann``, ``barcode`` and ``json_db``.

    Returns:
        The value returned by ``pp.db_compare``, with a ``"missing_regions"``
        entry added that maps every region loaded from ``conf["bed"]`` to
        ``"NA"``.

    Side effects:
        Writes ``dump.json`` (the barcode genotypes) to the current working
        directory, and creates and removes several temporary files.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Drain the grep output; AD is considered present when the last line
    # produced is non-empty.
    last_header_line = ""
    for last_header_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = last_header_line != ""

    if not AD_found:
        # Close the header file before bcftools reads it so the content is
        # guaranteed to be flushed to disk.
        with open(params["tmphdr"], "w") as header_handle:
            header_handle.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])

    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # If several samples are present, the last one iterated wins.
    for sample in csq:
        results["variants"] = csq[sample]

    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])

    # Hard-coded genotype overrides: (position, allele to look for,
    # replacement allele). When the first allele is present with value 50,
    # the site is replaced by {replacement: 25}.
    # NOTE(review): the meaning of the 50/25 values and the choice of sites
    # is not visible here -- confirm against the barcode definition.
    genotype_overrides = (
        (325505, "C", "T"),
        (599868, "G", "A"),
        (931123, "C", "T"),
        (1759252, "T", "G"),
    )
    for position, observed, replacement in genotype_overrides:
        site = mutations["Chromosome"][position]
        if observed in site and site[observed] == 50:
            mutations["Chromosome"][position] = {replacement: 25}

    with open("dump.json", "w") as dump_handle:
        json.dump(mutations, dump_handle)

    results["barcode"] = pp.barcode(mutations, conf["barcode"])
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)

    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    results["missing_regions"] = {gene: "NA" for gene in bed_regions}

    # NOTE(review): tmpvcf is not removed on either path -- confirm whether
    # that is intentional.
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results