def get_ann(variants): uuid = str(uuid4()) #"463545ef-71fc-449b-8f4e-9c907ee6fbf5" with open(uuid, "w") as O: O.write('##fileformat=VCFv4.2\n') O.write( '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n') O.write('##contig=<ID=Chromosome,length=4411532>\n') O.write( '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttest\n') for var in variants.values(): O.write( "Chromosome\t%(pos)s\t.\t%(ref)s\t%(alt)s\t255\t.\t.\tGT\t1\n" % var) results = {} keys = list(variants.keys()) vals = list(variants.values()) i = 0 for l in pp.cmd_out(f"snpEff ann Mycobacterium_tuberculosis_h37rv {uuid}"): if l[0] == "#": continue row = l.strip().split() for ann in row[7].split(","): a = ann.split("|") if vals[i]["gene"] in [a[3], a[4]]: results[keys[ i]] = a[9] if vals[i]["type"] == "nucleotide" else a[10] i += 1 os.remove(uuid) return results
def main(args):
    """Merge SNPs that fall in the same codon of a CDS into a single MNV record.

    Reads VCF text from `bcftools view args.vcf` (or stdin when no VCF is
    given). Header lines are echoed; indel records (REF or ALT longer than
    one base) and intergenic SNPs pass through unchanged. SNPs sharing a
    (gene, codon-position) are combined into one record whose REF/ALT span
    the full range of affected positions, then everything is re-emitted
    sorted by position.
    """
    ref = pp.fasta(args.ref).fa_dict
    cds = gff_load_cds(args.gff)
    final_list = []
    coding = defaultdict(list)  # (gene, codon_pos) -> list of VCF rows
    generator = pp.cmd_out(
        f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    for l in generator:
        row = l.strip().split()
        if l[0] == "#":
            sys.stdout.write(l.strip() + "\n")
        elif len(row[3]) > 1 or len(row[4]) > 1:
            # Indels are left untouched.
            final_list.append(row)
        else:
            gene, cpos = get_codon_pos(row[0], int(row[1]), cds)
            if gene is None:
                final_list.append(row)
            else:
                coding[(gene, cpos)].append(row)
    for rows in coding.values():
        chrom = rows[0][0]
        pos = sorted(int(r[1]) for r in rows)
        # Reference bases across the whole affected span (1-based VCF POS).
        ref_nucs = {p: ref[chrom][p - 1] for p in range(pos[0], pos[-1] + 1)}
        alt_nucs = ref_nucs.copy()
        # BUG FIX: previously the sorted positions were zipped against the
        # *unsorted* rows (alt_nucs[pos[i]] = rows[i][4]), which mispairs
        # alt alleles whenever the input is not position-sorted. Pair each
        # row with its own position instead.
        for r in rows:
            alt_nucs[int(r[1])] = r[4]
        # Use the lowest-position row as the template so POS matches the
        # start of the joined REF/ALT strings.
        new_row = min(rows, key=lambda r: int(r[1]))
        new_row[3] = "".join(ref_nucs.values())
        new_row[4] = "".join(alt_nucs.values())
        final_list.append(new_row)
    for row in sorted(final_list, key=lambda x: int(x[1])):
        sys.stdout.write("\t".join(row) + "\n")
def get_mean_genotype(self, outfile=None):
    """Write a genotype matrix (one row per ALT allele) to `outfile`.

    Each output row is "CHROM_POS_ALT, REF, ALT, g1, g2, ..." where each
    sample genotype is "0" (hom-ref), "1" (hom-alt) or "NA" (anything
    else, including het/missing). Defaults to `self.prefix + ".geno"` and
    stores the chosen path on `self.outfile`.
    """
    self.outfile = outfile
    if self.outfile is None:
        self.outfile = self.prefix + ".geno"
    # FIX: use a context manager so the file is closed even if the
    # bcftools stream raises part-way through.
    with open(self.outfile, "w") as O:
        for l in tqdm(
                pp.cmd_out(
                    "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%TGT]\\n' %(filename)s"
                    % vars(self))):
            row = l.rstrip().split()
            alts = row[3].split(",")
            # Multi-allelic sites produce one output line per ALT allele.
            for alt in alts:
                ref = "%s/%s" % (row[2], row[2])   # hom-ref genotype string
                tmp = "%s/%s" % (alt, alt)         # hom-alt genotype string
                genos = []
                for x in row[4:]:
                    if x == ref:
                        genos.append("0")
                    elif x == tmp:
                        genos.append("1")
                    else:
                        genos.append("NA")
                O.write("%s, %s, %s, %s\n" %
                        (row[0] + "_" + row[1] + "_" + alt, row[2], alt,
                         ", ".join(genos)))
def get_variant_data(vcf_file,ref_file,gff_file,protein_file):
    """Collect per-position variant consequence records for a SARS-CoV-2 VCF.

    protein_file: CSV with columns Gene, Start, End, Region, "Putative
    function", DOI; rows with Region=="nsp" map orf1ab codon numbers to
    the individual nsp proteins. Consequences come from `bcftools csq`
    piped through the project's `correct_covid_csq.py` helper.

    Returns a flat list of dicts (one per recorded consequence) with keys:
    pos, alts, alt_af, types, changes, gene, gene_function, gene_reference.

    NOTE(review): the collapsed original is ambiguous about nesting; the
    aggregate append is placed inside the single-consequence-type check,
    which also guards the `genes[0]` lookup against empty lists.
    """
    nsp_data = {}    # codon position within orf1ab -> nsp protein row
    gene_info = {}   # gene name -> {"function": ..., "DOI": ...}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {"function":row["Putative function"],"DOI":row["DOI"]}
        if row["Region"]!="nsp": continue
        for i in range(row["Start"],row["End"]+1):
            nsp_data[i] = row
    pp.run_cmd("samtools faidx %s" % ref_file)
    results = defaultdict(list)
    for l in pp.cmd_out("bcftools view %s | bcftools csq -f %s -g %s -p a | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'" % (vcf_file,ref_file,gff_file)):
        # Replace " " with "N" because if the alt allele contains N then
        # translated consequence will have spaces
        row = l.strip().replace(" ","N").split()
        pos,ref,alts_str,af_str,csq_str = row
        # Total alternate allele frequency across all ALT alleles.
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0]=="@":
                # "@NNN" entries reference the consequence of another
                # position; deliberately skipped here.
                # results[pos].append(results[int(csqs[i][1:])][0])
                pass
            elif csqs[i]==".":
                results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":"intergenic","changes":"NA","gene":"NA","gene_function":"NA","gene_reference":"NA"})
            else:
                # BCSQ fields: type|gene|transcript|biotype|strand|aa_change|...
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*",""))
                if csq[1]=="orf1ab":
                    # Re-map orf1ab codon numbers onto nsp coordinates.
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos-nsp_data[codon_pos]["Start"]+1
                        changes.append(change_codon_number(csq[5],codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5] if len(csq)>5 else "")
                    genes.append(csq[1])
        # Only record an aggregate entry when all consequences agree on
        # one type (this also skips positions with no gene consequences).
        if len(set(types))==1:
            types = list(set(types))
            results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":",".join(types), "changes":",".join(changes),"gene":genes[0], "gene_function":gene_info[genes[0]]["function"], "gene_reference":gene_info[genes[0]]["DOI"]})
    final_results = []
    for res in list(results.values()):
        for r in res:
            final_results.append(r)
        # if len(res)==1:
        #     final_results.append(res[0])
        # else:
        #     quit("ERROR! more than one variant for a position")
    return final_results
def main(args):
    """Rename the chromosome column of every VCF record.

    `args.source` and `args.target` are parallel lists defining the
    old -> new chromosome name mapping. Input is read from
    `bcftools view args.vcf` when a VCF is given, otherwise from stdin;
    header lines (starting with '#') are passed through untouched and
    everything is written to stdout.
    """
    mapping = dict(zip(args.source, args.target))
    stream = pp.cmd_out(f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    for line in stream:
        if not line.startswith("#"):
            fields = line.strip().split()
            fields[0] = mapping[fields[0]]
            line = "\t".join(fields) + "\n"
        sys.stdout.write(line)
def create_species_db(args, extra_files=None):
    """Assemble a species database: kmer file + version/variables JSON.

    Copies `args.kmers` to `<prefix>.kmers.txt`, writes version metadata
    to `<prefix>.version.json` (from git when --db_name is absent,
    otherwise from the db_* arguments) and a file manifest to
    `<prefix>.variables.json`. `extra_files` maps manifest keys to extra
    file paths, each copied to `<prefix>.<filename>`. With `args.load`,
    every manifest file is installed into the shared data directory.
    """
    if not extra_files:
        extra_files = {}
    version = {"name": args.prefix}
    if not args.db_name:
        # Derive commit/author/date metadata from the latest git commit.
        for l in pp.cmd_out("git log | head -4"):
            row = l.strip().split()
            if row == []:
                continue
            version[row[0].replace(":", "")] = " ".join(row[1:])
        version["commit"] = version["commit"][:7]
    else:
        version["Date"] = str(
            datetime.now()) if not args.db_date else args.db_date
        # args.db_name is known truthy in this branch.
        version["name"] = args.db_name
        # BUG FIX: previously tested `args.db_name` instead of
        # `args.db_commit`, so a missing commit was stored as None
        # rather than "NA".
        version["commit"] = args.db_commit if args.db_commit else "NA"
        version["Author"] = args.db_author if args.db_author else "NA"
    kmer_file = args.prefix + ".kmers.txt"
    version_file = args.prefix + ".version.json"
    shutil.copyfile(args.kmers, kmer_file)
    with open(version_file, "w") as O:
        json.dump(version, O)
    for file in extra_files.values():
        target = f"{args.prefix}.{file}"
        shutil.copyfile(file, target)
    variables_file = args.prefix + ".variables.json"
    variables = {
        "files": {
            "kmers": kmer_file,
            "version": version_file,
            "variables": variables_file
        }
    }
    for key, val in extra_files.items():
        variables["files"][key] = f"{args.prefix}.{val}"
    with open(variables_file, "w") as O:
        json.dump(variables, O)
    if args.load:
        load_dir = f"{sys.base_prefix}/share/{args.software_name}"
        if not os.path.isdir(load_dir):
            os.mkdir(load_dir)
        for key, val in variables['files'].items():
            target = f"{load_dir}/{val}"
            infolog(f"Copying file: {val} ---> {target}")
            shutil.copyfile(val, target)
def get_genesum(self, outfile=None):
    """Write per-gene non-synonymous variant counts per sample.

    Streams consequence-annotated genotypes from bcftools, counts
    non-synonymous records per (gene, sample), and writes one line per
    gene: "gene\tNA\tNA\t<count per sample in self.samples order>".
    Defaults to `self.prefix + ".gensum"`; path stored on `self.outfile`.
    """
    self.outfile = outfile
    if self.outfile is None:
        self.outfile = self.prefix + ".gensum"
    genesum = defaultdict(lambda: defaultdict(int))
    # FIX: context manager guarantees the file is closed even if the
    # bcftools stream raises part-way through.
    with open(self.outfile, "w") as O:
        for l in tqdm(
                pp.cmd_out(
                    "bcftools query -f '[%%SAMPLE\\t%%GT\\t%%TBCSQ\\n]' %(filename)s"
                    % vars(self))):
            row = l.split()
            # Example line:
            # por4A 1/1 synonymous|Rv0002|gene1|protein_coding|+|109L|2378G>A ...
            info = row[2].split("|")
            if info[0] == "synonymous":
                continue
            if info[0][0] == "@":
                # "@" entries reference another record's consequence.
                continue
            genesum[info[1]][row[0]] += 1
        for gene in genesum:
            O.write("%s\tNA\tNA\t%s\n" %
                    (gene, "\t".join(
                        str(genesum[gene][s]) for s in self.samples)))
def run_profile(uniq_id, storage_dir, fasta=None, R1=None, R2=None):
    """Worker entry point: run covid-profiler on one submission and store results.

    Exactly one input mode is expected: a fasta assembly, single-end reads
    (R1), or paired reads (R1+R2). Results are zipped, depth is computed
    for read inputs, and the document for `uniq_id` in MongoDB
    (test_database.profiler_results) is updated. Returns True on success,
    False when no usable input was given.
    """
    cp.log("This is the worker. Running %s" % uniq_id)
    if fasta:
        pp.run_cmd(
            "covid-profiler.py profile --fasta %s --prefix %s --dir %s" %
            (fasta, uniq_id, storage_dir))
    elif R1 and not R2:
        pp.run_cmd("covid-profiler.py profile -1 %s --prefix %s --dir %s" %
                   (R1, uniq_id, storage_dir))
    elif R1 and R2:
        pp.run_cmd(
            "covid-profiler.py profile -1 %s -2 %s --prefix %s --dir %s" %
            (R1, R2, uniq_id, storage_dir))
    else:
        # BUG FIX: previously fell through and tried to zip/load results
        # that were never produced, crashing later with an obscure error.
        sys.stderr.write("ERROR!!! Check file inputs to profile worker!\n")
        return False
    pp.run_cmd("zip -j %s/%s.zip %s/%s*" %
               (storage_dir, uniq_id, storage_dir, uniq_id))
    results = json.load(open("%s/%s.results.json" % (storage_dir, uniq_id)))
    if R1:
        # Read inputs also get an uncompressed VCF and a mean depth figure.
        pp.run_cmd("bcftools view %s/%s.vcf.gz > %s/%s.vcf" %
                   (storage_dir, uniq_id, storage_dir, uniq_id))
        for l in pp.cmd_out(
                "bedtools genomecov -ibam %s/%s.bam -d | datamash mean 3" %
                (storage_dir, uniq_id)):
            cp.log(l)
            results["mean_depth"] = round(float(l.strip()), 2)
    results["num_variants"] = len(results["variants"])
    client = MongoClient()
    db = client.test_database
    db.profiler_results.find_one_and_update(
        {"_id": uniq_id}, {"$set": {
            "results": results,
            "status": "done"
        }})
    return True
def __init__(self, filename, prefix=None, threads=1):
    """Wrap a VCF/BCF file, deriving an output prefix and the sample list.

    When no prefix is supplied, a known extension (.bcf, .gbcf, .vcf.gz,
    .gvcf.gz, .vcf) is stripped from the filename to form it; otherwise
    the filename itself is used. The file is indexed and sample names are
    collected via `bcftools query -l`.
    """
    self.samples = []
    self.filename = filename
    self.threads = threads
    if prefix is None:
        # Strip the first matching known extension (checked in the same
        # order as before); fall back to the full filename.
        self.prefix = filename
        for ext in (".bcf", ".gbcf", ".vcf.gz", ".gvcf.gz", ".vcf"):
            if filename.endswith(ext):
                self.prefix = filename[:-len(ext)]
                break
    else:
        self.prefix = prefix
    index_bcf(filename, self.threads)
    for line in pp.cmd_out("bcftools query -l %(filename)s" % vars(self)):
        self.samples.append(line.rstrip())
def main(args):
    """Compute the median ReadPosRankSum for one mutation across a cohort.

    Finds every sample in the tbprofiler results directory carrying
    (args.gene, args.variant), then re-annotates each sample's VCF slice
    with GATK's ReadPosRankSumTest and reports the median value to stdout
    as "gene<TAB>variant<TAB>value" (or a status keyword when the mutation
    is absent / maps to multiple genome positions / yields no values).
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs:
    # {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    # NOTE(review): currently unused in this function.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # If a list of samples is supplied through the args object, store it in a
    # list, else get the list from looking in the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix,"") for x in os.listdir(args.results_dir) if x[-len(args.suffix):]==args.suffix]
    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.results_dir,s,args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"]==args.gene or var["locus_tag"]==args.gene) and var["change"]==args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])
    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    # samples_with_mutation = ["ERR2515541","ERR2510504","ERR2864225","SRR7341698"]
    if len(samples_with_mutation)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Mutation_not_found"))
        quit()
    elif len(variant_position_set)>1:
        # The same change maps to more than one genome position: ambiguous.
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Multiple_genome_pos"))
        quit()
    if len(variant_position_set)==1:
        variant_position = int(list(variant_position_set)[0])
        sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # variant_position = 3841662
    params = vars(args)
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params,verbose=0)
        # Slice out just the variant position for this sample.
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params,verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params,verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params,verbose=0):
            row = l.strip().split()
            if row[1]==".":
                # GATK could not compute the annotation for this record.
                continue
            if int(row[0])==variant_position:
                read_pos_rank_sums.append((s,float(row[1])))
    if len(read_pos_rank_sums)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,statistics.median([x[1] for x in read_pos_rank_sums])))
    pp.rm_files([params["tmp_vcf"]])
def get_variant_data(vcf_file, ref_file, gff_file, protein_file):
    """Collect one consequence record per position for a SARS-CoV-2 VCF.

    Like the list-based variant of this function, but stores a single dict
    per position (later entries overwrite earlier ones) and resolves "@"
    back-references by aliasing the referenced position's record.

    NOTE(review): an "@NNN" entry assumes position NNN was already seen on
    an earlier line (KeyError otherwise) — confirm input is sorted.
    NOTE(review): nesting of the final assignment is reconstructed; it is
    guarded by the single-consequence-type check, which also protects the
    `genes[0]` lookup against empty lists.
    """
    nsp_data = {}    # codon position within orf1ab -> nsp protein row
    gene_info = {}   # gene name -> {"function": ..., "DOI": ...}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {
            "function": row["Putative function"],
            "DOI": row["DOI"]
        }
        if row["Region"] != "nsp":
            continue
        for i in range(row["Start"], row["End"] + 1):
            nsp_data[i] = row
    pp.run_cmd("samtools faidx %s" % ref_file)
    results = {}
    for l in pp.cmd_out(
            "bcftools view %s | bcftools csq -f %s -g %s | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'"
            % (vcf_file, ref_file, gff_file)):
        pos, ref, alts_str, af_str, csq_str = l.strip().split()
        # Total alternate allele frequency across all ALT alleles.
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0] == "@":
                # "@NNN" references the consequence recorded at position NNN.
                results[pos] = results[int(csqs[i][1:])]
            elif csqs[i] == ".":
                results[pos] = {
                    "pos": pos,
                    "alts": alts_str,
                    "alt_af": alt_af,
                    "types": "intergenic",
                    "changes": "NA",
                    "gene": "NA",
                    "gene_function": "NA",
                    "gene_reference": "NA"
                }
            else:
                # BCSQ fields: type|gene|transcript|biotype|strand|aa_change|...
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*", ""))
                if csq[1] == "orf1ab":
                    # Re-map orf1ab codon numbers onto nsp coordinates.
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos - nsp_data[codon_pos]["Start"] + 1
                        changes.append(change_codon_number(csq[5], codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5])
                    genes.append(csq[1])
        # Only record an aggregate entry when all consequences agree on
        # one type (this also skips positions with no gene consequences).
        if len(set(types)) == 1:
            types = list(set(types))
            results[pos] = {
                "pos": pos,
                "alts": alts_str,
                "alt_af": alt_af,
                "types": ",".join(types),
                "changes": ",".join(changes),
                "gene": genes[0],
                "gene_function": gene_info[genes[0]]["function"],
                "gene_reference": gene_info[genes[0]]["DOI"]
            }
    return results
def main_profile(args):
    """Profile one sample (fasta assembly or fastq reads) against the covid DB.

    Builds a whole-genome VCF (direct from assembly, or via mapping and
    variant calling for reads), writes a consensus fasta, assigns a clade
    from barcode mutations, extracts variant data, and dumps everything to
    `<dir>/<prefix>.results.json`. Clade is also echoed to stdout.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix
    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            quit()
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            quit()
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            # BUG FIX: previously passed args.read2 (None here) as a second
            # read file: pp.fastq(args.read1, args.read2)
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # Remove the intermediate trimmed fastq files.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]
    results = {}
    if not args.fasta:
        for l in pp.cmd_out(
                "bedtools genomecov -ibam %s | datamash median 3" %
                (bam_obj.bam_file)):
            # BUG FIX: datamash's median can be fractional (e.g. "12.5"),
            # which int() cannot parse directly — go through float first.
            results["mean_depth"] = int(float(l.strip()))
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade
    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data
    json.dump(results, open("%s.results.json" % files_prefix, "w"))
def profile_vcf(filename, conf):
    """Profile a user-supplied VCF against the mutation database.

    Normalizes the VCF (injecting a dummy FORMAT/AD field when absent,
    since downstream code needs allelic depths), annotates consequences
    with `bcftools csq`, extracts barcode genotypes, and compares against
    the JSON mutation DB. Returns the results dict from `pp.db_compare`
    with "barcode" and "missing_regions" attached.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()
    # Detect whether the input VCF already carries a FORMAT/AD definition:
    # l keeps the last matching header line ("" when grep finds none).
    l = ""
    for l in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = False if l == "" else True
    if AD_found == False:
        # No AD field: fabricate one (0,100 for every record/sample) and
        # annotate it into a copy of the VCF.
        open(params["tmphdr"], "w").write(
            "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
        )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)
    # Consequence-annotate only the regions covered by the BED file.
    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])
    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # Single-sample VCF expected: the loop leaves the last sample's
    # consequences in results["variants"].
    for sample in csq:
        results["variants"] = csq[sample]
    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    # Hard-coded overrides for four barcode positions: a depth of exactly
    # 50 on the reference base is rewritten as 25 on the alternate.
    # NOTE(review): rationale not evident from this file — presumably
    # compensating for known get_bed_gt artifacts; confirm upstream.
    if "C" in mutations["Chromosome"][325505] and mutations["Chromosome"][
            325505]["C"] == 50:
        mutations["Chromosome"][325505] = {"T": 25}
    if "G" in mutations["Chromosome"][599868] and mutations["Chromosome"][
            599868]["G"] == 50:
        mutations["Chromosome"][599868] = {"A": 25}
    if "C" in mutations["Chromosome"][931123] and mutations["Chromosome"][
            931123]["C"] == 50:
        mutations["Chromosome"][931123] = {"T": 25}
    if "T" in mutations["Chromosome"][1759252] and mutations["Chromosome"][
            1759252]["T"] == 50:
        mutations["Chromosome"][1759252] = {"G": 25}
    # NOTE(review): writes "dump.json" into the current directory — looks
    # like leftover debug output; consider removing.
    json.dump(mutations, open("dump.json", "w"))
    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)
    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions
    # Clean up temp files (the AD-injection path created extra ones).
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results