def fasta2vcf(fasta_file, outfile):
    """Convert a multi-FASTA file into one merged, indel-free VCF.

    Every sequence in ``fasta_file`` is written to its own temporary FASTA
    and passed to ``get_ref_variants`` against the configured reference.
    The per-sample ``<sample>.vcf.gz`` files are then merged with
    ``bcftools merge`` in chunks of 200 samples, the chunk VCFs are merged
    again, indels are filtered out and the result is written to ``outfile``.

    NOTE: the database name is read from a module-level ``args`` object,
    which is not a parameter of this function.

    Args:
        fasta_file: Path to the input multi-FASTA file.
        outfile: Path of the bgzipped VCF to create.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())

    for sample in samples:
        fname = pp.get_random_file()
        # Close (and therefore flush) the file before it is read back.
        with open(fname, "w") as handle:
            handle.write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        fasta_obj = pp.fasta(fname)
        # The returned object is not needed; the call is kept because the
        # per-sample VCF (and presumably its index) is consumed further down.
        pp.vcf(fasta_obj.get_ref_variants(conf["ref"], sample))
        pp.run_cmd("rm %s" % fname)

    # Merge in chunks so a single bcftools call never receives the full
    # sample list at once.
    chunk_size = 200
    sample_chunks = [
        samples[i:i + chunk_size] for i in range(0, len(samples), chunk_size)
    ]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        with open(tmp_list, "w") as handle:
            handle.write("\n".join("%s.vcf.gz" % x for x in tmp_samps))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])

    pp.run_cmd(
        "bcftools merge -0 %s | bcftools view -V indels -Oz -o %s"
        % (" ".join(tmp_vcfs), outfile))

    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    # `bcftools index` writes a .csi next to each chunk VCF; remove those too.
    tmp_vcf_csi_files = ["%s.csi" % f for f in tmp_vcfs]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs + tmp_vcf_csi_files)
def run_fuzznuc(seqs, pattern, pmismatch=0):
    """Run ``fuzznuc`` on a sequence file and return the parsed hits.

    Args:
        seqs: Value passed to ``fuzznuc -sequence``.
        pattern: Value passed to ``fuzznuc -pattern``.
        pmismatch: Value passed to ``fuzznuc -pmismatch`` (default 0).

    Returns:
        Whatever ``parse_fuzznuc_output`` returns for the fuzznuc report.
    """
    tmpfile = pp.get_random_file()
    pp.run_cmd(
        "fuzznuc -sequence %s -pattern %s -outfile %s -complement -pmismatch %s"
        % (seqs, pattern, tmpfile, pmismatch))
    try:
        return parse_fuzznuc_output(tmpfile)
    finally:
        # Remove the report even when parsing fails so it is not left behind.
        pp.rm_files([tmpfile])
def vcf2consensus(bam, vcf, ref, id, consensus):
    """Build a consensus FASTA from a VCF, masking low-depth positions.

    Positions whose depth in ``bam`` (as reported by ``bedtools genomecov
    -d``) is below 10 are collected into a temporary mask file. ``bcftools
    consensus`` then applies ``vcf`` to ``ref`` with those positions masked
    as ``N``, and the FASTA header is rewritten to ``id``.

    Args:
        bam: Path to the alignment used to compute per-position depth.
        vcf: Path to the variant file applied to the reference.
        ref: Path to the reference FASTA.
        id: Name to put on the FASTA header line of the output.
        consensus: Path of the consensus FASTA to write.
    """
    low_depth_mask = pp.get_random_file()

    mask_template = 'bedtools genomecov -d -ibam %s | awk \'$3<10\' | awk \'{print $1"\\t"$2"\\t"$2}\' > %s'
    pp.run_cmd(mask_template % (bam, low_depth_mask))

    consensus_template = "bcftools consensus -f %s -m %s -M N %s | sed 's/^>.*/>%s/' > %s"
    pp.run_cmd(consensus_template % (ref, low_depth_mask, vcf, id, consensus))

    pp.run_cmd("rm %s" % low_depth_mask)
def main(args):
    """Write one multi-sample FASTA per target locus from per-sample VCFs.

    For every sample, ``<dir>/<sample>.targets.csq.vcf.gz`` is filtered, a
    consensus of the target regions is generated with ``samtools faidx`` and
    ``bcftools consensus``, and each locus sequence is appended to the FASTA
    file of that locus (named from column 5 of the configured BED file).

    Args:
        args: Namespace providing ``db`` (database name), ``samples``
            (optional path to a file with one sample name per line) and
            ``dir`` (directory containing the per-sample VCFs).
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    vcf_suffix = ".targets.csq.vcf.gz"
    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle]
    else:
        samples = [
            x.replace(vcf_suffix, "")
            for x in os.listdir(args.dir) if x.endswith(vcf_suffix)
        ]

    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"]
    }
    # tmp_mappings: "<chrom>:<start>-<end>" <TAB> <BED column 5>
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" %
               (conf["bed"], params["tmp_mappings"]))
    # tmp_locations: just the region strings, used by `samtools faidx -r`.
    pp.run_cmd("cut -f1 %s > %s" %
               (params["tmp_mappings"], params["tmp_locations"]))

    locus_handles = {}
    try:
        with open(params["tmp_mappings"]) as mappings:
            for line in mappings:
                row = line.rstrip().split()
                locus_handles[row[0]] = open("%s.fasta" % row[1], "w")

        for s in samples:
            params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
            params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
            params["sample_fa"] = "%s.targets.fa" % (s)
            pp.run_cmd(
                "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
                % params)
            pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
            pp.run_cmd(
                "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
                % params)
            fa_dict = pp.fasta(params["sample_fa"]).fa_dict
            for locus in fa_dict:
                locus_handles[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
            pp.rm_files([params["tmp_vcf"]])
    finally:
        # Close every per-locus FASTA so buffered sequences reach disk,
        # including when a sample fails part-way through.
        for handle in locus_handles.values():
            handle.close()

    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
def phylogeny(prefix, conf_file, sample_file=None, base_dir=".", threads=3):
    """Generate missing per-sample gBCF files and print the merge command.

    For each sample without ``<base_dir>/vcf/<sample>.genome.gbcf``, a gBCF
    is produced from ``<base_dir>/bam/<sample>.bam`` and moved into
    ``<base_dir>/vcf``. The ``merge_vcfs.py`` command line that would merge
    all samples is then printed; it is not executed here.

    Args:
        prefix: Output prefix passed to ``merge_vcfs.py``.
        conf_file: Path to a JSON config file containing at least ``ref``.
        sample_file: Optional path to a file with one sample name per line.
            When omitted, samples are taken from ``*.results.json`` files in
            ``results/`` (relative to the working directory, not
            ``base_dir``).
        base_dir: Directory holding the ``bam`` and ``vcf`` sub-directories.
        threads: Currently unused.
    """
    with open(conf_file) as handle:
        conf = json.load(handle)

    results_suffix = ".results.json"
    if sample_file:
        with open(sample_file) as handle:
            samples = [line.rstrip() for line in handle]
    else:
        samples = [
            x.replace(results_suffix, "")
            for x in os.listdir("results/") if x.endswith(results_suffix)
        ]

    # The sample list file is not removed here: the printed command refers
    # to it.
    samples_file = pp.get_random_file()
    with open(samples_file, "w") as out:
        out.write("%s\n" % "\n".join(samples))

    for s in samples:
        tprefix = s + ".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir, s)):
            bam_file = "%s/bam/%s.bam" % (base_dir, s)
            bam_obj = pp.bam(bam_file, s, conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            # The trailing glob also moves any companion files that share
            # the gBCF prefix.
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file, base_dir))

    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (
        samples_file, conf["ref"], prefix, base_dir)
    print(cmd)
def main(args):
    """Write GEMMA phenotype files and a command list, then optionally run it.

    Phenotype values are read from a CSV keyed by an ``id`` column. For each
    phenotype listed in ``args.phenos`` a ``<pheno>.pheno`` file is written
    (one value per VCF sample, ``NA`` when missing) and a chained ``gemma``
    command line is appended to a temporary command file. Unless
    ``args.preprocess`` is set, the command file is executed with GNU
    ``parallel``.

    Args:
        args: Namespace providing ``vcf``, ``genes``, ``csv``, ``phenos``,
            ``preprocess`` and ``threads``.
    """
    vcf = vcf_class(args.vcf)
    # NOTE: ``<prefix>.geno`` is referenced below but not generated here; the
    # call to vcf.get_mean_genotype() is disabled in this script.
    if args.genes:
        vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"

    meta = {s: {} for s in vcf.samples}
    with open(args.csv) as handle:
        reader = csv.DictReader(handle)
        # Take the column names from the header rather than from the last
        # row, so a CSV with a header but no data rows does not fail.
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            if row["id"] not in meta:
                continue
            for pheno, value in row.items():
                if pheno == "id":
                    continue
                meta[row["id"]][pheno] = value

    with open(args.phenos) as handle:
        phenos = [line.rstrip() for line in handle]

    cmd_file = pp.get_random_file()
    with open(cmd_file, "w") as cmd_handle:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                # NOTE(review): the second argument presumably makes pp.log
                # abort the program - confirm against pp.log.
                pp.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_handle:
                pheno_handle.write("\n".join(
                    meta[s].get(pheno, "NA") for s in vcf.samples))
            # Three chained steps: kinship matrix, LMM on the genotypes,
            # LMM on the gene sums (reusing the kinship matrix).
            cmd_handle.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && "
                "gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s -maf 0.00005 -miss 0.99 && "
                "gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s.genesum -notsnp\n"
                % (pheno_file, geno_file, pheno,
                   pheno_file, geno_file, pheno, pheno,
                   pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        pp.log("Preprocessing finished\n", True)
    else:
        pp.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def main(args):
    """Print the median ReadPosRankSum of one variant across its carriers.

    Sample result files are scanned for the requested gene/variant. If the
    variant is found at exactly one genome position, GATK VariantAnnotator
    is run on that position for every carrying sample and the median of the
    reported ReadPosRankSum values is written to stdout as
    ``gene<TAB>variant<TAB>value``. When no value can be computed the third
    column holds a status string instead (``Mutation_not_found``,
    ``Multiple_genome_pos`` or ``No_values_from_samples``).

    Args:
        args: Namespace providing ``db``, ``samples``, ``suffix``,
            ``results_dir``, ``gene``, ``variant``, ``vcf_dir``, ``bam_dir``
            and ``bam_extension``.
    """
    # Dictionary describing the database files: {"ref": "/path/to/fasta", ...}
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Use the supplied sample list, or fall back to every result file found
    # in the results directory.
    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle]
    else:
        samples = [
            x.replace(args.suffix, "")
            for x in os.listdir(args.results_dir) if x.endswith(args.suffix)
        ]

    # Collect the samples carrying the variant and the position(s) it maps to.
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        result_path = pp.filecheck(
            "%s/%s%s" % (args.results_dir, s, args.suffix))
        with open(result_path) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            gene_matches = (var["gene"] == args.gene
                            or var["locus_tag"] == args.gene)
            if gene_matches and var["change"] == args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write(
        "\nFound %s samples with mutation\n" % len(samples_with_mutation))

    if not samples_with_mutation:
        sys.stdout.write(
            "%s\t%s\t%s\n" % (args.gene, args.variant, "Mutation_not_found"))
        sys.exit()
    if len(variant_position_set) > 1:
        sys.stdout.write(
            "%s\t%s\t%s\n" % (args.gene, args.variant, "Multiple_genome_pos"))
        sys.exit()

    # Exactly one position remains at this point.
    variant_position = int(next(iter(variant_position_set)))
    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")

    # Work on a copy so the caller's namespace is not modified.
    params = dict(vars(args))
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")

    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params, verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params, verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params, verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params, verbose=0):
            row = l.strip().split()
            # "." means the annotation could not be computed for this record.
            if row[1] == ".":
                continue
            if int(row[0]) == variant_position:
                read_pos_rank_sums.append((s, float(row[1])))

    if not read_pos_rank_sums:
        sys.stdout.write(
            "%s\t%s\t%s\n" % (args.gene, args.variant, "No_values_from_samples"))
    else:
        median_value = statistics.median([x[1] for x in read_pos_rank_sums])
        sys.stdout.write(
            "%s\t%s\t%s\n" % (args.gene, args.variant, median_value))

    # tabix leaves a .tbi index next to the temporary VCF; remove both.
    pp.rm_files([params["tmp_vcf"], params["tmp_vcf"] + ".tbi"])
def profile_vcf(filename, conf):
    """Profile a VCF file against the configured database.

    The input is normalised with ``bcftools view -a`` (adding a dummy
    ``FORMAT/AD`` of ``0,100`` when the header declares no AD field),
    restricted to the configured BED regions and annotated with
    ``bcftools csq``. Variants, barcode calls and database matches are then
    collected into a results dictionary.

    Side effect: the barcode genotype calls are also dumped to ``dump.json``
    in the current working directory.

    Args:
        filename: Path to the input VCF.
        conf: Dict providing ``bed``, ``ref``, ``gff``, ``ann``, ``barcode``
            and ``json_db``.

    Returns:
        The dictionary returned by ``pp.db_compare`` with an added
        ``missing_regions`` entry mapping every BED gene to ``"NA"``.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Keep the last header line declaring FORMAT/AD, if there is one.
    ad_header_line = ""
    for ad_header_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = ad_header_line != ""

    if not AD_found:
        # No allelic depths in the input: build an annotation table that
        # assigns AD=0,100 to every record, then add it to the VCF.
        # The header file must be closed before bcftools reads it.
        with open(params["tmphdr"], "w") as handle:
            handle.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])

    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # If several samples are present, the last one wins.
    for sample in csq:
        results["variants"] = csq[sample]

    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])

    # Hard-coded overrides: when the listed allele is seen with a value of
    # exactly 50 at one of these positions, the call is replaced by the
    # alternative allele with a value of 25.
    # NOTE(review): the rationale for these positions and values is not
    # visible here - confirm with the barcode definition.
    position_overrides = (
        (325505, "C", "T"),
        (599868, "G", "A"),
        (931123, "C", "T"),
        (1759252, "T", "G"),
    )
    chrom_calls = mutations["Chromosome"]
    for pos, observed, replacement in position_overrides:
        if observed in chrom_calls[pos] and chrom_calls[pos][observed] == 50:
            chrom_calls[pos] = {replacement: 25}

    with open("dump.json", "w") as handle:
        json.dump(mutations, handle)

    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)

    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions

    # Remove the temporary files; the header/table files only exist when the
    # AD field had to be added. The normalised VCF is no longer needed once
    # the genotypes have been extracted.
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s %(tmpvcf)s" % params)
    else:
        pp.run_cmd(
            "rm %(tmpcsq)s %(tmpvcf)s %(tmphdr)s %(tmptxt)s*" % params)
    return results