def main_load_library(args):
    """Install a library into ``<sys.base_prefix>/share/covidprofiler/``.

    Every component file ``<args.prefix><suffix>`` is copied into the shared
    folder under the basename of ``args.prefix``. The copied reference FASTA
    is then indexed with ``samtools faidx`` and ``bwa index``, and an existing
    ``.dict`` file for the library is deleted.

    Args:
        args: Namespace-like object; only ``args.prefix`` (path prefix of the
            library files to install) is read.
    """
    lib_prefix = args.prefix.split("/")[-1]
    files = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "non_coding_bed": ".non_coding.bed",
    }
    lib_dir = sys.base_prefix + "/share/covidprofiler/"
    # os.makedirs also creates a missing parent folder (a plain `mkdir`
    # fails there) and does nothing when the folder already exists.
    os.makedirs(lib_dir, exist_ok=True)

    for suffix in files.values():
        pp.run_cmd("cp %s %s" % (args.prefix + suffix, lib_dir + lib_prefix + suffix))

    ref_fasta = lib_dir + lib_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % ref_fasta)
    pp.run_cmd("bwa index %s" % ref_fasta)

    # NOTE(review): presumably the old sequence dictionary is dropped because
    # it no longer matches the freshly copied reference - confirm.
    dict_file = lib_dir + lib_prefix + ".dict"
    if os.path.isfile(dict_file):
        os.remove(dict_file)
    pp.log("Sucessfully imported library")
def run_fuzznuc(seqs, pattern, pmismatch=0):
    """Run the external ``fuzznuc`` tool and return its parsed output.

    Args:
        seqs: Value passed to ``-sequence``.
        pattern: Value passed to ``-pattern``.
        pmismatch: Value passed to ``-pmismatch`` (default 0).

    Returns:
        Whatever ``parse_fuzznuc_output`` produces for the fuzznuc report.
    """
    report_path = pp.get_random_file()
    command = (
        "fuzznuc -sequence %s -pattern %s -outfile %s -complement -pmismatch %s"
        % (seqs, pattern, report_path, pmismatch)
    )
    pp.run_cmd(command)
    hits = parse_fuzznuc_output(report_path)
    # The report is only needed for parsing; remove it before returning.
    pp.rm_files([report_path])
    return hits
def get_variant_data(vcf_file, ref_file, gff_file, protein_file):
    """Collect per-variant consequence records from a VCF.

    The VCF is piped through ``bcftools csq`` / ``correct_covid_csq.py`` /
    ``bcftools +fill-tags`` and each resulting line is turned into one or more
    record dicts. Consequences in ``orf1ab`` are re-assigned to the ``nsp``
    row of the protein table covering that codon, with the codon number made
    relative to that row's ``Start``.

    Args:
        vcf_file: Input VCF/BCF path.
        ref_file: Reference FASTA path (indexed with ``samtools faidx`` here).
        gff_file: GFF annotation passed to ``bcftools csq``.
        protein_file: CSV with the columns ``Gene``, ``Start``, ``End``,
            ``Region``, ``Putative function`` and ``DOI``.

    Returns:
        list[dict]: records with the keys ``pos``, ``alts``, ``alt_af``,
        ``types``, ``changes``, ``gene``, ``gene_function`` and
        ``gene_reference``.
    """
    nsp_data = {}
    gene_info = {}
    # Context manager so the protein table handle is closed deterministically.
    with open(protein_file) as protein_handle:
        for row in csv.DictReader(protein_handle):
            row["Start"] = int(row["Start"])
            row["End"] = int(row["End"])
            gene_info[row["Gene"]] = {
                "function": row["Putative function"],
                "DOI": row["DOI"],
            }
            if row["Region"] != "nsp":
                continue
            # Index every position of the nsp range for direct lookup.
            for i in range(row["Start"], row["End"] + 1):
                nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = defaultdict(list)
    for l in pp.cmd_out("bcftools view %s | bcftools csq -f %s -g %s -p a | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'" % (vcf_file, ref_file, gff_file)):
        # Replace " " with "N" because if the alt allele contains N then
        # translated consequence will have spaces
        row = l.strip().replace(" ", "N").split()
        pos, ref, alts_str, af_str, csq_str = row
        alt_af = sum(float(x) for x in af_str.split(","))
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for csq_entry in csq_str.split(","):
            if csq_entry[0] == "@":
                # Back-reference to a consequence reported at another
                # position; deliberately not recorded again here.
                continue
            if csq_entry == ".":
                results[pos].append({
                    "pos": pos, "alts": alts_str, "alt_af": alt_af,
                    "types": "intergenic", "changes": "NA", "gene": "NA",
                    "gene_function": "NA", "gene_reference": "NA",
                })
                continue

            csq = csq_entry.split("|")
            types.append(csq[0].replace("*", ""))
            if csq[1] == "orf1ab":
                codon_pos = get_codon_num(csq[5])
                if codon_pos in nsp_data:
                    genes.append(nsp_data[codon_pos]["Gene"])
                    # Renumber the codon relative to the nsp start.
                    codon_pos = codon_pos - nsp_data[codon_pos]["Start"] + 1
                    changes.append(change_codon_number(csq[5], codon_pos))
                else:
                    genes.append("orf1ab")
                    changes.append(csq[5])
            else:
                changes.append(csq[5] if len(csq) > 5 else "")
                genes.append(csq[1])
            # Collapse repeated identical consequence types to a single one.
            if len(set(types)) == 1:
                types = list(set(types))
            # NOTE(review): nesting reconstructed from a flattened source -
            # one record is appended per gene consequence; confirm.
            results[pos].append({
                "pos": pos, "alts": alts_str, "alt_af": alt_af,
                "types": ",".join(types), "changes": ",".join(changes),
                "gene": genes[0],
                "gene_function": gene_info[genes[0]]["function"],
                "gene_reference": gene_info[genes[0]]["DOI"],
            })

    # Flatten the per-position lists into a single list of records.
    return [record for records in results.values() for record in records]
def index_bcf(bcffile, threads=1, overwrite=False):
    """Build the ``.csi`` index of a BCF file with ``bcftools index``.

    Nothing happens unless ``filecheck(bcffile)`` is truthy. The index is
    (re)built when ``<bcffile>.csi`` is missing, when it is older than
    ``bcffile``, or when ``overwrite`` is set.

    Args:
        bcffile: Path of the file to index.
        threads: Value passed to ``--threads``.
        overwrite: Force re-indexing even if the index looks current.
    """
    if not filecheck(bcffile):
        return
    index_path = bcffile + ".csi"
    command = "bcftools index --threads %s -f %s" % (threads, bcffile)
    # Evaluated left to right, so the mtimes are only read when the index
    # file exists.
    needs_index = (
        nofile(index_path)
        or os.path.getmtime(index_path) < os.path.getmtime(bcffile)
        or overwrite
    )
    if needs_index:
        pp.run_cmd(command)
def fasta2vcf(fasta_file, outfile):
    """Convert a multi-sequence FASTA into one merged, indel-free VCF.

    Each sequence is written to its own temporary FASTA and compared against
    the reference with ``get_ref_variants``. The later merge steps read
    ``<sample>.vcf.gz`` files from the working directory. Those files are
    merged in chunks of 200 with ``bcftools merge``, the chunk VCFs are merged
    again, and indels are removed with ``bcftools view -V indels``.

    NOTE: the database name is read from a module-level ``args`` object
    (``args.db``), not from a parameter.

    Args:
        fasta_file: FASTA file holding one sequence per sample.
        outfile: Path of the merged, compressed VCF to write.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    # NOTE(review): the reference dict is not used below; the call is kept
    # in case loading it has side effects - confirm and remove.
    refseq = pp.fasta(conf["ref"]).fa_dict
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())

    for sample in samples:
        fname = pp.get_random_file()
        # Close (and therefore flush) the file before other code reads it.
        with open(fname, "w") as single_fasta:
            single_fasta.write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        fasta_obj = pp.fasta(fname)
        vcf_obj = pp.vcf(fasta_obj.get_ref_variants(conf["ref"], sample))
        pp.run_cmd("rm %s" % fname)

    # Merge in chunks of 200 samples, then merge the chunk VCFs.
    sample_chunks = [samples[i:i + 200] for i in range(0, len(samples), 200)]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        with open(tmp_list, "w") as list_handle:
            list_handle.write("\n".join(["%s.vcf.gz" % x for x in tmp_samps]))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])

    pp.run_cmd("bcftools merge -0 %s | bcftools view -V indels -Oz -o %s" % (" ".join(tmp_vcfs), outfile))

    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs)
def main_aln(args):
    """Align sequences to the reference, mask the alignment and run iqtree.

    Four external commands run in order, each reading the previous output:

    1. ``mafft`` adds ``args.fasta`` to ``conf["ref"]`` and writes
       ``<prefix>.aln``.
    2. ``covid_profiler_mask_fasta.py`` gets the alignment plus
       ``conf["non_coding_bed"]`` and writes ``<prefix>.bed_masked.aln``.
    3. ``covid_profiler_mask_fasta_non_acgt.py`` writes
       ``<prefix>.bed_masked.acgt.aln``.
    4. ``iqtree`` (model ``GTR+F+R2``) runs on that file with output prefix
       ``<prefix>``.

    Manual pipeline notes kept for reference; ``$P`` stands for
    ``gisaid_hcov-19_2020_07_22_09.meta_filtered.filtered``::

        mafft --auto --thread -1 --keeplength --addfragments $P.fasta ~/covid/cvdb.fasta > $P.aln
        python ~/gisaid_scripts/get_fasta_stats.py --fasta $P.aln --bed ~/covid/static/coding.bed --out $P.stats
        awk '$3<=2.5 && $4<=3 && $5<=50' $P.aln.stats | cut -f1 > seq_filtered_samples.txt
        seqtk subseq $P.aln seq_filtered_samples.txt > $P.site_filtered.aln
        python ~/gisaid_scripts/mask_fasta.py --fasta $P.site_filtered.aln --bed ~/covid/static/non_coding_mask.bed --out $P.site_filtered.bed_masked.aln
        python ~/gisaid_scripts/mask_fasta_non_acgt.py --fasta $P.site_filtered.bed_masked.aln --out $P.site_filtered.bed_masked.acgt.aln
        snp-sites -v $P.site_filtered.bed_masked.acgt.aln | python ~/gisaid_scripts/vcf_fix_ref.py --ref ~/covid/cvdb.fasta | python ~/gisaid_scripts/vcf_mask_non_acgt.py | tqdm | bcftools view -a -Oz -o $P.site_filtered.bed_masked.acgt.vcf.gz
        bcftools norm --threads 4 -m - $P.site_filtered.bed_masked.acgt.vcf.gz -Oz -o $P.site_filtered.bed_masked.acgt.multi_split.vcf.gz

    Args:
        args: Namespace with ``db``, ``threads``, ``fasta`` and ``prefix``.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    prefix = args.prefix
    threads = args.threads

    align_cmd = (
        "mafft --auto --thread %s --keeplength --addfragments %s %s > %s.aln"
        % (threads, args.fasta, conf["ref"], prefix)
    )
    pp.run_cmd(align_cmd)

    bed_mask_cmd = (
        "covid_profiler_mask_fasta.py --fasta %s.aln --bed %s --out %s.bed_masked.aln"
        % (prefix, conf["non_coding_bed"], prefix)
    )
    pp.run_cmd(bed_mask_cmd)

    acgt_mask_cmd = (
        "covid_profiler_mask_fasta_non_acgt.py --fasta %s.bed_masked.aln --out %s.bed_masked.acgt.aln"
        % (prefix, prefix)
    )
    pp.run_cmd(acgt_mask_cmd)

    tree_cmd = (
        "iqtree -m GTR+F+R2 -s %s.bed_masked.acgt.aln -nt %s -czb -pre %s"
        % (prefix, threads, prefix)
    )
    pp.run_cmd(tree_cmd)
def main(args):
    """Write a per-sample consensus FASTA for one gene.

    The gene is located in the database GFF. For every sample that has a
    result file, the variants whose ``locus_tag`` equals ``args.gene`` are
    printed as JSON. If that JSON mentions none of "deletion", "frameshift",
    "inframe", "stop" or "start", the gene region is extracted from the
    reference, the sample VCF is applied with ``bcftools consensus`` and the
    sequence is written to ``<sample>.<gene>.fasta``. Genes on the ``-``
    strand are passed through ``revseq``.

    Args:
        args: Namespace with ``db``, ``samples``, ``vcf_dir``, ``vcf_suffix``,
            ``dir``, ``suffix`` and ``gene``.

    Raises:
        SystemExit: if ``args.gene`` is not found among the GFF gene records.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    # NOTE(review): not used below; kept in case the call has side effects.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a
    # list, else derive the list from the files in the VCF directory.
    if args.samples:
        with open(args.samples) as sample_handle:
            samples = [x.rstrip() for x in sample_handle]
    else:
        samples = [
            x.replace(args.vcf_suffix, "")
            for x in os.listdir(args.vcf_dir)
            if x[-len(args.vcf_suffix):] == args.vcf_suffix
        ]

    # Find the GFF "gene" record of the requested gene.
    gene_row = None
    with open(conf["gff"]) as gff_handle:
        for line in gff_handle:
            fields = line.strip().split()
            if len(fields) <= 2 or fields[2] != "gene":
                continue
            if "Name=%s" % args.gene in line or "gene:%s" % args.gene in line:
                gene_row = fields
                break
    if gene_row is None:
        # Previously the coordinates of the last parsed line were used
        # silently, producing a consensus for the wrong region.
        sys.exit("ERROR: gene %s not found in %s" % (args.gene, conf["gff"]))
    start, end = int(gene_row[3]), int(gene_row[4])

    disruptive_terms = ("deletion", "frameshift", "inframe", "stop", "start")

    # Loop through the sample result files
    for s in tqdm(samples):
        result_file = "%s/%s%s" % (args.dir, s, args.suffix)
        if not os.path.isfile(result_file):
            continue
        # Data has the same structure as the .result.json files
        with open(pp.filecheck(result_file)) as result_handle:
            data = json.load(result_handle)
        gene_variants = json.dumps([
            d for d in data["dr_variants"] + data["other_variants"]
            if d["locus_tag"] == args.gene
        ])
        print(gene_variants)
        # Plain substring test on the serialised variants, as before.
        if any(term in gene_variants for term in disruptive_terms):
            continue
        revseq = "| revseq -sequence /dev/stdin -outseq /dev/stdout" if gene_row[6] == "-" else ""
        pp.run_cmd(
            "samtools faidx %s Chromosome:%s-%s | bcftools consensus %s/%s%s %s | sed 's/^>.*/>%s/' > %s.%s.fasta"
            % (conf["ref"], start, end, args.vcf_dir, s, args.vcf_suffix,
               revseq, s, s, args.gene),
            verbose=1)
def tbprofiler(fq1, fq2, uniq_id, db, storage_dir, platform):
    """Profile one sample and store the outcome in the SQLite database.

    Reads are mapped with bwa against the ``tbdb`` reference and profiled
    with ``pp.bam_profiler``. The results are reformatted and summarised with
    ``tbp``, then written to the ``results`` and ``full_results`` tables for
    the row whose ``id`` is ``uniq_id``. All files starting with
    ``<storage_dir>/<uniq_id>`` are deleted afterwards.

    Args:
        fq1: First (or only) FASTQ file.
        fq2: Second FASTQ file, or ``None`` for single-end data.
        uniq_id: Run identifier; used as file prefix and database row id.
        db: Path of the SQLite database.
        storage_dir: Directory for intermediate and result files.
        platform: Sequencing platform passed to the mapper and profiler.

    Returns:
        bool: ``True`` on completion.

    Raises:
        ValueError: if the FASTQ arguments match neither the paired nor the
            single-end case.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/tbdb")
    drug_order = ["isoniazid", "rifampicin", "ethambutol", "pyrazinamide",
                  "streptomycin", "ethionamide", "fluoroquinolones",
                  "amikacin", "capreomycin", "kanamycin"]

    if fq1 and fq2:
        fastq_obj = pp.fastq(fq1, fq2)
    elif fq1 and fq2 is None:
        fastq_obj = pp.fastq(fq1)
    else:
        # Previously this fell through to an unbound-variable error.
        raise ValueError("tbprofiler needs fq1, and optionally fq2, to be set")

    files_prefix = storage_dir + "/" + uniq_id
    bam_obj = fastq_obj.map_to_ref(
        ref_file=conf["ref"], prefix=files_prefix, sample_name=uniq_id,
        aligner="bwa", platform=platform, threads=4
    )
    bam_file = bam_obj.bam_file

    results = pp.bam_profiler(
        conf=conf, bam_file=bam_file, prefix=files_prefix, platform=platform,
        caller="bcftools", threads=4, no_flagstat=False, run_delly=True
    )
    results = tbp.reformat(results, conf, reporting_af=0.1)
    results["id"] = uniq_id
    results["tbprofiler_version"] = tbp._VERSION
    # The reads are mapped with bwa above; the mapper used to be reported
    # as "bcftools".
    results["pipeline"] = {"mapper": "bwa", "variant_caller": "bcftools"}
    results = tbp.get_summary(results, conf, drug_order=drug_order)

    # Serialise once: the same text goes to the results file and the DB.
    results_json = json.dumps(results)
    outfile = "%s.results.json" % files_prefix
    with open(outfile, "w") as out_handle:
        out_handle.write(results_json)

    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        c.execute(
            "UPDATE results SET result = ?, lineage = ?, drtype = ?, status = 'completed' where id = ?",
            (results_json, results["sublin"], results["drtype"], uniq_id))
        # The WHERE clause was missing here, so every row of full_results
        # was overwritten with this sample's values.
        c.execute(
            "UPDATE full_results SET main_lineage = ?, sub_lineage = ?, DR_type = ?, MDR = ?, XDR = ? where id = ?",
            (results["main_lin"], results["sublin"], results["drtype"],
             results["MDR"], results["XDR"], uniq_id))
        for d in results["drug_table"]:
            # Column names cannot be bound as parameters; they come from the
            # summary drug table, not from user input.
            column = d["Drug"].lower().replace("-", "_")
            c.execute("UPDATE full_results SET %s = ? where id = ?" % column,
                      (d["Mutations"], uniq_id))
        conn.commit()
    finally:
        conn.close()

    pp.run_cmd("rm %s/%s*" % (storage_dir, uniq_id))
    return True
def get_sample_meta(samples, debug=False):
    """Fetch GenBank records for the samples and extract country and date.

    The records are downloaded with ``esearch``/``efetch`` into ``temp.gb``
    in the working directory and parsed from there. The download is skipped
    when ``debug`` is true or when the module-level ``args.debug`` is true,
    so an existing ``temp.gb`` is reused.

    Args:
        samples: Iterable of accession strings, joined with "," for the query.
        debug: Skip the download and parse the existing ``temp.gb``.

    Returns:
        list[dict]: one ``{"id", "country", "date"}`` dict per record;
        missing qualifiers are reported as ``"NA"``.
    """
    # Honour the parameter as well as the global flag: the parameter used
    # to be ignored entirely.
    if not (debug or args.debug):
        pp.run_cmd(
            "esearch -db nucleotide -query '%s' | efetch -format gb > temp.gb"
            % ",".join(samples))

    data = []
    with open("temp.gb") as genbank_handle:
        for seq_record in SeqIO.parse(genbank_handle, "gb"):
            # Drop the version part of the accession ("X.1" -> "X").
            sample = seq_record.id.split(".")[0]
            source = [
                feat for feat in seq_record.features if feat.type == "source"
            ][0]
            qualifiers = source.qualifiers
            country = "NA"
            date = "NA"
            if "country" in qualifiers:
                # Keep only the part before ":" of the country value.
                country = qualifiers["country"][0].split(":")[0]
            if "collection_date" in qualifiers:
                date = qualifiers["collection_date"][0]
            data.append({"id": sample, "country": country, "date": date})
    return data
def main(args):
    """Write one multi-sample FASTA per target locus.

    The regions of the database BED file are converted to
    ``chrom:start-end`` strings, each mapped to the BED's fifth column, which
    names the output ``<name>.fasta``. For every sample the target VCF is
    filtered, a consensus of all regions is built with
    ``bcftools consensus``, and each region's sequence is appended to the
    matching FASTA under the sample name.

    Args:
        args: Namespace with ``db``, ``samples`` and ``dir``.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    if args.samples:
        with open(args.samples) as sample_handle:
            samples = [x.rstrip() for x in sample_handle]
    else:
        samples = [
            x.replace(".targets.csq.vcf.gz", "")
            for x in os.listdir(args.dir)
            if x[-19:] == ".targets.csq.vcf.gz"
        ]

    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"],
    }
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" % (conf["bed"], params["tmp_mappings"]))
    pp.run_cmd("cut -f1 %s > %s" % (params["tmp_mappings"], params["tmp_locations"]))

    # One open output handle per region, keyed by "chrom:start-end".
    FILES = {}
    try:
        with open(params["tmp_mappings"]) as mapping_handle:
            for l in mapping_handle:
                row = l.rstrip().split()
                FILES[row[0]] = open("%s.fasta" % row[1], "w")

        for s in samples:
            params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
            params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
            params["sample_fa"] = "%s.targets.fa" % (s)
            pp.run_cmd(
                "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
                % params)
            pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
            pp.run_cmd(
                "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
                % params)
            fa_dict = pp.fasta(params["sample_fa"]).fa_dict
            for locus in fa_dict:
                FILES[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
            pp.rm_files([params["tmp_vcf"]])
    finally:
        # The handles were never closed before, leaving flushing of the
        # FASTA output to interpreter shutdown.
        for handle in FILES.values():
            handle.close()

    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
def phylogeny(prefix, conf_file, sample_file=None, base_dir=".", threads=3):
    """Prepare per-sample genome gBCF files and print the merge command.

    For every sample without ``<base_dir>/vcf/<sample>.genome.gbcf``, the
    gBCF is generated from ``<base_dir>/bam/<sample>.bam`` and moved into
    ``<base_dir>/vcf``. The ``merge_vcfs.py`` command for all samples is then
    printed; it is not executed here.

    Args:
        prefix: Output prefix passed to ``merge_vcfs.py``.
        conf_file: JSON configuration file; its ``"ref"`` entry is used.
        sample_file: Optional file with one sample name per line. Without it,
            samples are taken from ``*.results.json`` in
            ``<base_dir>/results``.
        base_dir: Directory holding the ``bam``, ``vcf`` and ``results``
            folders.
        threads: Currently unused.
    """
    with open(conf_file) as conf_handle:
        conf = json.load(conf_handle)

    if sample_file:
        with open(sample_file) as sample_handle:
            samples = [x.rstrip() for x in sample_handle]
    else:
        # Look inside base_dir like the bam/ and vcf/ paths below do; the
        # listing used to be hard-wired to "results/" in the working
        # directory (identical for the default base_dir=".").
        results_dir = os.path.join(base_dir, "results")
        samples = [
            x.replace(".results.json", "")
            for x in os.listdir(results_dir)
            if x[-13:] == ".results.json"
        ]

    samples_file = pp.get_random_file()
    with open(samples_file, "w") as OUT:
        OUT.write("%s\n" % "\n".join(samples))

    for s in samples:
        tprefix = s + ".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir, s)):
            bam_file = "%s/bam/%s.bam" % (base_dir, s)
            bam_obj = pp.bam(bam_file, s, conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            # The glob also moves files sharing the gbcf name as a prefix.
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file, base_dir))

    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (samples_file, conf["ref"], prefix, base_dir)
    print(cmd)
def run_profile(uniq_id, storage_dir, fasta=None, R1=None, R2=None):
    """Worker: profile one submission and store the results in MongoDB.

    ``covid-profiler.py profile`` is run on a FASTA, a single FASTQ or a
    FASTQ pair. The outputs are zipped and ``<uniq_id>.results.json`` is
    loaded. For read input the VCF is decompressed and the mean depth of the
    BAM is added. The document with ``_id == uniq_id`` in
    ``test_database.profiler_results`` is then updated with the results and
    status "done".

    Args:
        uniq_id: Run identifier; used as file prefix and Mongo ``_id``.
        storage_dir: Directory where the run's files are written.
        fasta: Assembly FASTA (takes precedence over reads).
        R1: First/only FASTQ file.
        R2: Second FASTQ file for paired-end data.

    Returns:
        bool: ``True`` on completion.
    """
    cp.log("This is the worker. Running %s" % uniq_id)
    if fasta:
        pp.run_cmd(
            "covid-profiler.py profile --fasta %s --prefix %s --dir %s" %
            (fasta, uniq_id, storage_dir))
    elif R1 and not R2:
        pp.run_cmd("covid-profiler.py profile -1 %s --prefix %s --dir %s" %
                   (R1, uniq_id, storage_dir))
    elif R1 and R2:
        pp.run_cmd(
            "covid-profiler.py profile -1 %s -2 %s --prefix %s --dir %s" %
            (R1, R2, uniq_id, storage_dir))
    else:
        # NOTE(review): execution continues after this message and the steps
        # below will fail on the missing outputs - consider returning here.
        sys.stderr.write("ERROR!!! Check file inputs to profile worker!")

    pp.run_cmd("zip -j %s/%s.zip %s/%s*" %
               (storage_dir, uniq_id, storage_dir, uniq_id))

    # Close the results file deterministically instead of leaking the handle.
    with open("%s/%s.results.json" % (storage_dir, uniq_id)) as results_handle:
        results = json.load(results_handle)

    if R1:
        pp.run_cmd("bcftools view %s/%s.vcf.gz > %s/%s.vcf" %
                   (storage_dir, uniq_id, storage_dir, uniq_id))
        for l in pp.cmd_out(
                "bedtools genomecov -ibam %s/%s.bam -d | datamash mean 3" %
                (storage_dir, uniq_id)):
            cp.log(l)
            results["mean_depth"] = round(float(l.strip()), 2)

    results["num_variants"] = len(results["variants"])

    client = MongoClient()
    db = client.test_database
    db.profiler_results.find_one_and_update(
        {"_id": uniq_id},
        {"$set": {"results": results, "status": "done"}})
    return True
def main_profile(args):
    """Call and annotate variants for one BAM and write them as JSON.

    Variants are called over the database BED regions and annotated with
    ``csq``. The annotated variants are written as ``{"variants": [...]}`` to
    ``<prefix><suffix>``, and the intermediate ``.targets`` VCF files are
    removed.

    Args:
        args: Namespace with ``external_db``, ``db``, ``prefix``, ``bam``,
            ``platform``, ``caller``, ``threads`` and ``suffix``.
            ``args.prefix`` is filled in from the BAM file name when empty.
    """
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    if not args.prefix:
        # Derive the prefix from the BAM/CRAM file name.
        args.prefix = args.bam.split("/")[-1].replace(".bam", "").replace(".cram", "")

    bam_obj = pp.bam(args.bam, args.prefix, platform=args.platform)
    vcf_obj = bam_obj.call_variants(conf["ref"],
                                    caller=args.caller,
                                    bed_file=conf["bed"],
                                    threads=args.threads)
    csq_vcf_obj = vcf_obj.csq(conf["ref"], conf["gff"])
    csq = csq_vcf_obj.load_csq(ann_file=conf["ann"])

    results = {"variants": []}
    # If several samples are present, the last one wins.
    for sample in csq:
        results["variants"] = csq[sample]

    outfile = "%s%s" % (args.prefix, args.suffix)
    # Context manager so the JSON is flushed and the handle closed.
    with open(outfile, "w") as out_handle:
        json.dump(results, out_handle)

    pp.run_cmd("rm %(prefix)s.targets.vcf.gz* %(prefix)s.targets.csq.vcf.gz*" % vars(args))
def main(args):
    """Prepare and optionally run ``gemma`` association jobs per phenotype.

    Phenotype values are read from ``args.csv`` (keyed by its ``id`` column)
    for the samples in the VCF. One ``<pheno>.pheno`` file is written per
    phenotype listed in ``args.phenos``, in VCF sample order and with "NA"
    for missing values. The matching gemma command chain is appended to a
    command file. Unless ``args.preprocess`` is set, the command file is run
    through ``parallel``.

    Args:
        args: Namespace with ``vcf``, ``genes``, ``csv``, ``phenos``,
            ``preprocess`` and ``threads``.
    """
    vcf = vcf_class(args.vcf)
    # vcf.get_mean_genotype()
    if args.genes:
        vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"

    meta = {s: {} for s in vcf.samples}
    with open(args.csv) as csv_handle:
        reader = csv.DictReader(csv_handle)
        # Take the header from the reader itself: the column check below
        # used the leaked loop variable and crashed on a header-only CSV.
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            for pheno in row.keys():
                if pheno == "id":
                    continue
                if row["id"] not in meta:
                    continue
                meta[row["id"]][pheno] = row[pheno]

    with open(args.phenos) as pheno_handle:
        phenos = [x.rstrip() for x in pheno_handle]

    cmd_file = pp.get_random_file()
    with open(cmd_file, "w") as X:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                pp.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as P:
                P.write("\n".join([
                    meta[s][pheno] if pheno in meta[s] else "NA"
                    for s in vcf.samples
                ]))
            X.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n"
                .replace("  ", " ")
                % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
                   pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        pp.log("Preprocessing finished\n", True)
    else:
        pp.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def vcf2consensus(bam, vcf, ref, id, consensus):
    """Build a consensus FASTA, masking positions with depth below 10 as N.

    Args:
        bam: BAM file used to compute per-base depth.
        vcf: VCF whose variants are applied to the reference.
        ref: Reference FASTA.
        id: Name written into the FASTA header line.
        consensus: Output FASTA path.
    """
    low_depth_bed = pp.get_random_file()

    # List every position with depth < 10 as a "chrom, pos, pos" interval.
    depth_cmd = (
        'bedtools genomecov -d -ibam %s | awk \'$3<10\' | awk \'{print $1"\\t"$2"\\t"$2}\' > %s'
        % (bam, low_depth_bed)
    )
    pp.run_cmd(depth_cmd)

    # Apply the variants, mask the listed positions with N and rename the
    # header line.
    consensus_cmd = (
        "bcftools consensus -f %s -m %s -M N %s | sed 's/^>.*/>%s/' > %s"
        % (ref, low_depth_bed, vcf, id, consensus)
    )
    pp.run_cmd(consensus_cmd)

    pp.run_cmd("rm %s" % low_depth_bed)
def profile_primer(primerF, primerR, probe, uniq_id, save_dir):
    """Worker: profile a primer/probe set, plot it and mark the job done.

    Runs ``covid-profiler.py primer`` into ``<save_dir>/<uniq_id>.csv``,
    passes that CSV to ``covid_plot_primers.R``, deletes the CSV and sets
    ``status`` to "done" on the document with ``_id == uniq_id`` in
    ``test_database.primer_results``.

    Args:
        primerF: Forward primer.
        primerR: Reverse primer.
        probe: Probe.
        uniq_id: Job identifier; used as file name and Mongo ``_id``.
        save_dir: Directory for the intermediate CSV.

    Returns:
        bool: ``True`` on completion.
    """
    profile_cmd = (
        "covid-profiler.py primer --primerF %s --primerR %s --probe %s --out %s/%s.csv"
        % (primerF, primerR, probe, save_dir, uniq_id)
    )
    pp.run_cmd(profile_cmd)

    plot_cmd = "covid_plot_primers.R %s/%s.csv %s %s %s" % (
        save_dir, uniq_id, primerF, primerR, probe)
    pp.run_cmd(plot_cmd)

    pp.run_cmd("rm %s/%s.csv" % (save_dir, uniq_id))

    mongo = MongoClient()
    database = mongo.test_database
    job_filter = {"_id": uniq_id}
    job_update = {"$set": {"status": "done"}}
    database.primer_results.find_one_and_update(job_filter, job_update)
    return True
def run_phylogeny(file, uniq_id, working_dir="/tmp/"):
    """Worker: run ``covid_profiler_align_fasta.py`` on an uploaded FASTA.

    Args:
        file: Input FASTA path (``--fasta``).
        uniq_id: Job identifier, used as the ``--out`` value.
        working_dir: Directory passed as ``--working-dir``.

    Returns:
        bool: ``True`` on completion.
    """
    cp.log("This is the worker. Running %s" % uniq_id)
    align_cmd = (
        "covid_profiler_align_fasta.py --fasta %s --working-dir %s --out %s"
        % (file, working_dir, uniq_id)
    )
    pp.run_cmd(align_cmd)
    return True
def profile_vcf(filename, conf):
    """Profile an existing VCF against the resistance database.

    If the VCF header declares no ``FORMAT/AD`` field, a constant ``0,100``
    depth is annotated onto every genotype. The VCF is then restricted to the
    BED regions and annotated with ``bcftools csq``. Barcode genotypes are
    extracted and the results are compared with the JSON database.

    Args:
        filename: Input VCF path.
        conf: Database configuration dict; the keys ``bed``, ``ref``,
            ``gff``, ``ann``, ``barcode`` and ``json_db`` are used.

    Returns:
        The result object returned by ``pp.db_compare``, with
        ``missing_regions`` set to "NA" for every BED region.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Keep the last header line that declares FORMAT/AD, if any.
    last_ad_line = ""
    for last_ad_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" % params):
        pass
    AD_found = last_ad_line != ""

    if not AD_found:
        # Close (flush) the header file before bcftools reads it.
        with open(params["tmphdr"], "w") as header_handle:
            header_handle.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])

    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {"pct_reads_mapped": "NA", "num_reads_mapped": "NA"},
    }
    for sample in csq:
        results["variants"] = csq[sample]

    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])

    # Hard-coded barcode positions: when the listed allele has a value of
    # exactly 50, the call is replaced by the given one.
    # NOTE(review): the meaning of 50/25 is not visible here - presumably a
    # correction for positions where the reference carries the barcode
    # allele; confirm.
    barcode_overrides = (
        (325505, "C", {"T": 25}),
        (599868, "G", {"A": 25}),
        (931123, "C", {"T": 25}),
        (1759252, "T", {"G": 25}),
    )
    chrom_calls = mutations["Chromosome"]
    for position, allele, replacement in barcode_overrides:
        if allele in chrom_calls[position] and chrom_calls[position][allele] == 50:
            chrom_calls[position] = replacement

    # Debug dump written to the working directory (kept as before).
    with open("dump.json", "w") as dump_handle:
        json.dump(mutations, dump_handle)

    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)

    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions

    # NOTE(review): tmpvcf is not removed here - confirm whether intended.
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results
def main(args):
    """Report the median ReadPosRankSum of one variant across samples.

    All result files are scanned for ``args.variant`` in ``args.gene``. If
    the variant maps to a single genome position, GATK's
    ``ReadPosRankSumTest`` annotation is computed at that position for every
    carrying sample. One tab-separated line is printed to stdout: gene,
    variant, and either the median or one of the status strings
    ``Mutation_not_found``, ``Multiple_genome_pos`` or
    ``No_values_from_samples``.

    Args:
        args: Namespace with ``db``, ``samples``, ``suffix``,
            ``results_dir``, ``gene``, ``variant``, ``vcf_dir``, ``bam_dir``
            and ``bam_extension``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    # NOTE(review): not used below; kept in case the call has side effects.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a
    # list, else get the list by looking in the results directory.
    if args.samples:
        with open(args.samples) as sample_handle:
            samples = [x.rstrip() for x in sample_handle]
    else:
        samples = [
            x.replace(args.suffix, "")
            for x in os.listdir(args.results_dir)
            if x[-len(args.suffix):] == args.suffix
        ]

    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.results_dir, s, args.suffix))) as result_handle:
            data = json.load(result_handle)
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"] == args.gene or var["locus_tag"] == args.gene) and var["change"] == args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    if len(samples_with_mutation) == 0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Mutation_not_found"))
        sys.exit()
    elif len(variant_position_set) > 1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Multiple_genome_pos"))
        sys.exit()

    # Exactly one position remains at this point.
    variant_position = int(list(variant_position_set)[0])
    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")

    # Work on a copy: vars(args) is the namespace's own dict, so adding keys
    # to it used to modify the caller's args object.
    params = dict(vars(args))
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")

    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params, verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params, verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params, verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params, verbose=0):
            row = l.strip().split()
            # "." means no annotation value at this record.
            if row[1] == ".":
                continue
            if int(row[0]) == variant_position:
                read_pos_rank_sums.append((s, float(row[1])))

    if len(read_pos_rank_sums) == 0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, statistics.median([x[1] for x in read_pos_rank_sums])))
    pp.rm_files([params["tmp_vcf"]])
def main(args):
    """Find sites whose state changes independently on several branches.

    Ancestral states are reconstructed with ``iqtree -asr`` unless
    ``<fasta>.asr.state`` already exists. For every site with more than one
    state, the nodes whose state differs from their parent's are collected;
    only A/C/G/T states (either case) are considered. Sites with more than
    one such origin are written to ``args.out`` as
    ``<position>\t<number of origins>``.

    Args:
        args: Namespace with ``vcf``, ``fasta``, ``ref``, ``model``, ``tree``
            and ``out``. ``args.fasta`` is generated from the VCF (and set on
            ``args``) when empty.
    """
    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            sys.exit()
        pp.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt"
            % vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix

    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # NOTE(review): names are split on "/" here but looked up unsplit in the
    # traversal below - confirm node names never contain "/".
    node_names = set([tree.name] + [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    with open(states_file) as states_handle:
        for l in tqdm(states_handle):
            if l[0] == "#":
                continue
            row = l.strip().split()
            if not row:
                # A blank line used to raise IndexError on row[0].
                continue
            if row[0] == "Node":
                continue
            site = int(row[1])
            if row[0] not in internal_node_names:
                continue
            states[site][row[0]] = row[2]

    # Leaf states come from the alignment itself (sites are 1-based).
    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue

        # Set up storage objects
        origins = []
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            # Look the parent state up once instead of twice per node.
            parent_state = n.get_ancestors()[0].state
            if node_state != parent_state and node_state in acgt and parent_state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
def get_variant_data(vcf_file, ref_file, gff_file, protein_file):
    """Collect one consequence record per variant position from a VCF.

    The VCF is piped through ``bcftools csq`` / ``correct_covid_csq.py`` /
    ``bcftools +fill-tags`` and every resulting line is stored under its
    position. Consequences in ``orf1ab`` are re-assigned to the ``nsp`` row
    of the protein table covering that codon, with the codon number made
    relative to that row's ``Start``.

    Args:
        vcf_file: Input VCF/BCF path.
        ref_file: Reference FASTA path (indexed with ``samtools faidx`` here).
        gff_file: GFF annotation passed to ``bcftools csq``.
        protein_file: CSV with the columns ``Gene``, ``Start``, ``End``,
            ``Region``, ``Putative function`` and ``DOI``.

    Returns:
        dict[int, dict]: position -> record with the keys ``pos``, ``alts``,
        ``alt_af``, ``types``, ``changes``, ``gene``, ``gene_function`` and
        ``gene_reference``.
    """
    nsp_data = {}
    gene_info = {}
    # Context manager so the protein table handle is closed deterministically.
    with open(protein_file) as protein_handle:
        for row in csv.DictReader(protein_handle):
            row["Start"] = int(row["Start"])
            row["End"] = int(row["End"])
            gene_info[row["Gene"]] = {
                "function": row["Putative function"],
                "DOI": row["DOI"],
            }
            if row["Region"] != "nsp":
                continue
            # Index every position of the nsp range for direct lookup.
            for i in range(row["Start"], row["End"] + 1):
                nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = {}
    for l in pp.cmd_out(
            "bcftools view %s | bcftools csq -f %s -g %s | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'"
            % (vcf_file, ref_file, gff_file)):
        pos, ref, alts_str, af_str, csq_str = l.strip().split()
        alt_af = sum(float(x) for x in af_str.split(","))
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for csq_entry in csq_str.split(","):
            if csq_entry[0] == "@":
                # Back-reference: reuse the record stored at that position.
                results[pos] = results[int(csq_entry[1:])]
                continue
            if csq_entry == ".":
                results[pos] = {
                    "pos": pos, "alts": alts_str, "alt_af": alt_af,
                    "types": "intergenic", "changes": "NA", "gene": "NA",
                    "gene_function": "NA", "gene_reference": "NA",
                }
                continue

            csq = csq_entry.split("|")
            types.append(csq[0].replace("*", ""))
            if csq[1] == "orf1ab":
                codon_pos = get_codon_num(csq[5])
                if codon_pos in nsp_data:
                    genes.append(nsp_data[codon_pos]["Gene"])
                    # Renumber the codon relative to the nsp start.
                    codon_pos = codon_pos - nsp_data[codon_pos]["Start"] + 1
                    changes.append(change_codon_number(csq[5], codon_pos))
                else:
                    genes.append("orf1ab")
                    changes.append(csq[5])
            else:
                changes.append(csq[5])
                genes.append(csq[1])
            # Collapse repeated identical consequence types to a single one.
            if len(set(types)) == 1:
                types = list(set(types))
            # NOTE(review): nesting reconstructed from a flattened source -
            # the record is refreshed after each gene consequence; confirm.
            results[pos] = {
                "pos": pos, "alts": alts_str, "alt_af": alt_af,
                "types": ",".join(types), "changes": ",".join(changes),
                "gene": genes[0],
                "gene_function": gene_info[genes[0]]["function"],
                "gene_reference": gene_info[genes[0]]["DOI"],
            }
    return results
def create_db(args, extra_files=None):
    """Build a profiler database from the files in the working directory.

    Reads ``variables.json``, ``genome.fasta``, ``genome.gff`` and the
    optional mutation CSVs. Writes ``<prefix>.fasta``, ``.gff``, ``.bed``,
    ``.dr.json``, ``.version.json``, ``.variables.json`` and
    ``.conversion.log``, and optionally installs them into
    ``<sys.base_prefix>/share/<software_name>``.

    Args:
        args: Namespace with ``prefix``, ``match_ref``, ``csv``,
            ``other_annotations``, ``watchlist``,
            ``include_original_mutation``, ``custom``, ``db_date``,
            ``db_name``, ``db_commit``, ``db_author``, ``load``,
            ``software_name`` and optionally ``amplicon_primers``.
        extra_files: Optional mapping of file key -> file name to bundle
            with the database. A ``"barcode"`` entry gets its chromosome
            names converted.
    """
    with open("variables.json") as variables_handle:
        variables = json.load(variables_handle)
    # Read once and reused: variables.json used to be re-opened for every
    # use of this value.
    snpeff_db_name = variables["snpEff_db"]

    genome_file = "%s.fasta" % args.prefix
    gff_file = "%s.gff" % args.prefix
    bed_file = "%s.bed" % args.prefix
    json_file = "%s.dr.json" % args.prefix
    version_file = "%s.version.json" % args.prefix
    if not extra_files:
        extra_files = {}

    if args.match_ref:
        chrom_conversion = match_ref_chrom_names(args.match_ref, "genome.fasta")
        shutil.copyfile(args.match_ref, genome_file)
    else:
        chrom_conversion = match_ref_chrom_names("genome.fasta", "genome.fasta")
        shutil.copyfile("genome.fasta", genome_file)

    # Rewrite the GFF with converted chromosome names.
    with open(gff_file, "w") as O, open("genome.gff") as gff_in:
        for l in gff_in:
            if l.strip() == "":
                continue
            if l[0] == "#":
                O.write(l)
            else:
                row = l.strip().split()
                if row[0] in chrom_conversion:
                    row[0] = chrom_conversion[row[0]]
                O.write("\t".join(row) + "\n")

    genes = load_gff(gff_file)
    # Accept both gene names and locus tags as keys.
    gene_name2gene_id = {g.name: g.locus_tag for g in genes.values()}
    gene_name2gene_id.update(
        {g.locus_tag: g.locus_tag for g in genes.values()})
    db = {}
    locus_tag_to_drug_dict = defaultdict(set)

    with open(args.prefix + ".conversion.log", "w") as L:
        if args.csv:
            mutation_lookup = get_snpeff_formated_mutation_list(
                args.csv, "genome.fasta", "genome.gff", snpeff_db_name)
            with open(args.csv) as csv_handle:
                for row in csv.DictReader(csv_handle):
                    locus_tag = gene_name2gene_id[row["Gene"]]
                    drug = row["Drug"].lower()
                    mut = mutation_lookup[(row["Gene"], row["Mutation"])]
                    if args.include_original_mutation:
                        row["original_mutation"] = row["Mutation"]
                    if mut != row["Mutation"]:
                        L.write(
                            f"Converted {row['Gene']} {row['Mutation']} to {mut}\n"
                        )
                    locus_tag_to_drug_dict[locus_tag].add(drug)
                    if locus_tag not in db:
                        db[locus_tag] = {}
                    if mut not in db[locus_tag]:
                        db[locus_tag][mut] = {"annotations": []}

                    # Every non-empty extra column becomes an annotation.
                    tmp_annotation = {"type": "drug", "drug": row["Drug"]}
                    annotation_columns = set(row.keys()) - set(
                        ["Gene", "Mutation", "Drug"])
                    for col in annotation_columns:
                        if row[col] == "":
                            continue
                        tmp_annotation[col.lower()] = row[col]
                    db[locus_tag][mut]["annotations"].append(tmp_annotation)
                    db[locus_tag][mut]["genome_positions"] = get_genome_position(
                        genes[locus_tag], mut)
                    db[locus_tag][mut]["chromosome"] = genes[locus_tag].chrom

        if args.other_annotations:
            mutation_lookup = get_snpeff_formated_mutation_list(
                args.other_annotations, "genome.fasta", "genome.gff",
                snpeff_db_name)
            with open(args.other_annotations) as csv_handle:
                for row in csv.DictReader(csv_handle):
                    locus_tag = gene_name2gene_id[row["Gene"]]
                    mut = mutation_lookup[(row["Gene"], row["Mutation"])]
                    if mut != row["Mutation"]:
                        L.write(
                            f"Converted {row['Gene']} {row['Mutation']} to {mut}\n"
                        )
                    if locus_tag not in db:
                        db[locus_tag] = {}
                    if mut not in db[locus_tag]:
                        db[locus_tag][mut] = {"annotations": []}
                    tmp_annotation = {"type": row["Type"]}
                    if args.include_original_mutation:
                        tmp_annotation["original_mutation"] = row["Mutation"]
                    # "Info" holds ";"-separated key=value pairs.
                    for x in row["Info"].split(";"):
                        key, val = x.split("=")
                        tmp_annotation[key.lower()] = val
                        if key == "drug":
                            locus_tag_to_drug_dict[locus_tag].add(val)
                    db[locus_tag][mut]["annotations"].append(tmp_annotation)
                    db[locus_tag][mut]["genome_positions"] = get_genome_position(
                        genes[locus_tag], mut)
                    db[locus_tag][mut]["chromosome"] = genes[locus_tag].chrom

        if args.watchlist:
            with open(args.watchlist) as csv_handle:
                for row in csv.DictReader(csv_handle):
                    locus_tag = gene_name2gene_id[row["Gene"]]
                    for d in row["Drug"].split(","):
                        drug = d.lower()
                        locus_tag_to_drug_dict[locus_tag].add(drug)

    version = {"name": args.prefix}
    if not args.custom:
        # Take commit/Author/Date from the latest git log entry.
        for l in cmd_out("git log | head -4"):
            row = l.strip().split()
            if row == []:
                continue
            version[row[0].replace(":", "")] = " ".join(row[1:])
        version["commit"] = version["commit"][:7]
    else:
        version["Date"] = str(
            datetime.now()) if not args.db_date else args.db_date
        version["name"] = args.db_name if args.db_name else "NA"
        # The commit used to be gated on args.db_name (copy-paste slip).
        version["commit"] = args.db_commit if args.db_commit else "NA"
        version["Author"] = args.db_author if args.db_author else "NA"

    with open(version_file, "w") as version_handle:
        json.dump(version, version_handle)
    with open(json_file, "w") as db_handle:
        json.dump(db, db_handle)

    if "barcode" in extra_files:
        barcode_file = f"{args.prefix}.{extra_files['barcode']}"
        with open(barcode_file, "w") as O, open("barcode.bed") as barcode_in:
            for l in barcode_in:
                if l[0] == "#":
                    continue
                row = l.strip().split("\t")
                row[0] = chrom_conversion[row[0]]
                O.write("\t".join(row) + "\n")

    if "amplicon_primers" in vars(args) and args.amplicon_primers:
        write_amplicon_bed(genome_file, genes, db, args.amplicon_primers,
                           bed_file)
        variables['amplicon'] = True
    else:
        ref_fasta_dict = fa2dict(genome_file)
        write_bed(db, locus_tag_to_drug_dict, genes, ref_fasta_dict, bed_file)
        variables['amplicon'] = False

    for extra_key, extra_name in extra_files.items():
        if extra_key == "barcode":
            # Already written above with converted chromosome names; a raw
            # copy here would overwrite that conversion.
            continue
        shutil.copyfile(extra_name, f"{args.prefix}.{extra_name}")

    if list(chrom_conversion.keys()) != list(chrom_conversion.values()):
        variables["chromosome_conversion"] = {
            "target": list(chrom_conversion.keys()),
            "source": list(chrom_conversion.values())
        }

    variables_file = args.prefix + ".variables.json"
    variables["files"] = {
        "ref": genome_file,
        "gff": gff_file,
        "bed": bed_file,
        "version": version_file,
        "json_db": json_file,
        "variables": variables_file
    }
    for key, val in extra_files.items():
        variables["files"][key] = f"{args.prefix}.{val}"
    with open(variables_file, "w") as variables_out:
        json.dump(variables, variables_out)

    if os.path.isfile("snpEffectPredictor.bin"):
        load_snpEff_db("snpEffectPredictor.bin", snpeff_db_name)

    if args.load:
        load_dir = f"{sys.base_prefix}/share/{args.software_name}"
        if not os.path.isdir(load_dir):
            os.mkdir(load_dir)
        for key, val in variables['files'].items():
            target = f"{load_dir}/{val}"
            infolog(f"Copying file: {val} ---> {target}")
            shutil.copyfile(val, target)
            if key == "ref":
                # The installed reference needs its indexes and dictionary.
                pp.run_cmd(f"bwa index {target}")
                pp.run_cmd(f"samtools faidx {target}")
                tmp = target.replace(".fasta", "")
                pp.run_cmd(f"samtools dict {target} -o {tmp}.dict")
        successlog("Sucessfully imported library")
def main_profile(args):
    """Profile one sample against a TB-Profiler library and write reports.

    Steps, in order: load the library configuration, create the output
    folders, optionally trim and map reads to obtain a BAM file, run
    ``pp.bam_profiler``, reformat the results with ``tbp.reformat`` and write
    the JSON report plus the optional PDF/text/CSV reports. Intermediate
    BAM/VCF/BCF files are then moved into sub-folders of ``args.dir``.

    Args:
        args: Parsed command-line namespace. Attributes read here include
            ``db``, ``external_db``, ``dir``, ``prefix``, ``platform``,
            ``bam``, ``read1``, ``read2``, ``no_trim``, ``no_delly``,
            ``threads``, ``mapper``, ``caller``, ``pdf``, ``txt``, ``csv``,
            ``add_columns``, ``meta`` and the thresholds forwarded to
            ``pp.bam_profiler``. For ``platform == "nanopore"`` the
            ``mapper``, ``caller`` and ``no_trim`` attributes are overwritten.

    Raises:
        SystemExit: If neither a BAM file nor at least one read file is given.
    """
    ### Setup conf dictionary ###
    if args.db == "tbdb" and not args.external_db and pp.nofile(
            sys.base_prefix + "/share/tbprofiler/tbdb.fasta"):
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix,
            ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        run_delly = not args.no_delly

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        else:
            sys.exit(
                "\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    # NOTE(review): the original computed a `run_coverage` flag from
    # args.no_coverage here but never passed it anywhere, so that option has
    # no effect inside this function - confirm whether it should be forwarded.

    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf,
        bam_file=bam_file,
        prefix=files_prefix,
        platform=args.platform,
        caller=args.caller,
        threads=args.threads,
        no_flagstat=args.no_flagstat,
        run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    # Raw profiler output is dumped relative to the current directory (no
    # args.dir component), exactly as before.
    with open(args.prefix + ".tmp_results.json", "w") as tmp_fh:
        json.dump(results, tmp_fh)

    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller
    }

    json_output = args.dir + "/results/" + args.prefix + ".results.json"
    tex_output = args.dir + "/results/" + args.prefix + ".results.tex"
    text_output = args.dir + "/results/" + args.prefix + ".results.txt"
    csv_output = args.dir + "/results/" + args.prefix + ".results.csv"

    with open(json_output, "w") as json_fh:
        json.dump(results, json_fh)
    extra_columns = [x.lower() for x in args.add_columns.split(",")
                     ] if args.add_columns else []
    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output, args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log"
        ])
    if args.txt:
        tbp.write_text(results,
                       conf,
                       text_output,
                       extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            # Trimmed reads are intermediate files; remove them once mapped.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" %
                   vars(args))

    ### Add meta data to results
    # NOTE(review): this runs after every report file has been written, so
    # the "meta_*" keys only exist on the in-memory dict - confirm whether
    # they were meant to be included in the reports.
    if args.meta:
        with open(args.meta) as meta_fh:
            for row in csv.DictReader(meta_fh):
                if row["id"] == results["id"]:
                    for col in row:
                        results["meta_" + col] = row[col]
    pp.log("Profiling finished sucessfully!")
def main_profile(args):
    """Profile one SARS-CoV-2 sample from an assembly or from reads.

    With ``args.fasta`` the assembly is compared against the reference via
    ``get_ref_variants``; otherwise reads are (optionally) trimmed, mapped,
    variant-called and a consensus FASTA is written. In both cases the
    barcode BED from the library is used to assign a clade, variants are
    annotated with ``cp.get_variant_data`` and the result is written to
    ``<dir>/<prefix>.results.json``. A ``prefix<TAB>clade`` line is printed
    to stdout.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``dir``,
            ``db``, ``prefix``, ``fasta``, ``read1``, ``read2``, ``no_trim``,
            ``threads``, ``mapper``, ``platform`` and ``caller``.

    Raises:
        SystemExit: With status 1 when both ``--fasta`` and reads are given,
            or when neither is given. (The original used ``quit()``, which
            exits with status 0 and is only defined when the ``site`` module
            is loaded.)
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            sys.exit(1)
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # Trimmed reads are intermediate files; remove them once mapped.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))

    # NOTE(review): the parsed reference is never used below. The call is
    # kept because pp.fasta is not visible from here and may validate the
    # file - confirm and drop if it has no side effects.
    refseq = pp.fasta(conf["ref"]).fa_dict

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    with open("%s.results.json" % files_prefix, "w") as json_fh:
        json.dump(results, json_fh)
def main(args):
    """Call variants in target regions of a BAM file and write them as JSON.

    The configuration (``ref``, ``gff``, ``bed``, ``ann``) comes from the
    individual command-line options unless ``args.conf`` names a JSON file,
    which then replaces it entirely. Small variants are called and annotated,
    their ``change`` field is rewritten by ``pp.reformat_mutations`` (the
    untouched value is kept under ``_internal_change``), large deletions from
    delly are appended unless ``args.no_delly`` is set, and the list is
    written to ``<out_dir>/<prefix>.pp-results.json``. Intermediate
    ``<prefix>.targets.*`` files are removed at the end.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``out_dir``,
            ``ref``, ``gff``, ``bed``, ``ann``, ``conf``, ``bam``, ``prefix``,
            ``platform``, ``call_method``, ``threads``, ``min_depth``, ``af``,
            ``caller`` and ``no_delly``.
    """
    if pp.nofolder(args.out_dir):
        pp.run_cmd("mkdir %s" % args.out_dir)
    conf = {
        "ref": args.ref,
        "gff": args.gff,
        "bed": args.bed,
        "ann": args.ann,
    }
    if args.conf:
        with open(args.conf) as conf_fh:
            conf = json.load(conf_fh)
    for x in ["ref", "gff", "bed", "ann"]:
        # .get() so a key absent from the JSON conf is reported the same way
        # as an explicit null instead of raising KeyError here.
        if conf.get(x) is None:
            pp.log("%s variable is not defined" % x, True)

    bam_obj = pp.bam(args.bam, args.prefix, conf["ref"], platform=args.platform)
    bcf_obj = bam_obj.call_variants(
        prefix=args.prefix + ".targets",
        call_method=args.call_method,
        gff_file=conf["gff"],
        bed_file=conf["bed"],
        mixed_as_missing=args.platform != "Illumina",
        threads=args.threads,
        min_dp=args.min_depth,
        af=args.af,
        caller=args.caller)
    csq = bcf_obj.load_csq(ann_file=conf["ann"])

    variants = []
    # Lookup built from whitespace-separated columns 2 and 4 of the
    # annotation file (both parsed as integers).
    chr2gene_pos = {}
    with open(conf["ann"]) as ann_fh:
        for l in ann_fh:
            row = l.rstrip().split()
            chr2gene_pos[int(row[1])] = int(row[3])

    # Only the first entry of the csq mapping is reported.
    for var in list(csq.values())[0]:
        var["_internal_change"] = var["change"]
        var["change"] = pp.reformat_mutations(var["change"], var["type"],
                                              var["gene_id"], chr2gene_pos)
        variants.append(var)

    if not args.no_delly:
        delly_bcf = bam_obj.run_delly()
        deletions = delly_bcf.overlap_bed(conf["bed"])
        for deletion in deletions:
            # The original passed var["type"] / var["gene_id"], i.e. whatever
            # small variant the loop above visited last (and a NameError when
            # there were none). Use the deletion's own type and region, which
            # are the values stored in the record built just below.
            tmp_change = pp.reformat_mutations(
                "%(chr)s_%(start)s_%(end)s" % deletion, "large_deletion",
                deletion["region"], chr2gene_pos)
            tmp = {
                "genome_pos": deletion["start"],
                "gene_id": deletion["region"],
                "chr": deletion["chr"],
                "freq": 1,
                "type": "large_deletion",
                "change": tmp_change
            }
            variants.append(tmp)

    with open("%s/%s.pp-results.json" % (args.out_dir, args.prefix),
              "w") as out_fh:
        json.dump(variants, out_fh)

    for x in [
            ".targets.bcf", ".targets.csq.bcf", ".targets.csq.bcf.csi",
            ".targets.delly.bcf", ".targets.delly.bcf.csi",
            ".targets.del_pos.bed", ".targets.gvcf.gz",
            ".targets.gvcf.gz.csi", ".targets.missing.bcf"
    ]:
        # delly outputs do not exist when delly was skipped.
        if args.no_delly and "delly" in x:
            continue
        pp.run_cmd("rm %s%s" % (args.prefix, x))
def run_primer_conservation(primerF, primerR, probe, uniq_id, save_dir):
    """Run ``primer_analysis.py`` for one primer/probe set.

    The forward primer, reverse primer and probe are passed as ``--fp``,
    ``--rp`` and ``--probe``; ``save_dir`` is passed as ``--dir`` and
    ``uniq_id`` as ``--out``. ``--write-json`` is always set.

    Returns:
        ``True`` once ``pp.run_cmd`` has returned.
    """
    cli_values = (primerF, primerR, probe, save_dir, uniq_id)
    command = ("primer_analysis.py --fp %s --rp %s --probe %s "
               "--dir %s --out %s --write-json") % cli_values
    pp.run_cmd(command)
    return True
def main(args):
    """Align sequences to the reference, call variants and build a tree.

    Steps, in order:

    1. Load the covid-profiler library configuration for ``args.db`` and
       merge its entries into ``args``.
    2. Align ``args.fasta`` against the reference with ``mafft``.
    3. Write a masked alignment: two hard-coded terminal ranges
       (1-265 and 29675-29903), any downloaded problematic sites (when
       ``args.mask_troublesome_sites`` is set) and every non-ACGT character
       become ``N``.
    4. Derive a VCF with ``snp-sites``/``bcftools``, annotate it with
       ``covid_profiler.get_variant_data`` and write the annotation as CSV.
    5. Run ``iqtree`` on the masked alignment and print the output paths to
       stderr.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``db``,
            ``working_dir``, ``out``, ``fasta`` and
            ``mask_troublesome_sites``. The attributes ``uuid``,
            ``final_aln``, ``final_vcf`` and ``final_csv`` are set on it, as
            is every key of the library configuration.
    """
    args.uuid = str(uuid4())
    conf = covid_profiler.get_conf_dict(sys.base_prefix +
                                        "/share/covidprofiler/%s" % args.db)
    vars(args).update(conf)

    args.final_aln = "%s/%s.aln" % (args.working_dir, args.out)
    args.final_vcf = "%s/%s.vcf.gz" % (args.working_dir, args.out)
    args.final_csv = "%s/%s.variant_info.csv" % (args.working_dir, args.out)

    pp.run_cmd(
        "mafft --auto --thread -1 --keeplength --addfragments %(fasta)s %(ref)s > %(working_dir)s/%(uuid)s.aln"
        % vars(args))

    troublesome_sites = set()
    if args.mask_troublesome_sites:
        from urllib.request import urlopen
        with urlopen(
                'https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf'
        ) as response:
            for l in response:
                row = l.decode().strip().split()
                # Skip blank lines (which would otherwise raise IndexError)
                # and VCF header lines.
                if not row or row[0][0] == "#":
                    continue
                troublesome_sites.add(int(row[1]))

    # Built once instead of once per sequence.
    acgt = {"A", "C", "G", "T"}
    with open(args.final_aln, "w") as O:
        for entry in tqdm(
                pyfastx.Fasta("%(working_dir)s/%(uuid)s.aln" % vars(args),
                              full_name=True)):
            masked_seq = list(entry.seq.upper())
            # Ranges are 1-based inclusive; convert to 0-based indices.
            for start, end in [(1, 265), (29675, 29903)]:
                for i in range(start - 1, end):
                    masked_seq[i] = "N"
            for pos in troublesome_sites:
                masked_seq[pos - 1] = "N"
            masked_seq = [n if n in acgt else "N" for n in masked_seq]
            O.write(">%s\n%s\n" % (entry.name, "".join(masked_seq)))

    pp.run_cmd(
        "snp-sites -v %(final_aln)s | covid_profiler_vcf_fix_ref.py --ref %(ref)s | covid_profiler_vcf_mask_non_acgt.py | tqdm | bcftools view -a -Oz -o %(uuid)s.bed_masked.vcf.gz"
        % vars(args))
    pp.run_cmd(
        "bcftools stats -s - %(uuid)s.bed_masked.vcf.gz > %(uuid)s.bed_masked.vcf.gz.stats"
        % vars(args))
    pp.run_cmd(
        "bcftools norm -m - %(uuid)s.bed_masked.vcf.gz -Oz -o %(final_vcf)s"
        % vars(args))

    variant_data = covid_profiler.get_variant_data(args.final_vcf, conf["ref"],
                                                   conf["gff"],
                                                   conf["proteins"])
    # newline="" is what the csv module requires of files handed to a writer.
    with open(args.final_csv, "w", newline="") as O:
        if variant_data:
            # Column order follows the keys of the first record.
            fieldnames = list(variant_data[0].keys())
            writer = csv.DictWriter(O, fieldnames)
            writer.writeheader()
            writer.writerows(variant_data)
        else:
            # The original raised IndexError here; leave an empty file and
            # carry on so the tree is still built.
            sys.stderr.write(
                "Warning: no variants found, variant summary csv is empty\n")

    pp.run_cmd("iqtree -s %(final_aln)s -m GTR+F+G4 -nt 1" % vars(args))

    sys.stderr.write("\n\n----------------\n")
    sys.stderr.write("Program complete\n")
    sys.stderr.write("----------------\n")
    sys.stderr.write("Alignment: %s\n" % args.final_aln)
    sys.stderr.write("VCF: %s\n" % args.final_vcf)
    sys.stderr.write("Variant summary csv: %s\n" % args.final_csv)