import csv
import json
import os
import sys
from collections import defaultdict
from uuid import uuid4

from tqdm import tqdm

# Assumption: `fm` is the repo's shared utility module (fastq2matrix-style),
# providing filecheck(), foldercheck(), nofile(), nofolder(), run_cmd(),
# cmd_out(), get_random_file(), rm_files(), fasta() and vcf_class(); the bare
# helper names used below are assumed to come from the same module. Each of
# the following main() functions belongs to a separate command-line script.
import fastq2matrix as fm
from fastq2matrix import run_cmd, cmd_out, nofile, filecheck, foldercheck


def main(args):
    # Resolve the input BAMs either from an underscore-joined run prefix or
    # from an explicit comma-separated list
    if args.prefix:
        individual_bams = ["%s/%s%s" % (args.dir, run, args.suffix) for run in args.prefix.split("_")]
        new_id = args.new_id if args.new_id else args.prefix
    elif args.bams:
        individual_bams = args.bams.split(",")
        new_id = args.new_id if args.new_id else "_".join(
            [bam.split("/")[-1].replace(args.suffix, "") for bam in individual_bams])
    else:
        sys.stderr.write("Need either '--bams' or '--prefix'... Exiting!\n")
        sys.exit(1)
    if len(individual_bams) == 1:
        sys.stderr.write("Need more than one bam... Exiting!\n")
        sys.exit(1)
    for bam in individual_bams:
        fm.filecheck(bam)

    new_bamfile = "%s/%s%s" % (args.dir, new_id, args.suffix)

    # Build a new header from the first BAM, pointing the read-group ID and
    # sample fields of @RG lines at the merged ID
    tmp_file = fm.get_random_file()
    with open(tmp_file, "w") as O:
        for l in fm.cmd_out("samtools view -H %s" % individual_bams[0]):
            row = l.strip().split("\t")
            if row[0] == "@RG":
                row[1] = "ID:%s" % new_id
                row[2] = "SM:%s" % new_id
            O.write("%s\n" % "\t".join(row))

    # Merge, reheader and set a single read group in one pipeline, then index
    fm.run_cmd(
        "samtools merge -@ %s - %s | samtools reheader %s - | "
        "samtools addreplacerg -@ %s - -r 'ID:%s\\tSM:%s\\tPL:Illumina' -o %s" % (
            args.threads, " ".join(individual_bams), tmp_file,
            args.threads, new_id, new_id, new_bamfile))
    fm.run_cmd("samtools index %s" % new_bamfile)
    fm.rm_files([tmp_file])
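
# For context, a minimal sketch of the CLI wiring this main() expects. The
# flag names mirror the attributes used above; the defaults and help strings
# are illustrative assumptions, not the script's actual values.
import argparse

def cli():
    parser = argparse.ArgumentParser(description="Merge per-run BAMs into a single sample-level BAM")
    parser.add_argument("--prefix", help="Underscore-joined run IDs, e.g. 'run1_run2'")
    parser.add_argument("--bams", help="Comma-separated BAM paths (alternative to --prefix)")
    parser.add_argument("--new-id", help="ID/SM value for the merged read group")
    parser.add_argument("--dir", default=".", help="Directory holding the per-run BAMs")
    parser.add_argument("--suffix", default=".bam", help="Filename suffix of the BAMs")
    parser.add_argument("--threads", type=int, default=4, help="Threads passed to samtools")
    main(parser.parse_args())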


def main(args):
    fm.filecheck(args.query)
    fm.filecheck(args.subject)

    # Anchor the query gene by its first and last `anchor_size` bases
    ref_gene_seq = list(fm.fasta(args.query).fa_dict.values())[0]
    start_anchor = ref_gene_seq[:args.anchor_size]
    end_anchor = ref_gene_seq[-args.anchor_size:]

    tmp_in = fm.get_random_file()
    tmp_out = fm.get_random_file()

    # BLAST each anchor against the subject, keeping hits that cover at least
    # 90% of the anchor length
    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % start_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" % (tmp_in, args.subject, tmp_out), verbose=0)
    start_hits = parse_blast(tmp_out, args.anchor_size * 0.9)

    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % end_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" % (tmp_in, args.subject, tmp_out), verbose=0)
    end_hits = parse_blast(tmp_out, args.anchor_size * 0.9)

    fm.rm_files([tmp_in, tmp_out])

    # Classify the result: both anchors must hit the same subject sequence
    if args.strict_one_hit and (len(start_hits) > 1 or len(end_hits) > 1):
        result_type = "NA"
    elif start_hits[0]["subject_seq"] == end_hits[0]["subject_seq"]:
        result_type = "OK"
        start_hit = start_hits[0]
        end_hit = end_hits[0]
    else:
        result_type = "Fragmented"

    with open("%s.result.txt" % args.prefix, "w") as O:
        O.write("%s\t%s\n" % (args.prefix, result_type))
    if result_type != "OK":
        sys.exit(0)

    # Extract the subject sequence spanned by the two anchor hits;
    # mixed-strand hit pairs are not handled
    subject_seqs = fm.fasta(args.subject).fa_dict
    if start_hit["subject_strand"] == "Plus" and end_hit["subject_strand"] == "Plus":
        hit_seq = subject_seqs[start_hit["subject_seq"]][start_hit["subject_start"] - 1:end_hit["subject_end"]]
    elif start_hit["subject_strand"] == "Minus" and end_hit["subject_strand"] == "Minus":
        hit_seq = revcom(subject_seqs[start_hit["subject_seq"]][end_hit["subject_end"] - 1:start_hit["subject_start"]])

    with open("%s.extracted_seq.fa" % args.prefix, "w") as O:
        O.write(">%s\n%s\n" % (args.prefix, hit_seq))
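
# parse_blast() and revcom() are used above but defined elsewhere in the repo.
# Below is a plausible sketch only, assuming blastn's single-file JSON output
# (-outfmt 15) and inferring the hit keys from how the caller indexes them;
# the repo's real implementation may differ.
import json

def parse_blast(json_file, min_align_len):
    hits = []
    report = json.load(open(json_file))["BlastOutput2"][0]["report"]
    for hit in report["results"]["search"].get("hits", []):
        for hsp in hit["hsps"]:
            # Keep only alignments covering enough of the anchor
            if hsp["align_len"] < min_align_len:
                continue
            hits.append({
                "subject_seq": hit["description"][0]["id"],
                "subject_start": hsp["hit_from"],
                "subject_end": hsp["hit_to"],
                "subject_strand": hsp["hit_strand"],  # "Plus" or "Minus"
            })
    return hits

def revcom(seq):
    # Reverse complement, tolerating lower case and N
    return seq.translate(str.maketrans("ACGTacgtNn", "TGCAtgcaNn"))[::-1]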


def main_import(args):
    FAILED_SAMPLES = open("%s.failed_samples.log" % args.prefix, "w")
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"

    with open(params["map_file"], "w") as O:
        # Set up a list to hold sample names
        samples = []
        # Loop through the sample file and (1) append samples to the list,
        # (2) write each sample to the map file and (3) check the VCF index
        for line in open(args.sample_file):
            sample = line.rstrip()
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if os.path.isfile(vcf_file):
                sys.stderr.write("...OK\n")
            else:
                sys.stderr.write("...Not found...skipping\n")
                continue
            if args.ignore_missing and nofile(vcf_file):
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue
            samples.append(sample)
            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")
    FAILED_SAMPLES.close()

    # Create the .dict (GATK) index for the reference if it is missing
    if nofile("%s.dict" % args.ref.replace(".fasta", "").replace(".fa", "")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Create the .fai (samtools) index for the reference if it is missing
    if nofile(f"{args.ref}.fai"):
        run_cmd("samtools faidx %(ref)s" % params)

    # Prints "region chunk_name" pairs, one line per genome chunk
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params

    if nofile("%(prefix)s.dbconf.json" % params):
        # First run: create one GenomicsDB workspace per chunk and record the
        # chunk count so later runs reuse the same windows
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        json.dump({"num_genome_chunks": args.num_genome_chunks}, open("%(prefix)s.dbconf.json" % params, "w"))
    else:
        # Subsequent runs: regenerate the windows with the chunk count used at
        # import time, check each workspace exists, then update it in place
        conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
        params["num_genome_chunks"] = conf["num_genome_chunks"]
        window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
            sys.stderr.write("Looking for directory named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
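
# For reference, the two intermediate inputs this function builds (values are
# illustrative). The windows command prints one "region chunk_name" line per
# chunk; parallel splits on the space, so {1} feeds -L and {2} names the
# per-chunk GenomicsDB workspace:
#
#   Chromosome:1-2000000 Chromosome_1_2000000
#   Chromosome:2000001-4000000 Chromosome_2000001_4000000
#
# The sample map consumed by --sample-name-map is simply tab-separated:
#
#   sampleA<TAB>/path/to/vcfs/sampleA.g.vcf.gz
#   sampleB<TAB>/path/to/vcfs/sampleB.g.vcf.gz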


def main(args):
    # Build a parent -> children map from the taxonomy dump (nodes.dmp-style:
    # column 1 is the taxid, column 3 the parent taxid)
    nodes = defaultdict(set)
    sys.stderr.write("Loading taxonomy\n")
    for l in tqdm(open(fm.filecheck(args.tax_dump))):
        row = l.strip().split()
        nodes[row[2]].add(row[0])

    def flatten(d):
        v = [[i] if not isinstance(i, list) else flatten(i) for i in d]
        return [i for b in v for i in b]

    def get_tax(t):
        # Recursively collect a taxid and all of its descendants
        if len(nodes[t]) == 0:
            return [t]
        return [t] + flatten([get_tax(sub_t) for sub_t in nodes[t]])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    with open(args.tmp_file, "w") as O:
        if args.exclude:
            tax_tree = set(flatten([get_tax(x) for x in args.exclude.split(",")]))
            for l in tqdm(open(fm.filecheck(args.kraken2_output))):
                row = l.strip().split()
                if row[2] not in tax_tree:
                    O.write("%s\n" % row[1])
        else:
            tax_tree = set(flatten([get_tax(x) for x in args.extract.split(",")]))
            for l in tqdm(open(fm.filecheck(args.kraken2_output))):
                row = l.strip().split()
                if row[2] in tax_tree:
                    O.write("%s\n" % row[1])

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" % vars(args))
    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" % vars(args))
    fm.rm_files([args.tmp_file])
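
# A worked example of the subtree expansion above (taxids illustrative):
#
#   nodes = {"1224": {"1236"}, "1236": {"72274"}}
#   get_tax("1224")  # -> ["1224", "1236", "72274"]
#
# so membership tests against the kraken2 taxid column (row[2]) cover the
# whole clade rooted at each taxid given on the command line, not just the
# taxid itself.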


def main_genotype(args):
    # Reuse the chunk count recorded at import time so the windows match the
    # existing GenomicsDB workspaces
    conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
    params = vars(args)
    params["num_genome_chunks"] = conf["num_genome_chunks"]
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    params["window_cmd"] = window_cmd

    # Check that a workspace folder exists for every chunk
    for l in cmd_out(window_cmd):
        row = l.strip().split()
        dirname = "%s_%s_genomics_db" % (args.prefix, row[1])
        sys.stderr.write("Looking for directory named %s..." % dirname)
        foldercheck(dirname)
        sys.stderr.write("OK\n")

    # Genotype each chunk in parallel, concatenate the per-chunk VCFs, then
    # remove them
    genotype_cmd = "gatk --java-options \"-Xmx40g\" GenotypeGVCFs -R %(ref)s -V gendb://%(prefix)s_{2}_genomics_db -O %(prefix)s.{2}.genotyped.vcf.gz" % params
    run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {genotype_cmd}", verbose=2)
    run_cmd("bcftools concat -Oz -o %(prefix)s.%(subfix_vcf)s.genotyped.vcf.gz `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz\"}'`" % params)
    run_cmd("rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz*\"}'`" % params)
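
# The dbconf.json read above is written by main_import() and pins the
# chunking used when the workspaces were created; its entire content looks
# like (value illustrative):
#
#   {"num_genome_chunks": 20}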


def main(args):
    check_programs(["taxonkit", "seqtk"])

    # taxonkit expects the NCBI taxdump files under ~/.taxonkit
    if not os.path.isdir("%s/.taxonkit/" % os.path.expanduser("~")) or not os.path.isfile("%s/.taxonkit/nodes.dmp" % os.path.expanduser("~")):
        download_files()

    # Expand the given taxid(s) into the full set of descendant taxids
    nodes = set()
    sys.stderr.write("Loading taxonomy\n")
    cmd = "taxonkit list --ids %s" % (args.extract if args.extract else args.exclude)
    for l in fm.cmd_out(cmd):
        if l == "":
            continue
        row = l.strip().split()
        nodes.add(row[0])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    total_reads = 0
    kept_reads = 0
    with open(args.tmp_file, "w") as O:
        if args.exclude:
            for l in tqdm(open(fm.filecheck(args.kraken2_output))):
                total_reads += 1
                row = l.strip().split()
                if row[2] not in nodes:
                    O.write("%s\n" % row[1])
                    kept_reads += 1
        else:
            for l in tqdm(open(fm.filecheck(args.kraken2_output))):
                total_reads += 1
                row = l.strip().split()
                if row[2] in nodes:
                    O.write("%s\n" % row[1])
                    kept_reads += 1

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" % vars(args))
    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" % vars(args))
    fm.rm_files([args.tmp_file])
    sys.stderr.write("\nKept %s/%s reads\n" % (kept_reads, total_reads))
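
# For reference, `taxonkit list --ids <taxid>` prints the taxid and every
# descendant one per line, with children indented (illustrative output):
#
#   $ taxonkit list --ids 1763
#   1763
#     1773
#     1764
#
# which is why .split()[0] on each non-empty line yields a bare taxid.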


def main(args):
    samples = [x.rstrip() for x in open(args.samples)]
    for s in samples:
        fm.filecheck("%s/per_sample/%s%s" % (args.dir, s, args.alignment_extension))
    # Create the kraken output folder if it is missing
    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        fm.run_cmd("mkdir -p %(dir)s/kraken" % vars(args))

    # Write one command per missing result (median depth, kraken2) and run
    # them all in parallel
    args.cmd_file = fm.get_random_file()
    with open(args.cmd_file, "w") as O:
        for s in samples:
            args.sample = s
            if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" % vars(args)):
                O.write(
                    "printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n"
                    % (s, args.dir, s, args.alignment_extension, args.dir, s))
            if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                O.write(
                    "kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n"
                    % vars(args))
    fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" % vars(args), verbose=2)
    fm.rm_files([args.cmd_file])

    # Collect per-sample metrics: mapping stats, top kraken2 hits and
    # TBProfiler calls
    sample_metrics = []
    for s in samples:
        res = {"sample": s}
        args.sample = s
        # bamstats layout assumptions are spelled out in the note after this
        # function
        for i, l in enumerate(open("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" % vars(args))):
            row = l.rstrip().split()
            if i in [2, 3]:
                res[row[3]] = int(row[0])
            elif i == 4:
                res[row[3]] = int(row[0])
                res["mapped_percent"] = float(row[4].replace("(", "").replace("%", ""))

        # Top hit per taxonomic rank (G, G1, S) from the kraken2 report
        kraken_results = {}
        for l in open("%(dir)s/kraken/%(sample)s.report.txt" % vars(args)):
            row = l.strip().split()
            if row[3] not in kraken_results:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
            if float(row[0]) > kraken_results[row[3]][0]:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
        res["kraken_genus"] = "%s (%.2f)" % (kraken_results["G"][1], kraken_results["G"][0])
        res["kraken_genus1"] = "%s (%.2f)" % (kraken_results["G1"][1], kraken_results["G1"][0])
        res["kraken_species"] = "%s (%.2f)" % (kraken_results["S"][1], kraken_results["S"][0])

        tbprofiler_result = json.load(open("%(dir)s/tbprofiler/results/%(sample)s.results.json" % vars(args)))
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append("%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        for d in drugs:  # `drugs` is a module-level list of drug names defined elsewhere
            res[d] = ", ".join(tmp_drugs[d])
        sample_metrics.append(res)

    with open(args.out + ".sample_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_metrics[0]))
        writer.writeheader()
        writer.writerows(sample_metrics)

    # Cohort VCF stats and per-variant annotation tables
    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd("bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt" % vars(args))
    vcf_stats = vcf.load_stats()
    results = {
        "number of samples": vcf_stats["number of samples"],
        "number of records": vcf_stats["number of records"],
        "number of SNPs": vcf_stats["number of SNPs"],
        "number of indels": vcf_stats["number of indels"],
    }

    if fm.nofile(args.vcf + ".csq_info.txt"):
        # SNP consequences first, then indels appended to the same file
        fm.run_cmd("bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt" % vars(args))
        fm.run_cmd("bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt" % vars(args))

    variant_info = vcf.get_variant_data(args.ref, args.gff)
    with open(args.out + ".variant_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_info[0]))
        writer.writeheader()
        writer.writerows(variant_info)
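
# The .bqsr.bamstats parsing above assumes the classic `samtools flagstat`
# layout, where (0-indexed) lines 2-4 look like (counts illustrative):
#
#   0 + 0 supplementary
#   0 + 0 duplicates
#   98 + 0 mapped (98.00% : N/A)
#
# row[0] is then the count, row[3] the metric name, and row[4] the bracketed
# percentage that mapped_percent strips down to a float. Newer samtools
# releases insert extra "primary" lines, which would shift these indexes.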