def process(args):
    """Write ``<prefix>.run.sh`` for every sample in the CSV and optionally run it.

    For each row of ``args.csv`` (columns ``ID``, ``R1``, ``R2``) the script
    gets an ``illumina_pipeline.py`` line and a ``tb-profiler profile`` line.
    After the per-sample lines it gets one ``merge_vcfs.py`` line, one
    ``raxml-ng`` line and a final ``tb-profiler collate`` line.  The sample
    IDs are written, newline-separated, to ``<prefix>.samples.txt``.

    Args:
        args: Namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``.  It is mutated in place:
            ``sample_file``, ``id``, ``r1``, ``r2`` and ``snps_aln_file`` are
            set on it because the command templates are filled from
            ``vars(args)``.

    The generated script is executed with ``bash`` through ``ps.run_cmd``
    unless ``args.dry`` is true.
    """
    out_script = "%s.run.sh" % args.prefix
    args.sample_file = "%s.samples.txt" % args.prefix
    samples = []

    # Context managers guarantee the script is flushed and closed before it
    # is handed to bash, even if a check below raises.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        with open(args.csv) as csv_handle:
            for row in csv.DictReader(csv_handle):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write(
                    "illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n"
                    % vars(args))
                script.write(
                    "tb-profiler profile -p %(id)s -a %(id)s.bam -t %(threads)s\n"
                    % vars(args))
        script.write("merge_vcfs.py %(sample_file)s %(ref)s %(prefix)s\n" % vars(args))
        args.snps_aln_file = "%s.snps.fa" % args.prefix
        # The thread count for the tree search is taken from raxml-ng's own
        # --parse output at script run time (backtick substitution).
        script.write(
            "raxml-ng --msa %(snps_aln_file)s --search --model GTR+G --threads `raxml-ng --msa %(snps_aln_file)s --parse --model GTR+G | grep MPI | awk '{print $9}'`\n"
            % vars(args))
        script.write("tb-profiler collate\n")

    with open(args.sample_file, "w") as sample_handle:
        sample_handle.write("\n".join(samples))

    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
def reheader(bamfile, oldname, newname, threads):
    """Write a copy of ``bamfile`` whose header has ``oldname`` replaced by ``newname``.

    The header is dumped with ``samtools view -H``, every occurrence of
    ``oldname`` is substituted with ``sed``, and the edited header is applied
    with ``samtools reheader``.  The result is written to
    ``<bamfile without trailing .bam>.reheader.bam``.

    Args:
        bamfile: Path of the input BAM file.
        oldname: Text to replace in the header (used verbatim as a sed pattern).
        newname: Replacement text.
        threads: Value passed to ``samtools view -@``.

    The intermediate ``<bamfile>.header`` file is left on disk.
    """
    tmpfile = "%s.header" % bamfile
    cmd = "samtools view %s -H | sed 's/%s/%s/g' > %s" % (bamfile, oldname, newname, tmpfile)
    ps.run_cmd(cmd)

    # Strip only a trailing ".bam"; str.replace would also remove ".bam"
    # from directory names or from the middle of the file name.
    suffix = ".bam"
    stem = bamfile[:-len(suffix)] if bamfile.endswith(suffix) else bamfile
    newbamfile = "%s.reheader.bam" % stem

    cmd = "samtools reheader %s %s | samtools view -@ %s -b > %s" % (tmpfile, bamfile, threads, newbamfile)
    ps.run_cmd(cmd)
def main(args):
    """Build GEMMA inputs for every phenotype and optionally run the commands.

    Steps:
      1. Load ``args.bcf`` with ``ps.bcf`` and call ``get_mean_genotype()`` and
         ``get_genesum()`` on it; ``<prefix>.geno`` and ``<prefix>.genesum``
         are then used as GEMMA genotype inputs.
      2. Read ``args.csv`` (must contain an ``id`` column) and keep, for every
         sample present in the BCF, its value for each other column.
      3. For every phenotype listed (one per line) in ``args.phenos``, write
         ``<pheno>.pheno`` with one value per BCF sample (``NA`` when the
         sample has no value) and append a line of three chained ``gemma``
         calls to a command file obtained from ``ps.get_random_file()``.
      4. If ``args.preprocess`` is true, stop after logging; otherwise run the
         command file through ``parallel -j <args.threads>``.

    Args:
        args: Namespace providing ``bcf``, ``csv``, ``phenos``, ``preprocess``
            and ``threads``.
    """
    bcf = ps.bcf(args.bcf)
    bcf.get_mean_genotype()
    bcf.get_genesum()
    geno_file = bcf.prefix + ".geno"
    genesum_file = bcf.prefix + ".genesum"

    meta = {}
    for s in bcf.samples:
        meta[s] = {}

    with open(args.csv) as csv_handle:
        reader = csv.DictReader(csv_handle)
        # Take the column names from the header itself so the phenotype check
        # below also works when the CSV has no data rows.
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            for pheno in row.keys():
                if pheno == "id":
                    continue
                if row["id"] not in meta:
                    continue
                meta[row["id"]][pheno] = row[pheno]

    with open(args.phenos) as pheno_handle:
        phenos = [x.rstrip() for x in pheno_handle]

    cmd_file = ps.get_random_file()
    with open(cmd_file, "w") as commands:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                # NOTE(review): the meaning of the second ps.log argument is
                # defined in pathogenseq - confirm whether it aborts here.
                ps.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_out:
                pheno_out.write("\n".join([meta[s].get(pheno, "NA") for s in bcf.samples]))
            # The 2nd and 3rd gemma calls read output/<pheno>.cXX.txt via -k.
            commands.write("gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s.genesum -notsnp\n" % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno, pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        ps.log("Preprocessing finished\n", True)
    else:
        ps.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def main(args):
    """Generate consensus sequences from a BCF and optionally merge them.

    ``generate_consensus`` is called on the ``ps.bcf`` object built from
    ``args.bcf`` with the reference ``args.ref``, ``threads=args.threads`` and
    ``no_chrom=args.combine``.  When ``args.combine`` is true, the files
    ``<prefix>.<sample>.fasta`` of all samples are concatenated into
    ``<prefix>.genome.fasta`` and then removed.

    Args:
        args: Namespace providing ``bcf``, ``ref``, ``threads`` and ``combine``.
    """
    variants_path = args.bcf
    reference_path = args.ref
    variants = ps.bcf(variants_path)
    variants.generate_consensus(
        reference_path, threads=args.threads, no_chrom=args.combine)

    # Nothing to merge unless a combined genome file was requested.
    if not args.combine:
        return

    per_sample = []
    for sample in variants.samples:
        per_sample.append("%s.%s.fasta" % (variants.prefix, sample))
    joined = " ".join(per_sample)

    ps.run_cmd("cat %s > %s.genome.fasta" % (joined, variants.prefix))
    ps.run_cmd("rm %s" % (joined))
def process(args):
    """Write ``<prefix>.run.sh`` with one pipeline call per sample and optionally run it.

    Each row of ``args.csv`` (columns ``ID``, ``R1``, ``R2``) yields one
    ``illumina_pipeline.py`` line in the script.  The sample IDs are written,
    newline-separated, to ``<prefix>.samples.txt``.

    Args:
        args: Namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``.  It is mutated in place:
            ``sample_file``, ``id``, ``r1`` and ``r2`` are set on it because
            the command template is filled from ``vars(args)``.

    The generated script is executed with ``bash`` through ``ps.run_cmd``
    unless ``args.dry`` is true.
    """
    out_script = "%s.run.sh" % args.prefix
    args.sample_file = "%s.samples.txt" % args.prefix
    samples = []

    # ``with`` closes (and flushes) every handle even when a check raises,
    # so bash never sees a partially written script.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        with open(args.csv) as csv_handle:
            for row in csv.DictReader(csv_handle):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write("illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n" % vars(args))

    with open(args.sample_file, "w") as sample_handle:
        sample_handle.write("\n".join(samples))

    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
def process(args):
    """Generate the per-sample mapping script ``<prefix>.run.sh`` and optionally run it.

    The sample sheet ``args.csv`` is read with ``csv.DictReader``; its ``ID``,
    ``R1`` and ``R2`` columns produce one ``illumina_pipeline.py`` command per
    row.  All sample IDs are also saved, newline-separated, in
    ``<prefix>.samples.txt``.

    Args:
        args: Namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``.  The attributes
            ``sample_file``, ``id``, ``r1`` and ``r2`` are assigned on it,
            since the command line is formatted from ``vars(args)``.

    Unless ``args.dry`` is true the script is run via ``ps.run_cmd("bash ...")``.
    """
    out_script = "%s.run.sh" % args.prefix
    args.sample_file = "%s.samples.txt" % args.prefix
    samples = []

    # Files are opened with context managers so they are closed on every
    # path, including when one of the file checks fails.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        with open(args.csv) as sheet:
            for row in csv.DictReader(sheet):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write(
                    "illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n"
                    % vars(args))

    with open(args.sample_file, "w") as sample_list:
        sample_list.write("\n".join(samples))

    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
def main(args):
    """Prepare per-phenotype GEMMA inputs and, unless preprocessing only, run them.

    The BCF in ``args.bcf`` is loaded with ``ps.bcf`` and its
    ``get_mean_genotype()`` and ``get_genesum()`` methods are called; the
    files ``<prefix>.geno`` and ``<prefix>.genesum`` are passed to GEMMA as
    genotype inputs.  ``args.csv`` must have an ``id`` column; every other
    column is stored per sample for samples that occur in the BCF.

    For each phenotype name in ``args.phenos`` (one per line) a file
    ``<pheno>.pheno`` is written with one value per BCF sample (``NA`` when
    missing) and one line with three ``&&``-chained ``gemma`` calls is added
    to a command file from ``ps.get_random_file()``.

    Args:
        args: Namespace providing ``bcf``, ``csv``, ``phenos``, ``preprocess``
            and ``threads``.
    """
    bcf = ps.bcf(args.bcf)
    bcf.get_mean_genotype()
    bcf.get_genesum()
    geno_file = bcf.prefix + ".geno"
    genesum_file = bcf.prefix + ".genesum"

    meta = {}
    for s in bcf.samples:
        meta[s] = {}

    with open(args.csv) as csv_handle:
        reader = csv.DictReader(csv_handle)
        # Column names come from the CSV header rather than from the last
        # data row, so a header-only CSV no longer raises NameError below.
        known_columns = set(reader.fieldnames or [])
        for row in reader:
            for pheno in row.keys():
                if pheno == "id":
                    continue
                if row["id"] not in meta:
                    continue
                meta[row["id"]][pheno] = row[pheno]

    with open(args.phenos) as pheno_list:
        phenos = [x.rstrip() for x in pheno_list]

    cmd_file = ps.get_random_file()
    with open(cmd_file, "w") as commands:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in known_columns:
                # NOTE(review): what the True flag makes ps.log do is defined
                # in pathogenseq - confirm whether execution continues.
                ps.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_out:
                pheno_out.write("\n".join([
                    meta[s].get(pheno, "NA") for s in bcf.samples
                ]))
            # Calls two and three load output/<pheno>.cXX.txt through -k.
            commands.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s -maf 0.00005 -miss 0.99 && gemma -lmm 1 -p %s -g %s -k output/%s.cXX.txt -o %s.genesum -notsnp\n"
                % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
                   pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        ps.log("Preprocessing finished\n", True)
    else:
        ps.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
def merge_sample(sample, cram=False, reference=None, threads=20):
    """Merge the alignment files of the runs that make up a combined sample.

    ``sample`` is split on ``_``; each part names an input file
    ``<part>.bam`` (or ``<part>.cram`` when ``cram`` is true).  The inputs are
    merged with ``samtools merge`` into ``<sample>.tmp.bam``, the header of
    the first input is rewritten with ``sed`` so that the first part's name
    becomes ``sample``, and the reheadered result is written to
    ``<sample>.bam`` / ``<sample>.cram``.  The temporary header and merged
    files are removed afterwards.

    Args:
        sample: Combined sample name, parts joined by ``_``.
        cram: Read ``.cram`` inputs and write CRAM output when true.
        reference: Reference passed to ``samtools view -CT``; required when
            ``cram`` is true.
        threads: Value passed to ``samtools -@``.

    Raises:
        ValueError: If ``cram`` is true but no ``reference`` was given.
    """
    # Fail early instead of handing "-CT None" to samtools.
    if cram and not reference:
        raise ValueError("a reference is required when cram=True")

    cram_flag = "-CT %s" % reference if cram else "-b"
    extension = ".cram" if cram else ".bam"

    samples = sample.split("_")
    inputs = [name + extension for name in samples]
    for path in inputs:
        ps.filecheck(path)

    bams = " ".join(inputs)
    first_bam = inputs[0]
    temp_bam = sample + ".tmp.bam"
    final_bam = sample + extension
    header = sample + ".header"

    cmd = "samtools view %s -H | sed 's/%s/%s/g' > %s" % (
        first_bam, samples[0], sample, header)
    ps.run_cmd(cmd)
    cmd = "samtools merge -@ %s %s %s" % (threads, temp_bam, bams)
    ps.run_cmd(cmd)
    cmd = "samtools reheader %s %s | samtools view -@ %s %s > %s" % (
        header, temp_bam, threads, cram_flag, final_bam)
    ps.run_cmd(cmd)
    cmd = "rm %s %s" % (header, temp_bam)
    ps.run_cmd(cmd)
#! /usr/bin/env python
"""Call variants between a reference and a query with ``ps.mauve_call_variants``
and compress ``<prefix>.vcf`` with ``bgzip -f``.

Usage: <script> <ref_file> <query_file> <prefix>
"""
import sys

import pathogenseq as ps


def main(argv):
    """Run the variant calling for the command line ``argv`` (``sys.argv`` layout).

    Exits with a usage message when fewer than three arguments are supplied;
    extra arguments are ignored.
    """
    if len(argv) < 4:
        sys.exit("Usage: %s <ref_file> <query_file> <prefix>" % argv[0])
    ref_file = argv[1]
    query_file = argv[2]
    prefix = argv[3]

    ps.mauve_call_variants(ref_file, query_file, prefix)
    # presumably mauve_call_variants writes <prefix>.vcf - confirm in pathogenseq
    cmd = "bgzip -f %s.vcf" % prefix
    ps.run_cmd(cmd)


if __name__ == "__main__":
    main(sys.argv)
#! /usr/bin/env python
"""Run centrifuge on a read pair and pick the best-supported leaf and species.

Usage: <script> <r1> <r2> <prefix> <centrifuge_db> <fasta_db> <threads>
"""
import json
import sys

import pathogenseq as ps

if len(sys.argv) < 7:
    sys.exit(
        "Usage: %s <r1> <r2> <prefix> <centrifuge_db> <fasta_db> <threads>"
        % sys.argv[0])

args = {}
args["r1"] = sys.argv[1]
args["r2"] = sys.argv[2]
args["prefix"] = sys.argv[3]
args["centrifuge_db"] = sys.argv[4]
args["fasta_db"] = sys.argv[5]
args["threads"] = sys.argv[6]
# Both the classification output (-S) and the report go to files named by
# ps.get_random_file().
args["fq_report"] = ps.get_random_file()
args["log"] = ps.get_random_file()

cmd = "centrifuge -x %(centrifuge_db)s -1 %(r1)s -2 %(r2)s -S %(log)s --report-file %(fq_report)s -p %(threads)s" % args
ps.run_cmd(cmd)

# Highest read count seen so far for "leaf" rows and for "species" rows,
# together with the first-column value of the winning row.
best_ref = ""
best_score = 0
best_species = ""
best_species_score = 0

# The report is tab-separated; column 5 holds the read count (its header
# cell is "numReads") and column 3 is compared against "leaf"/"species".
with open(args["fq_report"]) as report:
    for line in report:
        row = line.rstrip().split("\t")
        if row[4] == "numReads":
            # header line
            continue
        if row[2] == "leaf" and int(row[4]) > best_score:
            best_ref = row[0]
            best_score = int(row[4])
        elif row[2] == "species" and int(row[4]) > best_species_score:
            best_species = row[0]
            best_species_score = int(row[4])