コード例 #1
0
def process(args):
    """Write a shell script that processes every sample, then optionally run it.

    The script ``<prefix>.run.sh`` receives, for each row of ``args.csv``
    (columns ``ID``, ``R1`` and ``R2``), one ``illumina_pipeline.py`` line and
    one ``tb-profiler profile`` line, followed by ``merge_vcfs.py``,
    ``raxml-ng`` and ``tb-profiler collate`` lines. The sample IDs are written
    one per line to ``<prefix>.samples.txt``.

    Args:
        args: namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``. The attributes ``id``, ``r1``,
            ``r2``, ``sample_file`` and ``snps_aln_file`` are set on it as a
            side effect because the command templates are filled from
            ``vars(args)``.
    """
    out_script = "%s.run.sh" % args.prefix
    samples = []
    # Context managers make sure the script and the CSV are flushed and closed
    # even if a file check or a missing CSV column aborts the loop.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        args.sample_file = "%s.samples.txt" % args.prefix
        with open(args.csv) as csv_handle:
            for row in csv.DictReader(csv_handle):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write(
                    "illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n"
                    % vars(args))
                script.write(
                    "tb-profiler profile -p %(id)s -a %(id)s.bam -t %(threads)s\n" %
                    vars(args))

        script.write(
            "merge_vcfs.py %(sample_file)s %(ref)s %(prefix)s\n" % vars(args))
        args.snps_aln_file = "%s.snps.fa" % args.prefix
        # The back-ticked sub-command runs raxml-ng in --parse mode and takes
        # the 9th field of the line mentioning MPI as the --threads value.
        script.write(
            "raxml-ng --msa %(snps_aln_file)s --search --model GTR+G --threads `raxml-ng --msa %(snps_aln_file)s --parse --model GTR+G | grep MPI | awk '{print $9}'`\n"
            % vars(args))
        script.write("tb-profiler collate\n")

    # The sample list must be on disk before the script runs, since the
    # merge_vcfs.py line reads it.
    with open(args.sample_file, "w") as sample_handle:
        sample_handle.write("\n".join(samples))
    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
コード例 #2
0
def reheader(bamfile, oldname, newname, threads):
    """Create a copy of a BAM file whose header has ``oldname`` replaced.

    The header is dumped with ``samtools view -H``, every occurrence of
    ``oldname`` is substituted by ``newname`` with ``sed``, and the edited
    header is applied with ``samtools reheader``. The result is written to
    ``<bamfile without .bam>.reheader.bam``.

    Args:
        bamfile: path of the BAM file to re-header.
        oldname: text to replace in the header (used verbatim as a sed pattern).
        newname: replacement text.
        threads: value passed to ``samtools view -@``.

    Returns:
        The path of the re-headered BAM file.
    """
    tmpfile = "%s.header" % bamfile
    cmd = "samtools view %s -H | sed 's/%s/%s/g' > %s" % (bamfile, oldname, newname, tmpfile)
    ps.run_cmd(cmd)
    # Strip only a trailing ".bam"; str.replace would also mangle any ".bam"
    # appearing earlier in the path (e.g. in a directory name).
    if bamfile.endswith(".bam"):
        stem = bamfile[:-len(".bam")]
    else:
        stem = bamfile
    newbamfile = "%s.reheader.bam" % stem
    cmd = "samtools reheader %s %s | samtools view -@ %s -b > %s" % (tmpfile, bamfile, threads, newbamfile)
    ps.run_cmd(cmd)
    # The header dump is only an intermediate; do not leave it behind.
    ps.run_cmd("rm %s" % tmpfile)
    return newbamfile
コード例 #3
0
def main(args):
    """Prepare GEMMA inputs for each phenotype and optionally run GEMMA.

    Genotype and gene-sum files are produced from ``args.bcf``. For every
    phenotype listed (one per line) in ``args.phenos`` a ``<pheno>.pheno`` file
    is written with one value per BCF sample, taken from the ``args.csv``
    column of the same name (``NA`` when the sample has no value). One line
    chaining three GEMMA invocations is written per phenotype to a command
    file, which is run through ``parallel`` unless ``args.preprocess`` is set.

    Args:
        args: namespace providing ``bcf``, ``csv``, ``phenos``, ``preprocess``
            and ``threads``. The CSV must contain an ``id`` column.
    """
    bcf = ps.bcf(args.bcf)
    bcf.get_mean_genotype()
    bcf.get_genesum()
    geno_file = bcf.prefix + ".geno"
    genesum_file = bcf.prefix + ".genesum"

    meta = {}
    for s in bcf.samples:
        meta[s] = {}

    with open(args.csv) as csv_handle:
        reader = csv.DictReader(csv_handle)
        # Remember the header so the phenotype check below does not depend on
        # a leftover loop variable (undefined when the CSV has no data rows).
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            sample = row["id"]
            if sample not in meta:
                continue
            for pheno in row:
                if pheno == "id":
                    continue
                meta[sample][pheno] = row[pheno]

    with open(args.phenos) as phenos_handle:
        phenos = [x.rstrip() for x in phenos_handle]

    cmd_file = ps.get_random_file()
    with open(cmd_file, "w") as cmd_handle:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                # NOTE(review): the meaning of the second argument is defined
                # by ps.log -- presumably it aborts; confirm.
                ps.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_handle:
                pheno_handle.write(
                    "\n".join([meta[s].get(pheno, "NA") for s in bcf.samples]))
            cmd_handle.write("gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n" % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno, pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        ps.log("Preprocessing finished\n", True)
    else:
        ps.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
コード例 #4
0
def main(args):
    """Build per-sample consensus FASTA files and optionally merge them.

    ``generate_consensus`` is called on the BCF given by ``args.bcf`` with
    ``args.ref`` as reference. When ``args.combine`` is truthy, the
    ``<prefix>.<sample>.fasta`` files are concatenated into
    ``<prefix>.genome.fasta`` and then deleted.

    Args:
        args: namespace providing ``bcf``, ``ref``, ``threads`` and ``combine``.
    """
    bcf_path = args.bcf
    ref_path = args.ref
    variants = ps.bcf(bcf_path)
    variants.generate_consensus(
        ref_path, threads=args.threads, no_chrom=args.combine)

    # Nothing more to do unless a combined genome file was requested.
    if not args.combine:
        return

    per_sample = []
    for sample in variants.samples:
        per_sample.append("%s.%s.fasta" % (variants.prefix, sample))
    file_list = " ".join(per_sample)

    ps.run_cmd("cat %s > %s.genome.fasta" % (file_list, variants.prefix))
    ps.run_cmd("rm %s" % (file_list))
コード例 #5
0
def process(args):
    """Write a shell script with one pipeline call per sample, then optionally run it.

    For each row of ``args.csv`` (columns ``ID``, ``R1`` and ``R2``) an
    ``illumina_pipeline.py`` line is appended to ``<prefix>.run.sh``. The
    sample IDs are written one per line to ``<prefix>.samples.txt``.

    Args:
        args: namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``. The attributes ``id``, ``r1``,
            ``r2`` and ``sample_file`` are set on it as a side effect because
            the command template is filled from ``vars(args)``.
    """
    out_script = "%s.run.sh" % args.prefix
    samples = []
    # Context managers make sure the script and the CSV are flushed and closed
    # even if a file check or a missing CSV column aborts the loop.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        args.sample_file = "%s.samples.txt" % args.prefix
        with open(args.csv) as csv_handle:
            for row in csv.DictReader(csv_handle):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write("illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n" % vars(args))

    with open(args.sample_file, "w") as sample_handle:
        sample_handle.write("\n".join(samples))
    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
コード例 #6
0
def process(args):
    """Write a shell script with one pipeline call per sample, then optionally run it.

    For each row of ``args.csv`` (columns ``ID``, ``R1`` and ``R2``) an
    ``illumina_pipeline.py`` line is appended to ``<prefix>.run.sh``. The
    sample IDs are written one per line to ``<prefix>.samples.txt``.

    Args:
        args: namespace providing ``prefix``, ``ref``, ``csv``, ``fastq_dir``,
            ``threads``, ``mapper`` and ``dry``. The attributes ``id``, ``r1``,
            ``r2`` and ``sample_file`` are set on it as a side effect because
            the command template is filled from ``vars(args)``.
    """
    out_script = "%s.run.sh" % args.prefix
    samples = []
    # Context managers make sure the script and the CSV are flushed and closed
    # even if a file check or a missing CSV column aborts the loop.
    with open(out_script, "w") as script:
        ps.filecheck(args.ref)
        args.sample_file = "%s.samples.txt" % args.prefix
        with open(args.csv) as csv_handle:
            for row in csv.DictReader(csv_handle):
                args.id = row["ID"]
                samples.append(row["ID"])
                args.r1 = "%s/%s" % (args.fastq_dir, row["R1"])
                ps.filecheck(args.r1)
                args.r2 = "%s/%s" % (args.fastq_dir, row["R2"])
                ps.filecheck(args.r2)
                script.write(
                    "illumina_pipeline.py %(ref)s %(r1)s %(r2)s %(id)s -t %(threads)s -m %(mapper)s \n"
                    % vars(args))

    with open(args.sample_file, "w") as sample_handle:
        sample_handle.write("\n".join(samples))
    if not args.dry:
        ps.run_cmd("bash %s" % out_script)
コード例 #7
0
def main(args):
    """Prepare GEMMA inputs for each phenotype and optionally run GEMMA.

    Genotype and gene-sum files are produced from ``args.bcf``. For every
    phenotype listed (one per line) in ``args.phenos`` a ``<pheno>.pheno`` file
    is written with one value per BCF sample, taken from the ``args.csv``
    column of the same name (``NA`` when the sample has no value). One line
    chaining three GEMMA invocations is written per phenotype to a command
    file, which is run through ``parallel`` unless ``args.preprocess`` is set.

    Args:
        args: namespace providing ``bcf``, ``csv``, ``phenos``, ``preprocess``
            and ``threads``. The CSV must contain an ``id`` column.
    """
    bcf = ps.bcf(args.bcf)
    bcf.get_mean_genotype()
    bcf.get_genesum()
    geno_file = bcf.prefix + ".geno"
    genesum_file = bcf.prefix + ".genesum"

    meta = {}
    for s in bcf.samples:
        meta[s] = {}

    with open(args.csv) as csv_handle:
        reader = csv.DictReader(csv_handle)
        # Remember the header so the phenotype check below does not depend on
        # a leftover loop variable (undefined when the CSV has no data rows).
        csv_columns = set(reader.fieldnames or [])
        for row in reader:
            sample = row["id"]
            if sample not in meta:
                continue
            for pheno in row:
                if pheno == "id":
                    continue
                meta[sample][pheno] = row[pheno]

    with open(args.phenos) as phenos_handle:
        phenos = [x.rstrip() for x in phenos_handle]

    cmd_file = ps.get_random_file()
    with open(cmd_file, "w") as cmd_handle:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                # NOTE(review): the meaning of the second argument is defined
                # by ps.log -- presumably it aborts; confirm.
                ps.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_handle:
                pheno_handle.write(
                    "\n".join([meta[s].get(pheno, "NA") for s in bcf.samples]))
            cmd_handle.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n"
                % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
                   pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        ps.log("Preprocessing finished\n", True)
    else:
        ps.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
コード例 #8
0
ファイル: merge_bams.py プロジェクト: jodyphelan/pathogenseq
def merge_sample(sample, cram=False, reference=None, threads=20):
    """Merge the alignment files of the runs that make up a combined sample.

    ``sample`` is a ``_``-joined list of run names; each run must have a
    ``<run>.bam`` (or ``<run>.cram`` when ``cram`` is true) file. The files are
    merged with ``samtools merge``, and the header of the first run, with that
    run's name replaced by ``sample``, is applied to the merged file, which is
    written to ``<sample>.bam`` / ``<sample>.cram``. The intermediate header
    and merged files are removed afterwards.

    Args:
        sample: combined sample name, e.g. ``"runA_runB"``.
        cram: read ``.cram`` inputs and write CRAM output instead of BAM.
        reference: reference FASTA passed to ``samtools view -T``; required
            when ``cram`` is true.
        threads: value passed to ``samtools -@``.

    Raises:
        ValueError: if ``cram`` is true but no ``reference`` is given.
    """
    # Fail before any work is done rather than handing "-CT None" to samtools
    # after the (potentially long) merge has already run.
    if cram and not reference:
        raise ValueError("a reference is required when cram=True")

    cram_flag = "-CT %s" % reference if cram else "-b"
    extension = ".cram" if cram else ".bam"
    samples = sample.split("_")

    inputs = [x + extension for x in samples]
    for path in inputs:
        ps.filecheck(path)
    bams = " ".join(inputs)
    first_bam = inputs[0]
    temp_bam = sample + ".tmp.bam"
    final_bam = sample + extension
    header = sample + ".header"

    # Header of the first run, renamed to the combined sample name.
    cmd = "samtools view %s -H | sed 's/%s/%s/g' > %s" % (
        first_bam, samples[0], sample, header)
    ps.run_cmd(cmd)
    cmd = "samtools merge -@ %s %s %s" % (threads, temp_bam, bams)
    ps.run_cmd(cmd)
    cmd = "samtools reheader %s %s | samtools view -@ %s %s > %s" % (
        header, temp_bam, threads, cram_flag, final_bam)
    ps.run_cmd(cmd)
    cmd = "rm %s %s" % (header, temp_bam)
    ps.run_cmd(cmd)
コード例 #9
0
ファイル: fasta2vcf.py プロジェクト: jodyphelan/pathogenseq
#! /usr/bin/env python
import sys
import pathogenseq as ps

# Positional arguments: reference FASTA, query FASTA, output prefix.
ref_file, query_file, prefix = sys.argv[1], sys.argv[2], sys.argv[3]

# Call variants between the two sequences, then compress the
# resulting <prefix>.vcf in place.
ps.mauve_call_variants(ref_file, query_file, prefix)
cmd = "bgzip -f %s.vcf" % prefix
ps.run_cmd(cmd)
コード例 #10
0
#! /usr/bin/env python
import sys
import pathogenseq as ps
import json

# Positional arguments: read files, output prefix, centrifuge index,
# FASTA database and thread count; the report and log go to scratch files.
args = {
    "r1": sys.argv[1],
    "r2": sys.argv[2],
    "prefix": sys.argv[3],
    "centrifuge_db": sys.argv[4],
    "fasta_db": sys.argv[5],
    "threads": sys.argv[6],
    "fq_report": ps.get_random_file(),
    "log": ps.get_random_file(),
}

# Classify the read pairs; the per-taxon summary lands in args["fq_report"].
cmd = "centrifuge -x %(centrifuge_db)s -1 %(r1)s -2 %(r2)s -S %(log)s --report-file %(fq_report)s -p %(threads)s" % args
ps.run_cmd(cmd)

# Scan the tab-separated centrifuge report and remember the entry with the
# highest read count among "leaf" rows and, separately, among "species" rows.
# NOTE(review): columns 0, 2 and 4 are presumably name, taxRank and numReads
# of the centrifuge report format -- confirm against the centrifuge manual.
best_ref = ""
best_score = 0
best_species = ""
best_species_score = 0
for l in open(args["fq_report"]):
    row = l.rstrip().split("\t")
    # Skip the header line, whose fifth column holds the literal "numReads".
    if row[4] == "numReads": continue
    if row[2] == "leaf" and int(row[4]) > best_score:
        best_ref = row[0]
        best_score = int(row[4])
    elif row[2] == "species" and int(row[4]) > best_species_score:
        best_species = row[0]
        best_species_score = int(row[4])