Esempio n. 1
0
def main(args):
	"""Run a GEMMA association workflow for each phenotype in a CSV.

	Builds genotype/genesum matrices from the VCF, writes one .pheno file
	per phenotype (NA for samples without a value) and a command file of
	gemma invocations, then runs them via GNU parallel unless
	--preprocess was given.
	"""
	vcf = vcf_class(args.vcf)
	vcf.get_mean_genotype()
	vcf.get_genesum()
	geno_file = vcf.prefix+".geno"
	genesum_file = vcf.prefix+".genesum"

	meta = {}
	csv_columns = set()
	for row in csv.DictReader(open(args.pheno_csv)):
		if args.pheno_id_column not in row:
			quit(bcolors.FAIL + f"\nError: Can't find {args.pheno_id_column} in csv columns, set it with --pheno-id-column\n" + bcolors.ENDC)
		csv_columns.update(row)
		meta[row[args.pheno_id_column]] = row

	if args.pheno_name_file:
		phenos = [l.strip() for l in open(args.pheno_name_file)]
	else:
		phenos = args.pheno_names

	cmd_file = str(uuid4())
	X = open(cmd_file,"w")
	for pheno in phenos:
		pheno_file = f"{pheno}.pheno"
		# BUG FIX: this previously tested membership against the leftover
		# `row` variable from the CSV loop above (undefined for an empty
		# CSV); test against the collected CSV column names instead.
		if pheno not in csv_columns:
			quit(bcolors.FAIL +  f"{pheno} not in CSV file" + bcolors.ENDC)
		P = open(pheno_file,"w")
		P.write("\n".join([meta[s][pheno] if pheno in meta[s] else "NA" for s in vcf.samples]))
		P.close()
		X.write("gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n" % (pheno_file,geno_file,pheno,pheno_file,geno_file,pheno,pheno,pheno_file,genesum_file,pheno,pheno))
	X.close()

	if args.preprocess:
		fm.log("Preprocessing finished\n", True)
	else:
		fm.run_cmd("cat %s | parallel -j %s" % (cmd_file,args.threads))
Esempio n. 2
0
def main(args):
    """Merge several bams into one, stamping a fresh read-group id.

    Inputs come either from --prefix (underscore-separated run ids joined
    with --dir/--suffix) or an explicit --bams list.  The merged bam is
    reheadered without the old @RG lines and a new read group is added via
    samtools addreplacerg, then indexed.
    """
    if args.prefix:
        # Runs are encoded in the prefix as underscore-separated tokens.
        individual_bams = ["%s/%s%s" % (args.dir,run,args.suffix) for run in args.prefix.split("_")]
        new_id = args.new_id if args.new_id else args.prefix
    elif args.bams:
        individual_bams = args.bams.split(",")
        new_id = args.new_id if args.new_id else "_".join([bam.split("/")[-1].replace(args.suffix,"") for bam in individual_bams])
    else:
        # BUG FIX: message previously read "Need wither"; also the old
        # `elif (not prefix and not bams) or (prefix and bams)` condition's
        # second arm was unreachable (the first `if` catches any prefix).
        sys.stderr.write("Need either '--bams' or '--prefix'... Exiting!\n")
        quit()
    if len(individual_bams)==1:
        sys.stderr.write("Need more than one bam... Exiting!\n")
        quit()
    for bam in individual_bams:
        fm.filecheck(bam)
    new_bamfile = "%s/%s%s" % (args.dir,new_id,args.suffix)
    tmp_file = fm.get_random_file()
    with open(tmp_file,"w") as O:
        for l in fm.cmd_out("samtools view -H %s" % individual_bams[0]):
            row = l.strip().split("\t")
            if row[0]=="@RG":
                # Drop existing read-group lines; a fresh @RG is attached
                # below by `samtools addreplacerg`.  (Two unreachable lines
                # after this `continue` that tried to rewrite the RG in
                # place were removed.)
                continue
            O.write("%s\n" % "\t".join(row))

    fm.run_cmd("samtools merge -@ %s - %s | samtools reheader -i %s - | samtools addreplacerg -@ %s - -r 'ID:%s\\tSM:%s\\tPL:Illumina' -o %s" % (
        args.threads," ".join(individual_bams), tmp_file, args.threads,new_id, new_id, new_bamfile)
    )
    fm.run_cmd("samtools index %s" % new_bamfile)
    # BUG FIX: a `tmp_bamfile` name was previously allocated but never used;
    # only the header file needs cleaning up.
    fm.rm_files([tmp_file])
Esempio n. 3
0
def main(args):
    """Anchor-extract a gene sequence from a subject assembly via BLAST.

    BLASTs the first and last `anchor_size` bases of the query gene against
    the subject, records an overall status in <prefix>.result.txt and, when
    both anchors hit the same subject contig, writes the spanned sequence to
    <prefix>.extracted_seq.fa.
    """
    fm.filecheck(args.query)
    fm.filecheck(args.subject)

    # First sequence in the query fasta is taken as the reference gene.
    ref_gene_seq = list(fm.fasta(args.query).fa_dict.values())[0]

    start_anchor = ref_gene_seq[:args.anchor_size]
    end_anchor = ref_gene_seq[-args.anchor_size:]

    tmp_in = fm.get_random_file()
    tmp_out = fm.get_random_file()
    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % start_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" %
               (tmp_in, args.subject, tmp_out),
               verbose=0)
    # Keep hits covering at least 90% of the anchor length.
    start_hits = parse_blast(tmp_out, args.anchor_size * 0.9)

    with open(tmp_in, "w") as O:
        O.write(">tmp\n%s" % end_anchor)
    fm.run_cmd("blastn -task blastn -query %s -subject %s -outfmt 15 > %s" %
               (tmp_in, args.subject, tmp_out),
               verbose=0)
    end_hits = parse_blast(tmp_out, args.anchor_size * 0.9)

    fm.rm_files([tmp_in, tmp_out])

    # NOTE(review): assumes each anchor produced at least one hit — an empty
    # hit list raises IndexError below; confirm parse_blast's behaviour.
    result_type = ""
    if args.strict_one_hit and (len(start_hits) > 1 or len(end_hits) > 1):
        result_type = "NA"
    else:
        if start_hits[0]["subject_seq"] == end_hits[0]["subject_seq"]:
            result_type = "OK"
            start_hit = start_hits[0]
            end_hit = end_hits[0]
        else:
            result_type = "Fragmented"

    with open("%s.result.txt" % args.prefix, "w") as O:
        O.write("%s\t%s\n" % (args.prefix, result_type))

    if result_type != "OK":
        quit()

    subject_seqs = fm.fasta(args.subject).fa_dict
    if start_hit["subject_strand"] == "Plus" and end_hit[
            "subject_strand"] == "Plus":
        hit_seq = subject_seqs[
            start_hit["subject_seq"]][start_hit["subject_start"] -
                                      1:end_hit["subject_end"]]
    elif start_hit["subject_strand"] == "Minus" and end_hit[
            "subject_strand"] == "Minus":
        hit_seq = revcom(subject_seqs[start_hit["subject_seq"]]
                         [end_hit["subject_end"] -
                          1:start_hit["subject_start"]])
    else:
        # BUG FIX: previously fell through with `hit_seq` unbound (NameError
        # at the write below) when the anchors hit opposite strands.
        quit("Anchors hit opposite strands for %s... Exiting!" % args.prefix)

    with open("%s.extracted_seq.fa" % args.prefix, "w") as O:
        O.write(">%s\n%s\n" % (args.prefix, hit_seq))
Esempio n. 4
0
def main_trim(args):
    """Quality-trim reads with trimmomatic, in SE or PE mode."""
    # Shared trimming settings for both modes.
    trim_settings = "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:36"
    if args.single:
        fm.run_cmd(
            f"trimmomatic SE -phred33 {args.read1} {args.prefix}_trimmed.fq "
            f"{trim_settings}")
    else:
        fm.run_cmd(
            f"trimmomatic PE -phred33 {args.read1} {args.read2} "
            f"-baseout {args.prefix} {trim_settings}")
Esempio n. 5
0
def main_gatk(args):
    """Call a gVCF with GATK HaplotypeCaller and validate it.

    Defaults the output prefix to the bam filename minus its extension, and
    touches <prefix>.g.vcf.gz.validated on successful validation.
    """
    if not args.prefix:
        args.prefix = args.bam.replace(".bam", "")
    call_cmd = (
        f"gatk HaplotypeCaller -I {args.bam} -R {args.ref} "
        f"-O {args.prefix}.g.vcf.gz -ERC {args.erc}")
    fm.run_cmd(call_cmd)
    validate_cmd = (
        f"gatk ValidateVariants -V {args.prefix}.g.vcf.gz -gvcf -R {args.ref} "
        f"&& touch {args.prefix}.g.vcf.gz.validated")
    fm.run_cmd(validate_cmd)
def download_files(directory=None):
    """Download and unpack the NCBI taxonomy dump.

    Args:
        directory: target directory; defaults to ~/.taxonkit/ (created if
            missing).
    """
    sys.stderr.write("Downloading required files\n")
    import urllib.request
    if not directory:
        directory = "%s/.taxonkit/" % os.path.expanduser("~")
    if not os.path.isdir(directory):
        os.mkdir(directory)

    # BUG FIX: the URL previously carried a trailing space, which makes the
    # requested FTP path wrong on Python versions that do not strip it.
    urllib.request.urlretrieve(
        'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',
        '%s/taxdump.tar.gz' % directory)
    fm.run_cmd("tar -C %s -xvf %s/taxdump.tar.gz" % (directory, directory))
Esempio n. 7
0
def main(args):
    """Build a tab-separated pairwise distance matrix from a VCF via plink.

    plink's square distance matrix is halved per cell and written to
    <prefix>.dists with a sample-name header line.
    """
    vcf_obj = vcf_class(args.vcf)
    run_cmd(
        "plink --vcf %(vcf)s --distance square --double-id --allow-extra-chr --vcf-half-call missing --out %(vcf)s"
        % vars(args))
    with open("%s.dists" % vcf_obj.prefix, "w") as out:
        out.write("%s\n" % "\t".join(vcf_obj.samples))
        for line in open("%s.dist" % args.vcf):
            values = line.strip().split()
            halved = [str(float(v) / 2) for v in values]
            out.write("%s\n" % "\t".join(halved))
    run_cmd("rm %(vcf)s.dist %(vcf)s.log" % vars(args))
Esempio n. 8
0
 def __init__(self, filename, threads=4):
     """Wrap a VCF: index it if needed and load its sample names."""
     self.filename = filename
     self.threads = threads
     self.prefix = get_vcf_prefix(filename)
     self.samples = []
     # Create the bcftools index only when it is missing.
     if nofile(filename + ".csi"):
         run_cmd("bcftools index  %(filename)s" % vars(self))
     # Sample names are fetched via bcftools into a throwaway file.
     self.temp_file = get_random_file()
     run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
     with open(self.temp_file) as handle:
         self.samples = [line.rstrip() for line in handle]
     os.remove(self.temp_file)
def main(args):
    """Filter fastq reads by kraken2 assignment (extract or exclude a clade).

    Uses taxonkit to expand the requested taxid into all descendant ids,
    selects matching (or non-matching, with --exclude) read names from the
    kraken2 output, and subsets the fastq file(s) with seqtk.
    """
    check_programs(["taxonkit", "seqtk"])
    if not os.path.isdir(
            "%s/.taxonkit/" % os.path.expanduser("~")) or not os.path.isfile(
                "%s/.taxonkit/nodes.dmp" % os.path.expanduser("~")):
        download_files()

    nodes = set()
    sys.stderr.write("Loading taxonomy\n")
    target_taxid = args.extract if args.extract else args.exclude
    for line in fm.cmd_out("taxonkit list --ids %s" % target_taxid):
        if line == "":
            continue
        nodes.add(line.strip().split()[0])

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    total_reads = 0
    kept_reads = 0
    # With --exclude we keep reads NOT in the clade; otherwise the opposite.
    keep_if_hit = not args.exclude
    with open(args.tmp_file, "w") as O:
        for line in tqdm(open(fm.filecheck(args.kraken2_output))):
            total_reads += 1
            fields = line.strip().split()
            if (fields[2] in nodes) == keep_if_hit:
                O.write("%s\n" % fields[1])
                kept_reads += 1

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(
        ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" %
               vars(args))

    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(
            ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" %
                   vars(args))

    fm.rm_files([args.tmp_file])
    sys.stderr.write("\nKept %s/%s reads\n" % (kept_reads, total_reads))
def main(args):
    """Filter fastq reads by kraken2 assignment using a local taxonomy dump.

    Builds a parent->children map from the tax dump, expands the requested
    taxids into their full subtrees, selects matching (or non-matching, with
    --exclude) read names from the kraken2 output, and subsets the fastq
    file(s) with seqtk.
    """
    nodes = defaultdict(set)
    sys.stderr.write("Loading taxonomy\n")
    for l in tqdm(open(fm.filecheck(args.tax_dump))):
        row = l.strip().split()
        nodes[row[2]].add(row[0])

    def collect_subtree(root):
        # BUG FIX: the previous recursive get_tax/flatten pair could hit
        # Python's recursion limit on deep taxonomies; this iterative
        # depth-first walk yields the same set of taxids.
        found = []
        stack = [root]
        while stack:
            t = stack.pop()
            found.append(t)
            stack.extend(nodes[t])
        return found

    sys.stderr.write("Extracting read names\n")
    args.tmp_file = str(uuid4())
    taxids = args.exclude if args.exclude else args.extract
    tax_tree = set()
    for t in taxids.split(","):
        tax_tree.update(collect_subtree(t))
    # With --exclude we keep reads NOT in the subtree; otherwise the opposite.
    keep_if_hit = not args.exclude
    with open(args.tmp_file, "w") as O:
        for l in tqdm(open(fm.filecheck(args.kraken2_output))):
            row = l.strip().split()
            if (row[2] in tax_tree) == keep_if_hit:
                O.write("%s\n" % row[1])

    sys.stderr.write("Writing filtered fastq files\n")
    fm.filecheck(args.R1)
    args.R1_filt = args.R1.replace(".fastq.gz", "").replace(
        ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
    fm.run_cmd("seqtk subseq %(R1)s %(tmp_file)s | gzip -c > %(R1_filt)s" %
               vars(args))

    if args.R2:
        fm.filecheck(args.R2)
        args.R2_filt = args.R2.replace(".fastq.gz", "").replace(
            ".fq.gz", "").replace(".fastq", "") + ".kraken2_filt.fastq.gz"
        fm.run_cmd("seqtk subseq %(R2)s %(tmp_file)s | gzip -c > %(R2_filt)s" %
                   vars(args))

    fm.rm_files([args.tmp_file])
Esempio n. 11
0
def main(args):
    """Build a fasta alignment from a VCF and optionally run iqtree on it.

    With --whole-genome a per-sample consensus is generated via bcftools
    consensus (parallelised), otherwise vcf_to_fasta() builds a SNP fasta.
    """
    vcf_obj = vcf_class(args.vcf)
    if args.whole_genome:
        args.sample_file = get_random_file()
        # BUG FIX: the sample list was previously written through an
        # unclosed file handle; close it so the subprocess below reads a
        # fully-flushed file.
        with open(args.sample_file, "w") as O:
            O.write("\n".join(vcf_obj.samples) + "\n")
        run_cmd(
            'cat %(sample_file)s | parallel  --bar -j %(threads)s "bcftools consensus -f %(ref)s -s {} %(vcf)s | sed \'s/^>.*/>{}/\' > {}.tmp.fasta"'
            % vars(args))
        run_cmd('cat %s > %s.fa' %
                (" ".join(["%s.tmp.fasta" % s
                           for s in vcf_obj.samples]), vcf_obj.prefix))
        run_cmd('rm %s %s' %
                (" ".join(["%s.tmp.fasta" % s
                           for s in vcf_obj.samples]), args.sample_file))
        if args.tree:
            run_cmd("iqtree -s %s.fa -m GTR+G+ASC -nt AUTO" % vcf_obj.prefix)
    else:
        fasta_file = vcf_obj.vcf_to_fasta(args.ref, nofilt=args.snps_no_filt)
        if args.tree:
            run_cmd("iqtree -s %s -m GTR+G+ASC -nt AUTO" % fasta_file)
def main(args):
    """Summarise per-chromosome depth from `bedtools genomecov` histograms.

    Writes a CSV with mean/std depth and the percentage of the chromosome
    covered above depths 0, 5 and 10.
    """
    which("bedtools")
    if not os.path.isfile(args.bam + ".genomecov.txt"):
        fm.run_cmd("bedtools genomecov -ibam %(bam)s > %(bam)s.genomecov.txt" %
                   vars(args))
    # dp[chrom][depth] -> {"freq": bases at that depth, "fraction": of chrom}
    dp = defaultdict(dict)
    for l in open(args.bam + ".genomecov.txt"):
        row = l.strip().split()
        dp[row[0]][int(row[1])] = {
            "freq": int(row[2]),
            "fraction": float(row[4])
        }

    def frac_at_or_below(chrom, max_depth):
        # BUG FIX: genomecov omits depth values with no bases, so indexing
        # dp[chrom][x] directly raised KeyError; missing depths count as 0.
        return sum(dp[chrom][x]["fraction"]
                   for x in range(max_depth + 1) if x in dp[chrom])

    with open(args.out, "w") as O:
        writer = csv.DictWriter(
            O, fieldnames=["chrom", "mean", "std", "dp_0", "dp_5", "dp_10"])
        writer.writeheader()
        for chrom in dp:
            # Weighted stats: depth values weighted by base counts.
            d1 = statsmodels.stats.weightstats.DescrStatsW(
                list(dp[chrom].keys()),
                [x["freq"] for x in dp[chrom].values()])
            writer.writerow({
                "chrom": chrom,
                "mean": d1.mean,
                "std": d1.std,
                "dp_0": (1 - frac_at_or_below(chrom, 0)) * 100,
                "dp_5": (1 - frac_at_or_below(chrom, 5)) * 100,
                "dp_10": (1 - frac_at_or_below(chrom, 10)) * 100,
            })
Esempio n. 13
0
def main(args):
    """Run a command over genome chunks of a VCF in parallel, then merge.

    Splits the reference into `args.chunks` windows with bedtools, pipes
    each window of the VCF through `args.cmd` via GNU parallel, then
    concatenates the per-chunk VCFs with bcftools concat and cleans up.
    """
    # Random tag used to namespace the per-chunk output files.
    randint = rand_generator.randint(1, 999999)

    # Emits "chrom:start-end chrom_start_end" pairs (1-based start) per window.
    window_cmd = "bedtools makewindows -n %(chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % vars(
        args)

    # {1} is the region string, {2} the filename-safe label, both supplied
    # by parallel --col-sep from window_cmd's output.
    cmd_to_run = "\"bcftools view --threads %s -r {1} %s -Ou | %s | bcftools view  -Oz -o %s_{2}.vcf.gz\"" % (
        args.compression_threads, args.vcf, args.cmd, randint)
    fm.run_cmd(
        f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {cmd_to_run}",
        verbose=2)
    # Re-run window_cmd in backticks to regenerate the chunk file list.
    fm.run_cmd(
        "bcftools concat -Oz -o %s `%s | awk '{print \"%s_\"$2\".vcf.gz\"}'`" %
        (args.out, window_cmd, randint))
    fm.run_cmd("rm `%s | awk '{print \"%s_\"$2\".vcf.gz*\"}'`" %
               (window_cmd, randint))
Esempio n. 14
0
def main_genotype(args):
    """Joint-genotype per-window GenomicsDB workspaces and merge the VCFs.

    The number of genome windows comes from <prefix>.dbconf.json; one
    <prefix>_<window>_genomics_db workspace must exist per window.
    """
    conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
    params = vars(args)
    params["num_genome_chunks"] = conf["num_genome_chunks"]
    # Emits "chrom:start-end chrom_start_end" pairs (1-based start) per window.
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    params["window_cmd"] = window_cmd
    # Check folders exist
    for l in cmd_out(window_cmd):
        row = l.strip().split()
        dirname = "%s_%s_genomics_db" % (args.prefix,row[1])
        sys.stderr.write("Looking for direcotry named %s..." % dirname)
        foldercheck(dirname)
        sys.stderr.write("OK\n")

    # {2} is the filename-safe window label supplied by parallel --col-sep.
    genotype_cmd = "gatk --java-options \"-Xmx40g\" GenotypeGVCFs -R %(ref)s -V gendb://%(prefix)s_{2}_genomics_db -O %(prefix)s.{2}.genotyped.vcf.gz" % params
    run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {genotype_cmd}",verbose=2)
    # window_cmd is re-run in backticks to regenerate the chunk file list.
    run_cmd("bcftools concat -Oz -o %(prefix)s.%(subfix_vcf)s.genotyped.vcf.gz `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz\"}'`" % params)
    run_cmd("rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".genotyped.vcf.gz*\"}'`" % params)
Esempio n. 15
0
    def vcf_to_matrix(self, iupacgt=True):
        """Write genotype matrices for the VCF via bcftools query.

        Produces a character matrix (<prefix>.mat, or <prefix>.noniupac.mat
        when iupacgt is False) and a 0/0.5/1-coded matrix
        (<prefix>.mat.bin), each headed by chr/pos/ref plus sample names.

        Args:
            iupacgt: if True use bcftools' IUPAC genotype codes; otherwise
                collapse per-allele genotypes to single characters.
        """
        self.matrix_file = self.prefix + ".mat"
        self.binary_matrix_file = self.prefix + ".mat.bin"

        # BUG FIX: this previously consulted the global `args.no_iupacgt`,
        # silently ignoring the method's own `iupacgt` parameter.
        if not iupacgt:
            self.matrix_file = self.prefix + ".noniupac.mat"
            # BUG FIX: header lines were written through unclosed handles;
            # close them before bcftools appends to the same file.
            with open(self.matrix_file, "w") as O:
                O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
            run_cmd(
                "bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%TGT]\\n' %(filename)s | sed 's/\.\/./N/g; s/\([ACTG]\)\///g; s/|//g' | sed -r 's/([ACGT])\\1+/\\1/g' >> %(matrix_file)s"
                % vars(self))
        else:
            with open(self.matrix_file, "w") as O:
                O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
            run_cmd(
                "bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%IUPACGT]\\n' %(filename)s | tr '|' '/' | sed 's/\.\/\./N/g' >> %(matrix_file)s"
                % vars(self))

        with open(self.binary_matrix_file, "w") as O:
            O.write("chr\tpos\tref\t%s\n" % ("\t".join(self.samples)))
        run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF[\\t%%GT]\\n' %(filename)s | tr '|' '/' | sed 's/\.\/\./N/g' | sed 's/0\/1/0.5/g' | sed 's/1\/1/1/g' | sed 's/0\/0/0/g' >> %(binary_matrix_file)s"
            % vars(self))
Esempio n. 16
0
def main(args):
    """Filter a VCF per genome window in parallel and concatenate the result.

    Assembles a bcftools pipeline (optionally dropping indels, excluded
    regions and non-GT annotations; masking het and low-pass sites), runs it
    over 20 windows via GNU parallel, then concatenates the chunks into
    <prefix>.filtered[_no_indels].vcf.gz.
    """
    original_vcf = fm.vcf_class(args.vcf, threads=args.threads)
    args.prefix = original_vcf.prefix
    args.filename = original_vcf.filename
    # Optional pipeline stages; an empty string disables a stage.
    args.indels_cmd = "" if args.keep_indels else "bcftools view -V indels | "
    args.exclude_cmd = f"bcftools view -T ^{args.exclude_bed} |" if args.exclude_bed else ""
    args.annotation_drop = "" if args.keep_genotype_info else "bcftools annotate -x ^FORMAT/GT | "

    # Emits "chrom:start-end chrom_start_end" pairs (1-based start) per window.
    args.window_cmd = "bedtools makewindows -g %(ref)s.fai -n 20 | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % vars(
        args)
    # Double-escaped quotes survive the surrounding parallel "..." wrapper.
    args.filter_cmd = (
        "%(indels_cmd)s"
        "%(exclude_cmd)s"
        "setGT.py | "
        "%(annotation_drop)s"
        "bcftools view -c 1 -a -Ou | "
        "bcftools filter -e 'GT=\\\"het\\\"' -S . | "
        "bcftools view -i 'F_PASS(GT!=\\\"mis\\\")>%(site_missing)s' | "
        "bcftools view -c 1 | "
        "bcftools +fill-tags | "
        "bcftools view -e 'AF==1 || AF==0' | "
        "bcftools norm -f %(ref)s" % vars(args))
    if args.keep_indels:
        args.final_file = "%(prefix)s.filtered.vcf.gz" % vars(args)
    else:
        args.final_file = "%(prefix)s.filtered_no_indels.vcf.gz" % vars(args)

    # {1} is the region, {2} the filename-safe label from window_cmd.
    fm.run_cmd(
        "%(window_cmd)s | parallel -j %(threads)s --col-sep \" \" \"bcftools view  %(filename)s -r {1} | %(filter_cmd)s > %(prefix)s.{2}.tmp.txt\""
        % vars(args))
    fm.run_cmd(
        "bcftools concat -Oz -o %(final_file)s `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".tmp.txt\"}'`"
        % vars(args))
    fm.run_cmd(
        "rm `%(window_cmd)s | awk '{print \"%(prefix)s.\"$2\".tmp.txt*\"}'`" %
        vars(args))
Esempio n. 17
0
def main(args):
    """Collate per-sample QC (depth, kraken2, tb-profiler) and VCF stats.

    Runs outstanding depth/kraken jobs in parallel, then writes
    <out>.sample_info.csv with per-sample metrics and
    <out>.variant_info.csv with per-variant annotations.
    """
    # BUG FIX: the sample list was previously rebuilt (re-opening and
    # re-reading the whole file) once per line of the samples file.
    samples = [x.rstrip() for x in open(args.samples).readlines()]

    for s in samples:
        fm.filecheck("per_sample/%s%s" % (s, args.alignment_extension))

    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        # BUG FIX: the directory path itself was previously executed as a
        # shell command; per the nofolder() guard the intent is to create it.
        fm.run_cmd("mkdir -p %(dir)s/kraken" % vars(args))

    args.cmd_file = fm.get_random_file()
    with open(args.cmd_file, "w") as O:
        for s in samples:
            args.sample = s

            # Median depth job, skipped when the result already exists.
            if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" %
                         vars(args)):
                O.write(
                    "printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n"
                    % (s, args.dir, s, args.alignment_extension, args.dir, s))

            # Kraken2 job, completion tracked via a .done sentinel file.
            if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                O.write(
                    "kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n"
                    % vars(args))

    fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" %
               vars(args),
               verbose=2)
    fm.rm_files([args.cmd_file])

    sample_metrics = []
    for s in samples:
        res = {"sample": s}
        args.sample = s
        # Read counts / mapping % from fixed line positions of the
        # .bqsr.bamstats file (lines 2-4, samtools-flagstat-like layout).
        for i, l in enumerate(
                open("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" %
                     vars(args))):
            row = l.rstrip().split()
            if i in [2, 3]:
                res[row[3]] = int(row[0])
            elif i == 4:
                res[row[3]] = int(row[0])
                res["mapped_percent"] = float(row[4].replace("(", "").replace(
                    "%", ""))
            else:
                pass

        # Keep, per taxonomic rank code (column 4), the entry with the
        # highest percentage (column 1).
        kraken_results = {}
        for l in open("%(dir)s/kraken/%(sample)s.report.txt" % vars(args)):
            row = l.strip().split()
            if row[3] not in kraken_results:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
            if float(row[0]) > kraken_results[row[3]][0]:
                kraken_results[row[3]] = (float(row[0]), " ".join(row[5:]))
        res["kraken_genus"] = "%s (%.2f)" % (kraken_results["G"][1],
                                             kraken_results["G"][0])
        res["kraken_genus1"] = "%s (%.2f)" % (kraken_results["G1"][1],
                                              kraken_results["G1"][0])
        res["kraken_species"] = "%s (%.2f)" % (kraken_results["S"][1],
                                               kraken_results["S"][0])

        tbprofiler_result = json.load(
            open("%(dir)s/tbprofiler/results/%(sample)s.results.json" %
                 vars(args)))
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append(
                    "%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        for d in drugs:
            res[d] = ", ".join(tmp_drugs[d])

        sample_metrics.append(res)

    with open(args.out + ".sample_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_metrics[0]))
        writer.writeheader()
        writer.writerows(sample_metrics)

    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd(
            "bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt"
            % (vars(args)))

    vcf_stats = vcf.load_stats()

    # NOTE(review): `results` and `snp_results` are built but never used
    # below — possibly leftovers from an earlier report format; kept because
    # the dict lookups also validate that load_stats() returned these keys.
    results = {
        "number of samples": vcf_stats["number of samples"],
        "number of records": vcf_stats["number of records"],
        "number of SNPs": vcf_stats["number of SNPs"],
        "number of indels": vcf_stats["number of indels"],
    }

    snp_results = []
    # Consequence annotation: SNPs first, indels appended to the same file.
    if fm.nofile(args.vcf + ".csq_info.txt"):
        fm.run_cmd(
            "bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt"
            % vars(args))
        fm.run_cmd(
            "bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt"
            % vars(args))

    variant_info = vcf.get_variant_data(args.ref, args.gff)
    with open(args.out + ".variant_info.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_info[0]))
        writer.writeheader()
        writer.writerows(variant_info)
Esempio n. 18
0
def main(args):
    """Demultiplex, map, QC and variant-call an amplicon sequencing run.

    Per sample: fastqc, optional trimming, bwa mapping, coverage metrics and
    freebayes/gatk calling.  Cohort (unless --per-sample-only): naive
    re-genotyping over the union of per-sample calls, csq annotation,
    formatted genotype tables and a per-position depth report over the bed.
    """

    samples = []
    reader = csv.DictReader(open(args.index_file))
    # Re-open with utf-8-sig when a BOM hides the "sample" header.
    if "sample" not in reader.fieldnames:
        reader = csv.DictReader(open(args.index_file, encoding='utf-8-sig'))
    for row in reader:
        if row["sample"] == "": continue
        samples.append(row["sample"])

    fm.bwa_index(args.ref)
    fm.create_seq_dict(args.ref)
    fm.faidx(args.ref)
    cmd = "demultiplex_fastq.py --R1 %(read1)s --R2 %(read2)s --index %(index_file)s" % vars(
        args)
    if args.search_flipped_index:
        cmd += " --search-flipped-index"
    run_cmd(cmd)

    for sample in samples:
        args.sample = sample
        run_cmd("fastqc %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz" %
                vars(args))
        if args.trim:
            run_cmd(
                "trimmomatic PE %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz %(sample)s_1.trimmed.fastq.gz %(sample)s_1.untrimmed.fastq.gz %(sample)s_2.trimmed.fastq.gz %(sample)s_2.untrimmed.fastq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:%(trim_qv)s MINLEN:36 2> %(sample)s.trimlog"
                % vars(args))
            run_cmd(
                "bwa mem -t 10 -R \"@RG\\tID:%(sample_prefix)s%(sample)s\\tSM:%(sample_prefix)s%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.trimmed.fastq.gz %(sample)s_2.trimmed.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -"
                % vars(args))
        else:
            run_cmd(
                "bwa mem -t 10 -R \"@RG\\tID:%(sample_prefix)s%(sample)s\\tSM:%(sample_prefix)s%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -"
                % vars(args))

        run_cmd("samtools index %(sample)s.bam" % vars(args))
        run_cmd("samtools flagstat %(sample)s.bam > %(sample)s.flagstat.txt" %
                vars(args))
        run_cmd(
            "mosdepth -x -b %(bed)s %(sample)s --thresholds 1,10,20,30  %(sample)s.bam"
            % vars(args))
        run_cmd(
            "bedtools coverage -a %(bed)s -b %(sample)s.bam -mean > %(sample)s_region_coverage.txt"
            % vars(args))
        run_cmd(
            "sambamba depth base %(sample)s.bam > %(sample)s.coverage.txt" %
            vars(args))
        run_cmd(
            "freebayes -f %(ref)s -t %(bed)s %(sample)s.bam --haplotype-length -1> %(sample)s.freebayes.vcf"
            % vars(args))
        run_cmd(
            "gatk HaplotypeCaller -R %(ref)s -L %(bed)s  -I %(sample)s.bam -O %(sample)s.gatk.vcf"
            % vars(args))

    if not args.per_sample_only:
        with open("vcf_list.txt", "w") as O:
            for s in samples:
                # BUG FIX: the sample name and newlines were missing from
                # these writes, so the file previously contained the literal
                # text "%s.freebayes.vcf%s.gatk.vcf" repeated on one line.
                O.write("%s.freebayes.vcf\n" % s)
                O.write("%s.gatk.vcf\n" % s)
        for sample in samples:
            args.sample = sample
            run_cmd(
                "naive_variant_caller.py --ref %(ref)s --bam %(sample)s.bam --sample %(sample)s --min-af %(min_sample_af)s --vcf-file-list vcf_list.txt | bcftools view -Oz -o %(sample)s.vcf.gz"
                % vars(args))
            run_cmd("tabix -f %(sample)s.vcf.gz" % vars(args))
        # vcf_list.txt is reused: now it lists the per-sample merged VCFs.
        with open("vcf_list.txt", "w") as O:
            for s in samples:
                O.write("%s.vcf.gz\n" % (s))
        run_cmd("bcftools merge -l vcf_list.txt -Oz -o combined.vcf.gz")
        run_cmd(
            r"bcftools query -f '%CHROM\t%POS[\t%DP]\n' combined.vcf.gz > tmp.txt"
        )

        run_cmd(
            "bcftools filter -i 'FMT/DP>10' -S . combined.vcf.gz | bcftools sort -Oz -o tmp.vcf.gz"
            % vars(args))
        run_cmd(
            "bcftools view -v snps tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o snps.vcf.gz"
            % vars(args))
        run_cmd("tabix snps.vcf.gz" % vars(args))
        run_cmd(
            "bcftools view -v indels tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o indels.vcf.gz"
            % vars(args))
        run_cmd("tabix indels.vcf.gz" % vars(args))
        run_cmd(
            r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt"
        )
        run_cmd(
            r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.snps.trans.txt"
        )
        run_cmd(
            r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.indels.txt"
        )
        run_cmd(
            r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.indels.trans.txt"
        )

        bedlines = []
        amplicon_positions = []
        for l in open(args.bed):
            row = l.strip().split()
            bedlines.append(row)
            for p in range(int(row[1]), int(row[2])):
                amplicon_positions.append((row[0], p))

        def overlap_bedlines(a, bedlines):
            # Return [chrom, start, end] intersections of interval `a` with
            # every bed line on the same chromosome.
            overlaps = []
            for b in bedlines:
                if b[0] == a[0]:
                    overlap = max(
                        0,
                        min(int(a[2]), int(b[2])) - max(int(a[1]), int(b[1])))
                    if overlap > 0:
                        overlaps.append([
                            b[0],
                            max(int(a[1]), int(b[1])),
                            min(int(a[2]), int(b[2]))
                        ])
            return overlaps

        dp = defaultdict(dict)
        for s in samples:
            # Per-base depth bed written by mosdepth above:
            # chrom, start, end, depth.
            for l in gzip.open(f"{s}.per-base.bed.gz"):
                row = l.decode().strip().split()
                overlaps = overlap_bedlines(row, bedlines)
                if len(overlaps) > 0:
                    for overlap in overlaps:
                        for pos in range(int(overlap[1]), int(overlap[2])):
                            dp[s][(row[0], pos)] = int(row[3])

        pos_info = {}
        for l in open(args.position_info):
            row = l.strip().split()
            pos_info[(row[0], int(row[1]))] = (row[2], row[3])

        with open("depth_info.txt", "w") as O:
            O.write("chrom\tpos\tgene\tcsq\t%s\n" % "\t".join(samples))
            for chrom, pos in amplicon_positions:
                if (chrom, pos) in pos_info:
                    d = pos_info[(chrom, pos)]
                    O.write(
                        "%s\t%s\t%s\t%s\t%s\n" %
                        (chrom, pos, d[0], d[1], "\t".join(
                            [str(dp[s].get((chrom, pos), 0))
                             for s in samples])))
Esempio n. 19
0
def main(args):
    """Call variants with the chosen caller and emit a masked consensus.

    Supports gatk / bcftools / freebayes (optionally restricted to a bed),
    builds a low-depth mask bed, and writes <out>.consensus.fa with one
    renamed record per region (or per reference sequence without a bed).
    """
    args.region_arg = ""

    if args.variant_caller == "gatk":
        if args.bed:
            args.region_arg = "-L %s" % args.bed
        fm.run_cmd(
            "gatk HaplotypeCaller -R %(ref)s %(region_arg)s -I %(bam)s -O %(out)s.vcf.gz"
            % vars(args))
    elif args.variant_caller == "bcftools":
        if args.bed:
            args.region_arg = "-R %s" % args.bed
        fm.run_cmd(
            "bcftools mpileup -f %(ref)s %(region_arg)s %(bam)s | bcftools call -mv -Oz -o %(out)s.vcf.gz"
            % vars(args))
    elif args.variant_caller == "freebayes":
        if args.bed:
            args.region_arg = "-t %s" % args.bed
        fm.run_cmd(
            "freebayes -f %(ref)s %(region_arg)s %(bam)s | bgzip -c > %(out)s.vcf.gz"
            % vars(args))
    else:
        quit("Unknown variant caller! Exiting!")

    fm.run_cmd("tabix -f %(out)s.vcf.gz" % vars(args))

    # Positions below the depth cutoff become a bed mask (0-based intervals).
    if args.bed:
        fm.run_cmd(
            "bedtools coverage -a %(bed)s -b %(bam)s -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2+$(NF-1)-2\"\\t\"$2+$(NF-1)-1}' > %(out)s.depth_mask.bed"
            % vars(args))
    else:
        fm.run_cmd(
            "bedtools genomecov -ibam %(bam)s  -d | awk '$NF<%(depth_cutoff)s {print $1\"\\t\"$2-1\"\\t\"$2}' > %(out)s.depth_mask.bed"
            % vars(args))

    # wc prints exactly one line, so num_lines is bound after this loop.
    for l in fm.cmd_out("wc -l %(out)s.depth_mask.bed" % vars(args)):
        num_lines = int(l.strip().split()[0])

    # Only pass a mask to bcftools consensus if the mask bed is non-empty.
    args.mask_arg = "-m %(out)s.depth_mask.bed -M N" % vars(
        args) if num_lines > 0 else ""

    region_names = {}
    if args.bed:
        # One "chrom:start-end" region per bed line; column 4 (if present)
        # supplies a friendly name used in the output fasta headers.
        regions_file = args.out + ".regions.txt"
        with open(regions_file, "w") as O:
            for l in open(args.bed):
                row = l.strip().split()
                r = "%s:%s-%s" % (row[0], row[1], row[2])
                O.write(r + "\n")
                if len(row) > 3:
                    region_names[r] = row[3]

        args.region_arg = "-r %s" % regions_file
        consensus_cmd = "samtools faidx %(ref)s %(region_arg)s | bcftools consensus %(out)s.vcf.gz %(mask_arg)s" % vars(
            args)
    else:
        consensus_cmd = "bcftools consensus -f %(ref)s %(out)s.vcf.gz %(mask_arg)s" % vars(
            args)

    # Rewrite fasta headers as "<out> <region name>" while streaming.
    with open(args.out + ".consensus.fa", "w") as O:
        for l in fm.cmd_out(consensus_cmd):
            if l[0] == ">":
                r = l.strip()[1:]
                O.write(">%s %s\n" % (args.out, region_names.get(r, r)))
            else:
                O.write(l + "\n")
Esempio n. 20
0
def main(args):
    """Per-sample naive variant calling, merging, filtering and annotation.

    Reads sample names from the index CSVs, runs naive_variant_caller.py on
    each sample's BAM, merges the per-sample VCFs, splits them into SNPs and
    indels with bcftools csq annotation, and writes flattened genotype tables
    plus a per-position depth summary (depth_info.txt) restricted to the
    amplicon regions in args.bed.
    """
    # Collect sample names from every index CSV; duplicates are warned about
    # but still appended (downstream files are keyed by sample name).
    samples = []
    for f in args.index_files:
        for row in csv.DictReader(open(f)):
            if row["sample"] in samples:
                sys.stderr.write(
                    f"Warning! You have a duplicate sample name: {row['sample']}\n"
                )
            samples.append(row["sample"])

    # File listing the per-sample VCFs that naive_variant_caller.py consumes.
    with open("vcf_files.txt", "w") as O:
        for s in samples:
            O.write(f"{s}.freebayes.vcf\n")
            O.write(f"{s}.gatk.vcf\n")
    # Call variants per sample, compress and index each resulting VCF.
    for sample in samples:
        args.sample = sample
        run_cmd(
            "naive_variant_caller.py --ref %(ref)s --bam %(sample)s.bam --sample %(sample)s --min-af %(min_sample_af)s --vcf-file-list vcf_files.txt | bcftools view -Oz -o %(sample)s.vcf.gz"
            % vars(args))
        run_cmd("tabix -f %(sample)s.vcf.gz" % vars(args))
    with open("vcf_list.txt", "w") as O:
        for s in samples:
            O.write("%s.vcf.gz\n" % (s))
    run_cmd("bcftools merge -l vcf_list.txt -Oz -o combined.vcf.gz")

    # Set genotypes with FMT/DP <= 10 to missing ("-S ."), then sort.
    run_cmd(
        "bcftools filter -i 'FMT/DP>10' -S . combined.vcf.gz | bcftools sort -Oz -o tmp.vcf.gz"
        % vars(args))
    # Split into SNPs / indels, each annotated with bcftools csq.
    run_cmd(
        "bcftools view -v snps tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o snps.vcf.gz"
        % vars(args))
    # NOTE(review): the "%" below is a no-op (no placeholders in the string).
    run_cmd("tabix snps.vcf.gz" % vars(args))
    run_cmd(
        "bcftools view -v indels tmp.vcf.gz | bcftools csq -p a -f %(ref)s -g %(gff)s -Oz -o indels.vcf.gz"
        % vars(args))
    run_cmd("tabix indels.vcf.gz" % vars(args))
    # Flatten per-sample genotypes (with and without consequence column).
    run_cmd(
        r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt"
    )
    run_cmd(
        r"bcftools query snps.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.snps.trans.txt"
    )
    run_cmd(
        r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.indels.txt"
    )
    run_cmd(
        r"bcftools query indels.vcf.gz -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\t%TBCSQ\n]' > combined_genotyped_filtered_formatted.indels.trans.txt"
    )

    # Expand each BED amplicon into individual positions
    # (BED coordinates are 0-based, end-exclusive).
    bedlines = []
    amplicon_positions = []
    for l in open(args.bed):
        row = l.strip().split()
        bedlines.append(row)
        for p in range(int(row[1]), int(row[2])):
            amplicon_positions.append((row[0], p))

    def overlap_bedlines(a, bedlines):
        """Return [chrom, start, end] intersections of interval `a` with each
        entry of `bedlines` on the same chromosome (empty list if none)."""
        overlaps = []
        for b in bedlines:
            if b[0] == a[0]:
                overlap = max(
                    0,
                    min(int(a[2]), int(b[2])) - max(int(a[1]), int(b[1])))
                if overlap > 0:
                    overlaps.append([
                        b[0],
                        max(int(a[1]), int(b[1])),
                        min(int(a[2]), int(b[2]))
                    ])
        return overlaps

    # Per-sample depth lookup: dp[sample][(chrom, pos)] -> depth, filled from
    # the per-base coverage BED (presumably mosdepth output — verify naming).
    dp = defaultdict(dict)
    for s in samples:
        for l in gzip.open(f"{s}.per-base.bed.gz"):
            row = l.decode().strip().split()
            overlaps = overlap_bedlines(row, bedlines)
            if len(overlaps) > 0:
                for overlap in overlaps:
                    for pos in range(int(overlap[1]), int(overlap[2])):
                        dp[s][(row[0], pos)] = int(row[3])

    # Position metadata: (chrom, pos) -> (gene, consequence).
    pos_info = {}
    for l in open(args.position_info):
        row = l.strip().split()
        pos_info[(row[0], int(row[1]))] = (row[2], row[3])

    # One row per annotated amplicon position, one depth column per sample
    # (missing positions default to depth 0).
    with open("depth_info.txt", "w") as O:
        O.write("chrom\tpos\tgene\tcsq\t%s\n" % "\t".join(samples))
        for chrom, pos in amplicon_positions:
            if (chrom, pos) in pos_info:
                d = pos_info[(chrom, pos)]
                O.write("%s\t%s\t%s\t%s\t%s\n" %
                        (chrom, pos, d[0], d[1], "\t".join(
                            [str(dp[s].get((chrom, pos), 0))
                             for s in samples])))
Esempio n. 21
0
def convert_to_cram(bam_file, ref_file, threads):
    """Convert a BAM to CRAM, index the CRAM, and delete the original BAM.

    Args:
        bam_file: Path to the input BAM (expected to end in ".bam").
        ref_file: Reference FASTA used for CRAM reference-based compression.
        threads: Thread count passed to `samtools view -@`.
    """
    # Strip only a trailing ".bam": str.replace() would rewrite the first
    # occurrence anywhere in the path (e.g. a directory named "my.bam_runs").
    if bam_file.endswith(".bam"):
        cram_file = bam_file[:-len(".bam")] + ".cram"
    else:
        cram_file = bam_file + ".cram"
    fm.run_cmd("samtools view -@ %s -C %s -o %s -T %s" %
               (threads, bam_file, cram_file, ref_file))
    fm.run_cmd("samtools index %s" % cram_file)
    # Remove the BAM and its index only after the CRAM has been written.
    fm.run_cmd("rm %s %s.bai" % (bam_file, bam_file))
Esempio n. 22
0
def main_map(args):
    """Map reads with bwa mem, mark duplicates, and optionally apply GATK BQSR.

    Produces <prefix>.mkdup.bam (+ index and flagstat) and, when args.bqsr_vcf
    is set, <prefix>.bqsr.bam (+ index and flagstat), deleting the intermediate
    mkdup BAM. Steps already completed (per get_step_num) are skipped unless
    args.redo is set.
    """
    args.step = get_step_num(args.prefix)

    # Choose the read argument(s) for bwa based on whether reads were trimmed
    # upstream and whether the run is single- or paired-end.
    if "trimmed" in vars(args) and args.single:
        args.reads = "%(prefix)s_trimmed.fq" % vars(args)
    elif "trimmed" in vars(args) and not args.single:
        args.reads = "%(prefix)s_1P %(prefix)s_2P" % vars(args)
    elif "trimmed" not in vars(args) and args.single:
        # Bugfix: single-end runs must map read1 only — this branch and the
        # paired-end one below were previously swapped.
        args.reads = "%(read1)s" % vars(args)
    elif "trimmed" not in vars(args) and not args.single:
        args.reads = "%(read1)s %(read2)s" % vars(args)
    if args.redo or args.step < 1:
        # Map, fixmate, sort and mark duplicates in a single pipe.
        fm.run_cmd(
            "bwa mem -t %(threads)s -R \"@RG\\tID:%(prefix)s\\tSM:%(prefix)s\\tPL:Illumina\" %(ref)s %(reads)s | samtools view -@ %(threads)s -b - | samtools fixmate -@ %(threads)s -m - - | samtools sort -@ %(threads)s - | samtools markdup -@ %(threads)s - %(prefix)s.mkdup.bam -"
            % vars(args))
        # Trimmed intermediates are no longer needed once mapped.
        if "trimmed" in vars(args) and args.single:
            fm.run_cmd("rm %(reads)s" % vars(args))
        if "trimmed" in vars(args) and not args.single:
            fm.run_cmd(
                "rm %(prefix)s_1P %(prefix)s_2P %(prefix)s_1U %(prefix)s_2U" %
                vars(args))
        fm.run_cmd("samtools index -@ %(threads)s %(prefix)s.mkdup.bam" %
                   vars(args))
        fm.run_cmd(
            "samtools flagstat -@ %(threads)s %(prefix)s.mkdup.bam > %(prefix)s.mkdup.bamstats"
            % vars(args))
    if args.bqsr_vcf and (args.redo or args.step < 2):
        # Ensure every known-sites VCF is tabix-indexed, then recalibrate.
        for vcf in args.bqsr_vcf.split(","):
            fm.tabix_vcf(vcf)
        args.bqsr_vcf = " ".join(
            ["--known-sites %s" % s for s in args.bqsr_vcf.split(",")])
        fm.run_cmd(
            "gatk BaseRecalibrator -R %(ref)s -I %(prefix)s.mkdup.bam %(bqsr_vcf)s -O %(prefix)s.recal_data.table"
            % vars(args))
        fm.run_cmd(
            "gatk ApplyBQSR -R %(ref)s -I %(prefix)s.mkdup.bam --bqsr-recal-file %(prefix)s.recal_data.table -O %(prefix)s.bqsr.bam"
            % vars(args))
        fm.run_cmd("samtools index -@ %(threads)s %(prefix)s.bqsr.bam" %
                   vars(args))
        fm.run_cmd(
            "samtools flagstat -@ %(threads)s %(prefix)s.bqsr.bam > %(prefix)s.bqsr.bamstats"
            % vars(args))
        # The recalibrated BAM supersedes the mkdup BAM.
        fm.run_cmd("rm %(prefix)s.mkdup.bam*" % vars(args))
Esempio n. 23
0
def main_import(args):
    """Import per-sample gVCFs into per-region GATK GenomicsDB workspaces.

    Reads sample names from args.sample_file, validates each sample's gVCF
    (logging missing/invalid samples to <prefix>.failed_samples.log), writes a
    sample->VCF map file, ensures the reference .dict/.fai indexes exist, then
    either creates new GenomicsDB workspaces (first run) or updates existing
    ones (subsequent runs, detected via <prefix>.dbconf.json).
    """
    params = vars(args)
    params["map_file"] = f"{args.prefix}.map"

    # Context managers ensure both files are closed even on error
    # (the log file was previously left open).
    with open("%s.failed_samples.log" % args.prefix, "w") as FAILED_SAMPLES, \
         open(params["map_file"], "w") as O:
        # Set up list to hold sample names
        samples = []
        # Loop through sample-file and do (1) append samples to list, (2) write sample to map file and (3) check for VCF index
        for line in open(args.sample_file):
            sample = line.rstrip()
            vcf_file = f"{args.vcf_dir}/{sample}{args.vcf_extension}"
            sys.stderr.write(f"Looking for {vcf_file}")
            if os.path.isfile(vcf_file):
                sys.stderr.write("...OK\n")
            else:
                sys.stderr.write("...Not found...skipping\n")
                continue
            # filecheck(vcf_file)
            if args.ignore_missing and nofile(vcf_file):
                FAILED_SAMPLES.write("%s\tno_file\n" % sample)
                continue
            # Validate the gVCF once; a ".validated" marker file caches success.
            if nofile(f"{vcf_file}.validated"):
                if nofile(f"{vcf_file}.tbi"):
                    run_cmd(f"tabix {vcf_file}")
                run_cmd(f"gatk ValidateVariants -R {args.ref} -V {vcf_file} -gvcf && touch {vcf_file}.validated")
                if nofile(f"{vcf_file}.validated"):
                    FAILED_SAMPLES.write("%s\tno_validation\n" % sample)
                    continue
            samples.append(sample)
            O.write("%s\t%s\n" % (sample, vcf_file))
            if nofile(f"{vcf_file}.tbi"):
                run_cmd(f"bcftools index --tbi {vcf_file}")
    # Create .dict file (GATK fasta index) for the reference if absent
    if nofile("%s.dict" % args.ref.replace(".fasta","").replace(".fa","")):
        run_cmd("gatk CreateSequenceDictionary -R %(ref)s" % params)
    # Create .fai file (SAMtools fasta index) for the reference if absent
    if nofile("%s.fai" % args.ref.replace(".fasta","").replace(".fa","")):
        run_cmd("samtools faidx %(ref)s" % params)

    # Emits "<chrom:start-end> <chrom_start_end>" pairs, one per genome chunk.
    window_cmd = "bedtools makewindows -n %(num_genome_chunks)s -g %(ref)s.fai | awk '{print $1\":\"$2+1\"-\"$3\" \"$1\"_\"$2+1\"_\"$3}'" % params
    if nofile("%(prefix)s.dbconf.json" % params):
        # First run: create one GenomicsDB workspace per chunk in parallel.
        import_cmd = "gatk GenomicsDBImport --genomicsdb-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
        json.dump({"num_genome_chunks":args.num_genome_chunks},open("%(prefix)s.dbconf.json" % params,"w"))
    else:
        # Subsequent runs: verify every expected workspace exists, then update.
        # NOTE(review): conf is loaded (validating the JSON exists and parses)
        # but its num_genome_chunks is not fed back into window_cmd — confirm
        # the chunk count cannot change between runs.
        conf = json.load(open(filecheck(f"{args.prefix}.dbconf.json")))
        for l in cmd_out(window_cmd):
            row = l.strip().split()
            dirname = "%s_%s_genomics_db" % (args.prefix,row[1])
            sys.stderr.write("Looking for directory named %s..." % dirname)
            foldercheck(dirname)
            sys.stderr.write("OK\n")
        import_cmd = "gatk GenomicsDBImport --genomicsdb-update-workspace-path %(prefix)s_{2}_genomics_db -L {1} --sample-name-map %(map_file)s --reader-threads %(threads)s --batch-size 500" % params
        run_cmd(f"{window_cmd} | parallel --bar -j {args.threads} --col-sep \" \" {import_cmd}", verbose=2)
def main(args):
    """Detect convergent (homoplasic) SNP sites on a phylogeny.

    Builds — or reuses — an IQ-TREE ancestral state reconstruction (ASR) for
    the SNP alignment derived from args.vcf, then reports every site whose
    nucleotide state arose independently on more than one branch. Writes one
    line per convergent site to args.out: position<TAB>number of origins.
    """

    vcf_class = fm.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    # No alignment supplied: derive a SNP fasta from the VCF (requires --ref).
    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            quit()
        fm.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" %
            vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    # Run the ASR only if its state file does not already exist.
    if fm.nofile("%s.asr.state" % args.fasta):
        fm.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # Keep only the bare node label; IQ-TREE can append values after "/"
    # (presumably support values — verify against the treefile).
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # states[site][node_name] -> reconstructed nucleotide at that site.
    # Internal nodes come from the IQ-TREE .state table (1-based site index).
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#": continue
        row = l.strip().split()
        if row[0] == "Node": continue
        site = int(row[1])
        if row[0] not in internal_node_names: continue
        states[site][row[0]] = row[2]

    # Leaf states come straight from the alignment (site is 1-based, hence -1).
    seqs = fm.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        # Invariant sites cannot be convergent.
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1: continue

        # Set up storage objects
        origins = []

        # Annotate every node with its state; a node whose state differs from
        # its parent's (both unambiguous ACGT) marks an independent origin.
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree: continue
            node_state = states[site][n.name]
            if node_state != n.get_ancestors(
            )[0].state and node_state in acgt and n.get_ancestors(
            )[0].state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        # More than one independent origin => convergent evolution at this site.
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            # site[1][1]: presumably the genomic position from vcf_positions
            # (chrom, pos) — confirm against fm.vcf.get_positions().
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
def main(args):
    """VQSR-based filtering pipeline for a merged, genotyped VCF.

    Runs GATK VariantRecalibrator/ApplyVQSR for SNPs and indels, filters by
    VQSLOD, removes high-missingness samples with plink, normalises mixed
    genotypes (setGT.py), keeps biallelic SNPs and adds bcftools CSQ
    annotation. Final output:
    <prefix>.csq.bi.GT.miss<miss>.vqslod.filt.snps.vcf.gz
    """
    params = {"threads": args.threads, "prefix": args.prefix, "ref": args.ref, "map_file": f"{args.prefix}.map", "merged_file": args.merged_file, "include": args.include_regions, "vqslod": args.vqslod, "miss": args.missing_sample_cutoff, "mix":args.cutoff_mix_GT, "gff_file": args.gff_file}

    # Optionally restrict the merged VCF to the supplied regions first.
    if args.include_regions:
        if not os.path.isfile("%(merged_file)s.tbi" % params):
            run_cmd("bcftools index -t %(merged_file)s" % params)
        params["vcf_in"] = params["merged_file"].replace(".genotyped.vcf.gz",".in.genotyped.vcf.gz")
        run_cmd("bcftools view -R %(include)s -O z -o %(vcf_in)s %(merged_file)s" % params)
        run_cmd("bcftools index -t %(vcf_in)s" % params)
        params["merged_file"] = params["vcf_in"]
    # Make sure GATK's sequence dictionary and all resource/input indexes exist.
    if not os.path.isfile(args.ref.replace(".fasta",".dict")):
        run_cmd("gatk CreateSequenceDictionary -R %s" % args.ref)
    for s in args.bqsr_vcf.split(","):
        if not os.path.isfile(s + ".tbi"):
            run_cmd("bcftools index -t %s" % s)
    # NOTE(review): re-checks the (possibly region-subset) merged file index;
    # redundant when include_regions already indexed it above.
    if not os.path.isfile("%(merged_file)s.tbi" % params):
            run_cmd("bcftools index -t %(merged_file)s" % params)
    params["bqsr_vcf_mer"] = " ".join(["--resource:pf_crosses,known=false,training=true,truth=true,prior=15.0 %s " % s for s in args.bqsr_vcf.split(",")])
    params["output"] = params["merged_file"].replace(".genotyped.vcf.gz",".recal")
    ## Calculating calibration model (separately for SNPs and indels)
    run_cmd("gatk VariantRecalibrator -R %(ref)s -V %(merged_file)s %(bqsr_vcf_mer)s -an QD -an FS -an SOR -an DP --max-gaussians 8 --mq-cap-for-logit-jitter-transform 70 -mode SNP -O %(prefix)s.snps.recal --tranches-file %(prefix)s.snps.tranches --rscript-file %(prefix)s.snps.plots.R" % params)
    run_cmd("gatk VariantRecalibrator -R %(ref)s -V %(merged_file)s %(bqsr_vcf_mer)s -an QD -an DP -an SOR -an FS --max-gaussians 4 --mq-cap-for-logit-jitter-transform 70 -mode INDEL -O %(prefix)s.indel.recal --tranches-file %(prefix)s.indel.tranches --rscript-file %(prefix)s.indel.plots.R" % params)
    ## Applying calibration model and obtaining VQSLOD
    run_cmd("gatk ApplyVQSR -R %(ref)s -V %(merged_file)s -O %(prefix)s.vqslod.snps.vcf.gz --truth-sensitivity-filter-level 99.0 --tranches-file %(prefix)s.snps.tranches --recal-file %(prefix)s.snps.recal -mode SNP" % params)
    run_cmd("gatk ApplyVQSR -R %(ref)s -V %(merged_file)s -O %(prefix)s.vqslod.indel.vcf.gz --truth-sensitivity-filter-level 99.0 --tranches-file %(prefix)s.indel.tranches --recal-file %(prefix)s.indel.recal -mode INDEL" % params)
    ## Filtering based on VQSLOD (keep records above the cutoff)
    run_cmd("bcftools view -i 'VQSLOD>%(vqslod)s' -O z -o %(prefix)s.vqslod.filt.snps.vcf.gz  %(prefix)s.vqslod.snps.vcf.gz" % params)
    run_cmd("bcftools view -i 'VQSLOD>%(vqslod)s' -O z -o %(prefix)s.vqslod.filt.indel.vcf.gz  %(prefix)s.vqslod.indel.vcf.gz" % params)
    ## Annotating filtered files VQSLOD
    run_cmd("bcftools index -t %(prefix)s.vqslod.filt.snps.vcf.gz" % params)
    run_cmd("bcftools index -t %(prefix)s.vqslod.filt.indel.vcf.gz" % params)
    ## Add sample filtering by missing (skip plink when cutoff is "0")
    if params["miss"] == "0":
        run_cmd("mv %(prefix)s.vqslod.filt.snps.vcf.gz %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    else:
        # plink drops samples above the missingness cutoff; the surviving
        # sample names are scraped from the #CHROM header line into a list.
        run_cmd("plink --vcf %(prefix)s.vqslod.filt.snps.vcf.gz --mind %(miss)s --recode vcf --allow-extra-chr --out %(prefix)s_plink" % params)
        run_cmd("grep -P \"^#CHROM\" %(prefix)s_plink.vcf | awk '{ $1=\"\"; $2=\"\";$3=\"\"; $4=\"\";$5=\"\"; $6=\"\";$7=\"\"; $8=\"\";$9=\"\"; print}' | sed 's/ /\\n/g' | tail -n+10 > %(prefix)s_new" % params)
        run_cmd("bcftools view -S %(prefix)s_new --threads 20 -O z -o  %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz %(prefix)s.vqslod.filt.snps.vcf.gz" % params)
    ## Add set GT (normalise mixed genotypes above the fraction cutoff)
    run_cmd("bcftools view %(prefix)s.miss%(miss)s.vqslod.filt.snps.vcf.gz | setGT.py --fraction %(mix)s | bcftools view -O z -c 1 -o %(prefix)s.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    ## Select only biallelic
    run_cmd("bcftools view -m2 -M2 %(prefix)s.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz --threads 20 -O z -o %(prefix)s.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
    ## Add CSQ annotation
    run_cmd("bcftools csq -p m -f %(ref)s -g %(gff_file)s %(prefix)s.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz -O z -o %(prefix)s.csq.bi.GT.miss%(miss)s.vqslod.filt.snps.vcf.gz" % params)
def main(args):
    """Demultiplex amplicon reads, map each sample with bwa, and jointly
    genotype all samples with freebayes.

    Outputs combined.genotyped.vcf.gz plus flattened per-sample genotype
    (combined_genotyped_filtered_formatted.snps.txt) and per-position depth
    (depth_info.txt) tables.
    """
    samples = []
    reader = csv.DictReader(open(args.index_file))
    # Retry with utf-8-sig in case a BOM swallowed the "sample" header column.
    if "sample" not in reader.fieldnames:
        reader = csv.DictReader(open(args.index_file, encoding='utf-8-sig'))
    for row in reader:
        if row["sample"] == "": continue
        samples.append(row["sample"])

    # Ensure reference indexes exist before mapping.
    fm.bwa_index(args.ref)
    fm.faidx(args.ref)

    # Split the pooled fastqs into per-sample fastqs by index sequence.
    cmd = "demultiplex_fastq.py --R1 %(read1)s --R2 %(read2)s --index %(index_file)s" % vars(
        args)
    if args.search_flipped_index:
        cmd += " --search-flipped-index"
    fm.run_cmd(cmd)

    # Map each sample; samclip removes reads with long soft-clips before sorting.
    for sample in samples:
        args.sample = sample
        fm.run_cmd(
            "bwa mem -t 10 -R \"@RG\\tID:%(sample)s\\tSM:%(sample)s\\tPL:Illumina\" %(ref)s %(sample)s_1.fastq.gz %(sample)s_2.fastq.gz | samclip --ref %(ref)s --max 50 | samtools sort -o %(sample)s.bam -"
            % vars(args))

        fm.run_cmd("samtools index %(sample)s.bam" % vars(args))

    # freebayes -L takes a file listing all BAMs for joint calling.
    with open("bam_list.txt", "w") as O:
        for s in samples:
            O.write("%s.bam\n" % (s))

    # Joint gVCF calling, left-normalisation, and sorting in one pipe.
    fm.run_cmd(
        "freebayes -f %(ref)s -L bam_list.txt --haplotype-length -1 --min-coverage 50 --min-base-quality %(min_base_qual)s  --gvcf --gvcf-dont-use-chunk true | bcftools norm -f %(ref)s | bcftools sort -Oz -o combined.genotyped.vcf.gz"
        % vars(args))
    # Flatten genotypes (variant sites only, -c 1) and per-position depth.
    fm.run_cmd(
        r"bcftools view -c 1 combined.genotyped.vcf.gz | bcftools query -f '[%SAMPLE\t%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%GT\t%TGT\t%DP\t%AD\n]' > combined_genotyped_filtered_formatted.snps.txt"
    )
    fm.run_cmd(
        r"bcftools query -f '%CHROM\t%POS[\t%DP]\n' combined.genotyped.vcf.gz > depth_info.txt"
    )
Esempio n. 27
0
def main(args):
    """Fill missing alleles in a PLINK .bim using a 1000 Genomes reference
    panel and write a cleaned binary fileset to args.out.

    For every variant: alleles coded "0" (missing) or "I"/"D" (indel codes)
    are resolved against the per-chromosome .pvar reference records where
    possible; ambiguous or unresolvable variants are written to an exclusion
    list and removed with plink at the end. Side outputs:
    <out>.exclude.txt and <out>.fill_bim.log.
    """
    chromosomes = list(range(1, 23))
    # chromosomes = [1]
    # Reference records keyed by rsID; pvar columns: chrom, pos, id, ref, alt.
    snp_data = {}
    for f in tqdm([
            "%s/chr%s.1kg.phase3.v5a.pvar" % (args.ref_dir, x)
            for x in chromosomes
    ]):
        for l in open(f):
            if l[0] == "#": continue
            row = l.strip().split()
            snp_data[row[2]] = row

    tmp_prefix = str(uuid.uuid4())
    exclude_file = "%s.exclude.txt" % args.out
    new_bim_file = "%s.bim" % tmp_prefix
    log_file = "%s.fill_bim.log" % args.out

    EXCLUDE = open(exclude_file, "w")
    BIM = open(new_bim_file, "w")
    LOG = open(log_file, "w")

    # .bim columns: chrom, id, cM, pos, allele1 (row[4]), allele2 (row[5]).
    # Every input line is echoed to the new .bim; dropped variants are also
    # written to EXCLUDE so plink removes them afterwards.
    for l in tqdm(open(args.bfile + ".bim")):
        row = l.strip().split()
        rid = row[1]
        ref_snp_data = snp_data[row[1]] if rid in snp_data else None

        # Optionally drop exome-array variants.
        # NOTE(review): log says "starts with exm" but the test is substring
        # "ex" anywhere in the ID — confirm which is intended.
        if args.remove_exm and "ex" in rid:
            LOG.write("%s\tExcluded: Variant starts with exm\n" % row[1])
            BIM.write("\t".join(row) + "\n")
            EXCLUDE.write(row[1] + "\n")
            continue

        if row[4] != "0" and row[5] != "0":
            # Both alleles present: only I/D indel codes need resolving.
            if row[4] != "I" and row[4] != "D":
                LOG.write("%s\tOK: No change\n" % row[1])
                BIM.write("\t".join(row) + "\n")
            elif (row[5] == "I" or row[5] == "D") and ref_snp_data is None:
                LOG.write("%s\tExcluded: Indel not in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif (row[4] == "I"
                  or row[5] == "I") and ("," in ref_snp_data[4]
                                         or "," in ref_snp_data[3]):
                # Multi-allelic reference records cannot be resolved uniquely.
                LOG.write("%s\tExcluded: More than one alt allele in ref\n" %
                          row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif row[5] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[5] == "I" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[4] == "I" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[4] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])

            else:
                # Bugfix: previously dropped into pdb.set_trace() here —
                # abort like the other unhandled fall-through branches.
                quit(row)
        elif row[4] == "0" and row[5] == "0":
            LOG.write("%s\tExcluded: No ref or alt present\n" % row[1])
            BIM.write("\t".join(row) + "\n")
            EXCLUDE.write(row[1] + "\n")
        elif row[4] == "0":
            # Only allele2 known: recover allele1 from the reference panel.
            if not ref_snp_data:
                LOG.write("%s\tExcluded: SNP not present in ref\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif set([ref_snp_data[3], ref_snp_data[4]]) == set([
                    "A", "T"
            ]) or set([ref_snp_data[3], ref_snp_data[4]]) == set(["C", "G"]):
                # A/T and C/G pairs are strand-ambiguous; cannot orient safely.
                LOG.write("%s\tExcluded: Ambiguous ref/alt strand\n" % row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif "," in ref_snp_data[4] or "," in ref_snp_data[3]:
                LOG.write("%s\tExcluded: More than one alt allele in ref\n" %
                          row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            elif row[5] == ref_snp_data[3]:
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tOK: All ref\n" % row[1])
            elif row[5] == ref_snp_data[4]:
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tOK: All alt\n" % row[1])
            # c() is presumably a base-complement helper defined elsewhere in
            # this file — used to detect opposite-strand genotypes.
            elif c(row[5]) == ref_snp_data[3]:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFlipped to be ref\t\n" % row[1])
            elif c(row[5]) == ref_snp_data[4]:
                row[5] = ref_snp_data[4]
                row[4] = ref_snp_data[3]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFlipped to be alt\t\n" % row[1])
            elif row[5] == "I" and len(ref_snp_data[3]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif row[5] == "D" and len(ref_snp_data[4]) > 1:
                row[5] = ref_snp_data[3]
                row[4] = ref_snp_data[4]
                BIM.write("\t".join(row) + "\n")
                LOG.write("%s\tFilled indel to ref\t\n" % row[1])
            elif (row[5] != "I"
                  and row[5] != "D") and len(ref_snp_data[3]) > 1:
                LOG.write("%s\tExcluded: Ref says indel but gt is SNP\n" %
                          row[1])
                BIM.write("\t".join(row) + "\n")
                EXCLUDE.write(row[1] + "\n")
            else:
                quit(row)
        else:
            quit(row)

    EXCLUDE.close()
    BIM.close()
    LOG.close()

    # Pair the untouched .bed/.fam with the rewritten .bim under tmp_prefix,
    # then let plink drop the excluded variants into the final fileset.
    fm.run_cmd("cp %s.bed %s.bed" % (args.bfile, tmp_prefix))
    fm.run_cmd("cp %s.fam %s.fam" % (args.bfile, tmp_prefix))
    fm.run_cmd("plink --bfile %s --exclude %s --make-bed --out %s" %
               (tmp_prefix, exclude_file, args.out))