Ejemplo n.º 1
0
def main(args):
    """Extract the subject region spanned by the two ends of a query gene.

    The first and last ``args.anchor_size`` bases of the first sequence in
    ``args.query`` are each aligned against ``args.subject`` with ``blastn``.
    A status line is written to ``<prefix>.result.txt``:

    * ``NA``         - an anchor produced no hit, or ``args.strict_one_hit``
                       is set and an anchor produced more than one hit;
    * ``OK``         - the first hit of both anchors is on the same subject
                       sequence;
    * ``Fragmented`` - the first hits are on different subject sequences.

    Only for ``OK`` is the sequence between the two anchors written to
    ``<prefix>.extracted_seq.fa``; otherwise the process exits after writing
    the status file.

    Args:
        args: namespace providing ``query``, ``subject``, ``anchor_size``,
            ``strict_one_hit`` and ``prefix``.
    """
    fm.filecheck(args.query)
    fm.filecheck(args.subject)

    ref_gene_seq = list(fm.fasta(args.query).fa_dict.values())[0]
    anchors = (ref_gene_seq[:args.anchor_size],
               ref_gene_seq[-args.anchor_size:])
    # Second argument of parse_blast; presumably a minimum hit length
    # (90% of the anchor) -- TODO confirm against parse_blast.
    hit_threshold = args.anchor_size * 0.9

    tmp_in = fm.get_random_file()
    tmp_out = fm.get_random_file()
    anchor_hits = []
    try:
        # Same BLAST search for the start anchor, then the end anchor.
        for anchor in anchors:
            with open(tmp_in, "w") as O:
                O.write(">tmp\n%s" % anchor)
            fm.run_cmd(
                "blastn -task blastn -query %s -subject %s -outfmt 15 > %s" %
                (tmp_in, args.subject, tmp_out),
                verbose=0)
            anchor_hits.append(parse_blast(tmp_out, hit_threshold))
    finally:
        # Clean up the temporary files even when blastn or parsing fails.
        fm.rm_files([tmp_in, tmp_out])
    start_hits, end_hits = anchor_hits

    if not start_hits or not end_hits:
        # No hit for at least one anchor: nothing to index into below.
        result_type = "NA"
    elif args.strict_one_hit and (len(start_hits) > 1 or len(end_hits) > 1):
        result_type = "NA"
    elif start_hits[0]["subject_seq"] == end_hits[0]["subject_seq"]:
        result_type = "OK"
    else:
        result_type = "Fragmented"

    with open("%s.result.txt" % args.prefix, "w") as O:
        O.write("%s\t%s\n" % (args.prefix, result_type))

    if result_type != "OK":
        quit()

    start_hit = start_hits[0]
    end_hit = end_hits[0]
    print(start_hit, end_hit)

    strands = (start_hit["subject_strand"], end_hit["subject_strand"])
    contig = fm.fasta(args.subject).fa_dict[start_hit["subject_seq"]]
    if strands == ("Plus", "Plus"):
        hit_seq = contig[start_hit["subject_start"] - 1:end_hit["subject_end"]]
    elif strands == ("Minus", "Minus"):
        # On the minus strand the coordinates are reversed, so slice from the
        # end anchor to the start anchor and reverse-complement the result.
        hit_seq = revcom(
            contig[end_hit["subject_end"] - 1:start_hit["subject_start"]])
    else:
        # Previously this fell through and failed with a NameError on hit_seq.
        quit("Anchors map to different strands (%s/%s); cannot extract a "
             "sequence for %s" % (strands[0], strands[1], args.prefix))

    with open("%s.extracted_seq.fa" % args.prefix, "w") as O:
        O.write(">%s\n%s\n" % (args.prefix, hit_seq))
Ejemplo n.º 2
0
def main(args):
    """Merge several BAM files into one BAM carrying a single read group.

    The input BAMs come either from ``args.prefix`` (run names joined by
    ``_``, resolved as ``<dir>/<run><suffix>``) or from ``args.bams`` (a
    comma-separated list of paths). Exactly one of the two must be given.
    The merged file is written to ``<dir>/<new_id><suffix>`` and indexed.

    The header of the first BAM, with its ``@RG`` lines dropped, is used to
    reheader the merged stream; ``samtools addreplacerg`` then attaches one
    read group whose ID and SM are both ``new_id``.

    Args:
        args: namespace providing ``prefix``, ``bams``, ``new_id``, ``dir``,
            ``suffix`` and ``threads``.

    Exits with status 1 when the input selection is invalid or fewer than
    two BAMs are supplied.
    """
    # Validate up front: the original "both given" check sat behind the
    # `if args.prefix` branch and could never trigger.
    if bool(args.prefix) == bool(args.bams):
        sys.stderr.write(
            "Need either '--bams' or '--prefix' (not both)... Exiting!\n")
        sys.exit(1)

    if args.prefix:
        individual_bams = [
            "%s/%s%s" % (args.dir, run, args.suffix)
            for run in args.prefix.split("_")
        ]
        new_id = args.new_id if args.new_id else args.prefix
    else:
        individual_bams = args.bams.split(",")
        new_id = args.new_id if args.new_id else "_".join(
            [bam.split("/")[-1].replace(args.suffix, "")
             for bam in individual_bams])

    if len(individual_bams) < 2:
        sys.stderr.write("Need more than one bam... Exiting!\n")
        sys.exit(1)
    for bam in individual_bams:
        fm.filecheck(bam)

    new_bamfile = "%s/%s%s" % (args.dir, new_id, args.suffix)
    tmp_file = fm.get_random_file()
    try:
        with open(tmp_file, "w") as O:
            for l in fm.cmd_out("samtools view -H %s" % individual_bams[0]):
                row = l.strip().split("\t")
                # Drop existing read groups; addreplacerg adds the new one.
                if row[0] == "@RG":
                    continue
                O.write("%s\n" % "\t".join(row))

        fm.run_cmd("samtools merge -@ %s - %s | samtools reheader -i %s - | samtools addreplacerg -@ %s - -r 'ID:%s\\tSM:%s\\tPL:Illumina' -o %s" % (
            args.threads, " ".join(individual_bams), tmp_file, args.threads, new_id, new_id, new_bamfile)
        )
        fm.run_cmd("samtools index %s" % new_bamfile)
    finally:
        # Remove the temporary header even if a samtools step fails.
        fm.rm_files([tmp_file])
Ejemplo n.º 3
0
 def __init__(self, filename, threads=4):
     """Record the VCF path and load its sample names.

     Creates a ``.csi`` index with ``bcftools index`` when none exists,
     then lists the samples with ``bcftools query -l`` through a temporary
     file and stores them in ``self.samples``.

     Args:
         filename: path to the VCF/BCF file.
         threads: stored on the instance as ``self.threads``; not used by
             this method itself.
     """
     self.samples = []
     self.filename = filename
     self.threads = threads
     self.prefix = get_vcf_prefix(filename)
     if nofile(filename + ".csi"):
         run_cmd("bcftools index  %(filename)s" % vars(self))
     self.temp_file = get_random_file()
     try:
         run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
         # Context manager closes the handle instead of relying on GC.
         with open(self.temp_file) as sample_fh:
             self.samples = [line.rstrip() for line in sample_fh]
     finally:
         # Remove the temporary file even when the query or read fails.
         if os.path.exists(self.temp_file):
             os.remove(self.temp_file)
Ejemplo n.º 4
0
def main(args):
    """Convert a multi-sample VCF into FASTA.

    With ``args.snps`` set, the work is delegated to
    ``vcf_class.vcf_to_fasta``. Otherwise ``bcftools consensus`` is run once
    per sample through GNU ``parallel``, each consensus header is rewritten
    to the sample name, and the per-sample files are concatenated into
    ``<vcf prefix>.fa`` before the intermediates are deleted.

    Args:
        args: namespace providing ``vcf``, ``ref``, ``snps``,
            ``snps_no_filt`` and ``threads``. ``args.sample_file`` is set
            here because the command template is filled from ``vars(args)``.
    """
    vcf_obj = vcf_class(args.vcf)
    if args.snps:
        vcf_obj.vcf_to_fasta(args.ref, nofilt=args.snps_no_filt)
        return

    args.sample_file = get_random_file()
    # Close the file explicitly so the list is flushed to disk before the
    # shell pipeline below reads it.
    with open(args.sample_file, "w") as sample_fh:
        sample_fh.write("\n".join(vcf_obj.samples) + "\n")

    run_cmd(
        'cat %(sample_file)s | parallel  --bar -j %(threads)s "bcftools consensus -f %(ref)s -s {} %(vcf)s | sed \'s/^>.*/>{}/\' > {}.tmp.fasta"'
        % vars(args))

    # One intermediate FASTA per sample, named by the parallel job above.
    tmp_fastas = " ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples])
    run_cmd('cat %s > %s.fa' % (tmp_fastas, vcf_obj.prefix))
    run_cmd('rm %s %s' % (tmp_fastas, args.sample_file))
Ejemplo n.º 5
0
def _read_sample_names(path):
    """Return the lines of *path* with trailing whitespace stripped."""
    with open(path) as fh:
        return [line.rstrip() for line in fh]


def _parse_bamstats(path):
    """Read read counts from lines 3-5 (0-based 2-4) of a bamstats file.

    Each of those lines contributes ``{fourth field: int(first field)}``;
    the fifth line also yields ``mapped_percent`` from its fifth field.
    """
    stats = {}
    with open(path) as fh:
        for i, line in enumerate(fh):
            if i > 4:
                break  # nothing past the fifth line is used
            if i < 2:
                continue
            row = line.rstrip().split()
            stats[row[3]] = int(row[0])
            if i == 4:
                stats["mapped_percent"] = float(
                    row[4].replace("(", "").replace("%", ""))
    return stats


def _top_kraken_hits(path):
    """Map each rank code in a kraken report to its highest-percentage row.

    Returns ``{rank: (percentage, name)}``; on ties the first row wins.
    """
    best = {}
    with open(path) as fh:
        for line in fh:
            row = line.strip().split()
            if len(row) < 4:
                continue  # blank or truncated line: no rank column
            rank, pct = row[3], float(row[0])
            if rank not in best or pct > best[rank][0]:
                best[rank] = (pct, " ".join(row[5:]))
    return best


def _format_kraken_hit(best, rank):
    """Format the top hit for *rank* as ``name (pct)``, or ``NA`` if absent."""
    if rank not in best:
        return "NA"
    pct, name = best[rank]
    return "%s (%.2f)" % (name, pct)


def _write_dict_rows(path, rows):
    """Write a list of dicts to *path* as CSV, columns taken from the first row."""
    # newline="" is what the csv module asks for on files it writes to.
    with open(path, "w", newline="") as O:
        if not rows:
            return  # no rows means no known columns; leave the file empty
        writer = csv.DictWriter(O, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)


def main(args):
    """Collect per-sample QC metrics and variant annotations into CSV files.

    Steps:
      1. Read sample names from ``args.samples`` and check each alignment
         file under ``<dir>/per_sample``.
      2. Queue and run (via GNU ``parallel``) the median-depth and kraken2
         commands whose output files do not exist yet.
      3. Per sample, gather bamstats counts, the top kraken genus/species
         hits and the tbprofiler lineage and resistance calls, then write
         ``<out>.sample_info.csv``.
      4. Generate the VCF stats and consequence files if missing and write
         ``<out>.variant_info.csv``.

    Args:
        args: namespace providing ``samples``, ``dir``,
            ``alignment_extension``, ``io_heavy_threads``, ``out``, ``vcf``,
            ``ref`` and ``gff``. ``args.sample`` and ``args.cmd_file`` are
            set here because command templates are filled from
            ``vars(args)``.
    """
    # Read the sample list once; it used to be re-read for every line.
    samples = _read_sample_names(args.samples)

    for s in samples:
        # Check the same path that the bedtools command below reads.
        fm.filecheck("%s/per_sample/%s%s" %
                     (args.dir, s, args.alignment_extension))

    if fm.nofolder("%(dir)s/kraken" % vars(args)):
        # Create the output folder; the bare path used to be run as a command.
        fm.run_cmd("mkdir -p %(dir)s/kraken" % vars(args))

    args.cmd_file = fm.get_random_file()
    try:
        with open(args.cmd_file, "w") as O:
            for s in samples:
                args.sample = s

                if fm.nofile("%(dir)s/per_sample/%(sample)s.median_dp.txt" %
                             vars(args)):
                    O.write(
                        "printf %s\"\\t\"$(bedtools genomecov -d -ibam %s/per_sample/%s%s | datamash median 3)\"\\n\" > %s/per_sample/%s.median_dp.txt\n"
                        % (s, args.dir, s, args.alignment_extension, args.dir, s))

                # NOTE(review): the kraken2 database path and thread count
                # are hard-coded in this command.
                if fm.nofile("%(dir)s/kraken/%(sample)s.done" % vars(args)):
                    O.write(
                        "kraken2 --db /run/user/506/standard --gzip-compressed --paired %(dir)s/fastq/%(sample)s_1.fastq.gz %(dir)s/fastq/%(sample)s_2.fastq.gz --report %(dir)s/kraken/%(sample)s.report.txt --out %(dir)s/kraken/%(sample)s.out.txt --threads 10 --memory-mapping && touch %(dir)s/kraken/%(sample)s.done\n"
                        % vars(args))

        fm.run_cmd("cat %(cmd_file)s | parallel -j %(io_heavy_threads)s" %
                   vars(args),
                   verbose=2)
    finally:
        # Remove the command file even if a queued command fails.
        fm.rm_files([args.cmd_file])

    sample_metrics = []
    for s in samples:
        args.sample = s
        res = {"sample": s}
        res.update(
            _parse_bamstats("%(dir)s/per_sample/%(sample)s.bqsr.bamstats" %
                            vars(args)))

        kraken_hits = _top_kraken_hits(
            "%(dir)s/kraken/%(sample)s.report.txt" % vars(args))
        # A rank missing from the report gives "NA" rather than a KeyError.
        res["kraken_genus"] = _format_kraken_hit(kraken_hits, "G")
        res["kraken_genus1"] = _format_kraken_hit(kraken_hits, "G1")
        res["kraken_species"] = _format_kraken_hit(kraken_hits, "S")

        with open("%(dir)s/tbprofiler/results/%(sample)s.results.json" %
                  vars(args)) as fh:
            tbprofiler_result = json.load(fh)
        res["lineage"] = tbprofiler_result["main_lin"]
        res["sub-lineage"] = tbprofiler_result["sublin"]
        res["drtype"] = tbprofiler_result["drtype"]
        tmp_drugs = defaultdict(list)
        for var in tbprofiler_result["dr_variants"]:
            for d in var["drugs"]:
                tmp_drugs[d["drug"]].append(
                    "%s_%s (%.2f)" % (var["gene"], var["change"], var["freq"]))
        # `drugs` is a module-level list defined outside this function.
        for d in drugs:
            res[d] = ", ".join(tmp_drugs[d])

        sample_metrics.append(res)

    _write_dict_rows(args.out + ".sample_info.csv", sample_metrics)

    vcf = fm.vcf_class(args.vcf)
    if fm.nofile(args.vcf + ".stats.txt"):
        fm.run_cmd(
            "bcftools norm -m - -f %(ref)s %(vcf)s | bcftools stats -v -s - > %(vcf)s.stats.txt"
            % (vars(args)))

    # The parsed stats were only copied into an unused dict; the call is
    # kept in case load_stats has side effects -- TODO confirm and drop.
    vcf.load_stats()

    if fm.nofile(args.vcf + ".csq_info.txt"):
        # SNPs first (-V indels), then indels appended to the same file.
        fm.run_cmd(
            "bcftools view -V indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' > %(vcf)s.csq_info.txt"
            % vars(args))
        fm.run_cmd(
            "bcftools view -v indels %(vcf)s | bcftools norm -m - -f %(ref)s | bcftools csq -f %(ref)s -g %(gff)s | correct_tb_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%AC\\t%%BCSQ\\n' >> %(vcf)s.csq_info.txt"
            % vars(args))

    variant_info = vcf.get_variant_data(args.ref, args.gff)
    _write_dict_rows(args.out + ".variant_info.csv", variant_info)