def fasta2vcf(fasta_file, outfile):
    """Build one merged, indel-free VCF from every sequence in a FASTA file.

    Each sequence in ``fasta_file`` is written to its own temporary FASTA and
    handed to ``get_ref_variants`` together with the reference path from the
    database config. The per-sample ``<sample>.vcf.gz`` files (presumably
    produced in the working directory by that call -- verify against ``pp``)
    are merged with ``bcftools merge`` in chunks of 200, and the chunk VCFs
    are merged again into ``outfile`` with indels excluded
    (``bcftools view -V indels``). Intermediate files are removed afterwards.

    Args:
        fasta_file: Path to the input FASTA.
        outfile: Path of the bgzip-compressed VCF to write.

    Note:
        The database name is read from a module-level ``args`` object
        (``args.db``); it is not a parameter of this function.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    # NOTE(review): refseq is never used below; the load is kept in case
    # constructing pp.fasta on the reference has side effects -- confirm.
    refseq = pp.fasta(conf["ref"]).fa_dict
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())

    for sample in samples:
        fname = pp.get_random_file()
        # Close the handle before the file is read back so the content is
        # guaranteed to be flushed to disk.
        with open(fname, "w") as single_fa:
            single_fa.write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        pp.vcf(pp.fasta(fname).get_ref_variants(conf["ref"], sample))
        pp.run_cmd("rm %s" % fname)

    chunk_size = 200
    sample_chunks = [
        samples[i:i + chunk_size] for i in range(0, len(samples), chunk_size)
    ]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        with open(tmp_list, "w") as list_fh:
            list_fh.write("\n".join(["%s.vcf.gz" % x for x in tmp_samps]))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])

    pp.run_cmd("bcftools merge -0  %s | bcftools view -V indels -Oz -o %s" %
               (" ".join(tmp_vcfs), outfile))

    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    # `bcftools index` writes a .csi index next to each chunk VCF; remove
    # those as well so no temporary files are left behind.
    tmp_vcf_csi_files = ["%s.csi" % v for v in tmp_vcfs]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs + tmp_vcf_csi_files)
Exemple #2
0
def run_fuzznuc(seqs, pattern, pmismatch=0):
    """Run ``fuzznuc`` on ``seqs`` and return its parsed report.

    Args:
        seqs: Value passed to ``fuzznuc -sequence``.
        pattern: Value passed to ``fuzznuc -pattern``.
        pmismatch: Value passed to ``fuzznuc -pmismatch`` (default 0).

    Returns:
        Whatever ``parse_fuzznuc_output`` returns for the report file.
    """
    report_path = pp.get_random_file()
    command = (
        "fuzznuc -sequence %s -pattern %s -outfile %s -complement -pmismatch %s"
        % (seqs, pattern, report_path, pmismatch)
    )
    pp.run_cmd(command)
    hits = parse_fuzznuc_output(report_path)
    # The report is only needed for parsing; drop it before returning.
    pp.rm_files([report_path])
    return hits
Exemple #3
0
def vcf2consensus(bam, vcf, ref, id, consensus):
    """Write a consensus FASTA for one sample, masking low-depth positions.

    Positions whose ``bedtools genomecov -d`` depth is below 10 are written
    to a temporary three-column file (chrom, pos, pos), which is passed to
    ``bcftools consensus`` via ``-m`` together with ``-M N``. The FASTA
    header line is rewritten to ``>`` followed by ``id``.

    Args:
        bam: Path to the BAM used for the depth calculation.
        vcf: Path to the VCF given to ``bcftools consensus``.
        ref: Path to the reference FASTA.
        id: Name to put in the FASTA header.
        consensus: Output path for the consensus FASTA.
    """
    low_depth_bed = pp.get_random_file()
    mask_cmd = (
        'bedtools genomecov -d -ibam %s | awk \'$3<10\' | awk \'{print $1"\\t"$2"\\t"$2}\' > %s'
        % (bam, low_depth_bed)
    )
    consensus_cmd = (
        "bcftools consensus -f %s -m %s -M N %s | sed 's/^>.*/>%s/' > %s"
        % (ref, low_depth_bed, vcf, id, consensus)
    )
    cleanup_cmd = "rm %s" % low_depth_bed
    # Run in order: build the mask, apply it, then delete it.
    for command in (mask_cmd, consensus_cmd, cleanup_cmd):
        pp.run_cmd(command)
Exemple #4
0
def main(args):
    """Write one multi-sample FASTA per target locus from per-sample VCFs.

    For every region in the database BED file a ``<name>.fasta`` file is
    opened in the working directory (``<name>`` is column 5 of the BED). For
    each sample, ``<dir>/<sample>.targets.csq.vcf.gz`` is filtered, applied to
    the reference regions with ``bcftools consensus`` and the resulting
    sequence for each region is appended to that region's FASTA under the
    sample name.

    Args:
        args: Namespace providing ``db`` (database name), ``dir`` (directory
            holding the VCFs) and ``samples`` (optional path to a file with
            one sample name per line; when falsy, samples are discovered from
            ``*.targets.csq.vcf.gz`` files in ``dir``).

    Note:
        The per-sample ``<sample>.targets.fa`` files are left in the working
        directory, as in the original implementation.
    """
    vcf_suffix = ".targets.csq.vcf.gz"
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    if args.samples:
        with open(args.samples) as sample_fh:
            samples = [x.rstrip() for x in sample_fh]
    else:
        samples = [
            x.replace(vcf_suffix, "") for x in os.listdir(args.dir)
            if x.endswith(vcf_suffix)
        ]
    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"]
    }
    # tmp_mappings: "<chrom>:<start>-<end>\t<column 5>" per BED line;
    # tmp_locations: just the region strings, for `samtools faidx -r`.
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" %
               (conf["bed"], params["tmp_mappings"]))
    pp.run_cmd("cut -f1 %s > %s" %
               (params["tmp_mappings"], params["tmp_locations"]))
    locus_files = {}
    try:
        with open(params["tmp_mappings"]) as mappings:
            for line in mappings:
                row = line.rstrip().split()
                locus_files[row[0]] = open("%s.fasta" % row[1], "w")
        for s in samples:
            params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
            params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
            params["sample_fa"] = "%s.targets.fa" % (s)
            pp.run_cmd(
                "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
                % params)
            pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
            pp.run_cmd(
                "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
                % params)
            fa_dict = pp.fasta(params["sample_fa"]).fa_dict
            for locus in fa_dict:
                locus_files[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
            pp.rm_files([params["tmp_vcf"]])
    finally:
        # Always close the per-locus outputs so buffered sequences reach disk
        # even if one of the external commands fails part-way through.
        for handle in locus_files.values():
            handle.close()
    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
Exemple #5
0
def phylogeny(prefix,conf_file,sample_file=None,base_dir = ".",threads=3):
    """Make sure every sample has a genome gBCF, then print the merge command.

    For each sample whose ``<base_dir>/vcf/<sample>.genome.gbcf`` is reported
    missing by ``pp.nofile``, a gBCF is generated from
    ``<base_dir>/bam/<sample>.bam`` and moved into ``<base_dir>/vcf``. The
    ``merge_vcfs.py`` command line covering all samples is then printed to
    stdout; it is not executed here.

    Args:
        prefix: Output prefix passed to ``merge_vcfs.py``.
        conf_file: Path to a JSON config; its ``"ref"`` entry is used.
        sample_file: Optional path to a file with one sample name per line.
            When omitted, samples are taken from ``*.results.json`` files.
        base_dir: Directory containing the ``bam/`` and ``vcf/`` folders.
        threads: Currently unused.

    Note:
        The temporary sample list file is intentionally not deleted because
        the printed command refers to it.
    """
    with open(conf_file) as conf_fh:
        conf = json.load(conf_fh)

    if sample_file:
        with open(sample_file) as sample_fh:
            samples = [x.rstrip() for x in sample_fh]
    else:
        # NOTE(review): this lists "results/" relative to the working
        # directory and ignores base_dir -- confirm that is intended.
        samples = [
            x.replace(".results.json","")
            for x in os.listdir("results/")
            if x.endswith(".results.json")
        ]

    samples_file = pp.get_random_file()
    with open(samples_file,"w") as out_fh:
        out_fh.write("%s\n"%"\n".join(samples))

    for s in samples:
        tprefix = s+".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir,s)):
            bam_file = "%s/bam/%s.bam" % (base_dir,s)
            bam_obj = pp.bam(bam_file,s,conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            # The glob also moves any companion files sharing the gBCF name.
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file,base_dir))
    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (samples_file,conf["ref"],prefix,base_dir)
    print(cmd)
def main(args):
    """Prepare per-phenotype GEMMA inputs and optionally run the analyses.

    Phenotype values are read from a CSV keyed by its ``id`` column and
    written, in the order of ``vcf.samples``, to ``<pheno>.pheno`` files
    (``NA`` where a sample has no value). One three-step GEMMA command line
    per phenotype is written to a temporary command file, which is run
    through ``parallel`` unless ``args.preprocess`` is set.

    Args:
        args: Namespace providing ``vcf`` (input VCF path), ``genes`` (when
            truthy, ``vcf.get_genesum()`` is called), ``csv`` (metadata CSV
            with an ``id`` column), ``phenos`` (file with one phenotype name
            per line), ``preprocess`` (skip running GEMMA) and ``threads``
            (job count passed to ``parallel -j``).
    """
    vcf = vcf_class(args.vcf)
    # vcf.get_mean_genotype()
    if args.genes:
        vcf.get_genesum()
    geno_file = vcf.prefix + ".geno"
    genesum_file = vcf.prefix + ".genesum"
    meta = {s: {} for s in vcf.samples}
    with open(args.csv) as csv_fh:
        reader = csv.DictReader(csv_fh)
        # Take the column names from the header so the phenotype check below
        # also works when the CSV has no data rows.
        csv_columns = reader.fieldnames or []
        for row in reader:
            sample_id = row["id"]
            if sample_id not in meta:
                continue
            for pheno in row:
                if pheno == "id":
                    continue
                meta[sample_id][pheno] = row[pheno]
    with open(args.phenos) as phenos_fh:
        phenos = [x.rstrip() for x in phenos_fh]
    cmd_file = pp.get_random_file()
    with open(cmd_file, "w") as cmd_fh:
        for pheno in phenos:
            pheno_file = "%s.pheno" % pheno
            if pheno not in csv_columns:
                pp.log("%s not in CSV file" % pheno, True)
            with open(pheno_file, "w") as pheno_fh:
                pheno_fh.write("\n".join([
                    meta[s][pheno] if pheno in meta[s] else "NA"
                    for s in vcf.samples
                ]))
            cmd_fh.write(
                "gemma -p %s -g %s -gk 1 -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s -maf 0.00005 -miss 0.99 && gemma  -lmm 1 -p %s -g %s  -k output/%s.cXX.txt  -o %s.genesum -notsnp\n"
                % (pheno_file, geno_file, pheno, pheno_file, geno_file, pheno,
                   pheno, pheno_file, genesum_file, pheno, pheno))

    if args.preprocess:
        pp.log("Preprocessing finished\n", True)
    else:
        pp.run_cmd("cat %s | parallel -j %s" % (cmd_file, args.threads))
Exemple #7
0
def main(args):
    """Report the median ReadPosRankSum of one variant across its carriers.

    Result JSON files are scanned for samples carrying ``args.variant`` in
    ``args.gene`` (matched against either the gene name or the locus tag).
    For each carrier, the variant position is extracted from its VCF,
    annotated with GATK ``VariantAnnotator -A ReadPosRankSumTest`` and the
    value at that position is collected. One tab-separated line
    (gene, variant, result) is written to stdout, where the result is the
    median value or one of ``Mutation_not_found``, ``Multiple_genome_pos``
    or ``No_values_from_samples``.

    Args:
        args: Namespace providing ``db``, ``samples`` (optional sample list
            file), ``suffix`` and ``results_dir`` (result JSON location),
            ``gene``, ``variant``, ``vcf_dir``, ``bam_dir`` and
            ``bam_extension``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results directory
    if args.samples:
        with open(args.samples) as sample_fh:
            samples = [x.rstrip() for x in sample_fh]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.results_dir)
            if x.endswith(args.suffix)
        ]

    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.results_dir, s, args.suffix))
        with open(result_path) as result_fh:
            data = json.load(result_fh)
        carries_variant = False
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"] == args.gene or var["locus_tag"] == args.gene) and var["change"] == args.variant:
                carries_variant = True
                variant_position_set.add(var["genome_pos"])
        # Record each carrier once, even if several variant records match,
        # so it is neither re-annotated nor over-weighted in the median.
        if carries_variant:
            samples_with_mutation.append(s)

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    if not samples_with_mutation:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Mutation_not_found"))
        sys.exit()
    if len(variant_position_set) > 1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Multiple_genome_pos"))
        sys.exit()

    # At least one carrier and at most one position: exactly one remains.
    variant_position = int(next(iter(variant_position_set)))

    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # Work on a copy so the caller's argparse namespace is not modified.
    params = dict(vars(args))
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params, verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params, verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params, verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false  | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params, verbose=0):
            row = l.strip().split()
            # Skip blank/short lines and records with no annotation value.
            if len(row) < 2 or row[1] == ".":
                continue
            if int(row[0]) == variant_position:
                read_pos_rank_sums.append((s, float(row[1])))

    if not read_pos_rank_sums:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, statistics.median([x[1] for x in read_pos_rank_sums])))
    # tabix leaves a .tbi index next to the temporary VCF; remove both.
    pp.rm_files([params["tmp_vcf"], params["tmp_vcf"] + ".tbi"])
Exemple #8
0
def profile_vcf(filename, conf):
    """Profile a VCF against the database described by ``conf``.

    The VCF is normalised into a temporary copy (a placeholder ``FORMAT/AD``
    of ``0,100`` is annotated onto every record when the input header
    declares no AD field), restricted to the BED regions, annotated with
    ``bcftools csq`` and compared with the mutation database. Barcode
    positions are genotyped from the normalised copy.

    Args:
        filename: Path to the input VCF.
        conf: Mapping with at least the keys ``bed``, ``ref``, ``gff``,
            ``ann``, ``barcode`` and ``json_db``.

    Returns:
        The value returned by ``pp.db_compare`` with a ``"missing_regions"``
        entry added, mapping every gene from the BED file to ``"NA"``.

    Side effects:
        Writes ``dump.json`` (the barcode genotypes) to the working
        directory.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Consume the whole grep output; a non-empty last line means the header
    # already declares FORMAT/AD.
    last_line = ""
    for last_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = last_line != ""

    if not AD_found:
        # Close the header file before bcftools reads it so the line is
        # guaranteed to be on disk.
        with open(params["tmphdr"], "w") as hdr_fh:
            hdr_fh.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s  -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])
    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # If several samples are present the last one wins, as before.
    for sample in csq:
        results["variants"] = csq[sample]

    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])

    # Hard-coded overrides: (position, allele that must have value 50,
    # replacement allele recorded with value 25). The rationale for these
    # four positions is not visible in this file.
    overrides = (
        (325505, "C", "T"),
        (599868, "G", "A"),
        (931123, "C", "T"),
        (1759252, "T", "G"),
    )
    for pos, observed, replacement in overrides:
        calls = mutations["Chromosome"][pos]
        if observed in calls and calls[observed] == 50:
            mutations["Chromosome"][pos] = {replacement: 25}

    with open("dump.json", "w") as dump_fh:
        json.dump(mutations, dump_fh)

    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)
    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions

    # Remove the temporary files, including the normalised VCF copy, which
    # is no longer needed once the barcode genotypes have been extracted.
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s %(tmpvcf)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmpvcf)s %(tmphdr)s %(tmptxt)s*" % params)
    return results