Example #1
def get_ann(variants):
    # Write the variants to a temporary VCF, annotate it with snpEff and return
    # the HGVS change for each variant. Assumes uuid4, os and the pp helper
    # module (providing cmd_out) are imported at module level.
    uuid = str(uuid4())  # e.g. "463545ef-71fc-449b-8f4e-9c907ee6fbf5"
    with open(uuid, "w") as O:
        O.write('##fileformat=VCFv4.2\n')
        O.write(
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
        O.write('##contig=<ID=Chromosome,length=4411532>\n')
        O.write(
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttest\n')
        for var in variants.values():
            O.write(
                "Chromosome\t%(pos)s\t.\t%(ref)s\t%(alt)s\t255\t.\t.\tGT\t1\n"
                % var)
    results = {}
    keys = list(variants.keys())
    vals = list(variants.values())
    i = 0
    for l in pp.cmd_out(f"snpEff ann Mycobacterium_tuberculosis_h37rv {uuid}"):
        if l[0] == "#": continue
        row = l.strip().split()
        for ann in row[7].split(","):
            a = ann.split("|")
            if vals[i]["gene"] in [a[3], a[4]]:
                results[keys[i]] = a[9] if vals[i]["type"] == "nucleotide" else a[10]
        i += 1
    os.remove(uuid)
    return results
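
The shape of the variants argument is not shown in this excerpt; judging from the fields accessed above, a minimal call might look like the sketch below (the key and values are purely illustrative).

# Hypothetical input for get_ann(): each value needs "pos", "ref", "alt",
# "gene" and "type". In the standard snpEff ANN field order, a[9] is HGVS.c
# and a[10] is HGVS.p, so "nucleotide" selects the coding change and anything
# else selects the protein change.
variants = {
    "var1": {"pos": 12345, "ref": "C", "alt": "T",
             "gene": "geneA", "type": "protein"},
}
# annotations = get_ann(variants)   # e.g. {"var1": "p.Ala123Val"}
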
Example #2
def main(args):
    ref = pp.fasta(args.ref).fa_dict
    cds = gff_load_cds(args.gff)
    final_list = []
    coding = defaultdict(list)
    generator = pp.cmd_out(
        f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    for l in generator:
        row = l.strip().split()
        if l[0] == "#":
            sys.stdout.write(l.strip() + "\n")
        elif len(row[3]) > 1 or len(row[4]) > 1:
            final_list.append(row)
        else:
            gene, cpos = get_codon_pos(row[0], int(row[1]), cds)
            if gene == None:
                final_list.append(row)
            else:
                coding[(gene, cpos)].append(row)

    for rows in coding.values():
        chrom = rows[0][0]
        # Sort the rows by genomic position so that each alt allele below is
        # written back to the correct position when the codon is rebuilt.
        rows = sorted(rows, key=lambda r: int(r[1]))
        pos = [int(r[1]) for r in rows]

        ref_nucs = {p: ref[chrom][p - 1] for p in range(pos[0], pos[-1] + 1)}
        alt_nucs = ref_nucs.copy()
        for i, p in enumerate(pos):
            alt_nucs[p] = rows[i][4]
        new_row = rows[0]
        new_row[3] = "".join(ref_nucs.values())
        new_row[4] = "".join(alt_nucs.values())
        final_list.append(new_row)

    for row in sorted(final_list, key=lambda x: int(x[1])):
        sys.stdout.write("\t".join(row) + "\n")
Example #3
 def get_mean_genotype(self, outfile=None):
     self.outfile = outfile
     if self.outfile == None:
         self.outfile = self.prefix + ".geno"
     O = open(self.outfile, "w")
     for l in tqdm(
             pp.cmd_out(
                 "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%TGT]\\n' %(filename)s"
                 % vars(self))):
         row = l.rstrip().split()
         alts = row[3].split(",")
         for alt in alts:
             ref = "%s/%s" % (row[2], row[2])
             tmp = "%s/%s" % (alt, alt)
             genos = []
             for x in row[4:]:
                 if x == ref:
                     genos.append("0")
                 elif x == tmp:
                     genos.append("1")
                 else:
                     genos.append("NA")
             O.write("%s, %s, %s, %s\n" %
                     (row[0] + "_" + row[1] + "_" + alt, row[2], alt,
                      ", ".join(genos)))
     O.close()
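
Each line written to the .geno file above has the following layout (sample values are illustrative):

# <CHROM>_<POS>_<ALT>, <REF>, <ALT>, <one code per sample ...>
# e.g.  Chromosome_1234_T, C, T, 0, 1, NA, 0
# where 0 = homozygous reference, 1 = homozygous alternate and NA = anything else.
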
Example #4
def get_variant_data(vcf_file,ref_file,gff_file,protein_file):
    nsp_data = {}
    gene_info = {}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {"function":row["Putative function"],"DOI":row["DOI"]}
        if row["Region"]!="nsp": continue
        for i in range(row["Start"],row["End"]+1):
            nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = defaultdict(list)
    for l in pp.cmd_out("bcftools view %s | bcftools csq -f %s -g %s -p a  | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'" % (vcf_file,ref_file,gff_file)):
        # Replace spaces with "N": if the alt allele contains an N, the
        # translated consequence from bcftools csq can contain spaces, which
        # would otherwise break the whitespace split below.
        row = l.strip().replace(" ","N").split()
        pos,ref,alts_str,af_str,csq_str = row
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0]=="@":
                # results[pos].append(results[int(csqs[i][1:])][0])
                pass
            elif csqs[i]==".":
                results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":"intergenic","changes":"NA","gene":"NA","gene_function":"NA","gene_reference":"NA"})

            else:
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*",""))
                if csq[1]=="orf1ab":
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos-nsp_data[codon_pos]["Start"]+1
                        changes.append(change_codon_number(csq[5],codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5] if len(csq)>5 else "")
                    genes.append(csq[1])
                if len(set(types))==1:
                    types = list(set(types))
                results[pos].append({"pos":pos, "alts":alts_str, "alt_af":alt_af, "types":",".join(types), "changes":",".join(changes),"gene":genes[0], "gene_function":gene_info[genes[0]]["function"], "gene_reference":gene_info[genes[0]]["DOI"]})
    final_results = []
    for res in list(results.values()):
        for r in res:
            final_results.append(r)
        # if len(res)==1:
        #     final_results.append(res[0])
        # else:
        #     quit("ERROR! more than one variant for a position")
    return final_results
Example #5
def main(args):
    generator = pp.cmd_out(
        f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    convert = dict(zip(args.source, args.target))
    for l in generator:
        if l[0] == "#":
            sys.stdout.write(l)
        else:
            row = l.strip().split()
            row[0] = convert[row[0]]
            sys.stdout.write("\t".join(row) + "\n")
Example #6
def create_species_db(args, extra_files=None):
    if not extra_files:
        extra_files = {}
    version = {"name": args.prefix}
    if not args.db_name:
        for l in pp.cmd_out("git log | head -4"):
            row = l.strip().split()
            if row == []: continue
            version[row[0].replace(":", "")] = " ".join(row[1:])
        version["commit"] = version["commit"][:7]
    else:
        version["Date"] = str(
            datetime.now()) if not args.db_date else args.db_date
        version["name"] = args.db_name if args.db_name else "NA"
        version["commit"] = args.db_commit if args.db_name else "NA"
        version["Author"] = args.db_author if args.db_author else "NA"

    kmer_file = args.prefix + ".kmers.txt"
    version_file = args.prefix + ".version.json"
    shutil.copyfile(args.kmers, kmer_file)
    json.dump(version, open(version_file, "w"))
    for file in extra_files.values():
        target = f"{args.prefix}.{file}"
        shutil.copyfile(file, target)
    variables_file = args.prefix + ".variables.json"
    variables = {}
    variables["files"] = {
        "kmers": kmer_file,
        "version": version_file,
        "variables": variables_file
    }

    if extra_files:
        for key, val in extra_files.items():
            variables["files"][key] = f"{args.prefix}.{val}"
    json.dump(variables, open(variables_file, "w"))

    if args.load:
        load_dir = f"{sys.base_prefix}/share/{args.software_name}"
        if not os.path.isdir(load_dir):
            os.mkdir(load_dir)

        for key, val in variables['files'].items():
            target = f"{load_dir}/{val}"
            infolog(f"Copying file: {val} ---> {target}")
            shutil.copyfile(val, target)
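
For reference, a hypothetical <prefix>.variables.json as written by the function above; the file names follow the dict built in the code and the prefix is illustrative:

# {
#   "files": {
#     "kmers": "mydb.kmers.txt",
#     "version": "mydb.version.json",
#     "variables": "mydb.variables.json",
#     "<extra key>": "mydb.<extra file>"
#   }
# }
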
Example #7
 def get_genesum(self, outfile=None):
     self.outfile = outfile
     if self.outfile == None:
         self.outfile = self.prefix + ".gensum"
     genesum = defaultdict(lambda: defaultdict(int))
     O = open(self.outfile, "w")
     for l in tqdm(
             pp.cmd_out(
                 "bcftools query -f '[%%SAMPLE\\t%%GT\\t%%TBCSQ\\n]' %(filename)s"
                 % vars(self))):
         row = l.split()
         #por4A    1/1    synonymous|Rv0002|gene1|protein_coding|+|109L|2378G>A    synonymous|Rv0002|gene1|protein_coding|+|109L|2378G>A
         info = row[2].split("|")
         if info[0] == "synonymous": continue
         if info[0][0] == "@": continue
         genesum[info[1]][row[0]] += 1
     for gene in genesum:
         O.write(
             "%s\tNA\tNA\t%s\n" %
             (gene, "\t".join(str(genesum[gene][s]) for s in self.samples)))
     O.close()
Example #8
def run_profile(uniq_id, storage_dir, fasta=None, R1=None, R2=None):
    cp.log("This is the worker. Running %s" % uniq_id)
    if fasta:
        pp.run_cmd(
            "covid-profiler.py profile --fasta %s --prefix %s --dir %s" %
            (fasta, uniq_id, storage_dir))
    elif R1 and not R2:
        pp.run_cmd("covid-profiler.py profile -1 %s --prefix %s --dir %s" %
                   (R1, uniq_id, storage_dir))
    elif R1 and R2:
        pp.run_cmd(
            "covid-profiler.py profile -1 %s -2 %s --prefix %s --dir %s" %
            (R1, R2, uniq_id, storage_dir))
    else:
        sys.stderr.write("ERROR!!! Check file inputs to profile worker!")
    pp.run_cmd("zip -j %s/%s.zip %s/%s*" %
               (storage_dir, uniq_id, storage_dir, uniq_id))
    results = json.load(open("%s/%s.results.json" % (storage_dir, uniq_id)))

    if R1:
        pp.run_cmd("bcftools view %s/%s.vcf.gz > %s/%s.vcf" %
                   (storage_dir, uniq_id, storage_dir, uniq_id))
        for l in pp.cmd_out(
                "bedtools genomecov -ibam %s/%s.bam -d | datamash mean 3" %
            (storage_dir, uniq_id)):
            cp.log(l)
            results["mean_depth"] = round(float(l.strip()), 2)
    results["num_variants"] = len(results["variants"])

    client = MongoClient()
    db = client.test_database
    db.profiler_results.find_one_and_update(
        {"_id": uniq_id}, {"$set": {
            "results": results,
            "status": "done"
        }})

    return True
Example #9
 def __init__(self, filename, prefix=None, threads=1):
     self.samples = []
     self.filename = filename
     self.prefix = prefix
     self.threads = threads
     if prefix == None:
         if filename[-4:] == ".bcf":
             self.prefix = filename[:-4]
         elif filename[-5:] == ".gbcf":
             self.prefix = filename[:-5]
         elif filename[-7:] == ".vcf.gz":
             self.prefix = filename[:-7]
         elif filename[-8:] == ".gvcf.gz":
             self.prefix = filename[:-8]
         elif filename[-4:] == ".vcf":
             self.prefix = filename[:-4]
         else:
             self.prefix = filename
     else:
         self.prefix = prefix
     index_bcf(filename, self.threads)
     for l in pp.cmd_out("bcftools query -l %(filename)s" % vars(self)):
         self.samples.append(l.rstrip())
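
A minimal usage sketch for this constructor; the class name is not shown in the excerpt, so vcf_class below is hypothetical:

# vcf = vcf_class("sample.vcf.gz")                      # prefix becomes "sample"
# vcf = vcf_class("sample.bcf", prefix="out", threads=4)
# After __init__, vcf.samples holds the sample names reported by
# "bcftools query -l" and the file has been indexed via index_bcf().
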
Example #10
def main(args):
    # Get a dictionary with the database files: {"ref": "/path/to/fasta", ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a list; otherwise build the list by scanning the results directory
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix,"") for x in os.listdir(args.results_dir) if x[-len(args.suffix):]==args.suffix]

    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.results_dir,s,args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"]==args.gene or var["locus_tag"]==args.gene) and var["change"]==args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    # samples_with_mutation = ["ERR2515541","ERR2510504","ERR2864225","SRR7341698"]
    if len(samples_with_mutation)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Mutation_not_found"))
        quit()
    elif len(variant_position_set)>1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Multiple_genome_pos"))
        quit()


    if len(variant_position_set)==1:
        variant_position = int(list(variant_position_set)[0])

    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # variant_position = 3841662
    params = vars(args)
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params,verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params,verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params,verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false  | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params,verbose=0):
            row = l.strip().split()
            if row[1]==".": continue
            if int(row[0])==variant_position:
                read_pos_rank_sums.append((s,float(row[1])))

    if len(read_pos_rank_sums)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,statistics.median([x[1] for x in read_pos_rank_sums])))
    pp.rm_files([params["tmp_vcf"]])
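
The script above prints a single tab-separated summary line to stdout; a hypothetical result might look like this:

# rpoB    p.Ser450Leu    -1.25
# i.e. <gene> <variant> <median ReadPosRankSum over samples carrying it>, or one
# of the sentinels Mutation_not_found, Multiple_genome_pos, No_values_from_samples.
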
Example #11
def get_variant_data(vcf_file, ref_file, gff_file, protein_file):
    nsp_data = {}
    gene_info = {}
    for row in csv.DictReader(open(protein_file)):
        row["Start"] = int(row["Start"])
        row["End"] = int(row["End"])
        gene_info[row["Gene"]] = {
            "function": row["Putative function"],
            "DOI": row["DOI"]
        }
        if row["Region"] != "nsp": continue
        for i in range(row["Start"], row["End"] + 1):
            nsp_data[i] = row

    pp.run_cmd("samtools faidx %s" % ref_file)
    results = {}
    for l in pp.cmd_out(
            "bcftools view %s | bcftools csq -f %s -g %s  | correct_covid_csq.py | bcftools +fill-tags | bcftools query -f '%%POS\\t%%REF\\t%%ALT\\t%%AF\\t%%BCSQ\\n'"
            % (vcf_file, ref_file, gff_file)):
        pos, ref, alts_str, af_str, csq_str = l.strip().split()
        alt_af = sum([float(x) for x in af_str.split(",")])
        csqs = csq_str.split(",")
        types = []
        changes = []
        genes = []
        pos = int(pos)
        for i in range(len(csqs)):
            if csqs[i][0] == "@":
                results[pos] = results[int(csqs[i][1:])]

            elif csqs[i] == ".":
                results[pos] = {
                    "pos": pos,
                    "alts": alts_str,
                    "alt_af": alt_af,
                    "types": "intergenic",
                    "changes": "NA",
                    "gene": "NA",
                    "gene_function": "NA",
                    "gene_reference": "NA"
                }

            else:
                csq = csqs[i].split("|")
                types.append(csq[0].replace("*", ""))

                if csq[1] == "orf1ab":
                    codon_pos = get_codon_num(csq[5])
                    if codon_pos in nsp_data:
                        genes.append(nsp_data[codon_pos]["Gene"])
                        codon_pos = codon_pos - nsp_data[codon_pos]["Start"] + 1
                        changes.append(change_codon_number(csq[5], codon_pos))
                    else:
                        genes.append("orf1ab")
                        changes.append(csq[5])
                else:
                    changes.append(csq[5])
                    genes.append(csq[1])
                if len(set(types)) == 1:
                    types = list(set(types))
                results[pos] = {
                    "pos": pos,
                    "alts": alts_str,
                    "alt_af": alt_af,
                    "types": ",".join(types),
                    "changes": ",".join(changes),
                    "gene": genes[0],
                    "gene_function": gene_info[genes[0]]["function"],
                    "gene_reference": gene_info[genes[0]]["DOI"]
                }
    return results
Example #12
def main_profile(args):
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            quit()
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            quit()
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]

    results = {}
    if not args.fasta:
        for l in pp.cmd_out("bedtools genomecov -ibam %s | datamash median 3" %
                            (bam_obj.bam_file)):
            results["mean_depth"] = int(l.strip())
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    json.dump(results, open("%s.results.json" % files_prefix, "w"))
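
A sketch of the <prefix>.results.json written at the end of main_profile; the keys follow the code above and the values are illustrative:

# {
#   "mean_depth": 120,          # only set when fastq input was mapped
#   "clade": "cladeA;cladeB",   # ";"-joined barcode annotations
#   "variants": [ ... ]         # output of cp.get_variant_data()
# }
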
Example #13
def profile_vcf(filename, conf):
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()
    l = ""
    for l in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    AD_found = l != ""
    if not AD_found:
        open(params["tmphdr"], "w").write(
            "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
        )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)
    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s  -Oz -o %(tmpcsq)s -p a"
        % params)
    csq_bcf_obj = pp.bcf(params["tmpcsq"])
    csq = csq_bcf_obj.load_csq(ann_file=conf["ann"])
    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    for sample in csq:
        results["variants"] = csq[sample]
    all_bcf_obj = pp.bcf(params["tmpvcf"])
    mutations = all_bcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    if "C" in mutations["Chromosome"][325505] and mutations["Chromosome"][
            325505]["C"] == 50:
        mutations["Chromosome"][325505] = {"T": 25}
    if "G" in mutations["Chromosome"][599868] and mutations["Chromosome"][
            599868]["G"] == 50:
        mutations["Chromosome"][599868] = {"A": 25}
    if "C" in mutations["Chromosome"][931123] and mutations["Chromosome"][
            931123]["C"] == 50:
        mutations["Chromosome"][931123] = {"T": 25}
    if "T" in mutations["Chromosome"][1759252] and mutations["Chromosome"][
            1759252]["T"] == 50:
        mutations["Chromosome"][1759252] = {"G": 25}
    json.dump(mutations, open("dump.json", "w"))
    barcode_mutations = pp.barcode(mutations, conf["barcode"])
    results["barcode"] = barcode_mutations
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)
    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    missing_regions = {gene: "NA" for gene in bed_regions}
    results["missing_regions"] = missing_regions
    if AD_found:
        pp.run_cmd("rm %(tmpcsq)s" % params)
    else:
        pp.run_cmd("rm %(tmpcsq)s %(tmphdr)s %(tmptxt)s*" % params)
    return results