Beispiel #1
0
def main(args):
    """Print the names of all samples whose result file has an empty lineage.

    For every sample the file ``<args.dir>/<sample><args.suffix>`` is loaded
    as JSON, and the sample name is reported when its ``"lineage"`` entry is
    an empty list. Names are printed to stdout, one per line.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``
            and ``suffix``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    # Loop through the sample result files
    null_lineage_samples = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        if data["lineage"] == []:
            null_lineage_samples.append(s)

    print("\n".join(null_lineage_samples))
Beispiel #2
0
def main(args):
    """Report samples with several Rv2043c resistance variants incl. a frameshift.

    For every sample the resistance variants located in locus ``Rv2043c`` are
    collected. A sample is printed (name followed by its variants) when it
    carries more than one such variant and at least one of them is a
    frameshift.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``
            and ``suffix``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)

        variants = [
            var for var in data["dr_variants"]
            if var["locus_tag"] == "Rv2043c"
        ]
        vartypes = [var["type"].replace("*", "") for var in variants]

        # Test the types of the collected Rv2043c variants. The previous code
        # inspected the loop variable left over from the last resistance
        # variant, which may belong to a different gene.
        if len(variants) > 1 and any("frameshift" in t for t in vartypes):
            print(s, variants)
def main(args):
    """Print the frequency of every fabG1 ``c.663C>A`` call found in the results.

    Both the resistance variants and the other variants of each sample result
    file are scanned; each matching variant's ``freq`` value is printed on its
    own line.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``
            and ``suffix``.
    """
    # Database configuration: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Locus tag -> drugs mapping: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Sample names come from the supplied list or, failing that, from the
    # result files present in the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = []
        for fname in os.listdir(args.dir):
            if fname[-len(args.suffix):] == args.suffix:
                samples.append(fname.replace(args.suffix, ""))

    results = defaultdict(list)
    dr_mutations = set()

    for sample in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = "%s/%s%s" % (args.dir, sample, args.suffix)
        data = json.load(open(pp.filecheck(result_path)))
        all_variants = data["dr_variants"] + data["other_variants"]
        for var in all_variants:
            if var["gene"] != "fabG1":
                continue
            if var["change"] == "c.663C>A":
                print(var["freq"])
def main(args):
    """Write a variant-by-sample presence/absence matrix as CSV.

    The output file ``args.out`` has a header ``Gene,Variant,<samples...>``
    followed by one row per (gene, change) pair seen in any sample, with
    ``1`` where the sample carries the variant and ``0`` otherwise.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``,
            ``suffix`` and ``out``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    # (gene, change) -> set of samples carrying it. A set gives O(1)
    # membership tests when the matrix rows are built below.
    variants = defaultdict(set)
    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        for var in data["dr_variants"] + data["other_variants"]:
            variants[(var["gene"], var["change"])].add(s)

    with open(args.out, "w") as out:
        out.write("Gene,Variant,%s\n" % ",".join(samples))
        for (gene, change), carriers in variants.items():
            flags = ",".join("1" if s in carriers else "0" for s in samples)
            out.write("%s,%s,%s\n" % (gene, change, flags))
Beispiel #5
0
def main(args):
    """Convert the ``drugs`` entry of each resistance variant from dict to list.

    For every sample result in ``args.in_dir`` each resistance variant whose
    ``"drugs"`` value is a mapping ``{drug: {field: value, ...}}`` is
    rewritten as a list ``[{"drug": drug, field: value, ...}, ...]``.
    Variants whose ``"drugs"`` value is already a list are left untouched.
    The converted result is written under the same file name in
    ``args.out_dir``.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line),
            ``in_dir``, ``out_dir`` and ``suffix``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the input directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.in_dir)
            if suffix and fname.endswith(suffix)
        ]

    # Make sure the destination exists before any result is written.
    os.makedirs(args.out_dir, exist_ok=True)

    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        in_path = pp.filecheck("%s/%s%s" % (args.in_dir, s, args.suffix))
        with open(in_path) as in_file:
            data = json.load(in_file)

        for var in data["dr_variants"]:
            if isinstance(var["drugs"], list):
                continue
            converted = []
            for drug, details in var["drugs"].items():
                entry = {"drug": drug}
                entry.update(details)
                converted.append(entry)
            var["drugs"] = converted

        # Close the handle explicitly so the JSON is fully flushed to disk.
        out_path = "%s/%s%s" % (args.out_dir, s, args.suffix)
        with open(out_path, "w") as out_file:
            json.dump(data, out_file)
Beispiel #6
0
def main(args):
    """Write one iTOL pie-chart dataset file per drug.

    Every resistance variant is binned by its type into ``SNP``, ``indel`` or
    ``large_deletion`` (anything else counts as ``complex``). For each drug a
    file ``<drug>.itol.conf.txt`` is written in the current directory with one
    data row per sample: the three bin counts plus a fourth field that is
    ``1`` when the sample has no variant for that drug and ``0`` otherwise.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``
            and ``suffix``.
    """
    snp_types = ("missense", "non_coding", "stop_gained", "start_lost")
    indel_types = ("frameshift", "inframe_deletion", "inframe_insertion")
    mapping = {vartype: "SNP" for vartype in snp_types}
    mapping.update({vartype: "indel" for vartype in indel_types})
    mapping["large_deletion"] = "large_deletion"

    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = []
        for fname in os.listdir(args.dir):
            if fname[-len(args.suffix):] == args.suffix:
                samples.append(fname.replace(args.suffix, ""))

    # drug -> sample -> list of variant classes
    resistance = defaultdict(lambda: defaultdict(list))
    for sample in tqdm(samples):
        result_path = "%s/%s%s" % (args.dir, sample, args.suffix)
        data = json.load(open(pp.filecheck(result_path)))
        for var in data["dr_variants"]:
            variant_class = mapping.get(var["type"], "complex")
            resistance[var["drug"]][sample].append(variant_class)

    pie_fields = ["SNP", "indel", "large_deletion"]
    for drug in resistance:
        lines = [
            "DATASET_PIECHART",
            "SEPARATOR COMMA",
            "DATASET_LABEL,%s" % drug,
            "COLOR,#ff0000",
            "FIELD_COLORS,#ff0000,#00ff00,#0000ff,#ffffff",
            "FIELD_LABELS,snp,indel,large_deletion,no_variant",
            "MARGIN,5",
            "BORDER_WIDTH,1",
            "BORDER_COLOR,#000000",
            "DATA",
        ]
        for sample in samples:
            count = Counter(resistance[drug][sample])
            pie_values = ",".join([str(count[field]) for field in pie_fields])
            if sum(count.values()) > 0:
                no_variant = "0"
            else:
                no_variant = "1"
            lines.append("%s,-1,7,%s,%s" % (sample, pie_values, no_variant))

        with open("%s.itol.conf.txt" % drug, "w") as O:
            O.write("\n".join(lines))
Beispiel #7
0
def main(args):
    """Classify each sample's drug-resistance type and write it to a CSV.

    A sample's resistant drugs are the union of the ``drug`` names attached
    to its resistance variants. The class is derived as follows:

    * no resistant drug                          -> ``Sensitive``
    * exactly one of rifampicin / isoniazid      -> ``Pre-MDR``
    * both, no fluoroquinolone or injectable     -> ``MDR``
    * both, plus exactly one of the two groups   -> ``Pre-XDR``
    * both, plus both groups                     -> ``XDR``
    * anything else                              -> ``Other``

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``,
            ``suffix`` and ``out`` (path of the CSV with columns ``sample``
            and ``dr-class``).
    """
    bed_file = "%s/share/tbprofiler/%s.bed" % (sys.base_prefix, args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(bed_file)

    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    FLQ_set = {"moxifloxacin", "levofloxacin", "ciprofloxacin", "ofloxacin"}
    SLI_set = {"amikacin", "capreomycin", "kanamycin"}

    # newline="" is what the csv module expects for files it writes; the
    # context manager closes the file even if a sample fails to load.
    with open(args.out, "w", newline="") as OUT:
        writer = csv.DictWriter(OUT, fieldnames=["sample", "dr-class"])
        writer.writeheader()

        for s in tqdm(samples):
            result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
            with open(result_path) as result_file:
                data = json.load(result_file)

            resistant_drugs = {
                d["drug"] for var in data["dr_variants"] for d in var["drugs"]
            }

            rif = "rifampicin" in resistant_drugs
            inh = "isoniazid" in resistant_drugs
            flq = bool(FLQ_set & resistant_drugs)
            sli = bool(SLI_set & resistant_drugs)

            if not resistant_drugs:
                drtype = "Sensitive"
            elif rif != inh:
                # Resistant to exactly one of the two first-line drugs.
                drtype = "Pre-MDR"
            elif rif and inh:
                if flq and sli:
                    drtype = "XDR"
                elif flq or sli:
                    drtype = "Pre-XDR"
                else:
                    drtype = "MDR"
            else:
                drtype = "Other"

            writer.writerow({"sample": s, "dr-class": drtype})
def main(args):
    """Merge per-drug resistance variant records into one record per variant.

    In the input results each resistance variant carries a single ``drug``
    (plus optional ``confidence`` and ``literature``). Records that are
    identical apart from those fields are collapsed into one record holding a
    ``drugs`` list with one ``{"drug": ..., ...}`` entry per original record.
    The rewritten result is saved under the same file name in
    ``args.out_dir``, which is created when missing.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line),
            ``in_dir``, ``out_dir`` and ``suffix``.
    """
    # Database configuration: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Locus tag -> drugs mapping: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Sample names come from the supplied list or, failing that, from the
    # result files present in the input directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = []
        for fname in os.listdir(args.in_dir):
            if fname[-len(args.suffix):] == args.suffix:
                samples.append(fname.replace(args.suffix, ""))

    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    optional_drug_fields = ("confidence", "literature")
    for sample in tqdm(samples):
        # Data has the same structure as the .result.json files
        in_path = "%s/%s%s" % (args.in_dir, sample, args.suffix)
        data = json.load(open(pp.filecheck(in_path)))

        # Serialised drug-independent part of a variant -> its drug entries
        grouped = defaultdict(list)
        for var in data["dr_variants"]:
            shared = copy.deepcopy(var)
            drug_entry = {"drug": shared.pop("drug")}
            for field in optional_drug_fields:
                if field in shared:
                    drug_entry[field] = shared.pop(field)
            grouped[json.dumps(shared)].append(drug_entry)

        merged = []
        for serialised, drug_entries in grouped.items():
            record = json.loads(serialised)
            record["drugs"] = list(drug_entries)
            merged.append(record)
        data["dr_variants"] = merged

        out_path = "%s/%s%s" % (args.out_dir, sample, args.suffix)
        json.dump(data, open(out_path, "w"))
Beispiel #9
0
def main(args):
    """Print summary statistics for one variant across all sample results.

    Every variant whose gene name or locus tag equals ``args.gene`` and whose
    change equals ``args.variant`` contributes its frequency and, where
    available, its ``ReadPosRankSum``, ``BaseQRankSum`` and ``MQRankSum``
    annotations. One tab-separated line is printed: gene, variant, number of
    hits and the medians of frequency and of the three annotations. A median
    is reported as ``NA`` when no usable value was found for it.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``,
            ``suffix``, ``gene`` and ``variant``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    def median_or_na(values):
        """Return the median of *values*, or "NA" when there are none."""
        return statistics.median(values) if values else "NA"

    # Loop through the sample result files
    variant_freqs = []
    rank_sums = {"ReadPosRankSum": [], "BaseQRankSum": [], "MQRankSum": []}
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        for var in data["dr_variants"] + data["other_variants"]:
            if not ((var["locus_tag"] == args.gene or var["gene"] == args.gene)
                    and var["change"] == args.variant):
                continue
            variant_freqs.append(var["freq"])
            annotations = var.get("variant_annotations") or {}
            # Parse each annotation on its own so that one missing or
            # non-numeric value (e.g. ".") does not discard the others.
            for name, values in rank_sums.items():
                try:
                    values.append(float(annotations[name]))
                except (KeyError, TypeError, ValueError):
                    continue

    if variant_freqs:
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s" %
              (args.gene, args.variant, len(variant_freqs),
               statistics.median(variant_freqs),
               median_or_na(rank_sums["ReadPosRankSum"]),
               median_or_na(rank_sums["BaseQRankSum"]),
               median_or_na(rank_sums["MQRankSum"])))
    else:
        print("%s\t%s\tNA\tNA\tNA\tNA\tNA" % (args.gene, args.variant))
def main(args):
    """Extract a per-sample consensus sequence of one gene into fasta files.

    The gene's coordinates and strand are looked up in the GFF listed in the
    database configuration. For each sample that has a result file and whose
    variants in the gene contain none of the keywords ``deletion``,
    ``frameshift``, ``inframe``, ``stop`` or ``start``, a shell pipeline of
    ``samtools faidx`` and ``bcftools consensus`` (plus ``revseq`` for genes
    on the minus strand) writes ``<sample>.<gene>.fasta`` in the current
    directory. The JSON list of gene variants is printed for every sample.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line),
            ``vcf_dir``, ``vcf_suffix``, ``dir``, ``suffix`` and ``gene``.

    Raises:
        SystemExit: If no ``gene`` feature matching ``args.gene`` is found in
            the GFF file.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the VCF directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        vcf_suffix = args.vcf_suffix
        samples = [
            fname[:-len(vcf_suffix)] for fname in os.listdir(args.vcf_dir)
            if vcf_suffix and fname.endswith(vcf_suffix)
        ]

    # Locate the gene feature. Track the hit explicitly: falling out of the
    # loop without a match must not leave the last GFF row in use.
    gene_row = None
    with open(conf["gff"]) as gff:
        for line in gff:
            row = line.strip().split()
            if len(row) <= 2 or row[2] != "gene":
                continue
            if "Name=%s" % args.gene in line or "gene:%s" % args.gene in line:
                gene_row = row
                break
    if gene_row is None:
        sys.exit("Gene %s not found in %s" % (args.gene, conf["gff"]))

    start, end = int(gene_row[3]), int(gene_row[4])
    # Genes on the minus strand are reverse-complemented after extraction.
    revseq = ("| revseq  -sequence /dev/stdin  -outseq /dev/stdout"
              if gene_row[6] == "-" else "")
    excluded_keywords = ("deletion", "frameshift", "inframe", "stop", "start")

    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = "%s/%s%s" % (args.dir, s, args.suffix)
        if not os.path.isfile(result_path):
            continue
        with open(pp.filecheck(result_path)) as result_file:
            data = json.load(result_file)
        gene_variants = json.dumps([
            d for d in data["dr_variants"] + data["other_variants"]
            if d["locus_tag"] == args.gene
        ])
        print(gene_variants)
        # The keywords are searched in the serialised variants, so a hit in
        # any field of any variant excludes the sample.
        if any(keyword in gene_variants for keyword in excluded_keywords):
            continue

        pp.run_cmd(
            "samtools faidx %s Chromosome:%s-%s | bcftools consensus %s/%s%s %s | sed 's/^>.*/>%s/' > %s.%s.fasta"
            % (conf["ref"], start, end, args.vcf_dir, s, args.vcf_suffix,
               revseq, s, s, args.gene),
            verbose=1)
def main(args):
    """Dump the variant annotations of all samples into one CSV file.

    For every variant that has a ``variant_annotations`` entry, annotation
    values equal to ``"."`` are replaced by ``"NA"`` and the fields
    ``sample``, ``frequency``, ``gene`` and ``change`` are added. All such
    records are written to ``args.out``. When no annotated variant is found a
    message is written to stderr and no file is produced.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``,
            ``suffix`` and ``out``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    # Loop through the sample result files
    annotations = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        for var in data["dr_variants"] + data["other_variants"]:
            if "variant_annotations" not in var:
                continue
            record = var["variant_annotations"]
            for name in record:
                if record[name] == ".":
                    record[name] = "NA"
            record["sample"] = s
            record["frequency"] = var["freq"]
            record["gene"] = var["gene"]
            record["change"] = var["change"]
            annotations.append(record)

    if not annotations:
        sys.stderr.write(
            "No variant annotations found, nothing written to %s\n" % args.out)
        return

    # Use the union of all keys in first-seen order as header. Taking only the
    # first record's keys makes DictWriter raise ValueError as soon as a later
    # record carries an additional annotation.
    fieldnames = list(dict.fromkeys(k for row in annotations for k in row))

    with open(args.out, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(annotations)
Beispiel #12
0
def main(args):
    """Report the median ReadPosRankSum of one variant across carrying samples.

    The result files are scanned for samples carrying the variant given by
    ``args.gene`` (gene name or locus tag) and ``args.variant``. For those
    samples the variant position is cut out of the per-sample VCF, annotated
    with ``gatk VariantAnnotator -A ReadPosRankSumTest`` and the resulting
    values are collected. A single tab-separated line is written to stdout:
    gene, variant and either the median value or one of the status strings
    ``Mutation_not_found``, ``Multiple_genome_pos`` or
    ``No_values_from_samples``. Progress messages go to stderr.

    Args:
        args: Parsed command-line namespace. The code reads ``db``,
            ``samples``, ``results_dir``, ``suffix``, ``gene`` and
            ``variant``; the shell command templates additionally expand
            ``vcf_dir``, ``bam_dir`` and ``bam_extension``.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results directory
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix,"") for x in os.listdir(args.results_dir) if x[-len(args.suffix):]==args.suffix]

    # Loop through the sample result files
    samples_with_mutation = []
    # Distinct genome positions at which the requested variant was reported
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.results_dir,s,args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["gene"]==args.gene or var["locus_tag"]==args.gene) and var["change"]==args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    # samples_with_mutation = ["ERR2515541","ERR2510504","ERR2864225","SRR7341698"]
    # Stop early when the variant is absent or maps to more than one position
    if len(samples_with_mutation)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Mutation_not_found"))
        quit()
    elif len(variant_position_set)>1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Multiple_genome_pos"))
        quit()


    # Always true here: at least one sample matched and the >1 case has quit
    if len(variant_position_set)==1:
        variant_position = int(list(variant_position_set)[0])

    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # variant_position = 3841662
    # vars() returns the namespace's own dict, so the keys added below also
    # become attributes of args
    params = vars(args)
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    # (sample, ReadPosRankSum) pairs for samples that yielded a value
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        # Index the sample VCF, extract the variant position into the
        # temporary VCF and index that as well
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params,verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params,verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params,verbose=0)
        # Each output line is expected to hold "<POS> <ReadPosRankSum>";
        # presumably pp.cmd_out yields the command's stdout lines - verify
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false  | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params,verbose=0):
            row = l.strip().split()
            # "." marks a missing annotation value
            if row[1]==".": continue
            if int(row[0])==variant_position:
                read_pos_rank_sums.append((s,float(row[1])))

    if len(read_pos_rank_sums)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,statistics.median([x[1] for x in read_pos_rank_sums])))
    # NOTE(review): only the temporary VCF is removed here; the index created
    # by "tabix -f" on it may be left behind - confirm
    pp.rm_files([params["tmp_vcf"]])
Beispiel #13
0
def main(args):
    """Print a position-sorted table of the distinct mutations in the results.

    Each variant is reduced to a genome position, reference allele and
    alternate allele, derived from ``nucleotide_change``, ``change`` or
    ``_internal_change`` depending on the kind of variant. Large deletions,
    synonymous, ``frameshift&start_lost``, ``missense&inframe_altering``,
    ``stop_lost`` and ``stop_retained`` variants are skipped, as are ``r.``
    changes outside *rrs* and *rrl*. One tab-separated line is printed per
    distinct mutation, ordered by the first genome position.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``
            and ``suffix``.

    Raises:
        SystemExit: When a variant matches none of the handled cases; the
            offending variant is passed as the exit argument.
    """
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    # Variant types (after removing "*") that are deliberately left out.
    skipped_types = {
        "synonymous",
        "frameshift&start_lost",
        "missense&inframe_altering",
        "stop_lost",
        "stop_retained",
    }

    # Loop through the sample result files
    mutations = defaultdict(list)
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        for var in data["dr_variants"] + data["other_variants"]:
            change = var["change"]
            vartype = var["type"]
            # Patterns are raw strings: "\." and "\-" are invalid escape
            # sequences in ordinary string literals.
            if var["gene"] != "rrs" and var["gene"] != "rrl" and re.search(r"r\.[0-9]+", change):
                continue
            if "nucleotide_change" in var and (
                    re.search(r"p.[A-Za-z]+", change)
                    or re.search(r"c.[0-9]+", change)
                    or re.search(r"c.\-[0-9]+", change)):
                # One match per "+"-separated nucleotide change; the groups
                # are position, reference allele and alternate allele.
                parts = [
                    re.search(r"([0-9]+)([ACGT]+)>([ACGT]+)", x)
                    for x in var["nucleotide_change"].split("+")
                ]
                pos = ",".join(m.group(1) for m in parts)
                ref = ",".join(m.group(2) for m in parts)
                alt = ",".join(m.group(3) for m in parts)
            elif vartype == "non_coding" and re.search(r"c.\-[0-9]+", change):
                re_obj = re.search(r"c.\-[0-9]+([ACGT]+)>([ACGT]+)", change)
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif vartype == "non_coding" and re.search(r"r.[0-9]+", change):
                re_obj = re.search(r"[0-9]+([ACGT]+)>([ACGT]+)", var["_internal_change"])
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif vartype == "large_deletion" or vartype.replace("*", "") in skipped_types:
                continue
            else:
                # Unknown kind of variant: abort and show it.
                sys.exit(var)

            mutations[(pos, ref, alt)].append(json.dumps({
                "genome_pos": pos,
                "type": vartype.replace("*", ""),
                "locus_tag": var["locus_tag"],
                "gene": var["gene"],
                "_internal_change": var["_internal_change"],
                "change": change
            }))

    for key in sorted(mutations, key=lambda x: int(x[0].split(",")[0])):
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the output does not depend on string hash randomisation.
        for x in dict.fromkeys(mutations[key]):
            var = json.loads(x)
            print("Chromosome\t%s\t%s\t%s\t%s|%s|%s|%s|%s|%s|%s" % (var["genome_pos"],key[1],key[2],var["type"],var["locus_tag"],var["gene"],"NA","NA",var["_internal_change"],var["change"]))
Beispiel #14
0
def main(args):
    """Export the results as node/edge CSV files plus Cypher load commands.

    Sample, variant, drug, lineage and spoligotype nodes and the edges between
    them are collected from the sample result files (and from the optional
    metadata and spoligotype CSV files) and written to CSV files in the
    current directory. ``Cypher_commands.txt`` receives the matching
    ``LOAD CSV`` statements. Node and edge sets that turn out empty (for
    example spoligotypes when ``args.spoligotypes`` is not given) are skipped
    in both the CSV output and the Cypher commands.

    Args:
        args: Parsed command-line namespace providing ``db``, ``samples``
            (optional path to a file with one sample name per line), ``dir``,
            ``suffix``, ``meta`` (optional CSV keyed by column ``wgs_id``),
            ``spoligotypes`` (optional CSV with columns ``sample`` and
            ``spoligotype``) and ``index`` (emit ``CREATE INDEX`` commands).
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    if args.samples:
        with open(args.samples) as sample_file:
            samples = [line.rstrip() for line in sample_file]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix elsewhere in the file name.
        suffix = args.suffix
        samples = [
            fname[:-len(suffix)] for fname in os.listdir(args.dir)
            if suffix and fname.endswith(suffix)
        ]

    meta = {}
    meta_columns = []
    if args.meta:
        with open(args.meta) as meta_file:
            for row in csv.DictReader(meta_file):
                meta[row["wgs_id"]] = row
        # Columns are taken from the first metadata row (none if it is empty).
        meta_columns = list(next(iter(meta.values()), {}))

    sample_nodes = []
    tmp_variant_nodes = []
    sample_variant_edges = []
    drugs = set()
    lineage_nodes = set()
    tmp_variant_drug_edges = []
    sample_lineage_edges = []
    spoligotype_nodes = set()
    sample_spoligotype_edges = []
    spoligotypes = {}
    if args.spoligotypes:
        with open(args.spoligotypes) as spoligotype_file:
            for row in csv.DictReader(spoligotype_file):
                spoligotypes[row["sample"]] = row["spoligotype"]
                sample_spoligotype_edges.append({"id":row["sample"],"spoligotype":row["spoligotype"]})
                spoligotype_nodes.add(row["spoligotype"])

    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        result_path = pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))
        with open(result_path) as result_file:
            data = json.load(result_file)
        sample_node = {
            "id":s,
            "drtype":data["drtype"],
            "lineage":data["sublin"],
            "lineageInfo": json.dumps(data["lineage"]),
            "qc": json.dumps({"pct_reads_mapped":data["qc"]["pct_reads_mapped"],"gene_coverage":[]}),
            "pipeline": json.dumps(data["pipeline"]),
            "tbprofilerVersion": json.dumps(data["tbprofiler_version"]),
            "dbVersion": json.dumps(data["db_version"]),
        }
        lineage_nodes.add(data["sublin"])
        sample_lineage_edges.append({"sampleId":s,"lineage":data["sublin"]})
        if args.meta:
            for c in meta_columns:
                if c=="id": continue
                d = camel_case(c)
                sample_node[d] = meta[s][c] if s in meta else "NA"

        if args.spoligotypes:
            sample_node["spoligotype"] = spoligotypes.get(s,"NA")

        sample_nodes.append(sample_node)
        for var in data["dr_variants"] + data["other_variants"]:
            variant_id = "%s_%s" % (var["locus_tag"],var["change"])
            sample_variant_edges.append(
                {
                    "sampleId":s,
                    "variantId":variant_id,
                    "freq":var["freq"],
                    "genome_pos": var["genome_pos"],
                    "nucleotideChange": var.get("nucleotide_change","NA"),
                    "internalChange": var.get("_internal_change","NA")
                }
            )
            tmp_variant_nodes.append(
                {
                    "id": variant_id,
                    "type": var["type"].replace("*",""),
                    "change": var["change"],
                    "gene": var["gene"],
                    "locus_tag": var["locus_tag"],
                }
            )
            if "drugs" in var:
                for d in var["drugs"]:
                    drugs.add(d["drug"])
                    tmp_variant_drug_edges.append(
                        {
                            "variantId": variant_id,
                            "drug": d["drug"]
                        }
                    )

    drug_nodes = [{"id":d} for d in drugs]
    lineage_nodes = [{"id":d} for d in lineage_nodes]
    spoligotype_nodes = [{"id":d} for d in spoligotype_nodes]
    variant_drug_edges = uniq_dict_list(tmp_variant_drug_edges)
    variant_nodes = uniq_dict_list(standardise_types(tmp_variant_nodes))

    def batch(iterable, n=1):
        """Yield consecutive slices of *iterable* holding at most *n* items."""
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]

    def write_csv(path, rows, fieldnames=None):
        """Write *rows* (a list of dicts) to *path*; do nothing when empty.

        The header defaults to the keys of the first row.
        """
        if not rows:
            return
        if fieldnames is None:
            fieldnames = list(rows[0])
        with open(path, "w", newline="") as out:
            writer = csv.DictWriter(out, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    def cypher_props(rows):
        """Return the Cypher property map body built from the first row's keys."""
        return ", ".join(["%s: csvLine.%s" % (d,d) for d in rows[0]])

    write_csv("sample_nodes.csv", sample_nodes)
    write_csv("variant_nodes.csv", variant_nodes)

    # Sample-variant edges are split into files of 10000 rows; every file
    # uses the header of the very first edge.
    if sample_variant_edges:
        edge_fields = list(sample_variant_edges[0])
        for i, chunk in enumerate(batch(sample_variant_edges, 10000)):
            write_csv("sample_variant_edges.%s.csv" % i, chunk, edge_fields)

    write_csv("drug_nodes.csv", drug_nodes)
    write_csv("variant_drug_edges.csv", variant_drug_edges)
    write_csv("lineage_nodes.csv", lineage_nodes)
    write_csv("sample_lineage_edges.csv", sample_lineage_edges)
    write_csv("spoligotype_nodes.csv", spoligotype_nodes)
    write_csv("sample_spoligotype_edges.csv", sample_spoligotype_edges)

    # Each section below is emitted only when the CSV it loads was written.
    with open("Cypher_commands.txt" ,"w") as O:
        if sample_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Sample) ON (n.id);\n")
                O.write("CREATE INDEX FOR (n:SRA) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
            O.write("CREATE (s:Sample:SRA {%s});\n" % cypher_props(sample_nodes))
            O.write("\n")

            if args.index:
                O.write("CREATE INDEX FOR (n:Country) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
            O.write("WITH csvLine WHERE NOT csvLine.countryCode IS null\n")
            O.write("MERGE (s:Sample {id:csvLine.id})\n")
            O.write("MERGE (c:Country {id:csvLine.countryCode})\n")
            O.write("CREATE (s) -[:COLLECTED_IN]-> (c);\n")
            O.write("\n")

        if variant_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Variant) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
            O.write("CREATE (v:Variant {%s});\n" % cypher_props(variant_nodes))
            O.write("\n")

            if args.index:
                O.write("CREATE INDEX FOR (n:Gene) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
            O.write("MERGE (v:Variant {id:csvLine.id})\n")
            O.write("MERGE (g:Gene {id:csvLine.locus_tag, locusTag:csvLine.locus_tag, name:csvLine.gene})\n")
            O.write("CREATE (v) -[:IN_GENE]-> (g);\n")
            O.write("\n")

        if drug_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Drug) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///drug_nodes.csv' AS csvLine\n")
            O.write("CREATE (d:Drug {%s});\n" % cypher_props(drug_nodes))
            O.write("\n")

        if lineage_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Lineage) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///lineage_nodes.csv' AS csvLine\n")
            O.write("CREATE (:Lineage {%s});\n" % cypher_props(lineage_nodes))
            O.write("\n")

        for i, _chunk in enumerate(batch(sample_variant_edges, 10000)):
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_variant_edges.%s.csv' AS csvLine\n" % i)
            O.write("MATCH (s:Sample {id: csvLine.sampleId}),(v:Variant {id:csvLine.variantId})\n")
            O.write("CREATE (s) -[:CONTAINS {%s}]-> (v);\n" % cypher_props(sample_variant_edges))
            O.write("\n")

        if variant_drug_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_drug_edges.csv' AS csvLine\n")
            O.write("MATCH (v:Variant {id: csvLine.variantId}),(d:Drug {id:csvLine.drug})\n")
            O.write("CREATE (v) -[:CONFERS_RESISTANCE]-> (d);\n")
            O.write("\n")

        if sample_lineage_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_lineage_edges.csv' AS csvLine\n")
            O.write("MATCH (s:Sample {id: csvLine.sampleId}),(l:Lineage {id:csvLine.lineage})\n")
            O.write("CREATE (s) -[:LINEAGE]-> (l);\n")
            O.write("\n")

        if spoligotype_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Spoligotype) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///spoligotype_nodes.csv' AS csvLine\n")
            O.write("CREATE (:Spoligotype {%s});\n" % cypher_props(spoligotype_nodes))
            O.write("\n")

        if sample_spoligotype_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_spoligotype_edges.csv' AS csvLine\n")
            O.write("MATCH (s:Sample {id: csvLine.id}),(l:Spoligotype {id:csvLine.spoligotype})\n")
            O.write("CREATE (s) -[:SPOLIGOTYPE]-> (l);\n")
            O.write("\n")
# Column order shared by both output tables.
_VARIANT_FIELDNAMES = (
    'wgs_id', 'gene', 'genome_pos', 'type', 'change',
    'nucleotide_change', 'locus_tag', 'locus_tag_drugs',
)


def _collect_variant_rows(sample_id, variants, locus_tag2drugs):
    """Build one output row per non-synonymous variant of a sample.

    Args:
        sample_id: Value written to the ``wgs_id`` column of every row.
        variants: Iterable of variant dicts taken from a result file.
        locus_tag2drugs: Mapping from locus tag to the associated drugs.
            It is only read, never modified.

    Returns:
        A list of dicts keyed by ``_VARIANT_FIELDNAMES``. Fields missing
        from a variant, and locus tags missing from ``locus_tag2drugs``,
        are written as empty strings.
    """
    rows = []
    for variant in variants:
        # A variant without a 'type' is kept (its type is written as '').
        if variant.get('type') == 'synonymous':
            continue
        row = {'wgs_id': sample_id}
        for field in _VARIANT_FIELDNAMES[1:-1]:
            row[field] = variant.get(field, '')
        row['locus_tag_drugs'] = locus_tag2drugs.get(row['locus_tag'], '')
        rows.append(row)
    return rows


def _write_variant_tsv(path, rows_by_sample):
    """Write the rows of every sample to ``path`` as a tab-separated table.

    The header is always written, even when there are no rows at all.
    """
    # newline='' is what the csv module asks for on files it writes to.
    with open(path, 'w', newline='') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=_VARIANT_FIELDNAMES, delimiter='\t')
        writer.writeheader()
        for rows in rows_by_sample.values():
            writer.writerows(rows)


def main(args):
    """Export the non-synonymous variants of the samples listed in a metadata file.

    Every file in ``args.tbp_results`` whose name, minus its last two
    dot-separated components (e.g. ``.results.json``), equals a sample id
    from the metadata file is parsed as JSON. Its ``dr_variants`` and
    ``other_variants`` lists are written to ``<outfile>.dr.txt`` and
    ``<outfile>.other.txt`` respectively, one row per variant.

    Args:
        args: Namespace providing ``metadata_file`` (CSV with a header),
            ``id_key`` (name of the sample-id column), ``tbp_results``
            (directory of result files; a trailing slash is optional),
            ``outfile`` (output path prefix) and ``db`` (database name used
            to locate ``<sys.base_prefix>/share/tbprofiler/<db>.bed``).

    Raises:
        KeyError: If ``id_key`` is not a metadata column, or a matched
            result file lacks ``dr_variants`` or ``other_variants``.
    """
    # -----
    # ARGS
    # -----
    metadata_file = args.metadata_file
    id_key = args.id_key
    tbprofiler_results_location = args.tbp_results
    outfile = args.outfile
    db = args.db

    # -------------
    # READ IN DATA
    # -------------

    # Only the ids are needed; a set gives constant-time membership tests.
    with open(metadata_file, newline='') as handle:
        samples = {row[id_key] for row in csv.DictReader(handle)}

    # Read in locus-drug resistance associations
    bed_file = "%s/share/tbprofiler/%s.bed" % (sys.base_prefix, db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(bed_file)

    # ---------------------------
    # DR VARIANTS / OTHER VARIANTS
    # ---------------------------

    dr_variants = {}
    other_variants = {}
    for file_name in os.listdir(tbprofiler_results_location):
        # Re-join with '.' so ids that themselves contain dots still match.
        sample_id = '.'.join(file_name.split('.')[:-2])
        if sample_id not in samples:
            continue
        # Each result file is parsed once and feeds both tables.
        result_path = os.path.join(tbprofiler_results_location, file_name)
        with open(result_path) as handle:
            tbp_result = json.load(handle)
        dr_variants[sample_id] = _collect_variant_rows(
            sample_id, tbp_result['dr_variants'], locus_tag2drugs)
        other_variants[sample_id] = _collect_variant_rows(
            sample_id, tbp_result['other_variants'], locus_tag2drugs)

    # Save the tab-separated text files
    _write_variant_tsv(outfile + '.dr.txt', dr_variants)
    _write_variant_tsv(outfile + '.other.txt', other_variants)
Beispiel #16
0
def main(args):
    """Write a per-sample table of resistance variants arranged by drug.

    For every sample, ``<args.dir>/<sample><args.suffix>`` is loaded as JSON
    and turned into one row holding the sample name, ``main_lin``,
    ``sublin``, ``drtype`` and, for each drug in a fixed list, two columns:
    ``<drug>_variants`` (``gene_change_freq`` strings from ``dr_variants``)
    and ``<drug>_gene_cov`` (the ``fraction`` values from
    ``qc.gene_coverage`` for that drug's genes, where the fraction is > 0).
    The rows are written to ``args.outfile`` as a tab-separated file; the
    header is written even when there are no samples.

    Args:
        args: Namespace providing ``db``, ``samples`` (optional path to a
            file with one sample name per line), ``dir``, ``suffix`` and
            ``outfile``.
    """
    # Get a dictionary with the database file: {'ref': '/path/to/fasta' ... etc. }
    conf = get_conf_dict(sys.base_prefix + '/share/tbprofiler/%s' % args.db)

    # Get a dictionary mapping the drug to genes: {'rifampicin': ['rpoB', 'rpoC'], 'clofazimine': ['mmpR5', 'pepQ'], ... etc. }
    drug2genes = tbprofiler.get_drugs2gene(conf['bed'])

    # If a list of samples is supplied through the args object, read it from
    # that file, else derive it from the file names in the results directory.
    if args.samples:
        with open(args.samples) as handle:
            # Blank lines would otherwise become empty sample names.
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        suffix = args.suffix
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of it in the middle of a file name.
        samples = [
            name[:len(name) - len(suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(suffix)
        ]

    drugs = [
        'rifampicin','isoniazid','ethambutol','pyrazinamide','streptomycin','amikacin',
        'kanamycin','capreomycin','fluoroquinolones','ethionamide','cycloserine',
        'para-aminosalicylic_acid','clofazimine','bedaquiline','delamanid'
    ]

    # The columns are known up front, so the header does not depend on
    # there being at least one row.
    fieldnames = ['sample', 'main_lineage', 'sublineage', 'drtype']
    for drug in drugs:
        fieldnames.append(f'{drug}_variants')
        fieldnames.append(f'{drug}_gene_cov')

    # Set up a list which will contain our output file rows
    rows = []

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck(f'{args.dir}/{s}{args.suffix}')) as handle:
            data = json.load(handle)

        # The data is organised per variant in data['dr_variants']; regroup
        # it by drug. The structure will look like
        # {'isoniazid':['katG_p.Ser315Thr_0.95','fabG1_-15T>C_1.00']}
        drug_variants = defaultdict(list)
        for var in data['dr_variants']:
            for d in var['drugs']:
                drug_variants[d['drug']].append(f'{var["gene"]}_{var["change"]}_{round(var["freq"],2)}')

        # Lookup of gene -> coverage fraction (as a string) for every gene
        # whose reported fraction is above zero, e.g. {'rpoB':'0.2'}
        gene_coverage = {d['gene']:str(d['fraction']) for d in data['qc']['gene_coverage'] if d['fraction']>0}

        # Set up our row for the final output file with column names being the keys.
        row = {
            'sample': s,
            'main_lineage': data['main_lin'],
            'sublineage': data['sublin'],
            'drtype': data['drtype']
        }

        # For each drug add a column with the variants and another with the
        # coverage fractions of its genes. A drug absent from the database
        # simply gets an empty coverage column.
        for drug in drugs:
            row[f'{drug}_variants'] = ", ".join(drug_variants[drug])
            row[f'{drug}_gene_cov'] = ", ".join(
                gene_coverage[gene]
                for gene in drug2genes.get(drug, ())
                if gene in gene_coverage
            )
        rows.append(row)

    # Write the output file
    with open(args.outfile, 'w', newline='') as O:
        writer = csv.DictWriter(O, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)