def main(args):
    """Print one TSV row per delly BCF deletion that overlaps the given BED file."""
    # Both inputs must exist before any parsing starts.
    pp.filecheck(args.bcf)
    pp.filecheck(args.bed)
    delly = pp.delly_bcf(args.bcf)
    for hit in delly.overlap_bed(args.bed):
        # Annotate each overlap record with its length and the sample name,
        # then format straight from the dict.
        hit["len"] = int(hit["end"]) - int(hit["start"])
        hit["sample_name"] = args.sample_name
        print("%(sample_name)s\t%(chr)s\t%(start)s\t%(end)s\t%(len)s\t%(region)s" % hit)
def main(args):
    """Print the allele frequency of the fabG1 c.663C>A variant for every sample."""
    # Get a dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # If a list of samples is supplied through the args object, store it in a
    # list; else derive the list from the file names in the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # Loop through the sample result files.
    # FIX: removed the dead locals `results = defaultdict(list)` and
    # `dr_mutations = set()` — both were created but never read.
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if var["gene"] == "fabG1" and var["change"] == "c.663C>A":
                print(var["freq"])
def main(args):
    """List samples whose tb-profiler result contains an empty lineage call."""
    # Database file paths: {"ref": "/path/to/fasta", ...}
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Locus tag -> drugs, e.g. {"Rv1484": ["isoniazid","ethionamide"], ...}
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # An explicit sample list wins; otherwise scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    missing_lineage = []
    for sample in tqdm(samples):
        # Each file has the structure of a .result.json document.
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        if result["lineage"] == []:
            missing_lineage.append(sample)
    print("\n".join(missing_lineage))
def main(args):
    """Write a variant-by-sample presence/absence matrix as CSV to args.out."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the results directory for result files.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    # (gene, change) -> list of samples carrying that variant.
    carriers = defaultdict(list)
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        for var in result["dr_variants"] + result["other_variants"]:
            carriers[(var["gene"], var["change"])].append(sample)
    with open(args.out, "w") as out:
        out.write("Gene,Variant,%s\n" % ",".join(samples))
        for gene, change in carriers:
            present = carriers[(gene, change)]
            flags = ["1" if s in present else "0" for s in samples]
            out.write("%s,%s,%s\n" % (gene, change, ",".join(flags)))
def get_conf_dict_with_path(library_path):
    """Resolve the tb-profiler database files for *library_path* and sanity-check the format.

    Quits with an explanatory message when the .dr.json database predates the
    tb-profiler v2.4 layout.
    """
    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "ann": ".ann.txt",
        "barcode": ".barcode.bed",
        "bed": ".bed",
        "json_db": ".dr.json",
        "version": ".version.json",
    }
    conf = {}
    for name, suffix in suffixes.items():
        sys.stderr.write("Using %s file: %s\n" % (name, library_path + suffix))
        conf[name] = pp.filecheck(library_path + suffix)
    # Probe a known katG entry to detect the pre-v2.4 database schema.
    test = json.load(open(conf["json_db"]))["Rv1908c"]["315S>315T"]
    if "annotation" not in test and "drugs" in test:
        quit("""\n
################################# ERROR #######################################

The database has different format than expected. Since tb-profiler v2.4 the
database is parsed using tb-profiler code. Please run the following code to
get the latest version of the database or load your own:

tb-profiler update_tbdb

or

tb-profiler load_library /path/to/custom_library

###############################################################################
""")
    return conf
def main(args):
    """Flag samples with multiple Rv2043c (pncA) DR variants that include a frameshift."""
    # Get a dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # Loop through the sample result files.
    for s in tqdm(samples):
        variants = []
        vartypes = []
        # Data has the same structure as the .result.json files.
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"]:
            if var["locus_tag"] == "Rv2043c":
                variants.append(var)
                vartypes.append(var["type"].replace("*", ""))
        # FIX: the frameshift test previously read the leaked loop variable
        # `var` (i.e. only the last variant iterated), while the `vartypes`
        # list was populated but never used. Check the collected types instead.
        if len(variants) > 1 and any("frameshift" in t for t in vartypes):
            print(s, variants)
def get_conf_dict(library_prefix):
    """Return {key: checked_path} for the standard tb-profiler database files."""
    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "ann": ".ann.txt",
        "barcode": ".barcode.bed",
        "bed": ".bed",
        "json_db": ".dr.json",
        "version": ".version.json",
    }
    conf = {}
    for name, suffix in suffixes.items():
        path = library_prefix + suffix
        # Report each resolved file on stderr, as the callers expect.
        sys.stderr.write("Using %s file: %s\n" % (name, path))
        conf[name] = pp.filecheck(path)
    return conf
def main(args):
    """Migrate result JSONs so each dr_variant's "drugs" is a list of dicts rather than a dict."""
    # Get a dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the input directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.in_dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # Loop through the sample result files.
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.in_dir, s, args.suffix))))
        for var in data["dr_variants"]:
            # Already in the new (list) format: nothing to migrate.
            if isinstance(var["drugs"], list):
                continue
            tmp1 = []
            for d in var["drugs"]:
                # Flatten {drug_name: {k: v, ...}} into {"drug": name, k: v, ...}.
                tmp2 = {"drug": d}
                for k in var["drugs"][d]:
                    tmp2[k] = var["drugs"][d][k]
                tmp1.append(tmp2)
            var["drugs"] = tmp1
        # FIX: write through a context manager so the output file is flushed
        # and closed deterministically (the original left the handle open).
        with open("%s/%s%s" % (args.out_dir, s, args.suffix), "w") as O:
            json.dump(data, O)
def get_conf_dict(library_prefix):
    """Return {role: checked_path} for the database files sharing *library_prefix*."""
    roles = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "msa": ".msa.fa",
        "non_coding_bed": ".non_coding.bed",
        "meta": ".msa.meta.csv",
    }
    conf = {}
    for role, ext in roles.items():
        target = library_prefix + ext
        # Report each resolved file on stderr, as the callers expect.
        sys.stderr.write("Using %s file: %s\n" % (role, target))
        conf[role] = pp.filecheck(target)
    return conf
def get_conf_dict(library_prefix):
    """Locate the tb-profiler database files sharing *library_prefix*, keyed by role."""
    extensions = [
        ('gff', '.gff'),
        ('ref', '.fasta'),
        ('ann', '.ann.txt'),
        ('barcode', '.barcode.bed'),
        ('bed', '.bed'),
        ('json_db', '.dr.json'),
        ('version', '.version.json'),
    ]
    conf = {}
    for key, ext in extensions:
        # Report each resolved file on stderr, as the callers expect.
        sys.stderr.write('Using %s file: %s\n' % (key, library_prefix + ext))
        conf[key] = pp.filecheck(library_prefix + ext)
    return conf
def main_lineage(args):
    """Call the lineage barcode on a BCF and optionally write it to <prefix>.lineage.<fmt>."""
    conf_file = pp.filecheck(tbp._ROOT + "/../" + args.db + ".config.json")
    conf = json.load(open(conf_file))
    pp.filecheck(args.bcf)
    bcf = pp.bcf(args.bcf)
    # Genotypes at the barcode positions, then barcode -> lineage conversion.
    mutations = bcf.get_bed_gt(conf["barcode"], conf["ref"])
    results = {"barcode": pp.barcode(mutations, conf["barcode"])}
    tbp.barcode2lineage(results)
    # Only write an output file when a prefix was requested.
    if args.prefix:
        outfile = "%s.lineage.%s" % (args.prefix, args.outfmt)
        with open(outfile, "w") as O:
            if args.outfmt == "json":
                json.dump(results["lineage"], O)
            elif args.outfmt == "txt":
                O.write(tbp.text.lineagejson2text(results["lineage"]))
def main(args):
    """Print samples where the requested lineage was called at < 95% fraction."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        for lin in result["lineage"]:
            if lin["lin"] == args.lineage and lin["frac"] < 0.95:
                print(sample)
def main(args):
    """Write one iTOL piechart dataset file per drug summarising resistant variant types."""
    # Collapse the detailed variant types into the three piechart categories.
    # FIX: the original literal listed the "non_coding" key twice — a silently
    # overwritten duplicate dict key; it is kept exactly once here.
    mapping = {
        "missense": "SNP",
        "non_coding": "SNP",
        "stop_gained": "SNP",
        "start_lost": "SNP",
        "frameshift": "indel",
        "inframe_deletion": "indel",
        "inframe_insertion": "indel",
        "large_deletion": "large_deletion"
    }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # drug -> sample -> list of collapsed variant types.
    resistance = defaultdict(lambda: defaultdict(list))
    for s in tqdm(samples):
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"]:
            resistance[var["drug"]][s].append(
                mapping.get(var["type"], "complex"))
    for drug in resistance:
        lines = []
        lines.append("DATASET_PIECHART")
        lines.append("SEPARATOR COMMA")
        lines.append("DATASET_LABEL,%s" % drug)
        lines.append("COLOR,#ff0000")
        lines.append("FIELD_COLORS,#ff0000,#00ff00,#0000ff,#ffffff")
        lines.append("FIELD_LABELS,snp,indel,large_deletion,no_variant")
        lines.append("MARGIN,5")
        # lines.append("MAXIMUM_SIZE,30")
        lines.append("BORDER_WIDTH,1")
        lines.append("BORDER_COLOR,#000000")
        lines.append("DATA")
        for s in samples:
            count = Counter(resistance[drug][s])
            # Samples with no resistant variant get the "no_variant" slice.
            lines.append("%s,-1,7,%s,%s" % (s, ",".join([
                str(count[d]) for d in ["SNP", "indel", "large_deletion"]
            ]), "0" if sum(count.values()) > 0 else "1"))
        with open("%s.itol.conf.txt" % drug, "w") as O:
            O.write("\n".join(lines))
def main(args):
    # Write one CSV row per sample to stdout: lineage/drtype columns followed
    # by, for each drug in the database BED, the DR mutations and the "other"
    # (candidate) mutations, ";"-joined, with "WT" filling empty cells.
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    drugs = set()
    rv2drugs = {}
    drugs2rv = {}
    # BED columns used here: row[3] (presumably the locus tag — keys later
    # looked up via var["locus_tag"]) and row[5], a comma-separated drug list.
    for line in open(conf["bed"]):
        row = line.strip().split()
        rv2drugs[row[3]] = row[5].split(",")
        for drug in row[5].split(","):
            drugs.add(drug)
            drugs2rv[drug] = row[3]
    drugs = sorted(list(drugs))
    sys.stdout.write("sample,lineage,sublineage,drtype,%s\n" % (",".join(
        ["dr_mutations_%s,other_mutations_%s" % (d, d) for d in drugs])))
    for s in tqdm(samples):
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        mutations = defaultdict(set)
        for var in data["dr_variants"] + data["other_variants"]:
            # Synonymous variants are excluded from both column families.
            if var["type"] == "synonymous": continue
            if "drugs" in var:
                # DR variant: file it under each associated drug.
                for d in var["drugs"]:
                    mutations["dr_mutations_" + d["drug"]].add(var["gene"] + "_" + var["change"])
            else:
                # Non-DR variant: map through its locus tag; skip loci the
                # BED file does not know about.
                if var["locus_tag"] not in rv2drugs: continue
                tmp_drugs = rv2drugs[var["locus_tag"]]
                for drug in tmp_drugs:
                    mutations["other_mutations_" + drug].add(var["gene"] + "_" + var["change"])
        # Fill empty cells with "WT" so every column is populated.
        for k in ["dr_mutations_%s" % (d) for d in drugs
                  ] + ["other_mutations_%s" % (d) for d in drugs]:
            if len(mutations[k]) == 0:
                mutations[k].add("WT")
        sys.stdout.write(
            "%s,%s,%s,%s,%s\n" %
            (s, data["main_lin"], data["sublin"], data["drtype"], ",".join([
                "%s,%s" % (";".join(mutations["dr_mutations_" + d]), ";".join(
                    mutations["other_mutations_" + d])) for d in drugs
            ])))
def main(args):
    # Count pairwise co-occurrence of mutations across samples and, for every
    # ordered pair, print a 2x2 association summary (odds ratio, OR p-value,
    # nominal association p-value) using statsmodels.
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # mutations[m1][m2] = number of samples carrying both m1 and m2;
    # the diagonal mutations[m][m] therefore holds per-mutation totals.
    mutations = defaultdict(lambda: defaultdict(int))
    gene_set = set(args.genes.split(",")) if args.genes else None
    for s in tqdm(samples):
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var1 in (data["dr_variants"] + data["other_variants"]):
            # Optional gene filter (by gene name or locus tag) and optional
            # exclusion of synonymous variants — applied to both pair members.
            if args.genes and (var1["gene"] not in gene_set
                               and var1["locus_tag"] not in gene_set):
                continue
            if not args.synonymous and var1["type"] == "synonymous":
                continue
            for var2 in (data["dr_variants"] + data["other_variants"]):
                if args.genes and (var2["gene"] not in gene_set
                                   and var2["locus_tag"] not in gene_set):
                    continue
                if not args.synonymous and var2["type"] == "synonymous":
                    continue
                mutations[var1["gene"] + "_" + var1["change"]][var2["gene"] + "_" + var2["change"]] += 1
    for mut1 in tqdm(mutations):
        for mut2 in mutations[mut1]:
            total_mut1 = mutations[mut1][mut1]
            total_mut2 = mutations[mut2][mut2]
            # 2x2 contingency table of sample counts:
            #   t[0][0] = both, t[1][0] = mut1 only,
            #   t[0][1] = mut2 only, t[1][1] = neither.
            t = [[0, 0], [0, 0]]
            t[0][0] = mutations[mut1][mut2]
            t[1][0] = total_mut1 - t[0][0]
            t[0][1] = total_mut2 - t[0][0]
            t[1][1] = len(samples) - t[0][1] - t[1][0] - t[0][0]
            t2 = sm.stats.Table2x2(np.asarray(t))
            # NOTE(review): t holds ints built above, so t[0] == [0.5, 0.5]
            # can never be true here — this guard looks aimed at Table2x2's
            # zero-cell 0.5-shift; confirm the intended condition.
            OR = t2.oddsratio if t[0] != [0.5, 0.5] else "NA"
            OR_pval = t2.oddsratio_pvalue() if t[0] != [0.5, 0.5] else "NA"
            pval = t2.test_nominal_association().pvalue
            print(
                "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                (mut1, mut2, total_mut1, total_mut2, mutations[mut1][mut2],
                 OR, OR_pval, pval), t)
def main_collate(args):
    """Collect the clade call from every result file into <out>.clades.csv."""
    samples = [
        f.replace(args.suffix, "") for f in os.listdir(args.dir)
        if f[-len(args.suffix):] == args.suffix
    ]
    lineages = {}
    sys.stderr.write("Loading data\n")
    for sample in tqdm(samples):
        # Each file has the structure of a .result.json document.
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        lineages[sample] = result["clade"]
    with open(args.out + ".clades.csv", "w") as out:
        out.write("isolate,clade\n")
        for sample in samples:
            out.write("%s,%s\n" % (sample, lineages[sample]))
def main(args):
    """Print sample, main lineage and sublineage as TSV on stdout."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        sys.stdout.write("%s\t%s\t%s\n" %
                         (sample, result["main_lin"], result["sublin"]))
def main(args):
    """Tabulate pairwise co-occurrence of mutations within each drug's haplotypes."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Build drug<->gene maps from the database BED file.
    drug2genes = defaultdict(set)
    gene2drugs = defaultdict(set)
    for l in open(conf["bed"]):
        row = l.strip().split()
        for drug in row[5].split(","):
            drug2genes[drug].add(row[4])
            gene2drugs[row[4]].add(drug)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # drug -> haplotype (tuple of (gene, change)) -> number of samples.
    drug_haplotypes = {drug: defaultdict(int) for drug in drug2genes}
    for s in tqdm(samples):
        sample_drug_haplotypes = {drug: set() for drug in drug2genes}
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"]:
            # FIX: the original indexed with the loop variable `drug` leaked
            # from the BED-parsing loop above, filing every DR variant under
            # whichever drug the BED file happened to list last. File it under
            # the drugs its gene maps to, mirroring the other_variants loop.
            for drug in gene2drugs[var["gene"]]:
                sample_drug_haplotypes[drug].add((var["gene"], var["change"]))
        for var in data["other_variants"]:
            if var["type"] == "synonymous":
                continue
            for drug in gene2drugs[var["gene"]]:
                sample_drug_haplotypes[drug].add((var["gene"], var["change"]))
        for drug in drug2genes:
            if len(sample_drug_haplotypes[drug]) > 0:
                drug_haplotypes[drug][tuple(sample_drug_haplotypes[drug])] += 1
    for drug in drug2genes:
        mutations = set(chain(*drug_haplotypes[drug]))
        for m1 in tqdm(mutations):
            for m2 in mutations:
                # 2x2 haplotype-count table: rows = m1 absent/present,
                # columns = m2 absent/present.
                table = [[0, 0], [0, 0]]
                for haplotype in drug_haplotypes[drug]:
                    n = drug_haplotypes[drug][haplotype]
                    if m1 not in haplotype and m2 not in haplotype:
                        table[0][0] += n
                    if m1 in haplotype and m2 not in haplotype:
                        table[1][0] += n
                    # FIX: this branch previously incremented table[1][0] a
                    # second time, leaving table[0][1] permanently zero.
                    if m1 not in haplotype and m2 in haplotype:
                        table[0][1] += n
                    if m1 in haplotype and m2 in haplotype:
                        table[1][1] += n
                print("%s_%s - %s_%s" % (m1[0], m1[1], m2[0], m2[1]), table)
def main(args):
    """Classify each sample into a drug-resistance category and write a CSV."""
    bed_file = "%s/share/tbprofiler/%s.bed" % (sys.base_prefix, args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(bed_file)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    # Drug groups used by the classification: fluoroquinolones and
    # second-line injectables.
    FLQ_set = set(["moxifloxacin", "levofloxacin", "ciprofloxacin", "ofloxacin"])
    SLI_set = set(["amikacin", "capreomycin", "kanamycin"])
    OUT = open(args.out, "w")
    writer = csv.DictWriter(OUT, fieldnames=["sample", "dr-class"])
    writer.writeheader()
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        resistant_drugs = set()
        for var in result["dr_variants"]:
            for d in var["drugs"]:
                resistant_drugs.add(d["drug"])
        rif = "rifampicin" in resistant_drugs
        inh = "isoniazid" in resistant_drugs
        flq = bool(FLQ_set & resistant_drugs)
        sli = bool(SLI_set & resistant_drugs)
        # Classification ladder: Sensitive -> Pre-MDR (exactly one of RIF/INH)
        # -> MDR -> Pre-XDR (exactly one of FLQ/SLI) -> XDR -> Other.
        if not resistant_drugs:
            drtype = "Sensitive"
        elif rif != inh:
            drtype = "Pre-MDR"
        elif rif and inh and not flq and not sli:
            drtype = "MDR"
        elif rif and inh and (flq != sli):
            drtype = "Pre-XDR"
        elif rif and inh and flq and sli:
            drtype = "XDR"
        else:
            drtype = "Other"
        writer.writerow({"sample": sample, "dr-class": drtype})
    OUT.close()
def main(args):
    """Print median frequency and rank-sum annotations for one gene/variant across samples."""
    # Get a dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    variant_freqs = []
    rprs = []
    bqrs = []
    mqrs = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if (var["locus_tag"] == args.gene or var["gene"] == args.gene) \
                    and var["change"] == args.variant:
                variant_freqs.append(var["freq"])
                # Rank-sum annotations may be missing or non-numeric ("."),
                # in which case they are skipped for this sample.
                # FIX: catch the specific exceptions instead of a bare except,
                # which also hid unrelated bugs.
                try:
                    rprs.append(
                        float(var["variant_annotations"]["ReadPosRankSum"]))
                    bqrs.append(
                        float(var["variant_annotations"]["BaseQRankSum"]))
                    mqrs.append(float(var["variant_annotations"]["MQRankSum"]))
                except (KeyError, ValueError, TypeError):
                    pass
    def _median(values):
        # FIX: the annotation lists can be empty even when the variant was
        # seen; statistics.median on an empty list raised StatisticsError.
        return statistics.median(values) if values else "NA"
    if len(variant_freqs) > 0:
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s" %
              (args.gene, args.variant, len(variant_freqs),
               statistics.median(variant_freqs), _median(rprs), _median(bqrs),
               _median(mqrs)))
    else:
        print("%s\t%s\tNA\tNA\tNA\tNA\tNA" % (args.gene, args.variant))
def main(args):
    # Migrate result files: merge per-drug duplicate records of the same
    # dr_variant into one record carrying a "drugs" list, writing the updated
    # JSON into out_dir.
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results direcotry
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.in_dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)
    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        # Key: the variant record minus its drug-specific fields, serialized
        # with json.dumps so it is hashable; value: per-drug detail dicts.
        new_dr_variants = defaultdict(list)
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.in_dir, s, args.suffix))))
        for var in data["dr_variants"]:
            # Work on a deep copy so the drug-specific keys can be deleted
            # without mutating the record still referenced by `data`.
            tmp = copy.deepcopy(var)
            del tmp["drug"]
            x = {"drug": var["drug"]}
            # "confidence" and "literature" are drug-specific, so they move
            # into the per-drug dict as well (when present).
            if "confidence" in tmp:
                del tmp["confidence"]
                x["confidence"] = var["confidence"]
            if "literature" in tmp:
                del tmp["literature"]
                x["literature"] = var["literature"]
            new_dr_variants[json.dumps(tmp)].append(x)
        # Rebuild dr_variants: one record per unique variant, with the
        # accumulated per-drug dicts attached under "drugs".
        data["dr_variants"] = []
        for x in new_dr_variants:
            new_var = json.loads(x)
            new_var["drugs"] = []
            for d in new_dr_variants[x]:
                new_var["drugs"].append(d)
            data["dr_variants"].append(new_var)
        json.dump(data, open("%s/%s%s" % (args.out_dir, s, args.suffix), "w"))
def main(args):
    # Extract the consensus sequence of one gene, per sample, by applying the
    # sample's VCF to the reference with bcftools consensus — but only for
    # samples whose copy of the gene has no disruptive variant.
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results direcotry
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.vcf_suffix, "") for x in os.listdir(args.vcf_dir)
            if x[-len(args.vcf_suffix):] == args.vcf_suffix
        ]
    # Scan the GFF for the requested gene record; `row` keeps the matching
    # line after the break. NOTE(review): if the gene is never found, `row`
    # holds the last GFF line and the coordinates below are wrong — confirm
    # the gene always exists in the annotation.
    for l in open(conf["gff"]):
        row = l.strip().split()
        if len(row) <= 2:
            continue
        if row[2] != "gene":
            continue
        if "Name=%s" % args.gene in l or "gene:%s" % args.gene in l:
            break
    # GFF columns 4/5 are the 1-based start/end coordinates.
    start, end = int(row[3]), int(row[4])
    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        if not os.path.isfile("%s/%s%s" % (args.dir, s, args.suffix)):
            continue
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        # Serialize the gene's variants so disruptive types can be detected
        # with simple substring checks on the JSON text.
        vars = json.dumps([
            d for d in data["dr_variants"] + data["other_variants"]
            if d["locus_tag"] == args.gene
        ])
        print(vars)
        if "deletion" not in vars and "frameshift" not in vars and "inframe" not in vars and "stop" not in vars and "start" not in vars:
            # Reverse-complement genes on the minus strand (GFF column 7).
            revseq = "| revseq -sequence /dev/stdin -outseq /dev/stdout" if row[
                6] == "-" else ""
            pp.run_cmd(
                "samtools faidx %s Chromosome:%s-%s | bcftools consensus %s/%s%s %s | sed 's/^>.*/>%s/' > %s.%s.fasta"
                % (conf["ref"], start, end, args.vcf_dir, s, args.vcf_suffix,
                   revseq, s, s, args.gene),
                verbose=1)
def main(args):
    """Collect per-variant annotation dicts across samples into a CSV at args.out."""
    # Get a dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # Loop through the sample result files.
    annotations = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            if "variant_annotations" in var:
                # Normalise the VCF missing-value marker to "NA".
                for x in var["variant_annotations"]:
                    if var["variant_annotations"][x] == ".":
                        var["variant_annotations"][x] = "NA"
                var["variant_annotations"]["sample"] = s
                var["variant_annotations"]["frequency"] = var["freq"]
                var["variant_annotations"]["gene"] = var["gene"]
                var["variant_annotations"]["change"] = var["change"]
                annotations.append(var["variant_annotations"])
    with open(args.out, "w") as O:
        # FIX: guard the empty case — `annotations[0]` raised IndexError when
        # no sample carried any variant_annotations; now an empty file is
        # produced instead.
        if annotations:
            writer = csv.DictWriter(O, fieldnames=list(annotations[0]))
            writer.writeheader()
            writer.writerows(annotations)
def main(args):
    """Dump (gene, change, genome_pos) rows for every variant seen across samples."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    # (gene, change) -> list of genome positions observed for that variant.
    aa2genome_pos = defaultdict(list)
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        if args.dr_only:
            pool = result["dr_variants"]
        else:
            pool = result["dr_variants"] + result["other_variants"]
        for var in pool:
            aa2genome_pos[(var["gene"], var["change"])].append(var["genome_pos"])
    with open(args.out, "w") as out:
        for gene, change in aa2genome_pos:
            for pos in aa2genome_pos[(gene, change)]:
                out.write("%s\t%s\t%s\n" % (gene, change, pos))
def main(args):
    """Write a CSV of DR mutations with their associated drugs and carrier counts."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    # (gene, change) -> carrier samples, and -> set of associated drugs.
    mutations = defaultdict(list)
    mutation2drugs = defaultdict(set)
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        for var in result["dr_variants"]:
            key = (var["gene"], var["change"])
            mutations[key].append(sample)
            for d in var["drugs"]:
                mutation2drugs[key].add(d["drug"])
    rows = []
    for key in mutations:
        gene, change = key
        rows.append({
            "gene": gene,
            "variant": change,
            "drug": ",".join(mutation2drugs[key]),
            "total_sample_fraction": len(mutations[key]),
        })
    with open(args.out, "w") as out:
        writer = csv.DictWriter(out, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)
def main(args):
    """Print, per sample, the mutations found in the requested gene(s) as CSV on stdout."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [line.rstrip() for line in open(args.samples).readlines()]
    else:
        samples = [
            f.replace(args.suffix, "") for f in os.listdir(args.dir)
            if f[-len(args.suffix):] == args.suffix
        ]
    # Optional set of changes to suppress from the output.
    if args.blacklist:
        blacklist = set(l.strip() for l in open(args.blacklist).readlines())
    else:
        blacklist = []
    gene_set = set(args.gene.split(","))
    sys.stdout.write("sample,%s\n" % args.gene)
    for sample in tqdm(samples):
        result = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, sample, args.suffix))))
        found = []
        for var in result["dr_variants"] + result["other_variants"]:
            # Keep only variants in the requested genes (by name or locus
            # tag), honouring the synonymous flag and the blacklist.
            if var["gene"] not in gene_set and var["locus_tag"] not in gene_set:
                continue
            if not args.synonymous and var["type"] == "synonymous":
                continue
            if var["change"] in blacklist:
                continue
            found.append("%s_%s" % (var["gene"], var["change"]))
        sys.stdout.write("%s,%s\n" % (sample, ";".join(found)))
def main(args):
    """Compare DR missense calls across sequencing runs of the same biological sample."""
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Explicit sample list, or scan the results directory.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(args.suffix, "") for x in os.listdir(args.dir)
            if x[-len(args.suffix):] == args.suffix
        ]
    # biological sample -> run name -> set of (gene, change) missense calls.
    sample_mutations = defaultdict(lambda: defaultdict(set))
    for s in tqdm(samples):
        biological_sample_name = ""
        # FIX: regex patterns are now raw strings — the originals relied on
        # "\d" surviving as an unrecognised escape, which raises a
        # DeprecationWarning and is slated to become a SyntaxError.
        re_obj = re.search(r"H37Rv[\d]_mq[\d]+_bq[\d]+", s)
        if re_obj:
            biological_sample_name = "H37Rv"
        re_obj = re.search(r"(por[\d]+).*_mq[\d]+_bq[\d]+", s)
        if re_obj:
            biological_sample_name = re_obj.group(1)
        data = json.load(
            open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        for var in data["dr_variants"]:
            if var["type"] == "missense":
                sample_mutations[biological_sample_name][s].add(
                    (var["gene"], var["change"]))
    for s in sample_mutations:
        # Union of the calls across all runs of this biological sample.
        union_mutations = set(
            chain(*[sample_mutations[s][run] for run in sample_mutations[s]]))
        for run in sample_mutations[s]:
            # FIX: dropped the unused `diff` local (computed, never read).
            print("%s\t%s\t%s\t%s" %
                  (s, run, len(union_mutations), len(sample_mutations[s][run])))
def main(args):
    # Find every sample carrying a given gene/variant, then compute the
    # median GATK ReadPosRankSum at the variant's genome position from each
    # carrier's VCF + BAM, and print a single summary TSV line.
    # Get a dictionary with the database file: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Get a dictionary mapping the locus_tags to drugs: {"Rv1484": ["isoniazid","ethionamide"], ... etc. }
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # If a list of samples is supplied through the args object, store it in a list else get the list from looking in the results direcotry
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix,"") for x in os.listdir(args.results_dir) if x[-len(args.suffix):]==args.suffix]
    # Loop through the sample result files
    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.results_dir,s,args.suffix))))
        for var in data["dr_variants"] + data["other_variants"]:
            # Match on gene name or locus tag, plus the exact change string.
            if (var["gene"]==args.gene or var["locus_tag"]==args.gene) and var["change"]==args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])
    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    # samples_with_mutation = ["ERR2515541","ERR2510504","ERR2864225","SRR7341698"]
    # Bail out early when the variant is absent or maps to several positions.
    if len(samples_with_mutation)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Mutation_not_found"))
        quit()
    elif len(variant_position_set)>1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"Multiple_genome_pos"))
        quit()
    if len(variant_position_set)==1:
        variant_position = int(list(variant_position_set)[0])
        sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")
    # variant_position = 3841662
    # Reuse the argparse namespace as the %-format parameter dict for the
    # shell commands below (so vcf_dir, bam_dir, bam_extension come from args).
    params = vars(args)
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")
    read_pos_rank_sums = []
    for s in tqdm(samples_with_mutation):
        params["sample"] = s
        # Slice the sample's VCF down to the variant position, then have GATK
        # annotate ReadPosRankSum from the BAM and parse it back out.
        pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params,verbose=0)
        pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params,verbose=0)
        pp.run_cmd("tabix -f %(tmp_vcf)s" % params,verbose=0)
        for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params,verbose=0):
            row = l.strip().split()
            # "." means GATK produced no annotation for this record.
            if row[1]==".": continue
            if int(row[0])==variant_position:
                read_pos_rank_sums.append((s,float(row[1])))
    if len(read_pos_rank_sums)==0:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,"No_values_from_samples"))
    else:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene,args.variant,statistics.median([x[1] for x in read_pos_rank_sums])))
    # Clean up the temporary per-sample VCF slice.
    pp.rm_files([params["tmp_vcf"]])
def main(args):
    """Export TB-Profiler results as CSV tables plus Cypher load commands.

    Builds node and edge tables (samples, variants, genes, drugs, lineages,
    spoligotypes and their relations) from the per-sample result JSONs,
    writes them as CSV files in the current directory, and emits a
    Cypher_commands.txt script that LOAD CSVs them into a Neo4j graph.
    """
    # Database bundle for the chosen --db: {"ref": "/path/to/fasta", ...}
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # locus_tag -> [drug, ...] mapping — not used below but kept for parity
    # with the other sub-command mains.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Sample list: an explicit file wins, otherwise scan the results dir.
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(args.suffix, "") for x in os.listdir(args.dir) if x[-len(args.suffix):] == args.suffix]
    # Optional per-sample metadata CSV keyed on its "wgs_id" column.
    if args.meta:
        meta = {}
        for row in csv.DictReader(open(args.meta)):
            meta[row["wgs_id"]] = row
    # Accumulators for the graph tables.
    sample_nodes = []
    tmp_variant_nodes = []
    sample_variant_edges = []
    drugs = set()
    lineage_nodes = set()
    tmp_variant_drug_edges = []
    sample_lineage_edges = []
    spoligotype_nodes = set()
    sample_spoligotype_edges = []
    # Optional spoligotype CSV with "sample" and "spoligotype" columns.
    if args.spoligotypes:
        spoligotypes = {}
        for row in csv.DictReader(open(args.spoligotypes)):
            spoligotypes[row["sample"]] = row["spoligotype"]
            sample_spoligotype_edges.append({"id": row["sample"], "spoligotype": row["spoligotype"]})
            spoligotype_nodes.add(row["spoligotype"])
    # Loop through the sample result files
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        data = json.load(open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))))
        sample_node = {
            "id": s,
            "drtype": data["drtype"],
            "lineage": data["sublin"],
            "lineageInfo": json.dumps(data["lineage"]),
            # NOTE(review): "pct_reads_mapped" appears twice in this literal;
            # the second duplicate silently overwrites the first — one of them
            # was probably meant to be a different QC metric. TODO confirm.
            "qc": json.dumps({"pct_reads_mapped": data["qc"]["pct_reads_mapped"], "pct_reads_mapped": data["qc"]["pct_reads_mapped"], "gene_coverage": []}),
            "pipeline": json.dumps(data["pipeline"]),
            "tbprofilerVersion": json.dumps(data["tbprofiler_version"]),
            "dbVersion": json.dumps(data["db_version"]),
        }
        lineage_nodes.add(data["sublin"])
        sample_lineage_edges.append({"sampleId": s, "lineage": data["sublin"]})
        if args.meta:
            # The first metadata row's columns define the schema; samples
            # missing from the metadata get "NA" for every field.
            # NOTE(review): list(meta.values())[0] raises IndexError on an
            # empty metadata file — assumes at least one row.
            for c in list(meta.values())[0]:
                if c == "id":
                    continue
                d = camel_case(c)
                sample_node[d] = meta[s][c] if s in meta else "NA"
        if args.spoligotypes:
            sample_node["spoligotype"] = spoligotypes.get(s, "NA")
        sample_nodes.append(sample_node)
        for var in data["dr_variants"] + data["other_variants"]:
            # if var["type"]=="synonymous": continue
            variant_id = "%s_%s" % (var["locus_tag"], var["change"])
            sample_variant_edges.append(
                {
                    "sampleId": s,
                    "variantId": variant_id,
                    "freq": var["freq"],
                    "genome_pos": var["genome_pos"],
                    "nucleotideChange": var.get("nucleotide_change", "NA"),
                    "internalChange": var.get("_internal_change", "NA")
                }
            )
            tmp_variant_nodes.append(
                {
                    "id": variant_id,
                    "type": var["type"].replace("*", ""),
                    "change": var["change"],
                    "gene": var["gene"],
                    "locus_tag": var["locus_tag"],
                }
            )
            # Variants without drug annotations lack the "drugs" key.
            if "drugs" in var:
                for d in var["drugs"]:
                    drugs.add(d["drug"])
                    tmp_variant_drug_edges.append(
                        {
                            "variantId": variant_id,
                            "drug": d["drug"]
                        }
                    )
    # Materialise node tables and deduplicate edge/variant tables.
    drug_nodes = [{"id": d} for d in drugs]
    lineage_nodes = [{"id": d} for d in lineage_nodes]
    spoligotype_nodes = [{"id": d} for d in spoligotype_nodes]
    variant_drug_edges = uniq_dict_list(tmp_variant_drug_edges)
    variant_nodes = uniq_dict_list(standardise_types(tmp_variant_nodes))

    def batch(iterable, n=1):
        # Yield successive n-sized chunks of iterable (last may be shorter).
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]

    # ---- CSV outputs (one file per table; the big sample-variant edge
    # table is chunked into files of 10000 rows) ----
    with open("sample_nodes.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_nodes[0]))
        writer.writeheader()
        writer.writerows(sample_nodes)
    with open("variant_nodes.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_nodes[0]))
        writer.writeheader()
        writer.writerows(variant_nodes)
    for i, x in enumerate(batch(list(range(len(sample_variant_edges))), 10000)):
        with open("sample_variant_edges.%s.csv" % i, "w") as O:
            writer = csv.DictWriter(O, fieldnames=list(sample_variant_edges[0]))
            writer.writeheader()
            for j in x:
                writer.writerow(sample_variant_edges[j])
    with open("drug_nodes.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(drug_nodes[0]))
        writer.writeheader()
        writer.writerows(drug_nodes)
    with open("variant_drug_edges.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(variant_drug_edges[0]))
        writer.writeheader()
        writer.writerows(variant_drug_edges)
    with open("lineage_nodes.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(lineage_nodes[0]))
        writer.writeheader()
        writer.writerows(lineage_nodes)
    with open("sample_lineage_edges.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_lineage_edges[0]))
        writer.writeheader()
        writer.writerows(sample_lineage_edges)
    with open("spoligotype_nodes.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(spoligotype_nodes[0]))
        writer.writeheader()
        writer.writerows(spoligotype_nodes)
    with open("sample_spoligotype_edges.csv", "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(sample_spoligotype_edges[0]))
        writer.writeheader()
        writer.writerows(sample_spoligotype_edges)
    # ---- Cypher script: optional index creation, then one LOAD CSV block
    # per table. Property lists are generated from each table's first row. ----
    with open("Cypher_commands.txt", "w") as O:
        if args.index:
            O.write("CREATE INDEX FOR (n:Sample) ON (n.id);\n")
            O.write("CREATE INDEX FOR (n:SRA) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
        O.write("CREATE (s:Sample:SRA {%s});\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in sample_nodes[0]]))
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Country) ON (n.id);\n")
        # Country nodes come from the optional countryCode metadata column.
        O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
        O.write("WITH csvLine WHERE NOT csvLine.countryCode IS null\n")
        O.write("MERGE (s:Sample {id:csvLine.id})\n")
        O.write("MERGE (c:Country {id:csvLine.countryCode})\n")
        O.write("CREATE (s) -[:COLLECTED_IN]-> (c);\n")
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Variant) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
        O.write("CREATE (v:Variant {%s});\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in variant_nodes[0]]))
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Gene) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
        O.write("MERGE (v:Variant {id:csvLine.id})\n")
        O.write("MERGE (g:Gene {id:csvLine.locus_tag, locusTag:csvLine.locus_tag, name:csvLine.gene})\n")
        O.write("CREATE (v) -[:IN_GENE]-> (g);\n")
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Drug) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///drug_nodes.csv' AS csvLine\n")
        O.write("CREATE (d:Drug {%s});\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in drug_nodes[0]]))
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Lineage) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///lineage_nodes.csv' AS csvLine\n")
        O.write("CREATE (:Lineage {%s});\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in lineage_nodes[0]]))
        O.write("\n")
        # One LOAD CSV per 10k-row edge chunk written above.
        for i, x in enumerate(batch(list(range(len(sample_variant_edges))), 10000)):
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_variant_edges.%s.csv' AS csvLine\n" % i)
            O.write("MATCH (s:Sample {id: csvLine.sampleId}),(v:Variant {id:csvLine.variantId})\n")
            O.write("CREATE (s) -[:CONTAINS {%s}]-> (v);\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in sample_variant_edges[0]]))
            O.write("\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_drug_edges.csv' AS csvLine\n")
        O.write("MATCH (v:Variant {id: csvLine.variantId}),(d:Drug {id:csvLine.drug})\n")
        O.write("CREATE (v) -[:CONFERS_RESISTANCE]-> (d);\n")
        O.write("\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_lineage_edges.csv' AS csvLine\n")
        O.write("MATCH (s:Sample {id: csvLine.sampleId}),(l:Lineage {id:csvLine.lineage})\n")
        O.write("CREATE (s) -[:LINEAGE]-> (l);\n")
        O.write("\n")
        if args.index:
            O.write("CREATE INDEX FOR (n:Spoligotype) ON (n.id);\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///spoligotype_nodes.csv' AS csvLine\n")
        O.write("CREATE (:Spoligotype {%s});\n" % ", ".join(["%s: csvLine.%s" % (d, d) for d in spoligotype_nodes[0]]))
        O.write("\n")
        O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_spoligotype_edges.csv' AS csvLine\n")
        O.write("MATCH (s:Sample {id: csvLine.id}),(l:Spoligotype {id:csvLine.spoligotype})\n")
        O.write("CREATE (s) -[:SPOLIGOTYPE]-> (l);\n")
        O.write("\n")
def main(args):
    """Convert TB-Profiler per-sample variant calls into BED-like annotation lines.

    Every DR/other variant in each sample's result JSON is normalised to a
    (position, ref, alt) key, deduplicated across samples, and printed as a
    tab-separated line on the "Chromosome" contig, sorted by the first
    genomic position of each key.
    """
    # Get a dictionary with the database files: {"ref": "/path/to/fasta", ...}
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # locus_tag -> [drug, ...] mapping — not used below but kept for parity
    # with the other sub-command mains.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])
    # Sample list: an explicit file wins, otherwise scan the results dir.
    if args.samples:
        with open(args.samples) as f:
            samples = [x.rstrip() for x in f]
    else:
        # Strip only the trailing suffix (str.replace would also delete the
        # suffix substring from the middle of a sample name).
        samples = [x[:-len(args.suffix)] for x in os.listdir(args.dir) if x[-len(args.suffix):] == args.suffix]
    # Pre-compiled patterns. Raw strings fix the invalid "\-" escape (a
    # SyntaxWarning today, a SyntaxError in future Pythons) and the dots are
    # escaped so e.g. "pX123" can no longer satisfy the protein-change test.
    re_rna_change = re.compile(r"r\.[0-9]+")
    re_protein_change = re.compile(r"p\.[A-Za-z]+")
    re_coding_change = re.compile(r"c\.[0-9]+")
    re_promoter_change = re.compile(r"c\.-[0-9]+")
    re_nucleotide = re.compile(r"([0-9]+)([ACGT]+)>([ACGT]+)")
    re_promoter_alleles = re.compile(r"c\.-[0-9]+([ACGT]+)>([ACGT]+)")
    re_internal_alleles = re.compile(r"[0-9]+([ACGT]+)>([ACGT]+)")
    # (pos, ref, alt) -> list of JSON-encoded annotation payloads.
    mutations = defaultdict(list)
    # Loop through the sample result files.
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files.
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as f:
            data = json.load(f)
        for var in data["dr_variants"] + data["other_variants"]:
            # rRNA-style changes (r.NNN...) are only meaningful on rrs/rrl.
            if var["gene"] != "rrs" and var["gene"] != "rrl" and re_rna_change.search(var["change"]):
                continue
            if "nucleotide_change" in var and (re_protein_change.search(var["change"]) or re_coding_change.search(var["change"]) or re_promoter_change.search(var["change"])):
                # Compound changes are "+"-joined; decompose each fragment
                # once and re-join positions/alleles as comma-separated lists.
                matches = [re_nucleotide.search(x) for x in var["nucleotide_change"].split("+")]
                pos = ",".join(m.group(1) for m in matches)
                ref = ",".join(m.group(2) for m in matches)
                alt = ",".join(m.group(3) for m in matches)
            elif var["type"] == "non_coding" and re_promoter_change.search(var["change"]):
                # Promoter variant: alleles come from the change string itself.
                re_obj = re_promoter_alleles.search(var["change"])
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif var["type"] == "non_coding" and re_rna_change.search(var["change"]):
                # rRNA variant: alleles come from the internal change string.
                re_obj = re_internal_alleles.search(var["_internal_change"])
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif var["type"] == "large_deletion":
                continue
            elif var["type"].replace("*", "") in ("synonymous", "frameshift&start_lost", "missense&inframe_altering", "stop_lost", "stop_retained"):
                # Variant classes that are deliberately not exported.
                continue
            else:
                # Unhandled variant type: abort loudly with the offending record.
                sys.exit(var)
            mutations[(pos, ref, alt)].append(json.dumps({
                "genome_pos": pos,
                "type": var["type"].replace("*", ""),
                "locus_tag": var["locus_tag"],
                "gene": var["gene"],
                "_internal_change": var["_internal_change"],
                "change": var["change"]
            }))
    # Emit one line per unique (key, annotation) pair, ordered by the first
    # genomic position of each key.
    for key in sorted(mutations, key=lambda x: int(x[0].split(",")[0])):
        for x in set(mutations[key]):
            var = json.loads(x)
            print("Chromosome\t%s\t%s\t%s\t%s|%s|%s|%s|%s|%s|%s" % (var["genome_pos"], key[1], key[2], var["type"], var["locus_tag"], var["gene"], "NA", "NA", var["_internal_change"], var["change"]))