def main(args):
    """Print the names of samples whose result file has an empty lineage.

    Args:
        args: Parsed command-line namespace. Reads ``db`` (database prefix),
            ``samples`` (optional path to a file with one sample name per
            line), ``dir`` (results directory) and ``suffix`` (result file
            suffix).

    Output:
        The matching sample names are written to stdout, one per line.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    # Use the supplied sample list if there is one, otherwise derive the
    # sample names from the files found in the results directory.
    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences of the suffix in the middle of a file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    null_lineage_samples = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        if data["lineage"] == []:
            null_lineage_samples.append(s)
    print("\n".join(null_lineage_samples))
def main(args):
    """Report samples with several Rv2043c resistance variants, one a frameshift.

    For every sample the resistance variants located in locus ``Rv2043c`` are
    collected. The sample name and those variants are printed when there is
    more than one of them and at least one has a type containing
    ``frameshift``.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir`` and ``suffix``.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)

        variants = [v for v in data["dr_variants"] if v["locus_tag"] == "Rv2043c"]
        vartypes = [v["type"].replace("*", "") for v in variants]

        # The previous check looked at the loop variable left over from the
        # iteration, i.e. the last resistance variant of *any* gene. Test the
        # collected Rv2043c variant types instead.
        if len(variants) > 1 and any("frameshift" in t for t in vartypes):
            print(s, variants)
def main(args):
    """Print the frequency of every fabG1 ``c.663C>A`` call found in the samples.

    Both the resistance variants and the other variants of each sample are
    searched; one frequency value is printed per matching variant.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir`` and ``suffix``.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            if var["gene"] == "fabG1" and var["change"] == "c.663C>A":
                print(var["freq"])
def main(args):
    """Write a presence/absence matrix of variants across samples.

    The output is a comma-separated file with a ``Gene,Variant`` prefix
    followed by one column per sample. Each row holds ``1`` where the sample
    carries the (gene, change) pair and ``0`` otherwise.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix`` and ``out``
            (output file path).
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    # (gene, change) -> set of samples carrying it. A set keeps the membership
    # test below O(1) instead of scanning a list for every matrix cell.
    variants = defaultdict(set)
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            variants[(var["gene"], var["change"])].add(s)

    with open(args.out, "w") as O:
        O.write("Gene,Variant,%s\n" % ",".join(samples))
        for (gene, change), carriers in variants.items():
            flags = ",".join("1" if s in carriers else "0" for s in samples)
            O.write("%s,%s,%s\n" % (gene, change, flags))
def main(args):
    """Convert the ``drugs`` field of each resistance variant from dict to list.

    A variant whose ``drugs`` value is a mapping ``{drug: {key: value, ...}}``
    is rewritten as ``[{"drug": drug, key: value, ...}, ...]``. Variants whose
    ``drugs`` value is already a list are left untouched. Every sample is
    written to ``out_dir`` under the same file name.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``in_dir``, ``out_dir`` and
            ``suffix``.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.in_dir)
            if name.endswith(args.suffix)
        ]

    # Make sure the destination exists before the first write.
    os.makedirs(args.out_dir, exist_ok=True)

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.in_dir, s, args.suffix))) as handle:
            data = json.load(handle)

        for var in data["dr_variants"]:
            if isinstance(var["drugs"], list):
                continue  # already in the new format
            converted = []
            for drug, details in var["drugs"].items():
                entry = {"drug": drug}
                # Copied after "drug" so a "drug" key inside the details wins,
                # exactly as before.
                for key in details:
                    entry[key] = details[key]
                converted.append(entry)
            var["drugs"] = converted

        # Close the output explicitly so the JSON is flushed to disk.
        with open("%s/%s%s" % (args.out_dir, s, args.suffix), "w") as handle:
            json.dump(data, handle)
def main(args):
    """Write one iTOL pie-chart dataset file per drug.

    Each resistance variant is classified as ``SNP``, ``indel``,
    ``large_deletion`` or (for any other type) ``complex``. For every drug seen
    in the data a file ``<drug>.itol.conf.txt`` is written in the working
    directory with one data row per sample: the SNP, indel and large-deletion
    counts followed by a ``no_variant`` flag.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir`` and ``suffix``.
    """
    # Variant type -> pie-chart category. Types missing here count as "complex".
    mapping = {
        "missense": "SNP",
        "non_coding": "SNP",
        "stop_gained": "SNP",
        "start_lost": "SNP",
        "frameshift": "indel",
        "inframe_deletion": "indel",
        "inframe_insertion": "indel",
        "large_deletion": "large_deletion",
    }
    categories = ["SNP", "indel", "large_deletion"]

    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    # drug -> sample -> list of variant categories
    resistance = defaultdict(lambda: defaultdict(list))
    for s in tqdm(samples):
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"]:
            resistance[var["drug"]][s].append(mapping.get(var["type"], "complex"))

    for drug, per_sample in resistance.items():
        lines = [
            "DATASET_PIECHART",
            "SEPARATOR COMMA",
            "DATASET_LABEL,%s" % drug,
            "COLOR,#ff0000",
            "FIELD_COLORS,#ff0000,#00ff00,#0000ff,#ffffff",
            "FIELD_LABELS,snp,indel,large_deletion,no_variant",
            "MARGIN,5",
            "BORDER_WIDTH,1",
            "BORDER_COLOR,#000000",
            "DATA",
        ]
        for s in samples:
            # .get avoids inserting empty entries into the defaultdict.
            count = Counter(per_sample.get(s, ()))
            # NOTE(review): "complex" variants are not plotted but still clear
            # the no_variant flag, so such a sample gets an all-zero row.
            no_variant = "0" if sum(count.values()) > 0 else "1"
            lines.append("%s,-1,7,%s,%s" % (
                s,
                ",".join(str(count[c]) for c in categories),
                no_variant,
            ))
        with open("%s.itol.conf.txt" % drug, "w") as O:
            O.write("\n".join(lines))
_FLQ_DRUGS = frozenset(["moxifloxacin", "levofloxacin", "ciprofloxacin", "ofloxacin"])
_SLI_DRUGS = frozenset(["amikacin", "capreomycin", "kanamycin"])


def _classify_drtype(resistant_drugs):
    """Return the drug-resistance class for a set of resistant drug names."""
    if not resistant_drugs:
        return "Sensitive"
    rif = "rifampicin" in resistant_drugs
    inh = "isoniazid" in resistant_drugs
    flq = bool(_FLQ_DRUGS & resistant_drugs)
    sli = bool(_SLI_DRUGS & resistant_drugs)
    if rif != inh:
        # Resistant to exactly one of rifampicin / isoniazid.
        return "Pre-MDR"
    if rif and inh:
        if flq and sli:
            return "XDR"
        if flq or sli:
            return "Pre-XDR"
        return "MDR"
    # Resistant to something, but to neither rifampicin nor isoniazid.
    return "Other"


def main(args):
    """Write a CSV with the drug-resistance class of every sample.

    The set of drugs named in a sample's resistance variants is classified as
    ``Sensitive``, ``Pre-MDR``, ``MDR``, ``Pre-XDR``, ``XDR`` or ``Other`` and
    written to ``args.out`` with the columns ``sample`` and ``dr-class``.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix`` and ``out``.
    """
    bed_file = "%s/share/tbprofiler/%s.bed" % (sys.base_prefix, args.db)
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(bed_file)

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    # newline="" is what the csv module expects for files it writes to; the
    # context manager closes the file even if a sample fails to load.
    with open(args.out, "w", newline="") as OUT:
        writer = csv.DictWriter(OUT, fieldnames=["sample", "dr-class"])
        writer.writeheader()
        for s in tqdm(samples):
            with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
                data = json.load(handle)
            resistant_drugs = {
                d["drug"] for var in data["dr_variants"] for d in var["drugs"]
            }
            writer.writerow({"sample": s, "dr-class": _classify_drtype(resistant_drugs)})
def main(args):
    """Merge per-drug resistance variant records into one record per variant.

    In the input every entry of ``dr_variants`` carries a single ``drug`` (and
    optionally ``confidence`` and ``literature``). Entries that are identical
    apart from those three keys are merged into one variant whose ``drugs``
    list holds one ``{"drug": ..., "confidence": ..., "literature": ...}``
    dictionary per original entry. Results are written to ``out_dir`` under
    the same file names.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``in_dir``, ``out_dir`` and
            ``suffix``.
    """
    per_drug_keys = ("drug", "confidence", "literature")

    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.in_dir)
            if name.endswith(args.suffix)
        ]

    # makedirs also copes with nested paths and an already existing directory.
    os.makedirs(args.out_dir, exist_ok=True)

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.in_dir, s, args.suffix))) as handle:
            data = json.load(handle)

        # Serialised drug-independent part of a variant -> its per-drug records
        grouped = defaultdict(list)
        for var in data["dr_variants"]:
            drug_record = {"drug": var["drug"]}
            for key in ("confidence", "literature"):
                if key in var:
                    drug_record[key] = var[key]
            # A shallow filtered copy is enough: it is serialised immediately.
            shared = {k: v for k, v in var.items() if k not in per_drug_keys}
            grouped[json.dumps(shared)].append(drug_record)

        data["dr_variants"] = []
        for serialised, drug_records in grouped.items():
            new_var = json.loads(serialised)
            new_var["drugs"] = list(drug_records)
            data["dr_variants"].append(new_var)

        # Close the output explicitly so the JSON is flushed to disk.
        with open("%s/%s%s" % (args.out_dir, s, args.suffix), "w") as handle:
            json.dump(data, handle)
def main(args):
    """Print summary statistics for one variant across all samples.

    Writes a single tab-separated line to stdout: gene, variant, number of
    occurrences, median frequency and the medians of the ``ReadPosRankSum``,
    ``BaseQRankSum`` and ``MQRankSum`` annotations. A median is reported as
    ``NA`` when no numeric value is available for it. All five values are
    ``NA`` when the variant is not found.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix``, ``gene`` (gene
            name or locus tag) and ``variant`` (change string).
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    variant_freqs = []
    # Annotation name -> numeric values collected for it (output column order).
    rank_sums = {"ReadPosRankSum": [], "BaseQRankSum": [], "MQRankSum": []}

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            gene_matches = var["locus_tag"] == args.gene or var["gene"] == args.gene
            if not (gene_matches and var["change"] == args.variant):
                continue
            variant_freqs.append(var["freq"])
            annotations = var.get("variant_annotations") or {}
            # Each annotation is parsed on its own so that one missing or
            # non-numeric value (e.g. ".") does not discard the others.
            for name, values in rank_sums.items():
                try:
                    values.append(float(annotations[name]))
                except (KeyError, TypeError, ValueError):
                    continue

    def median_or_na(values):
        # statistics.median raises StatisticsError on an empty sequence.
        return statistics.median(values) if values else "NA"

    if variant_freqs:
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
            args.gene,
            args.variant,
            len(variant_freqs),
            statistics.median(variant_freqs),
            median_or_na(rank_sums["ReadPosRankSum"]),
            median_or_na(rank_sums["BaseQRankSum"]),
            median_or_na(rank_sums["MQRankSum"]),
        ))
    else:
        print("%s\t%s\tNA\tNA\tNA\tNA\tNA" % (args.gene, args.variant))
def main(args):
    """Build a per-sample consensus FASTA for one gene.

    The gene coordinates and strand are read from the GFF in the database
    configuration. For each sample that has a result file, the variants in the
    gene are printed as JSON. When that JSON mentions none of ``deletion``,
    ``frameshift``, ``inframe``, ``stop`` or ``start``, a consensus sequence
    is generated with samtools/bcftools and written to
    ``<sample>.<gene>.fasta``; it is piped through ``revseq`` for genes on the
    minus strand.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``vcf_dir``, ``vcf_suffix``, ``dir``,
            ``suffix`` and ``gene``.

    Raises:
        SystemExit: If the gene cannot be found in the GFF file.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Sample names come from the VCF directory; strip the trailing suffix.
        samples = [
            name[:len(name) - len(args.vcf_suffix)]
            for name in os.listdir(args.vcf_dir)
            if name.endswith(args.vcf_suffix)
        ]

    # Locate the gene record in the GFF.
    gene_row = None
    with open(conf["gff"]) as handle:
        for line in handle:
            row = line.strip().split()
            if len(row) <= 2 or row[2] != "gene":
                continue
            if "Name=%s" % args.gene in line or "gene:%s" % args.gene in line:
                gene_row = row
                break
    if gene_row is None:
        # Previously the coordinates of the last GFF line were used silently.
        sys.exit("Gene %s not found in %s" % (args.gene, conf["gff"]))

    start, end = int(gene_row[3]), int(gene_row[4])
    # Column 7 of the GFF row is the strand; reverse-complement minus-strand genes.
    revseq = (
        "| revseq -sequence /dev/stdin -outseq /dev/stdout"
        if gene_row[6] == "-" else ""
    )
    blocking_terms = ("deletion", "frameshift", "inframe", "stop", "start")

    for s in tqdm(samples):
        result_file = "%s/%s%s" % (args.dir, s, args.suffix)
        if not os.path.isfile(result_file):
            continue
        # Data has the same structure as the .result.json files
        with open(pp.filecheck(result_file)) as handle:
            data = json.load(handle)

        # The GFF lookup accepts a gene name or a locus tag, so match both here.
        gene_variants = json.dumps([
            d for d in data["dr_variants"] + data["other_variants"]
            if d["locus_tag"] == args.gene or d.get("gene") == args.gene
        ])
        print(gene_variants)

        if any(term in gene_variants for term in blocking_terms):
            continue
        pp.run_cmd(
            "samtools faidx %s Chromosome:%s-%s | bcftools consensus %s/%s%s %s | sed 's/^>.*/>%s/' > %s.%s.fasta"
            % (conf["ref"], start, end, args.vcf_dir, s, args.vcf_suffix,
               revseq, s, s, args.gene),
            verbose=1)
def main(args):
    """Export the variant annotations of all samples to a CSV file.

    For every variant that has a ``variant_annotations`` dictionary, a row is
    written containing those annotations (with ``.`` replaced by ``NA``) plus
    the columns ``sample``, ``frequency``, ``gene`` and ``change``.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix`` and ``out``.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    annotations = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            if "variant_annotations" not in var:
                continue
            record = {
                key: ("NA" if value == "." else value)
                for key, value in var["variant_annotations"].items()
            }
            record["sample"] = s
            record["frequency"] = var["freq"]
            record["gene"] = var["gene"]
            record["change"] = var["change"]
            annotations.append(record)

    # Union of all keys in first-seen order. Taking the header from the first
    # record only made DictWriter raise when a later record had an extra key.
    fieldnames = list(dict.fromkeys(key for record in annotations for key in record))

    with open(args.out, "w", newline="") as O:
        if not annotations:
            # Nothing to write; the (empty) output file is still created.
            sys.stderr.write("No variant annotations found; %s is empty\n" % args.out)
            return
        writer = csv.DictWriter(O, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(annotations)
def main(args):
    """Report the median ReadPosRankSum of one variant across samples.

    Samples carrying the requested variant are identified from the result
    files. For each of them the variant position is extracted from
    ``<vcf_dir>/<sample>.targets.csq.vcf.gz``, annotated with GATK
    ``VariantAnnotator -A ReadPosRankSumTest`` using the sample's BAM file,
    and the resulting value collected. One tab-separated line (gene, variant,
    result) is written to stdout. The result is the median value, or one of
    ``Mutation_not_found``, ``Multiple_genome_pos`` and
    ``No_values_from_samples``.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``results_dir``, ``suffix``,
            ``gene``, ``variant``, ``vcf_dir``, ``bam_dir`` and
            ``bam_extension``.

    Raises:
        SystemExit: After reporting that the variant was not found or maps to
            more than one genome position.
    """
    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.results_dir)
            if name.endswith(args.suffix)
        ]

    samples_with_mutation = []
    variant_position_set = set()
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.results_dir, s, args.suffix))) as handle:
            data = json.load(handle)
        for var in data["dr_variants"] + data["other_variants"]:
            gene_matches = var["gene"] == args.gene or var["locus_tag"] == args.gene
            if gene_matches and var["change"] == args.variant:
                samples_with_mutation.append(s)
                variant_position_set.add(var["genome_pos"])

    sys.stderr.write("\nFound %s samples with mutation\n" % len(samples_with_mutation))
    if not samples_with_mutation:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Mutation_not_found"))
        sys.exit()
    if len(variant_position_set) > 1:
        sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "Multiple_genome_pos"))
        sys.exit()

    # Exactly one position remains at this point.
    variant_position = int(next(iter(variant_position_set)))
    sys.stderr.write("\nGenome position is %s\n" % variant_position)
    sys.stderr.write("\nPerforming ReadPosRankSum test\n")

    # Copy the namespace: vars(args) is the namespace's own dict, so adding
    # keys to it directly would modify ``args`` for the caller.
    params = dict(vars(args))
    params["ref"] = conf["ref"]
    params["pos"] = variant_position
    params["tmp_vcf"] = pp.get_random_file(extension=".vcf.gz")

    read_pos_rank_sums = []
    try:
        for s in tqdm(samples_with_mutation):
            params["sample"] = s
            pp.run_cmd("tabix -f %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz" % params, verbose=0)
            pp.run_cmd("bcftools view %(vcf_dir)s/%(sample)s.targets.csq.vcf.gz Chromosome:%(pos)s -Oz -o %(tmp_vcf)s" % params, verbose=0)
            pp.run_cmd("tabix -f %(tmp_vcf)s" % params, verbose=0)
            for l in pp.cmd_out("gatk VariantAnnotator -R %(ref)s -I %(bam_dir)s/%(sample)s%(bam_extension)s -V %(tmp_vcf)s -O /dev/stdout -A ReadPosRankSumTest -OVI false | bcftools query -f '%%POS\\t%%ReadPosRankSum\\n'" % params, verbose=0):
                row = l.strip().split()
                # Skip malformed lines and positions without a value.
                if len(row) < 2 or row[1] == ".":
                    continue
                if int(row[0]) == variant_position:
                    read_pos_rank_sums.append((s, float(row[1])))

        if not read_pos_rank_sums:
            sys.stdout.write("%s\t%s\t%s\n" % (args.gene, args.variant, "No_values_from_samples"))
        else:
            sys.stdout.write("%s\t%s\t%s\n" % (
                args.gene, args.variant,
                statistics.median([x[1] for x in read_pos_rank_sums])))
    finally:
        # Remove the temporary VCF even when a command above fails.
        # NOTE(review): the index written by ``tabix -f`` next to the
        # temporary file is not removed here - confirm whether it should be.
        if os.path.isfile(params["tmp_vcf"]):
            pp.rm_files([params["tmp_vcf"]])
def main(args):
    """Print the distinct small variants found across samples, one per line.

    Each output line has the form
    ``Chromosome<TAB>pos<TAB>ref<TAB>alt<TAB>type|locus_tag|gene|NA|NA|_internal_change|change``
    and lines are ordered by (first) genome position. Large deletions,
    synonymous variants and a few compound or stop-related types are skipped,
    as are ``r.`` changes outside ``rrs``/``rrl``.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir`` and ``suffix``.

    Raises:
        SystemExit: If a variant matches none of the handled cases or its
            change string cannot be parsed.
    """
    # The patterns are unchanged; they are only written as raw strings so the
    # backslashes are no longer invalid string escapes.
    rna_strict = re.compile(r"r\.[0-9]+")
    protein = re.compile(r"p.[A-Za-z]+")
    coding = re.compile(r"c.[0-9]+")
    upstream = re.compile(r"c.\-[0-9]+")
    rna_loose = re.compile(r"r.[0-9]+")
    nucleotide_sub = re.compile(r"([0-9]+)([ACGT]+)>([ACGT]+)")
    upstream_sub = re.compile(r"c.\-[0-9]+([ACGT]+)>([ACGT]+)")
    internal_sub = re.compile(r"[0-9]+([ACGT]+)>([ACGT]+)")
    # Variant types (after removing "*") that are deliberately not exported.
    skipped_types = {
        "synonymous",
        "frameshift&start_lost",
        "missense&inframe_altering",
        "stop_lost",
        "stop_retained",
    }

    def fail(reason, var):
        sys.exit("%s: %s" % (reason, var))

    # Dictionary with the database files: {"ref": "/path/to/fasta" ... etc. }
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    # (pos, ref, alt) -> JSON-serialised variant descriptions
    mutations = defaultdict(list)
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)

        for var in data["dr_variants"] + data["other_variants"]:
            change = var["change"]
            vartype = var["type"]
            if var["gene"] != "rrs" and var["gene"] != "rrl" and rna_strict.search(change):
                continue

            if "nucleotide_change" in var and (
                protein.search(change) or coding.search(change) or upstream.search(change)
            ):
                # One "<pos><ref>><alt>" per "+"-separated part; parse each once.
                parts = [nucleotide_sub.search(x) for x in var["nucleotide_change"].split("+")]
                if not all(parts):
                    fail("Cannot parse nucleotide change", var)
                pos = ",".join(m.group(1) for m in parts)
                ref = ",".join(m.group(2) for m in parts)
                alt = ",".join(m.group(3) for m in parts)
            elif vartype == "non_coding" and upstream.search(change):
                re_obj = upstream_sub.search(change)
                if re_obj is None:
                    fail("Cannot parse upstream change", var)
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif vartype == "non_coding" and rna_loose.search(change):
                re_obj = internal_sub.search(var["_internal_change"])
                if re_obj is None:
                    fail("Cannot parse internal change", var)
                pos = str(var["genome_pos"])
                ref = re_obj.group(1)
                alt = re_obj.group(2)
            elif vartype == "large_deletion":
                continue
            elif vartype.replace("*", "") in skipped_types:
                continue
            else:
                fail("Unhandled variant", var)

            mutations[(pos, ref, alt)].append(json.dumps({
                "genome_pos": pos,
                "type": vartype.replace("*", ""),
                "locus_tag": var["locus_tag"],
                "gene": var["gene"],
                "_internal_change": var["_internal_change"],
                "change": change,
            }))

    for key in sorted(mutations, key=lambda x: int(x[0].split(",")[0])):
        # sorted() makes the order of duplicates-removed entries reproducible;
        # plain set iteration order varies between runs.
        for x in sorted(set(mutations[key])):
            var = json.loads(x)
            print("Chromosome\t%s\t%s\t%s\t%s|%s|%s|%s|%s|%s|%s" % (
                var["genome_pos"], key[1], key[2], var["type"], var["locus_tag"],
                var["gene"], "NA", "NA", var["_internal_change"], var["change"]))
def main(args):
    """Export samples, variants, drugs and lineages as CSV files for Neo4j.

    Node and edge tables are written to the working directory
    (``sample_nodes.csv``, ``variant_nodes.csv``,
    ``sample_variant_edges.<i>.csv`` in batches of 10000 rows,
    ``drug_nodes.csv``, ``variant_drug_edges.csv``, ``lineage_nodes.csv``,
    ``sample_lineage_edges.csv``, ``spoligotype_nodes.csv`` and
    ``sample_spoligotype_edges.csv``) together with ``Cypher_commands.txt``
    holding the statements that load them. A table with no rows is not
    written, and the Cypher statements that refer to it are left out.

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix``, ``meta``
            (optional CSV keyed by ``wgs_id``), ``spoligotypes`` (optional CSV
            with ``sample`` and ``spoligotype`` columns) and ``index``
            (whether to emit ``CREATE INDEX`` statements).
    """
    def batch(iterable, n=1):
        """Yield consecutive slices of at most ``n`` items."""
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]

    def write_csv(path, rows):
        """Write a list of same-keyed dicts to ``path``; skip empty lists."""
        if not rows:
            return
        with open(path, "w", newline="") as O:
            writer = csv.DictWriter(O, fieldnames=list(rows[0]))
            writer.writeheader()
            writer.writerows(rows)

    def properties(rows):
        """Cypher property map body built from the keys of the first row."""
        return ", ".join(["%s: csvLine.%s" % (d, d) for d in rows[0]])

    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf["bed"])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    meta = {}
    meta_columns = []
    if args.meta:
        with open(args.meta) as handle:
            reader = csv.DictReader(handle)
            for row in reader:
                meta[row["wgs_id"]] = row
            # Taken from the header so an empty table no longer crashes.
            meta_columns = list(reader.fieldnames or [])

    sample_nodes = []
    tmp_variant_nodes = []
    sample_variant_edges = []
    drugs = set()
    lineages = set()
    tmp_variant_drug_edges = []
    sample_lineage_edges = []
    spoligotype_names = set()
    sample_spoligotype_edges = []

    spoligotypes = {}
    if args.spoligotypes:
        with open(args.spoligotypes) as handle:
            for row in csv.DictReader(handle):
                spoligotypes[row["sample"]] = row["spoligotype"]
                sample_spoligotype_edges.append(
                    {"id": row["sample"], "spoligotype": row["spoligotype"]})
                spoligotype_names.add(row["spoligotype"])

    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck("%s/%s%s" % (args.dir, s, args.suffix))) as handle:
            data = json.load(handle)

        sample_node = {
            "id": s,
            "drtype": data["drtype"],
            "lineage": data["sublin"],
            "lineageInfo": json.dumps(data["lineage"]),
            # The original literal listed "pct_reads_mapped" twice; the
            # serialised value is the same with the duplicate removed.
            "qc": json.dumps({
                "pct_reads_mapped": data["qc"]["pct_reads_mapped"],
                "gene_coverage": [],
            }),
            "pipeline": json.dumps(data["pipeline"]),
            "tbprofilerVersion": json.dumps(data["tbprofiler_version"]),
            "dbVersion": json.dumps(data["db_version"]),
        }
        lineages.add(data["sublin"])
        sample_lineage_edges.append({"sampleId": s, "lineage": data["sublin"]})

        if args.meta:
            for c in meta_columns:
                if c == "id":
                    continue
                sample_node[camel_case(c)] = meta[s][c] if s in meta else "NA"
        if args.spoligotypes:
            sample_node["spoligotype"] = spoligotypes.get(s, "NA")
        sample_nodes.append(sample_node)

        for var in data["dr_variants"] + data["other_variants"]:
            variant_id = "%s_%s" % (var["locus_tag"], var["change"])
            sample_variant_edges.append({
                "sampleId": s,
                "variantId": variant_id,
                "freq": var["freq"],
                "genome_pos": var["genome_pos"],
                "nucleotideChange": var.get("nucleotide_change", "NA"),
                "internalChange": var.get("_internal_change", "NA"),
            })
            tmp_variant_nodes.append({
                "id": variant_id,
                "type": var["type"].replace("*", ""),
                "change": var["change"],
                "gene": var["gene"],
                "locus_tag": var["locus_tag"],
            })
            if "drugs" in var:
                for d in var["drugs"]:
                    drugs.add(d["drug"])
                    tmp_variant_drug_edges.append(
                        {"variantId": variant_id, "drug": d["drug"]})

    drug_nodes = [{"id": d} for d in drugs]
    lineage_nodes = [{"id": d} for d in lineages]
    spoligotype_nodes = [{"id": d} for d in spoligotype_names]
    variant_drug_edges = uniq_dict_list(tmp_variant_drug_edges)
    variant_nodes = uniq_dict_list(standardise_types(tmp_variant_nodes))

    write_csv("sample_nodes.csv", sample_nodes)
    write_csv("variant_nodes.csv", variant_nodes)
    n_edge_batches = 0
    for i, chunk in enumerate(batch(sample_variant_edges, 10000)):
        write_csv("sample_variant_edges.%s.csv" % i, chunk)
        n_edge_batches = i + 1
    write_csv("drug_nodes.csv", drug_nodes)
    write_csv("variant_drug_edges.csv", variant_drug_edges)
    write_csv("lineage_nodes.csv", lineage_nodes)
    write_csv("sample_lineage_edges.csv", sample_lineage_edges)
    # Empty when no spoligotype table was given; this used to raise IndexError.
    write_csv("spoligotype_nodes.csv", spoligotype_nodes)
    write_csv("sample_spoligotype_edges.csv", sample_spoligotype_edges)

    with open("Cypher_commands.txt", "w") as O:
        if sample_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Sample) ON (n.id);\n")
                O.write("CREATE INDEX FOR (n:SRA) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
            O.write("CREATE (s:Sample:SRA {%s});\n" % properties(sample_nodes))
            O.write("\n")

            if args.index:
                O.write("CREATE INDEX FOR (n:Country) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_nodes.csv' AS csvLine\n")
            O.write("WITH csvLine WHERE NOT csvLine.countryCode IS null\n")
            O.write("MERGE (s:Sample {id:csvLine.id})\n")
            O.write("MERGE (c:Country {id:csvLine.countryCode})\n")
            O.write("CREATE (s) -[:COLLECTED_IN]-> (c);\n")
            O.write("\n")

        if variant_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Variant) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
            O.write("CREATE (v:Variant {%s});\n" % properties(variant_nodes))
            O.write("\n")

            if args.index:
                O.write("CREATE INDEX FOR (n:Gene) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_nodes.csv' AS csvLine\n")
            O.write("MERGE (v:Variant {id:csvLine.id})\n")
            O.write("MERGE (g:Gene {id:csvLine.locus_tag, locusTag:csvLine.locus_tag, name:csvLine.gene})\n")
            O.write("CREATE (v) -[:IN_GENE]-> (g);\n")
            O.write("\n")

        if drug_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Drug) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///drug_nodes.csv' AS csvLine\n")
            O.write("CREATE (d:Drug {%s});\n" % properties(drug_nodes))
            O.write("\n")

        if lineage_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Lineage) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///lineage_nodes.csv' AS csvLine\n")
            O.write("CREATE (:Lineage {%s});\n" % properties(lineage_nodes))
            O.write("\n")

        for i in range(n_edge_batches):
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_variant_edges.%s.csv' AS csvLine\n" % i)
            O.write("MATCH (s:Sample {id: csvLine.sampleId}),(v:Variant {id:csvLine.variantId})\n")
            O.write("CREATE (s) -[:CONTAINS {%s}]-> (v);\n" % properties(sample_variant_edges))
            O.write("\n")

        if variant_drug_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///variant_drug_edges.csv' AS csvLine\n")
            O.write("MATCH (v:Variant {id: csvLine.variantId}),(d:Drug {id:csvLine.drug})\n")
            O.write("CREATE (v) -[:CONFERS_RESISTANCE]-> (d);\n")
            O.write("\n")

        if sample_lineage_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_lineage_edges.csv' AS csvLine\n")
            O.write("MATCH (s:Sample {id: csvLine.sampleId}),(l:Lineage {id:csvLine.lineage})\n")
            O.write("CREATE (s) -[:LINEAGE]-> (l);\n")
            O.write("\n")

        if spoligotype_nodes:
            if args.index:
                O.write("CREATE INDEX FOR (n:Spoligotype) ON (n.id);\n")
            O.write("LOAD CSV WITH HEADERS FROM 'file:///spoligotype_nodes.csv' AS csvLine\n")
            O.write("CREATE (:Spoligotype {%s});\n" % properties(spoligotype_nodes))
            O.write("\n")

        if sample_spoligotype_edges:
            O.write("LOAD CSV WITH HEADERS FROM 'file:///sample_spoligotype_edges.csv' AS csvLine\n")
            O.write("MATCH (s:Sample {id: csvLine.id}),(l:Spoligotype {id:csvLine.spoligotype})\n")
            O.write("CREATE (s) -[:SPOLIGOTYPE]-> (l);\n")
            O.write("\n")
_VARIANT_COLUMNS = (
    "wgs_id",
    "gene",
    "genome_pos",
    "type",
    "change",
    "nucleotide_change",
    "locus_tag",
    "locus_tag_drugs",
)


def _collect_variant_rows(results_dir, result_files, sample_ids, variant_key, locus_tag2drugs):
    """Return {sample id: [row dict, ...]} for the non-synonymous variants under ``variant_key``."""
    rows_by_sample = {}
    for file_name in result_files:
        # "<id>.results.json" -> "<id>"; joining with "." keeps dots that are
        # part of the sample id itself.
        sample_id = ".".join(file_name.split(".")[:-2])
        if sample_id not in sample_ids:
            continue
        with open(os.path.join(results_dir, file_name)) as handle:
            tbp_result = json.load(handle)

        rows = rows_by_sample[sample_id] = []
        for variant in tbp_result[variant_key]:
            if variant.get("type", "") == "synonymous":
                continue
            locus_tag = variant.get("locus_tag", "")
            # Left join of the locus/drug associations: missing fields and
            # unknown locus tags become empty strings.
            rows.append({
                "wgs_id": sample_id,
                "gene": variant.get("gene", ""),
                "genome_pos": variant.get("genome_pos", ""),
                "type": variant.get("type", ""),
                "change": variant.get("change", ""),
                "nucleotide_change": variant.get("nucleotide_change", ""),
                "locus_tag": locus_tag,
                "locus_tag_drugs": locus_tag2drugs.get(locus_tag, ""),
            })
    return rows_by_sample


def _write_variant_table(path, rows_by_sample):
    """Write all rows of ``rows_by_sample`` to ``path`` as a tab-separated table."""
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=_VARIANT_COLUMNS, delimiter="\t")
        writer.writeheader()
        for rows in rows_by_sample.values():
            writer.writerows(rows)


def main(args):
    """Write tab-separated variant tables for the samples listed in a metadata file.

    Result files in ``args.tbp_results`` whose sample id appears in the
    metadata are read, and their non-synonymous variants are written to
    ``<outfile>.dr.txt`` (from ``dr_variants``) and ``<outfile>.other.txt``
    (from ``other_variants``). Each row is joined with the drugs associated
    with the variant's locus tag.

    Args:
        args: Parsed command-line namespace. Reads ``metadata_file`` (CSV),
            ``id_key`` (name of the metadata column holding the sample id),
            ``tbp_results`` (results directory), ``outfile`` (output prefix)
            and ``db``.
    """
    # Read in metadata, keyed by sample id (the id also stays in the row).
    with open(args.metadata_file) as handle:
        meta_dict = {row[args.id_key]: row for row in csv.DictReader(handle)}

    # Read in locus-drug resistance associations
    bed_file = "%s/share/tbprofiler/%s.bed" % (sys.base_prefix, args.db)
    locus_tag2drugs = tbprofiler.get_lt2drugs(bed_file)

    # Get list of files in tbprofiler results directory
    tbprofiler_results_files = os.listdir(args.tbp_results)

    # The header is fixed up front: deriving it from the first sample's first
    # variant failed whenever that sample had no variants.
    for variant_key, extension in (("dr_variants", ".dr.txt"), ("other_variants", ".other.txt")):
        rows_by_sample = _collect_variant_rows(
            args.tbp_results, tbprofiler_results_files, meta_dict,
            variant_key, locus_tag2drugs)
        _write_variant_table(args.outfile + extension, rows_by_sample)
def main(args):
    """Write a tab-separated per-sample table of resistance variants by drug.

    Every row holds the sample name, main lineage, sublineage and resistance
    type, followed by two columns per drug: ``<drug>_variants`` (comma-joined
    ``gene_change_freq`` strings) and ``<drug>_gene_cov`` (the coverage
    fractions reported for that drug's genes, where greater than zero).

    Args:
        args: Parsed command-line namespace. Reads ``db``, ``samples``
            (optional sample list file), ``dir``, ``suffix`` and ``outfile``.
    """
    # Dictionary with the database files: {'ref': '/path/to/fasta' ... etc. }
    conf = get_conf_dict(sys.base_prefix + '/share/tbprofiler/%s' % args.db)
    # Dictionary mapping locus tags to drugs.
    # NOTE: not used below; the call is kept so start-up behaviour is unchanged.
    locus_tag2drugs = tbprofiler.get_lt2drugs(conf['bed'])
    # Dictionary mapping drugs to genes: {'rifampicin': ['rpoB', 'rpoC'], ... etc. }
    drug2genes = tbprofiler.get_drugs2gene(conf['bed'])

    if args.samples:
        with open(args.samples) as handle:
            samples = [line.rstrip() for line in handle if line.strip()]
    else:
        # Strip only the trailing suffix from each result file name.
        samples = [
            name[:len(name) - len(args.suffix)]
            for name in os.listdir(args.dir)
            if name.endswith(args.suffix)
        ]

    drugs = [
        'rifampicin', 'isoniazid', 'ethambutol', 'pyrazinamide', 'streptomycin', 'amikacin',
        'kanamycin', 'capreomycin', 'fluoroquinolones', 'ethionamide', 'cycloserine',
        'para-aminosalicylic_acid', 'clofazimine', 'bedaquiline', 'delamanid'
    ]

    # The header is known up front, so an empty sample list still yields a
    # valid (header-only) file instead of an IndexError.
    fieldnames = ['sample', 'main_lineage', 'sublineage', 'drtype']
    for drug in drugs:
        fieldnames.extend([f'{drug}_variants', f'{drug}_gene_cov'])

    rows = []
    for s in tqdm(samples):
        # Data has the same structure as the .result.json files
        with open(pp.filecheck(f'{args.dir}/{s}{args.suffix}')) as handle:
            data = json.load(handle)

        # Re-arrange the per-variant data by drug, e.g.
        # {'isoniazid': ['katG_p.Ser315Thr_0.95', 'fabG1_-15T>C_1.00']}
        drug_variants = defaultdict(list)
        for var in data['dr_variants']:
            for d in var['drugs']:
                drug_variants[d['drug']].append(
                    f'{var["gene"]}_{var["change"]}_{round(var["freq"],2)}')

        # Lookup of genes with a coverage fraction above zero, e.g. {'rpoB': '0.2'}
        gene_coverage = {
            d['gene']: str(d['fraction'])
            for d in data['qc']['gene_coverage'] if d['fraction'] > 0
        }

        row = {
            'sample': s,
            'main_lineage': data['main_lin'],
            'sublineage': data['sublin'],
            'drtype': data['drtype']
        }
        # Two columns per drug: its variants and the coverage values of its genes.
        for drug in drugs:
            row[f'{drug}_variants'] = ", ".join(drug_variants[drug])
            row[f'{drug}_gene_cov'] = ", ".join(
                [gene_coverage[gene] for gene in drug2genes[drug] if gene in gene_coverage])
        rows.append(row)

    with open(args.outfile, 'w', newline='') as O:
        writer = csv.DictWriter(O, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)