def fasta2vcf(fasta_file, outfile):
    """Convert a multi-sample fasta into one merged, indel-free bgzipped VCF.

    For each sample a temporary single-sequence fasta is written and variants
    against the reference are called (side effect: creates "<sample>.vcf.gz"
    plus its .csi index).  The per-sample VCFs are then merged in chunks of
    200 and the chunk VCFs merged into `outfile`, dropping indels.

    NOTE(review): relies on the module-level `args` (args.db) — confirm this
    is always set before calling.

    Args:
        fasta_file: path to the multi-sample fasta.
        outfile: path of the final merged VCF (bgzipped).
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    seqs = pp.fasta(fasta_file)
    samples = list(seqs.fa_dict.keys())
    for sample in samples:
        fname = pp.get_random_file()
        # Context manager so the temp fasta is flushed/closed before it is
        # read back (the original leaked this handle).
        with open(fname, "w") as F:
            F.write(">%s\n%s\n" % (sample, seqs.fa_dict[sample]))
        fasta_obj = pp.fasta(fname)
        # Side effect: get_ref_variants writes "<sample>.vcf.gz" (+ index),
        # which the merge commands below pick up from disk.
        vcf_obj = pp.vcf(fasta_obj.get_ref_variants(conf["ref"], sample))
        pp.run_cmd("rm %s" % fname)
    # Merge in chunks of 200 to keep bcftools command lines manageable.
    sample_chunks = [samples[i:i + 200] for i in range(0, len(samples), 200)]
    tmp_vcfs = []
    for tmp_samps in sample_chunks:
        tmp_list = pp.get_random_file()
        tmp_vcf = pp.get_random_file()
        with open(tmp_list, "w") as F:
            F.write("\n".join(["%s.vcf.gz" % x for x in tmp_samps]))
        pp.run_cmd("bcftools merge -0 -l %s -Oz -o %s" % (tmp_list, tmp_vcf))
        pp.run_cmd("bcftools index %s" % tmp_vcf)
        tmp_vcfs.append(tmp_vcf)
        pp.rm_files([tmp_list])
    # Final merge of the chunk VCFs; indels are removed.
    pp.run_cmd("bcftools merge -0 %s | bcftools view -V indels -Oz -o %s" %
               (" ".join(tmp_vcfs), outfile))
    # Clean up all intermediate per-sample and chunk files.
    vcf_files = ["%s.vcf.gz" % s for s in samples]
    vcf_csi_files = ["%s.vcf.gz.csi" % s for s in samples]
    pp.rm_files(vcf_files + vcf_csi_files + tmp_vcfs)
def main(args):
    """Rename the contig and restore the true reference allele in a VCF stream.

    Reads VCF text from stdin, writes to stdout.  Contig "1" is renamed to
    args.seqname in both header and records.  Records whose REF column does
    not match the reference fasta get REF swapped with the matching ALT, and
    the genotype codes are remapped accordingly.
    """
    # First (and only expected) sequence in the reference fasta.
    refseq = list(pp.fasta(args.ref).fa_dict.values())[0]
    for l in sys.stdin:
        if l[0] == "#":
            if l.strip() == "##contig=<ID=1,length=29903>":
                # Keep the header contig name consistent with the renamed
                # records below.  (Bug fix: the original hardcoded
                # "NC_045512.2" here, disagreeing with args.seqname, and
                # replaced the first "1" anywhere in the line.)
                l = l.replace("ID=1,", "ID=%s," % args.seqname)
            sys.stdout.write(l)
            continue
        row = l.strip().split()
        if row[0] == "1":
            row[0] = args.seqname
        ipos = int(row[1]) - 1  # 0-based index into the reference string
        possible_ref_allele = row[3]
        true_ref_allele = refseq[ipos]
        alts = row[4].split(",")
        if possible_ref_allele != true_ref_allele:
            # Swap REF with the ALT equal to the true reference base.
            # NOTE(review): raises ValueError if the true base is absent
            # from ALT — same as the original behaviour.
            ref_allele_index = alts.index(true_ref_allele)
            row[3] = true_ref_allele
            alts[ref_allele_index] = possible_ref_allele
            row[4] = ",".join(alts)
            # Remap genotype codes: old REF (0) and the swapped ALT exchange
            # indices via a temporary "R" marker.
            # NOTE(review): assumes every genotype column is one character
            # (haploid GT) — multi-character fields would be split apart.
            genos = "".join(row[9:]).replace(
                str(ref_allele_index + 1), "R").replace(
                    "0", str(ref_allele_index + 1)).replace("R", "0")
            row[9:] = list(genos)
            sys.stdout.write("\t".join(row) + "\n")
        else:
            sys.stdout.write("\t".join(row) + "\n")
def main(args):
    """Collapse SNPs falling in the same codon into single MNP records.

    Streams a VCF (from --vcf via bcftools view, else stdin).  Header lines
    pass straight through; indels/MNPs pass through unmerged; coding SNPs are
    grouped by (gene, codon position) and each group is merged into one
    record spanning the codon.  All records are printed sorted by position.
    """
    ref = pp.fasta(args.ref).fa_dict
    cds = gff_load_cds(args.gff)
    final_list = []
    coding = defaultdict(list)
    generator = pp.cmd_out(
        f"bcftools view {args.vcf}") if args.vcf else sys.stdin
    for l in generator:
        row = l.strip().split()
        if l[0] == "#":
            sys.stdout.write(l.strip() + "\n")
        elif len(row[3]) > 1 or len(row[4]) > 1:
            # Indels / multi-base records are passed through untouched.
            final_list.append(row)
        else:
            gene, cpos = get_codon_pos(row[0], int(row[1]), cds)
            if gene is None:
                final_list.append(row)
            else:
                coding[(gene, cpos)].append(row)
    for rows in coding.values():
        chrom = rows[0][0]
        pos = sorted([int(r[1]) for r in rows])
        # Reference bases across the whole spanned interval
        # (1-based VCF positions -> 0-based string index); dict insertion
        # order keeps the bases in ascending position order.
        ref_nucs = {p: ref[chrom][p - 1] for p in range(pos[0], pos[-1] + 1)}
        alt_nucs = ref_nucs.copy()
        for r in rows:
            # Bug fix: assign each ALT to its own position.  The original
            # paired sorted positions with rows by list index, which could
            # mis-place alleles when the input VCF was not position-sorted.
            alt_nucs[int(r[1])] = r[4]
        new_row = rows[0]
        new_row[3] = "".join(ref_nucs.values())
        new_row[4] = "".join(alt_nucs.values())
        final_list.append(new_row)
    for row in sorted(final_list, key=lambda x: int(x[1])):
        sys.stdout.write("\t".join(row) + "\n")
def main(args):
    """Mask the regions listed in a BED file with "N" in every fasta sequence.

    NOTE(review): column 2 is decremented before use, treating the bed start
    as 1-based rather than the standard 0-based — kept from the original;
    confirm against the files this is run on.
    """
    seqs = pp.fasta(args.fasta).fa_dict
    # Convert to mutable character lists so positions can be overwritten.
    for seq in seqs:
        seqs[seq] = list(seqs[seq])
    # Context manager (bug fix: the original leaked the bed file handle).
    with open(args.bed) as bed:
        for l in bed:
            row = l.strip().split()
            start, end = int(row[1]) - 1, int(row[2])
            for seq in seqs:
                # Slice assignment masks the whole interval in one step;
                # the original looped base-by-base under tqdm.
                seqs[seq][start:end] = ["N"] * (end - start)
    with open(args.out, "w") as O:
        for seq in seqs:
            O.write(">%s\n%s\n" % (seq, "".join(seqs[seq])))
def main(args):
    """Replace every character other than A/C/G/T/N with "N" in each sequence.

    Note that the valid set is uppercase only, so lowercase bases are also
    masked — input sequences are expected to be uppercase.
    """
    valid = {"A", "C", "G", "T", "N"}
    sys.stderr.write("Loading sequences\n")
    seqs = pp.fasta(args.fasta).fa_dict
    sys.stderr.write("Masking sequences\n")
    for name in tqdm(seqs):
        # Find the distinct offending characters, then replace each in bulk.
        for ch in set(seqs[name]) - valid:
            seqs[name] = seqs[name].replace(ch, "N")
    sys.stderr.write("Writing sequences\n")
    with open(args.out, "w") as O:
        for name in seqs:
            O.write(">%s\n%s\n" % (name, "".join(seqs[name])))
def main(args):
    """Print per-column allele counts for a multiple sequence alignment.

    Output columns: gap-free reference position, alignment position, and
    counts of A/C/G/T/N and "-" across all sequences at that column.
    """
    seqs = pp.fasta(args.msa).fa_dict
    ref_pos = 0
    print("ref_pos\talignment_pos\tA\tC\tG\tT\tN\tgap")
    # Iterate over alignment columns; all sequences are assumed to be the
    # same length as the first one.
    for i in range(len(list(seqs.values())[0])):
        alignment_pos = i + 1
        # NOTE(review): "refseq" is not defined anywhere in this function —
        # presumably a module-level name holding the reference sequence id;
        # confirm, otherwise this raises NameError on the first iteration.
        if seqs[refseq][i] != "-":
            ref_pos += 1
        allele_count = Counter([seqs[s][i].upper() for s in seqs])
        num_N = allele_count["N"] if "N" in allele_count else 0
        num_gap = allele_count["-"] if "-" in allele_count else 0
        num_A = allele_count["A"] if "A" in allele_count else 0
        num_C = allele_count["C"] if "C" in allele_count else 0
        num_G = allele_count["G"] if "G" in allele_count else 0
        num_T = allele_count["T"] if "T" in allele_count else 0
        # NOTE(review): print() adds its own newline on top of the "\n" in
        # the format string, so rows are separated by a blank line — confirm
        # this is intended (the header above has no extra "\n").
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
              (ref_pos, alignment_pos, num_A, num_C, num_G, num_T, num_N,
               num_gap))
def primer_evaluation(args):
    """Evaluate a primer/probe set against every sequence in the MSA.

    Runs fuzznuc for the forward primer, reverse primer and probe (allowing
    args.mismatch mismatches), locates the amplicon in each aligned sequence
    and writes one CSV row per sequence to args.out.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    msa_obj = pp.fasta(conf["msa"]).fa_dict
    forward_results = cp.run_fuzznuc(conf["msa"], args.primerF,
                                     pmismatch=args.mismatch)
    reverse_results = cp.run_fuzznuc(conf["msa"], args.primerR,
                                     pmismatch=args.mismatch)
    probe_results = cp.run_fuzznuc(conf["msa"], args.probe,
                                   pmismatch=args.mismatch)
    rows = []
    for s in tqdm(msa_obj):
        amplicon = cp.find_amplicon(forward_results[s], reverse_results[s],
                                    probe_results[s])
        rows.append(amplicon)
    if not rows:
        # Bug fix: an empty MSA used to crash on rows[0] below.
        sys.stderr.write("No sequences found in the MSA - nothing to write\n")
        return
    with open(args.out, "w") as O:
        writer = csv.DictWriter(O, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
def main(args):
    """Extract per-locus consensus sequences for a set of samples.

    For every sample, a filtered VCF is derived from its targets VCF
    (zero-depth genotypes set to missing, NON_REF records dropped), a
    consensus over the bed target regions is generated with bcftools, and
    each locus sequence is appended to a per-locus fasta named after the
    gene in the bed file's 5th column.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [
            x.replace(".targets.csq.vcf.gz", "") for x in os.listdir(args.dir)
            if x[-19:] == ".targets.csq.vcf.gz"
        ]
    params = {
        "tmp_locations": pp.get_random_file(),
        "tmp_mappings": pp.get_random_file(),
        "ref": conf["ref"]
    }
    # region<TAB>gene mapping, then just the regions for samtools faidx -r.
    pp.run_cmd("awk '{print $1\":\"$2\"-\"$3\"\\t\"$5}' %s > %s" %
               (conf["bed"], params["tmp_mappings"]))
    pp.run_cmd("cut -f1 %s > %s" %
               (params["tmp_mappings"], params["tmp_locations"]))
    # One output handle per locus, keyed by its region string.
    FILES = {}
    for l in open(params["tmp_mappings"]):
        row = l.rstrip().split()
        FILES[row[0]] = open("%s.fasta" % row[1], "w")
    for s in samples:
        params["vcf"] = "%s/%s.targets.csq.vcf.gz" % (args.dir, s)
        params["tmp_vcf"] = "%s/%s.targets.csq.tmp.vcf.gz" % (args.dir, s)
        params["sample_fa"] = "%s.targets.fa" % (s)
        # Zero-depth genotypes -> missing; drop NON_REF records.
        pp.run_cmd(
            "bcftools filter -e 'sum(AD)=0' -S . %(vcf)s | bcftools view -a | grep -v NON_REF | bcftools view -Oz -o %(tmp_vcf)s"
            % params)
        pp.run_cmd("bcftools index %(tmp_vcf)s" % params)
        pp.run_cmd(
            "samtools faidx -r %(tmp_locations)s %(ref)s | bcftools consensus -H A %(tmp_vcf)s > %(sample_fa)s"
            % params)
        fa_dict = pp.fasta(params["sample_fa"]).fa_dict
        for locus in fa_dict:
            FILES[locus].write(">%s\n%s\n" % (s, fa_dict[locus]))
        pp.rm_files([params["tmp_vcf"]])
    # Bug fix: close the per-locus handles so all sequences are flushed to
    # disk (the original leaked every handle in FILES).
    for F in FILES.values():
        F.close()
    pp.rm_files([params["tmp_locations"], params["tmp_mappings"]])
def main(args):
    """Normalise mutation notation from a CSV into standard variant codes.

    Reads rows with "Gene" and "Mutation" columns.  Simple notations
    (protein changes, promoter SNPs, duplications, rRNA changes) are passed
    through or reformatted directly; deletions and insertions are converted
    to genome-level pos/ref/alt records and resolved via get_ann().  A
    recoded CSV is written to args.out + ".csv" and every change is logged
    to args.out + ".log".
    """
    genes = load_gff(args.gff)
    refseq = pp.fasta(args.ref).fa_dict
    mutations = {}            # (gene, mutation) -> genome-level record for get_ann()
    converted_mutations = {}  # (gene, mutation) -> final notation string
    for row in csv.DictReader(open(args.csv)):
        # Match the row's gene by name or locus tag (first hit wins;
        # IndexError if the gene is unknown).
        gene = [
            g for g in genes
            if g.name == row["Gene"] or g.locus_tag == row["Gene"]
        ][0]
        mut = None  # NOTE(review): never used afterwards
        # rRNA change, e.g. r.1401a>g -> n.1401A>G.
        r = re.search("r.([0-9]+)([acgt]+)>([acgt]+)", row["Mutation"])
        if r:
            converted_mutations[(
                row["Gene"], row["Mutation"]
            )] = f"n.{r.group(1)}{r.group(2).upper()}>{r.group(3).upper()}"
        # Protein changes, promoter SNPs and duplications pass through as-is.
        r = re.search("p\..+", row["Mutation"])
        if r:
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        r = re.search("c.-[0-9]+[ACGT]>[ACGT]", row["Mutation"])
        if r:
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        r = re.search("c.[0-9]+dup[ACGT]+", row["Mutation"])
        if r:
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        r = re.search("c.[0-9]+_[0-9]+dup[ACGT]+", row["Mutation"])
        if r:
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        # Single-base coding deletion, e.g. "ethA" "c.341del".
        r = re.search("c.([0-9]+)del", row["Mutation"])
        if r:
            del_start = int(r.group(1))
            del_end = int(r.group(1))
            if gene.strand == "+":  # e.g. rpoB "c.1282_1290del"
                genome_start = gene.start + del_start - 2
                genome_end = gene.start + del_end
            else:  # e.g. "ethA" "c.1057_1059del"
                genome_start = gene.start - del_end
                genome_end = gene.start - del_start + 2
            # Left-anchored representation: REF spans the deleted bases plus
            # one anchor base; ALT is the anchor base.
            ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
            alt = ref[0]
            mutations[(row["Gene"], row["Mutation"])] = {
                "pos": genome_start,
                "ref": ref,
                "alt": alt,
                "gene": row["Gene"],
                "type": "nucleotide"
            }
        # Multi-base coding deletion, e.g. "rpoB" "c.1282_1290del".
        r = re.search("c.([0-9]+)_([0-9]+)del", row["Mutation"])
        if r:
            del_start = int(r.group(1))
            del_end = int(r.group(2))
            if gene.strand == "+":  # rpoB "c.1282_1290del"
                genome_start = gene.start + del_start - 2
                genome_end = gene.start + del_end
            else:  # "ethA" "c.1057_1059del"
                genome_start = gene.start - del_end
                genome_end = gene.start - del_start + 2
            ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
            alt = ref[0]
            mutations[(row["Gene"], row["Mutation"])] = {
                "pos": genome_start,
                "ref": ref,
                "alt": alt,
                "gene": row["Gene"],
                "type": "nucleotide"
            }
        # Single-base promoter deletion, e.g. "c.-29del".
        # NOTE(review): the capture here is UNSIGNED (the "-" sits outside
        # the group) yet the same formulas as the signed range case below
        # are reused, so for a "+" strand gene the variant lands DOWNSTREAM
        # of gene.start instead of upstream — suspected sign bug; confirm
        # against known promoter deletions before relying on this branch.
        r = re.search("c.-([0-9]+)del", row["Mutation"])
        if r:
            del_start = int(r.group(1))
            del_end = int(r.group(1))
            if gene.strand == "+":  # "embA" "c.-29_-28del"
                genome_start = gene.start + del_start - 1
                genome_end = gene.start + del_end + 1
            else:  # "alr" "c.-283_-280delCAAT"
                genome_start = gene.start - del_end - 1
                genome_end = gene.start - del_start + 1
            ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
            alt = ref[0]
            mutations[(row["Gene"], row["Mutation"])] = {
                "pos": genome_start,
                "ref": ref,
                "alt": alt,
                "gene": row["Gene"],
                "type": "nucleotide"
            }
        # Promoter range deletion with SIGNED captures, e.g. "c.-29_-28del"
        # (del_start/del_end are negative here, unlike the branch above).
        r = re.search("c.(-[0-9]+)_(-[0-9]+)del", row["Mutation"])
        if r:
            del_start = int(r.group(1))
            del_end = int(r.group(2))
            if gene.strand == "+":  # "embA" "c.-29_-28del"
                genome_start = gene.start + del_start - 1
                genome_end = gene.start + del_end + 1
            else:  # "alr" "c.-283_-280delCAAT"
                genome_start = gene.start - del_end - 1
                genome_end = gene.start - del_start + 1
            ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
            alt = ref[0]
            mutations[(row["Gene"], row["Mutation"])] = {
                "pos": genome_start,
                "ref": ref,
                "alt": alt,
                "gene": row["Gene"],
                "type": "nucleotide"
            }
        # Coding insertion, e.g. "rpoB" "c.1296_1297insTTC".
        r = re.search("c.([0-9]+)_([0-9]+)ins([ACGT]+)", row["Mutation"])
        if r:
            ins_start = int(r.group(1))
            ins_end = int(r.group(2))
            ins_seq = r.group(3)
            if gene.strand == "+":  # "rpoB" "c.1296_1297insTTC"
                genome_start = gene.start + ins_start - 1
                genome_end = gene.start + ins_end - 1
            else:  # "pncA" "c.521_522insT"
                # Reverse-complement the inserted bases for "-" strand genes.
                ins_seq = pp.revcom(ins_seq)
                genome_start = gene.start - ins_start
                genome_end = gene.start - ins_end + 2
            ref = refseq["Chromosome"][genome_start - 1:genome_end - 1]
            alt = ref + ins_seq
            mutations[(row["Gene"], row["Mutation"])] = {
                "pos": genome_start,
                "ref": ref,
                "alt": alt,
                "gene": row["Gene"],
                "type": "nucleotide"
            }
        # Free-text categories pass through unchanged.
        if row["Mutation"] == "frameshift":
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        if row["Mutation"] == "large_deletion":
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
        if row["Mutation"][:19] == "any_missense_codon_":
            converted_mutations[(row["Gene"], row["Mutation"])] = row["Mutation"]
    print("Converting %s mutations" % len(mutations))
    # Resolve the genome-level records to final notations.
    mutation_conversion = get_ann(mutations)
    for key in mutation_conversion:
        converted_mutations[key] = mutation_conversion[key]
    # Rewrite the CSV, logging every mutation whose notation changed.
    # NOTE(review): `row` here is the last row left over from the loop above
    # and is only used to derive the fieldnames before being rebound below;
    # a KeyError in converted_mutations means a notation no branch matched.
    with open(args.out + ".csv", "w") as O:
        with open(args.out + ".log", "w") as L:
            writer = csv.DictWriter(O, fieldnames=list(row))
            writer.writeheader()
            for row in csv.DictReader(open(args.csv)):
                key = (row["Gene"], row["Mutation"])
                if row["Mutation"] != converted_mutations[key]:
                    L.write(
                        f'Recoded {row["Gene"]} {row["Mutation"]} as {converted_mutations[key]}\n'
                    )
                row["Mutation"] = converted_mutations[key]
                writer.writerow(row)
def main(args):
    """Find convergent SNP sites on a phylogeny via ancestral state reconstruction.

    If no SNP fasta is supplied one is produced from the VCF, iqtree -asr is
    run (unless its .state output already exists), ancestral states are
    loaded for internal nodes and leaf states are taken straight from the
    alignment.  Sites where a nucleotide change arose on more than one
    branch are written to args.out as "<position>\t<number of origins>".
    """
    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()
    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            quit()
        pp.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt"
            % vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    # Run ancestral state reconstruction only if not already done.
    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))
    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # iqtree labels internal nodes "Name/support"; compare on the name part.
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)  # site -> {node/sample name: state}
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":  # column-header line of the .state table
            continue
        site = int(row[1])
        if row[0] not in internal_node_names:
            continue
        states[site][row[0]] = row[2]
    # Leaves take their state straight from the alignment column.
    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]
    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue  # monomorphic site - nothing to do
        # Set up storage objects
        origins = []
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            # An independent origin: state differs from the parent and both
            # states are unambiguous nucleotides (ambiguity codes ignored).
            if node_state != n.get_ancestors(
            )[0].state and node_state in acgt and n.get_ancestors(
            )[0].state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            # Alignment column -> original VCF position (1-based -> 0-based).
            convergent_sites.append((site, vcf_positions[site - 1], origins))
    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
def main_preprocess(args):
    """Build covid-profiler preprocessing outputs from a set of sequences.

    Converts the input fasta to a merged VCF, annotates the variants, and
    (in the currently dead second half) classifies sites from an ancestral
    state reconstruction as barcoding/convergent/unique, writing the tree,
    a barcode bed and a mutation summary CSV.

    NOTE(review): this function looks like work in progress — `mutations`
    is referenced before being defined (the code that should build it is
    commented out), and everything after the quit() call is unreachable.
    """
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]
    if args.seqs:
        seqs = pp.fasta(args.seqs)
        samples = list(seqs.fa_dict.keys())
        fasta2vcf(args.seqs, "merged.vcf.gz")
    #
    #
    # pp.run_cmd("vcf2fasta.py --vcf merged.vcf.gz --ref %s" % conf["ref"])
    # pp.run_cmd("iqtree -s merged.fa -m GTR+F+R2 -bb 1000 -nt AUTO -asr -czb -redo")
    variant_data = get_variant_data("merged.vcf.gz", conf["ref"], conf["gff"],
                                    conf["proteins"])
    # Example shapes of the two data structures used below:
    #### variant_data = {29366: {'alts': 'T', 'types': 'missense', 'changes': '365P>365S', 'gene': 'N', 'gene_function': 'Nucleocapsid protein', 'gene_reference': '10.1186/s40779-020-00240-0'}} ####
    #### mutations = [{'position': 29868, 'mutation_type': 'convergent', 'origins': 14, 'branches': 'Node927'}]
    # NOTE(review): `mutations` is UNDEFINED at this point — the call that
    # should create it appears to have been commented out above, so this
    # loop raises NameError.  TODO confirm and restore.
    for i in range(len(mutations)):
        # Merge the variant annotation into each mutation record.
        for key in variant_data[mutations[i]["position"]]:
            mutations[i][key] = variant_data[mutations[i]["position"]][key]
    with open(args.out + ".csv", "w") as O:
        # Column order: first four keys, then the last six (annotation
        # fields), then everything in between.
        tmp = list(mutations[0].keys())
        fieldnames = tmp[:4] + tmp[-6:] + tmp[4:-6]
        writer = csv.DictWriter(O, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(mutations)
    quit()
    # ------------------------------------------------------------------
    # NOTE(review): everything below is dead code — never reached because
    # of the quit() above.  Kept verbatim.
    # ------------------------------------------------------------------
    seqs = pp.fasta("merged.fa").fa_dict
    tree = ete3.Tree("merged.fa.treefile", format=1)
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names
    # Strip "/support" suffixes from internal node labels, keeping the
    # support value on the node object.
    for n in tree.traverse():
        if n.name.split("/")[0] in node_names:
            if "Node" in n.name:
                tmp = n.name.split("/")
                n.name = tmp[0]
                if len(tmp) > 1:
                    n.support = tmp[1]
    states = defaultdict(dict)  # site -> {node/sample name: state}
    sites = set()
    sys.stderr.write("Loading states\n")
    for l in tqdm(open("merged.fa.state")):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":  # column-header line
            continue
        states[int(row[1])][row[0]] = row[2]
        sites.add(int(row[1]))
    sys.stderr.write("Loading alignment sites\n")
    # Leaves take their state straight from the alignment column.
    for site in tqdm(sites):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]
    barcoding_sites = []
    convergent_sites = []
    mutations = []
    for site in tqdm(sites):
        if site not in variant_data:
            continue  ####### Not sure if this is right yet
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue  # monomorphic site
        # Set up storage objects
        origins = []
        internal_node_change = False
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            # A state change relative to the parent counts as an origin.
            if node_state != n.get_ancestors()[0].state:
                origins.append(n.name)
                if n.name in internal_node_names:
                    internal_node_change = True
            n.add_feature("state", node_state)
        # NOTE: "type" shadows the builtin; kept unchanged here.
        type = "unique"
        if internal_node_change and len(origins) == 1:
            type = "barcoding"
            barcoding_sites.append(site)
        if len(origins) > 1:
            type = "convergent"
            convergent_sites.append(site)
        tmp_data = {
            "position": site,
            "mutation_type": type,
            "origins": len(origins),
            "branches": ",".join(origins),
            "gene": variant_data[site]["gene"],
            "gene_function": variant_data[site]["gene_function"],
            "gene_reference": variant_data[site]["gene_reference"],
            "alts": variant_data[site]["alts"],
            "functional_types": variant_data[site]["types"],
            "changes": variant_data[site]["changes"]
        }
        for sample in leaf_names:
            tmp_data[sample] = states[site][sample]
        mutations.append(tmp_data)
    print("Barcoding sites: ", barcoding_sites)
    print("Convergent sites: ", convergent_sites)
    # Reroot tree at S/L types (leaves carrying "T" at position 8782 form
    # the outgroup).
    outgroup_leaf_names = [s for s in leaf_names if seqs[s][8782 - 1] == "T"]
    tree.set_outgroup(tree.get_common_ancestor(outgroup_leaf_names))
    tree.write(format=1, outfile=args.out + ".tree")
    with open(args.out + ".barcode.bed", "w") as O:
        for pos in barcoding_sites:
            for allele in set(list(states[pos].values())):
                tmp_samps = [x for x in leaf_names if states[pos][x] == allele]
                O.write("%s\t%s\t%s\t%s\t%s\n" %
                        (refseqname, pos - 1, pos, allele,
                         tree.get_common_ancestor(tmp_samps).name))
    with open(args.out + ".mutation_summary.csv", "w") as O:
        writer = csv.DictWriter(
            O,
            fieldnames=[
                "position", "mutation_type", "origins", "branches", "gene",
                "gene_function", "gene_reference", "alts", "functional_types",
                "changes"
            ] + list(leaf_names))
        writer.writeheader()
        for row in mutations:
            writer.writerow(row)
def main_profile(args):
    """Profile one sample (assembly fasta or fastq reads) against the covid db.

    Produces a whole-genome VCF (directly from an assembly, or by trimming,
    mapping and variant calling reads), assigns a clade from the barcode
    positions, prints "<prefix>\t<clade>" to stdout and writes variant
    annotations to <files_prefix>.results.json.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)
    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix
    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            quit()
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            quit()
        ### Create bam file if fastq has been supplied ###
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming.  Bug fix: the original passed the
            # unset args.read2 to pp.fastq here (and mislabelled the branch
            # as "trimming").
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # Remove the intermediate trimmed fastqs created above.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    refseq = pp.fasta(conf["ref"]).fa_dict
    refseqname = list(refseq.keys())[0]
    results = {}
    # Clade assignment from the barcode positions.
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade
    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data
    # Context manager (fix: the original leaked the JSON output handle).
    with open("%s.results.json" % files_prefix, "w") as J:
        json.dump(results, J)
def main(args):
    """Split the input into chunks of args.size.

    A bed file (--bed) is split directly; otherwise the fasta (--fasta) is
    split by chromosome.  Both paths honour args.reformat.
    """
    if args.bed:
        pp.split_bed(args.bed, args.size, reformat=args.reformat)
        return
    pp.fasta(args.fasta).splitchr(args.size, reformat=args.reformat)
def find_ancestral_mutations(msa_file, tree_file, states_file,
                             variant_sites=None):
    """Classify alignment sites by how their mutations arose on a phylogeny.

    Combines iqtree ancestral states (internal nodes) with the alignment
    (leaves) and, for every polymorphic site, counts independent origins of
    a state change along the tree.  A site is "barcoding" when the single
    origin is on an internal branch, "convergent" when there is more than
    one origin, otherwise "unique".

    Args:
        msa_file: fasta alignment of the leaf sequences.
        tree_file: newick tree with iqtree "Name/support" labels; a cleaned
            copy is written to tree_file + ".reformatted.tree".
        states_file: iqtree .state table of ancestral states.
        variant_sites: optional collection of 1-based sites to restrict to.

    Returns:
        A list of dicts: position, mutation_type, origins (count), branches
        (comma-joined names) plus one key per leaf with its state.
    """
    seqs = pp.fasta(msa_file).fa_dict
    tree = ete3.Tree(tree_file, format=1)
    # iqtree labels internal nodes "Name/support"; compare on the name part.
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names
    # Strip "/support" suffixes from internal node labels, keeping the
    # support value on the node object.
    for n in tree.traverse():
        if n.name.split("/")[0] in node_names:
            if "Node" in n.name:
                tmp = n.name.split("/")
                n.name = tmp[0]
                if len(tmp) > 1:
                    n.support = tmp[1]
    tree.write(format=1, outfile=tree_file + ".reformatted.tree")
    states = defaultdict(dict)  # site -> {node/sample name: state}
    sites = set()
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":  # column-header line of the .state table
            continue
        site = int(row[1])
        if variant_sites and site not in variant_sites:
            continue
        if row[0] not in internal_node_names:
            continue
        states[site][row[0]] = row[2]
        sites.add(site)
    sys.stderr.write("Loading alignment sites\n")
    # Leaves take their state straight from the alignment column.
    for site in tqdm(sites):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]
    barcoding_sites = []
    convergent_sites = []
    mutations = []
    for site in tqdm(sites):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue  # monomorphic site - nothing to do
        # Set up storage objects
        origins = []
        internal_node_change = False
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            # An independent origin: state differs from the parent and
            # neither side is an ambiguous "N".
            if node_state != n.get_ancestors(
            )[0].state and node_state != "N" and n.get_ancestors(
            )[0].state != "N":
                origins.append(n.name)
                if n.name in internal_node_names:
                    internal_node_change = True
            n.add_feature("state", node_state)
        # NOTE: "type" shadows the builtin; kept unchanged here.
        type = "unique"
        if internal_node_change and len(origins) == 1:
            type = "barcoding"
            barcoding_sites.append(site)
        if len(origins) > 1:
            type = "convergent"
            convergent_sites.append(site)
        tmp_data = {
            "position": site,
            "mutation_type": type,
            "origins": len(origins),
            "branches": ",".join(origins),
        }
        for sample in leaf_names:
            tmp_data[sample] = states[site][sample]
        mutations.append(tmp_data)
    return mutations