def make_alns_dict(self):
    """Build one DnaCharacterMatrix per gene from ``self.comb_seq``.

    Every gene's sequence dict must hold the same number of entries
    (checked with ``assert``).  The matrix built for the first gene supplies
    the taxon namespace shared by all following matrices.  Each matrix is
    stored in ``self.aln_all`` and written as fasta into ``self.workdir``.
    """
    physcraper.debug("make_alns_dict")

    # All genes have to provide the same number of sequences.
    entry_counts = [len(seqs.keys()) for seqs in self.comb_seq.values()]
    for n_entries in entry_counts[1:]:
        assert entry_counts[0] == n_entries

    # NOTE(review): keys of self.aln_all start at 1, yet the first file is
    # always named aln_0.fas (so no aln_1.fas is produced) -- confirm that
    # downstream code expects this numbering.
    reference_aln = None
    for slot, gene in enumerate(self.comb_seq.keys(), start=1):
        if reference_aln is None:
            reference_aln = DnaCharacterMatrix.from_dict(self.comb_seq[gene])
            self.aln_all[slot] = reference_aln
            reference_aln.write(
                path="{}/aln_0.fas".format(self.workdir), schema="fasta"
            )
            continue
        # Later genes reuse the taxon namespace of the first alignment.
        gene_aln = DnaCharacterMatrix.from_dict(
            self.comb_seq[gene], taxon_namespace=reference_aln.taxon_namespace
        )
        self.aln_all[slot] = gene_aln
        gene_aln.write(
            path="{}/aln_{}.fas".format(self.workdir, slot), schema="fasta"
        )
def _append_dates_to_names(text, dated_names):
    """Replace every key of ``dated_names`` found in ``text`` by its value.

    The substitution is done in a single pass with the longest names tried
    first, so a name that is a substring of another name (or of an already
    appended date) is never rewritten twice.
    """
    if not dated_names:
        return text
    longest_first = sorted(dated_names, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(name) for name in longest_first))
    return pattern.sub(lambda match: dated_names[match.group(0)], text)


def create_sub_files(
    alignment_file,
    dates_file,
    subtree_file,
    subtree_dates_file,
    subfasta_file,
    new_dates_file,
):
    """Write date-annotated copies of a subtree, its alignment and its dates.

    For every taxon returned by ``read_dates(dates_file)`` the name
    ``<taxon>_<date>`` is used in three output files:

    * ``subtree_dates_file`` -- text of ``subtree_file`` with every ``None``
      and ``NODE_<digits>`` removed and the taxon names renamed;
    * ``subfasta_file`` -- fasta alignment holding only the dated taxa, with
      sequences taken from ``alignment_file``;
    * ``new_dates_file`` -- the number of taxa on the first line, followed by
      one ``<taxon>_<date><TAB><date>`` line per taxon (no trailing newline).

    Args:
        alignment_file: path of the input fasta alignment.
        dates_file: path handed to ``read_dates``; presumably it yields a
            mapping of taxon label to date string -- verify against
            ``read_dates``.
        subtree_file: path of the input tree text.
        subtree_dates_file: output path for the renamed tree text.
        subfasta_file: output path for the renamed alignment.
        new_dates_file: output path for the renamed dates table.
    """
    dates_dic = read_dates(dates_file)
    dated_names = {
        taxon: taxon + "_" + date for taxon, date in dates_dic.items()
    }

    # clean up comments and add dates to end of taxon names
    with open(subtree_file, "r") as fp:
        content = fp.read().replace("None", "")
    content = re.sub(r"NODE_\d+", "", content)
    content = _append_dates_to_names(content, dated_names)
    with open(subtree_dates_file, "w") as fp:
        fp.write(content)

    # add dates to end of sequence names
    dna = DnaCharacterMatrix.get(path=alignment_file, schema="fasta")
    sub_aln_dic = {}
    for taxon, new_taxon_name in dated_names.items():
        t = dna.taxon_namespace.get_taxon(label=taxon)
        sub_aln_dic[new_taxon_name] = str(dna[t])
    sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic)
    sub_dna.write(path=subfasta_file, schema="fasta")

    with open(new_dates_file, "w") as fp:
        fp.write(str(len(dates_dic)))
        for taxon, date in dates_dic.items():
            fp.write("\n" + dated_names[taxon] + "\t" + date)
query_seq = DnaCharacterMatrix.get(path="ascomycota.fasta", schema="fasta")


def seq_dict_build(seq, label, seq_dict):
    """Add ``seq`` to ``seq_dict`` unless a stored sequence already contains it.

    Sequences are compared with their gap characters (``-``) removed.

    * If a longer stored sequence contains the new one, nothing is added.
    * If the new sequence contains a stored one that is not longer, that
      stored entry is deleted and the new sequence is stored under ``label``.
    * Otherwise the new sequence is simply stored under ``label``.

    Args:
        seq: sequence object offering ``symbols_as_string()``.
        label: key under which ``seq`` is stored.
        seq_dict: dict of label -> sequence object; modified in place.

    Returns:
        None.
    """
    new_seq = seq.symbols_as_string().replace("-", "")
    for tax in seq_dict:
        inc_seq = seq_dict[tax].symbols_as_string().replace("-", "")
        if len(inc_seq) > len(new_seq):
            if new_seq in inc_seq:
                sys.stdout.write("seq {} is subsequence of {}, not added\n".format(label, tax))
                return
        elif inc_seq in new_seq:
            # Returning right after the deletion keeps the iteration over
            # seq_dict valid.
            # NOTE(review): only the first contained entry is removed; other
            # stored subsequences of ``seq`` stay in the dict -- confirm that
            # this is intended.
            del seq_dict[tax]
            seq_dict[label] = seq
            sys.stdout.write("seq {} is supersequence of {}, {} added and {} removed\n".format(label, tax, label, tax))
            return
    print(".")
    seq_dict[label] = seq
    return


# NOTE(review): ``d`` is not defined in the visible part of this script; it is
# presumably created earlier -- confirm.
for taxon, seq in query_seq.items():
    # Only sequences with more than 800 characters are considered.
    if len(seq.values()) > 800:
        seq_dict_build(seq, taxon.label, d)
    else:
        sys.stdout.write("*")

cull = DnaCharacterMatrix.from_dict(d)
cull.write(path="query_cull.fas", schema="fasta")
taxon.label = "ncbi_id_{}".format(gi_ncbi_map[gi].strip()) except: taxon.label = "gi_{}".format(gi) sys.stderr.write("no taxon id found for gi_{}".format(gi)) stops.append(len(seq.values())) stops.sort() stop = stops[int(len(stops)/2)] d = {} for taxon, seq in orig_seq.items(): d[str(taxon.label)] = seq.values()[:stop] dna_orig = DnaCharacterMatrix.from_dict(d) dna_taxa = [i for i in dna_orig.taxon_namespace] tre_orig = Tree.get(path = "{}_random_resolve.tre".format("ascomycota"), schema = "newick",taxon_namespace=dna_orig.taxon_namespace) "" treed_taxa = [i.taxon for i in tre_orig.leaf_nodes()] tre_orig.prune_taxa(set(treed_taxa) - set(dna_taxa)) for taxon in set(dna_taxa) - set(treed_taxa): del d[taxon.label] #####NEXT STEPS!!! #make a function that doe sthis dumb shit in orig as well
taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])] except: taxon.label = "ncbi_id_{}".format(gi_ncbi_map[gi].strip()) except: taxon.label = "gi_{}".format(gi) sys.stderr.write("no taxon id found for gi_{}".format(gi)) stops.append(len(seq.values())) stops.sort() stop = stops[int(len(stops) / 2)] d = {} for taxon, seq in orig_seq.items(): d[str(taxon.label)] = seq.values()[:stop] dna_orig = DnaCharacterMatrix.from_dict(d) dna_taxa = [i for i in dna_orig.taxon_namespace] tre_orig = Tree.get(path="{}_random_resolve.tre".format("ascomycota"), schema="newick", taxon_namespace=dna_orig.taxon_namespace) "" treed_taxa = [i.taxon for i in tre_orig.leaf_nodes()] tre_orig.prune_taxa(set(treed_taxa) - set(dna_taxa)) for taxon in set(dna_taxa) - set(treed_taxa): del d[taxon.label] #####NEXT STEPS!!!
import sys
from dendropy import DnaCharacterMatrix

# Command line: <nexus alignment> <output stub> <start column> <stop column>
infi = sys.argv[1]
outstub = sys.argv[2]
start = int(sys.argv[3])
stop = int(sys.argv[4])

# Read the nexus alignment and keep, for every taxon, only the characters in
# the half-open column range [start, stop).
orig = DnaCharacterMatrix.get(path=infi, schema="nexus")
d = {taxon.label: seq.values()[start:stop] for taxon, seq in orig.items()}

# Write the trimmed alignment as "<outstub>.fas" in fasta format.
dna = DnaCharacterMatrix.from_dict(d)
dna.write(path="{}.fas".format(outstub), schema="fasta")
"""prunes to 1 seq per spp, and fills in missing data for missing spp, in preparation for concanteneation, return dict to be made in char matrix""" aln_dict = {} tmp_dict = {} for taxon, seq in physcraper_obj.aln.items(): aln_dict[taxon.label] = seq seqlen = len(seq) #should all be same bc aligned for spp_name in spp_dict.keys(): try: otu = random.choice(spp_dict[spp_name]) tmp_dict[spp_name] = aln_dict[otu] except KeyError: tmp_dict[spp_name] = "-" * seqlen return tmp_dict aln1 = DnaCharacterMatrix.from_dict(arbitrary_prune_fill(spp_to_otu1, gene1)) aln2 = DnaCharacterMatrix.from_dict(arbitrary_prune_fill(spp_to_otu2, gene2), taxon_namespace = aln1.taxon_namespace) concat = DnaCharacterMatrix.concatenate([aln1,aln2]) concat.write(path="concat.fas", schema="fasta") #Open the two pyscraper objects #Merge the alignements on OTT_ID? #How to force/missing data ...