Example #1
0
 def make_alns_dict(self):
     """Build one dendropy alignment per gene from ``self.comb_seq``.

     Every gene entry of ``self.comb_seq`` must hold the same number of
     keys; this is checked with an ``assert``. Each gene's dict is then
     converted to a ``DnaCharacterMatrix``, stored in ``self.aln_all`` and
     written as fasta into ``self.workdir``. Every alignment after the
     first is created with the first alignment's taxon namespace.

     NOTE(review): ``self.aln_all`` keys start at 1, the first file is
     named ``aln_0.fas`` and later files ``aln_<key>.fas`` (so no
     ``aln_1.fas`` is written) -- confirm callers rely on this numbering.
     """
     physcraper.debug("make_alns_dict")

     # Sanity check: each gene must carry as many entries as the first one.
     reference_size = None
     for seqs in self.comb_seq.values():
         size = len(seqs.keys())
         if reference_size is None:
             reference_size = size
         assert reference_size == size

     first_aln = None
     for index, seqs in enumerate(self.comb_seq.values(), start=1):
         if first_aln is None:
             # The first alignment defines the shared taxon namespace.
             first_aln = DnaCharacterMatrix.from_dict(seqs)
             matrix = first_aln
             out_path = "{}/aln_0.fas".format(self.workdir)
         else:
             matrix = DnaCharacterMatrix.from_dict(
                 seqs, taxon_namespace=first_aln.taxon_namespace)
             out_path = "{}/aln_{}.fas".format(self.workdir, index)
         self.aln_all[index] = matrix
         matrix.write(path=out_path, schema="fasta")
def _append_dates_to_labels(text, dates_dic):
    """Return *text* with each label in *dates_dic* suffixed by ``_<date>``."""
    if not dates_dic:
        # An empty alternation would match the empty string everywhere.
        return text
    # Longest labels first so a label that is a prefix of another one
    # (e.g. "A1" vs "A10") cannot win the match. Doing it in a single pass
    # also keeps an already-inserted date from being rewritten by a later
    # label, which the previous chained str.replace() calls allowed.
    labels = sorted(dates_dic, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(label) for label in labels))
    return pattern.sub(
        lambda match: match.group(0) + "_" + dates_dic[match.group(0)], text
    )


def create_sub_files(
    alignment_file,
    dates_file,
    subtree_file,
    subtree_dates_file,
    subfasta_file,
    new_dates_file,
):
    """Write dated copies of a subtree, its alignment and its dates table.

    The dates are obtained from ``read_dates(dates_file)``; the result is
    used as a mapping of taxon label -> date string (presumably that is
    what ``read_dates`` returns -- confirm against its definition).

    Args:
        alignment_file: path of the fasta alignment to read.
        dates_file: path handed to ``read_dates``.
        subtree_file: path of the tree text file to read.
        subtree_dates_file: output path for the tree text with ``_<date>``
            appended to every taxon label.
        subfasta_file: output path for the fasta alignment restricted to
            the dated taxa, with ``_<date>`` appended to every name.
        new_dates_file: output path for the dates table: first line is the
            number of taxa, then one ``<taxon>_<date>\\t<date>`` line each.
    """
    dates_dic = read_dates(dates_file)

    # Strip "None" and "NODE_<digits>" tokens from the tree text, then add
    # the dates to the end of the taxon names.
    with open(subtree_file, "r") as fp:
        content = fp.read()
    content = content.replace("None", "")
    content = re.sub(r"NODE_\d+", "", content)
    content = _append_dates_to_labels(content, dates_dic)

    with open(subtree_dates_file, "w") as fp:
        fp.write(content)

    # Add dates to the end of the sequence names.
    sub_aln_dic = {}
    dna = DnaCharacterMatrix.get(path=alignment_file, schema="fasta")

    for taxon, date in dates_dic.items():
        t = dna.taxon_namespace.get_taxon(label=taxon)
        new_taxon_name = taxon + "_" + date
        sub_aln_dic[new_taxon_name] = str(dna[t])
    sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic)
    sub_dna.write(path=subfasta_file, schema="fasta")

    # Count on the first line, then one tab-separated line per taxon.
    lines = [str(len(dates_dic))]
    lines.extend(
        taxon + "_" + date + "\t" + date for taxon, date in dates_dic.items()
    )
    with open(new_dates_file, "w") as fp:
        fp.write("\n".join(lines))
Example #3
0
# Load the query alignment from the fasta file in the working directory.
query_seq = DnaCharacterMatrix.get(
    path="ascomycota.fasta",
    schema="fasta",
)

def seq_dict_build(seq, label, seq_dict):
    """Add *seq* to *seq_dict* under *label* unless it is redundant.

    Sequences are compared with gap characters ("-") removed. The stored
    entries are scanned in order and the first containment found decides:

    * *seq* is contained in a longer stored sequence: nothing is added.
    * a stored sequence (same length or shorter) is contained in *seq*:
      that entry is removed and *seq* is added.

    If no containment is found, *seq* is simply added.

    Args:
        seq: object exposing ``symbols_as_string()``.
        label: key under which *seq* is stored.
        seq_dict: dict of label -> sequence object; modified in place.

    Returns:
        None.
    """
    new_seq = seq.symbols_as_string().replace("-", "")
    # Iterate over a snapshot because the dict may be modified in the loop.
    for tax, existing in list(seq_dict.items()):
        inc_seq = existing.symbols_as_string().replace("-", "")
        if len(inc_seq) > len(new_seq):
            if new_seq in inc_seq:
                sys.stdout.write("seq {} is subsequence of {}, not added\n".format(label, tax))
                return
        elif inc_seq in new_seq:
            # Operate on the dict that was passed in, not on a module global.
            del seq_dict[tax]
            seq_dict[label] = seq
            sys.stdout.write("seq {} is supersequence of {}, {} added and {} removed\n".format(label, tax, label, tax))
            return
    print(".")
    seq_dict[label] = seq
    return


# Sequences with more than 800 values are offered to seq_dict_build;
# every other sequence is only marked with a "*" on stdout.
for taxon, seq in query_seq.items():
    if len(seq.values()) <= 800:
        sys.stdout.write("*")
        continue
    seq_dict_build(seq, taxon.label, d)


# Write the collected sequences out as a fasta alignment.
cull = DnaCharacterMatrix.from_dict(d)
cull.write(path="query_cull.fas", schema="fasta")
Example #4
0
                taxon.label = "ncbi_id_{}".format(gi_ncbi_map[gi].strip())
        except:
            taxon.label = "gi_{}".format(gi)
            sys.stderr.write("no taxon id found for gi_{}".format(gi))
    stops.append(len(seq.values()))


# Take the middle entry of the sorted sequence lengths as the cut-off.
stops.sort()
stop = stops[len(stops) // 2]

# Truncate every sequence to the cut-off, keyed by its taxon label.
d = {str(taxon.label): seq.values()[:stop] for taxon, seq in orig_seq.items()}

dna_orig = DnaCharacterMatrix.from_dict(d)
dna_taxa = list(dna_orig.taxon_namespace)

# Read the tree into the alignment's taxon namespace.
tre_orig = Tree.get(
    path="{}_random_resolve.tre".format("ascomycota"),
    schema="newick",
    taxon_namespace=dna_orig.taxon_namespace,
)
treed_taxa = [leaf.taxon for leaf in tre_orig.leaf_nodes()]

# Drop tree tips that have no sequence ...
tre_orig.prune_taxa(set(treed_taxa) - set(dna_taxa))

# ... and sequences that have no tree tip.
for taxon in set(dna_taxa) - set(treed_taxa):
    del d[taxon.label]

#####NEXT STEPS!!!

#make a function that does this dumb stuff in orig as well
Example #5
0
                taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])]
            except:
                taxon.label = "ncbi_id_{}".format(gi_ncbi_map[gi].strip())
        except:
            taxon.label = "gi_{}".format(gi)
            sys.stderr.write("no taxon id found for gi_{}".format(gi))
    stops.append(len(seq.values()))

# The cut-off is the middle value of the sorted sequence lengths.
stops.sort()
stop = stops[len(stops) // 2]

# Build label -> sequence values, each truncated at the cut-off.
d = {
    str(taxon.label): seq.values()[:stop]
    for taxon, seq in orig_seq.items()
}

dna_orig = DnaCharacterMatrix.from_dict(d)
dna_taxa = list(dna_orig.taxon_namespace)

# The tree shares the taxon namespace of the alignment.
tre_orig = Tree.get(
    path="{}_random_resolve.tre".format("ascomycota"),
    schema="newick",
    taxon_namespace=dna_orig.taxon_namespace,
)
treed_taxa = [node.taxon for node in tre_orig.leaf_nodes()]

# Remove tips without a sequence from the tree.
tre_orig.prune_taxa(set(treed_taxa) - set(dna_taxa))

# Remove sequences without a tip from the dict.
for taxon in set(dna_taxa) - set(treed_taxa):
    del d[taxon.label]

#####NEXT STEPS!!!
Example #6
0
import sys
from dendropy import DnaCharacterMatrix
# Command line: <input nexus file> <output stub> <start> <stop>
infi = sys.argv[1]
outstub = sys.argv[2]
start = int(sys.argv[3])
stop = int(sys.argv[4])

orig = DnaCharacterMatrix.get(path=infi, schema="nexus")

# Keep only the [start:stop) slice of every sequence, keyed by taxon label.
d = {taxon.label: seq.values()[start:stop] for taxon, seq in orig.items()}

# Write the sliced alignment as "<outstub>.fas".
dna = DnaCharacterMatrix.from_dict(d)
dna.write(path="{}.fas".format(outstub), schema="fasta")
Example #7
0
    """prunes to 1 seq per spp, and fills in missing data for missing spp,
    in preparation for concanteneation, return dict to be made in char matrix"""
    aln_dict = {}
    tmp_dict = {}
    for taxon, seq in physcraper_obj.aln.items():
        aln_dict[taxon.label] = seq
    seqlen = len(seq) #should all be same bc aligned
    for spp_name in spp_dict.keys():
        try:
            otu = random.choice(spp_dict[spp_name])
            tmp_dict[spp_name] = aln_dict[otu]
        except KeyError:
            tmp_dict[spp_name] = "-" * seqlen
    return tmp_dict

# Build one alignment per gene; the second one reuses the first one's
# taxon namespace.
aln1 = DnaCharacterMatrix.from_dict(
    arbitrary_prune_fill(spp_to_otu1, gene1)
)
aln2 = DnaCharacterMatrix.from_dict(
    arbitrary_prune_fill(spp_to_otu2, gene2),
    taxon_namespace=aln1.taxon_namespace,
)

# Concatenate both alignments and write the result as fasta.
concat = DnaCharacterMatrix.concatenate([aln1, aln2])
concat.write(path="concat.fas", schema="fasta")





#Open the two physcraper objects
#Merge the alignments on OTT_ID?
#How to force/missing data ...