def write_seqs(tree, in_seqdir, out_seqdir, index, node_info):
    """
    Write sequences of proteins from tree to a file.

    These are the sequences involved in the duplication node and one from
    the nearest outgroup. The name of the outgroup tip is also appended, on
    its own line, to the file "outgroups" in the current working directory.

    ARGUMENTS:
    tree - the tree with tips that need sequences written. Every node in
        tree.descendants is examined: a node whose name equals the node
        label contributes all of its leaves, any other node contributes
        only its first leaf.
    in_seqdir - directory holding the input fasta file "<first field of
        node_info>.fa"
    out_seqdir - directory the output file "seqs_<index>.fa" is written to
    index - a unique identifier to tag the filename with.
    node_info - string of exactly two whitespace separated fields: the name
        of the input fasta file (without ".fa") and the label of the
        duplication node.

    RAISES:
    ValueError - if node_info does not hold exactly two fields, or if no
        outgroup tip is found in the tree.
    KeyError - if a tip has no entry in the input sequences.
    """
    og, node_lab = node_info.split()
    print(og, node_lab)
    print(tree)
    tips = []
    outgroup = None
    for node in tree.descendants:
        leaf_names = node.get_leaf_names()
        if node.name == node_lab:
            tips.extend(leaf_names)
        else:
            # Only the first leaf of a node outside the duplication is kept.
            # The last such leaf seen is the one recorded as the outgroup.
            outgroup = leaf_names[0]
            tips.append(outgroup)
    if outgroup is None:
        # Fail before any file is touched rather than after the sequence
        # file has already been written.
        raise ValueError("no outgroup tip found for node " + node_lab +
                         " of " + og)

    in_seqs = mod.read_fasta(in_seqdir + "/" + og + ".fa")
    # Collect everything first so that a missing tip raises before the output
    # file is created.
    newseqs = {}
    for tip in tips:
        print(og + " " + tip)
        newseqs[str(tip)] = in_seqs[str(tip)]

    with open(out_seqdir + "/seqs_" + str(index) + ".fa", "w") as out:
        for key, value in newseqs.items():
            out.write(">" + key + "\n" + value + "\n")
    with open("outgroups", "a") as out:
        out.write(outgroup + "\n")
def main():
    """
    Remove the shortest paralog from a terminal duplication.

    Take a tree and sequence file from user arguments.
    While there are terminal duplications (2 sequences from the same species
    who are each other's nearest relative):
        Remove the sequence of the pair with the shortest sequence.

    If more than one tip remains, the collapsed tree is written to
    <treefile>.paralogs_colapsed and the sequences of the remaining tips to
    <seqfile>.paralogs_colapsed.

    Tip names are expected to be a 4 character species code, one separator
    character and then the sequence name (with "[" and "]" replaced by "_").
    """
    treefile, seqfile = get_args()
    seqs = mod.read_fasta(seqfile)

    #modify seq names to match the treefile
    for seq in list(seqs.keys()):
        repl = re.sub(r"\[|\]", "_", seq)
        seqs[repl] = seqs.pop(seq)

    tree = ete3.Tree(treefile, format=1)
    # Each pass makes at most one change to the tree and then restarts the
    # traversal, so the tree is never walked while it is being modified.
    # The loop stops once a full pass leaves the number of nodes unchanged.
    while len(tree) > 1:
        print(tree)
        n_nodes_before = sum(1 for _ in tree.traverse())
        for node in tree.traverse():
            if len(node) == 1 and not node.is_leaf():
                # internal node left with a single tip by an earlier removal
                node.delete()
                break
            if len(node) == 2:
                descendants = list(node.iter_descendants())
                if descendants[0].name[:4] == descendants[1].name[:4]:
                    # same species on both sides: drop the shorter sequence
                    shortest = min(
                        descendants,
                        key=lambda tip: len(seqs[tip.name[5:]]))
                    shortest.detach()
                    break
        if sum(1 for _ in tree.traverse()) == n_nodes_before:
            break

    #Write tree
    if len(tree) > 1:
        tree.write(outfile=treefile + ".paralogs_colapsed")
        with open(seqfile + ".paralogs_colapsed", "w") as out:
            for leaf in tree:
                name = leaf.name[5:]
                if name in seqs:
                    key = name
                else:
                    # Fall back to the first sequence whose name starts with
                    # the tip name, compared literally (not as a regex).
                    candidates = [k for k in seqs if k.startswith(name)]
                    if not candidates:
                        raise KeyError("no sequence found for tip " +
                                       leaf.name)
                    key = candidates[0]
                # the trailing newline keeps the next header on its own line
                out.write(">" + name + "\n" + seqs[key] + "\n")
Ejemplo n.º 3
0
def main():
    """
    Select gene families whose tree matches the species (duplication) tree.

    Load the species tree that the gene trees need to emulate.
    For each gene tree found in the gene tree directory:
        load the tree and make a copy passed through remove_extensions
            (presumably leaving just the species names as tip labels)
        skip the family unless correct_n_taxa accepts it
        skip the family unless the ingroup is monophyletic
        add unique ids to the ingroup tips
        if the Robinson-Foulds distance to the species tree is 0, write
        the sequences of the family, renamed by modified_sequence_dict, to
        <outdir>/<family>.fa

    The family name is the part of the tree file name before the first "_";
    its sequences are read from <seq_files>/<family>.fa.
    """
    # local import: only needed to take the file name off a tree path
    import os

    species_tree, gene_trees, seq_files, outdir, extension = get_args()
    gene_trees = glob.glob(gene_trees + "/*" + extension)

    duplication_tree = ete3.Tree(species_tree)
    pure_sp_tree = remove_extensions(duplication_tree.copy())

    outgroup = ["Nstr", "Isca", "Ssca"]
    ingroup_names = [leaf.name for leaf in get_ingroup(pure_sp_tree, outgroup)]
    ingroup = ingroup_names

    for treefile in gene_trees:
        tree = ete3.Tree(treefile)
        pure_gene_tree = remove_extensions(tree.copy())

        #check the number of taxa is to specification
        if not correct_n_taxa(pure_gene_tree, ingroup, outgroup):
            continue

        #add unique names, but only if ingroup is monophyletic
        # check_monophyly returns a tuple whose first item is the boolean
        # result; the tuple itself is always truthy, so index it.
        is_monophyletic = pure_gene_tree.check_monophyly(
            values=ingroup_names, target_attr="name")[0]
        if not is_monophyletic:
            continue
        add_unique_ids(pure_gene_tree, ingroup_names)

        #Continue if rf distance is 0. Given our previous taxa occupancy
        #checks this will confirm our trees are correct
        if pure_gene_tree.robinson_foulds(duplication_tree)[0] != 0:
            continue

        #find sequence file
        # Use the file name itself so directories with more than one path
        # component (or absolute paths) work too.
        base_name = os.path.basename(treefile).split("_")[0]
        seqs = mod.read_fasta(seq_files + "/" + base_name + ".fa")

        new_seqs = modified_sequence_dict(tree, pure_gene_tree, seqs,
                                          outgroup)

        with open(outdir + "/" + base_name + ".fa", "w") as out:
            for key, value in new_seqs.items():
                out.write(">" + key + "\n" + value + "\n")
Ejemplo n.º 4
0
def main():
    """
    Write a random subset of sites from every sequence to a new fasta file.

    1. Get user input of infile, outfile and the number of sites to keep
    2. Read in the sequences
    3. Draw the requested number of positions at random (without
    replacement) from the length of the first sequence and sort them
    4. Write every sequence reduced to those same positions to the outfile

    The positions are drawn once, so all sequences are expected to be at
    least as long as the first one (e.g. an alignment).
    """
    infile, outfile, n_sites = get_args()
    seqs = mod.read_fasta(infile)
    indices = None
    # The context manager closes the file even if sampling or indexing fails.
    with open(outfile, "w") as out:
        for key, value in seqs.items():
            if indices is None:
                # drawn from the first sequence only and reused for the rest
                indices = sorted(
                    random.sample(range(len(value)), int(n_sites)))
            subset = "".join([value[i] for i in indices])
            out.write(">" + key + "\n" + subset + "\n")
Ejemplo n.º 5
0
def main():
    """
    Perform the following.

    1. Get user input of infile and outfile
    2. Read in the sequences from the transcriptome
    3. Pass them through remove_guff. For each sequence this is meant to take
    the first start codon in any frame and read through until a stop codon
    is reached, dropping sequences where no stop codon is reached.
    4. Write the resulting sequences to the outfile in fasta format.
    """
    infile, outfile = get_args()
    coding_seqs = remove_guff(mod.read_fasta(infile))

    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as out:
        for name, sequence in coding_seqs.items():
            out.write(">" + name + "\n" + sequence + "\n")
Ejemplo n.º 6
0
def write_seqs(tree, in_seqdir, out_seqdir, index, node_info):
    """
    Write sequences of proteins from tree to a file.

    ARGUMENTS:
    tree - the tree with tips that need sequences written
    in_seqdir - directory holding the input fasta file "<first field of
        node_info>.fa"
    out_seqdir - directory the output file "seqs_<index>.fa" is written to
    index - a unique identifier to tag the filename with.
    node_info - whitespace separated string; only its first field is used,
        as the name of the input fasta file (without ".fa").

    RAISES:
    KeyError - if a tip has no entry in the input sequences.
    """
    og = node_info.split()[0]
    print(tree)
    tips = tree.get_leaf_names()
    print(tips)
    in_seqs = mod.read_fasta(in_seqdir + "/" + og + ".fa")
    # Collect everything first so that a missing tip raises before the output
    # file is created.
    newseqs = {}
    for tip in tips:
        print(og + " " + tip)
        newseqs[str(tip)] = in_seqs[str(tip)]
    with open(out_seqdir + "/seqs_" + str(index) + ".fa", "w") as out:
        for key, value in newseqs.items():
            out.write(">" + key + "\n" + value + "\n")