예제 #1
0
def build_phylogeny_trees():
    """Align each homologous gene file and build a UPGMA tree from it.

    For every file in ``out/homologous_gene_sequences/`` this runs Clustal
    Omega (through the shell), reads the resulting FASTA alignment from
    ``out/aligned_homologous_gene_sequences/``, computes an identity distance
    matrix, builds a UPGMA tree, draws it (matplotlib and ASCII) and writes it
    in NEXUS format under ``out/phylogenetic_trees/``.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"
    tree_template = 'out/phylogenetic_trees/{}_tree.nex'

    for file_name in os.listdir(source_dir):
        # Plain concatenation: both directory constants end with a slash.
        unaligned_file = source_dir + file_name
        aligned_file = aligned_dir + file_name

        aligner_cmd = ClustalOmegaCommandline(infile=unaligned_file, outfile=aligned_file, verbose=True, auto=True)
        # The exit status is not inspected; a failed run surfaces when the
        # alignment is read below.
        os.system(str(aligner_cmd))

        alignment = AlignIO.read(aligned_file, 'fasta')

        # Identity distances -> UPGMA tree.
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)

        print('\nPhylogenetic Tree\n', file_name)
        Phylo.draw_ascii(upgma_tree)
        Phylo.write([upgma_tree], tree_template.format(file_name), 'nexus')
예제 #2
0
 def printGeneTree(self):
     """
     Build a UPGMA gene tree from the PHYLIP alignment and display it.

     Reads the alignment stored at ``self.newPhylip``, computes an
     identity-based distance matrix and prints it, then builds a UPGMA tree
     from that matrix and renders it both with matplotlib and as ASCII art in
     the terminal.

     input: ``self.newPhylip`` -- the PHYLIP alignment to read
     output: nothing is returned; the matrix and the tree are printed/drawn
     """
     alignment = AlignIO.read(self.newPhylip, 'phylip')
     distance_matrix = DistanceCalculator('identity').get_distance(alignment)

     matrix_banner = '\n======================================== DISTANCE MATRIX =======================================\n'
     print(matrix_banner)
     print(distance_matrix, "\n\n")

     # UPGMA tree built from the pairwise distances.
     gene_tree = DistanceTreeConstructor().upgma(distance_matrix)

     tree_banner = '\n========================================= GENE TREE ===========================================\n'
     print(tree_banner)
     Phylo.draw(gene_tree)  # graphical rendering; requires matplotlib
     Phylo.draw_ascii(gene_tree)  # text rendering in the terminal
def main_new(fastafile, bkp):
    """Return the id of the sequence left out of the most stable pair.

    The three-sequence FASTA alignment is split at column ``bkp``.  For each
    pair of sequences (a-b, a-c, b-c, in file order) the blosum62 distance is
    computed on both segments; the pair whose distance changes least between
    the two segments is selected and the id of the remaining, third sequence
    is returned.

    :param fastafile: path of a FASTA alignment; the code indexes exactly the
        first three records
    :param bkp: column index at which the alignment is split
    :return: id of the sequence that is not part of the most stable pair
    """
    distance_name = ["ab", "ac", "bc"]
    seq_name = [*SeqIO.to_dict(SeqIO.parse(fastafile, "fasta"))]
    # Hand the path to AlignIO so it opens and closes the file itself; the
    # previous open(fastafile) handle was never closed.
    aln = AlignIO.read(fastafile, 'fasta')
    calculator = DistanceCalculator('blosum62')
    segment_1 = calculator.get_distance(aln[:, :bkp])
    segment_2 = calculator.get_distance(aln[:, bkp:])

    # (row, column) positions of the ab, ac and bc distances.
    pair_positions = [(1, 0), (2, 0), (2, 1)]
    compare_distance = [
        abs(segment_1[seq_name[row]][col] - segment_2[seq_name[row]][col])
        for row, col in pair_positions
    ]  # in order of ab, ac, bc

    stable_pair = distance_name[compare_distance.index(min(compare_distance))]
    # Whatever letter is not in the stable pair names the sequence to return.
    remaining = "abc".replace(stable_pair[0], "").replace(stable_pair[1], "")
    return seq_name["abc".index(remaining)]
예제 #4
0
def plot_alignment_heatmap(alignments, trans_dict=None, title="Percent difference"):
    """Plot the pairwise identity distances of an alignment as an annotated heatmap.

    :param alignments: alignment passed to ``DistanceCalculator('identity')``
    :param trans_dict: optional mapping from record id to display label; when
        omitted, the second and third whitespace-separated words of each
        record description are used as the label
    :param title: title placed on the figure
    :return: the matplotlib figure holding the heatmap
    """
    # Distance calculation as described at https://biopython.org/wiki/Phylo
    dm = DistanceCalculator('identity').get_distance(alignments)

    if trans_dict is None:
        # Derive human-readable labels from the record descriptions.
        trans_dict = {
            record.id: " ".join(record.description.split()[1:3])
            for record in alignments
        }

    # The same labels are used for rows and columns.
    labels = [trans_dict[name] for name in dm.names]
    frame = pd.DataFrame(dm.matrix, index=labels, columns=list(labels))

    plt.figure()
    heatmap_options = {
        "fmt": '3.2f',
        "annot": True,
        "linewidths": 0.5,
        "cmap": sns.light_palette("navy"),
        "cbar": False,
        "square": True,
    }
    # Distances are fractions; scale to percent for display.
    sns.heatmap(frame * 100, **heatmap_options)
    plt.title(title)
    plt.tight_layout()
    return plt.gcf()
예제 #5
0
def plot_phylo_tree(align: MultipleSeqAlignment, accession_numbers: dict):
    """
    Plots a phylogenetic tree
    :param align: MultipleSeqAlignment with the alignment result to be plotted
    :param accession_numbers: dict of accession numbers and their translation to human-understandable names
    :return: figure-handle of the plotted phylogenetic tree
    """
    # calculate distance - https://biopython.org/wiki/Phylo
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(align)

    # construct a tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)

    # remove the names for the non-terminals for better visual appeal
    for non_terminal in tree.get_nonterminals():
        non_terminal.name = ''

    # Change accession numbers into more readable names.  The lookup key is
    # the leading non-whitespace text before its last dot.  Raw string: the
    # previous non-raw literal relied on invalid escape sequences.
    accession_pattern = re.compile(r"(^\S*)(?=\.)")
    for terminal in tree.get_terminals():
        terminal.name = accession_numbers[accession_pattern.match(terminal.name)[0]]

    # draw_ascii writes the tree itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    Phylo.draw_ascii(tree)

    # plot the tree
    fig, ax = plt.subplots(1, 1)
    # draw the resulting tree
    Phylo.draw(tree, show_confidence=False, axes=ax, do_show=False)
    ax.set_xlim(right=0.8)
    return fig
예제 #6
0
def fastaToNJTree(fastaFile, outputFile):
    """Build a neighbor-joining tree from a FASTA alignment and write it as Newick.

    :param fastaFile: FASTA alignment to read
    :param outputFile: destination passed to ``Phylo.write``
    """
    aln = AlignIO.read(fastaFile, 'fasta')
    calculator = DistanceCalculator('identity')
    # build_tree() derives the distance matrix from the alignment using the
    # calculator, so the separate get_distance() call whose result was never
    # used has been dropped.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)
    Phylo.write(tree, outputFile, 'newick')
def calculate_weight_vector(aln_obj,
                            algorithm='pairwise',
                            calc_mx='identity',
                            repeat=1000,
                            nucl=False):
    """Return one normalised weight per sequence of ``aln_obj``.

    :param aln_obj: alignment whose records are weighted
    :param algorithm: ``'voronoi'`` or ``'pairwise'``
    :param calc_mx: model name; passed to ``DistanceCalculator`` for
        ``'voronoi'`` and to ``tree_construct`` for ``'pairwise'``
    :param repeat: number of sampled sequences for ``'voronoi'``
    :param nucl: forwarded to ``tree_construct`` for ``'pairwise'``
    :return: list of weights in alignment order, summing to 1
    :raises ValueError: if ``algorithm`` is not one of the supported names
    """
    alg_types = ['voronoi', 'pairwise']
    if algorithm not in alg_types:
        raise ValueError("Invalid algorithm type. Expected one of: %s" %
                         alg_types)

    if algorithm == 'voronoi':
        calculator = DistanceCalculator(calc_mx)
        convergence_vr = [0] * len(aln_obj)
        # The counter has its own name: it used to be ``i``, the same name as
        # the comprehension variable below, which overwrites the counter under
        # Python 2 scoping rules.
        draws_done = 0
        while draws_done < repeat:
            test_seq = generate_sequence_sampled_from_alignment(aln_obj)
            wei_vr = [
                calculator._pairwise(seq_obj.seq, test_seq)
                for seq_obj in aln_obj
            ]
            closest_seq = min(wei_vr)
            closest_sequences = [
                pos for pos, dist in enumerate(wei_vr) if dist == closest_seq
            ]
            # Ties share one vote equally.
            for pos in closest_sequences:
                convergence_vr[pos] += 1 / len(closest_sequences)
            draws_done += 1
        # Computed once instead of once per element.
        total_votes = sum(convergence_vr)
        return [votes / total_votes for votes in convergence_vr]

    # algorithm == 'pairwise': weight by summed tree distance to all records.
    tree = tree_construct(aln_obj, nucl=nucl, calc_mx=calc_mx)
    distance_sums = [
        sum(tree.distance(seq_obj.id, seq_obj2.id) for seq_obj2 in aln_obj)
        for seq_obj in aln_obj
    ]
    total_distance = sum(distance_sums)
    return [dist / total_distance for dist in distance_sums]
예제 #8
0
def dna(file_path, file_format, algorithm):
    """Read an alignment, print its identity distance matrix and draw a tree.

    :param file_path: alignment file to read
    :param file_format: format name understood by ``AlignIO.read``
    :param algorithm: ``'upgma'`` or ``'nj'`` (case-insensitive); any other
        value is reported with ``click.echo`` and no tree is drawn
    """
    # Read the alignment
    aln = AlignIO.read(file_path, file_format)

    # Print the alignment
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the distance matrix (the calculator object was printed before,
    # not the matrix itself).
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    chosen = algorithm.lower()
    if chosen == 'upgma':
        tree = constructor.upgma(dm)
    elif chosen == 'nj':
        tree = constructor.nj(dm)
    else:
        click.echo('Invalid algorithm!')
        # Nothing to draw; falling through used to fail on the unbound
        # ``tree`` variable.
        return

    # Draw the phylogenetic tree
    Phylo.draw(tree)

    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
예제 #9
0
def consensus(msa):
    """Build a neighbor-joining tree from ``msa`` and print it.

    :param msa: records accepted by the ``MultipleSeqAlignment`` constructor
    """
    alignment = MultipleSeqAlignment(msa)
    calculator = DistanceCalculator('identity')
    # build_tree() computes the distances itself through the calculator; the
    # separately computed matrix was never used and has been removed.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(alignment)
    # Function-call form works on Python 2 and 3; the statement form is a
    # SyntaxError on Python 3.
    print(tree)
예제 #10
0
def tree(from_cluster,to_cluster, grupa):
    """Write majority-consensus trees for a range of cluster alignments.

    For every cluster number in ``range(from_cluster, to_cluster)`` the MSA
    file is read, 50 bootstrap neighbor-joining trees are generated, trees
    containing a clade with support below 50 are discarded, and the majority
    consensus of the remaining trees is collected.  All consensus trees are
    written in Newick format to ``"drzewa_wynikowe_" + str(grupa)``.

    :param from_cluster: first cluster number (inclusive)
    :param to_cluster: last cluster number (exclusive)
    :param grupa: suffix of the output file name
    """
    consensus_trees = []

    for i in range(from_cluster, to_cluster):
        # Raw string: same text as before, without the invalid "\m" escape.
        msa = AlignIO.read(r'msa\msa_rodzina_' + str(i) + '_s.fasta', 'fasta')
        print(i)
        calculator = DistanceCalculator('identity')

        try:
            constructor = DistanceTreeConstructor(calculator, 'nj')
            trees_list = list(bootstrap_trees(msa, 50, constructor))
            not_included = set()

            # The support check runs for every bootstrap tree.  It used to sit
            # outside this loop, so only the last tree was ever examined.
            for j, target_tree in enumerate(trees_list):
                support_tree = get_support(target_tree, trees_list)
                if any(node.confidence < 50
                       for node in support_tree.get_nonterminals()):
                    not_included.add(j)

            trees = [candidate for k, candidate in enumerate(trees_list)
                     if k not in not_included]

            if trees:
                consensus_trees.append(majority_consensus(trees))

        except ValueError as err:
            # Best effort: a cluster that cannot be processed is skipped.  The
            # previous bare ``except`` hid every kind of failure, including
            # programming errors and Ctrl-C.
            print("skipping cluster " + str(i) + ": " + str(err))

    Phylo.write(consensus_trees,"drzewa_wynikowe_" + str(grupa),"newick")
예제 #11
0
def nj_tree_constructor(x):
    """Print the neighbor-joining tree of alignment ``x``, then draw it as ASCII.

    :param x: alignment passed to ``DistanceCalculator.get_distance``
    """
    tree_builder = DistanceTreeConstructor()
    identity_distances = DistanceCalculator("identity").get_distance(x)
    nj_result = tree_builder.nj(identity_distances)
    print(nj_result)
    Phylo.draw_ascii(nj_result)
예제 #12
0
def construct_id_dm(seq_df,
                    seq_fpath,
                    align_outpath="tmp/iddm_align.fasta",
                    ordered=False,
                    aligned=False,
                    kalign_silent=True):
    """Build the identity distance matrix of the records in seq_df as a numpy array.

    :param seq_df: DataFrame whose index selects the records to include
    :param seq_fpath: path of the fasta file the records are filtered from;
        the filtered records are written to tmp/alias_matches.fasta
    :param align_outpath: path the kalign alignment is written to
        (default tmp/iddm_align.fasta); ignored when ``aligned`` is True
    :param ordered: forwarded to filter_fasta_infile
    :param aligned: True: kalign is skipped and the filtered fasta is read
        directly as the alignment; False: the filtered fasta is piped through
        kalign first
    :param kalign_silent: True: kalign's stderr is captured through a pipe;
        False: stderr is left untouched
    :return: id_dm: np.ndarray assembled row by row from the DistanceCalculator result
    :return: align_srs: result of fasta_to_srs for the alignment file
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio import AlignIO
    # Keep only the records named in seq_df.index.
    filtered_fpath = "tmp/alias_matches.fasta"
    filter_fasta_infile(seq_df.index,
                        seq_fpath,
                        outfile_path=filtered_fpath,
                        ordered=ordered)
    if aligned:
        align_outpath = filtered_fpath
    else:
        # kalign reads the filtered fasta on stdin and writes to align_outpath.
        run_options = {"text": True}
        if kalign_silent:
            run_options["stderr"] = subprocess.PIPE
        with open(filtered_fpath, 'r') as filtered_f, \
                open(align_outpath, 'wt', encoding='utf-8') as align_f:
            subprocess.run(args=['kalign'],
                           stdin=filtered_f,
                           stdout=align_f,
                           **run_options)
    align_srs = fasta_to_srs(align_outpath)
    with open(align_outpath) as aligned_f:
        aln = AlignIO.read(aligned_f, 'fasta')
    id_dm_obj = DistanceCalculator('identity').get_distance(aln)
    # Stack the matrix rows into an ndarray, first row first.
    id_dm = None
    for row in id_dm_obj:
        id_dm = np.array(row) if id_dm is None else np.vstack((id_dm, row))
    return id_dm, align_srs
def buildTree(FASTAFile):
    """Build a midpoint-rooted neighbor-joining tree and return it in RLR form.

    The FASTA alignment is read, an identity distance matrix is computed, a
    neighbor-joining tree is built, rooted at its midpoint and drawn.  The
    tree is then serialised to plain Newick, turned into nested Python
    literals and converted with ``NewicktoRLR``.

    :param FASTAFile: FASTA alignment to read
    :return: the value returned by ``NewicktoRLR`` for the tree
    """
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Compute a distance matrix and construct the tree.  (An id -> sequence
    # mapping used to be built here but was never read, so it is gone.)
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    constructor = DistanceTreeConstructor()
    # Neighbor joining is what is actually run; the variable was previously
    # (misleadingly) called ``upgmaTree``.
    njTree = constructor.nj(myMatrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)
    # Convert the tree to nested literals by way of Newick text: quote the
    # tip names, write plain Newick into a buffer, strip the generated inner
    # node names and the trailing semicolon, then evaluate the result.
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""
    # NOTE(review): cStringIO exists on Python 2 only - confirm the target
    # interpreter before porting.
    buf = cStringIO.StringIO()
    Phylo.write(njTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)    # nested literals mirroring the Newick text

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    return tree
예제 #14
0
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct a tree with the given method and model and save it.

    The tree is written as Newick to ``args.output + '/tree.nwk'`` and drawn
    to ``args.output + '/tree.svg'``; ``args`` is a module-level name that is
    not defined in this function.

    :param phy_file: PHYLIP alignment to read
    :param method: ``'upgma'`` or ``'nj'``
    :param model: model name passed to ``DistanceCalculator``
    :param phyformat: suffix appended to ``'phylip-'`` to form the format name
    :raises ValueError: if ``method`` is neither ``'upgma'`` nor ``'nj'``
    '''
    # Fail early with a clear message; an unknown method used to surface
    # later as an unbound ``tree`` variable.
    if method not in ('upgma', 'nj'):
        raise ValueError("Unknown method %r: expected 'upgma' or 'nj'" % (method,))

    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)

    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    dm = calculator.get_distance(aln)

    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)

    tree.ladderize()

    # Blank the generated inner-node names so they are not written or drawn;
    # clades without a name are left alone.
    for c in tree.find_clades():
        if c.name and 'Inner' in c.name:
            c.name = ''

    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)

    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
예제 #15
0
def get_tree(aln, kind='nj'):
    """Return the identity distance matrix of ``aln`` and a tree built from it.

    :param aln: alignment passed to ``DistanceCalculator.get_distance``
    :param kind: ``'upgma'`` builds a UPGMA tree; any other value, including
        the default ``'nj'``, builds a neighbor-joining tree (the only tree
        this function produced while ``kind`` was ignored)
    :return: tuple ``(dm, tree)``
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # Honour ``kind``; it was previously accepted but never used.
    if kind == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
예제 #16
0
def consensus(msa):
    """Build a neighbor-joining tree from ``msa`` and print it.

    :param msa: records accepted by the ``MultipleSeqAlignment`` constructor
    """
    alignment = MultipleSeqAlignment(msa)
    calculator = DistanceCalculator('identity')
    # build_tree() computes the distances itself through the calculator; the
    # separately computed matrix was never used and has been removed.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(alignment)
    # Function-call form works on Python 2 and 3; the statement form is a
    # SyntaxError on Python 3.
    print(tree)
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and show its UPGMA and NJ trees.

    :param filename: path without extension; ``.fa`` is aligned and the
        resulting ``.aln`` file is read back
    :param tree_name: text placed in front of the method name in each title
    """
    # Run ClustalW, then load the alignment it produced.
    run_clustalw = ClustalwCommandline("clustalw",
                                       infile="{}.fa".format(filename))
    run_clustalw()
    msa = AlignIO.read("{}.aln".format(filename), format="clustal")

    # blosum62-based distance matrix shared by both trees.
    distances = DistanceCalculator('blosum62').get_distance(msa)

    builder = DistanceTreeConstructor()
    upgma_tree = builder.upgma(distances)
    nj_tree = builder.nj(distances)

    def label_func(clade):
        # No label for clades whose name starts with "Inner".
        if clade.name.startswith("Inner"):
            return ""
        return clade

    # Draw UPGMA first, then NJ, each in its own titled figure.
    for built_tree, title_template in ((upgma_tree, "{} × upgma"),
                                       (nj_tree, "{} × nj")):
        Phylo.draw(built_tree, label_func=label_func, do_show=False)
        plt.title(title_template.format(tree_name))
        plt.show()
예제 #18
0
def upgma_tree_constructor(x):
    """Print the UPGMA tree of alignment ``x``, then draw it as ASCII.

    :param x: alignment passed to ``DistanceCalculator.get_distance``
    """
    tree_builder = DistanceTreeConstructor()
    identity_distances = DistanceCalculator("identity").get_distance(x)
    upgma_result = tree_builder.upgma(identity_distances)
    print(upgma_result)
    Phylo.draw_ascii(upgma_result)
예제 #19
0
def main():
    """Time neighbor joining and UPGMA on ``data/coding.fa`` and draw both trees."""
    # Alternative input used during development: "data/cons_noncode.fa"
    file_name = "data/coding.fa"

    # Collect the FASTA records into one gapped-DNA alignment.
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for seq_record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([seq_record])

    print("Number of characters in alignment:", len(alignment[0]))

    # One identity distance matrix feeds both algorithms.
    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # --- Neighbor joining ---
    nj_started = time.time()
    nj_tree = constructor.nj(dm)
    nj_finished = time.time()
    print("Neighbor joining ran in {} seconds.".format(nj_finished - nj_started))
    Phylo.draw(nj_tree, label_func=get_label)

    # --- UPGMA ---
    upgma_started = time.time()
    upgma_tree = constructor.upgma(dm)
    upgma_finished = time.time()
    print("UPGMA ran in {} seconds.".format(upgma_finished - upgma_started))
    Phylo.draw(upgma_tree, label_func=get_label)
예제 #20
0
def main():
	"""Print pairwise distances between pre-aligned genomes for exercises 1 and 3."""
	calc = DistanceCalculator()

	# Exercise 1: African and Indian genomes, each against the mammoth.
	print("Exercise 1:")
	african = getGenome("genomes/africanAligned.fasta")
	indian = getGenome("genomes/indianAligned.fasta")
	mammoth = getGenome("genomes/mammothAligned.fasta")

	african_vs_mammoth = calc._pairwise(african, mammoth)
	indian_vs_mammoth = calc._pairwise(indian, mammoth)

	print("Distance between African and Mammoth is {}.".format(african_vs_mammoth))
	print("Distance between Indian and Mammoth is {}.".format(indian_vs_mammoth))

	# Exercise 3: whale against cow and against hippo.
	print("\nExercise 3:")
	whale = getGenome("genomes/whaleAligned.fasta")
	cow = getGenome("genomes/cowAligned.fasta")
	hippo = getGenome("genomes/hippoAligned.fasta")

	whale_vs_cow = calc._pairwise(whale, cow)
	whale_vs_hippo = calc._pairwise(whale, hippo)

	print("Distance between Whale and Cow is {}.".format(whale_vs_cow))
	print("Distance between Whale and Hippo is {}.".format(whale_vs_hippo))
예제 #21
0
def main():
    """Draw the ladderized UPGMA tree of the alignment in ``protein.fasta``."""
    # AlignIO opens and closes the file itself when given a path; the
    # previous open() handle was never closed.
    alignment = AlignIO.read("protein.fasta", "fasta")
    calculator = DistanceCalculator('identity')
    # build_tree() computes the distances through the calculator, so the
    # separately computed (and unused) matrix has been removed.
    constructor = DistanceTreeConstructor(calculator, 'upgma')
    tree = constructor.build_tree(alignment)
    tree.ladderize()
    Phylo.draw(tree)
예제 #22
0
파일: dash.py 프로젝트: YashasviMantha/NLP
def get_tree():
    """Return the neighbor-joining tree of the Clustal alignment ``agc.aln``."""
    alignment = AlignIO.read('agc.aln', 'clustal')
    identity_distances = DistanceCalculator('identity').get_distance(alignment)
    # Neighbor joining on the identity distances.
    return DistanceTreeConstructor().nj(identity_distances)
예제 #23
0
def build_tree(aln, kind='nj'):
    """Build a tree with the Bio.Phylo module.

    :param aln: alignment passed to ``DistanceCalculator.get_distance``
    :param kind: ``'upgma'`` builds a UPGMA tree; any other value, including
        the default ``'nj'``, builds a neighbor-joining tree (the only tree
        this function produced while ``kind`` was ignored)
    :return: tuple ``(dm, tree)`` of the identity distance matrix and the tree
    """

    from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # Honour ``kind``; it was previously accepted but never used.
    if kind == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    return dm, tree
예제 #24
0
 def test_nonmatching_seqs(self):
     """Check distances between two sequences that share no residues."""
     fasta_text = ">Alpha\nA-A--\n>Gamma\n-Y-Y-"
     aln = AlignIO.read(StringIO(fasta_text), "fasta")

     # Scoring-matrix model: the two sequences are maximally distant.
     blosum_dm = DistanceCalculator('blosum62').get_distance(aln)
     self.assertEqual(blosum_dm['Alpha', 'Alpha'], 0.)
     self.assertEqual(blosum_dm['Alpha', 'Gamma'], 1.)

     # Default model: 4 of the 5 columns differ.
     default_dm = DistanceCalculator().get_distance(aln)
     self.assertEqual(default_dm['Alpha', 'Alpha'], 0.)
     self.assertAlmostEqual(default_dm['Alpha', 'Gamma'], 4. / 5.)
def createTree(file):
    """Write the UPGMA tree of a PHYLIP alignment to ``new.xml`` as phyloXML.

    :param file: PHYLIP alignment to read
    """
    alignment = AlignIO.read(file, 'phylip')

    # Identity distances between every pair of sequences.
    identity_distances = DistanceCalculator('identity').get_distance(alignment)

    # UPGMA tree, saved in phyloXML format.
    upgma_result = DistanceTreeConstructor().upgma(identity_distances)
    Phylo.write(upgma_result, 'new.xml', 'phyloxml')
예제 #26
0
def D_seq_matrix(fasta_file):
    """Return the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree of the alignment is written in Newick
    format to ``ph_seq.nre`` and the sequence names are printed.

    :param fasta_file: FASTA alignment to read
    :return: the distance matrix produced by ``DistanceCalculator``
    """
    aln = AlignIO.read(fasta_file, 'fasta')
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    tree_seq = constructor.upgma(dm)
    Phylo.write(tree_seq,'ph_seq.nre','newick')
    # Function-call form works on Python 2 and 3; the statement form is a
    # SyntaxError on Python 3.
    print(dm.names)
    return dm
예제 #27
0
    def __init__(self, input_filename, searcher):
        """Load a Clustal alignment and precompute its identity distance matrix.

        :param input_filename: Clustal alignment file to read
        :param searcher: object stored unchanged on ``self.searcher``
        """
        # Despite the attribute name, this holds the parsed alignment.
        self.aligned_file = AlignIO.read(input_filename, format='clustal')
        self.searcher = searcher

        # Pairwise identity distances of the loaded alignment.
        identity_calculator = DistanceCalculator('identity')
        self.distance_matrix = identity_calculator.get_distance(self.aligned_file)

        # Global matplotlib setting: font size 6.
        matplotlib.rc('font', size=6)
예제 #28
0
def cluster_by_cdr3(results_table, output_dir):
    """Cluster cells by the identity distance of their first CDR3 sequence.

    Steps, all files being written under ``output_dir``:

    1. every row of ``results_table`` with a string ``CDR3 first`` value is
       written to ``cdr3.fasta``;
    2. the FASTA file is aligned with ``align_func.clustalw_align``;
    3. the identity distance matrix is pickled to ``dm.pkl`` and its
       condensed form to ``distmat.pkl``;
    4. average-linkage clusters are cut at distance 0.05;
    5. cluster ids are merged back onto the input table and the result is
       saved as ``full_results.csv``.

    :param results_table: CSV file read with ``pd.read_csv``
    :param output_dir: directory that receives all output files
    """
    df1 = pd.read_csv(results_table)
    fasta_file = output_dir + "/cdr3.fasta"
    with open(fasta_file, 'w+') as fas:
        for _, row in df1.iterrows():
            # Rows whose CDR3 value is not a string (e.g. a missing value)
            # are skipped.
            if not isinstance(row["CDR3 first"], str):
                continue
            fas.write(">" + "_".join(row["Patient"].split(" ")) + ":" +
                      row["well_id"] + ":" + row["V first"] + "\n")
            fas.write(row["CDR3 first"] + "\n")

    alignment_file = align_func.clustalw_align(fasta_file, sys.stdout)

    aln = AlignIO.read(alignment_file, 'clustal')

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    with open(output_dir + '/dm.pkl', 'wb') as f:
        pickle.dump(dm, f, protocol=0)

    # Condensed distance vector: one entry per pair (i, j) with i < j, read
    # from dm.matrix[j][i].
    pair_indices = combinations(range(len(dm.names)), 2)
    distmat = np.array([dm.matrix[j][i] for i, j in pair_indices],
                       dtype=float)

    with open(output_dir + '/distmat.pkl', 'wb') as f:
        pickle.dump(distmat, f, protocol=0)

    Z = linkage(distmat, method='average')

    max_d = 0.05
    clusters = fcluster(Z, max_d, criterion='distance')

    # Recover patient and well id from the sequence names by splitting on
    # "_W".
    patient_col = [x.split("_W")[0] for x in dm.names]
    well_col = ['W' + x.split("_W")[1].split("_")[0] for x in dm.names]
    df2 = pd.DataFrame(data={
        "cluster": clusters,
        "Patient": patient_col,
        "well_id": well_col
    })

    table = pd.merge(df2, df1, on=["Patient", "well_id"], how="inner")

    table = table[[
        'cluster', 'Patient', 'Amp Batch', 'well_id', 'cell_name', '#reads',
        '#umi distribution', "V first", "V first counts", "V second",
        "V second counts", "D first", "D first counts", "D second",
        "D second counts", "J first", "J first counts", "J second",
        "J second counts", "CDR3 first", "CDR3 first counts", "CDR3 second",
        "CDR3 second counts"
    ]]

    table.to_csv(output_dir + '/full_results.csv')
예제 #29
0
def MSAOBJ(Align):
    """Build an alignment and its identity distance matrix from a table.

    :param Align: indexable table whose ``.index`` holds the record ids and
        whose first column holds the sequences
        (presumably a pandas DataFrame - TODO confirm)
    :return: tuple ``(align, dm)`` of the ``MultipleSeqAlignment`` and its
        identity distance matrix
    """
    calculator = DistanceCalculator('identity')

    MSAlst = []
    for indx in Align.index:
        # First row carrying this index label; its first value is the
        # sequence.
        ind = [a == indx for a in Align.index]
        seq = Seq(list(Align[ind].iloc[0])[0])
        MSAlst.append(SeqRecord(seq,id=indx))

    # Build the alignment once every record has been collected.  These lines
    # used to sit inside the loop, so the function returned after the first
    # record with a one-sequence alignment.
    align = MultipleSeqAlignment(MSAlst)
    dm = calculator.get_distance(align)
    return(align,dm)
예제 #30
0
def build_tree_NJ(msa, distanceMatrix=None):
    """Return a neighbor-joining tree as a Newick string prefixed with ``[&R] ``.

    :param msa: alignment; only used when ``distanceMatrix`` is not given
    :param distanceMatrix: optional precomputed distance matrix; when omitted
        an identity distance matrix is computed from ``msa``
    :return: ``"[&R] "`` followed by the stripped Newick text of the tree
    """
    # Compare against None explicitly: a truthiness test depends on how the
    # matrix object defines its own length/boolean value.
    if distanceMatrix is None:
        distCalculator = DistanceCalculator("identity")
        distanceMatrix = distCalculator.get_distance(msa)
    # Construct the tree with the distance matrix
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distanceMatrix)
    # The tree is not re-rooted here; only the "[&R]" prefix is added.
    return "[&R] " + tree.format("newick").strip()
예제 #31
0
def phyloxml_from_msa(msa, phyloxml):
    """Write the UPGMA tree of a FASTA alignment as phyloXML.

    :param msa: FASTA alignment to read
    :param phyloxml: destination passed to ``Phylo.write``
    """
    from Bio import AlignIO
    from Bio import Phylo
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    alignment = AlignIO.read(msa, "fasta")
    # NOTE(review): "ident" is the model name as written here; whether the
    # installed Biopython accepts it depends on its version - confirm.
    distances = DistanceCalculator("ident").get_distance(alignment)
    upgma_result = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_result, phyloxml, "phyloxml")
예제 #32
0
def generar_arbol(file, indice):
    """Draw the neighbor-joining tree of a Clustal alignment and save it as PNG.

    :param file: path of the Clustal alignment to read
    :param indice: string inserted into the output file name
        ``./static/assets/arbol_filogenetico<indice>.png``
    """
    with open(file, "r") as aln:
        alineamiento = AlignIO.read(aln, "clustal")

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alineamiento)

    constructor = DistanceTreeConstructor(calculator)
    arbol_nj = constructor.nj(dm)  # Neighbor Joining
    # do_show=False keeps the figure open for savefig below.  By default
    # Phylo.draw displays the figure first, and on interactive backends the
    # figure is gone afterwards, so an empty image would be saved.
    Phylo.draw(arbol_nj, do_show=False)
    path = './static/assets/arbol_filogenetico' + indice + '.png'
    pylab.savefig(path, format='png')
예제 #33
0
    def calculate_distance_matrix(self, type, file):
        """Return the distance matrix of a FASTA alignment.

        :param type: ``'DNA'`` selects the ``blastn`` model; any other value
            selects ``blosum62``
        :param file: FASTA alignment to read
        :return: the matrix produced by ``DistanceCalculator.get_distance``
        """
        matrix_type = 'blastn' if type == 'DNA' else 'blosum62'
        calculator = DistanceCalculator(matrix_type)
        return calculator.get_distance(AlignIO.read(file, "fasta"))
예제 #34
0
def blosumnj(filename):
    """Print a FASTA alignment, its blosum62 distance matrix and its NJ tree.

    :param filename: path of the FASTA alignment to read
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    # AlignIO opens and closes the file itself when given a path; the
    # previous open() handle was never closed.
    aln = AlignIO.read(filename, 'fasta')
    print(aln)

    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(aln)
    print(dm)

    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)
    print(tree)
예제 #35
0
def distance(inFile, model='identity'):
    """
    Return the distance matrix of an alignment file in FASTA format.
    Modules required:
    - AlignIO (from Bio)
    - DistanceCalculator (from Bio.Phylo.TreeConstruction)
    Usage: <inFile> <model (default = 'identity')>
    """
    alignment = AlignIO.read(inFile, 'fasta')
    # The model name selects how pairwise distances are scored.
    return DistanceCalculator(model).get_distance(alignment)
예제 #36
0
 def __get_dm(self):
     """Return the identity distance matrix of ``self.aln`` as a square numpy array.

     The array has shape ``(self.ns, self.ns)``; every cell is read from the
     Biopython matrix with the larger index first.
     """
     from Bio.Phylo.TreeConstruction import DistanceCalculator
     import numpy as np
     dm = DistanceCalculator('identity').get_distance(self.aln)
     size = self.ns
     dm_array = np.zeros(shape=(size, size))
     for row in range(size):
         for cln in range(size):
             # Always index as [larger][smaller].
             first, second = max(row, cln), min(row, cln)
             dm_array[row, cln] = dm[first][second]
     return dm_array
예제 #37
0
def get_dist_matrix (file):
    """Return the pairwise blosum62 distances of a Clustal alignment as a flat list.

    Every pair (i, j) with i < j is visited row by row, so the list holds
    (N*N - N) / 2 values for N sequences.  Each row start and each distance
    is also printed.

    :param file: name of the Clustal alignment inside the ``tmp/`` directory
    :return: list of distances
    """
    # The context manager closes the file; the previous bare open() did not.
    with open('tmp/'+file) as handle:
        aln = AlignIO.read(handle, 'clustal')
    calculator = DistanceCalculator('blosum62')
    dist_matrix = calculator.get_distance(aln)
    da_list = list()
    for i, row in enumerate(dist_matrix):
        print ('New Row!')
        for j, _ in enumerate(row):
            if i<j:          # skips the zero diagonal and mirrored entries
                print (dist_matrix[i,j])
                da_list.append(dist_matrix[i,j])
    return (da_list)
예제 #38
0
    def test_nj(self):
        """Neighbor joining reproduces the reference topologies (full and two-taxon)."""
        full_tree = self.constructor.nj(self.dm)
        self.assertTrue(isinstance(full_tree, BaseTree.Tree))
        expected_full = Phylo.read('./TreeConstruction/nj.tre', 'newick')
        self.assertTrue(Consensus._equal_topology(full_tree, expected_full))

        # Shrink a blosum62 matrix to two entries by repeatedly dropping the
        # last one.
        self.min_dm = DistanceCalculator('blosum62').get_distance(self.aln)
        while len(self.min_dm) > 2:
            del self.min_dm[len(self.min_dm) - 1]

        small_tree = self.constructor.nj(self.min_dm)
        self.assertTrue(isinstance(small_tree, BaseTree.Tree))

        expected_small = Phylo.read('./TreeConstruction/nj_min.tre', 'newick')
        self.assertTrue(Consensus._equal_topology(small_tree, expected_small))
def distances_to_seq(alignment, sequence, distance_model="identity"):
    """Compute the distances from one sequence to every record of an alignment.

    Cheaper than building the complete sequence-by-sequence distance matrix
    when only the distances to a single sequence are needed.

    Beware: relies on the protected ``_pairwise`` member of
    DistanceCalculator.

    :param alignment: A MultipleSeqAlignment object.

    :param sequence: A SeqRecord object. Must be of the same length as the
        records in the alignment.

    :param distance_model: Model name handed to DistanceCalculator, e.g.
        'identity', 'blastn' or 'trans'. See the
        Bio.Phylo.TreeConstruction.DistanceCalculator documentation.

    :returns: A list of distances between the given sequence and all
        sequences in the MSA, in MSA order.
    """
    pairwise = DistanceCalculator(distance_model)._pairwise
    distances = []
    for msa_record in alignment:
        distances.append(pairwise(sequence, msa_record))
    return distances
예제 #40
0
    def test_distance_calculator(self):
        """Alpha-Beta distance matches the expected value for each model."""
        # Close the fixture file deterministically; the previous bare open()
        # left the handle to the garbage collector.
        with open('TreeConstruction/msa.phy') as handle:
            aln = AlignIO.read(handle, 'phylip')

        # (model name, expected Alpha-Beta distance)
        expectations = [
            ('identity', 1 - (10 * 1.0 / 13)),
            ('blastn', 1 - (38 * 1.0 / 65)),
            ('trans', 1 - (49 * 1.0 / 78)),
            ('blosum62', 1 - (53 * 1.0 / 84)),
        ]
        for model, expected in expectations:
            dm = DistanceCalculator(model).get_distance(aln)
            self.assertEqual(dm['Alpha', 'Beta'], expected, model)
예제 #41
0
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import AlignIO
from Bio.Phylo.Consensus import *
from Bio import Phylo

clusters = 508
# Consensus trees for all clusters.
consensus_trees = []

for i in range(100, clusters):
    if i == 354:
        # Cluster 354 is excluded explicitly.
        continue
    msa = AlignIO.read('msa_klaster' + str(i) + '_s.fasta', 'fasta')
    calculator = DistanceCalculator('identity')
    # The constructor computes distances through the calculator for every
    # bootstrap replicate; the separately computed matrix was never used.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    trees_list = list(bootstrap_trees(msa, 50, constructor))

    # Indices of bootstrap trees with at least one clade supported below 50.
    not_included = set()
    for j, target_tree in enumerate(trees_list):
        support_tree = get_support(target_tree, trees_list)
        if any(node.confidence < 50
               for node in support_tree.get_nonterminals()):
            not_included.add(j)

    trees = [candidate for k, candidate in enumerate(trees_list)
             if k not in not_included]

    if trees:
        consensus_trees.append(majority_consensus(trees))
def makeDistanceTree():
    """Compute the identity distances and the NJ tree of the test alignment.

    Nothing is returned; both results are discarded.
    """
    alignment = AlignIO.read('Tests/TreeConstruction/msa.phy', 'phylip')
    identity_calculator = DistanceCalculator('identity')
    # NOTE(review): the matrix is not used afterwards; the call is kept so the
    # function performs exactly the same work as before - confirm whether it
    # is needed.
    identity_calculator.get_distance(alignment)
    nj_builder = DistanceTreeConstructor(identity_calculator, 'nj')
    nj_builder.build_tree(alignment)
예제 #43
0
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO

# The alignment is the elementary structure everything below is built from.
aln = AlignIO.read('./msa.phy', 'phylip')
# print(aln)
# SingleLetterAlphabet() alignment with 5 rows and 13 columns
# AACGTGGCCACAT Alpha
# AAGGTCGCCACAC Beta
# GAGATTTCCGCCT Delta
# GAGATCTCCGCCC Epsilon
# CAGTTCGCCACAA Gamma

# Several things can be done with the alignment, e.g. get a distance matrix
# from it:
dstcalc = DistanceCalculator('identity')
dm = dstcalc.get_distance(aln)
# DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon'], matrix=[[0], [0.23076923076923073, 0], [0.3846153846153846, 0.23076923076923073, 0], [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0], [0.6153846153846154, 0.3846153846153846, 0.46153846153846156, 0.15384615384615385, 0]])
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print("What's the get_distance(aln) from DistanceCalculator('identity') object?")
print(type(dm))
print(dm)
# Alpha   0
# Beta    0.230769230769  0
# Gamma   0.384615384615  0.230769230769  0
# Delta   0.538461538462  0.538461538462  0.538461538462  0
# Epsilon 0.615384615385  0.384615384615  0.461538461538  0.153846153846  0

# Build a tree from it.
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

construc0 = DistanceTreeConstructor(dstcalc, 'nj')
# 	i = 0
# 	while i < len(sequences):
# 		if sequences[i] in temp_dict:
# 			i += 14
# 		else:
# 			temp_dict[sequences[i]] = sequences[i + 1 : i + 13]
# 			new_file.write(str(i) + "\n")
# 			for item in temp_dict[sequences[i]]:
# 				new_file.write(item)
# 			i += 14


# For every FASTA alignment found, compute its identity distance matrix and
# write it to "<first part of the file name>_dm.txt".
fasta_files = file_handlers.find_files(file_paths, "fasta")
for path in fasta_files:
    file_name = file_handlers.get_file_name(path)
    print(file_name)
    name_list = file_name.split(".")
    # derep_out_file = ''.join(name_list[0] + '_uniques.fasta')
    dm_out_file = name_list[0] + "_dm.txt"
    # cmd = ['usearch -derep_fulllength ' + path + ' -fastaout ' + derep_out_file]
    # subprocess.call(cmd, shell=True)

    aln = AlignIO.read(path, "fasta")
    # "identity" is the name of the model (scoring matrix) used to calculate
    # the distance. The identity model is the default one and can be used both
    # for DNA and protein sequences.
    calculator = DistanceCalculator("identity")
    dm = calculator.get_distance(aln)

    # The matrix is computed before the output file is opened, so a failed
    # read or distance calculation does not leave an empty file behind, and
    # the with-block closes the file even if the write raises.
    with open("/Users/andrea/repositories/AMPHORA2/muscle_alignments/" + dm_out_file, "w") as new_file:
        # file.write() needs text; passing the matrix object itself raises
        # TypeError, so write its string form.
        new_file.write(str(dm))
def NNIheuristic(FASTAFile, sampleSize, threshold, outputDir):
    """Run the NNI heuristic on a FASTA alignment, preferring feasible trees.

    A UPGMA tree is built from the alignment, converted to RLR form and scored
    with maxParsimony.  Each iteration samples neighbours from allNNIs(tree)
    and splits them into feasible and infeasible trees with isFeasible:

    * If feasible neighbours exist, the lowest-scoring one replaces the
      current tree when the current tree is infeasible or the neighbour's
      score is strictly lower; otherwise the search stops.
    * If none exist and the current tree is feasible, the search stops.
    * If none exist and the current tree is infeasible, the search moves to an
      infeasible neighbour, giving up after ``threshold`` consecutive
      iterations without a feasible neighbour.

    All progress is written to a log file; nothing is returned.

    Args:
        FASTAFile: path of the FASTA alignment.  The log file name is this
            path with ".align" replaced by ".out", stripped of its directory.
        sampleSize: number of neighbours to sample per iteration (reduced to
            len(allNNIs(tree)) - 1 when fewer are available).
        threshold: number of consecutive iterations without a feasible
            neighbour after which the search is abandoned.
        outputDir: directory in which the log file is created.
    """
    random.seed(0)
    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        outputFile = outputFile[outputFile.rfind("/"):]
    # The with-block closes the log on every exit path, including the early
    # return taken when the threshold is reached.
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        # start time
        # NOTE(review): time.clock() only exists on older Pythons (removed in
        # 3.8); kept because this function also relies on cStringIO.
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\nThreshold: " + str(threshold) + "\n\n")
        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping from the fasta file: record id -> sequence text
        tipMapping = {}
        for record in myAlignment:
            tipMapping[record.id] = str(record.seq)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        output.write("matrix constructed here")
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        output.write("constructed upgma tree")

        # Convert phyloxml tree to newick
        # biopython does not provide a function to do this so it was necessary
        # to write to a buffer in newick to convert then get rid of unneeded info.
        # Leaf names are wrapped in double quotes, the "Inner<N>" labels and the
        # trailing ";" are stripped, and the remaining text is evaluated as a
        # Python literal.
        for clade in upgmaTree.get_terminals():
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        tree = re.sub(r'Inner\d*', '', tree)
        tree = tree.replace(";", "")
        tree = literal_eval(tree)    #newick format
        output.write("created the original tree into newick format")

        # RLR tree required for maxParsimony function
        tree = NewicktoRLR(tree)
        score = maxParsimony(tree, tipMapping)
        graph = nx.Graph()
        makeGraph(graph, tree)
        output.write("made a graph")
        leaves = getLeaves(tree)
        currentFeasible = isFeasible(graph, leaves)

        output.write("tested isFeasible")

        # Perform NNI heuristic
        counter = 0
        loopCounter = 0
        while True:
            output.write("in the while loop")
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nFeasibility: " + str(currentFeasible) + "\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")
            NNIs = allNNIs(tree)
            if len(NNIs)-1 < sampleSize:
                sampleSize = len(NNIs)-1
            toScore = random.sample(NNIs, sampleSize)

            # add feasibility test
            output.write("starting feasibility test")
            feasible = []
            infeasible = []
            # A dedicated loop variable is used so that `tree` keeps referring
            # to the current tree; reusing `tree` here made the "best tree"
            # messages below log the last sampled neighbour instead.
            for candidate in toScore:
                candidateGraph = nx.Graph()
                makeGraph(candidateGraph, candidate)
                if isFeasible(candidateGraph, getLeaves(candidate)): #if this tree is possible
                    feasible.append(candidate)
                else:
                    infeasible.append(candidate) #if this tree is not possible
            output.write("Number of Feasible Neighbor Trees: " + str(len(feasible)) + "\n")
            output.write("Number of Infeasible Neighbor Trees: " + str(len(infeasible)) + "\n")
            if len(feasible) != 0: #if feasible NNIs were found
                scoredList = [(maxParsimony(x, tipMapping), x) for x in feasible]
                sortedList = sorted(scoredList)
                counter = 0
                if not currentFeasible or sortedList[0][0] < score:
                    score = sortedList[0][0]
                    tree = sortedList[0][1]
                    currentFeasible = True
                    output.write("Found a New Feasible Tree!\n\n")
                else:
                    output.write("Best Possible Feasible Tree Found\n" + str(tree) + "\n" + "Score: " + str(score) + "\n\n")
                    break
            else: #if no possible trees were found
                if currentFeasible: #checks if the original tree was feasible
                    output.write("No Feasible Neighbors, Best Possible Feasible Tree\n" + str(tree) + "\n\n")
                    break
                counter += 1
                output.write("Threshold counter: " + str(counter) + "\n\n")
                if counter >= threshold:
                    output.write("Threshold Met: No Feasible Tree Found\n")
                    stopTime = (time.clock() - startTime)
                    output.write("Program Stop: " + str(stopTime) + " seconds\n\n")
                    return
                output.write("Searching Infeasible Space\n")
                scoredList = [(maxParsimony(x, tipMapping), x) for x in infeasible]
                sortedList = sorted(scoredList)
                choseNeighbor = False
                # The current tree is infeasible and has no feasible neighbour:
                # move to the lowest-scoring infeasible neighbour whose score
                # exceeds the current one, or to the highest-scoring neighbour
                # when no score exceeds it, and run again.
                for neighbor in sortedList:
                    if neighbor[0] > score:
                        score = neighbor[0]
                        tree = neighbor[1]
                        choseNeighbor = True
                        break
                if not choseNeighbor:
                    score = sortedList[-1][0]
                    tree = sortedList[-1][1]
                currentFeasible = False
                output.write("Next Best Infeasible Tree\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")

    #outputTree = RLRtoNewick(tree)
    #print "Final score", score
    return
예제 #46
0
파일: demo_tree.py 프로젝트: chapter09/cDNA
## pad sequences so that they all have the same length
#for record in records:
#    if len(record.seq) != maxlen:
#        sequence = str(record.seq).ljust(maxlen, '.')
#        record.seq = Seq.Seq(sequence)
#assert all(len(record.seq) == maxlen for record in records)

## write to temporary file and do alignment
#output_file = '{}_padded.fasta'.format(os.path.splitext(input_file)[0])
#with open(output_file, 'w') as f:
#    SeqIO.write(records, f, 'fasta')
#alignment = AlignIO.read(output_file, "fasta")

#cline = ClustalwCommandline("clustalw2", infile=input_file)
#print(cline)
#print type(cline)

# Align the sequences in input_file with MUSCLE through its command-line
# wrapper; calling the wrapper yields the captured stdout and stderr.
muscle_cline = MuscleCommandline(input=input_file)
stdout, stderr = muscle_cline()
# The captured stdout is parsed as a FASTA alignment from an in-memory buffer.
alignment = AlignIO.read(StringIO(stdout), "fasta")
print(alignment)

#alignment = AlignIO.read('../data/ls_orchid.fasta', 'fasta')
#print alignment
# NOTE(review): the model name passed here is 'ident', not 'identity' --
# confirm that the installed Biopython accepts it.
calculator = DistanceCalculator('ident')
dm = calculator.get_distance(alignment)
# Build a UPGMA tree from the distance matrix and save it as PhyloXML.
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.write(tree, 'phyloxml.xml', 'phyloxml')
예제 #47
0
 def setUp(self):
     """Load the PHYLIP test alignment and build the blosum62 fixtures.

     Sets ``self.aln`` (the alignment), ``self.dm`` (its distance matrix
     under the 'blosum62' model) and ``self.constructor`` (a
     DistanceTreeConstructor using the same calculator).
     """
     # Pass the path instead of an open() handle: the handle created here
     # was never closed, leaking a file descriptor per test.
     self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
     calculator = DistanceCalculator('blosum62')
     self.dm = calculator.get_distance(self.aln)
     self.constructor = DistanceTreeConstructor(calculator)
예제 #48
0
# Bruno Azenha Goncalves
# ICMC - USP
# Python program to build phylogenetic tree

from Bio import AlignIO
from Bio import Phylo
import numpy as np
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# Reads the alignment files
alignApe = AlignIO.read('genomes/human_primates_aligned.fasta', 'fasta')
alignHIV = AlignIO.read('genomes/alignedHIV.fasta', 'fasta')

# Creates the distance matrix
# NOTE(review): the model name is 'ident', not 'identity' -- confirm that the
# installed Biopython accepts it.
calculator = DistanceCalculator('ident')
dm_ape = calculator.get_distance(alignApe)
dm_hiv = calculator.get_distance(alignHIV)


# Jukes-Cantor correction: d = -3/4 * ln(1 - 4/3 * p) for each distance p.
# Float literals are used so the constants are -0.75 and 4/3 on Python 2 as
# well, where the integer expressions -3/4 and 4/3 evaluate to -1 and 1.
# NOTE: the *_corrected names are aliases, not copies -- each row is rewritten
# in place, so dm_ape and dm_hiv hold the corrected values too.
dm_ape_corrected = dm_ape
for d in dm_ape_corrected.matrix:
    d[:] = [-0.75 * np.log(1 - 4.0 / 3.0 * x) for x in d]

dm_hiv_corrected = dm_hiv
for d in dm_hiv_corrected.matrix:
    d[:] = [-0.75 * np.log(1 - 4.0 / 3.0 * x) for x in d]


# Constructs the tree using the upgma algorithm
def noFeasibleTest(FASTAFile, sampleSize, outputDir):
    """Search for a lower-parsimony tree with NNI moves, ignoring feasibility.

    Takes a FASTA file, constructs a UPGMA tree from its data, converts the
    tree to RLR format, and then repeatedly samples neighbours from
    NNI.allNNIs(tree), moving to the lowest-scoring sampled neighbour while
    its NNI.maxParsimony score is strictly lower than the current score.  No
    feasibility check is performed.  Progress and the final tree are written
    to a log file; nothing is returned.

    Args:
        FASTAFile: path of the FASTA alignment.  The log file name is this
            path with ".align" replaced by ".out", stripped of its directory.
        sampleSize: number of neighbours to sample per iteration (reduced to
            len(NNI.allNNIs(tree)) - 1 when fewer are available).
        outputDir: directory in which the log file is created.
    """
    random.seed(0)
    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        outputFile = outputFile[outputFile.rfind("/"):]
    # The with-block closes the log even if an exception interrupts the run.
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        # start time
        # NOTE(review): time.clock() only exists on older Pythons (removed in
        # 3.8); kept because this function also relies on cStringIO.
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\n\n")
        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping from the fasta file: record id -> sequence text
        tipMapping = {}
        for record in myAlignment:
            tipMapping[record.id] = str(record.seq)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        # Convert phyloxml tree to newick
        # biopython does not provide a function to do this so it was necessary
        # to write to a buffer in newick to convert then get rid of unneeded info.
        # Leaf names are wrapped in double quotes, the "Inner<N>" labels and the
        # trailing ";" are stripped, and the remaining text is evaluated as a
        # Python literal.
        for clade in upgmaTree.get_terminals():
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        tree = re.sub(r'Inner\d*', '', tree)
        tree = tree.replace(";", "")
        tree = literal_eval(tree)    #newick format

        # RLR tree required for maxParsimony function
        tree = NNI.NewicktoRLR(tree)
        score = NNI.maxParsimony(tree, tipMapping)

        # Perform NNI heuristic
        loopCounter = 0
        while True:
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n")
            NNIs = NNI.allNNIs(tree)
            if len(NNIs)-1 < sampleSize:
                sampleSize = len(NNIs)-1
            toScore = random.sample(NNIs, sampleSize)

            scoredList = [(NNI.maxParsimony(x, tipMapping), x) for x in toScore]
            sortedlist = sorted(scoredList)
            # An empty sample has no better neighbour either, so it ends the
            # search instead of raising IndexError.
            if sortedlist and sortedlist[0][0] < score:
                score = sortedlist[0][0]
                tree = sortedlist[0][1]
                output.write("Found A More Parsimonious Tree!\n\n")
            else:
                # Log before leaving the loop; this message used to sit after
                # the break and was never written.
                output.write("No Neighbors With Better Scores Found\n\n")
                break
        output.write("Final Tree:\n" + str(tree) + "\nScore: " + str(score) + "\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")
    return