def test_pairwise_identity(sequences, mode):
    """
    Test correct calculation of `get_pairwise_sequence_identity()` via
    pairwise calls of `get_sequence_identity()`.

    Parameters
    ----------
    sequences
        The sequences to be aligned. They are aligned with the standard
        protein substitution matrix.
    mode
        The identity calculation mode. It is passed unchanged to both
        identity functions.
    """
    seq_count = len(sequences)
    msa, _, _, _ = align.align_multiple(
        sequences, matrix=align.SubstitutionMatrix.std_protein_matrix()
    )

    # Build the reference matrix entry by entry from two-sequence
    # sub-alignments of the multiple sequence alignment
    ref_identity_matrix = np.zeros((seq_count, seq_count))
    for i in range(seq_count):
        for j in range(seq_count):
            ref_identity_matrix[i, j] = align.get_sequence_identity(
                msa[:, [i, j]], mode=mode
            )

    test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode)

    # Identity of two equal sequences should be 1, if only the length of
    # the sequence is counted
    if mode == "shortest":
        assert (np.diag(test_identity_matrix) == 1).all()
    # Identity must be between 0 and 1
    assert ((test_identity_matrix <= 1) & (test_identity_matrix >= 0)).all()
    # Identity matrix is symmetric
    # (`array_equal` also compares shapes, so a result that merely
    # broadcasts against the expected matrix cannot pass)
    assert np.array_equal(test_identity_matrix, test_identity_matrix.T)
    # Pairwise identity must be equal in the two functions
    assert np.array_equal(test_identity_matrix, ref_identity_matrix)
ids.append(ncbi_id) # Download sequences a file-like object and read the sequences from it fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(ids, file_name=None, db_name="protein", ret_type="fasta")) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] # Create multiple sequence alignment with Clustal Omega alignment = clustalo.ClustalOmegaApp.align(sequences) # The distance measure required for the tree calculation is the # percentage of non-identical amino acids in the respective two # sequences distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") # Create tree via neighbor joining tree = phylo.neighbor_joining(distances) # Convert to NetworkX graph #For the graph visualization, the edge directions are unnecessary graph = tree.as_graph().to_undirected() fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.gca() ax.axis("off") # Calculate position of nodes in the plot pos = nx.kamada_kawai_layout(graph) # Assign the gene names to the nodes that represent a reference index node_labels = {i: name for i, name in enumerate(genes)} nx.draw_networkx( graph,