def get_random_tree(filename, tree_string, L, kappa):
    """Simulate sequences along a tree with pyvolve and summarize them.

    Args:
        filename: Not used by this function; kept so existing callers
            keep working.
        tree_string: Newick tree string handed to ``pyvolve.read_tree``.
        L: Passed to ``generate_ancestor`` to build the root sequence
            (presumably the genome length -- confirm against that helper).
        kappa: The ``kappa`` parameter of the pyvolve nucleotide model.

    Returns:
        A dict with keys ``'pi'`` and ``'theta'`` holding the results of
        ``pi_value`` and ``theta_value`` on the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=tree_string)

    # Nucleotide model with equal equilibrium frequencies for all four bases.
    uniform_freqs = [0.25] * 4
    model_params = {'kappa': kappa, 'state_freqs': uniform_freqs}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print(root_sequence)

    partition = pyvolve.Partition(models=model, root_sequence=root_sequence)
    evolver = pyvolve.Evolver(partitions=partition, tree=tree)
    evolver()  # run the simulation with pyvolve's default output settings

    sequences = evolver.get_sequences()
    return dict(pi=pi_value(sequences), theta=theta_value(sequences))
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate sequences along a scaled tree and return their pi and theta.

    The simulated alignment is written by pyvolve to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    the rate and info files are disabled.

    Args:
        L: Passed to ``generate_ancestor`` to build the root sequence
            (presumably the genome length -- confirm against that helper).
        species: Species identifier; its last character is dropped when
            building the output file name (presumably a trailing path
            separator -- confirm against the caller).
        scaled_tree_string: Newick tree string handed to
            ``pyvolve.read_tree``.
        kappa: The ``kappa`` parameter of the pyvolve nucleotide model.
        iteration: Zero-based run index; ``iteration + 1`` appears in the
            output file name.

    Returns:
        A ``(pi, theta)`` tuple computed by ``pi_value`` and ``theta_value``
        on the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(tree)

    # Nucleotide model with equal equilibrium frequencies for all four bases.
    model_params = {'kappa': kappa, 'state_freqs': [0.25] * 4}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print('generated an ancestor')

    partition = pyvolve.Partition(models=model, root_sequence=root_sequence)
    evolver = pyvolve.Evolver(partitions=partition, tree=tree)

    alignment_name = ("simulated_alignment_" + str(species[:-1]) +
                      "_universal_" + str(iteration + 1) + ".fasta")
    evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    sequences = evolver.get_sequences()
    diversity_pi = pi_value(sequences)
    diversity_theta = theta_value(sequences)
    return diversity_pi, diversity_theta
# with open(('ID_Matrix_' + species_name + '.csv'), 'w', newline = '') as f: # writer = csv.writer(f) # writer.writerow([species_name]) # header = [''] # header.extend(strain_names) # writer.writerow(header) # for row in range(shape[0]): # write_row = [strain_names[row]] # write_row.extend(identity_matrix[row]) # writer.writerow(write_row) # writer.writerow([]) print(name) L = genome_length(strains) n = species_size(strains) pi = pi_value(strains) theta = theta_value(strains) else: L = 'N/A' n = 'N/A' # if (os.path.join(full_path, 'concat_universal.fa'): # concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r') # concat_universal_file = list(concat_universal_file) # print(full_path) # concat_core_file = open(os.path.join(full_path, 'concat_core.fa'), 'r') # open('concat_core.fa', 'r') # concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r') full_path = os.path.join(folder_path, 'kappa.txt') if os.path.exists(full_path): kappa_file = open(full_path, 'r')
def apply_model_along_phylogeny(species_path, kappa, tree_string):
    """Build the pairwise Shared/Convergent/Ancestral/Recombinant matrices.

    For every pair of extant strains the function counts the sites at which
    the two genomes agree (shared), the shared sites that also agree with the
    ancestor (ancestral), asks ``expected_c_given_ms`` for the expected number
    of convergent mutations, and attributes the remainder
    (``shared - ancestral - convergent``) to recombination.

    Args:
        species_path: Path handed to ``read_in_strains``; the result is used
            as a mapping of strain name -> genome.
        kappa: Forwarded to ``expected_c_given_ms``.
        tree_string: Newick tree string for the species.

    Returns:
        A dict with keys ``'strain_names'`` (list of extant strain names, in
        matrix order) and ``'Shared'``, ``'Convergent'``, ``'Ancestral'``,
        ``'Recombinant'`` (symmetric ``n x n`` float arrays).
    """
    strains = read_in_strains(species_path)  # strain name -> genome
    # NOTE(review): `anc_path` is not defined inside this function; it has to
    # exist at module scope for this call to work -- confirm.
    internal_nodes = get_internal_nodes(anc_path)
    # Merge the two mappings (dicts do not support `+`); strains come first.
    node_genomes = {**strains, **internal_nodes}
    strain_names = list(strains.keys())  # all extant strain names
    all_node_names = list(node_genomes.keys())

    n = species_size(strains)  # number of extant strains
    L = genome_length(strains)  # number of base pairs in the genome
    theta = theta_value(strains)  # proportion of the genome that is polymorphic
    mu = theta / (2 * n)  # mutation rate in mutations per base pair per generation
    # Minimum number of mutations that could account for all the
    # polymorphisms in the species.
    min_m = get_min_m(strains, L)
    # NOTE(review): the scaled tree is computed but never used below; parents
    # and branch lengths are taken from the unscaled `tree_string` -- confirm
    # that this is intended.
    scaled_tree_string = scale_newick_format_tree(strains, L, min_m,
                                                  tree_string, 0)

    # (i, j) entry = value for the pair (strain i, strain j).
    # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
    SHARED = np.empty([n, n], dtype=float, order='C')  # identical sites
    CONVERGENT = np.empty([n, n], dtype=float, order='C')  # matches from convergent mutation
    ANCESTRAL = np.empty([n, n], dtype=float, order='C')  # matches inherited from the ancestor
    RECOMBINANT = np.empty([n, n], dtype=float, order='C')  # matches attributed to recombination

    updated_tree_info = name_nodes(tree_string, strain_names)
    tree_string = updated_tree_info['tree_string']  # tree with every node labelled
    new_nodes = updated_tree_info['new_nodes']  # node names that were added
    tree_node_names = all_node_names + new_nodes  # every node name in the tree

    parents = find_parents(all_node_names, tree_string)
    # Node name -> distance to its closest ancestor.
    distances = get_branch_lengths(tree_node_names, tree_string)

    for s1 in range(n):  # each strain takes a turn as strain 1
        strain1 = strain_names[s1]
        genome1 = strains[strain1]

        # A strain shares every site with itself, all of them by inheritance.
        SHARED[s1, s1] = L
        CONVERGENT[s1, s1] = 0
        ANCESTRAL[s1, s1] = L
        RECOMBINANT[s1, s1] = 0

        for s2 in range(s1 + 1, n):  # each later strain is strain 2
            strain2 = strain_names[s2]
            genome2 = strains[strain2]

            # The Most Recent Common Ancestor of the two strains.
            MRCA = find_MRCA(strain1, strain2, parents)

            s, a = 0, 0  # shared and ancestral site counts for this pair
            for site in range(L):
                if genome1[site] == genome2[site]:
                    s += 1
                    # NOTE(review): `ancestor` is not defined inside this
                    # function; it has to exist at module scope -- confirm.
                    if genome1[site] == ancestor[site]:
                        a += 1

            # Total branch lengths from each strain back to the MRCA.
            pair_distances = get_distances_to_MRCA(strain1, strain2, MRCA,
                                                   tree_string, strain_names,
                                                   parents, distances)
            distance_1 = pair_distances['distance_1']
            distance_2 = pair_distances['distance_2']

            # Number of mutations on each lineage. The second count must use
            # strain 2's own distance (the original reused distance_1).
            m_1 = int(distance_1 * L + 1)
            m_2 = int(distance_2 * L + 1)
            # Number of generations over which those mutations occurred.
            generations_1 = int(m_1 / mu)
            generations_2 = int(m_2 / mu)

            # Expected number of convergent mutations between the strains.
            c = expected_c_given_ms(L, m_1, m_2, mu, generations_1,
                                    generations_2, kappa, 0.5)
            r = s - a - c

            # The matrices are symmetric, so fill both halves.
            SHARED[s1, s2] = SHARED[s2, s1] = s
            CONVERGENT[s1, s2] = CONVERGENT[s2, s1] = c
            ANCESTRAL[s1, s2] = ANCESTRAL[s2, s1] = a
            RECOMBINANT[s1, s2] = RECOMBINANT[s2, s1] = r

    return {
        'strain_names': strain_names,
        'Shared': SHARED,
        'Convergent': CONVERGENT,
        'Ancestral': ANCESTRAL,
        'Recombinant': RECOMBINANT
    }
# ancestral_tree_string = list(tree_file)[0] # internal_nodes = get_internal_nodes(os.path.join(raxml_path, ancestral_alignment)) # internal_nodes,ancestral_tree_string = rename_ancestors(internal_nodes, strain_names, ancestral_tree_string) # all_nodes = {} # for key in strains.keys(): # all_nodes[key] = strains[key] # for key in internal_nodes.keys(): # all_nodes[key] = internal_nodes[key] # all_node_names = list(all_nodes.keys()) total_pairs = int( (n * (n - 1)) / 2) # the total number of strain pairs that will be compared pi = pi_value(strains) theta = theta_value( strains) # proportion of the genome that is polymorphic # complete_tree_string = merge_trees(rooted_tree_string, ancestral_tree_string) kappa_file = open(kappa_file, 'r') kappa = float(list(kappa_file)[0]) min_m = get_min_m( strains, L ) # minimum number of mutations that could account for all the polymorphisms in the species max_m = get_max_m(strains, L, rooted_tree_string) accurate_tree, trials = scale_branch_lengths(L, rooted_tree_string, min_m, max_m, pi, theta, kappa, 1) with open(output_file + '.csv', 'w', newline='') as f: writer = csv.writer(f)
# Runs the functions to get the genome length, number of strains, pi, theta
# and GC% for each species alignment (*.fa) found under `path`, and writes
# one row per species into species_params2.csv.
# NOTE(review): the original author's note put the time complexity at
# O(n^4), where n is the length of the strains -- not verified here.
path = 'C:/Users/Owner/Documents/UNCG REU/Project/concatenates/other'  # path where the .fa files are located
with open('species_params2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Column headers: one per value written below. The strain-count column
    # previously had no header, which shifted pi/theta/GC% under the wrong
    # titles.
    writer.writerow(['Species', 'Genome Length', 'Number of Strains', 'pi',
                     'theta', 'GC%'])
    for filename in glob.glob(os.path.join(path, '*.fa')):  # one file per species
        species = read_in_strains(filename)
        # Drop the directory prefix (plus its separator) and the '.fa'
        # suffix, leaving only the bare file name.
        name = filename[len(path) + 1:len(filename) - 3]
        print(name)
        size = species_size(species)
        length = genome_length(species)
        pi = str(pi_value(species))
        theta = str(theta_value(species))
        GC_average = nucleotide_composition(species)
        writer.writerow([name, length, size, pi, theta, GC_average])
        print(name)
def _read_first_line(file_path):
    """Return the first line of the text file at *file_path* and close it."""
    with open(file_path, 'r') as handle:
        return list(handle)[0]


def get_SCAR_matrices(
        species_alignment,
        ancestral_alignment,
        kappa_file,
        mu,
        species,
        raxml_root='/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/'):
    """Build the pairwise S/C/A/R matrices and recombination rates.

    For every pair of extant strains the function obtains the shared (``s``)
    and ancestral (``a``) site counts from ``get_s_a`` (compared against the
    genome of the pair's most recent common ancestor), the convergent count
    (``c``) from ``get_c``, and attributes the remainder ``r = s - c - a`` to
    recombination. The rate matrix divides ``r`` by the number of mutations
    on each strain's lineage, ``int(distance * L + 1)``.

    Side effects: prints the node names, the merged tree and the average
    rate, and writes the scaled tree to ``scaled_tree.txt`` in the current
    working directory.

    Args:
        species_alignment: Path of the species alignment, read with
            ``read_in_strains`` when no reduced alignment exists.
        ancestral_alignment: File name, relative to the species' RAxML
            directory, handed to ``get_internal_nodes``.
        kappa_file: Path of a text file whose first line is the kappa value.
        mu: Forwarded to ``get_c``.
        species: Species directory name, concatenated directly onto
            ``raxml_root`` and directly before ``'concat_universal.fa.reduced'``
            (so it presumably ends with a path separator -- confirm).
        raxml_root: Directory prefix holding one RAxML output directory per
            species. Defaults to the location that used to be hard-coded.

    Returns:
        A dict with keys ``'strain_names'``, ``'Shared'``, ``'Convergent'``,
        ``'Ancestral'``, ``'Recombinant'``, ``'Rates'`` (``n x n`` float
        arrays; ``Rates[i, j]`` is normalised by strain i's lineage and is
        therefore not symmetric) and ``'average'``.

    Raises:
        ZeroDivisionError: If there are fewer than two strains, because the
            average is taken over the number of strain pairs.
    """
    raxml_path = raxml_root + species
    reduced_species_alignment = (raxml_root + species +
                                 'concat_universal.fa.reduced')
    rooted_tree_file = 'RAxML_rootedTree.root'
    ancestral_tree_file = 'RAxML_nodeLabelledRootedTree.anc'

    # Use the reduced alignment when one exists next to the RAxML output.
    if os.path.exists(reduced_species_alignment):
        strains = read_in_reduced_strains(reduced_species_alignment)
    else:
        strains = read_in_strains(species_alignment)
    # `strains` maps strain name -> genome.

    L = genome_length(strains)  # number of base pairs in the genome
    n = species_size(strains)  # number of extant strains
    strain_names = list(strains.keys())  # all extant strain names

    # (i, j) entry = value for the pair (strain i, strain j).
    # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
    SHARED = np.empty([n, n], dtype=float, order='C')  # identical sites
    CONVERGENT = np.empty([n, n], dtype=float, order='C')  # matches from convergent mutation
    ANCESTRAL = np.empty([n, n], dtype=float, order='C')  # matches inherited from the ancestor
    RECOMBINANT = np.empty([n, n], dtype=float, order='C')  # matches attributed to recombination
    RATES = np.empty([n, n], dtype=float, order='C')  # recombinant sites per mutation

    rooted_tree_string = _read_first_line(
        os.path.join(raxml_path, rooted_tree_file))
    ancestral_tree_string = _read_first_line(
        os.path.join(raxml_path, ancestral_tree_file))

    internal_nodes = get_internal_nodes(
        os.path.join(raxml_path, ancestral_alignment))
    internal_nodes, ancestral_tree_string = rename_ancestors(
        internal_nodes, strain_names, ancestral_tree_string)

    # Every node's genome: extant strains first, then the internal nodes.
    all_nodes = {**strains, **internal_nodes}
    all_node_names = list(all_nodes.keys())
    print(all_node_names)

    # The total number of strain pairs that will be compared.
    total_pairs = int((n * (n - 1)) / 2)

    complete_tree_string = merge_trees(rooted_tree_string,
                                       ancestral_tree_string)
    print(complete_tree_string)

    kappa = float(_read_first_line(kappa_file))

    scaled_tree_string = scale_newick_format_tree(complete_tree_string)
    # NOTE(review): the parsed tree is not used afterwards; the call is kept
    # because it presumably rejects a malformed tree string -- confirm.
    pyvolve.read_tree(tree=scaled_tree_string)

    with open('scaled_tree.txt', 'w') as scaled_tree_out:
        scaled_tree_out.write(scaled_tree_string)

    parents = find_parents(all_node_names, scaled_tree_string)
    # Node name -> distance to its closest ancestor.
    distances = get_branch_lengths(all_node_names, scaled_tree_string)

    total = 0
    for s1 in range(n):  # each strain takes a turn as strain 1
        strain1 = strain_names[s1]
        genome1 = strains[strain1]

        # A strain shares every site with itself, all of them by inheritance.
        SHARED[s1, s1] = L
        CONVERGENT[s1, s1] = 0
        ANCESTRAL[s1, s1] = L
        RECOMBINANT[s1, s1] = 0
        # np.empty leaves memory uninitialised, so give the diagonal a
        # defined value; it matches the zero recombinant count above.
        RATES[s1, s1] = 0

        for s2 in range(s1 + 1, n):  # each later strain is strain 2
            strain2 = strain_names[s2]
            genome2 = strains[strain2]

            # The Most Recent Common Ancestor of the two strains.
            MRCA = find_MRCA(strain1, strain2, parents)
            MRCA_genome = all_nodes[MRCA]

            s, a = get_s_a(genome1, genome2, MRCA_genome, L)
            c, pair_distances = get_c(strain1, strain2, MRCA, parents,
                                      scaled_tree_string, distances, L, mu,
                                      kappa)
            r = s - c - a

            # Number of mutations on each strain's lineage back to the MRCA.
            m_1 = int(pair_distances['distance_1'] * L + 1)
            m_2 = int(pair_distances['distance_2'] * L + 1)
            rate_1 = r / m_1
            rate_2 = r / m_2

            # S, C, A and R are symmetric, so fill both halves.
            SHARED[s1, s2] = SHARED[s2, s1] = s
            CONVERGENT[s1, s2] = CONVERGENT[s2, s1] = c
            ANCESTRAL[s1, s2] = ANCESTRAL[s2, s1] = a
            RECOMBINANT[s1, s2] = RECOMBINANT[s2, s1] = r
            # Each direction is normalised by its own lineage.
            RATES[s1, s2] = rate_1
            RATES[s2, s1] = rate_2

            total += rate_1
            total += rate_2

    # Both directed rates of every pair are summed, then divided by the
    # number of pairs.
    average_rate = total / total_pairs
    print('The average recombination rate is ' + str(average_rate))

    return {
        'strain_names': strain_names,
        'Shared': SHARED,
        'Convergent': CONVERGENT,
        'Ancestral': ANCESTRAL,
        'Recombinant': RECOMBINANT,
        'Rates': RATES,
        'average': average_rate
    }