def get_random_tree(filename, tree_string, L, kappa):
    """Simulate sequences along a tree with pyvolve and summarise them.

    The Newick string is parsed, a nucleotide model with equal state
    frequencies and the given ``kappa`` is built, and sequences are evolved
    from a root sequence produced by ``generate_ancestor(L)``.  The root
    sequence is printed to stdout before the simulation runs.

    Args:
        filename: Not used by this function; kept for the existing signature.
        tree_string: Newick-format tree handed to ``pyvolve.read_tree``.
        L: Passed to ``generate_ancestor`` (presumably the genome length in
            base pairs -- confirm against that helper).
        kappa: ``kappa`` parameter of the pyvolve nucleotide model.

    Returns:
        dict with keys ``'pi'`` and ``'theta'`` holding the results of
        ``pi_value`` and ``theta_value`` on the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=tree_string)

    # All four nucleotides are equally likely at equilibrium.
    uniform_freqs = [0.25, 0.25, 0.25, 0.25]
    model = pyvolve.Model(
        'nucleotide',
        {'kappa': kappa, 'state_freqs': uniform_freqs},
    )

    root_sequence = generate_ancestor(L)
    print(root_sequence)

    partition = pyvolve.Partition(models=model, root_sequence=root_sequence)
    evolver = pyvolve.Evolver(partitions=partition, tree=tree)
    evolver()  # run the simulation with pyvolve's default arguments

    simulated = evolver.get_sequences()
    return {'pi': pi_value(simulated), 'theta': theta_value(simulated)}
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Evolve sequences along an already-scaled tree and return (pi, theta).

    The simulated alignment is also written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``;
    pyvolve's rate and info files are disabled.  Progress messages and the
    parsed tree are printed to stdout.

    Args:
        L: Passed to ``generate_ancestor`` (presumably the genome length in
            base pairs -- confirm against that helper).
        species: Species label; its last character is dropped when building
            the output file name (presumably a trailing path separator --
            confirm against the caller).
        scaled_tree_string: Newick-format tree handed to ``pyvolve.read_tree``.
        kappa: ``kappa`` parameter of the pyvolve nucleotide model.
        iteration: Zero-based iteration index; ``iteration + 1`` appears in
            the output file name.

    Returns:
        Tuple ``(pi, theta)`` from ``pi_value`` and ``theta_value`` applied to
        the simulated sequences.
    """
    tree = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(tree)

    # Equal equilibrium frequencies for the four nucleotides.
    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    model = pyvolve.Model('nucleotide', model_params)

    root_sequence = generate_ancestor(L)
    print('generated an ancestor')

    partition = pyvolve.Partition(models=model, root_sequence=root_sequence)
    evolver = pyvolve.Evolver(partitions=partition, tree=tree)

    alignment_name = (
        f"simulated_alignment_{species[:-1]}_universal_{iteration + 1}.fasta"
    )
    evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    simulated = evolver.get_sequences()
    pi = pi_value(simulated)
    theta = theta_value(simulated)
    return pi, theta
# shape = identity_matrix.shape # with open(('ID_Matrix_' + species_name + '.csv'), 'w', newline = '') as f: # writer = csv.writer(f) # writer.writerow([species_name]) # header = [''] # header.extend(strain_names) # writer.writerow(header) # for row in range(shape[0]): # write_row = [strain_names[row]] # write_row.extend(identity_matrix[row]) # writer.writerow(write_row) # writer.writerow([]) print(name) L = genome_length(strains) n = species_size(strains) pi = pi_value(strains) theta = theta_value(strains) else: L = 'N/A' n = 'N/A' # if (os.path.join(full_path, 'concat_universal.fa'): # concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r') # concat_universal_file = list(concat_universal_file) # print(full_path) # concat_core_file = open(os.path.join(full_path, 'concat_core.fa'), 'r') # open('concat_core.fa', 'r') # concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r') full_path = os.path.join(folder_path, 'kappa.txt') if os.path.exists(full_path):
from process_genomes import nucleotide_composition

# Runs the helper functions to get the genome length, number of strains, pi,
# theta and GC% for each species alignment found in `path`, and writes one row
# per species to species_params2.csv.
# time complexity: O(n^4), where n is the length of the strains
# (original author's estimate; not verified here)
path = 'C:/Users/Owner/Documents/UNCG REU/Project/concatenates/other'  # path where the .fa files are located

with open('species_params2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Column headers.  There must be one header per value written below: the
    # previous header omitted the strain-count column, which shifted pi, theta
    # and GC% one column to the right of their headings.
    writer.writerow(['Species', 'Genome Length', 'Number of Strains', 'pi', 'theta', 'GC%'])
    for filename in glob.glob(os.path.join(path, '*.fa')):  # finds the values for each species
        species = read_in_strains(filename)
        # Strip the directory prefix (plus its separator) and the '.fa'
        # suffix, leaving only the species name.
        name = filename[len(path) + 1:len(filename) - 3]
        print(name)
        size = species_size(species)
        length = genome_length(species)
        pi = str(pi_value(species))
        theta = str(theta_value(species))
        # Written under the 'GC%' heading; presumably the average GC content
        # across strains -- confirm against process_genomes.
        GC_average = nucleotide_composition(species)
        writer.writerow([name, length, size, pi, theta, GC_average])
        print(name)  # printed a second time once the row has been written
# For every species: read its tree and kappa, load its alignment, compute the
# observed pi/theta, then scale the tree's branch lengths before simulating.
# `path`, `species`, `concat` and `iterations` are defined elsewhere in the file.
for s, species_dir in enumerate(species):
    # Only the first line of the tree file is used.  The file is closed as
    # soon as it has been read instead of staying open for the whole run.
    with open(path + species_dir + 'Universal Tree/RAxML_bestTree.tree', 'r') as tree_file:
        tree_string = tree_file.readline()
    print(tree_string)
    print('got tree string')

    # kappa is stored as a single number on the first line of kappa.txt
    with open(path + species_dir + 'kappa.txt', 'r') as kappa_file:
        kappa = float(kappa_file.readline())
    print('got kappa')

    strains = read_in_strains(path + species_dir + concat)
    L = genome_length(strains)
    print('read in strains')

    real_pi = pi_value(strains)
    real_theta = theta_value(strains)

    min_m = get_min_m(strains, L)
    print('min_m = ' + str(min_m))
    max_m = get_max_m(strains, L, tree_string)
    print('max_m = ' + str(max_m))
    print('got min and max m')

    tree_string = scale_branch_lengths(L, tree_string, min_m, max_m, real_pi, real_theta, kappa, 1)
    print('got the appropriately scaled tree')

    # Lists to be populated with the simulated values; index = iteration - 1
    pis = iterations * [None]
    thetas = iterations * [None]
    for i in range(iterations):
        print(i)
def _read_first_line(file_path):
    """Return the first line of the text file at ``file_path`` and close it."""
    with open(file_path, 'r') as handle:
        return handle.readline()


def get_SCAR_matrices(species_alignment, ancestral_alignment, kappa_file, mu, species):
    """Build the Shared/Convergent/Ancestral/Recombinant matrices for a species.

    For every pair of extant strains the function finds their most recent
    common ancestor (MRCA) in the rooted, node-labelled RAxML tree and fills:

    * ``s`` -- nucleotides shared by the two strains,
    * ``a`` -- shared nucleotides inherited from the MRCA,
    * ``c`` -- shared nucleotides attributed to convergent mutation,
    * ``r = s - c - a`` -- the remainder, attributed to recombination.

    Side effects: prints the node names, the merged tree and the average
    rate, and writes the scaled tree to ``scaled_tree.txt`` in the current
    working directory.

    Args:
        species_alignment: Path of the species alignment; used unless a
            ``concat_universal.fa.reduced`` file exists in the RAxML folder.
        ancestral_alignment: File name, relative to the species' RAxML
            folder, passed to ``get_internal_nodes``.
        kappa_file: Path of a text file whose first line is kappa.
        mu: Forwarded unchanged to ``get_c`` (presumably the mutation rate
            per base pair per generation -- confirm against ``get_c``).
        species: Species folder name appended to the hard-coded RAxML base
            path (presumably ending in a path separator, since the reduced
            alignment name is appended directly -- confirm against caller).

    Returns:
        dict with keys ``'strain_names'`` (list), ``'Shared'``,
        ``'Convergent'``, ``'Ancestral'``, ``'Recombinant'``, ``'Rates'``
        (n x n float arrays indexed like ``strain_names``) and ``'average'``
        (sum of all off-diagonal rates divided by the number of pairs).

    Raises:
        ZeroDivisionError: if there are fewer than two strains, because the
            average is divided by the number of strain pairs.
    """
    raxml_path = '/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/' + species
    reduced_species_alignment = raxml_path + 'concat_universal.fa.reduced'
    rooted_tree_file = 'RAxML_rootedTree.root'
    ancestral_tree_file = 'RAxML_nodeLabelledRootedTree.anc'

    # Prefer the reduced alignment when one exists in the RAxML folder.
    # strains: dictionary with the genomes of all the strains;
    # key = strain name, value = genome
    if os.path.exists(reduced_species_alignment):
        strains = read_in_reduced_strains(reduced_species_alignment)
    else:
        strains = read_in_strains(species_alignment)

    L = genome_length(strains)  # number of base pairs in the genome
    n = species_size(strains)  # number of extant strains
    strain_names = list(strains.keys())  # list of all the extant strain names

    # The builtin float replaces the np.float alias, which was removed from
    # NumPy in 1.24.  Every cell of the first four matrices is written below,
    # so np.empty is safe for them.
    SHARED = np.empty([n, n], dtype=float, order='C')  # (i,j): nucleotides identical in strains i and j
    CONVERGENT = np.empty([n, n], dtype=float, order='C')  # (i,j): matches due to convergent mutation
    ANCESTRAL = np.empty([n, n], dtype=float, order='C')  # (i,j): matches inherited from the ancestor
    RECOMBINANT = np.empty([n, n], dtype=float, order='C')  # (i,j): matches attributed to recombination
    # The diagonal of RATES is never assigned in the loop, so start from zeros
    # rather than returning uninitialised memory there.
    RATES = np.zeros([n, n], dtype=float, order='C')

    # Only the first line of each tree file is used.
    rooted_tree_string = _read_first_line(os.path.join(raxml_path, rooted_tree_file))
    ancestral_tree_string = _read_first_line(os.path.join(raxml_path, ancestral_tree_file))

    internal_nodes = get_internal_nodes(os.path.join(raxml_path, ancestral_alignment))
    internal_nodes, ancestral_tree_string = rename_ancestors(
        internal_nodes, strain_names, ancestral_tree_string)

    # Extant strains first, then the internal nodes; an internal node that
    # shares a name with a strain overrides it, as before.
    all_nodes = dict(strains)
    all_nodes.update(internal_nodes)
    all_node_names = list(all_nodes.keys())
    print(all_node_names)

    total_pairs = n * (n - 1) // 2  # the total number of strain pairs that will be compared

    complete_tree_string = merge_trees(rooted_tree_string, ancestral_tree_string)
    print(complete_tree_string)

    kappa = float(_read_first_line(kappa_file))

    scaled_tree_string = scale_newick_format_tree(complete_tree_string)
    # The parsed tree itself is not used; the call is kept so that a tree
    # pyvolve cannot parse still fails at this point.
    pyvolve.read_tree(tree=scaled_tree_string)

    with open('scaled_tree.txt', 'w') as scaled_tree_out:
        scaled_tree_out.write(scaled_tree_string)

    # parents: per the original notes, key = node name, value = list of its
    # parents in order of increasing distance from the node
    parents = find_parents(all_node_names, scaled_tree_string)
    # distances: per the original notes, key = node name, value = distance to
    # its closest ancestor
    distances = get_branch_lengths(all_node_names, scaled_tree_string)

    total = 0
    for s1 in range(n):  # allows each strain to be strain 1
        strain1 = strain_names[s1]
        genome1 = strains[strain1]

        # A strain shares its whole genome with itself, all of it ancestral.
        SHARED[s1, s1] = L
        CONVERGENT[s1, s1] = 0
        ANCESTRAL[s1, s1] = L
        RECOMBINANT[s1, s1] = 0

        for s2 in range(s1 + 1, n):  # allows each strain after strain 1 to be strain 2
            strain2 = strain_names[s2]
            genome2 = strains[strain2]

            # the Most Recent Common Ancestor between the two strains
            MRCA = find_MRCA(strain1, strain2, parents)
            MRCA_genome = all_nodes[MRCA]

            s, a = get_s_a(genome1, genome2, MRCA_genome, L)
            c, pair_distances = get_c(strain1, strain2, MRCA, parents,
                                      scaled_tree_string, distances, L, mu, kappa)
            r = s - c - a

            # The count matrices are symmetric.
            SHARED[s1, s2] = SHARED[s2, s1] = s
            CONVERGENT[s1, s2] = CONVERGENT[s2, s1] = c
            ANCESTRAL[s1, s2] = ANCESTRAL[s2, s1] = a
            RECOMBINANT[s1, s2] = RECOMBINANT[s2, s1] = r

            # RATES is not symmetric: each direction divides r by
            # int(distance * L + 1) using that strain's own distance
            # (presumably its distance to the MRCA -- confirm against get_c).
            rate_1 = r / int(pair_distances['distance_1'] * L + 1)
            rate_2 = r / int(pair_distances['distance_2'] * L + 1)
            RATES[s1, s2] = rate_1
            RATES[s2, s1] = rate_2
            total += rate_1
            total += rate_2

    average_rate = total / total_pairs
    print('The average recombination rate is ' + str(average_rate))

    return {
        'strain_names': strain_names,
        'Shared': SHARED,
        'Convergent': CONVERGENT,
        'Ancestral': ANCESTRAL,
        'Recombinant': RECOMBINANT,
        'Rates': RATES,
        'average': average_rate,
    }