Example #1
0
        print(name)
        # identity_matrix = id_matrix(strains)
        # shape = identity_matrix.shape
        # with open(('ID_Matrix_' + species_name + '.csv'), 'w', newline = '') as f:
        # 	writer = csv.writer(f)
        # 	writer.writerow([species_name])
        # 	header = ['']
        # 	header.extend(strain_names)
        # 	writer.writerow(header)
        # 	for row in range(shape[0]):
        # 		write_row = [strain_names[row]]
        # 		write_row.extend(identity_matrix[row])
        # 		writer.writerow(write_row)
        # 	writer.writerow([])
        print(name)
        L = genome_length(strains)
        n = species_size(strains)
        pi = pi_value(strains)
        theta = theta_value(strains)

    else:
        L = 'N/A'
        n = 'N/A'

    # if os.path.exists(os.path.join(full_path, 'concat_universal.fa')):
    # 	concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r')
    # 	concat_universal_file = list(concat_universal_file)

    # print(full_path)
    # concat_core_file = open(os.path.join(full_path, 'concat_core.fa'), 'r') # open('concat_core.fa', 'r')
    # concat_universal_file = open(os.path.join(full_path, 'concat_universal.fa'), 'r')
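# The commented-out block above sketches how a strain-by-strain identity matrix
# would be written to a labeled CSV. A minimal standalone version of that layout
# is shown below; write_labeled_matrix() and its arguments are illustrative
# stand-ins (identity_matrix is assumed to be a square array whose rows and
# columns follow the order of strain_names), not the project's id_matrix() code.
import csv


def write_labeled_matrix(csv_path, species_name, strain_names, identity_matrix):
    # writes the species name, a header row of strain names, then one labeled row per strain
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([species_name])
        writer.writerow([''] + list(strain_names))
        for i, strain in enumerate(strain_names):
            writer.writerow([strain] + list(identity_matrix[i]))
        writer.writerow([])  # blank row separating species blocks


# example usage with dummy data:
# write_labeled_matrix('ID_Matrix_demo.csv', 'demo_species',
#                      ['strain_A', 'strain_B'], [[1.0, 0.9], [0.9, 1.0]])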
Example #2
0
def apply_model_along_phylogeny(species_path, anc_path, kappa, tree_string):
    # ancestor = ''
    # print('Reading in the strains.\n\n')
    strains = read_in_strains(
        species_path
    )  # dictionary with the genomes of all the strains; key = strain name, value = genome
    internal_nodes = get_internal_nodes(anc_path)
    all_nodes = {**strains, **internal_nodes}  # genomes of all extant strains and internal nodes in one dictionary
    strain_names = list(strains.keys())  # list of all the extant strain names
    all_node_names = list(all_nodes.keys())
    # print(strain_names)
    n = species_size(strains)  # number of extant strains
    total_pairs = int(
        n * (n - 1) / 2)  # the total number of strain pairs that will be compared
    L = genome_length(strains)  # number of base pairs in the genome
    theta = theta_value(
        strains)  # proportion of the genome that is polymorphic
    mu = theta / (2 * n)  # mutation rate in mutations per base pair per generation
    min_m = get_min_m(
        strains, L
    )  # minimum number of mutations that could account for all the polymorphisms in the species
    # print('Scaling the branch lengths of the tree.\n\n')
    # scaled_tree_string = tree_string
    scaled_tree_string = scale_newick_format_tree(
        strains, L, min_m, tree_string, 0)  # the tree_string scaled by min_m

    SHARED = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides shared between two strains; the (i,j) entry is the number of nucleotides that are the same between strain i and strain j
    CONVERGENT = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to convergent mutation between two strains; the (i,j) entry is the number of convergent mutations between strain i and strain j
    ANCESTRAL = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to direct inheritance from the ancestor; the (i,j) entry is the number of nucleotides that were inherited by both strain i and strain j
    RECOMBINANT = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to a recombination event; the (i,j) entry is the number of nucleotides that were recombined between strain i and strain j

    updated_tree_info = name_nodes(tree_string, strain_names)
    tree_string = updated_tree_info[
        'tree_string']  # version of the tree_string where every node is labeled
    new_nodes = updated_tree_info[
        'new_nodes']  # the new node names that were added
    all_node_names = all_node_names + new_nodes  # list of all the node names in the phylogenetic tree
    # print(all_nodes)

    # parents = find_parents(strain_names, tree_string) # a dictionary of the sequence of parents of each strain; key = strain name, value = list of the parents in order of increasing distance from the strain
    parents = find_parents(all_node_names, tree_string)
    distances = get_branch_lengths(
        all_node_names, tree_string
    )  # a dictionary of the distances of each strain to its closest ancestor; key = strain name, value = distance to its closest ancestor
    # print(parents)
    # print(distances)

    # parents = parents_and_distances['parents']
    # distances = parents_and_distances['distances']

    count = 1  # a counter for the current strain pair number that is being processed
    for s1 in range(n):  # allows each strain to be strain 1
        strain1 = strain_names[s1]
        genome1 = strains[strain1]
        SHARED[s1, s1] = L
        CONVERGENT[s1, s1] = 0  # there can be no convergent mutations between a strain and itself
        ANCESTRAL[s1, s1] = L
        RECOMBINANT[s1, s1] = 0
        for s2 in range(s1 + 1,
                        n):  # allows each strain after strain 1 to be strain 2
            strain2 = strain_names[s2]
            genome2 = strains[strain2]

            MRCA = find_MRCA(
                strain1, strain2, parents
            )  # the Most Recent Common Ancestor of the two strains
            ancestor = all_nodes[MRCA]  # genome of the MRCA, used to decide which shared sites were inherited

            s, a = 0, 0  # initializes the shared and ancestral counts for the pair of strains to 0
            for site in range(L):  # goes through every site along the genome
                if genome1[site] == genome2[site]:  # counts up the number of shared sites
                    s += 1
                    if genome1[site] == ancestor[site]:  # counts the shared sites that were inherited from the ancestor
                        a += 1

            # s1_tree_location = scaled_tree_string.find(strain_names[s1])

            # s2_tree_location = scaled_tree_string.find(strain_names[s2])

            # start_length_1 = scaled_tree_string.find(':', s1_tree_location) + 1
            # x1 = scaled_tree_string.find(',', start_length_1)
            # y1 = scaled_tree_string.find(')', start_length_1)
            # if x1 == -1:
            # 	end_length_1 = y1
            # elif y1 == -1:
            # 	end_length_1 = x1
            # else:
            # 	end_length_1 = min(x1,y1)

            # start_length_2 = scaled_tree_string.find(':', s2_tree_location) + 1
            # x2 = scaled_tree_string.find(',', start_length_2)
            # y2 = scaled_tree_string.find(')', start_length_2)
            # if x2 == -1:
            # 	end_length_2 = y2
            # elif y2 == -1:
            # 	end_length_2 = x2
            # else:
            # 	end_length_2 = min(x2,y2)

            # length_1 = float(scaled_tree_string[start_length_1:end_length_1])
            # length_2 = float(scaled_tree_string[start_length_2:end_length_2])
            # MRCA = find_MRCA(strain1, strain2, parents) # the Most Recent Common Ancestor between the two strains
            pair_distances = get_distances_to_MRCA(
                strain1, strain2, MRCA, tree_string, strain_names, parents,
                distances
            )  # gets the total lengths of the branches back to the MRCA of strain 1 and strain 2
            distance_1 = pair_distances[
                'distance_1']  # the distance from strain 1 to the MRCA
            distance_2 = pair_distances[
                'distance_2']  # the distance from strain 2 to the MRCA
            m_1 = int(distance_1 * L + 1)  # the number of mutations that occurred on the branch to strain 1
            m_2 = int(distance_2 * L + 1)  # the number of mutations that occurred on the branch to strain 2
            generations_1 = int(
                m_1 / mu
            )  # the number of generations over which these mutations occurred on strain 1
            generations_2 = int(
                m_2 / mu
            )  # the number of generations over which these mutations occurred on strain 2

            c = expected_c_given_ms(
                L, m_1, m_2, mu, generations_1, generations_2, kappa, 0.5
            )  # the expected number of convergent mutations between strain 1 and strain 2

            # fills in the appropriate values to the S,C,A,R matrices for the current strain pair
            SHARED[s1, s2] = s
            SHARED[s2, s1] = s
            CONVERGENT[s1, s2] = c
            CONVERGENT[s2, s1] = c
            ANCESTRAL[s1, s2] = a
            ANCESTRAL[s2, s1] = a
            RECOMBINANT[s1, s2] = s - a - c
            RECOMBINANT[s2, s1] = s - a - c

            count += 1
            # print('\n\nCompleted strain pairing ' + str(count) + ' out of ' + str(total_pairs) + '\n\n')

    # return {'strain_names': strain_names, 'Convergent': CONVERGENT}
    return {
        'strain_names': strain_names,
        'Shared': SHARED,
        'Convergent': CONVERGENT,
        'Ancestral': ANCESTRAL,
        'Recombinant': RECOMBINANT
    }
    kappa_file = '/mnt/c/Users/Owner/Documents/UNCG/Project/BIGG_DATA/Useful_Data/Concatenates,Trees,Homoplasies/' + sp + '/kappa.txt'
    output_file = 'scaling_trials_' + sp + '_universal'
    reduced_species_alignment = '/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/' + sp + 'concat_universal.fa.reduced'
    raxml_path = '/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/' + sp
    tree_file = 'RAxML_bestTree.tree'
    rooted_tree_file = 'RAxML_rootedTree.root'
    ancestral_tree_file = 'RAxML_nodeLabelledRootedTree.anc'
    reduced = os.path.exists(reduced_species_alignment)
    if not reduced:
        strains = read_in_strains(species_alignment)
    else:
        strains = read_in_reduced_strains(
            reduced_species_alignment
        )  # dictionary with the genomes of all the strains; key = strain name, value = genome

    L = genome_length(strains)  # number of base pairs in the genome
    n = species_size(strains)  # number of extant strains
    strain_names = list(strains.keys())  # list of all the extant strain names

    tree_file = open((os.path.join(raxml_path, rooted_tree_file)), 'r')
    rooted_tree_string = list(tree_file)[0]
    # tree_file = open((os.path.join(raxml_path, ancestral_tree_file)), 'r')
    # ancestral_tree_string = list(tree_file)[0]
    # internal_nodes = get_internal_nodes(os.path.join(raxml_path, ancestral_alignment))
    # internal_nodes,ancestral_tree_string = rename_ancestors(internal_nodes, strain_names, ancestral_tree_string)
    # all_nodes = {}
    # for key in strains.keys():
    # all_nodes[key] = strains[key]
    # for key in internal_nodes.keys():
    # all_nodes[key] = internal_nodes[key]
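# A hedged usage sketch for apply_model_along_phylogeny() as defined above. The
# paths, kappa value, and Newick string are placeholders, and the CSV layout just
# mirrors the labeled-matrix format used elsewhere in this file; only the keys of
# the returned dictionary ('strain_names', 'Shared', 'Convergent', 'Ancestral',
# 'Recombinant') come from the function itself.
if __name__ == '__main__':
    import csv

    species_path = 'concat_universal.fa'            # placeholder alignment of extant strains
    anc_path = 'marginal_ancestral_states.anc'      # placeholder ancestral-state alignment
    kappa = 2.0                                     # placeholder transition/transversion ratio
    tree_string = '((A:0.1,B:0.1):0.05,C:0.15);'    # placeholder Newick tree

    results = apply_model_along_phylogeny(species_path, anc_path, kappa, tree_string)
    names = results['strain_names']
    for label in ('Shared', 'Convergent', 'Ancestral', 'Recombinant'):
        matrix = results[label]
        with open(label + '_matrix.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([''] + names)
            for i, name in enumerate(names):
                writer.writerow([name] + list(matrix[i]))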
import csv
import glob
import os

# the helpers below are assumed to come from process_genomes, alongside the two
# functions the original fragment imported
from process_genomes import read_in_strains
from process_genomes import species_size
from process_genomes import genome_length
from process_genomes import pi_value
from process_genomes import theta_value
from process_genomes import nucleotide_composition

# runs the functions to get genome length, number of strains, pi, theta, and average GC% for each species and writes them into a .csv file
# time complexity: O(n^4), where n is the length of the strains
path = 'C:/Users/Owner/Documents/UNCG REU/Project/concatenates/other' # path where the .fa files are located 
with open(('species_params2.csv'), 'w', newline = '') as f: 
	writer = csv.writer(f)
	writer.writerow(['Species', 'Genome Length', 'Number of Strains', 'pi', 'theta', 'GC%']) # column headers, matching the order written below
	for filename in glob.glob(os.path.join(path, '*.fa')): # finds the values for each species
		species = read_in_strains(filename)
		name = filename[len(path)+1:len(filename)-3] # strips the directory and the '.fa' extension, leaving the species name
		# name = filename.strip('C:/Users/Owner/Documents/UNCG REU/Project/Recombination-Rates/concatenates').strip('/concat_') # strips off everything but the actual species name
		print(name)
		size = species_size(species)
		length = genome_length(species)
		pi = str(pi_value(species))
		theta = str(theta_value(species))
		# GC_comp = list(nucleotide_composition(species))
		GC_average = nucleotide_composition(species)
		# GC_average = GC_comp[0]
		# GC_stdev = GC_comp[1]


		writer.writerow([name, length, size, pi, theta, GC_average])

		# print(filename)
		# print('pi = ' + str(pi))
		# print('theta = ' + str(theta))
		# print('GC% = ' + str(GC_comp))
		# print('\n')
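# For reference, a minimal sketch of the two summary statistics recorded above.
# It assumes strains/species is a dict mapping strain name -> aligned genome
# string (the shape read_in_strains() produces); these are illustrative
# textbook-style definitions, not the project's pi_value()/theta_value() code.
from itertools import combinations


def sketch_theta(strains):
    # theta as used in these scripts: the proportion of sites that are polymorphic
    genomes = list(strains.values())
    length = len(genomes[0])
    polymorphic = sum(1 for site in range(length)
                      if len({g[site] for g in genomes}) > 1)
    return polymorphic / length


def sketch_pi(strains):
    # pi: the average proportion of sites that differ between two strains,
    # taken over all strain pairs
    genomes = list(strains.values())
    length = len(genomes[0])
    pairs = list(combinations(genomes, 2))
    diffs = sum(sum(a != b for a, b in zip(g1, g2)) for g1, g2 in pairs)
    return diffs / (length * len(pairs))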
Example #5
0
def get_SCAR_matrices(species_alignment, ancestral_alignment, kappa_file, mu,
                      species):
    reduced_species_alignment = '/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/' + species + 'concat_universal.fa.reduced'
    raxml_path = '/mnt/c/Users/Owner/Documents/UNCG/Project/standard-RAxML/done_species/' + species
    tree_file = 'RAxML_bestTree.tree'
    rooted_tree_file = 'RAxML_rootedTree.root'
    # ancestral_alignment = 'RAxML_marginalAncestralStates.anc'
    ancestral_tree_file = 'RAxML_nodeLabelledRootedTree.anc'

    # get_tree_string(species_alignment, raxml_path)
    reduced = os.path.exists(reduced_species_alignment)
    # get_tree_root(tree_file, raxml_path)
    # get_ancestors(rooted_tree_file, species_alignment, raxml_path, reduced)

    if not reduced:
        strains = read_in_strains(species_alignment)
    else:
        strains = read_in_reduced_strains(
            reduced_species_alignment
        )  # dictionary with the genomes of all the strains; key = strain name, value = genome
    # for strain in strains.keys():
    # 	print(strain)
    # 	print(strains[strain][:10])

    L = genome_length(strains)  # number of base pairs in the genome
    n = species_size(strains)  # number of extant strains
    strain_names = list(strains.keys())  # list of all the extant strain names

    SHARED = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides shared between two strains; the (i,j) entry is the number of nucleotides that are the same between strain i and strain j
    CONVERGENT = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to convergent mutation between two strains; the (i,j) entry is the number of convergent mutations between strain i and strain j
    ANCESTRAL = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to direct inheritance from the ancestor; the (i,j) entry is the number of nucleotides that were inherited by both strain i and strain j
    RECOMBINANT = np.empty(
        [n, n], dtype=float, order='C'
    )  # a matrix of the number of nucleotides that match due to a recombination event; the (i,j) entry is the number of nucleotides that were recombined between strain i and strain j
    RATES = np.empty([n, n], dtype=float, order='C')  # a matrix of the per-pair recombination rates filled in below

    tree_file = open((os.path.join(raxml_path, rooted_tree_file)), 'r')
    rooted_tree_string = list(tree_file)[0]
    tree_file = open((os.path.join(raxml_path, ancestral_tree_file)), 'r')
    ancestral_tree_string = list(tree_file)[0]
    # strains = read_in_strains(species_alignment) # dictionary with the genomes of all the strains; key = strain name, value = genome
    internal_nodes = get_internal_nodes(
        os.path.join(raxml_path, ancestral_alignment))
    # print(internal_nodes.keys())
    # all_nodes = internal_nodes

    internal_nodes, ancestral_tree_string = rename_ancestors(
        internal_nodes, strain_names, ancestral_tree_string)

    all_nodes = {}
    for key in strains.keys():
        all_nodes[key] = strains[key]
    for key in internal_nodes.keys():
        all_nodes[key] = internal_nodes[key]

    # strain_names = list(strains.keys()) # list of all the extant strain names
    all_node_names = list(all_nodes.keys())
    # print(strain_names)
    print(all_node_names)

    # n = species_size(strains) # number of extant strains
    total_pairs = int(
        (n * (n - 1)) /
        2)  # the total number of strain pairs that will be compared
    # L = genome_length(strains) # number of base pairs in the genome
    pi = pi_value(strains)
    theta = theta_value(
        strains)  # proportion of the genome that is polymorphic
    # print(theta)
    # print(n)
    # mu = (theta)/(2*n) # mutation rate in mutations per base pair per generation
    # print(mu)

    # tree_file = open((os.path.join(raxml_path, rooted_tree_file)), 'r')
    # rooted_tree_string = list(tree_file)[0]
    # tree_file = open((os.path.join(raxml_path, ancestral_tree_file)), 'r')
    # ancestral_tree_string = list(tree_file)[0]

    # print(rooted_tree_string)
    # print(ancestral_tree_string)
    complete_tree_string = merge_trees(rooted_tree_string,
                                       ancestral_tree_string)
    print(complete_tree_string)

    kappa_file = open(kappa_file, 'r')
    kappa = float(list(kappa_file)[0])
    # min_m = get_min_m(strains, L) # minimum number of mutations that could account for all the polymorphisms in the species
    # max_m = get_max_m(strains, L, complete_tree_string)
    #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ###############################################################################
    ##### CHANGE THIS!!!!!!!!!!!!!!!!!!!!! ########################################
    ###############################################################################
    # scaled_tree_string = complete_tree_string
    scaled_tree_string = scale_newick_format_tree(complete_tree_string)

    # scaled_tree_string = scale_branch_lengths(L, complete_tree_string, min_m, max_m, pi, theta, kappa, 1) # scale_newick_format_tree(strains, L, min_m, tree_string, 0) # the tree_string scaled by min_m
    # L, tree_string, min_m, max_m, real_pi, real_theta, kappa
    phylogeny = pyvolve.read_tree(tree=scaled_tree_string)
    # pyvolve.print_tree(phylogeny)

    g = open('scaled_tree.txt', 'w')
    g.write(scaled_tree_string)
    g.close()

    # updated_tree_info = name_nodes(tree_string, strain_names)
    # scaled_tree_string = updated_tree_info['tree_string'] # version of the tree_string where every node is labeled
    # new_nodes = updated_tree_info['new_nodes'] # the new node names that were added
    # all_nodes = all_node_names + new_nodes # list of all the node names in the pyhlogenetic tree
    # print(all_nodes)

    # parents = find_parents(strain_names, tree_string) # a dictionary of the sequence of parents of each strain; key = strain name, value = list of the parents in order of increasing distance from the strain
    parents = find_parents(all_node_names, scaled_tree_string)
    # print('found parents')
    # print(parents)
    distances = get_branch_lengths(
        all_node_names, scaled_tree_string
    )  # a dictionary of the distances of each strain to its closest ancestor; key = strain name, value = distance to its closest ancestor
    # print('found distances')
    # print(distances)

    count = 1  # a counter for the current strain pair number that is being processed
    total = 0
    for s1 in range(n):  # allows each strain to be strain 1
        strain1 = strain_names[s1]
        genome1 = strains[strain1]
        SHARED[s1, s1] = L
        CONVERGENT[s1, s1] = 0  # there can be no convergent mutations between a strain and itself
        ANCESTRAL[s1, s1] = L
        RECOMBINANT[s1, s1] = 0
        for s2 in range(s1 + 1,
                        n):  # allows each strain after strain 1 to be strain 2
            strain2 = strain_names[s2]
            genome2 = strains[strain2]

            MRCA = find_MRCA(
                strain1, strain2, parents
            )  # the Most Recent Common Ancestor between the two strains
            MRCA_genome = all_nodes[MRCA]

            s, a = get_s_a(genome1, genome2, MRCA_genome, L)
            # print('got s and a')

            c, pair_distances = get_c(strain1, strain2, MRCA, parents,
                                      scaled_tree_string, distances, L, mu,
                                      kappa)
            # print('got c')

            r = s - c - a
            # print(pair_distances['distance_1'])
            # print(pair_distances['distance_2'])
            # print(L)

            # fills in the appropriate values to the S,C,A,R matrices for the current strain pair
            SHARED[s1, s2] = s
            SHARED[s2, s1] = s
            CONVERGENT[s1, s2] = c
            CONVERGENT[s2, s1] = c
            ANCESTRAL[s1, s2] = a
            ANCESTRAL[s2, s1] = a
            RECOMBINANT[s1, s2] = r
            RECOMBINANT[s2, s1] = r
            RATES[s1, s2] = r / int(pair_distances['distance_1'] * L + 1)
            RATES[s2, s1] = r / int(pair_distances['distance_2'] * L + 1)
            total += r / int(pair_distances['distance_1'] * L + 1)
            total += r / int(pair_distances['distance_2'] * L + 1)

            # print('\n\nCompleted strain pairing ' + str(count) + ' out of ' + str(total_pairs) + '\n\n')
            count += 1

    average_rate = total / total_pairs
    print('The average recombination rate is ' + str(average_rate))

    return {
        'strain_names': strain_names,
        'Shared': SHARED,
        'Convergent': CONVERGENT,
        'Ancestral': ANCESTRAL,
        'Recombinant': RECOMBINANT,
        'Rates': RATES,
        'average': average_rate
    }
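# get_s_a() is called above but not defined in this excerpt. A minimal sketch of
# the counting it is expected to do, based on the per-site loop in
# apply_model_along_phylogeny() earlier in this file: s counts sites where the two
# genomes agree, and a counts the subset of those that also match the MRCA genome.
# This is an illustrative stand-in, not the project's implementation.
def sketch_get_s_a(genome1, genome2, MRCA_genome, L):
    s, a = 0, 0
    for site in range(L):
        if genome1[site] == genome2[site]:           # shared site between the two strains
            s += 1
            if genome1[site] == MRCA_genome[site]:   # shared site inherited from the MRCA
                a += 1
    return s, a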
        # name = filename.strip(path).strip('concat_') # strips off everything but the actual species name
        # print(name)
        # writer.writerow([name])
        # id_matrix = np.matrix([[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]])
        # identity_matrix = id_matrix(species)
        # shape = identity_matrix.shape
        header = ['']
        header.extend(strains)  # column labels: one per strain
        writer.writerow(header)
        strain_names = list(strains)  # row labels in the same order as the columns
        for row in range(shape[0]):
            write_row = [strain_names[row]]
            write_row.extend(identity_matrix[row])
            writer.writerow(write_row)
        writer.writerow([])
    print(name)
    print(genome_length(species))
    print(species_size(species))

    # writer.writerow(species.keys())
    # for x in range(number):
    # 	ids = (number+1)*[None]
    # 	ids[0] = species.keys()[x]
    # 	for y in range(number):
    # 		ids[y+1] = id_matrix[x,y]
    # 	writer.writerow(ids)

# writer = csv.writer(f)
# matrices = id_matrix_sim(n, l, m, g, k, p)
# id_matrix = matrices['id_matrix']
# c_matrix = matrices['c_matrix']
# shape = id_matrix.shape # (rows, columns)
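# find_MRCA() is used by both pairwise loops above but is not shown in these
# excerpts. A minimal sketch, assuming the parents-dictionary convention described
# in the comments earlier (key = node name, value = list of its ancestors ordered
# from nearest to farthest); an illustrative stand-in, not the project's code.
def sketch_find_MRCA(strain1, strain2, parents):
    ancestors_2 = set(parents[strain2])
    for node in parents[strain1]:    # walk up from strain 1, nearest ancestor first
        if node in ancestors_2:      # the first ancestor shared with strain 2 is the MRCA
            return node
    return None                      # no common ancestor (should not occur in a rooted tree)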