def test_mean(Ps, Tint, lint, nodes_in_order):
    # Take a list of vectors, push up, take the mean, then push down.
    Ps_pushed = []
    for P in Ps:
        P_pushed = L2U.push_up(P, Tint, lint, nodes_in_order)
        Ps_pushed.append(P_pushed)
    mean = L2U.mean_of_vectors(Ps_pushed)
    mean_inverse = L2U.inverse_push_up(mean, Tint, lint, nodes_in_order)
    return np.any(mean_inverse < 0)
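# A minimal usage sketch for test_mean (illustrative only, not part of the
# suite). It reuses the toy tree from the other tests; the two sample vectors
# here are made up for the example. A False result means the averaged,
# pushed-down distribution has no negative entries.
def example_test_mean():
    (Tint_toy, lint_toy, nodes_toy) = L2U.parse_tree('((B:0.1,C:0.2)A:0.3);')
    # Vectors are ordered as nodes_toy = ['C', 'B', 'A', 'temp0'].
    P_a = np.array([0.5, 0.5, 0, 0])  # mass split between C and B
    P_b = np.array([0, 0.5, 0.5, 0])  # mass split between B and A
    return test_mean([P_a, P_b], Tint_toy, lint_toy, nodes_toy)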
def test_W2():
    tree_str = '((B:0.4,C:0.6)A:1);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {'sample1': 0, 'sample2': 0.5, 'sample3': 0.17},
        'B': {'sample1': 0, 'sample2': 0.33, 'sample3': 0.5},
        'A': {'sample1': 1, 'sample2': 0.17, 'sample3': 0.33},
        'temp0': {'sample1': 0, 'sample2': 0, 'sample3': 0}
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    W2 = L2U.build_W2(T1, l1, nodes1)
    push_up_1 = L2U.push_up(nodes_weighted['sample1'], T1, l1, nodes1)
    push_up_2 = L2U.push_up(nodes_weighted['sample2'], T1, l1, nodes1)
    push_up_3 = L2U.push_up(nodes_weighted['sample3'], T1, l1, nodes1)
    w2_sample1 = W2.dot(np.array(nodes_weighted['sample1']))
    w2_sample2 = W2.dot(np.array(nodes_weighted['sample2']))
    w2_sample3 = W2.dot(np.array(nodes_weighted['sample3']))
    # Compare absolute differences; without np.abs a large negative error
    # would pass the assertion trivially.
    assert all(np.abs(w2_sample1 - push_up_1) < 10**-12)
    assert all(np.abs(w2_sample2 - push_up_2) < 10**-12)
    assert all(np.abs(w2_sample3 - push_up_3) < 10**-12)
    inv_W2 = L2U.inverse_W2(csc_matrix(W2))
    inv_push_up_1 = inv_W2.dot(push_up_1)
    inv_push_up_2 = inv_W2.dot(push_up_2)
    inv_push_up_3 = inv_W2.dot(push_up_3)
    assert all(np.abs(inv_push_up_1 - nodes_weighted['sample1']) < 10**-12)
    assert all(np.abs(inv_push_up_2 - nodes_weighted['sample2']) < 10**-12)
    assert all(np.abs(inv_push_up_3 - nodes_weighted['sample3']) < 10**-12)

    # Test with real data.
    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    W2 = L2U.build_W2(Tint, lint, nodes_in_order)
    push_up_1 = L2U.push_up(P, Tint, lint, nodes_in_order)
    push_up_2 = L2U.push_up(Q, Tint, lint, nodes_in_order)
    assert all(np.abs(W2.dot(np.array(P)) - push_up_1) < 10**-12)
    assert all(np.abs(W2.dot(np.array(Q)) - push_up_2) < 10**-12)
def Total_Pairwise(biom_file, tree_file, output_file=None, debug=0,
                   max_cores=int(mp.cpu_count() / 4)):
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples
    if max_cores > mp.cpu_count() or max_cores <= 1:
        cores = mp.cpu_count() - 1
    else:
        cores = max_cores
    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes_in_order)
    PCoA_Samples = BW.extract_samples(biom_file)

    if debug == 1:
        print(f"Running Debugging Multiprocess on {cores} Cores...")
        # Testing subset of samples...
        PCoA_Samples = PCoA_Samples[:64]
        local_vars = list(locals().items())
        for var, obj in local_vars:
            print(f"{var.ljust(17)}: {sys.getsizeof(obj)}")

    # Multi Core Method
    row = [(i, j) for j in range(len(PCoA_Samples)) for i in range(len(PCoA_Samples))]
    with mp.Pool(processes=cores) as pool:
        result = pool.map(unifrac_work_wrapper, row)
    result_matrix = []
    for i in range(len(PCoA_Samples)):
        dist_list = []
        for j in range(len(PCoA_Samples)):
            dist_list.append(result[i * len(PCoA_Samples) + j][0])
            if debug == 1:
                print(result[i * len(PCoA_Samples) + j][1])
        result_matrix.append(dist_list)
        if output_file is not None:
            CSV.write(output_file, dist_list)
    return result_matrix
def test_parse_tree():
    tree_str = '((B:0.1,C:0.2)A:0.3);'
    (Tint1, lint1, nodes_in_order1) = L2U.parse_tree(tree_str)
    assert Tint1 == {0: 2, 1: 2, 2: 3}
    assert lint1 == {(1, 2): 0.1, (2, 3): 0.3, (0, 2): 0.2}
    assert nodes_in_order1 == ['C', 'B', 'A', 'temp0']  # temp0 is the root node
def test_inverse():
    # Simple tests
    P1 = np.array([0.1, 0.2, 0, 0.3, 0, 0.3, 0.1])
    T1 = {0: 4, 1: 4, 2: 5, 3: 5, 4: 6, 5: 6}
    l1 = {(0, 4): 0.1, (1, 4): 0.1, (2, 5): 0.2, (3, 5): 0,
          (4, 6): 0.2, (5, 6): 0.2}  # one 0 edge length, not involving the root
    nodes1 = ['A', 'B', 'C', 'D', 'temp0', 'temp1', 'temp2']
    P_pushed1 = L2U.push_up(P1, T1, l1, nodes1)
    x = np.sqrt(L2U.epsilon) * 0.3
    answer1 = np.array([0.0316227766, 0.0632455532, 0, x, 0.134164079, 0.268328157, 1])
    assert all(np.abs(P_pushed1 - answer1) < 0.00000001)  # test push_up
    assert P_pushed1[3] > 10**-18  # P_pushed1[3] (edge length 0) is non-zero
    P_inversed1 = L2U.inverse_push_up(P_pushed1, T1, l1, nodes1)
    assert np.sum(abs(P1 - P_inversed1)) < 10**-12  # test inverse_push_up

    l2 = {(0, 4): 0.1, (1, 4): 0.1, (2, 5): 0.2, (3, 5): 0,
          (4, 6): 0.2, (5, 6): 0}  # more than one 0 edge length, involving the root
    y = np.sqrt(L2U.epsilon) * 0.6
    P_pushed2 = L2U.push_up(P1, T1, l2, nodes1)
    answer2 = np.array([0.0316227766, 0.0632455532, 0, x, 0.134164079, y, 1])
    assert all(np.abs(P_pushed2 - answer2) < 0.00000001)

    # Test with real data.
    Q = env_prob_dict['232.M2Lsft217']
    Q_pushed = L2U.push_up(Q, Tint, lint, nodes_in_order)
    Q_inversed = L2U.inverse_push_up(Q_pushed, Tint, lint, nodes_in_order)
    assert np.sum(abs(Q - Q_inversed)) < 10**-12
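# Note on the epsilon convention exercised above: push_up scales the subtree
# mass on each edge by the square root of its branch length (e.g. the leaf
# entry 0.1 * sqrt(0.1) = 0.0316227766 in answer1), so a literal zero-length
# edge would zero out its mass and make the transform non-invertible.
# L2U.epsilon substitutes a tiny positive length, which is why the expected
# entries for zero-length edges are np.sqrt(L2U.epsilon) * mass (the x and y
# values above) rather than 0, and why inverse_push_up can recover P1 exactly.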
def generate_diffab(biom_file, tree_file, metadata_file, tax_file, verbose, threads,
                    intermediate_store, preprocessed_use, unifrac_code, output_file):
    if verbose:
        print('\tExtracting metadata...')
    (Tint, lint, nodes_in_order) = L2U.parse_tree_file(tree_file)
    metadata = meta.extract_metadata(metadata_file)
    sample_groups = []
    groups_temp = list(metadata.values())
    groups = []
    for i in range(len(groups_temp)):
        if groups_temp[i]['body_site'] not in groups:
            groups.append(groups_temp[i]['body_site'])
    group_str = ','.join(groups)
    if verbose:
        print('\tSuccessfully extracted metadata')

    if (preprocessed_use
            and path.exists('intermediate/L1_preprocessed_intermediate.txt')
            and path.exists('intermediate/L2_preprocessed_intermediate.txt')):
        L1_preprocessed = CSV.read('intermediate/L1_preprocessed_intermediate.txt')
        L2_preprocessed = CSV.read('intermediate/L2_preprocessed_intermediate.txt')
        if verbose:
            print('\tSuccessfully retrieved intermediate file for L1 and L2 Preprocessing')
    else:
        if verbose and preprocessed_use:
            print('\tWarning: Intermediate selected but not available. Starting preprocessing... This may take a while...')
        elif verbose:
            print('\tWarning: Biom preprocessing starting... This may take a while...')
        if intermediate_store:
            if path.exists('intermediate/L1_preprocessed_intermediate.txt'):
                os.remove('intermediate/L1_preprocessed_intermediate.txt')
            if path.exists('intermediate/L2_preprocessed_intermediate.txt'):
                os.remove('intermediate/L2_preprocessed_intermediate.txt')
            L1_preprocessed, L2_preprocessed = prep.generate_preprocessed(
                biom_file, tree_file, 1,
                'intermediate/L1_preprocessed_intermediate.txt',
                'intermediate/L2_preprocessed_intermediate.txt')
        else:
            L1_preprocessed, L2_preprocessed = prep.generate_preprocessed(
                biom_file, tree_file, unifrac_code,
                'tmp_L1_preprocessed_intermediate.txt',
                'tmp_L2_preprocessed_intermediate.txt')
        if verbose:
            print('\tCompleted biom preprocessing matrix computation')

    if unifrac_code == 1 or unifrac_code == 2:
        if preprocessed_use or intermediate_store:
            (L1_region_names, L1_tax_arr, L1_group_averages, L1_inverse_pushed,
             L1_neg_arr, L1_distance_matrix,
             L1_node_type_group_abundances) = avg.compute_L1_averages(
                'intermediate/L1_preprocessed_intermediate.txt', biom_file,
                tree_file, metadata_file, tax_file,
                'reports/' + str(output_file) + '_avg_report.csv', most_shared=True)
        else:
            (L1_region_names, L1_tax_arr, L1_group_averages, L1_inverse_pushed,
             L1_neg_arr, L1_distance_matrix,
             L1_node_type_group_abundances) = avg.compute_L1_averages(
                'tmp_L1_preprocessed_intermediate.txt', biom_file,
                tree_file, metadata_file, tax_file,
                'reports/' + str(output_file) + '_avg_report.csv', most_shared=True)
        diff.generate_diffab(L1_region_names, L1_inverse_pushed, Tint, lint,
                             nodes_in_order, L1_tax_arr, 'L1' + output_file,
                             0.000005, 10, True, 1)
    if unifrac_code == 0 or unifrac_code == 1:
        if preprocessed_use or intermediate_store:
            (L2_region_names, L2_tax_arr, L2_group_averages, L2_inverse_pushed,
             L2_neg_arr, L2_distance_matrix,
             L2_node_type_group_abundances) = avg.compute_L2_averages(
                'intermediate/L2_preprocessed_intermediate.txt', biom_file,
                tree_file, metadata_file, tax_file,
                'reports/' + str(output_file) + '_avg_report.csv', most_shared=True)
        else:
            (L2_region_names, L2_tax_arr, L2_group_averages, L2_inverse_pushed,
             L2_neg_arr, L2_distance_matrix,
             L2_node_type_group_abundances) = avg.compute_L2_averages(
                'tmp_L2_preprocessed_intermediate.txt', biom_file,
                tree_file, metadata_file, tax_file,
                'reports/' + str(output_file) + '_avg_report.csv', most_shared=True)
        diff.generate_diffab(L2_region_names, L2_inverse_pushed, Tint, lint,
                             nodes_in_order, L2_tax_arr, 'L2' + output_file,
                             0.000005, 10, True, 2)
    if path.exists('tmp_L1_preprocessed_intermediate.txt'):
        os.remove('tmp_L1_preprocessed_intermediate.txt')
    if path.exists('tmp_L2_preprocessed_intermediate.txt'):
        os.remove('tmp_L2_preprocessed_intermediate.txt')
def Group_Pairwise(biom_file, tree_file, metadata_file, group_num, output_file=None,
                   debug=0, max_cores=int(mp.cpu_count() / 4)):
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples
    if max_cores > mp.cpu_count() or max_cores <= 1:
        cores = mp.cpu_count() - 1
    else:
        cores = max_cores
    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes_in_order)
    PCoA_Samples = BW.extract_samples(biom_file)
    group_num -= 1

    metadata = meta.extract_metadata(metadata_file)
    sample_groups = []
    groups_temp = list(metadata.values())
    groups = []
    for i in range(len(groups_temp)):
        if groups_temp[i]['body_site'] not in groups:
            groups.append(groups_temp[i]['body_site'])
    print(groups)
    sample_sites = [[] for i in range(len(groups))]

    # Separate the groups
    for i in range(len(PCoA_Samples)):
        for j in range(len(groups)):
            if metadata[PCoA_Samples[i]]['body_site'] == groups[j]:
                sample_sites[j].append(PCoA_Samples[i])
    print(sample_sites)

    if debug == 1:
        print(f"Running Debugging Multiprocess on {cores} Cores...")
        # Testing subset of samples...
        sample_sites[group_num] = sample_sites[group_num][:64]
        local_vars = list(locals().items())
        for var, obj in local_vars:
            print(f"{var.ljust(17)}: {sys.getsizeof(obj)}")

    # Multi Core Method
    row = [(i, j) for j in range(len(sample_sites[group_num]))
           for i in range(len(sample_sites[group_num]))]
    with mp.Pool(processes=cores) as pool:
        result = pool.map(unifrac_work_wrapper, row)
    result_matrix = []
    for i in range(len(sample_sites[group_num])):
        dist_list = []
        for j in range(len(sample_sites[group_num])):
            dist_list.append(result[i * len(sample_sites[group_num]) + j][0])
            if debug == 1:
                print(result[i * len(sample_sites[group_num]) + j][1])
        result_matrix.append(dist_list)
        if output_file is not None:
            CSV.write(output_file[:-4] + "-" + groups[group_num] + '.csv', dist_list)
    return result_matrix
def unifrac_worker(samp1num, samp2num):
    L2UniFrac = L2U.L2Unifrac_weighted_plain(
        T1, l1, nodes_in_order,
        nodes_weighted[PCoA_Samples[samp1num]],
        nodes_weighted[PCoA_Samples[samp2num]])
    formatted_L2 = "{:.16f}".format(L2UniFrac)
    return L2UniFrac, (f"\tInner loop: {str(samp2num).zfill(4)} | "
                       f"L2-UniFrac: {formatted_L2} | "
                       f"Sample 1: {PCoA_Samples[samp1num]} | "
                       f"Sample 2: {PCoA_Samples[samp2num]}")
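# Total_Pairwise and Group_Pairwise map unifrac_work_wrapper over (i, j) index
# tuples, but its definition is not shown in this section. A minimal sketch,
# assuming the wrapper only needs to unpack the tuple for pool.map (which
# passes a single argument per task):
def unifrac_work_wrapper(index_pair):
    # Unpack the sample-index pair and delegate to the real worker.
    return unifrac_worker(index_pair[0], index_pair[1])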
def compute_L1_L2_averages(L1_file, L2_file, biom_file, tree_file, metadata_file,
                           tax_file, output_file=None):
    if output_file is not None and path.exists(output_file):
        os.remove(output_file)

    # Note: these are the same for L1/L2, so they will be computed only once.
    # (Use T1 for ancestors of temp nodes.)
    #nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    # Subsample Biom file (2 samples). Then trace the mass to find where it no longer sums to 1.
    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)

    # Extract region names and map samples to regions.
    region_names = []
    region_map = {}
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_map[metadata[PCoA_Samples[i]]['body_site']] = []
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        region_map[metadata[PCoA_Samples[i]]['body_site']].append(i)
        PCoA_Samples[i] = region_names.index(metadata[PCoA_Samples[i]]['body_site'])

    # Read sparse matrices.
    if not isinstance(L1_file, list):
        sparse_matrix_L1 = CSV.read_sparse(L1_file)
    else:
        sparse_matrix_L1 = L1_file
    if not isinstance(L2_file, list):
        sparse_matrix_L2 = CSV.read_sparse(L2_file)
    else:
        sparse_matrix_L2 = L2_file
    group_averages_L1 = {}
    group_averages_L2 = {}

    # Store region names for later.
    if output_file is not None:
        CSV.write(output_file, region_names)

    # Write taxa for each cell.
    taxonomies = tax.extract_tax(tax_file)
    tax_arr = []
    for i in range(len(nodes_in_order)):
        if nodes_in_order[i][0] != 't':
            tax_arr.append(taxonomies[int(nodes_in_order[i])])
        else:
            loop = True
            if i in T1:
                temp_node = T1[i]
            else:
                tax_arr.append('internal')
                loop = False
            while loop:
                if nodes_in_order[temp_node][0] != 't':
                    tax_arr.append(taxonomies[int(nodes_in_order[temp_node])])
                    break
                else:
                    if temp_node in T1:
                        temp_node = T1[temp_node]
                    else:
                        tax_arr.append('internal')
                        break
    if output_file is not None:
        CSV.write(output_file, tax_arr)

    # Take the L1 average (median) of each group.
    L1_pushed_arr = []
    for i in range(len(region_names)):
        group_arr = []
        if not isinstance(L1_file, list):
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L1[region_map[region_names[i]][j]].todense())[0])
        else:
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L1[region_map[region_names[i]][j]])[0])
        average = L1U.median_of_vectors(group_arr)
        group_averages_L1[region_names[i]] = average
        L1_pushed_arr.append(average)

    # Store L1 averages.
    print("L1 Group Averages:")
    if output_file is not None:
        CSV.write(output_file, ["L1 Group Averages:"])
    for name in region_names:
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {group_averages_L1[name]}")
        if output_file is not None:
            CSV.write(output_file, group_averages_L1[name])

    # Take the L2 average (mean) of each group.
    L2_pushed_arr = []
    for i in range(len(region_names)):
        group_arr = []
        if not isinstance(L2_file, list):
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L2[region_map[region_names[i]][j]].todense())[0])
        else:
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L2[region_map[region_names[i]][j]])[0])
        average = L2U.mean_of_vectors(group_arr)
        group_averages_L2[region_names[i]] = average
        L2_pushed_arr.append(average)

    # Store L2 averages.
    print("\nL2 Group Averages:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Group Averages:"])
    for name in region_names:
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {group_averages_L2[name]}")
        if output_file is not None:
            CSV.write(output_file, group_averages_L2[name])

    # Push L1 down and store.
    print("\nL1 Inverse Push Up:")
    if output_file is not None:
        CSV.write(output_file, ["L1 Inverse Pushed Up:"])
    L1_neg_arr = []
    L1_inverse_pushed = {}
    for name in region_names:
        neg_count = 0
        median_inverse = L1U.inverse_push_up(group_averages_L1[name], T1, l1, nodes_in_order)
        L1_inverse_pushed[name] = median_inverse
        for i in range(len(median_inverse)):
            if median_inverse[i] < negatives_filtering_threshold:
                neg_count += 1
        L1_neg_arr.append(neg_count)
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {median_inverse}")
        if output_file is not None:
            CSV.write(output_file, median_inverse)

    # Push L2 down and store.
    print("\nL2 Inverse Push Up:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Inverse Pushed Up:"])
    L2_neg_arr = []
    L2_inverse_pushed = {}
    for name in region_names:
        neg_count = 0
        mean_inverse = L2U.inverse_push_up(group_averages_L2[name], T1, l1, nodes_in_order)
        L2_inverse_pushed[name] = mean_inverse
        for i in range(len(mean_inverse)):
            if mean_inverse[i] < negatives_filtering_threshold:
                neg_count += 1
        L2_neg_arr.append(neg_count)
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {mean_inverse}")
        if output_file is not None:
            CSV.write(output_file, mean_inverse)

    # Write negative counts.
    print("L1 and L2 Negatives by Group:")
    print(L1_neg_arr)
    print(L2_neg_arr)
    if output_file is not None:
        CSV.write(output_file, ["L1 and L2 Negatives by Group:"])
        CSV.write(output_file, L1_neg_arr)
        CSV.write(output_file, L2_neg_arr)

    L1_distance_matrix = compute_pairwise_pushed_L1(L1_pushed_arr)
    L2_distance_matrix = compute_pairwise_pushed_L2(L2_pushed_arr)
    print("L1 Distance Matrix:")
    if output_file is not None:
        CSV.write(output_file, ["L1 Distance Matrix:"])
    for i in range(len(L1_pushed_arr)):
        print(L1_distance_matrix[i])
        if output_file is not None:
            CSV.write(output_file, L1_distance_matrix[i])
    print("L2 Distance Matrix:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Distance Matrix:"])
    for i in range(len(L2_pushed_arr)):
        print(L2_distance_matrix[i])
        if output_file is not None:
            CSV.write(output_file, L2_distance_matrix[i])

    print("L1 Abundances by Node Type:")
    if output_file is not None:
        CSV.write(output_file, ["L1 Abundances by Node Type:"])
    L1_node_type_group_abundances = []
    for name in region_names:
        region_abundance_vector = L1_inverse_pushed[name]
        k = p = c = o = f = g = s = temp = 0
        for i in range(len(region_abundance_vector)):
            node_tax = tax_arr[i].split(';')
            if len(node_tax) > 1:
                if node_tax[-2][0] == 'k':
                    k += region_abundance_vector[i]
                elif node_tax[-2][0] == 'p':
                    p += region_abundance_vector[i]
                elif node_tax[-2][0] == 'c':
                    c += region_abundance_vector[i]
                elif node_tax[-2][0] == 'o':
                    o += region_abundance_vector[i]
                elif node_tax[-2][0] == 'f':
                    f += region_abundance_vector[i]
                elif node_tax[-2][0] == 'g':
                    g += region_abundance_vector[i]
                elif node_tax[-2][0] == 's':
                    s += region_abundance_vector[i]
                else:
                    print("Error")
            else:
                temp += region_abundance_vector[i]
        print([k, p, c, o, f, g, s, temp])
        if output_file is not None:
            CSV.write(output_file, [k, p, c, o, f, g, s, temp])
        L1_node_type_group_abundances.append([k, p, c, o, f, g, s, temp])

    print("L2 Abundances by Node Type:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Abundances by Node Type:"])
    L2_node_type_group_abundances = []
    for name in region_names:
        region_abundance_vector = L2_inverse_pushed[name]
        k = p = c = o = f = g = s = temp = 0
        for i in range(len(region_abundance_vector)):
            node_tax = tax_arr[i].split(';')
            if len(node_tax) > 1:
                if node_tax[-2][0] == 'k':
                    k += region_abundance_vector[i]
                elif node_tax[-2][0] == 'p':
                    p += region_abundance_vector[i]
                elif node_tax[-2][0] == 'c':
                    c += region_abundance_vector[i]
                elif node_tax[-2][0] == 'o':
                    o += region_abundance_vector[i]
                elif node_tax[-2][0] == 'f':
                    f += region_abundance_vector[i]
                elif node_tax[-2][0] == 'g':
                    g += region_abundance_vector[i]
                elif node_tax[-2][0] == 's':
                    s += region_abundance_vector[i]
                else:
                    print("Error")
            else:
                temp += region_abundance_vector[i]
        print([k, p, c, o, f, g, s, temp])
        if output_file is not None:
            CSV.write(output_file, [k, p, c, o, f, g, s, temp])
        L2_node_type_group_abundances.append([k, p, c, o, f, g, s, temp])

    return (region_names, tax_arr, group_averages_L1, group_averages_L2,
            L1_inverse_pushed, L2_inverse_pushed, L1_neg_arr, L2_neg_arr,
            L1_distance_matrix, L2_distance_matrix,
            L1_node_type_group_abundances, L2_node_type_group_abundances)
def L2_pushup_worker(sample_num):
    L2_Pushed = L2U.push_up(nodes_weighted[PCoA_Samples[sample_num]],
                            T1, l1, nodes_in_order)
    return L2_Pushed
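# generate_preprocessed also maps an L1_pushup_worker, whose definition lives
# elsewhere in the repository. By analogy with L2_pushup_worker it presumably
# performs the L1 push-up; a sketch under the assumption that L1U exposes a
# push_up mirroring L2U.push_up:
def L1_pushup_worker(sample_num):
    # Assumed API: L1U.push_up with the same signature as L2U.push_up.
    L1_Pushed = L1U.push_up(nodes_weighted[PCoA_Samples[sample_num]],
                            T1, l1, nodes_in_order)
    return L1_Pushed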
def test_push_up():
    tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 0, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    unifrac2 = np.linalg.norm(
        L2U.push_up(nodes_weighted['sample1'], T1, l1, nodes1) -
        L2U.push_up(nodes_weighted['sample2'], T1, l1, nodes1))
    L2_UniFrac = L2U.L2Unifrac_weighted_plain(
        T1, l1, nodes1, nodes_weighted['sample1'],
        nodes_weighted['sample2'])  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac)
    assert np.abs(unifrac2 - L2_UniFrac) < 10**-12

    tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 1, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    unifrac2 = np.linalg.norm(
        L2U.push_up(nodes_weighted['sample1'], T1, l1, nodes1) -
        L2U.push_up(nodes_weighted['sample2'], T1, l1, nodes1))
    L2_UniFrac = L2U.L2Unifrac_weighted_plain(
        T1, l1, nodes1, nodes_weighted['sample1'],
        nodes_weighted['sample2'])  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac)
    assert np.abs(unifrac2 - L2_UniFrac) < 10**-12

    # Test with real data.
    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    unifrac2 = np.linalg.norm(
        L2U.push_up(P, Tint, lint, nodes_in_order) -
        L2U.push_up(Q, Tint, lint, nodes_in_order))
    L2_UniFrac2 = L2U.L2Unifrac_weighted_plain(
        Tint, lint, nodes_in_order, P, Q)  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac2)
    assert np.abs(unifrac2 - L2_UniFrac2) < 10**-12
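# The identity this test relies on: with W2 the linear push-up operator
# verified in test_W2 (W2.dot(P) == push_up(P)),
#     L2-UniFrac(P, Q) = || push_up(P) - push_up(Q) ||_2 = || W2 (P - Q) ||_2,
# i.e. the L2-UniFrac metric is the Euclidean distance between samples in the
# pushed-up coordinates.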
def test_summation():
    tree_str1 = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1_1, l1_1, nodes1) = L2U.parse_tree(tree_str1)
    nodes_samples1 = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 0, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted1, samples_temp1) = L2U.parse_envs(nodes_samples1, nodes1)
    push_up_1 = L2U.push_up(nodes_weighted1['sample1'], T1_1, l1_1, nodes1)
    push_up_2 = L2U.push_up(nodes_weighted1['sample2'], T1_1, l1_1, nodes1)
    push_up_avg = L2U.mean_of_vectors([push_up_1, push_up_2])
    push_down_avg = L2U.inverse_push_up(push_up_avg, T1_1, l1_1, nodes1)
    assert (1 - sum(push_down_avg) < 10**-12)

    tree_str2 = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1_2, l1_2, nodes2) = L2U.parse_tree(tree_str2)
    nodes_samples2 = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 1, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted2, samples_temp2) = L2U.parse_envs(nodes_samples2, nodes2)
    push_up_1 = L2U.push_up(nodes_weighted2['sample1'], T1_2, l1_2, nodes2)
    push_up_2 = L2U.push_up(nodes_weighted2['sample2'], T1_2, l1_2, nodes2)
    push_up_avg = L2U.mean_of_vectors([push_up_1, push_up_2])
    push_down_avg = L2U.inverse_push_up(push_up_avg, T1_2, l1_2, nodes2)
    assert (1 - sum(push_down_avg) < 10**-12)

    # Test with real data.
    P1 = env_prob_dict['232.M9Okey217']
    P2 = env_prob_dict['232.M3Indl217']
    P3 = env_prob_dict['232.L3Space217']
    P4 = env_prob_dict['232.M9Vkey217']
    P5 = env_prob_dict['232.M2Jkey217']
    P6 = env_prob_dict['232.M2Mkey217']
    P7 = env_prob_dict['232.M3Rinl217']
    P8 = env_prob_dict['232.M3Midl217']
    push_up_1 = L2U.push_up(P1, Tint, lint, nodes_in_order)
    push_up_2 = L2U.push_up(P2, Tint, lint, nodes_in_order)
    push_up_3 = L2U.push_up(P3, Tint, lint, nodes_in_order)
    push_up_4 = L2U.push_up(P4, Tint, lint, nodes_in_order)
    push_up_5 = L2U.push_up(P5, Tint, lint, nodes_in_order)
    push_up_6 = L2U.push_up(P6, Tint, lint, nodes_in_order)
    push_up_7 = L2U.push_up(P7, Tint, lint, nodes_in_order)
    push_up_8 = L2U.push_up(P8, Tint, lint, nodes_in_order)
    push_up_avg = L2U.mean_of_vectors([
        push_up_1, push_up_2, push_up_3, push_up_4,
        push_up_5, push_up_6, push_up_7, push_up_8
    ])
    push_down_avg = L2U.inverse_push_up(push_up_avg, Tint, lint, nodes_in_order)
    assert (1 - sum(push_down_avg) < 10**-12)
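# Why the sums above return to 1: push_up is linear (test_W2), so the mean of
# pushed-up vectors equals the push-up of the mean of the inputs, and
# inverse_push_up recovers that mean exactly. Since each input distribution
# sums to 1, the averaged, pushed-down vector does as well, up to
# floating-point error.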
def test_weighted():
    tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 0, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    unifrac2 = L2U.L2Unifrac_weighted_plain(T1, l1, nodes1,
                                            nodes_weighted['sample1'],
                                            nodes_weighted['sample2'])
    L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
        T1, l1, nodes1, nodes_weighted['sample1'],
        nodes_weighted['sample2'])  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac)
    assert np.abs(unifrac2 - L2_UniFrac) < 10**-12

    tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {'sample1': 1, 'sample2': 0},
        'B': {'sample1': 1, 'sample2': 1},
        'A': {'sample1': 1, 'sample2': 0},
        'temp0': {'sample1': 0, 'sample2': 1}
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    unifrac2 = L2U.L2Unifrac_weighted_plain(T1, l1, nodes1,
                                            nodes_weighted['sample1'],
                                            nodes_weighted['sample2'])
    L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
        T1, l1, nodes1, nodes_weighted['sample1'],
        nodes_weighted['sample2'])  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac)
    assert np.abs(unifrac2 - L2_UniFrac) < 10**-12

    # Test with real data.
    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    unifrac2 = L2U.L2Unifrac_weighted_plain(Tint, lint, nodes_in_order, P, Q)
    L2_UniFrac2, DifferentialAbundance = L2U.L2Unifrac_weighted(
        Tint, lint, nodes_in_order, P, Q)  # calculated using L2Unifrac
    print(unifrac2, L2_UniFrac2)
    assert np.abs(unifrac2 - L2_UniFrac2) < 10**-12
def generate_preprocessed(biom_file, tree_file, unifrac_code, output_file_L1=None,
                          output_file_L2=None, max_cores=int(mp.cpu_count() / 4)):
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples
    if max_cores > mp.cpu_count() or max_cores <= 1:
        cores = mp.cpu_count() - 1
    else:
        cores = max_cores

    # Note: these are the same for L1/L2, so they will be computed only once.
    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes_in_order)
    PCoA_Samples = BW.extract_samples(biom_file)

    # Multi Core Method
    L1_preprocessed = []
    L2_preprocessed = []
    values = range(len(PCoA_Samples))
    dim1 = len(PCoA_Samples)
    dim2 = len(L1_pushup_worker(0))
    # Parentheses matter here: without them, "A and B or C" parses as
    # "(A and B) or C", which would attempt CSV.write(None, ...) when
    # unifrac_code == 2 with no L1 output file (and likewise for L2).
    if output_file_L1 is not None and (unifrac_code == 1 or unifrac_code == 2):
        CSV.write(output_file_L1, [dim1, dim2])
        L1_preprocessed.append([dim1, dim2])
    if output_file_L2 is not None and (unifrac_code == 0 or unifrac_code == 1):
        CSV.write(output_file_L2, [dim1, dim2])
        L2_preprocessed.append([dim1, dim2])
    if unifrac_code == 1 or unifrac_code == 2:
        with mp.Pool(processes=cores) as pool:
            result = pool.map(L1_pushup_worker, values)  # chunksize?
        for i in range(len(result)):
            for j in range(len(result[i])):
                if result[i][j] != 0:
                    if output_file_L1 is not None:
                        CSV.write(output_file_L1, [i, j, result[i][j]])
                    L1_preprocessed.append([i, j, result[i][j]])
    if unifrac_code == 0 or unifrac_code == 1:
        with mp.Pool(processes=cores) as pool:
            result = pool.map(L2_pushup_worker, values)
        for i in range(len(result)):
            for j in range(len(result[i])):
                if result[i][j] != 0:
                    if output_file_L2 is not None:
                        CSV.write(output_file_L2, [i, j, result[i][j]])
                    L2_preprocessed.append([i, j, result[i][j]])
    if unifrac_code == 0:
        return [], L2_preprocessed
    if unifrac_code == 2:
        return L1_preprocessed, []
    if unifrac_code == 1:
        return L1_preprocessed, L2_preprocessed
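# A hedged usage sketch. The unifrac_code convention recoverable from the
# returns above is: 0 = L2 only, 1 = both L1 and L2, 2 = L1 only. The file
# names below are hypothetical placeholders.
#
#   L1_pre, L2_pre = generate_preprocessed('samples.biom', 'ref.tree', 1,
#                                          'L1_out.txt', 'L2_out.txt')
#
# Each returned list is in sparse triplet form: a [dim1, dim2] header row
# followed by [sample_index, node_index, value] rows for every nonzero
# pushed-up entry.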
def compute_L2_averages(L2_file, biom_file, tree_file, metadata_file, tax_file,
                        output_file=None, most_shared=False):
    if output_file is not None and path.exists(output_file):
        os.remove(output_file)

    # Note: these are the same for L1/L2, so they will be computed only once.
    # (Use T1 for ancestors of temp nodes.)
    #nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    # Subsample Biom file (2 samples). Then trace the mass to find where it no longer sums to 1.
    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)

    # Extract region names and map samples to regions.
    region_names = []
    region_map = {}
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_map[metadata[PCoA_Samples[i]]['body_site']] = []
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        region_map[metadata[PCoA_Samples[i]]['body_site']].append(i)
        PCoA_Samples[i] = region_names.index(metadata[PCoA_Samples[i]]['body_site'])

    # Read sparse matrix.
    if not isinstance(L2_file, list):
        sparse_matrix_L2 = CSV.read_sparse(L2_file)
    else:
        sparse_matrix_L2 = L2_file
    group_averages_L2 = {}

    # Store region names for later.
    if output_file is not None:
        CSV.write(output_file, region_names)

    # Write taxa for each cell.
    taxonomies = tax.extract_tax(tax_file)
    tax_arr = []
    if most_shared:
        leaf_nodes = []
        for i in range(len(nodes_in_order)):
            if nodes_in_order[i][0] != 't':
                leaf_nodes.append(i)
        descendent_dict = {i: [] for i in range(len(nodes_in_order))}
        for i in range(len(leaf_nodes)):
            temp_node = leaf_nodes[i]
            while True:
                if temp_node in T1:
                    if temp_node != leaf_nodes[i]:
                        descendent_dict[temp_node].append(leaf_nodes[i])
                    temp_node = T1[temp_node]
                else:
                    if temp_node != leaf_nodes[i]:
                        descendent_dict[temp_node].append(leaf_nodes[i])
                    break
        for i in range(len(nodes_in_order)):
            descendent_taxonomy = []
            for j in range(len(descendent_dict[i])):
                descendent_taxonomy.append(taxonomies[int(nodes_in_order[descendent_dict[i][j]])])
            if len(descendent_taxonomy) == 0:
                tax_arr.append(taxonomies[int(nodes_in_order[i])])
            else:
                k_dict = {}
                p_dict = {}
                c_dict = {}
                o_dict = {}
                f_dict = {}
                g_dict = {}
                s_dict = {}
                for j in range(len(descendent_taxonomy)):
                    tmp_taxa = descendent_taxonomy[j].split(';')[:-1]
                    for k in range(len(tmp_taxa)):
                        if tmp_taxa[k][0] == 'k' and tmp_taxa[k] not in k_dict:
                            k_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'k':
                            k_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 'p' and tmp_taxa[k] not in p_dict:
                            p_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'p':
                            p_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 'c' and tmp_taxa[k] not in c_dict:
                            c_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'c':
                            c_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 'o' and tmp_taxa[k] not in o_dict:
                            o_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'o':
                            o_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 'f' and tmp_taxa[k] not in f_dict:
                            f_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'f':
                            f_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 'g' and tmp_taxa[k] not in g_dict:
                            g_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 'g':
                            g_dict[tmp_taxa[k]] += 1
                        if tmp_taxa[k][0] == 's' and tmp_taxa[k] not in s_dict:
                            s_dict[tmp_taxa[k]] = 1
                        elif tmp_taxa[k][0] == 's':
                            s_dict[tmp_taxa[k]] += 1
                shared_taxonomy = ''
                for key, value in k_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + key
                        break
                for key, value in p_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                for key, value in c_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                for key, value in o_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                for key, value in f_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                for key, value in g_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                for key, value in s_dict.items():
                    if value == len(descendent_taxonomy):
                        shared_taxonomy = shared_taxonomy + ';' + key
                        break
                shared_taxonomy = shared_taxonomy + ';'
                if shared_taxonomy == ';':
                    # Root node will include taxonomy from Archaea and Bacteria, thus sharing nothing.
                    shared_taxonomy = 'Root;'
                tax_arr.append(shared_taxonomy)
    else:
        for i in range(len(nodes_in_order)):
            if nodes_in_order[i][0] != 't':
                tax_arr.append(taxonomies[int(nodes_in_order[i])])
            else:
                loop = True
                if i in T1:
                    temp_node = T1[i]
                else:
                    tax_arr.append('internal')
                    loop = False
                while loop:
                    if nodes_in_order[temp_node][0] != 't':
                        tax_arr.append(taxonomies[int(nodes_in_order[temp_node])])
                        break
                    else:
                        if temp_node in T1:
                            temp_node = T1[temp_node]
                        else:
                            tax_arr.append('internal')
                            break
    if output_file is not None:
        CSV.write(output_file, tax_arr)

    # Take the L2 average of each group.
    L2_pushed_arr = []
    for i in range(len(region_names)):
        group_arr = []
        if not isinstance(L2_file, list):
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L2[region_map[region_names[i]][j]].todense())[0])
        else:
            for j in range(len(region_map[region_names[i]])):
                group_arr.append(np.array(sparse_matrix_L2[region_map[region_names[i]][j]])[0])
        average = L2U.mean_of_vectors(group_arr)
        group_averages_L2[region_names[i]] = average
        L2_pushed_arr.append(average)

    # Store L2 averages.
    print("\nL2 Group Averages:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Group Averages:"])
    for name in region_names:
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {group_averages_L2[name]}")
        if output_file is not None:
            CSV.write(output_file, group_averages_L2[name])

    # Push L2 down and store.
    print("\nL2 Inverse Push Up:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Inverse Pushed Up:"])
    L2_neg_arr = []
    L2_inverse_pushed = {}
    for name in region_names:
        neg_count = 0
        mean_inverse = L2U.inverse_push_up(group_averages_L2[name], T1, l1, nodes_in_order)
        L2_inverse_pushed[name] = mean_inverse
        for i in range(len(mean_inverse)):
            if mean_inverse[i] < negatives_filtering_threshold:
                neg_count += 1
        L2_neg_arr.append(neg_count)
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {mean_inverse}")
        if output_file is not None:
            CSV.write(output_file, mean_inverse)

    # Write negative counts.
    print("L2 Negatives by Group:")
    print(L2_neg_arr)
    if output_file is not None:
        CSV.write(output_file, ["L2 Negatives by Group:"])
        CSV.write(output_file, L2_neg_arr)

    L2_distance_matrix = compute_pairwise_pushed_L2(L2_pushed_arr)
    print("L2 Distance Matrix:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Distance Matrix:"])
    for i in range(len(L2_pushed_arr)):
        print(L2_distance_matrix[i])
        if output_file is not None:
            CSV.write(output_file, L2_distance_matrix[i])

    print("L2 Abundances by Node Type:")
    if output_file is not None:
        CSV.write(output_file, ["L2 Abundances by Node Type:"])
    L2_node_type_group_abundances = []
    for name in region_names:
        region_abundance_vector = L2_inverse_pushed[name]
        k = p = c = o = f = g = s = temp = 0
        for i in range(len(region_abundance_vector)):
            node_tax = tax_arr[i].split(';')
            if len(node_tax) > 1:
                if node_tax[-2][0] == 'k':
                    k += region_abundance_vector[i]
                elif node_tax[-2][0] == 'p':
                    p += region_abundance_vector[i]
                elif node_tax[-2][0] == 'c':
                    c += region_abundance_vector[i]
                elif node_tax[-2][0] == 'o':
                    o += region_abundance_vector[i]
                elif node_tax[-2][0] == 'f':
                    f += region_abundance_vector[i]
                elif node_tax[-2][0] == 'g':
                    g += region_abundance_vector[i]
                elif node_tax[-2][0] == 's':
                    s += region_abundance_vector[i]
                else:
                    print("Error")
            else:
                temp += region_abundance_vector[i]
        print([k, p, c, o, f, g, s, temp])
        if output_file is not None:
            CSV.write(output_file, [k, p, c, o, f, g, s, temp])
        L2_node_type_group_abundances.append([k, p, c, o, f, g, s, temp])

    return (region_names, tax_arr, group_averages_L2, L2_inverse_pushed,
            L2_neg_arr, L2_distance_matrix, L2_node_type_group_abundances)
import sys
sys.path.append('../L2Unifrac')
sys.path.append('../L2Unifrac/src')
sys.path.append('../src')
import L2Unifrac as L2U
import numpy as np
from scipy.sparse import csc_matrix

try:
    (Tint, lint, nodes_in_order) = L2U.parse_tree_file('../data/old_UniFrac/97_otus_unannotated.tree')
    env_dict = L2U.create_env('../data/old_UniFrac/289_seqs_otus.txt')
except FileNotFoundError:
    (Tint, lint, nodes_in_order) = L2U.parse_tree_file('../data/old_UniFrac/97_otus_unannotated.tree')
    env_dict = L2U.create_env('../data/old_UniFrac/289_seqs_otus.txt')
(env_prob_dict, samples) = L2U.parse_envs(env_dict, nodes_in_order)
def generate_diffab(regions, region_averages, Tint, lint, nodes_in_order,
                    taxonomy_in_order, output, thresh, maxDisp, includeTemp, L,
                    include_tmp_diffab=False):
    if L == 1:
        for i in range(len(region_averages)):
            for j in range(len(region_averages)):
                if i < j:
                    L1_UniFrac, DifferentialAbundance = L1U.EMDUnifrac_weighted(
                        Tint, lint, nodes_in_order,
                        region_averages[regions[i]], region_averages[regions[j]],
                        include_tmp_diffab=include_tmp_diffab)
                    tempDiff = {}
                    for (child, parent), diff in DifferentialAbundance.items():
                        tax = taxonomy_in_order[child]
                        if tax not in tempDiff:
                            tempDiff[tax] = 0
                        tempDiff[tax] += diff
                    newDifferentialAbundance = {}
                    for (child, parent), diff in DifferentialAbundance.items():
                        for tax, diff_sum in tempDiff.items():
                            if taxonomy_in_order[child] == tax and diff_sum != 0:
                                newDifferentialAbundance[(child, parent)] = diff_sum
                                tempDiff[tax] = 0
                    fig = L2U.plot_diffab(nodes_in_order, taxonomy_in_order,
                                          newDifferentialAbundance,
                                          regions[i], regions[j],
                                          plot_zeros=False, thresh=thresh,
                                          show=False, maxDisp=maxDisp,
                                          includeTemp=includeTemp)
                    plt.savefig('images/{0}_diffab_{1}_{2}.png'.format(
                        output, regions[i], regions[j]))
    else:
        for i in range(len(region_averages)):
            for j in range(len(region_averages)):
                if i < j:
                    L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
                        Tint, lint, nodes_in_order,
                        region_averages[regions[i]], region_averages[regions[j]],
                        include_tmp_diffab=include_tmp_diffab)
                    tempDiff = {}
                    for (child, parent), diff in DifferentialAbundance.items():
                        tax = taxonomy_in_order[child]
                        if tax not in tempDiff:
                            tempDiff[tax] = 0
                        tempDiff[tax] += diff
                    newDifferentialAbundance = {}
                    for (child, parent), diff in DifferentialAbundance.items():
                        for tax, diff_sum in tempDiff.items():
                            if taxonomy_in_order[child] == tax and diff_sum != 0:
                                newDifferentialAbundance[(child, parent)] = diff_sum
                                tempDiff[tax] = 0
                    fig = L2U.plot_diffab(nodes_in_order, taxonomy_in_order,
                                          newDifferentialAbundance,
                                          regions[i], regions[j],
                                          plot_zeros=False, thresh=thresh,
                                          show=False, maxDisp=maxDisp,
                                          includeTemp=includeTemp)
                    plt.savefig('images/{0}_diffab_{1}_{2}.png'.format(
                        output, regions[i], regions[j]))
computations.
"""

import sys
sys.path.append('../L2Unifrac')
sys.path.append('../L2Unifrac/src')
sys.path.append('../src')
sys.path.append('../scripts')
import L2Unifrac as L2U
import averages as avg
import TaxWrapper as tax
import numpy as np

# Compute diffab
(Tint, lint, nodes_in_order) = L2U.parse_tree_file('../data/trees/gg_13_5_otus_99_annotated.tree')
(L2_region_names, L2_tax_arr, L2_group_averages, L2_inverse_pushed, L2_neg_arr,
 L2_distance_matrix, L2_node_type_group_abundances) = avg.compute_L2_averages(
    '../scripts/L2-Push-Out.csv',
    '../data/47422_otu_table.biom',
    '../data/trees/gg_13_5_otus_99_annotated.tree',
    '../data/metadata/P_1928_65684500_raw_meta.txt',
    '../data/taxonomies/gg_13_8_99.gg.tax',
    '../scripts/Group-Averages-2.csv', True)
L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
    Tint, lint, nodes_in_order,
    L2_inverse_pushed[L2_region_names[0]],
    L2_inverse_pushed[L2_region_names[1]])

# Separate dictionaries
group_1_diffab = {key[0]: 0 for key, value in DifferentialAbundance.items()}
group_2_diffab = {key[0]: 0 for key, value in DifferentialAbundance.items()}
for key, value in DifferentialAbundance.items():
    if DifferentialAbundance[key] > 0: