Exemple #1
0
def test_mean(Ps, Tint, lint, nodes_in_order):
    """Report whether a pushed-up average maps back to any negative entry.

    Every vector in ``Ps`` is pushed up with ``L2U.push_up``, the pushed
    vectors are averaged with ``L2U.mean_of_vectors``, and the average is
    mapped back with ``L2U.inverse_push_up``.

    Args:
        Ps: iterable of vectors to average.
        Tint, lint, nodes_in_order: tree description forwarded unchanged to
            the ``L2U`` helpers.

    Returns:
        The result of ``np.any`` over "entry < 0" for the mapped-back mean.
    """
    pushed_vectors = [
        L2U.push_up(vector, Tint, lint, nodes_in_order) for vector in Ps
    ]
    averaged = L2U.mean_of_vectors(pushed_vectors)
    recovered = L2U.inverse_push_up(averaged, Tint, lint, nodes_in_order)
    return np.any(recovered < 0)
def test_W2():
    """Check that the W2 matrix and its inverse agree with ``L2U.push_up``.

    For a three-node toy tree (plus the implicit root ``temp0``) and for two
    real samples, multiplying a sample by ``L2U.build_W2`` must reproduce
    ``L2U.push_up``; on the toy tree the matrix from ``L2U.inverse_W2`` must
    also map the pushed vector back to the original sample.

    All comparisons use the absolute difference. A signed difference checked
    against a positive tolerance would accept any error that happens to be
    negative.
    """
    tolerance = 10**-12
    sample_names = ('sample1', 'sample2', 'sample3')

    tree_str = '((B:0.4,C:0.6)A:1);'  # there is an internal node (temp0) here.
    (T1, l1, nodes1) = L2U.parse_tree(tree_str)
    nodes_samples = {
        'C': {
            'sample1': 0,
            'sample2': 0.5,
            'sample3': 0.17
        },
        'B': {
            'sample1': 0,
            'sample2': 0.33,
            'sample3': 0.5
        },
        'A': {
            'sample1': 1,
            'sample2': 0.17,
            'sample3': 0.33
        },
        'temp0': {
            'sample1': 0,
            'sample2': 0,
            'sample3': 0
        }
    }  # temp0 is the root node
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
    W2 = L2U.build_W2(T1, l1, nodes1)

    # Reference values from the iterative implementation.
    pushed = {}
    for name in sample_names:
        pushed[name] = L2U.push_up(nodes_weighted[name], T1, l1, nodes1)

    # Matrix form must match push_up.
    via_matrix = {}
    for name in sample_names:
        via_matrix[name] = W2.dot(np.array(nodes_weighted[name]))
    for name in sample_names:
        assert all(np.abs(via_matrix[name] - pushed[name]) < tolerance)

    # Inverse matrix must undo push_up.
    inv_W2 = L2U.inverse_W2(csc_matrix(W2))
    recovered = {}
    for name in sample_names:
        recovered[name] = inv_W2.dot(pushed[name])
    for name in sample_names:
        assert all(np.abs(recovered[name] - nodes_weighted[name]) < tolerance)

    #test with real data
    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    W2 = L2U.build_W2(Tint, lint, nodes_in_order)
    push_up_1 = L2U.push_up(P, Tint, lint, nodes_in_order)
    push_up_2 = L2U.push_up(Q, Tint, lint, nodes_in_order)
    assert all(np.abs(W2.dot(np.array(P)) - push_up_1) < tolerance)
    assert all(np.abs(W2.dot(np.array(Q)) - push_up_2) < tolerance)
Exemple #3
0
def Total_Pairwise(biom_file,
                   tree_file,
                   output_file=None,
                   debug=0,
                   max_cores=int(mp.cpu_count() / 4)):
    """Compute the pairwise distance matrix over all samples of a biom file.

    The tree and the per-sample abundances are parsed once and stored in the
    module globals ``T1``, ``l1``, ``nodes_in_order``, ``nodes_weighted`` and
    ``PCoA_Samples``. Every index pair is then mapped through
    ``unifrac_work_wrapper`` on a ``multiprocessing`` pool.

    Args:
        biom_file: path passed to ``BW.extract_biom`` / ``BW.extract_samples``.
        tree_file: path passed to ``L2U.parse_tree_file``.
        output_file: when not ``None``, each matrix row is passed to
            ``CSV.write`` with this path.
        debug: ``1`` limits the run to the first 64 samples, prints the
            shallow size of the local variables and prints the message
            returned with every distance.
        max_cores: requested worker count. Values of 1 or less, or above the
            CPU count, fall back to "all CPUs but one".

    Returns:
        A list of rows; row ``i`` holds the first element of the worker
        results at positions ``i * n`` to ``i * n + n - 1``.
    """
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples

    if max_cores > mp.cpu_count() or max_cores <= 1:
        # Leave one CPU free, but never ask for zero workers: on a single-CPU
        # host cpu_count() - 1 is 0 and mp.Pool(processes=0) raises ValueError.
        cores = max(1, mp.cpu_count() - 1)
    else:
        cores = max_cores

    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples,
                                                    nodes_in_order)

    PCoA_Samples = BW.extract_samples(biom_file)

    if debug == 1:
        print(f"Running Debugging Multiprocess on {cores} Cores...")

        # Testing subset of samples...
        PCoA_Samples = PCoA_Samples[:64]

        # The local names above are part of this dump, so they are left
        # exactly as they were. sys.getsizeof reports shallow sizes only.
        local_vars = list(locals().items())
        for var, obj in local_vars:
            print(f"{var.ljust(17)}: {sys.getsizeof(obj)}")

    # Multi Core Method
    sample_count = len(PCoA_Samples)
    # j is the outer loop, so position k of `row` is (k % n, k // n).
    row = [(i, j) for j in range(sample_count) for i in range(sample_count)]

    with mp.Pool(processes=cores) as pool:
        result = pool.map(unifrac_work_wrapper, row)

    result_matrix = []
    for i in range(sample_count):
        dist_list = []
        for entry in result[i * sample_count:(i + 1) * sample_count]:
            dist_list.append(entry[0])
            if debug == 1:
                print(entry[1])
        result_matrix.append(dist_list)
        if output_file is not None:
            CSV.write(output_file, dist_list)
    return result_matrix
def test_parse_tree():
    """Check the three structures ``L2U.parse_tree`` returns for a toy tree."""
    newick = '((B:0.1,C:0.2)A:0.3);'
    expected_parents = {0: 2, 1: 2, 2: 3}
    expected_lengths = {(1, 2): 0.1, (2, 3): 0.3, (0, 2): 0.2}
    # temp0 is the root node
    expected_order = ['C', 'B', 'A', 'temp0']

    (parents, lengths, order) = L2U.parse_tree(newick)

    assert parents == expected_parents
    assert lengths == expected_lengths
    assert order == expected_order
def test_inverse():
    """Check ``push_up`` / ``inverse_push_up`` on trees with zero-length edges.

    Two hand-built edge-length tables are used: one with a single zero-length
    edge away from the root, and one with a second zero-length edge that
    touches the root. A real sample is then round-tripped as well.
    """
    #simple tests
    masses = np.array([0.1, 0.2, 0, 0.3, 0, 0.3, 0.1])
    parents = {0: 4, 1: 4, 2: 5, 3: 5, 4: 6, 5: 6}
    # 0 edge_length not involving the root
    lengths_one_zero = {
        (0, 4): 0.1,
        (1, 4): 0.1,
        (2, 5): 0.2,
        (3, 5): 0,
        (4, 6): 0.2,
        (5, 6): 0.2
    }
    names = ['A', 'B', 'C', 'D', 'temp0', 'temp1', 'temp2']

    pushed_one_zero = L2U.push_up(masses, parents, lengths_one_zero, names)
    leaf_value = np.sqrt(L2U.epsilon) * 0.3
    expected_one_zero = np.array([
        0.0316227766, 0.0632455532, 0, leaf_value, 0.134164079, 0.268328157, 1
    ])
    #test push_up
    deviation = np.abs(pushed_one_zero - expected_one_zero)
    assert all(deviation < 0.00000001)
    #P_pushed[3] (edge length 0) is non-zero
    assert pushed_one_zero[3] > 10**-18
    #test inverse_push_up
    round_trip = L2U.inverse_push_up(pushed_one_zero, parents,
                                     lengths_one_zero, names)
    assert np.sum(abs(masses - round_trip)) < 10**-12

    # more than one edge with 0 edge length, involving the root.
    lengths_two_zero = {
        (0, 4): 0.1,
        (1, 4): 0.1,
        (2, 5): 0.2,
        (3, 5): 0,
        (4, 6): 0.2,
        (5, 6): 0
    }
    internal_value = np.sqrt(L2U.epsilon) * 0.6
    pushed_two_zero = L2U.push_up(masses, parents, lengths_two_zero, names)
    expected_two_zero = np.array([
        0.0316227766, 0.0632455532, 0, leaf_value, 0.134164079,
        internal_value, 1
    ])
    deviation = np.abs(pushed_two_zero - expected_two_zero)
    assert all(deviation < 0.00000001)

    #test with real data
    real_sample = env_prob_dict['232.M2Lsft217']
    real_pushed = L2U.push_up(real_sample, Tint, lint, nodes_in_order)
    real_round_trip = L2U.inverse_push_up(real_pushed, Tint, lint,
                                          nodes_in_order)
    assert np.sum(abs(real_sample - real_round_trip)) < 10**-12
Exemple #6
0
def generate_diffab(biom_file, tree_file, metadata_file, tax_file, verbose, threads, intermediate_store, preprocessed_use, unifrac_code, output_file):
	"""Run preprocessing, group averaging and differential-abundance output.

	Args:
		biom_file, tree_file, metadata_file, tax_file: input paths forwarded
			to the preprocessing and averaging helpers.
		verbose: print progress messages when truthy.
		threads: accepted for interface compatibility; not read here.
		intermediate_store: when truthy, preprocessing (always run with code
			1, i.e. both L1 and L2) is written under ``intermediate/``.
		preprocessed_use: when truthy and both stored intermediate files
			exist, they are reused instead of recomputed.
		unifrac_code: 1 or 2 runs the L1 stage, 0 or 1 runs the L2 stage.
		output_file: name used to build the report path and the
			``'L1' + output_file`` / ``'L2' + output_file`` outputs.

	The averaging stage reads the files that preprocessing actually produced.
	Previously, asking to reuse intermediates that were missing while not
	storing new ones wrote the temporary files but then read the
	``intermediate/`` paths.
	"""
	stored_L1 = 'intermediate/L1_preprocessed_intermediate.txt'
	stored_L2 = 'intermediate/L2_preprocessed_intermediate.txt'
	tmp_L1 = 'tmp_L1_preprocessed_intermediate.txt'
	tmp_L2 = 'tmp_L2_preprocessed_intermediate.txt'
	# NOTE(review): both stages get the same report path. The version of
	# compute_L2_averages visible in this source removes an existing report
	# first, so with unifrac_code == 1 the L1 report may be replaced.
	# Confirm whether separate report names were intended.
	report_file = 'reports/' + str(output_file) + '_avg_report.csv'

	if verbose:
		print('\tExtracting metadata...')
	(Tint, lint, nodes_in_order) = L2U.parse_tree_file(tree_file)
	# Kept as an early check that the metadata file can be parsed; the
	# averaging helpers receive the path and load it themselves.
	meta.extract_metadata(metadata_file)
	if verbose:
		print('\tSuccessfully extracted metadata')

	try:
		if preprocessed_use and path.exists(stored_L1) and path.exists(stored_L2):
			# The parsed contents are not used below (averaging is given the
			# paths); the reads are kept so an unreadable file fails here.
			CSV.read(stored_L1)
			CSV.read(stored_L2)
			L1_path, L2_path = stored_L1, stored_L2
			if verbose:
				print('\tSuccessfully retrieved intermediate file for L1 and L2 Preprocessing')
		else:
			if verbose and preprocessed_use:
				print('\tWarning: Intermediate selected but not available. Starting preprocessing... This may take a while...')
			elif verbose:
				print('\tWarning: Biom preprocessing starting... This may take a while...')
			if intermediate_store:
				for stale in (stored_L1, stored_L2):
					if path.exists(stale):
						os.remove(stale)
				prep.generate_preprocessed(biom_file, tree_file, 1, stored_L1, stored_L2)
				L1_path, L2_path = stored_L1, stored_L2
			else:
				prep.generate_preprocessed(biom_file, tree_file, unifrac_code, tmp_L1, tmp_L2)
				L1_path, L2_path = tmp_L1, tmp_L2
			if verbose:
				print('\tCompleted biom preprocessing matrix computation')

		if unifrac_code == 1 or unifrac_code == 2:
			L1_region_names, L1_tax_arr, _, L1_inverse_pushed, _, _, _ = avg.compute_L1_averages(L1_path, biom_file, tree_file, metadata_file, tax_file, report_file, most_shared=True)
			diff.generate_diffab(L1_region_names, L1_inverse_pushed, Tint, lint, nodes_in_order, L1_tax_arr, 'L1' + output_file, 0.000005, 10, True, 1)
		if unifrac_code == 0 or unifrac_code == 1:
			L2_region_names, L2_tax_arr, _, L2_inverse_pushed, _, _, _ = avg.compute_L2_averages(L2_path, biom_file, tree_file, metadata_file, tax_file, report_file, most_shared=True)
			diff.generate_diffab(L2_region_names, L2_inverse_pushed, Tint, lint, nodes_in_order, L2_tax_arr, 'L2' + output_file, 0.000005, 10, True, 2)
	finally:
		# Remove the temporary files even when a stage above raised.
		for leftover in (tmp_L1, tmp_L2):
			if path.exists(leftover):
				os.remove(leftover)
Exemple #7
0
def Group_Pairwise(biom_file,
                   tree_file,
                   metadata_file,
                   group_num,
                   output_file=None,
                   debug=0,
                   max_cores=int(mp.cpu_count() / 4)):
    """Compute a pairwise distance matrix for one ``body_site`` group.

    Samples are grouped by the ``'body_site'`` value of their metadata entry,
    in order of first appearance in the metadata. One index pair per pair of
    group members is then mapped through ``unifrac_work_wrapper`` on a
    ``multiprocessing`` pool.

    Args:
        biom_file: path passed to ``BW.extract_biom`` / ``BW.extract_samples``.
        tree_file: path passed to ``L2U.parse_tree_file``.
        metadata_file: path passed to ``meta.extract_metadata``.
        group_num: 1-based position of the group to process.
        output_file: when not ``None``, rows are written to this path with
            its last four characters replaced by ``-<group>.csv``.
        debug: ``1`` limits the group to its first 64 samples and prints
            shallow local sizes plus the per-pair messages.
        max_cores: requested worker count. Values of 1 or less, or above the
            CPU count, fall back to "all CPUs but one".

    Returns:
        A list of rows of distances for the selected group.
    """
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples

    if max_cores > mp.cpu_count() or max_cores <= 1:
        # Leave one CPU free, but never ask for zero workers: on a single-CPU
        # host cpu_count() - 1 is 0 and mp.Pool(processes=0) raises ValueError.
        cores = max(1, mp.cpu_count() - 1)
    else:
        cores = max_cores

    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples,
                                                    nodes_in_order)

    PCoA_Samples = BW.extract_samples(biom_file)

    # Callers pass a 1-based group number; lists below are 0-based.
    group_num -= 1
    metadata = meta.extract_metadata(metadata_file)
    # Local names up to the debug dump are printed by it, so they are left
    # exactly as they were (including the unused sample_groups).
    sample_groups = []
    groups_temp = list(metadata.values())
    groups = []
    for i in range(len(groups_temp)):
        if groups_temp[i]['body_site'] not in groups:
            groups.append(groups_temp[i]['body_site'])
    print(groups)

    sample_sites = [[] for i in range(len(groups))]

    # Separate the groups
    for i in range(len(PCoA_Samples)):
        for j in range(len(groups)):
            if metadata[PCoA_Samples[i]]['body_site'] == groups[j]:
                sample_sites[j].append(PCoA_Samples[i])
    print(sample_sites)

    if debug == 1:
        print(f"Running Debugging Multiprocess on {cores} Cores...")

        # Testing subset of samples...
        sample_sites[group_num] = sample_sites[group_num][:64]

        local_vars = list(locals().items())
        for var, obj in local_vars:
            print(f"{var.ljust(17)}: {sys.getsizeof(obj)}")

    # Multi Core Method
    member_count = len(sample_sites[group_num])
    # NOTE(review): these index pairs are positions within
    # sample_sites[group_num], while the unifrac_worker visible in this
    # source indexes the global PCoA_Samples, which still holds every sample
    # at this point. If unifrac_work_wrapper delegates to that worker, the
    # distances are for the first `member_count` samples overall, not for
    # this group. Confirm against the wrapper.
    row = [(i, j) for j in range(member_count) for i in range(member_count)]

    with mp.Pool(processes=cores) as pool:
        result = pool.map(unifrac_work_wrapper, row)

    result_matrix = []
    for i in range(member_count):
        dist_list = []
        for entry in result[i * member_count:(i + 1) * member_count]:
            dist_list.append(entry[0])
            if debug == 1:
                print(entry[1])
        result_matrix.append(dist_list)
        if output_file is not None:
            CSV.write(output_file[:-4] + "-" + groups[group_num] + '.csv',
                      dist_list)
    return result_matrix
Exemple #8
0
def unifrac_worker(samp1num, samp2num):
    """Compute the distance between two samples addressed by position.

    Both positions index the module-level ``PCoA_Samples``; the tree data and
    the abundance vectors are likewise read from module globals.

    Returns:
        A ``(distance, message)`` pair, where ``message`` is a one-line
        progress string with the distance printed to 16 decimals.
    """
    first_sample = PCoA_Samples[samp1num]
    second_sample = PCoA_Samples[samp2num]
    distance = L2U.L2Unifrac_weighted_plain(T1, l1, nodes_in_order,
                                            nodes_weighted[first_sample],
                                            nodes_weighted[second_sample])
    padded_index = str(samp2num).zfill(4)
    message = (f"\tInner loop: {padded_index} | L2-UniFrac: {distance:.16f}"
               f" | Sample 1: {first_sample} | Sample 2: {second_sample}")
    return distance, message
Exemple #9
0
def _nearest_labeled_taxonomy(index, T1, nodes_in_order, taxonomies):
	"""Return the taxonomy of node ``index``, or of its closest labeled ancestor.

	Nodes whose name starts with ``'t'`` are treated as unlabeled: the parent
	map ``T1`` is followed upwards until a labeled node is found. When the
	chain ends without one, ``'internal'`` is returned.
	"""
	if nodes_in_order[index][0] != 't':
		return taxonomies[int(nodes_in_order[index])]
	node = index
	while node in T1:
		node = T1[node]
		if nodes_in_order[node][0] != 't':
			return taxonomies[int(nodes_in_order[node])]
	return 'internal'


def _rows_as_vectors(matrix, row_indices, loaded_from_file):
	"""Return the first row of each selected matrix entry as an array."""
	if loaded_from_file:
		return [np.array(matrix[r].todense())[0] for r in row_indices]
	return [np.array(matrix[r])[0] for r in row_indices]


def _invert_group_averages(inverse_push_up, region_names, group_averages, T1, l1, nodes_in_order, emit):
	"""Map every group average back down, printing and emitting each result.

	Returns ``(inverse_by_region, negative_counts)``; a value counts as
	negative when it is below the module-level ``negatives_filtering_threshold``.
	"""
	negative_counts = []
	inverse_by_region = {}
	for name in region_names:
		inverse = inverse_push_up(group_averages[name], T1, l1, nodes_in_order)
		inverse_by_region[name] = inverse
		negative_counts.append(sum(1 for value in inverse if value < negatives_filtering_threshold))
		padded_name = "{:<15}".format(name+":")
		print(f"{padded_name} {inverse}")
		emit(inverse)
	return inverse_by_region, negative_counts


def _abundance_by_node_type(abundance_vector, tax_arr):
	"""Sum abundances by the first letter of the second-to-last taxonomy field.

	Returns ``[k, p, c, o, f, g, s, temp]``; ``temp`` collects nodes whose
	taxonomy string has no ``';'``. An unrecognised letter prints ``Error``
	and is left out of every total.
	"""
	letters = 'kpcofgs'
	totals = [0] * (len(letters) + 1)
	for i, amount in enumerate(abundance_vector):
		node_tax = tax_arr[i].split(';')
		if len(node_tax) > 1:
			slot = letters.find(node_tax[-2][0])
			if slot < 0:
				print("Error")
			else:
				totals[slot] += amount
		else:
			totals[-1] += amount
	return totals


def compute_L1_L2_averages(L1_file, L2_file, biom_file, tree_file, metadata_file, tax_file, output_file=None):
	"""Average pushed-up samples per ``body_site`` region for both L1 and L2.

	Args:
		L1_file, L2_file: either a path handed to ``CSV.read_sparse`` or an
			already loaded list that is used as-is.
		biom_file: path passed to ``BW.extract_samples``.
		tree_file: path passed to ``L2U.parse_tree_file``.
		metadata_file: path passed to ``meta.extract_metadata``.
		tax_file: path passed to ``tax.extract_tax``.
		output_file: when not ``None``, any existing file is removed first and
			every printed section is also passed to ``CSV.write``.

	Returns:
		A 12-tuple: region names, per-node taxonomy list, L1 group averages,
		L2 group averages, L1 inverse-pushed vectors, L2 inverse-pushed
		vectors, L1 negative counts, L2 negative counts, L1 distance matrix,
		L2 distance matrix, L1 abundances by node type, L2 abundances by node
		type.
	"""
	if output_file is not None and path.exists(output_file):
		os.remove(output_file)

	def emit(row):
		# Every report line goes through here so the "is a report wanted"
		# check lives in one place.
		if output_file is not None:
			CSV.write(output_file, row)

	# Note: these are the same for L1/L2, so they will be computed only once. (USE T1 FOR ANCESTORS FOR TEMP NODES)
	T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
	PCoA_Samples = BW.extract_samples(biom_file)
	metadata = meta.extract_metadata(metadata_file)

	# Extract region names and map samples to regions
	region_names = []
	region_map = {}
	for i in range(len(PCoA_Samples)):
		region = metadata[PCoA_Samples[i]]['body_site']
		if region not in region_names:
			region_map[region] = []
			region_names.append(region)
		region_map[region].append(i)
		# Kept from the original: the sample list is overwritten in place
		# with region indices, although nothing below reads it.
		PCoA_Samples[i] = region_names.index(region)

	# Read sparse matrices
	L1_from_file = not isinstance(L1_file, list)
	sparse_matrix_L1 = CSV.read_sparse(L1_file) if L1_from_file else L1_file
	L2_from_file = not isinstance(L2_file, list)
	sparse_matrix_L2 = CSV.read_sparse(L2_file) if L2_from_file else L2_file

	# Store region names for later
	emit(region_names)

	# Write taxas for cell
	taxonomies = tax.extract_tax(tax_file)
	tax_arr = [_nearest_labeled_taxonomy(i, T1, nodes_in_order, taxonomies) for i in range(len(nodes_in_order))]
	emit(tax_arr)

	# Take L1 average of each
	group_averages_L1 = {}
	L1_pushed_arr = []
	for name in region_names:
		average = L1U.median_of_vectors(_rows_as_vectors(sparse_matrix_L1, region_map[name], L1_from_file))
		group_averages_L1[name] = average
		L1_pushed_arr.append(average)

	# Store L1 averages
	print("L1 Group Averages:")
	emit(["L1 Group Averages:"])
	for name in region_names:
		padded_name = "{:<15}".format(name+":")
		print(f"{padded_name} {group_averages_L1[name]}")
		emit(group_averages_L1[name])

	# Take L2 average of each
	group_averages_L2 = {}
	L2_pushed_arr = []
	for name in region_names:
		average = L2U.mean_of_vectors(_rows_as_vectors(sparse_matrix_L2, region_map[name], L2_from_file))
		group_averages_L2[name] = average
		L2_pushed_arr.append(average)

	# Store L2 averages
	print("\nL2 Group Averages:")
	emit(["L2 Group Averages:"])
	for name in region_names:
		padded_name = "{:<15}".format(name+":")
		print(f"{padded_name} {group_averages_L2[name]}")
		emit(group_averages_L2[name])

	# Push L1 down and store
	print("\nL1 Inverse Push Up:")
	emit(["L1 Inverse Pushed Up:"])
	L1_inverse_pushed, L1_neg_arr = _invert_group_averages(L1U.inverse_push_up, region_names, group_averages_L1, T1, l1, nodes_in_order, emit)

	# Push L2 down and store
	print("\nL2 Inverse Push Up:")
	emit(["L2 Inverse Pushed Up:"])
	L2_inverse_pushed, L2_neg_arr = _invert_group_averages(L2U.inverse_push_up, region_names, group_averages_L2, T1, l1, nodes_in_order, emit)

	# Write negative counts
	print("L1 and L2 Negatives by Group:")
	print(L1_neg_arr)
	print(L2_neg_arr)
	emit(["L1 and L2 Negatives by Group:"])
	emit(L1_neg_arr)
	emit(L2_neg_arr)

	L1_distance_matrix = compute_pairwise_pushed_L1(L1_pushed_arr)
	L2_distance_matrix = compute_pairwise_pushed_L2(L2_pushed_arr)

	print("L1 Distance Matrix:")
	emit(["L1 Distance Matrix:"])
	for i in range(len(L1_pushed_arr)):
		print(L1_distance_matrix[i])
		emit(L1_distance_matrix[i])

	print("L2 Distance Matrix:")
	emit(["L2 Distance Matrix:"])
	for i in range(len(L2_pushed_arr)):
		print(L2_distance_matrix[i])
		emit(L2_distance_matrix[i])

	print("L1 Abundances by Node Type:")
	emit(["L1 Abundances by Node Type:"])
	L1_node_type_group_abundances = []
	for name in region_names:
		totals = _abundance_by_node_type(L1_inverse_pushed[name], tax_arr)
		print(totals)
		emit(totals)
		L1_node_type_group_abundances.append(totals)

	print("L2 Abundances by Node Type:")
	emit(["L2 Abundances by Node Type:"])
	L2_node_type_group_abundances = []
	for name in region_names:
		totals = _abundance_by_node_type(L2_inverse_pushed[name], tax_arr)
		print(totals)
		emit(totals)
		L2_node_type_group_abundances.append(totals)

	return region_names, tax_arr, group_averages_L1, group_averages_L2, L1_inverse_pushed, L2_inverse_pushed, L1_neg_arr, L2_neg_arr, L1_distance_matrix, L2_distance_matrix, L1_node_type_group_abundances, L2_node_type_group_abundances
Exemple #10
0
def L2_pushup_worker(sample_num):
    """Return ``L2U.push_up`` of the sample at position ``sample_num``.

    The sample name is looked up in the module-level ``PCoA_Samples``; its
    vector and the tree data also come from module globals.
    """
    sample_name = PCoA_Samples[sample_num]
    return L2U.push_up(nodes_weighted[sample_name], T1, l1, nodes_in_order)
def test_push_up():
    """Check that the norm of a push_up difference equals the plain distance.

    Two toy configurations are used, differing only in the ``sample1`` count
    on node ``A``, followed by one pair of real samples.
    """
    tolerance = 10**-12

    for count_on_A in (0, 1):
        tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
        (T1, l1, nodes1) = L2U.parse_tree(tree_str)
        nodes_samples = {
            'C': {
                'sample1': 1,
                'sample2': 0
            },
            'B': {
                'sample1': 1,
                'sample2': 1
            },
            'A': {
                'sample1': count_on_A,
                'sample2': 0
            },
            'temp0': {
                'sample1': 0,
                'sample2': 1
            }
        }  # temp0 is the root node
        (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
        pushed_first = L2U.push_up(nodes_weighted['sample1'], T1, l1, nodes1)
        pushed_second = L2U.push_up(nodes_weighted['sample2'], T1, l1, nodes1)
        unifrac2 = np.linalg.norm(pushed_first - pushed_second)
        #calculated using L2Unifrac
        L2_UniFrac = L2U.L2Unifrac_weighted_plain(T1, l1, nodes1,
                                                  nodes_weighted['sample1'],
                                                  nodes_weighted['sample2'])
        print(unifrac2, L2_UniFrac)
        assert np.abs(unifrac2 - L2_UniFrac) < tolerance

    #test with real data
    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    pushed_P = L2U.push_up(P, Tint, lint, nodes_in_order)
    pushed_Q = L2U.push_up(Q, Tint, lint, nodes_in_order)
    unifrac2 = np.linalg.norm(pushed_P - pushed_Q)
    #calculated using L2Unifrac
    L2_UniFrac2 = L2U.L2Unifrac_weighted_plain(Tint, lint, nodes_in_order, P,
                                               Q)
    print(unifrac2, L2_UniFrac2)
    assert np.abs(unifrac2 - L2_UniFrac2) < tolerance
def test_summation():
    """Check that an averaged, pushed-back vector still sums to one.

    Samples are pushed up, averaged with ``L2U.mean_of_vectors`` and mapped
    back with ``L2U.inverse_push_up``. This is done for two toy
    configurations, differing only in the ``sample1`` count on node ``A``,
    and for eight real samples.

    The deviation from 1 is checked in absolute value. The signed form
    ``1 - total < tol`` would accept any total above 1.
    """
    tolerance = 10**-12

    for count_on_A in (0, 1):
        tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
        (T1, l1, nodes) = L2U.parse_tree(tree_str)
        nodes_samples = {
            'C': {
                'sample1': 1,
                'sample2': 0
            },
            'B': {
                'sample1': 1,
                'sample2': 1
            },
            'A': {
                'sample1': count_on_A,
                'sample2': 0
            },
            'temp0': {
                'sample1': 0,
                'sample2': 1
            }
        }  # temp0 is the root node
        (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes)
        push_up_1 = L2U.push_up(nodes_weighted['sample1'], T1, l1, nodes)
        push_up_2 = L2U.push_up(nodes_weighted['sample2'], T1, l1, nodes)
        push_up_avg = L2U.mean_of_vectors([push_up_1, push_up_2])
        push_down_avg = L2U.inverse_push_up(push_up_avg, T1, l1, nodes)
        assert abs(1 - sum(push_down_avg)) < tolerance

    #test with real data
    sample_ids = [
        '232.M9Okey217',
        '232.M3Indl217',
        '232.L3Space217',
        '232.M9Vkey217',
        '232.M2Jkey217',
        '232.M2Mkey217',
        '232.M3Rinl217',
        '232.M3Midl217',
    ]
    real_samples = [env_prob_dict[sample_id] for sample_id in sample_ids]
    pushed = [
        L2U.push_up(sample, Tint, lint, nodes_in_order)
        for sample in real_samples
    ]
    push_up_avg = L2U.mean_of_vectors(pushed)
    push_down_avg = L2U.inverse_push_up(push_up_avg, Tint, lint,
                                        nodes_in_order)
    assert abs(1 - sum(push_down_avg)) < tolerance
def test_weighted():
    """Check that the two weighted distance functions return the same value.

    ``L2U.L2Unifrac_weighted_plain`` is compared with the first element
    returned by ``L2U.L2Unifrac_weighted``. Two toy configurations are used,
    differing only in the ``sample1`` count on node ``A``, followed by one
    pair of real samples.
    """
    tolerance = 10**-12

    for count_on_A in (0, 1):
        tree_str = '((B:0.1,C:0.2)A:0.3);'  # there is an internal node (temp0) here.
        (T1, l1, nodes1) = L2U.parse_tree(tree_str)
        nodes_samples = {
            'C': {
                'sample1': 1,
                'sample2': 0
            },
            'B': {
                'sample1': 1,
                'sample2': 1
            },
            'A': {
                'sample1': count_on_A,
                'sample2': 0
            },
            'temp0': {
                'sample1': 0,
                'sample2': 1
            }
        }  # temp0 is the root node
        (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples, nodes1)
        first = nodes_weighted['sample1']
        second = nodes_weighted['sample2']
        unifrac2 = L2U.L2Unifrac_weighted_plain(T1, l1, nodes1, first, second)
        #calculated using L2Unifrac
        L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
            T1, l1, nodes1, first, second)
        print(unifrac2, L2_UniFrac)
        assert np.abs(unifrac2 - L2_UniFrac) < tolerance

    P = env_prob_dict['232.M9Okey217']
    Q = env_prob_dict['232.M3Indl217']
    unifrac2 = L2U.L2Unifrac_weighted_plain(Tint, lint, nodes_in_order, P, Q)
    #calculated using L2Unifrac
    L2_UniFrac2, DifferentialAbundance = L2U.L2Unifrac_weighted(
        Tint, lint, nodes_in_order, P, Q)
    print(unifrac2, L2_UniFrac2)
    assert np.abs(unifrac2 - L2_UniFrac2) < tolerance
Exemple #14
0
def _record_nonzero_entries(vectors, output_file, entries):
    """Append ``[row, column, value]`` for every non-zero value in ``vectors``.

    Each triple is added to ``entries`` and, when ``output_file`` is not
    ``None``, also passed to ``CSV.write``.
    """
    for i, vector in enumerate(vectors):
        for j, value in enumerate(vector):
            if value != 0:
                if output_file is not None:
                    CSV.write(output_file, [i, j, value])
                entries.append([i, j, value])


def generate_preprocessed(biom_file,
                          tree_file,
                          unifrac_code,
                          output_file_L1=None,
                          output_file_L2=None,
                          max_cores=int(mp.cpu_count() / 4)):
    """Push up every sample and return the results as sparse triples.

    The tree and per-sample abundances are parsed once and stored in the
    module globals ``T1``, ``l1``, ``nodes_in_order``, ``nodes_weighted`` and
    ``PCoA_Samples``. The samples are then mapped through
    ``L1_pushup_worker`` and/or ``L2_pushup_worker`` on a ``multiprocessing``
    pool.

    Args:
        biom_file: path passed to ``BW.extract_biom`` / ``BW.extract_samples``.
        tree_file: path passed to ``L2U.parse_tree_file``.
        unifrac_code: ``0`` computes L2 only, ``2`` L1 only, ``1`` both.
        output_file_L1: when not ``None``, L1 output is also passed to
            ``CSV.write`` with this path.
        output_file_L2: same, for L2 output.
        max_cores: requested worker count. Values of 1 or less, or above the
            CPU count, fall back to "all CPUs but one".

    Returns:
        ``(L1_preprocessed, L2_preprocessed)``. Each computed list starts
        with ``[sample_count, vector_length]`` followed by
        ``[row, column, value]`` triples; a list that was not requested is
        ``[]``. For a ``unifrac_code`` other than 0, 1 or 2 nothing is
        returned.
    """
    global T1
    global l1
    global nodes_in_order
    global nodes_weighted
    global PCoA_Samples

    if max_cores > mp.cpu_count() or max_cores <= 1:
        # Leave one CPU free, but never ask for zero workers: on a single-CPU
        # host cpu_count() - 1 is 0 and mp.Pool(processes=0) raises ValueError.
        cores = max(1, mp.cpu_count() - 1)
    else:
        cores = max_cores

    # Note: these are the same for L1/L2, so they will be computed only once.
    nodes_samples = BW.extract_biom(biom_file)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    (nodes_weighted, samples_temp) = L2U.parse_envs(nodes_samples,
                                                    nodes_in_order)

    PCoA_Samples = BW.extract_samples(biom_file)

    wants_L1 = unifrac_code == 1 or unifrac_code == 2
    wants_L2 = unifrac_code == 0 or unifrac_code == 1

    # Multi Core Method
    L1_preprocessed = []
    L2_preprocessed = []

    values = range(len(PCoA_Samples))

    dim1 = len(PCoA_Samples)
    dim2 = len(L1_pushup_worker(0))

    # `and` binds tighter than `or`, so the unparenthesised form
    # "file is not None and code == 1 or code == 2" wrote the header even
    # when the file was None. The file check must guard both codes.
    if output_file_L1 is not None and wants_L1:
        CSV.write(output_file_L1, [dim1, dim2])
    L1_preprocessed.append([dim1, dim2])
    if output_file_L2 is not None and wants_L2:
        CSV.write(output_file_L2, [dim1, dim2])
    L2_preprocessed.append([dim1, dim2])

    if wants_L1:
        with mp.Pool(processes=cores) as pool:
            result = pool.map(L1_pushup_worker, values)  #chunksize?
        _record_nonzero_entries(result, output_file_L1, L1_preprocessed)

    if wants_L2:
        with mp.Pool(processes=cores) as pool:
            result = pool.map(L2_pushup_worker, values)
        _record_nonzero_entries(result, output_file_L2, L2_preprocessed)

    if unifrac_code == 0:
        return [], L2_preprocessed
    if unifrac_code == 2:
        return L1_preprocessed, []
    if unifrac_code == 1:
        return L1_preprocessed, L2_preprocessed
Exemple #15
0
# Leading characters of the taxonomy entries that are tallied per rank, in the
# order they are concatenated/reported. Presumably Greengenes-style prefixes
# (k__, p__, c__, o__, f__, g__, s__) -- TODO confirm against the tax file.
_RANK_PREFIXES = 'kpcofgs'


def _shared_taxonomy(descendent_taxonomy):
    """Return the taxonomy entries common to every lineage string given.

    Each lineage is split on ';' (the text after the last ';' is ignored).
    Entries are counted per leading character; for each prefix in
    ``_RANK_PREFIXES`` the first entry seen in as many lineages as there are
    lineages is appended to the result. The result always ends with ';' and is
    'Root;' when nothing is shared.
    """
    counts_by_rank = {prefix: {} for prefix in _RANK_PREFIXES}
    for lineage in descendent_taxonomy:
        for taxon in lineage.split(';')[:-1]:
            rank_counts = counts_by_rank.get(taxon[0])
            if rank_counts is not None:
                rank_counts[taxon] = rank_counts.get(taxon, 0) + 1

    n_lineages = len(descendent_taxonomy)
    shared_taxonomy = ''
    for prefix in _RANK_PREFIXES:
        for taxon, count in counts_by_rank[prefix].items():
            if count == n_lineages:
                # Only the 'k' entry is written without a leading separator.
                shared_taxonomy += taxon if prefix == 'k' else ';' + taxon
                break
    shared_taxonomy += ';'
    if shared_taxonomy == ';':
        # Root node will include taxonomy from Archaea and Bacteria, thus
        # sharing nothing.
        return 'Root;'
    return shared_taxonomy


def _most_shared_tax_arr(T1, nodes_in_order, taxonomies):
    """Label every node with the taxonomy shared by all leaves beneath it.

    A node is treated as a leaf when its name does not start with 't'.
    Nodes without any recorded leaf descendant get their own taxonomy entry.
    """
    leaf_nodes = [i for i, node in enumerate(nodes_in_order) if node[0] != 't']
    descendent_dict = {i: [] for i in range(len(nodes_in_order))}
    for leaf in leaf_nodes:
        # Follow the child -> parent map upwards and register the leaf with
        # every ancestor on the way, including the topmost one.
        node = leaf
        while node in T1:
            node = T1[node]
            descendent_dict[node].append(leaf)

    tax_arr = []
    for i, node in enumerate(nodes_in_order):
        descendent_taxonomy = [
            taxonomies[int(nodes_in_order[leaf])] for leaf in descendent_dict[i]
        ]
        if descendent_taxonomy:
            tax_arr.append(_shared_taxonomy(descendent_taxonomy))
        else:
            tax_arr.append(taxonomies[int(node)])
    return tax_arr


def _nearest_ancestor_tax_arr(T1, nodes_in_order, taxonomies):
    """Label every node with its own or its nearest named ancestor's taxonomy.

    Nodes whose name does not start with 't' use their own taxonomy entry.
    The other nodes walk up ``T1`` until an ancestor with such a name is
    found; if the walk runs out of parents the label is 'internal'.
    """
    tax_arr = []
    for i, node in enumerate(nodes_in_order):
        if node[0] != 't':
            tax_arr.append(taxonomies[int(node)])
            continue
        label = 'internal'
        ancestor = i
        while ancestor in T1:
            ancestor = T1[ancestor]
            if nodes_in_order[ancestor][0] != 't':
                label = taxonomies[int(nodes_in_order[ancestor])]
                break
        tax_arr.append(label)
    return tax_arr


def _abundances_by_rank(region_abundance_vector, tax_arr):
    """Sum an abundance vector by the rank prefix of each node's last entry.

    Returns ``[k, p, c, o, f, g, s, temp]``. ``temp`` collects nodes whose
    label contains no ';' (e.g. 'internal').
    """
    totals = dict.fromkeys(_RANK_PREFIXES, 0)
    temp = 0
    for i in range(len(region_abundance_vector)):
        node_tax = tax_arr[i].split(';')
        if len(node_tax) > 1:
            prefix = node_tax[-2][0]
            if prefix in totals:
                totals[prefix] += region_abundance_vector[i]
            else:
                # NOTE(review): a 'Root;' label ends up here (prefix 'R'), so
                # its abundance is reported as "Error" and not accumulated.
                # Confirm that this is intended.
                print("Error")
        else:
            temp += region_abundance_vector[i]
    return [totals[prefix] for prefix in _RANK_PREFIXES] + [temp]


def compute_L2_averages(L2_file, biom_file, tree_file, metadata_file, tax_file, output_file=None, most_shared=False):
    """Average pushed-up L2 vectors per body site and report derived statistics.

    Samples are grouped by ``metadata[sample]['body_site']``. For every group
    the mean vector is computed with ``L2U.mean_of_vectors``, pushed back down
    with ``L2U.inverse_push_up``, and summarised. Every section is printed
    and, when ``output_file`` is given, appended to it through ``CSV.write``.

    Args:
        L2_file: Either a list that is indexed directly by sample position, or
            any other value, which is passed to ``CSV.read_sparse``
            (presumably a file path -- TODO confirm).
        biom_file: Passed to ``BW.extract_samples`` to obtain the samples.
        tree_file: Passed to ``L2U.parse_tree_file``.
        metadata_file: Passed to ``meta.extract_metadata``.
        tax_file: Passed to ``tax.extract_tax``.
        output_file: Optional report destination. An existing file at this
            path is removed first.
        most_shared: When true, each node is labelled with the taxonomy
            shared by all leaves below it; otherwise internal nodes take the
            label of their nearest named ancestor.

    Returns:
        A 7-tuple ``(region_names, tax_arr, group_averages_L2,
        L2_inverse_pushed, L2_neg_arr, L2_distance_matrix,
        L2_node_type_group_abundances)``.
    """
    if output_file is not None and path.exists(output_file):
        os.remove(output_file)

    def write_row(row):
        # Single place for the optional CSV output.
        if output_file is not None:
            CSV.write(output_file, row)

    # Note: these are the same for L1/L2, so they will be computed only once.
    # (T1 is used to find ancestors of temp nodes.)
    T1, l1, nodes_in_order = L2U.parse_tree_file(tree_file)
    # NOTE: debugging idea kept from the author -- subsample the BIOM file
    # (2 samples), then trace the mass to find where it no longer sums to 1.
    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)

    # Extract region names (first-seen order) and map each region to the
    # positions of its samples.
    region_names = []
    region_map = {}
    for i, sample in enumerate(PCoA_Samples):
        body_site = metadata[sample]['body_site']
        if body_site not in region_map:
            region_map[body_site] = []
            region_names.append(body_site)
        region_map[body_site].append(i)

    # Read sparse matrix unless the rows were handed over directly.
    preloaded = isinstance(L2_file, list)
    sparse_matrix_L2 = L2_file if preloaded else CSV.read_sparse(L2_file)

    # Store region names for later
    write_row(region_names)

    # Taxonomy label for every node, in nodes_in_order order.
    taxonomies = tax.extract_tax(tax_file)
    if most_shared:
        tax_arr = _most_shared_tax_arr(T1, nodes_in_order, taxonomies)
    else:
        tax_arr = _nearest_ancestor_tax_arr(T1, nodes_in_order, taxonomies)
    write_row(tax_arr)

    # Take L2 average of each region.
    group_averages_L2 = {}
    L2_pushed_arr = []
    for name in region_names:
        if preloaded:
            group_arr = [
                np.array(sparse_matrix_L2[row])[0] for row in region_map[name]
            ]
        else:
            group_arr = [
                np.array(sparse_matrix_L2[row].todense())[0]
                for row in region_map[name]
            ]
        average = L2U.mean_of_vectors(group_arr)
        group_averages_L2[name] = average
        L2_pushed_arr.append(average)

    # Store L2 averages
    print("\nL2 Group Averages:")
    write_row(["L2 Group Averages:"])
    for name in region_names:
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {group_averages_L2[name]}")
        write_row(group_averages_L2[name])

    # Push L2 down and store
    print("\nL2 Inverse Push Up:")
    write_row(["L2 Inverse Pushed Up:"])
    L2_neg_arr = []
    L2_inverse_pushed = {}
    for name in region_names:
        mean_inverse = L2U.inverse_push_up(group_averages_L2[name], T1, l1, nodes_in_order)
        L2_inverse_pushed[name] = mean_inverse
        # negatives_filtering_threshold is not defined in this function; it is
        # expected to exist at module level.
        neg_count = sum(
            1 for value in mean_inverse if value < negatives_filtering_threshold
        )
        L2_neg_arr.append(neg_count)
        padded_name = "{:<15}".format(name + ":")
        print(f"{padded_name} {mean_inverse}")
        write_row(mean_inverse)

    # Write negative counts
    print("L2 Negatives by Group:")
    print(L2_neg_arr)
    write_row(["L2 Negatives by Group:"])
    write_row(L2_neg_arr)

    L2_distance_matrix = compute_pairwise_pushed_L2(L2_pushed_arr)

    print("L2 Distance Matrix:")
    write_row(["L2 Distance Matrix:"])
    for i in range(len(L2_pushed_arr)):
        print(L2_distance_matrix[i])
        write_row(L2_distance_matrix[i])

    print("L2 Abundances by Node Type:")
    write_row(["L2 Abundances by Node Type:"])
    L2_node_type_group_abundances = []
    for name in region_names:
        abundances = _abundances_by_rank(L2_inverse_pushed[name], tax_arr)
        print(abundances)
        write_row(abundances)
        L2_node_type_group_abundances.append(abundances)

    return region_names, tax_arr, group_averages_L2, L2_inverse_pushed, L2_neg_arr, L2_distance_matrix, L2_node_type_group_abundances
import sys
sys.path.append('../L2Unifrac')
sys.path.append('../L2Unifrac/src')
sys.path.append('../src')
import L2Unifrac as L2U
import numpy as np
from scipy.sparse import csc_matrix

def _load_old_unifrac_fixtures():
    """Parse the reference tree and build the environment dict for the tests."""
    parsed_tree = L2U.parse_tree_file(
        '../data/old_UniFrac/97_otus_unannotated.tree')
    environments = L2U.create_env('../data/old_UniFrac/289_seqs_otus.txt')
    return parsed_tree, environments


try:
    (Tint, lint, nodes_in_order), env_dict = _load_old_unifrac_fixtures()
except FileNotFoundError:
    # NOTE(review): the fallback loads exactly the same paths as the first
    # attempt, so it cannot recover from a missing file. A different relative
    # path was presumably intended here -- confirm and adjust.
    (Tint, lint, nodes_in_order), env_dict = _load_old_unifrac_fixtures()
# Per-sample data derived from the environments, ordered like nodes_in_order.
(env_prob_dict, samples) = L2U.parse_envs(env_dict, nodes_in_order)


#test parse_tree
def test_parse_tree():
    """Check the parent map, edge lengths and node order for a tiny tree."""
    newick = '((B:0.1,C:0.2)A:0.3);'
    parent_of, edge_length, node_order = L2U.parse_tree(newick)

    expected_parent_of = {0: 2, 1: 2, 2: 3}
    expected_edge_length = {(1, 2): 0.1, (2, 3): 0.3, (0, 2): 0.2}
    # temp0 is the root node
    expected_node_order = ['C', 'B', 'A', 'temp0']

    assert parent_of == expected_parent_of
    assert edge_length == expected_edge_length
    assert node_order == expected_node_order


#test push_up and inverse_push_up
def test_inverse():
Exemple #17
0
def _collapse_diffab_by_taxonomy(DifferentialAbundance, taxonomy_in_order):
    """Merge differential abundances of edges whose child shares a taxonomy.

    The values of all ``(child, parent)`` edges with the same
    ``taxonomy_in_order[child]`` are summed. The sum is assigned to the first
    such edge (in the mapping's iteration order) and the remaining edges of
    that taxonomy are dropped. Taxonomies whose sum is 0 are dropped entirely.
    """
    totals = {}
    for (child, _parent), diff in DifferentialAbundance.items():
        taxonomy = taxonomy_in_order[child]
        totals[taxonomy] = totals.get(taxonomy, 0) + diff

    collapsed = {}
    for child, parent in DifferentialAbundance:
        taxonomy = taxonomy_in_order[child]
        diff_sum = totals[taxonomy]
        if diff_sum != 0:
            collapsed[(child, parent)] = diff_sum
            # Zero the total so later edges of this taxonomy are skipped.
            totals[taxonomy] = 0
    return collapsed


def generate_diffab(regions,
                    region_averages,
                    Tint,
                    lint,
                    nodes_in_order,
                    taxonomy_in_order,
                    output,
                    thresh,
                    maxDisp,
                    includeTemp,
                    L,
                    include_tmp_diffab=False):
    """Plot and save the differential abundance for every pair of regions.

    For each pair ``i < j`` the weighted UniFrac routine is run on
    ``region_averages[regions[i]]`` and ``region_averages[regions[j]]``, the
    resulting differential abundances are merged per taxonomy, plotted with
    ``L2U.plot_diffab`` and saved as
    ``images/<output>_diffab_<region i>_<region j>.png``. The ``images``
    directory is not created here.

    Args:
        regions: Region names; indexed by position to key ``region_averages``.
        region_averages: Mapping from region name to its averaged vector. Its
            length determines how many regions are paired.
        Tint, lint, nodes_in_order: Tree description forwarded to the UniFrac
            routine.
        taxonomy_in_order: Taxonomy label per node index.
        output: Prefix of the saved image file names.
        thresh, maxDisp, includeTemp: Forwarded to ``L2U.plot_diffab``.
        L: ``1`` selects ``L1U.EMDUnifrac_weighted``; any other value selects
            ``L2U.L2Unifrac_weighted``.
        include_tmp_diffab: Forwarded to the UniFrac routine.

    Returns:
        None. With fewer than two regions nothing is computed or saved.
    """
    n_regions = len(region_averages)
    for i in range(n_regions):
        for j in range(i + 1, n_regions):
            # Both norms share the same call signature and post-processing;
            # only the routine differs.
            weighted_unifrac = (L1U.EMDUnifrac_weighted
                                if L == 1 else L2U.L2Unifrac_weighted)
            _, DifferentialAbundance = weighted_unifrac(
                Tint,
                lint,
                nodes_in_order,
                region_averages[regions[i]],
                region_averages[regions[j]],
                include_tmp_diffab=include_tmp_diffab)
            newDifferentialAbundance = _collapse_diffab_by_taxonomy(
                DifferentialAbundance, taxonomy_in_order)
            # NOTE(review): the plot is never closed after saving; if
            # plot_diffab opens a new figure per call they accumulate.
            L2U.plot_diffab(nodes_in_order,
                            taxonomy_in_order,
                            newDifferentialAbundance,
                            regions[i],
                            regions[j],
                            plot_zeros=False,
                            thresh=thresh,
                            show=False,
                            maxDisp=maxDisp,
                            includeTemp=includeTemp)
            plt.savefig('images/{0}_diffab_{1}_{2}.png'.format(
                output, regions[i], regions[j]))
Exemple #18
0
computations.
"""

import sys
sys.path.append('../L2Unifrac')
sys.path.append('../L2Unifrac/src')
sys.path.append('../src')
sys.path.append('../scripts')
import L2Unifrac as L2U
import averages as avg
import TaxWrapper as tax
import numpy as np

# Compute diffab
(Tint, lint, nodes_in_order
 ) = L2U.parse_tree_file('../data/trees/gg_13_5_otus_99_annotated.tree')
L2_region_names, L2_tax_arr, L2_group_averages, L2_inverse_pushed, L2_neg_arr, L2_distance_matrix, L2_node_type_group_abundances = avg.compute_L2_averages(
    '../scripts/L2-Push-Out.csv', '../data/47422_otu_table.biom',
    '../data/trees/gg_13_5_otus_99_annotated.tree',
    '../data/metadata/P_1928_65684500_raw_meta.txt',
    '../data/taxonomies/gg_13_8_99.gg.tax', '../scripts/Group-Averages-2.csv',
    True)
L2_UniFrac, DifferentialAbundance = L2U.L2Unifrac_weighted(
    Tint, lint, nodes_in_order, L2_inverse_pushed[L2_region_names[0]],
    L2_inverse_pushed[L2_region_names[1]])

# Separate dictionaries
group_1_diffab = {key[0]: 0 for key, value in DifferentialAbundance.items()}
group_2_diffab = {key[0]: 0 for key, value in DifferentialAbundance.items()}
for key, value in DifferentialAbundance.items():
    if DifferentialAbundance[key] > 0: