def join_encoded_layers_and_pam_stats(encoded_layers, pam_stats):
    """Concatenate encoded layers and PAM statistics.

    Note:
        The PAM statistics very likely represent a subset of the cells in
        the shapegrid and therefore must be "inflated" to match the same
        sites.
    """
    row_headers = encoded_layers.get_row_headers()
    new_stats_mtx = Matrix(
        np.zeros((len(row_headers), len(pam_stats.get_column_headers()))),
        headers={
            '0': row_headers,
            '1': pam_stats.get_column_headers()
        })

    # Set values in the new stats matrix
    # Note: This is somewhat fragile.  It requires that the encoded_layers
    #    row site ids be a superset of the pam_stats row site ids.  Consider
    #    either forcing the data to match or something more robust for a
    #    more official version.
    all_site_ids = [
        int(site_id) for site_id, _, _ in encoded_layers.get_row_headers()]
    ps_site_ids = [
        int(site_id) for site_id, _, _ in pam_stats.get_row_headers()]
    # Map each site id to its row position in the inflated matrix.  Using
    #    np.take on the raw ids would index by value rather than by
    #    position and misplace rows.
    site_id_to_row = {site_id: i for i, site_id in enumerate(all_site_ids)}
    for i, site_id in enumerate(ps_site_ids):
        new_stats_mtx[site_id_to_row[site_id]] = pam_stats[i]

    # Concatenate and return
    joined_mtx = Matrix.concatenate([encoded_layers, new_stats_mtx], axis=1)
    return joined_mtx
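# Illustrative sketch (not part of the library): why the site ids above are
#    mapped to row positions instead of being passed to np.take.  Site ids
#    are arbitrary integers, so indexing by value would misplace rows (or
#    raise an out-of-bounds error).  The names below are made up.
all_site_ids = [10, 20, 30, 40]   # ids in the full shapegrid
ps_site_ids = [20, 40]            # ids present in the PAM stats
site_id_to_row = {sid: i for i, sid in enumerate(all_site_ids)}
rows = [site_id_to_row[sid] for sid in ps_site_ids]
assert rows == [1, 3]             # row positions, not the id values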
def test_with_signed_value_comparison(self):
    """Tests that getting p-values does what is expected."""
    obs_matrix = Matrix(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
    rand_1 = Matrix(np.array([[3, 2, 1], [6, 3, -12], [8, 3, -10]]))
    rand_2 = Matrix(np.array([[9, 23, 1], [4, 2, 9], [-32, -3, 9]]))
    p_vals = perm_testing.get_p_values(
        obs_matrix, [rand_1, rand_2],
        compare_func=perm_testing.compare_signed_values)
    assert np.all(
        p_vals[:, :, 0] == np.array(
            [[1, 0.5, 0], [0.5, 0, 0.5], [0.5, 0, 0]]))
def test_valid(self):
    """Tests that correcting p-values does what is expected."""
    uncorrected = Matrix(
        np.array([[0.05, 0.1, 0.02],
                  [0.01, 0.05, 0.06],
                  [0.1, 0.01, 0.20]]))
    corrected = perm_testing.correct_p_values(uncorrected)
    # Basic sanity checks: the corrected matrix keeps the input shape and
    #    contains only 0 / 1 significance flags.
    assert corrected.shape == uncorrected.shape
    assert np.all(np.isin(corrected, [0, 1]))
def get_p_values(observed_matrix, test_matrices,
                 compare_func=compare_absolute_values):
    """Gets p-values by comparing the observed and random data.

    Args:
        observed_matrix (:obj:`Matrix`): A Matrix object with observed
            values.
        test_matrices (:obj:`list`): A list of Matrix objects with values
            obtained through permutations.
        compare_func (:obj:`function`): A function that, when given two
            values, returns True if the second meets the condition.

    Returns:
        numpy.ndarray: An array of p-values.

    Todo:
        * Take optional clip values.
        * Take optional number of permutations.
    """
    p_val_headers = deepcopy(observed_matrix.headers)
    ndim = observed_matrix.ndim
    p_val_headers[str(ndim)] = ['P-Values']

    # Create the P-values matrix.  The shape should be the same as the
    #    observed data, with one extra dimension if the last dimension has
    #    size > 1
    if observed_matrix.shape[-1] == 1:  # pragma: nocover
        p_vals_shape = observed_matrix.shape
    else:  # pragma: nocover
        p_vals_shape = list(observed_matrix.shape) + [1]
    p_values = Matrix(np.zeros(p_vals_shape), headers=p_val_headers)

    num_permutations = 0
    for rand in test_matrices:
        # If the random matrices are a stack with more dimensions or more
        #    layers, compare each layer to observed
        if rand.ndim > ndim or (
                rand.shape[-1] > observed_matrix.shape[-1]
                ):  # pragma: nocover
            # Determine shape of test matrix
            if rand.ndim > ndim:
                test_shape = list(rand.shape)[:-1]
            else:
                test_shape = observed_matrix.shape
            # Loop through each layer
            for i in range(rand.shape[-1]):
                p_values += compare_func(
                    observed_matrix,
                    # Slice off one test layer
                    rand[..., i].reshape(test_shape))
                num_permutations += 1
        elif rand.ndim < len(p_vals_shape):  # pragma: nocover
            p_values += compare_func(
                observed_matrix, rand).reshape(p_vals_shape)
            num_permutations += 1
        else:  # pragma: nocover
            p_values += compare_func(observed_matrix, rand)
            num_permutations += 1

    # Divide by the number of permutations and clip just in case
    p_values = np.clip(
        np.nan_to_num(p_values / num_permutations), 0.0, 1.0)
    return p_values
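# Illustrative sketch (not part of the library): the p-value for a cell is
#    the fraction of permutations whose value meets the comparison
#    condition against the observed value.  A pure-numpy stand-in for the
#    absolute-value comparison is used here; the real compare functions
#    live in perm_testing, so their exact semantics are an assumption.
import numpy as np

observed = np.array([2.0, -1.0])
permuted = [np.array([3.0, 0.5]), np.array([1.0, -2.0])]
hits = sum((np.abs(p) >= np.abs(observed)).astype(float) for p in permuted)
p_values = hits / len(permuted)
assert np.allclose(p_values, [0.5, 0.5])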
def get_matrix(csv_fn):
    """Reads a CSV file of PAM data into a Matrix.

    The first row is a header: three coordinate columns followed by one
    squid (species identifier) per column.  Each remaining row holds the
    three coordinate values followed by presence / absence integers.
    """
    squids = []
    row_headers = []
    data = []
    with open(csv_fn) as in_file:
        header = True
        for line in in_file:
            if header:
                header = False
                squids = line.strip().split(',')[3:]
            else:
                parts = line.strip().split(',')
                row_headers.append(tuple([float(i) for i in parts[0:3]]))
                data.append([int(i) for i in parts[3:]])
    mtx = Matrix(
        np.array(data, dtype=int),
        headers={
            '0': row_headers,
            '1': squids
        })
    return mtx
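# Illustrative sketch (not part of the library): the CSV layout get_matrix
#    expects -- three leading coordinate columns, then one column per
#    squid.  The file contents below are made up for the example.
import os
import tempfile

_EXAMPLE_CSV = (
    'site,x,y,squid_a,squid_b\n'
    '1,-95.5,38.5,1,0\n'
    '2,-95.0,38.5,0,1\n')

with tempfile.NamedTemporaryFile(
        'w', suffix='.csv', delete=False) as tmp_f:
    tmp_f.write(_EXAMPLE_CSV)
example_mtx = get_matrix(tmp_f.name)
assert example_mtx.shape == (2, 2)
os.remove(tmp_f.name)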
def get_character_matrix_from_sequences_list(sequences, var_headers=None):
    """Converts a list of sequences into a character matrix.

    Args:
        sequences (:obj:`list` of :obj:`Sequence`): A list of Sequence
            objects to be converted.
        var_headers (:obj:`list` of headers, optional): If provided, uses
            these as variable headers for the columns in the matrix.

    Returns:
        Matrix: A matrix of sequence data.
    """
    if var_headers is not None:
        col_headers = var_headers
    else:
        col_headers = [
            'Column {}'.format(i)
            for i in range(len(sequences[0].cont_values))]
    data = np.zeros((len(sequences), len(col_headers)), dtype=float)
    row_headers = []
    for i, seq in enumerate(sequences):
        row_headers.append(seq.name)
        data[i] = np.array(seq.cont_values)
    return Matrix(data, headers={'0': row_headers, '1': col_headers})
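# Illustrative sketch (not part of the library): the function only needs
#    objects exposing `name` and `cont_values`, so a namedtuple stands in
#    for the real Sequence class here.
from collections import namedtuple

_FakeSequence = namedtuple('_FakeSequence', ['name', 'cont_values'])

seqs = [_FakeSequence('tip_a', [0.1, 0.9]),
        _FakeSequence('tip_b', [0.4, 0.6])]
char_mtx = get_character_matrix_from_sequences_list(seqs)
assert char_mtx.get_row_headers() == ['tip_a', 'tip_b']
assert char_mtx.get_column_headers() == ['Column 0', 'Column 1']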
def pdnew(pam, tree):
    """Creates a lookup matrix for the PD of sites in a matrix.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values.
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.

    Returns:
        Matrix: PD values and species richness for each community in the
            sample.  Col 1 = PD; Col 2 = SR; rows = individual samples
            from the pam.
    """
    # Get the number of samples in the community matrix data.
    nsamp = len(pam.get_row_headers())
    # Array to hold each sample's PD and richness.  Col 1 = PD; col 2 = SR.
    pd_array = np.zeros((nsamp, 2), dtype=float)

    # Calculate the PD value for each sample in 'pam'.
    for sample in range(nsamp):
        # Pull out the data for the current sample.
        my_samp = pam[sample]
        # Pull out which species are present in the sample; yields a list
        #    of species-name strings.
        sp_pres = list(it.compress(pam.get_column_headers(), my_samp))
        # Dendropy does not retain underscores in labels
        sp_pres = [i.replace('_', ' ') for i in sp_pres]
        # Get a tree of only the species present in the sample.
        tree_pres = tree.extract_tree_with_taxa_labels(sp_pres)
        # Get the sum of edge lengths for the sub tree.
        pd_pres = tree_pres.length()
        # Get the species richness of the sample.
        rch_samp = len(sp_pres)
        # Update pd_array.
        pd_array[sample] = [pd_pres, rch_samp]

    # Convert pd_array to a Matrix object and match headers to the pam
    #    data.
    return Matrix(
        pd_array,
        headers={'0': pam.get_row_headers(), '1': ['PD', 'SR']})
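# Illustrative sketch (not part of the library): what one iteration of the
#    pdnew loop computes.  Dendropy is assumed to be available, as it is
#    elsewhere in this repo; the tree and labels are made up.
import dendropy

demo_tree = dendropy.Tree.get(data='((A:1,B:1):1,C:2);', schema='newick')
present = ['A', 'C']
sub_tree = demo_tree.extract_tree_with_taxa_labels(present)
# PD is the sum of the branch lengths of the retained subtree.
print(sub_tree.length())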
def test_valid(self):
    """Test the function with valid inputs."""
    # Create a tree
    tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick')
    mtx = Matrix(
        np.random.random((6, 2, 1)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],
            '1': ['label', 'other_val']
        })
    # This should not fail
    annotators.annotate_tree_with_label(tree, mtx, label_column=0)
def load_pams(pam_dir):
    """Loads PAMs from CSV files in the specified directory.

    Args:
        pam_dir (str): A directory containing PAM CSV files.

    Returns:
        list of Matrix: One matrix per CSV file found.
    """
    pams = []
    for fn in glob.glob(os.path.join(pam_dir, '*.csv')):
        with open(fn) as in_f:
            pams.append(
                Matrix.load_csv(in_f, num_header_rows=1, num_header_cols=3))
    return pams
def test_valid(self, valid_phylo_beta_diversity_package):
    """Test the method with valid data.

    Args:
        valid_phylo_beta_diversity_package (tuple): A tuple of information
            that together forms a valid phylogenetic beta diversity
            package.

    Note:
        * Test values were determined from the example at
            https://rdrr.io/rforge/betapart/man/phylo.beta.pair.html
    """
    (pam_fn, tree_fn, _, _, _, test_beta_sim_fn, test_beta_sne_fn,
     test_beta_sor_fn, _, _, _, test_phylo_beta_sim_fn,
     test_phylo_beta_sne_fn, test_phylo_beta_sor_fn
     ) = valid_phylo_beta_diversity_package
    with open(pam_fn) as in_f:
        pam = Matrix.load_csv(in_f, num_header_rows=1, num_header_cols=1)
    tree = TreeWrapper.from_filename(tree_fn)
    with open(test_beta_sim_fn) as in_f:
        test_beta_sim = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_sne_fn) as in_f:
        test_beta_sne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_sor_fn) as in_f:
        test_beta_sor = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sim_fn) as in_f:
        test_phylo_beta_sim = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sne_fn) as in_f:
        test_phylo_beta_sne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sor_fn) as in_f:
        test_phylo_beta_sor = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)

    (beta_sim, phylo_beta_sim, beta_sne, phylo_beta_sne, beta_sor,
     phylo_beta_sor) = pbd.calculate_phylo_beta_diversity_sorensen(
         pam, tree)
    # Check matrix outputs to see if they are within tolerance
    assert np.allclose(beta_sim, test_beta_sim)
    assert np.allclose(phylo_beta_sim, test_phylo_beta_sim)
    assert np.allclose(beta_sne, test_beta_sne)
    assert np.allclose(phylo_beta_sne, test_phylo_beta_sne)
    assert np.allclose(beta_sor, test_beta_sor)
    assert np.allclose(phylo_beta_sor, test_phylo_beta_sor)
def main():
    pam_fn = 'C:/Users/cj/Desktop/ryan_v3/pam.lmm'
    tree_fn = 'C:/Users/cj/Desktop/ryan_v3/squid_tree.nex'
    out_fn = 'C:/Users/cj/Desktop/ryan_v3/tree_mtx.lmm'

    with open(pam_fn, 'rb') as in_file:
        pam = Matrix.load_flo(in_file)
    tree = TreeWrapper.get(path=tree_fn, schema='nexus')
    tree_mtx = calculate_tree_site_statistics(pam, tree)
    with open(out_fn, 'wb') as out_file:
        tree_mtx.save(out_file)
    print(tree_mtx.max(axis=1))
    print(tree_mtx.max(axis=0))
def test_valid(self, tmpdir):
    """Test the function with valid inputs.

    Args:
        tmpdir (:obj:`py.path.local`): A temporary directory test fixture
            generated by pytest.
    """
    # Create a tree
    tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick')
    mtx = Matrix(
        np.random.random((6, 3, 2)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],
            '1': ['label', 'other_val', 'one_more_val']
        })
    # This should not fail
    output_directory = os.path.join(tmpdir.dirname, 'plots')
    create_distribution_plots(tree, mtx, output_directory)
def test_valid(self, valid_phylo_beta_diversity_package):
    """Test the method with valid data.

    Args:
        valid_phylo_beta_diversity_package (tuple): A tuple of information
            that together forms a valid phylogenetic beta diversity
            package.

    Note:
        * Test values were determined from the example at
            https://rdrr.io/rforge/betapart/man/phylo.beta.pair.html
    """
    (pam_fn, tree_fn, test_beta_jac_fn, test_beta_jne_fn, test_beta_jtu_fn,
     _, _, _, test_phylo_beta_jac_fn, test_phylo_beta_jne_fn,
     test_phylo_beta_jtu_fn, _, _, _) = valid_phylo_beta_diversity_package
    with open(pam_fn) as in_f:
        pam = Matrix.load_csv(in_f, num_header_rows=1, num_header_cols=1)
    tree = TreeWrapper.from_filename(tree_fn)
    with open(test_beta_jac_fn) as in_f:
        test_beta_jac = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_jne_fn) as in_f:
        test_beta_jne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_jtu_fn) as in_f:
        test_beta_jtu = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jac_fn) as in_f:
        test_phylo_beta_jac = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jne_fn) as in_f:
        test_phylo_beta_jne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jtu_fn) as in_f:
        test_phylo_beta_jtu = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)

    (beta_jtu, phylo_beta_jtu, beta_jne, phylo_beta_jne, beta_jac,
     phylo_beta_jac) = pbd.calculate_phylo_beta_diversity_jaccard(
         pam, tree)
    # Check matrix outputs to see if they are within tolerance
    assert np.allclose(beta_jtu, test_beta_jtu)
    assert np.allclose(phylo_beta_jtu, test_phylo_beta_jtu)
    assert np.allclose(beta_jne, test_beta_jne)
    assert np.allclose(phylo_beta_jne, test_phylo_beta_jne)
    assert np.allclose(beta_jac, test_beta_jac)
    assert np.allclose(phylo_beta_jac, test_phylo_beta_jac)
def get_report_data(accepted_taxa_filename, base_dir):
    """Summarizes which data sources have content for each accepted species.

    For every species in the accepted taxa CSV, checks for a POWO JSON
    file, an iDigBio CSV, and a GBIF CSV under the genus directory.  Each
    source scores -1 (file missing), 0 (file empty), or 1 (file has
    content); species with a perfect score are omitted from the report.
    """
    num_accepted_species = 0
    species_report = {}
    # Generate report
    with open(accepted_taxa_filename) as taxa_file:
        for line in taxa_file:
            num_accepted_species += 1
            parts = line.split(', ')
            species_name = parts[1].strip().strip('"')
            sp_key = int(parts[2])
            genus_name = species_name.split(' ')[0]
            genus_dir = os.path.join(base_dir, genus_name)

            kew_filename = os.path.join(
                genus_dir, '{}_powo.json'.format(species_name))
            k_val = -1
            if os.path.exists(kew_filename):
                k_val += 1
                if os.stat(kew_filename).st_size > 5:
                    k_val += 1

            idigbio_filename = os.path.join(
                genus_dir, '{}_idigbio.csv'.format(species_name))
            i_val = -1
            if os.path.exists(idigbio_filename):
                i_val += 1
                if os.stat(idigbio_filename).st_size > 5:
                    i_val += 1

            gbif_filename = os.path.join(
                genus_dir, '{}_gbif.csv'.format(species_name))
            g_val = -1
            if os.path.exists(gbif_filename):
                g_val += 1
                if os.stat(gbif_filename).st_size > 5:
                    g_val += 1

            if sum([k_val, i_val, g_val]) < 3:
                species_report[species_name] = [k_val, i_val, g_val]

    # Create a matrix for output
    species_names = []
    report_data = []
    for k in sorted(species_report.keys()):
        species_names.append(k)
        report_data.append(species_report[k])
    species_report_matrix = Matrix(
        np.array(report_data),
        headers={
            '0': species_names,
            '1': ['POWO', 'iDigBio', 'GBIF']})
    return num_accepted_species, species_report_matrix
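# Illustrative sketch (not part of the library): the accepted-taxa line
#    format that get_report_data assumes.  The values are made up.
line = '123, "Acer rubrum", 3189866\n'
parts = line.split(', ')
assert parts[1].strip().strip('"') == 'Acer rubrum'
assert int(parts[2]) == 3189866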
def correct_p_values(p_values_matrix, false_discovery_rate=0.05):
    """Performs p-value correction (Benjamini-Hochberg).

    Args:
        p_values_matrix (:obj:`Matrix`): A Matrix of p-values to correct.
        false_discovery_rate (:obj:`float`): An acceptable false discovery
            rate (alpha) value to declare a cell significant.

    Returns:
        Matrix: A matrix object of significant values.

    Todo:
        * Enable other correction types.
        * Consider how metadata may be added.
        * Consider producing a matrix of the maximum FDR value that would
            mark each cell as significant.
    """
    # Reshape data into a one-dimensional array
    p_flat = p_values_matrix.flatten()
    num_vals = p_flat.size

    # 1. Order p-values
    # 2. Assign rank
    # 3. Create critical values
    # 4. Find the largest p-value such that P(i) < critical value
    # 5. All P(j) such that j <= i are significant
    rank = 1
    comp_p = 0.0
    for p in sorted(p_flat.tolist()):
        crit_val = false_discovery_rate * (float(rank) / num_vals)
        # Check if the p-value is less than the critical value
        if p < crit_val:
            # If this p is smaller, all p-values smaller than this one are
            #    "significant", even those that were greater than their
            #    respective critical values
            comp_p = p
        rank += 1

    headers = deepcopy(p_values_matrix.headers)
    headers[str(p_values_matrix.ndim)] = ['BH Corrected']
    sig_values = (p_values_matrix <= comp_p).astype(int)
    return Matrix(sig_values, headers=headers)
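# Illustrative sketch (not part of the library): the Benjamini-Hochberg
#    threshold found by the loop above, worked on a small made-up set of
#    p-values with alpha = 0.05.
p_sorted = [0.01, 0.02, 0.04, 0.30]
alpha = 0.05
comp_p = 0.0
for rank, p in enumerate(p_sorted, start=1):
    # Critical value for this rank: alpha * rank / N.
    if p < alpha * rank / len(p_sorted):
        comp_p = p
print(comp_p)  # 0.02: the critical values are 0.0125, 0.025, 0.0375, 0.05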
def main():
    """Main method for script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'shapegrid_filename', type=str,
        help='File location of the shapegrid shapefile')
    parser.add_argument(
        'pam_filename', type=str,
        help='File location of the PAM matrix for statistics')
    parser.add_argument(
        'tree_filename', type=str,
        help='File location of the tree to use for statistics')
    parser.add_argument(
        'tree_schema', choices=['newick', 'nexus'],
        help='The tree schema')
    parser.add_argument(
        'out_geojson_filename', type=str,
        help='File location to write the output GeoJSON')
    parser.add_argument(
        '--layer', nargs=2, action='append',
        help='File location of a layer followed by a label')
    args = parser.parse_args()

    # Load data
    pam = Matrix.load(args.pam_filename)
    tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema)

    # Encode layers
    encoded_layers = encode_environment_layers(
        args.shapegrid_filename, args.layer)

    # Calculate PAM statistics
    stats_mtx = calculate_tree_site_statistics(pam, tree)

    # Join encoded layers and PAM statistics
    mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx)

    # Generate GeoJSON
    geojson_data = create_geojson(args.shapegrid_filename, mtx)

    # Write GeoJSON
    with open(args.out_geojson_filename, 'w') as out_file:
        json.dump(geojson_data, out_file)
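# Illustrative sketch (not part of the library): a hypothetical invocation
#    of this script, matching the argparse definition above.  The script
#    name and all paths are made up.
#
#    python create_geojson.py \
#        --layer elevation.tif Elevation \
#        shapegrid.shp pam.lmm tree.nex newick output.geojson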
def calc_sig_phylo_sor(pam, tree):  # pragma: no cover
    """Calculates phylogenetic beta diversity for the sorensen index family.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values.
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.

    Returns:
        Phylogenetic beta diversity matrices (site by site)
            * phylo_beta_sim: ADD DESCRIPTION
            * phylo_beta_sne: ADD DESCRIPTION
            * phylo_beta_sor: ADD DESCRIPTION

    Todo:
        * Fill in method documentation
        * Fill in method
        * Fill in tests / documentation
    """
    # Get a lookup dictionary for the matrix index of each species in the
    #    PAM in case they are not in the same order as the taxa in the tree
    species_lookup = get_species_index_lookup(pam)

    # Build a header dictionary; all of the returned matrices will have the
    #    same headers, site rows by site columns.
    # Note: This will differ from the R method because each site will be
    #    present in both the rows and the columns.
    mtx_headers = {
        '0': pam.get_row_headers(),  # Row headers
        '1': pam.get_row_headers()  # Column headers
    }
    num_sites = pam.data.shape[0]  # Get the number of sites in the PAM

    # Note: For ease of development, use these numpy arrays for the
    #    computations.  They will be wrapped into Matrix objects when they
    #    are returned from the function.
    phylo_beta_sim_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_sne_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_sor_data = np.zeros((num_sites, num_sites), dtype=float)

    # TODO: Compute phylo beta diversity for sorensen index family
    core_calc = core_PD_calc(pam, tree)

    # This loop will populate the arrays with beta diversity metrics.
    for my_row in range(core_calc.data.shape[0]):
        my_dat = core_calc.data[my_row, 0:4]
        my_dim = core_calc.get_row_headers()[my_row]
        phylo_beta_sim_data[my_dim[0], my_dim[1]] = (
            my_dat[0] / (my_dat[0] + my_dat[3]))
        phylo_beta_sim_data[my_dim[1], my_dim[0]] = phylo_beta_sim_data[
            my_dim[0], my_dim[1]]
        phylo_beta_sor_data[my_dim[0], my_dim[1]] = (
            my_dat[2] / ((2 * my_dat[3]) + my_dat[2]))
        phylo_beta_sor_data[my_dim[1], my_dim[0]] = phylo_beta_sor_data[
            my_dim[0], my_dim[1]]
        phylo_beta_sne_data[my_dim[0], my_dim[1]] = (
            (my_dat[1] - my_dat[0]) / ((2 * my_dat[3]) + my_dat[2])) * (
                my_dat[3] / (my_dat[0] + my_dat[3]))
        phylo_beta_sne_data[my_dim[1], my_dim[0]] = phylo_beta_sne_data[
            my_dim[0], my_dim[1]]

    # Set diagonals to 1 just to match formatting across scripts.
    for i in range(num_sites):
        phylo_beta_sim_data[i, i] = 1.
        phylo_beta_sne_data[i, i] = 1.
        phylo_beta_sor_data[i, i] = 1.

    return (Matrix(phylo_beta_sim_data, headers=mtx_headers),
            Matrix(phylo_beta_sne_data, headers=mtx_headers),
            Matrix(phylo_beta_sor_data, headers=mtx_headers))
def calc_phylo_jac_distr(pam, tree, nrand=5):  # pragma: no cover
    """Calculates the distribution of Jaccard metrics under randomization
    of phylogenetic relationships.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values (site rows by species columns).
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.
        nrand (:obj:`int`): The number of randomizations to perform.

    Returns:
        Mean and SD of the distribution of Jaccard-based metrics from the
            randomizations.

    Note:
        * It looks like the scipy.spatial.distance.jaccard method may be
            useful here.

    Todo:
        * Fill in method documentation
        * Fill in method (the randomization loop is still a stub; a single
            pass over the observed tree is computed below)
        * Fill in tests and method documentation in sphinx
    """
    # Get a lookup dictionary for the matrix index of each species in the
    #    PAM in case they are not in the same order as the taxa in the tree
    species_lookup = get_species_index_lookup(pam)

    # Build a header dictionary; all of the returned matrices will have the
    #    same headers, site rows by site columns.
    # Note: This will differ from the R method because each site will be
    #    present in both the rows and the columns.
    mtx_headers = {
        '0': pam.get_row_headers(),  # Row headers
        '1': pam.get_row_headers()  # Column headers
    }
    num_sites = pam.data.shape[0]  # Get the number of sites in the PAM

    # These arrays hold the metrics for the current pass; they would serve
    #    as running-average placeholders once the randomization loop below
    #    is implemented.
    phylo_beta_jtu_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_jne_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_jac_data = np.zeros((num_sites, num_sites), dtype=float)

    # TODO: Randomize tree, calculate metrics, save a running average.
    # for trial in range(nrand):
    #     Randomize the tip labels of the tree.

    # Get core metrics related to phylogeny.
    core_calc = core_PD_calc(pam, tree)  # Matrix object.

    # This loop will populate the arrays with all beta diversity metrics.
    for my_row in range(core_calc.data.shape[0]):
        # Pull out the phylogenetic core numeric values.
        my_dat = core_calc.data[my_row, 0:4]
        # Get index values for placing into the output arrays.
        my_dim = core_calc.get_row_headers()[my_row]
        # Populate the arrays.
        phylo_beta_jtu_data[my_dim[0], my_dim[1]] = (2 * my_dat[0]) / (
            (2 * my_dat[0]) + my_dat[3])
        phylo_beta_jtu_data[my_dim[1], my_dim[0]] = phylo_beta_jtu_data[
            my_dim[0], my_dim[1]]
        phylo_beta_jac_data[my_dim[0], my_dim[1]] = (
            my_dat[2] / (my_dat[3] + my_dat[2]))
        phylo_beta_jac_data[my_dim[1], my_dim[0]] = phylo_beta_jac_data[
            my_dim[0], my_dim[1]]
        phylo_beta_jne_data[my_dim[0], my_dim[1]] = (
            (my_dat[1] - my_dat[0]) / (my_dat[3] + my_dat[2])) * (
                my_dat[3] / ((2 * my_dat[0]) + my_dat[3]))
        phylo_beta_jne_data[my_dim[1], my_dim[0]] = phylo_beta_jne_data[
            my_dim[0], my_dim[1]]

    # Ensure diagonals are 1 just to match Biotaphy test file expectations.
    for i in range(num_sites):
        phylo_beta_jtu_data[i, i] = 1.
        phylo_beta_jac_data[i, i] = 1.
        phylo_beta_jne_data[i, i] = 1.

    return (Matrix(phylo_beta_jtu_data, headers=mtx_headers),
            Matrix(phylo_beta_jne_data, headers=mtx_headers),
            Matrix(phylo_beta_jac_data, headers=mtx_headers))
def calculate_phylo_beta_diversity_jaccard(pam, tree):
    """Calculates phylogenetic beta diversity for the jaccard index family.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values (site rows by species columns).
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.

    Returns:
        Phylogenetic beta diversity matrices (site by site)
            * beta_jtu: ADD DESCRIPTION
            * phylo_beta_jtu: ADD DESCRIPTION
            * beta_jne: ADD DESCRIPTION
            * phylo_beta_jne: ADD DESCRIPTION
            * beta_jac: ADD DESCRIPTION
            * phylo_beta_jac: ADD DESCRIPTION

    Note:
        * It looks like the scipy.spatial.distance.jaccard method may be
            useful here.

    Todo:
        * Fill in method documentation
        * Fill in method
    """
    # Get a lookup dictionary for the matrix index of each species in the
    #    PAM in case they are not in the same order as the taxa in the tree
    species_lookup = get_species_index_lookup(pam)

    # Build a header dictionary; all of the returned matrices will have the
    #    same headers, site rows by site columns.
    # Note: This will differ from the R method because each site will be
    #    present in both the rows and the columns.
    mtx_headers = {
        '0': pam.get_row_headers(),  # Row headers
        '1': pam.get_row_headers()  # Column headers
    }
    num_sites = pam.shape[0]  # Get the number of sites in the PAM

    # Note: For ease of development, use these numpy arrays for the
    #    computations.  They will be wrapped into Matrix objects when they
    #    are returned from the function.
    beta_jtu_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_jtu_data = np.zeros((num_sites, num_sites), dtype=float)
    beta_jne_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_jne_data = np.zeros((num_sites, num_sites), dtype=float)
    beta_jac_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_jac_data = np.zeros((num_sites, num_sites), dtype=float)

    # TODO: Compute phylo beta diversity for jaccard index family
    # Get core metrics related to phylogeny.
    core_calc = core_PD_calc(pam, tree)  # Matrix object.

    # This loop will populate the arrays with all beta diversity metrics.
    for my_row in range(core_calc.shape[0]):
        # Pull out the phylogenetic core numeric values.
        my_dat = core_calc[my_row, 0:4]
        # Get index values for placing into the output arrays.
        my_dim = core_calc.get_row_headers()[my_row]
        # Populate the arrays.
        phylo_beta_jtu_data[my_dim[0], my_dim[1]] = (2 * my_dat[0]) / (
            (2 * my_dat[0]) + my_dat[3])
        phylo_beta_jtu_data[my_dim[1], my_dim[0]] = phylo_beta_jtu_data[
            my_dim[0], my_dim[1]]
        phylo_beta_jac_data[my_dim[0], my_dim[1]] = (
            my_dat[2] / (my_dat[3] + my_dat[2]))
        phylo_beta_jac_data[my_dim[1], my_dim[0]] = phylo_beta_jac_data[
            my_dim[0], my_dim[1]]
        phylo_beta_jne_data[my_dim[0], my_dim[1]] = (
            (my_dat[1] - my_dat[0]) / (my_dat[3] + my_dat[2])) * (
                my_dat[3] / ((2 * my_dat[0]) + my_dat[3]))
        phylo_beta_jne_data[my_dim[1], my_dim[0]] = phylo_beta_jne_data[
            my_dim[0], my_dim[1]]

    # Get core metrics for simple beta diversity (no phylo component).
    # Array columns: 0 == shared; 1 == not shared; 2 == sum not shared;
    #    3 == max not shared; 4 == min not shared.
    core_beta = core_Beta_calc(pam, tree)

    # Populate the arrays.
    beta_jtu_data = (2 * core_beta[4]) / ((2 * core_beta[4]) + core_beta[0])
    beta_jne_data = ((core_beta[3] - core_beta[4]) / (
        core_beta[0] + core_beta[2])) * (core_beta[0] / (
            (2 * core_beta[4]) + core_beta[0]))
    beta_jac_data = core_beta[2] / (core_beta[0] + core_beta[2])

    # Ensure diagonals are 1 just to match Biotaphy test file expectations.
    for i in range(num_sites):
        phylo_beta_jtu_data[i, i] = 1.
        phylo_beta_jac_data[i, i] = 1.
        phylo_beta_jne_data[i, i] = 1.
        beta_jtu_data[i, i] = 1.
        beta_jac_data[i, i] = 1.
        beta_jne_data[i, i] = 1.

    return (Matrix(beta_jtu_data, headers=mtx_headers),
            Matrix(phylo_beta_jtu_data, headers=mtx_headers),
            Matrix(beta_jne_data, headers=mtx_headers),
            Matrix(phylo_beta_jne_data, headers=mtx_headers),
            Matrix(beta_jac_data, headers=mtx_headers),
            Matrix(phylo_beta_jac_data, headers=mtx_headers))
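# Illustrative sketch (not part of the library): the Jaccard-family
#    formulas above, written out for a single pair of sites in terms of
#    the shared (a) and not-shared (b, c) components produced by the core
#    calculations.  The values are made up.
a = 4.0                     # shared component
b, c = 1.0, 3.0             # component unique to site 1 / site 2
min_ns, max_ns, sum_ns = min(b, c), max(b, c), b + c
beta_jac = sum_ns / (a + sum_ns)                      # total dissimilarity
beta_jtu = (2 * min_ns) / ((2 * min_ns) + a)          # turnover component
beta_jne = ((max_ns - min_ns) / (a + sum_ns)) * (
    a / ((2 * min_ns) + a))                           # nestedness component
assert abs(beta_jac - (beta_jtu + beta_jne)) < 1e-12  # they decompose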
def core_PD_calc(pam, tree):
    """Creates an array of core metrics to assess components of beta
    diversity.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values.
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.

    Returns:
        Matrix: Cols = core metrics; rows = pairwise comparisons.

    Details:
        In general, the metrics returned represent different contributions
        to PD arising from how communities are combined together.
        Metrics:
            min_not_shared: smallest distance from the individual samples
                to their combination.
            max_not_shared: largest distance from the individual samples
                to their combination.
            sum_not_shared: total addition to PD from both communities.
            shared: combined contribution to PD that the communities make
                jointly.
    """
    # PD for each community of the community matrix.
    pd_mtx = pdnew(pam, tree)

    # List all possible pairwise community combinations.
    combin = list(it.combinations(range(len(pam.get_row_headers())), 2))

    # Array to store presence values of pairwise site combinations.
    #    Rows = all pairwise community comparisons.  Cols = species.
    com_tot_pair = np.zeros(
        (len(combin), len(pam.get_column_headers())), dtype=float)

    # Populate the pairwise array: 1 == species present in at least one
    #    sample; 0 == species absent from both samples.
    for pair in range(len(combin)):
        # Assign each site's data to a new variable for convenience.
        site0 = pam[combin[pair][0]]
        site1 = pam[combin[pair][1]]
        # Is each species present in at least one sample?
        for idx in range(len(site0)):
            com_tot_pair[pair, idx] = 1 if site0[idx] or site1[idx] else 0

    # Convert the pairwise array into a Matrix object for pdnew().
    com_tot_pair = Matrix(
        com_tot_pair,
        headers={
            '0': combin,
            '1': pam.get_column_headers()
        })
    # Matrix holding the PD of each pairwise community combination.
    pd_tot_pair = pdnew(com_tot_pair, tree)

    # Calculate the sum of each pair of samples' PD values, i.e. treating
    #    each sample separately.
    sum_pd_pair = []
    for pair in range(len(combin)):
        tmp = pd_mtx[combin[pair][0], 0] + pd_mtx[combin[pair][1], 0]
        sum_pd_pair.append(tmp)

    # PD of all communities combined.
    com_tot_multi = np.sum(pam, axis=0)
    # Convert to presence / absence (i.e. 1, 0).
    com_tot_multi = [1 if i > 0 else 0 for i in com_tot_multi]
    # Calculate the PD.
    sp_pres = list(it.compress(pam.get_column_headers(), com_tot_multi))
    # Dendropy labels don't retain '_'
    sp_pres = [i.replace('_', ' ') for i in sp_pres]
    tree_pres = tree.extract_tree_with_taxa_labels(sp_pres)
    pd_tot_multi = tree_pres.length()

    # Contribution of PD that is not shared between two sites:
    # Pull out just the PD values.
    pd_sites = pd_mtx[0:len(pd_mtx.get_row_headers()), 0]
    # Create a list of all pairwise combinations.
    pd_combos = list(it.combinations(pd_sites, 2))

    # Array to hold the metrics assessing PD contributions to beta
    #    diversity.
    not_shared = np.zeros((len(pd_combos), 4), dtype=float)

    # Populate the array; see Details in the docstring.
    for pair in range(len(pd_combos)):
        # Pull out each site's individual PD.
        site1 = pd_combos[pair][0]  # PD site 1
        site2 = pd_combos[pair][1]  # PD site 2
        # Pull out the PD of the two sites combined.
        pdpair = pd_tot_pair[pair][0]
        # Pull out the sum of the separate PD values.
        sum_pair = sum_pd_pair[pair]
        # Metrics of interest:
        min_not_shared = min(pdpair - site1, pdpair - site2)  # min(b, c)
        max_not_shared = max(pdpair - site1, pdpair - site2)  # max(b, c)
        sum_not_shared = (2 * pdpair) - sum_pair  # b + c
        shared_val = pdpair - sum_not_shared  # a
        # Add the metrics to the appropriate row of the array.
        not_shared[pair] = [
            min_not_shared, max_not_shared, sum_not_shared, shared_val
        ]

    # Convert the not_shared array to a Matrix object.
    core_calc = Matrix(
        not_shared,
        headers={
            '0': combin,
            '1': ['min_not_shared', 'max_not_shared', 'sum_not_shared',
                  'shared']
        })
    return core_calc
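# Illustrative sketch (not part of the library): how the four core metrics
#    above relate for one pair of sites, using made-up PD values.
pd_site1, pd_site2 = 6.0, 7.0   # PD of each site alone
pd_pair = 9.0                   # PD of the two sites combined
b = pd_pair - pd_site1          # branch length unique to site 2 (3.0)
c = pd_pair - pd_site2          # branch length unique to site 1 (2.0)
sum_not_shared = (2 * pd_pair) - (pd_site1 + pd_site2)  # b + c == 5.0
shared = pd_pair - sum_not_shared                       # a == 4.0
assert sum_not_shared == b + c and shared == 4.0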
def calculate_continuous_ancestral_states(tree, char_mtx, sum_to_one=False,
                                          calc_std_err=False):
    """Calculates the continuous ancestral states for the nodes in a tree.

    Args:
        tree (Tree): A dendropy tree or TreeWrapper object.
        char_mtx (Matrix): A Matrix object with character information.
            Each row should represent a tip in the tree and each column
            should be a variable to calculate the ancestral state for.
        sum_to_one (:obj:`bool`, optional): If True, standardize the
            character matrix so that the values in a row sum to one.
            Defaults to False.
        calc_std_err (:obj:`bool`, optional): If True, calculate the
            standard error for each variable.  Defaults to False.

    Returns:
        The annotated tree and a matrix of character data with the
        following dimensions:
            * rows: nodes / tips in the tree
            * columns: character variables
            * depth: first layer is the calculated value, second layer is
                the standard error if desired

    Todo:
        * Add function for consistent label handling.
    """
    # Wrap tree if it is a plain dendropy tree
    if not isinstance(tree, TreeWrapper):
        tree = TreeWrapper.from_base_tree(tree)

    # Assign labels to nodes that don't have them
    tree.add_node_labels()

    # Synchronize tree and character data
    # Prune tree
    prune_taxa = []
    keep_taxon_labels = []
    init_row_headers = char_mtx.get_row_headers()
    for taxon in tree.taxon_namespace:
        label = taxon.label.replace(' ', '_')
        if label not in init_row_headers:
            prune_taxa.append(taxon)
            print(
                'Could not find {} in character matrix, pruning'.format(
                    label))
        else:
            keep_taxon_labels.append(label)

    if len(keep_taxon_labels) == 0:
        raise Exception(
            'None of the tree tips were found in the character data')

    tree.prune_taxa(prune_taxa)
    tree.purge_taxon_namespace()

    # Prune character data
    keep_rows = []
    for i, label in enumerate(init_row_headers):
        if label in keep_taxon_labels:
            keep_rows.append(i)
        else:
            print('Could not find {} in tree tips, pruning'.format(label))
    char_mtx = char_mtx.slice(keep_rows)

    # Standardize character matrix if requested
    tip_count, num_vars = char_mtx.shape
    if sum_to_one:
        for i in range(tip_count):
            sc = 1.0 / np.sum(char_mtx[i])
            for j in range(num_vars):
                char_mtx[i, j] *= sc

    # Initialize data matrix
    num_nodes = len(tree.nodes())
    data_shape = (num_nodes, num_vars, 2 if calc_std_err else 1)
    data = np.zeros(data_shape, dtype=float)

    # Initialize headers
    row_headers = []
    tip_col_headers = char_mtx.get_column_headers()
    tip_row_headers = char_mtx.get_row_headers()
    tip_lookup = dict([
        (tip_row_headers[i].replace('_', ' '), i)
        for i in range(tip_count)])

    # Get the number of internal nodes in the tree
    internal_node_count = num_nodes - tip_count

    # Loop through the tree and set the matrix index for each node.  Also
    #    set data values.
    node_headers = []
    node_i = tip_count
    tip_i = 0
    node_index_lookup = {}
    for node in tree.nodes():
        label = _get_node_label(node)
        if len(node.child_nodes()) == 0:
            # Tip
            node_index_lookup[label] = tip_i
            row_headers.append(label)
            data[tip_i, :, 0] = char_mtx[tip_lookup[label]]
            tip_i += 1
        else:
            # Internal node
            node_index_lookup[label] = node_i
            node_headers.append(label)
            data[node_i, :, 0] = np.zeros((1, num_vars), dtype=float)
            node_i += 1

    # Row headers should be extended with node headers
    row_headers.extend(node_headers)

    # For each variable
    for x in range(num_vars):
        # Compute the ML estimate of the root
        full_mcp = np.zeros(
            (internal_node_count, internal_node_count), dtype=float)
        full_vcp = np.zeros(internal_node_count, dtype=float)

        for k in tree.postorder_edge_iter():
            i = k.head_node
            if len(i.child_nodes()) != 0:
                node_num_i = node_index_lookup[
                    _get_node_label(i)] - tip_count
                for j in i.child_nodes():
                    tbl = 2. / j.edge_length
                    full_mcp[node_num_i][node_num_i] += tbl
                    node_num_j = node_index_lookup[_get_node_label(j)]
                    if len(j.child_nodes()) == 0:
                        full_vcp[node_num_i] += (
                            data[node_num_j, x, 0] * tbl)
                    else:
                        node_num_j -= tip_count
                        full_mcp[node_num_i][node_num_j] -= tbl
                        full_mcp[node_num_j][node_num_i] -= tbl
                        full_mcp[node_num_j][node_num_j] += tbl

        b = la.cho_factor(full_mcp)

        # These are the ML estimates for the ancestral states
        ml_est = la.cho_solve(b, full_vcp)
        sos = 0
        for k in tree.postorder_edge_iter():
            i = k.head_node
            node_num_i = node_index_lookup[_get_node_label(i)]
            if len(i.child_nodes()) != 0:
                data[node_num_i, x, 0] = ml_est[node_num_i - tip_count]

                if calc_std_err:
                    for j in i.child_nodes():
                        node_num_j = node_index_lookup[_get_node_label(j)]
                        temp = data[node_num_i, x, 0] - data[
                            node_num_j, x, 0]
                        sos += temp * temp / j.edge_length

                    # nni is node_num_i adjusted for only internal nodes
                    nni = node_num_i - tip_count
                    qpq = full_mcp[nni][nni]
                    tm1 = np.delete(full_mcp, (nni), axis=0)
                    tm = np.delete(tm1, (nni), axis=1)
                    b = la.cho_factor(tm)
                    sol = la.cho_solve(b, tm1[:, nni])
                    temp_std_err = qpq - np.inner(tm1[:, nni], sol)
                    data[node_num_i, x, 1] = math.sqrt(
                        2.0 * sos / (
                            (internal_node_count - 1) * temp_std_err))

    depth_headers = ['maximum_likelihood']
    if calc_std_err:
        depth_headers.append('standard_error')
    mtx_headers = {
        '0': row_headers, '1': tip_col_headers, '2': depth_headers}
    return tree, Matrix(data, headers=mtx_headers)
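# Illustrative sketch (not part of the library): a minimal call, assuming
#    the Matrix / TreeWrapper classes and numpy import used elsewhere in
#    this repo.  Tip values are made up; branch lengths must be present on
#    every edge because of the 1 / branch-length weighting above.
tree = TreeWrapper.get(data='((A:1,B:1):1,(C:1,D:1):1);', schema='newick')
char_mtx = Matrix(
    np.array([[0.2], [0.4], [0.6], [0.8]]),
    headers={'0': ['A', 'B', 'C', 'D'], '1': ['trait']})
tree, anc_mtx = calculate_continuous_ancestral_states(
    tree, char_mtx, calc_std_err=True)
# anc_mtx rows: 4 tips followed by internal nodes; depth 0 = ML estimate,
#    depth 1 = standard error.
print(anc_mtx.shape)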
def calculate_phylo_beta_diversity_sorensen(pam, tree):
    """Calculates phylogenetic beta diversity for the sorensen index family.

    Args:
        pam (:obj:`Matrix`): A Lifemapper Matrix object with presence
            absence values.
        tree (:obj:`TreeWrapper`): A TreeWrapper object for a wrapped
            Dendropy phylogenetic tree.

    Returns:
        Phylogenetic beta diversity matrices (site by site)
            * beta_sim: ADD DESCRIPTION
            * phylo_beta_sim: ADD DESCRIPTION
            * beta_sne: ADD DESCRIPTION
            * phylo_beta_sne: ADD DESCRIPTION
            * beta_sor: ADD DESCRIPTION
            * phylo_beta_sor: ADD DESCRIPTION

    Todo:
        * Fill in method documentation
        * Fill in method
    """
    # Build a header dictionary; all of the returned matrices will have the
    #    same headers, site rows by site columns.
    # Note: This will differ from the R method because each site will be
    #    present in both the rows and the columns.
    mtx_headers = {
        '0': pam.get_row_headers(),  # Row headers
        '1': pam.get_row_headers()  # Column headers
    }
    num_sites = pam.shape[0]  # Get the number of sites in the PAM

    # Note: For ease of development, use these numpy arrays for the
    #    computations.  They will be wrapped into Matrix objects when they
    #    are returned from the function.
    beta_sim_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_sim_data = np.zeros((num_sites, num_sites), dtype=float)
    beta_sne_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_sne_data = np.zeros((num_sites, num_sites), dtype=float)
    beta_sor_data = np.zeros((num_sites, num_sites), dtype=float)
    phylo_beta_sor_data = np.zeros((num_sites, num_sites), dtype=float)

    # TODO: Compute phylo beta diversity for sorensen index family
    core_calc = core_PD_calc(pam, tree)

    # This loop will populate the arrays with beta diversity metrics.
    for my_row in range(core_calc.shape[0]):
        my_dat = core_calc[my_row, 0:4]
        my_dim = core_calc.get_row_headers()[my_row]
        phylo_beta_sim_data[my_dim[0], my_dim[1]] = (
            my_dat[0] / (my_dat[0] + my_dat[3]))
        phylo_beta_sim_data[my_dim[1], my_dim[0]] = phylo_beta_sim_data[
            my_dim[0], my_dim[1]]
        phylo_beta_sor_data[my_dim[0], my_dim[1]] = (
            my_dat[2] / ((2 * my_dat[3]) + my_dat[2]))
        phylo_beta_sor_data[my_dim[1], my_dim[0]] = phylo_beta_sor_data[
            my_dim[0], my_dim[1]]
        phylo_beta_sne_data[my_dim[0], my_dim[1]] = (
            (my_dat[1] - my_dat[0]) / ((2 * my_dat[3]) + my_dat[2])) * (
                my_dat[3] / (my_dat[0] + my_dat[3]))
        phylo_beta_sne_data[my_dim[1], my_dim[0]] = phylo_beta_sne_data[
            my_dim[0], my_dim[1]]

    # Get core metrics for simple beta diversity (no phylo component).
    # Array columns: 0 == shared; 1 == not shared; 2 == sum not shared;
    #    3 == max not shared; 4 == min not shared.
    core_beta = core_Beta_calc(pam, tree)

    # Populate arrays.
    beta_sim_data = core_beta[4] / (core_beta[4] + core_beta[0])
    beta_sor_data = core_beta[2] / ((2 * core_beta[0]) + core_beta[2])
    beta_sne_data = (
        (core_beta[3] - core_beta[4]) / ((2 * core_beta[0]) + core_beta[2])
    ) * (core_beta[0] / (core_beta[4] + core_beta[0]))

    # Set diagonals to 1 just to match formatting across scripts.
    for i in range(num_sites):
        phylo_beta_sim_data[i, i] = 1.
        phylo_beta_sne_data[i, i] = 1.
        phylo_beta_sor_data[i, i] = 1.
        beta_sim_data[i, i] = 1.
        beta_sne_data[i, i] = 1.
        beta_sor_data[i, i] = 1.

    return (
        Matrix(beta_sim_data, headers=mtx_headers),
        Matrix(phylo_beta_sim_data, headers=mtx_headers),
        Matrix(beta_sne_data, headers=mtx_headers),
        Matrix(phylo_beta_sne_data, headers=mtx_headers),
        Matrix(beta_sor_data, headers=mtx_headers),
        Matrix(phylo_beta_sor_data, headers=mtx_headers))
def main():
    """Main method for script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--out_stats_matrix_filename', type=str,
        help='Location to write statistics matrix.')
    parser.add_argument(
        'shapegrid_filename', type=str,
        help='File location of the shapegrid shapefile')
    parser.add_argument(
        'pam_filename', type=str,
        help='File location of the PAM matrix for statistics')
    parser.add_argument(
        'tree_filename', type=str,
        help='File location of the tree to use for statistics')
    parser.add_argument(
        'tree_schema', choices=['newick', 'nexus'],
        help='The tree schema')
    parser.add_argument(
        'out_geojson_filename', type=str,
        help='File location to write the output GeoJSON')
    parser.add_argument(
        'out_csv_filename', type=str,
        help='File location to write the output CSV')
    parser.add_argument(
        'out_matrix_filename', type=str,
        help='File location to write the output matrix')
    parser.add_argument(
        '--layer', nargs=2, action='append',
        help='File location of a layer followed by a label')
    args = parser.parse_args()

    # Load data
    pam = Matrix.load(args.pam_filename)
    tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema)

    # Encode layers
    encoded_layers = encode_environment_layers(
        args.shapegrid_filename, args.layer)

    # Calculate PAM statistics
    stats_mtx = calculate_tree_site_statistics(pam, tree)
    if args.out_stats_matrix_filename:
        stats_mtx.write(args.out_stats_matrix_filename)

    # Join encoded layers and PAM statistics
    mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx)

    # Generate GeoJSON
    geojson_data = create_geojson(args.shapegrid_filename, mtx)

    # Write GeoJSON
    with open(args.out_geojson_filename, 'w') as out_file:
        json.dump(geojson_data, out_file, indent=4)

    # Write matrix data.  Replace each row header with a WKT polygon for
    #    the site's cell, built from the site center and a fixed half-cell
    #    resolution.
    new_rh = []
    res = 0.5
    for _, x, y in mtx.get_row_headers():
        min_x = x - res
        max_x = x + res
        min_y = y - res
        max_y = y + res
        new_rh.append(
            '"POLYGON (({} {},{} {},{} {},{} {},{} {}))"'.format(
                min_x, max_y, max_x, max_y, max_x, min_y, min_x, min_y,
                min_x, max_y))
    mtx.write(args.out_matrix_filename)
    mtx.set_row_headers(new_rh)
    with open(args.out_csv_filename, 'w') as out_file:
        mtx.write_csv(out_file)