def test_spearmanr_full(self):
    """Compare the optimized Spearman Mantel statistics with a naive loop.

    Both computations are preceded by ``np.random.seed(0)`` so that they
    start from the same random state. The original statistic and each of
    the ``num_perms`` permuted statistics are compared with
    ``assertAlmostEqual``.
    """
    x = DistanceMatrix.read(get_data_path('dm2.txt'))
    y = DistanceMatrix.read(get_data_path('dm3.txt'))
    num_perms = 12

    np.random.seed(0)
    orig_stat_fast, permuted_stats_fast = _mantel_stats_spearman(
        x, y, num_perms)

    # compute the traditional way
    np.random.seed(0)
    x_flat = x.condensed_form()
    y_flat = y.condensed_form()
    orig_stat = spearmanr(x_flat, y_flat)[0]
    perm_gen = (spearmanr(x.permute(condensed=True), y_flat)[0]
                for _ in range(num_perms))
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases; the builtin gives the same dtype.
    permuted_stats = np.fromiter(perm_gen, float, count=num_perms)

    self.assertAlmostEqual(orig_stat_fast, orig_stat)
    for i in range(num_perms):
        self.assertAlmostEqual(permuted_stats_fast[i], permuted_stats[i])
def test_fsvd(self):
    """Exercise ``pcoa`` with ``method="fsvd"`` on PCoA_sample_data_3.

    Covers agreement with ``method="eigh"``, the ``inplace`` flag and the
    ``number_of_dimensions`` edge cases (0, too large, negative).
    """
    dm1, dm2, dm3 = (
        DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        for _ in range(3))
    compare_opts = {'ignore_directionality': True,
                    'ignore_method_names': True}

    # Test eigh vs. fsvd pcoa and inplace parameter
    reference = pcoa(dm1, method="eigh", number_of_dimensions=3,
                     inplace=False)
    fsvd_copy = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                     inplace=False)
    fsvd_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                        inplace=True)
    assert_ordination_results_equal(fsvd_copy, reference, **compare_opts)
    assert_ordination_results_equal(fsvd_copy, fsvd_inplace, **compare_opts)

    # Test number_of_dimensions edge cases: 0 is compared against asking
    # for as many dimensions as the matrix has rows.
    zero_dims = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                     inplace=False)
    all_dims = pcoa(dm3, method="fsvd",
                    number_of_dimensions=dm3.data.shape[0],
                    inplace=False)
    assert_ordination_results_equal(zero_dims, all_dims, **compare_opts)

    # Both solvers must reject an oversized and a negative dimension count.
    for method in ("fsvd", "eigh"):
        with self.assertRaises(ValueError):
            dim_too_large = dm1.data.shape[0] + 10
            pcoa(dm2, method=method, number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method=method, number_of_dimensions=-1)

    dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
    with self.assertWarnsRegex(RuntimeWarning,
                               "no value for number_of_dimensions"):
        pcoa(dm_big, method="fsvd", number_of_dimensions=0)
def test_fsvd_inplace(self):
    """Compare in-place ``eigh`` and ``fsvd`` PCoA on separate copies."""
    sample_fp = 'PCoA_sample_data_3'
    eigh_dm = DistanceMatrix.read(get_data_path(sample_fp))
    fsvd_dm = DistanceMatrix.read(get_data_path(sample_fp))

    # Each solver gets its own matrix because ``inplace=True`` is requested.
    shared_opts = {'number_of_dimensions': 3, 'inplace': True}
    expected = pcoa(eigh_dm, method="eigh", **shared_opts)
    observed = pcoa(fsvd_dm, method="fsvd", **shared_opts)

    assert_ordination_results_equal(observed, expected,
                                    ignore_directionality=True,
                                    ignore_method_names=True)
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Correlate a distance matrix with its expanded truth matrix.

    The truth matrix read from ``truthfile`` is passed through
    ``sample_matrix_to_runs(truthmat, reps)``. Both matrices must then carry
    the same ids (checked with ``assert``). Returns ``corrstat(truth, dist)``
    evaluated on the condensed forms taken in sorted-id order.
    '''
    distmat = DistanceMatrix.read(distfile)
    truthmat = sample_matrix_to_runs(DistanceMatrix.read(truthfile), reps)

    ids = sorted(distmat.ids)
    t_ids = sorted(truthmat.ids)
    assert ids == t_ids, (ids, t_ids)

    # Filtering by the same sorted id list aligns the two condensed vectors.
    dist = distmat.filter(ids).condensed_form()
    truth = truthmat.filter(ids).condensed_form()
    return corrstat(truth, dist)
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Return ``corrstat`` between the truth and computed condensed matrices.

    ``truthfile`` is read and expanded with ``sample_matrix_to_runs`` using
    ``reps``; ``distfile`` is read as is. An ``AssertionError`` carrying both
    id lists is raised when the sorted ids of the two matrices differ.
    '''
    computed = DistanceMatrix.read(distfile)
    expected = DistanceMatrix.read(truthfile)
    expected = sample_matrix_to_runs(expected, reps)

    computed_ids = sorted(computed.ids)
    expected_ids = sorted(expected.ids)
    assert computed_ids == expected_ids, (computed_ids, expected_ids)

    def _condensed(matrix):
        # Reorder to the sorted ids before flattening.
        return matrix.filter(computed_ids).condensed_form()

    dist = _condensed(computed)
    truth = _condensed(expected)
    return corrstat(truth, dist)
def test_confirm_betadispr_results(self):
    """Check ``permdisp`` on the moving pictures data for both tests."""
    dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
    metadata = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
    metadata.set_index('#SampleID', inplace=True)

    # Default test first, then the centroid variant.
    observed_median = permdisp(dm, metadata, column='BodySite')
    observed_centroid = permdisp(dm, metadata, column='BodySite',
                                 test='centroid')

    result_index = ['method name', 'test statistic name', 'sample size',
                    'number of groups', 'test statistic', 'p-value',
                    'number of permutations']

    def expected_series(f_value):
        # The two expectations differ only in the F-value.
        return pd.Series(
            data=['PERMDISP', 'F-value', 33, 4, f_value, 0.001, 999],
            index=result_index, dtype='object', name='PERMDISP results')

    expected_median = expected_series(10.1956)
    expected_centroid = expected_series(17.4242)

    self.assert_series_equal(expected_median, observed_median)
    self.assert_series_equal(expected_centroid, observed_centroid)
def test_simple(self):
    """Run ``pcoa`` with default arguments against stored expectations."""
    axis_labels = ['PC%d' % (i + 1) for i in range(9)]
    sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                  'PC.593', 'PC.355', 'PC.607', 'PC.634']

    eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
         0.16054235, 0.15017696, 0.12245775, 0.0],
        index=axis_labels)
    proportion_explained = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
         0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
        index=axis_labels)

    # Expected sample coordinates are stored in a text fixture.
    coordinates = np.loadtxt(get_data_path('exp_PCoAEigenResults_site'))
    samples = pd.DataFrame(coordinates, index=sample_ids,
                           columns=axis_labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=samples,
        proportion_explained=proportion_explained)

    dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    results = pcoa(dm)

    assert_ordination_results_equal(results, expected_results,
                                    ignore_directionality=True)
def test_simple(self):
    """Compare default ``pcoa`` output with hard-coded expected results."""
    labels = ['PC' + str(i) for i in range(1, 10)]

    expected_fields = {
        'short_method_name': 'PCoA',
        'long_method_name': 'Principal Coordinate Analysis',
        'eigvals': pd.Series(
            [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
             0.16054235, 0.15017696, 0.12245775, 0.0],
            index=labels),
        # Coordinates come from the text fixture, one row per sample id.
        'samples': pd.DataFrame(
            np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
            index=['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                   'PC.593', 'PC.355', 'PC.607', 'PC.634'],
            columns=labels),
        'proportion_explained': pd.Series(
            [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
             0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
            index=labels),
    }
    expected_results = OrdinationResults(**expected_fields)

    results = pcoa(DistanceMatrix.read(get_data_path('PCoA_sample_data_3')))

    assert_ordination_results_equal(results, expected_results,
                                    ignore_directionality=True)
def setUp(self):
    """Build the ordination and descriptor fixtures for the tests."""
    # Crawford dataset for unweighted UniFrac
    crawford_dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    self.ordination = pcoa(crawford_dm)

    # The descriptor table is indexed by its 'Taxon' column and then
    # transposed before being stored.
    descriptors_fp = get_data_path('PCoA_biplot_descriptors')
    taxon_table = pd.read_table(descriptors_fp, index_col='Taxon')
    self.descriptors = taxon_table.T
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    """Process every mapping column against beta or alpha diversity files.

    ``mappings`` (and ``alphas`` when used) are iterables of tab-separated
    files read with ``pd.read_csv`` as strings and indexed by '#SampleID'.
    When ``betas`` is truthy each entry is read with ``DistanceMatrix.read``
    and ``_process_column`` is dispatched through joblib with ``jobs``
    workers; otherwise the alpha files are loaded and processed
    sequentially.
    """
    def _read_table(path):
        return pd.read_csv(path, sep='\t', dtype=str, na_values=na_values)

    # As we can have multiple mapping, alpha or beta files, every file is
    # loaded into a dictionary keyed by the path it was read from.
    mappings = {path: _read_table(path) for path in mappings}
    for mapping in mappings.values():
        mapping.set_index('#SampleID', inplace=True)

    if betas:
        betas = {path: DistanceMatrix.read(path) for path in betas}

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(joblib.delayed(_process_column)(
                    bf, c, fname, finfo, alphas, betas, permutations)
                for bf, c, fname, finfo in _generate_betas(
                    betas, mappings, permutations, output, overwrite))
    else:
        alphas = {path: _read_table(path) for path in alphas}
        for alpha in alphas.values():
            alpha.set_index('#SampleID', inplace=True)

        for af, c, fname, finfo in _generate_alphas(alphas, mappings,
                                                    output, overwrite):
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    """Load the inputs and run ``_process_column`` for each generated task.

    Mapping files are always loaded. If ``betas`` is falsy the alpha files
    are loaded and handled one task at a time; otherwise the beta distance
    matrices are read and the tasks are run through a joblib ``Parallel``
    pool with ``n_jobs=jobs``.
    """
    read_opts = {'sep': '\t', 'dtype': str, 'na_values': na_values}

    # As we can have multiple mapping, alpha or beta files, each group of
    # files becomes a dictionary of objects keyed by file path.
    mappings = {f: pd.read_csv(f, **read_opts) for f in mappings}
    for mf in mappings.values():
        mf.set_index('#SampleID', inplace=True)

    if not betas:
        alphas = {f: pd.read_csv(f, **read_opts) for f in alphas}
        for alpha_df in alphas.values():
            alpha_df.set_index('#SampleID', inplace=True)

        alpha_tasks = _generate_alphas(alphas, mappings, output, overwrite)
        for af, c, fname, finfo in alpha_tasks:
            _process_column(af, c, fname, finfo, alphas, betas, permutations)
        return

    betas = {f: DistanceMatrix.read(f) for f in betas}
    with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
        worker = joblib.delayed(_process_column)
        par(worker(bf, c, fname, finfo, alphas, betas, permutations)
            for bf, c, fname, finfo in _generate_betas(
                betas, mappings, permutations, output, overwrite))
def test_confirm_betadispr_results(self):
    """Verify ``permdisp`` results for the default and centroid tests."""
    mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
    mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
    mp_mf.set_index('#SampleID', inplace=True)

    # Run both variants up front, default test before centroid.
    observed = [permdisp(mp_dm, mp_mf, column='BodySite', **extra)
                for extra in ({}, {'test': 'centroid'})]

    exp_ind = ['method name', 'test statistic name', 'sample size',
               'number of groups', 'test statistic', 'p-value',
               'number of permutations']
    # Expected rows, in the same order as ``observed``.
    expected = [
        pd.Series(data=row, index=exp_ind, dtype='object',
                  name='PERMDISP results')
        for row in (['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999],
                    ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999])]

    for exp, obs in zip(expected, observed):
        self.assert_series_equal(exp, obs)
def setUp(self):
    """Prepare ``self.ordination`` and ``self.descriptors``."""
    # Crawford dataset for unweighted UniFrac
    self.ordination = pcoa(
        DistanceMatrix.read(get_data_path('PCoA_sample_data_3')))

    # Read with 'Taxon' as the index, then transpose the table.
    self.descriptors = pd.read_table(
        get_data_path('PCoA_biplot_descriptors'),
        index_col='Taxon').transpose()
def get_spearmans(distfile, truth):
    """Return the Spearman correlation between ``truth`` and ``distfile``.

    The matrix read from ``distfile`` is reordered by its sorted ids before
    being condensed; ``truth`` is condensed as given.
    """
    distmat = DistanceMatrix.read(distfile)
    sorted_distmat = distmat.filter(sorted(distmat.ids))

    dist_vector = sorted_distmat.condensed_form()
    truth_vector = truth.condensed_form()

    return stats.spearmanr(truth_vector, dist_vector).correlation
def get_spearmans(distfile, truth):
    """Compute Spearman's correlation of a distance file against ``truth``.

    Only the matrix loaded from ``distfile`` is re-sorted by id; the caller
    is responsible for the ordering of ``truth``.
    """
    loaded = DistanceMatrix.read(distfile)
    id_order = sorted(loaded.ids)
    dist = loaded.filter(id_order).condensed_form()
    result = stats.spearmanr(truth.condensed_form(), dist)
    return result.correlation
def setup(self):
    """Create the ``PCoA`` fixture and the list of sample ids."""
    self.ordination = PCoA(
        DistanceMatrix.read(get_data_path('PCoA_sample_data_3')))
    # Ids are the 'PC.' prefix followed by each numeric suffix.
    self.ids = ['PC.%d' % suffix
                for suffix in (636, 635, 356, 481, 354, 593, 355, 607, 634)]
def test_unweighted_unifrac_qiime_tiny_test(self):
    """Compare every pairwise ``unweighted_unifrac`` with the stored matrix."""
    relative_fp = os.path.join('qiime-191-tt', 'unweighted_unifrac_dm.txt')
    expected = DistanceMatrix.read(get_data_path(relative_fp, 'data'))

    table = self.q_table
    sample_ids = table.columns
    # All ordered pairs are checked, including each sample with itself.
    for first in sample_ids:
        for second in sample_ids:
            observed = unweighted_unifrac(table[first], table[second],
                                          otu_ids=table.index,
                                          tree=self.q_tree)
            self.assertAlmostEqual(observed, expected[first, second])
def test_permutted(self):
    """Ensure fsvd ``pcoa`` runs on both a freshly read and a permuted matrix.

    Only the absence of an exception is checked; results are discarded.
    """
    fsvd_opts = {'method': "fsvd", 'number_of_dimensions': 3,
                 'inplace': False}

    original = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    # this should not throw
    pcoa(original, **fsvd_opts)

    # some operations, like permute, will change memory structure
    # we want to test that this does not break pcoa
    permuted = original.permute()
    # we just want to assure it does not throw
    pcoa(permuted, **fsvd_opts)
def test_io(self):
    """Round-trip ``self.dm_3x3`` through ``write`` and ``read``.

    The matrix is written to an in-memory text buffer, read back, and the
    result must compare equal to the original and be exactly a
    ``DistanceMatrix``.
    """
    # Very basic check that read/write public API is present and appears to
    # be functioning. Roundtrip from memory -> disk -> memory and ensure
    # results match.
    fh = StringIO()
    self.dm_3x3.write(fh)
    fh.seek(0)
    deserialized = DistanceMatrix.read(fh)
    self.assertEqual(deserialized, self.dm_3x3)
    # assertIs reports both types on failure, unlike assertTrue(a == b),
    # and still demands the exact class rather than a subclass.
    self.assertIs(type(deserialized), DistanceMatrix)
def test_varmat_larg(self):
    """Compare ``variation_matrix`` on a seeded 100x50 table with a fixture.

    The input is the absolute value of multivariate normal draws (mean 10,
    identity covariance) generated after ``np.random.seed(123)``. The result
    and the stored matrix are compared through their ``str`` forms.
    """
    np.random.seed(123)
    D = 50
    N = 100
    mean = np.ones(D) * 10
    cov = np.eye(D)
    n__ = np.random.multivariate_normal(mean, cov, size=N)
    # ``np.str`` was only an alias of the builtin ``str`` and no longer
    # exists in current NumPy releases; the builtin yields the same labels.
    X = pd.DataFrame(np.abs(n__), columns=np.arange(D).astype(str))
    res = variation_matrix(X)

    exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
    self.assertEqual(str(res), str(exp))
def test_perm_pearsonr_full(self):
    """Check ``mantel_perm_pearsonr_cy`` against ``_compute_perf_one``.

    The inputs are prepared from dm2.txt/dm3.txt: both matrices are
    flattened with ``squareform``, centered, and ``y`` is additionally
    divided by its norm. The SciPy and NumPy norms are first checked to
    agree. Every row of a fixed table of permutation orders is then
    evaluated by both implementations and compared.
    """
    x = DistanceMatrix.read(get_data_path('dm2.txt'))
    y = DistanceMatrix.read(get_data_path('dm3.txt'))
    x_data = x._data
    y_data = y._data

    x_flat = squareform(x_data, force='tovector', checks=False)
    y_flat = squareform(y_data, force='tovector', checks=False)

    xmean = x_flat.mean()
    ymean = y_flat.mean()

    xm = x_flat - xmean
    ym = y_flat - ymean

    normxm_la = scipy.linalg.norm(xm)
    normym_la = scipy.linalg.norm(ym)

    normxm = np.linalg.norm(xm)
    normym = np.linalg.norm(ym)

    self.assertAlmostEqual(normxm, normxm_la)
    self.assertAlmostEqual(normym, normym_la)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases; the builtin gives the same dtype.
    perm_order = np.asarray([[0, 2, 3, 4, 1, 5],
                             [4, 3, 2, 5, 0, 1],
                             [2, 5, 3, 1, 0, 4],
                             [3, 5, 4, 1, 2, 0],
                             [4, 3, 5, 2, 0, 1],
                             [4, 5, 1, 2, 0, 3],
                             [3, 5, 1, 0, 4, 2],
                             [4, 5, 3, 1, 2, 0],
                             [2, 1, 5, 4, 0, 3],
                             [4, 1, 0, 5, 2, 3],
                             [1, 2, 5, 4, 0, 3],
                             [5, 4, 0, 1, 3, 2],
                             [3, 0, 1, 5, 4, 2],
                             [5, 0, 2, 3, 1, 4]], dtype=int)

    ym_normalized = ym / normym

    # Output buffer that the Cython routine fills, one slot per permutation.
    permuted_stats = np.empty(len(perm_order), dtype=x_data.dtype)
    mantel_perm_pearsonr_cy(x_data, perm_order, xmean, normxm,
                            ym_normalized, permuted_stats)

    for i, order in enumerate(perm_order):
        exp_res = self._compute_perf_one(x_data, order, xmean, normxm,
                                         ym_normalized)
        self.assertAlmostEqual(permuted_stats[i], exp_res)
def test_varmat_larg(self):
    """Check ``variation_matrix`` on a seeded random table against a fixture.

    ``np.random.seed(123)`` fixes the multivariate normal draws (N=100 rows,
    D=50 columns, mean 10, identity covariance). Their absolute values form
    the input frame, and the string forms of the result and of
    'exp_varmat.txt' must match.
    """
    np.random.seed(123)
    D = 50
    N = 100
    mean = np.ones(D)*10
    cov = np.eye(D)
    # Column labels are the strings '0'..'49'. The builtin ``str`` replaces
    # the ``np.str`` alias, which no longer exists in current NumPy releases.
    X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov,
                                                          size=N)),
                     columns=np.arange(D).astype(str))
    res = variation_matrix(X)

    exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
    self.assertEqual(str(res), str(exp))
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                alpha_method):
    """Validate the arguments, load the inputs and process every column.

    Raises ``ValueError`` when ``mappings`` is empty, when neither or both
    of ``alphas``/``betas`` are given, or when ``output`` is None. The
    ``output`` directory is created if it is not an existing directory.
    Beta inputs are processed through a joblib ``Parallel`` pool with
    ``n_jobs=jobs``; alpha inputs are processed sequentially.
    """
    if not mappings:
        raise ValueError("You need to pass a mappings")
    if not (alphas or betas):
        raise ValueError("You need to pass either alphas or betas")
    if alphas and betas:
        raise ValueError("You can't pass both alphas and betas")
    if output is None:
        raise ValueError("You need to pass a output")
    if not isdir(output):
        mkdir(output)

    def _read_table(path):
        return pd.read_csv(path, sep='\t', dtype=str, na_values=NA_VALUES)

    # As we can have multiple mapping, alpha or beta files, each group is
    # loaded into a dictionary keyed by the path it was read from.
    mappings = {path: _read_table(path) for path in mappings}
    for mapping in mappings.values():
        mapping.set_index('#SampleID', inplace=True)

    if betas:
        betas = {path: DistanceMatrix.read(path) for path in betas}

        column_counts = [len(m.columns.values) for m in mappings.values()]
        print('maps: %d, betas: %d, cols: %s' % (len(mappings), len(betas),
                                                 column_counts))

        with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
            par(joblib.delayed(_process_column)(bf, c, fname, method)
                for bf, c, fname, method in _generate_betas(
                    betas, mappings, permutations, output))
    else:
        alphas = {path: _read_table(path) for path in alphas}
        for alpha in alphas.values():
            alpha.set_index('#SampleID', inplace=True)

        for af, c, fname, method in _generate_alphas(alphas, mappings,
                                                     output, alpha_method):
            _process_column(af, c, fname, method)
def _validate_distance_matrix(files, metadata, out_dir):
    """Validate a distance matrix artifact against the metadata ids.

    Returns ``(False, None, message)`` when the matrix holds ids missing
    from ``metadata``, else ``(True, [ArtifactInfo], "")``. ``out_dir`` is
    not used by this function.
    """
    # Magic number [0] -> there is only one plain text file which is
    # the distance matrix
    dm_fp = files['plain_text'][0]
    dm_ids = set(DistanceMatrix.read(dm_fp).ids)

    # Any id left after removing the metadata ids is unknown to the metadata.
    unknown_ids = dm_ids - set(metadata)
    if unknown_ids:
        error_msg = ("The distance matrix contain samples not present in "
                     "the metadata")
        return False, None, error_msg

    artifact = ArtifactInfo(None, 'distance_matrix', [(dm_fp, 'plain_text')])
    return True, [artifact], ""
def compute_beta_diversity(self):
    """Compute and cache beta diversity values

    This method calculates a beta diversity distance matrix and saves it to
    a folder for re-use. The matrices are calculated based on the full
    dataset so that any subsample drawn from the full dataset can be
    fetched from these precomputed matrices.

    For every metric in ``self._beta_metrics`` the matrix is read from
    ``roc-curves/<name>/cached-matrices/<metric>.full.txt`` when that file
    exists; otherwise it is computed with ``beta_diversity`` and written
    there. Results are stored in ``self._beta_diversity_matrices`` keyed by
    metric name.

    See Also
    --------
    Sculptor.compute_alpha_diversity
    """
    dir_fp = 'roc-curves/%s/cached-matrices/' % self.name
    os.makedirs(dir_fp, exist_ok=True)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases; the builtin gives the same dtype.
    X = self._original_bt.matrix_data.toarray().astype(int).T

    self._beta_diversity_matrices = {}
    for metric in self._beta_metrics:
        fp = os.path.join(dir_fp, metric + '.full.txt')

        if os.path.exists(fp):
            distance_matrix = DistanceMatrix.read(fp)
        else:
            # Only the UniFrac metrics are given the tree and the
            # observation ids.
            if metric in {'unweighted_unifrac', 'weighted_unifrac'}:
                kws = {
                    'tree': self.tree,
                    'otu_ids': self._original_bt.ids('observation')
                }
            else:
                kws = {}

            distance_matrix = beta_diversity(metric, X,
                                             self._original_bt.ids(), **kws)
            distance_matrix.write(fp)

        self._beta_diversity_matrices[metric] = distance_matrix
def _generate_distance_matrix_summary(files, metadata, out_dir):
    """Write an HTML summary with a clustered heatmap of a distance matrix.

    The matrix is read from the first 'plain_text' file in ``files``. Its
    rows and columns are ordered by the tips of an average-linkage tree,
    drawn as a heatmap, and embedded as a base64 PNG data URI in
    ``index.html`` under ``out_dir``, together with the matrix size and the
    min/max/mean/median of the condensed distances. ``metadata`` is not
    used by this function.

    Returns
    -------
    tuple
        ``(html_summary_fp, None)`` where ``html_summary_fp`` is the path
        of the written HTML file.
    """
    # Magic number [0] -> there is only one plain text file which is
    # the distance matrix
    dm = DistanceMatrix.read(files['plain_text'][0])
    data = dm.condensed_form()

    # Generate a heatmap with the distance matrix
    # The sorting in the heatmap is going to be based in hierarchical
    # clustering.
    tree = TreeNode.from_linkage_matrix(linkage(data, method='average'),
                                        id_list=dm.ids)
    # One dictionary lookup per tip instead of a linear ``list.index`` scan.
    # NOTE(review): assumes matrix ids are unique - confirm.
    position = {sample_id: idx for idx, sample_id in enumerate(dm.ids)}
    order = [position[n.name] for n in tree.tips()]

    # Plotting code adapted from skbio's DistanceMatrix.plot()
    fig, ax = plt.subplots()
    try:
        heatmap = ax.pcolormesh(dm.data[order][:, order])
        fig.colorbar(heatmap)
        ax.invert_yaxis()
        ax.set_title('Distance Matrix - hierarchical clustering')
        # Real booleans are required here: the string 'off' is truthy, so
        # on current Matplotlib it no longer hides ticks and labels.
        ax.tick_params(axis='both', which='both', bottom=False, top=False,
                       left=False, right=False, labelbottom=False,
                       labelleft=False)

        sc_plot = BytesIO()
        fig.savefig(sc_plot, format='png')
    finally:
        # pyplot keeps a reference to every figure it creates; release it.
        plt.close(fig)
    sc_plot.seek(0)

    uri = 'data:image/png;base64,' + quote(b64encode(sc_plot.getbuffer()))

    html_summary_fp = join(out_dir, 'index.html')
    with open(html_summary_fp, 'w') as f:
        f.write(DM_HTML % (dm.shape[0], data.min(), data.max(), data.mean(),
                           np.median(data), uri))

    return html_summary_fp, None
# -*- coding: utf-8 -*- """ Created on Wed Jul 20 14:18:58 2016 @author: virginiasaulnier """ from io import StringIO from skbio import DistanceMatrix from skbio import fisher_alpha dm_fh =StringIO("\ta\tb\tc\n" "a\t0.0\t0.5\t1.0\n" "b\t0.5\t0.0\t0.75\n" "c\t1.0\t0.75\t0.0\n") dm = DistanceMatrix.read(dm_fh) print(dm) my_pairs= StringIO("ac,gt,cg,gc,at,ta,gc,ta,tg") dm2 = DistanceMatrix.from_iterable(my_pairs,metric= fisher_alpha(),key=id)
def setup(self):
    """Load the sample distance matrix and set up the ``PCoA`` fixture."""
    matrix_fp = get_data_path('PCoA_sample_data_3')
    self.ordination = PCoA(DistanceMatrix.read(matrix_fp))
    # Whitespace-separated literal split into the list of sample ids.
    self.ids = ('PC.636 PC.635 PC.356 PC.481 PC.354 '
                'PC.593 PC.355 PC.607 PC.634').split()
def test_hamming_distance_matrix(self):
    """Compare ``hamming_distance_matrix`` output with the stored matrix."""
    alignment = parse_msa_file(self.input_a3m_fp)
    observed = hamming_distance_matrix(alignment)
    expected = DistanceMatrix.read(self.hamming_dm_fp)
    self.assertEqual(observed, expected)
all_param_grids = [defaults["LinearSVR"]] # ## Main benchmarking loop for reg_idx, (reg, name, grid) in enumerate(zip(regressors, names, all_param_grids)): if (run_defaults): print("Running default parameters for " + name) is_distmatrix = name in dm_set #Boolean switch for distance-matrix specific code blocks if is_distmatrix: ##### Use specific X and y for distance matrix benchmarking, not amplicon experiment object if name=="jensenshannon": md = exp.sample_metadata existing_dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+"aitchison"+'.txt') print("Computing Jensen-Shannon Distance Matrix") dm = DistanceMatrix(data=distance.pdist(exp.data.todense(), metric="jensenshannon"), ids=existing_dm.ids) else: dm = DistanceMatrix.read(dir_prefix+"/beta-q2/"+name+'.txt') md = exp.sample_metadata md = md.filter(dm.ids,axis='index') dm = dm.filter(md.index, strict=True) X_dist = dm.data y_dist = md[target] # Make directory for this regressor if it does not yet exist dir_name = dir_prefix +'/' +dir_prefix + '-' + name print(dir_name)
degapped_alignment_fn = "onekp_only_angios_degapped/{}.onlyangios.noshort.fasta".format( gene) #Read in sequence alignment and trim three ways: remove non angiosperms, remove gappy sites, remove gappy sequences if os.path.isfile(degapped_alignment_fn): angio_msa_nogap_noshort = TabularMSA.read(degapped_alignment_fn, constructor=DNA) sys.stderr.write("Read in degapped alignment: {}\n".format( angio_msa_nogap_noshort.shape)) else: angio_msa_nogap_noshort = get_reduced_alignment( "genes/{}/FNA2AA-upp-masked.fasta".format(gene), angio_1kp_ids) if os.path.isfile(distance_matrix_fn): p_dm = DistanceMatrix.read(distance_matrix_fn) p_dm_df = p_dm.to_data_frame() sys.stderr.write("Read in pre-determined distance matrix!\n") else: p_dm = DistanceMatrix.from_iterable(angio_msa_nogap_noshort, metric=p_distance, key="id") p_dm_df = p_dm.to_data_frame() p_dm_df.to_csv( "onekp_only_angios_pdistance/{}_angio_p_dm.csv".format(gene)) # Cluster sequences divergent_seqs_medoids = [] runs = {} best_run = len(p_dm_df)
def setUp(self):
    """Load the distance matrices, data frames and expected result tables."""
    def read_dm(name):
        return DistanceMatrix.read(get_data_path(name))

    def read_indexed_tsv(name):
        # Tab-separated file whose first column becomes the index.
        return pd.read_csv(get_data_path(name), sep='\t', index_col=0)

    # The test dataset used here is a subset of the Lauber et al. 2009
    # "88 Soils" dataset. It has been altered to exercise various aspects
    # of the code, including (but not limited to):
    #
    # - order of distance matrix IDs and IDs in data frame (metadata) are
    #   not exactly the same
    # - data frame has an extra sample that is not in the distance matrix
    # - this extra sample has non-numeric and missing values in some of its
    #   cells
    #
    # Additional variations of the distance matrix and data frame are used
    # to test different orderings of rows/columns, extra non-numeric data
    # frame columns, etc.
    #
    # This dataset is also useful because it is non-trivial in size (6
    # samples, 11 environment variables) and it includes positive/negative
    # floats and integers in the data frame.
    self.dm = read_dm('dm.txt')

    # Reordered rows and columns (i.e., different ID order). Still
    # conceptually the same distance matrix.
    self.dm_reordered = read_dm('dm_reordered.txt')

    self.df = read_indexed_tsv('df.txt')

    # Similar to the above data frame, except that it has an extra
    # non-numeric column, and some of the other rows and columns have been
    # reordered.
    self.df_extra_column = read_indexed_tsv('df_extra_column.txt')

    # All columns in the original data frame (these are all numeric
    # columns).
    self.cols = self.df.columns.tolist()

    # This second dataset is derived from vegan::bioenv's example dataset
    # (varespec and varechem). The original dataset includes a site x
    # species table (e.g., OTU table) and a data frame of environmental
    # variables. Since the bioenv function defined here accepts a distance
    # matrix, we use a Bray-Curtis distance matrix that is derived from the
    # site x species table (this matches what is done by vegan::bioenv when
    # provided an OTU table, using their default distance measure). The
    # data frame only includes the numeric environmental variables we're
    # interested in for these tests: log(N), P, K, Ca, pH, Al
    self.dm_vegan = read_dm('bioenv_dm_vegan.txt')
    # The first column is forced to str and then becomes the index.
    self.df_vegan = pd.read_csv(get_data_path('bioenv_df_vegan.txt'),
                                sep='\t', converters={0: str})
    self.df_vegan.set_index('#SampleID', inplace=True)

    # Load expected results.
    self.exp_results = read_indexed_tsv('exp_results.txt')
    self.exp_results_single_column = read_indexed_tsv(
        'exp_results_single_column.txt')
    self.exp_results_different_column_order = read_indexed_tsv(
        'exp_results_different_column_order.txt')
    self.exp_results_vegan = read_indexed_tsv(
        'bioenv_exp_results_vegan.txt')
# In[19]: # ENSURE METADATA TARGET IS TYPE INT exp.sample_metadata[target] = pd.to_numeric(exp.sample_metadata[target]) # In[ ]: for reg_idx, (reg, name, grid) in enumerate(zip(regressors, names, all_param_grids)): is_distmatrix = name in dm_set #Boolean switch for distance-matrix specific code blocks if is_distmatrix: ##### Use specific X and y for distance matrix benchmarking, not amplicon experiment object md = exp.sample_metadata dm = DistanceMatrix.read(distmatrix_fp[dataset] + name + '.txt') md = md.filter(dm.ids, axis='index') dm = dm.filter(md.index, strict=True) X_dist = dm.data y_dist = md[target] if (name == "PLSRegressor"): md = exp.sample_metadata X_dist = exp.data.toarray() y_dist = md[target] # Make directory for this regressor if it does not yet exist dir_name = dir_prefix + '/' + dir_prefix + '-' + name if not os.path.isdir(dir_name): os.mkdir(dir_name, mode=0o755)
def setUp(self):
    """Populate the fixtures: two datasets plus their expected results."""
    # Options shared by every table read with its first column as index.
    tsv_opts = {'sep': '\t', 'index_col': 0}

    # First dataset: a subset of the Lauber et al. 2009 "88 Soils" data,
    # altered to exercise various aspects of the code, including (but not
    # limited to):
    #
    # - distance matrix IDs and data frame (metadata) IDs are not in
    #   exactly the same order
    # - the data frame has an extra sample that is not in the distance
    #   matrix
    # - that extra sample has non-numeric and missing values in some of
    #   its cells
    #
    # Additional variations of the distance matrix and data frame are used
    # to test different orderings of rows/columns, extra non-numeric data
    # frame columns, etc.
    #
    # The dataset is non-trivial in size (6 samples, 11 environment
    # variables) and includes positive/negative floats and integers in
    # the data frame.
    self.dm = DistanceMatrix.read(get_data_path('dm.txt'))

    # Same distance matrix conceptually, with rows and columns reordered
    # (i.e., a different ID order).
    self.dm_reordered = DistanceMatrix.read(
        get_data_path('dm_reordered.txt'))

    self.df = pd.read_csv(get_data_path('df.txt'), **tsv_opts)

    # Like the data frame above, but with an extra non-numeric column and
    # some rows and columns reordered.
    self.df_extra_column = pd.read_csv(
        get_data_path('df_extra_column.txt'), **tsv_opts)

    # All columns in the original data frame (these are all numeric
    # columns).
    self.cols = self.df.columns.tolist()

    # Second dataset: derived from vegan::bioenv's example dataset
    # (varespec and varechem). The original includes a site x species
    # table (e.g., OTU table) and a data frame of environmental variables.
    # Since the bioenv function defined here accepts a distance matrix, a
    # Bray-Curtis distance matrix derived from the site x species table is
    # used (this matches what vegan::bioenv does when provided an OTU
    # table, using their default distance measure). The data frame only
    # includes the numeric environmental variables of interest for these
    # tests: log(N), P, K, Ca, pH, Al
    self.dm_vegan = DistanceMatrix.read(
        get_data_path('bioenv_dm_vegan.txt'))
    vegan_df = pd.read_csv(get_data_path('bioenv_df_vegan.txt'),
                           sep='\t', converters={0: str})
    vegan_df.set_index('#SampleID', inplace=True)
    self.df_vegan = vegan_df

    # Load expected results.
    self.exp_results = pd.read_csv(get_data_path('exp_results.txt'),
                                   **tsv_opts)
    self.exp_results_single_column = pd.read_csv(
        get_data_path('exp_results_single_column.txt'), **tsv_opts)
    self.exp_results_different_column_order = pd.read_csv(
        get_data_path('exp_results_different_column_order.txt'),
        **tsv_opts)
    self.exp_results_vegan = pd.read_csv(
        get_data_path('bioenv_exp_results_vegan.txt'), **tsv_opts)
index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']) md_dup = pd.DataFrame([(1, 'a', 0.11, 1), (1, 'a', 0.12, 2), (1, 'a', 0.13, 2), (2, 'a', 0.19, 1), (2, 'a', 0.18, 2), (2, 'a', 0.21, 3), (1, 'b', 0.14, 4), (1, 'b', 0.13, 5), (1, 'b', 0.14, 6), (2, 'b', 0.26, 4), (2, 'b', 0.27, 5), (2, 'b', 0.29, 6) ], columns=['Time', 'Group', 'Value', 'ind'], index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']) dm = DistanceMatrix.read(StringIO( "\t0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\n" "0\t0.0\t0.3\t1.0\t0.1\t0.1\t0.3\t0.4\t0.5\t0.6\t0.1\t0.2\t0.3\n" "1\t0.3\t0.0\t0.9\t0.2\t0.1\t0.4\t0.2\t0.6\t0.5\t0.2\t0.3\t0.4\n" "2\t1.0\t0.9\t0.0\t0.3\t0.1\t0.3\t0.3\t0.6\t0.6\t0.3\t0.3\t0.4\n" "3\t0.1\t0.2\t0.3\t0.0\t0.2\t0.3\t0.2\t0.5\t0.4\t0.4\t0.2\t0.3\n" "4\t0.1\t0.1\t0.1\t0.2\t0.0\t0.4\t0.3\t0.4\t0.7\t0.1\t0.5\t0.3\n" "5\t0.3\t0.4\t0.3\t0.3\t0.4\t0.0\t0.4\t0.3\t0.6\t0.2\t0.4\t0.2\n" "6\t0.4\t0.2\t0.3\t0.2\t0.3\t0.4\t0.0\t0.5\t0.9\t0.1\t0.3\t0.1\n" "7\t0.5\t0.6\t0.6\t0.5\t0.4\t0.3\t0.5\t0.0\t0.8\t0.1\t0.2\t0.3\n" "8\t0.6\t0.5\t0.6\t0.4\t0.7\t0.6\t0.9\t0.8\t0.0\t0.3\t0.5\t0.4\n" "9\t0.1\t0.2\t0.3\t0.4\t0.1\t0.2\t0.1\t0.1\t0.3\t0.0\t0.4\t0.5\n" "10\t0.2\t0.3\t0.3\t0.2\t0.5\t0.4\t0.3\t0.2\t0.5\t0.4\t0.0\t0.6\n" "11\t0.3\t0.4\t0.4\t0.3\t0.3\t0.2\t0.1\t0.3\t0.4\t0.5\t0.6\t0.0\n" )) groups = {'a': [1, 2, 3, 2, 3, 1.5, 2.5, 2.7, 3, 2, 1, 1.5], 'b': [3, 4, 5, 4.3, 3.4, 3.2, 3, 4.3, 4.9, 5, 3.2, 3.6]}
def test_hamming_distance_matrix(self):
    """Check the Hamming matrix of the parsed MSA against the fixture."""
    computed = hamming_distance_matrix(parse_msa_file(self.input_a3m_fp))
    reference = DistanceMatrix.read(self.hamming_dm_fp)
    self.assertEqual(computed, reference)
def setup(self):
    """Build ``self.ordination`` and the matching list of sample ids."""
    sample_dm = DistanceMatrix.read(get_data_path("PCoA_sample_data_3"))
    self.ordination = PCoA(sample_dm)

    # Every id shares the "PC." prefix; only the suffixes are listed.
    suffixes = ("636", "635", "356", "481", "354",
                "593", "355", "607", "634")
    self.ids = ["PC." + suffix for suffix in suffixes]
def load_sample_matrix_to_runs(samplematfile, reps=3):
    '''Read a sample-level truth distance matrix and expand it to runs.

    The matrix read from ``samplematfile`` is handed to
    ``sample_matrix_to_runs`` together with ``reps``, and that function's
    result is returned.
    '''
    return sample_matrix_to_runs(DistanceMatrix.read(samplematfile), reps)