def test_scaling2(self):
    """Compare ``rda(..., scaling=2)`` with stored reference results.

    The expected sample, feature, constraint and biplot scores are read
    from data files; eigenvalues and proportions explained are given
    inline.  Axis direction is ignored in the comparison.
    """
    scores = rda(self.Y, self.X, scaling=2)

    # Load data as computed with vegan 2.0-8
    vegan_features = pd.DataFrame(
        np.loadtxt(get_data_path('example2_species_scaling2_from_vegan')),
        index=self.feature_ids,
        columns=self.pc_ids)
    vegan_samples = pd.DataFrame(
        np.loadtxt(get_data_path('example2_site_scaling2_from_vegan')),
        index=self.sample_ids,
        columns=self.pc_ids)
    sample_constraints = pd.DataFrame(
        np.loadtxt(get_data_path('example2_sample_constraints_scaling2')),
        index=self.sample_ids,
        columns=self.pc_ids)

    # Label only as many axes as the biplot matrix actually has columns.
    mat = np.loadtxt(get_data_path('example2_biplot_scaling2'))
    cropped_pc_ids = self.pc_ids[:mat.shape[1]]
    biplot_scores = pd.DataFrame(mat,
                                 index=self.env_ids,
                                 columns=cropped_pc_ids)

    proportion_explained = pd.Series([0.44275783, 0.25614586,
                                      0.15280354, 0.10497021,
                                      0.02873375, 0.00987052,
                                      0.00471828],
                                     index=self.pc_ids)
    eigvals = pd.Series([25.897954, 14.982578, 8.937841, 6.139956,
                         1.680705, 0.577350, 0.275984],
                        index=self.pc_ids)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=sample_constraints,
        biplot_scores=biplot_scores,
        proportion_explained=proportion_explained,
        eigvals=eigvals)

    assert_ordination_results_equal(scores, exp,
                                    ignore_directionality=True,
                                    decimal=6)
def do_analysis(df, n_components=-1):
    """Fit a CCA model on the concrete data and print CCA/RDA summaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the response columns ``slump``, ``flow`` and
        ``compressive_strength`` and the seven predictor columns listed
        in ``X_cols`` below.
    n_components : int, optional
        Number of CCA components to fit.  With the default ``-1`` every
        value from 1 to the number of predictor columns is evaluated via
        ``optimise_cca_cv``, the MSE and R2 curves are plotted, and the
        component count with the lowest MSE is used.

    Returns
    -------
    None
        Results are printed: the CCA x-loadings table, the RDA result
        and its ``proportion_explained``.
    """
    Y_cols = ["slump", "flow", "compressive_strength"]
    X_cols = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    Y = df[Y_cols]
    X = df[X_cols]

    if n_components == -1:
        r2s = []
        mses = []
        rpds = []  # collected for the RPD plot, which is currently disabled
        xticks = np.arange(1, X.shape[1] + 1)
        for n_comp in xticks:
            # The first return value is not needed here.
            _, r2, mse, rpd = optimise_cca_cv(X, Y, n_comp)
            r2s.append(r2)
            mses.append(mse)
            rpds.append(rpd)
        plot_metrics(mses, 'MSE', 'min', xticks)
        plot_metrics(r2s, 'R2', 'max', xticks)
        # plot_metrics(rpds, 'RPD', 'max', xticks)
        # xticks starts at 1, so position k in mses means k + 1 components.
        n_components = np.argmin(mses) + 1

    cca = CCA(n_components=n_components, scale=True)
    cca.fit(X, Y)

    # Label loading rows with predictor names and columns as LV1..LVn.
    loadings = pd.DataFrame(cca.x_loadings_)
    row_labels = dict(enumerate(X_cols))
    col_labels = {i: 'LV' + str(i + 1) for i in range(n_components)}
    loadings.rename(index=row_labels, columns=col_labels, inplace=True)
    print(loadings)

    rda_res = rda(Y, X, scale_Y=True)
    print(rda_res)
    print(rda_res.proportion_explained)
def test_biplot_score(self):
    """Compare ``rda`` (``scale_Y=False``, ``scaling=1``) with vegan output.

    Only the first six axes of the computed sample, feature, constraint
    and biplot scores are compared, matching the column selection of the
    vegan fixtures.
    """
    rda_ = rda(y=self.Y, x=self.X, scale_Y=False, scaling=1)

    # Load data as computed with vegan 2.4-3:
    # library(vegan)
    # data(varechem)
    # data(varespec)
    # rda_ = rda(X=varespec, Y=varechem, scale=FALSE)
    # write.table(summary(rda_, scaling=1)$biplot,
    #             'vare_rda_biplot_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$sites,
    #             'vare_rda_sites_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$species,
    #             'vare_rda_species_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$constraints,
    #             'vare_rda_constraints_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[2, ],
    #             'vare_rda_propexpl_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[1, ],
    #             'vare_rda_eigvals_from_vegan.csv', sep=',')
    vegan_features = pd.read_csv(
        get_data_path('vare_rda_species_from_vegan.csv'))
    vegan_samples = pd.read_csv(
        get_data_path('vare_rda_sites_from_vegan.csv'))
    vegan_biplot = pd.read_csv(
        get_data_path('vare_rda_biplot_from_vegan.csv'))
    vegan_constraints = pd.read_csv(
        get_data_path('vare_rda_constraints_from_vegan.csv'))
    vegan_eigvals = pd.read_csv(
        get_data_path('vare_rda_eigvals_from_vegan.csv'))
    vegan_eigvals = pd.Series(vegan_eigvals.x.values,
                              index=rda_.eigvals.index)

    # scikit-bio returns singular values, whereas vegan returns eigenvalues
    vegan_eigvals = np.sqrt(vegan_eigvals * vegan_eigvals.shape[0])
    # The proportion-explained fixture is not loaded: the expected
    # proportions are recomputed from the converted values instead.
    vegan_propexpl = vegan_eigvals / vegan_eigvals.sum()

    # transform the output of rda_ to match column selection of vegan
    res_samples = rda_.samples.iloc[:, 0:6]
    res_features = rda_.features.iloc[:, 0:6]

    rda_ = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=res_samples,
        features=res_features,
        sample_constraints=rda_.sample_constraints.iloc[:, 0:6],
        biplot_scores=rda_.biplot_scores.iloc[:, 0:6],
        proportion_explained=rda_.proportion_explained,
        eigvals=rda_.eigvals)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=vegan_constraints,
        biplot_scores=vegan_biplot,
        proportion_explained=vegan_propexpl,
        eigvals=vegan_eigvals)

    # This scaling constant is required to make skbio comparable to vegan.
    # .iloc keeps the lookup positional however eigvals is indexed; integer
    # keys passed to Series[...] are otherwise resolved as labels first.
    scaling = (rda_.eigvals.iloc[0] / rda_.eigvals.iloc[:6])
    exp.biplot_scores *= scaling

    assert_ordination_results_equal(
        rda_, exp,
        ignore_directionality=False,
        decimal=6)
def test_biplot_score(self):
    """Compare ``rda`` (``scale_Y=False``, ``scaling=1``) with vegan output.

    Only the first six axes of the computed sample, feature, constraint
    and biplot scores are compared, matching the column selection of the
    vegan fixtures.  The sample scores are additionally checked directly
    with ``pdt.assert_frame_equal`` before the full comparison.
    """
    rda_ = rda(y=self.Y, x=self.X, scale_Y=False, scaling=1)

    # Load data as computed with vegan 2.4-3:
    # library(vegan)
    # data(varechem)
    # data(varespec)
    # rda_ = rda(X=varespec, Y=varechem, scale=FALSE)
    # write.table(summary(rda_, scaling=1)$biplot,
    #             'vare_rda_biplot_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$sites,
    #             'vare_rda_sites_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$species,
    #             'vare_rda_species_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$constraints,
    #             'vare_rda_constraints_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[2, ],
    #             'vare_rda_propexpl_from_vegan.csv', sep=',')
    # write.table(summary(rda_, scaling=1)$cont$importance[1, ],
    #             'vare_rda_eigvals_from_vegan.csv', sep=',')
    vegan_features = pd.read_csv(
        get_data_path('vare_rda_species_from_vegan.csv'))
    vegan_samples = pd.read_csv(
        get_data_path('vare_rda_sites_from_vegan.csv'))
    vegan_biplot = pd.read_csv(
        get_data_path('vare_rda_biplot_from_vegan.csv'))
    vegan_constraints = pd.read_csv(
        get_data_path('vare_rda_constraints_from_vegan.csv'))
    vegan_eigvals = pd.read_csv(
        get_data_path('vare_rda_eigvals_from_vegan.csv'))
    vegan_eigvals = pd.Series(
        vegan_eigvals.x.values, index=rda_.eigvals.index)

    # scikit-bio returns singular values, whereas vegan returns eigenvalues
    vegan_eigvals = np.sqrt(vegan_eigvals*vegan_eigvals.shape[0])
    # The proportion-explained fixture is not loaded: the expected
    # proportions are recomputed from the converted values instead.
    vegan_propexpl = vegan_eigvals/vegan_eigvals.sum()

    # transform the output of rda_ to match column selection of vegan
    res_samples = rda_.samples.iloc[:, 0:6]
    res_features = rda_.features.iloc[:, 0:6]

    rda_ = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=res_samples,
        features=res_features,
        sample_constraints=rda_.sample_constraints.iloc[:, 0:6],
        biplot_scores=rda_.biplot_scores.iloc[:, 0:6],
        proportion_explained=rda_.proportion_explained,
        eigvals=rda_.eigvals)

    exp = OrdinationResults(
        'RDA', 'Redundancy Analysis',
        samples=vegan_samples,
        features=vegan_features,
        sample_constraints=vegan_constraints,
        biplot_scores=vegan_biplot,
        proportion_explained=vegan_propexpl,
        eigvals=vegan_eigvals)

    pdt.assert_frame_equal(res_samples, vegan_samples)

    # This scaling constant is required to make skbio comparable to vegan.
    # .iloc keeps the lookup positional however eigvals is indexed; integer
    # keys passed to Series[...] are otherwise resolved as labels first.
    scaling = (rda_.eigvals.iloc[0] / rda_.eigvals.iloc[:6])
    exp.biplot_scores *= scaling

    assert_ordination_results_equal(
        rda_, exp,
        ignore_directionality=False,
        decimal=6)