# Assumed imports for the snippets below (each function comes from a
# different QIIME 2 plugin module). Plugin-internal helpers -- match,
# gradient_linkage, gradient_sort, mean_niche_estimator, match_ids,
# autocorr_from_dm, moran_plot, save_map, mapviz, _disable_feature_selection,
# _set_parameters_and_estimator, split_optimize_classify, _visualize,
# defaults, BEASTPosteriorDirFmt, _get_template -- are not reproduced here.
import os
import subprocess

import numpy as np
import pandas as pd
import scipy.spatial.distance
import skbio
from skbio import DistanceMatrix, TreeNode
from skbio.stats.composition import ilr_inv

import qiime2
from qiime2 import CategoricalMetadataColumn, NumericMetadataColumn

from classo import classo_problem
from gneiss.balances import balance_basis
from sklearn.metrics import mean_squared_error


def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
    weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
        Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not weighted:
        # `np.float` was removed from numpy; the builtin `float` is equivalent
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t

def distance_matrix(metadata: qiime2.NumericMetadataColumn) \
        -> skbio.DistanceMatrix:
    if metadata.has_missing_values():
        missing = metadata.get_ids(where_values_missing=True)
        raise ValueError(
            "Encountered missing value(s) in the metadata column. Computing "
            "a distance matrix from missing values is not supported. IDs "
            "with missing values: %s" % ', '.join(sorted(missing)))

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    series = metadata.to_series()
    distances = scipy.spatial.distance.pdist(
        series.values[:, np.newaxis], metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)

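# Usage sketch (not part of the original module): build a small numeric
# metadata column and compute its pairwise euclidean distance matrix with
# distance_matrix() above. The sample IDs, column name, and values are
# invented for illustration.
_index = pd.Index(['sample-1', 'sample-2', 'sample-3'], name='id')
_ph = qiime2.NumericMetadataColumn(
    pd.Series([1.0, 2.5, 7.0], index=_index, name='ph'))
_dm = distance_matrix(_ph)
print(_dm)  # 3 x 3 symmetric matrix of absolute pH differences
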
def autocorr(output_dir: str,
             distance_matrix: DistanceMatrix,
             metadata: qiime2.NumericMetadataColumn,
             permutations: int = 999,
             two_tailed: bool = True,
             transformation: str = 'R',
             intersect_ids: bool = False) -> None:
    # match ids; the metadata can be a superset of the distance matrix ids
    metadata = metadata.to_series()
    metadata, distance_matrix = match_ids(metadata, distance_matrix,
                                          intersect_ids=intersect_ids)

    # compute Moran's I and Geary's C
    results, weights = autocorr_from_dm(metadata, distance_matrix,
                                        permutations=permutations,
                                        two_tailed=two_tailed,
                                        transformation=transformation)
    mplot = moran_plot(metadata, weights, transformation)

    # visualize
    save_map(mplot, output_dir)
    mapviz(output_dir, results=results, title='Autocorrelation statistics')

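# Usage sketch (illustrative only, assuming the plugin helpers match_ids,
# autocorr_from_dm, moran_plot, save_map, and mapviz are importable): run the
# autocorrelation visualizer on a small euclidean distance matrix. The IDs,
# coordinates, and metadata values below are invented.
import tempfile

_ids = ['s1', 's2', 's3', 's4']
_coords = np.array([0.0, 1.0, 2.0, 4.0])
_dm = skbio.DistanceMatrix(np.abs(_coords[:, None] - _coords[None, :]),
                           ids=_ids)
_depth = qiime2.NumericMetadataColumn(
    pd.Series([0.1, 0.4, 0.5, 0.9],
              index=pd.Index(_ids, name='id'), name='depth'))
with tempfile.TemporaryDirectory() as _out:
    autocorr(_out, _dm, _depth, permutations=99)
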
def regress_samples(output_dir: str, table: pd.DataFrame,
                    metadata: qiime2.NumericMetadataColumn,
                    test_size: float = defaults['test_size'],
                    step: float = defaults['step'],
                    cv: int = defaults['cv'], random_state: int = None,
                    n_jobs: int = defaults['n_jobs'],
                    n_estimators: int = defaults['n_estimators'],
                    estimator: str = 'RandomForestRegressor',
                    optimize_feature_selection: bool = False,
                    stratify: str = False,
                    parameter_tuning: bool = False) -> None:

    # extract column name from NumericMetadataColumn
    column = metadata.to_series().name

    # disable feature selection for unsupported estimators
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # specify parameters and distributions to sample from for parameter tuning
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator, table, metadata, column, n_estimators, n_jobs, cv,
        random_state, parameter_tuning, classification=True)

    estimator, cm, accuracy, importances = split_optimize_classify(
        table, metadata, column, estimator, output_dir,
        test_size=test_size, step=step, cv=cv, random_state=random_state,
        n_jobs=n_jobs, optimize_feature_selection=optimize_feature_selection,
        parameter_tuning=parameter_tuning, param_dist=param_dist,
        calc_feature_importance=calc_feature_importance,
        scoring=mean_squared_error, stratify=stratify, classification=False)

    _visualize(output_dir, estimator, cm, accuracy, importances,
               optimize_feature_selection, title='regression predictions')

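# Usage sketch (illustrative only, assuming the q2-sample-classifier helpers
# referenced above are importable): fit the default RandomForestRegressor on
# a synthetic feature table against a synthetic numeric target.
import tempfile

_rng = np.random.RandomState(0)
_index = pd.Index(['s%d' % i for i in range(20)], name='id')
_table = pd.DataFrame(_rng.poisson(5, size=(20, 10)), index=_index,
                      columns=['f%d' % i for i in range(10)])
_target = qiime2.NumericMetadataColumn(
    pd.Series(_rng.normal(size=20), index=_index, name='target'))
with tempfile.TemporaryDirectory() as _out:
    regress_samples(_out, _table, _target, cv=3)
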
def setUp(self):
    self.results = "results"
    if not os.path.exists(self.results):
        os.mkdir(self.results)
    self.balances = pd.DataFrame(
        {'a': [-2, -1, 0, 1, 2],
         'b': [-2, 0, 0, 0, 0]},
        index=['a1', 'a2', 'a3', 'a4', 'a5'])
    self.tree = TreeNode.read([r'((k, q)d, ((x, y)a, z)b)c;'])
    self.taxonomy = pd.DataFrame(
        [['foo;barf;a;b;c;d;e', 1],
         ['foo;bark;f;g;h;i;j', 1],
         ['foo;bark;f;g;h;w;j', 1],
         ['nom;tu;k;l;m;n;o', 0.9],
         ['nom;tu;k;l;m;t;o', 0.9]],
        columns=['Taxon', 'Confidence'],
        index=['x', 'y', 'z', 'k', 'q'])

    # note: this reassignment supersedes the 5-sample `self.balances` above
    self.balances = pd.DataFrame(
        [[1, 2, 3, 4, 5, 6, 7],
         [-3.1, -2.9, -3, 3, 2.9, 3.2, 3.1],
         [1, 1, 1, 1, 1, 1, 1],
         [3, 2, 1, 0, -1, -2, -3]],
        index=['d', 'a', 'b', 'c'],
        columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7']).T
    basis, _ = balance_basis(self.tree)
    self.table = pd.DataFrame(
        ilr_inv(self.balances, basis),
        columns=['x', 'y', 'z', 'k', 'q'],
        index=['s1', 's2', 's3', 's4', 's5', 's6', 's7'])

    index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'], name='id')
    self.categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'a', 'b', 'b', 'b', 'b'],
                  index=index, name='categorical'))
    self.multi_categorical = CategoricalMetadataColumn(
        pd.Series(['a', 'a', 'c', 'b', 'b', 'b', 'c'],
                  index=index, name='multi_categorical'))
    self.partial_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1', '2', '2', '2', 'a'],
                  index=index, name='multi_categorical'))
    self.full_numerical_categorical = CategoricalMetadataColumn(
        pd.Series(['1', '1', '1.0', '2', '2', '2.0', '3'],
                  index=index, name='numerical_categorical'))
    self.continuous = NumericMetadataColumn(
        pd.Series(np.arange(7), index=index, name='continuous'))

def gradient_clustering(table: pd.DataFrame,
                        gradient: NumericMetadataColumn,
                        ignore_missing_samples: bool = False,
                        weighted: bool = True) -> skbio.TreeNode:
    """ Builds a tree for features based on a gradient.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where rows are samples and columns are features.
    gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
    ignore_missing_samples : bool
        Whether to except or ignore when there are samples present in the
        table that are not present in the gradient metadata.
    weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.

    Returns
    -------
    skbio.TreeNode
        Represents the partitioning of features with respect to the gradient.
    """
    c = gradient.to_series()
    if not ignore_missing_samples:
        difference = set(table.index) - set(c.index)
        if difference:
            raise KeyError("There are samples present in the table not "
                           "present in the gradient metadata column. Override "
                           "this error by using the `ignore_missing_samples` "
                           "argument. Offending samples: %r"
                           % ', '.join(sorted([str(i) for i in difference])))
    if not weighted:
        table = (table > 0).astype(float)
    table, c = match(table, c)
    t = gradient_linkage(table, c, method='average')
    mean_g = mean_niche_estimator(table, c)
    mean_g = pd.Series(mean_g, index=table.columns)
    mean_g = mean_g.sort_values()
    t = gradient_sort(t, mean_g)
    return t

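# Usage sketch (illustrative only, assuming the gneiss helpers match,
# gradient_linkage, mean_niche_estimator, and gradient_sort are importable):
# cluster three features along an invented pH gradient. The returned
# skbio.TreeNode groups features by their estimated optima on the gradient.
_index = pd.Index(['s1', 's2', 's3', 's4'], name='id')
_table = pd.DataFrame({'f1': [10, 5, 1, 0],
                       'f2': [0, 3, 6, 3],
                       'f3': [0, 0, 2, 9]}, index=_index)
_ph = qiime2.NumericMetadataColumn(
    pd.Series([4.0, 5.5, 7.0, 8.5], index=_index, name='ph'))
_tree = gradient_clustering(_table, _ph)
print(_tree.ascii_art())
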
def regress(features: np.ndarray,
            y: qiime2.NumericMetadataColumn,
            c: np.ndarray = None,
            # PATH parameters:
            path: bool = True,
            path_numerical_method: str = 'not specified',
            path_n_active: int = 0,
            path_lambdas: list = None,
            path_nlam_log: int = 40,
            path_lamin_log: float = 1e-2,
            # CV parameters:
            cv: bool = True,
            cv_numerical_method: str = 'not specified',
            cv_seed: int = 1,
            cv_lambdas: list = None,  # TODO
            cv_one_se: bool = True,
            cv_subsets: int = 5,
            # StabSel parameters:
            stabsel: bool = True,
            stabsel_numerical_method: str = 'not specified',
            stabsel_seed: int = None,  # TODO: for now it can be a bool
            stabsel_lam: float = -1.0,  # if negative, it means 'theoretical'
            stabsel_true_lam: bool = True,
            stabsel_method: str = 'first',
            stabsel_b: int = 50,
            stabsel_q: int = 10,
            stabsel_percent_ns: float = 0.5,
            stabsel_lamin: float = 1e-2,
            stabsel_threshold: float = 0.7,
            # may be unneeded here, but needed for visualisation:
            stabsel_threshold_label: float = 0.4,
            # LAMfixed parameters:
            lamfixed: bool = True,
            lamfixed_numerical_method: str = 'not specified',
            lamfixed_lam: float = -1.0,  # if negative, it means 'theoretical'
            lamfixed_true_lam: bool = True,
            # Formulation parameters
            concomitant: bool = True,
            huber: bool = False,
            rho: float = 1.345,
            rescale: bool = False) -> classo_problem:

    y = y.to_series().to_numpy()

    problem = classo_problem(features, y, C=c, rescale=rescale)
    problem.formulation.huber = huber
    problem.formulation.concomitant = concomitant
    problem.formulation.rho = rho

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        if path_lambdas is None:
            # log-spaced grid from 1 down toward path_lamin_log
            param.lambdas = np.array([
                10 ** (np.log10(path_lamin_log) * float(i) / path_nlam_log)
                for i in range(0, path_nlam_log)])
        else:
            param.lambdas = path_lambdas

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        if cv_lambdas is None:
            param.lambdas = np.linspace(1., 1e-3, 500)
        else:
            param.lambdas = cv_lambdas

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        if stabsel_lam > 0.:
            param.lam = stabsel_lam
        else:
            param.lam = 'theoretical'

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        if lamfixed_lam > 0.:
            param.lam = lamfixed_lam
        else:
            param.lam = 'theoretical'

    problem.solve()
    return problem

def regress(features: pd.DataFrame,
            y: qiime2.NumericMetadataColumn,
            c: np.ndarray = None,
            weights: np.ndarray = None,
            do_yshift: bool = False,
            # taxa: skbio.TreeNode = None,
            # PATH parameters:
            path: bool = True,
            path_numerical_method: str = "not specified",
            path_n_active: int = 0,
            path_nlam_log: int = 40,
            path_lamin_log: float = 1e-2,
            # CV parameters:
            cv: bool = True,
            cv_numerical_method: str = "not specified",
            cv_seed: int = 1,
            cv_one_se: bool = True,
            cv_subsets: int = 5,
            cv_nlam: int = 100,
            cv_lamin: float = 1e-3,
            cv_logscale: bool = True,
            # StabSel parameters:
            stabsel: bool = True,
            stabsel_numerical_method: str = "not specified",
            stabsel_seed: int = None,  # TODO: for now it can be a bool
            stabsel_lam: float = -1.0,  # if negative, it means 'theoretical'
            stabsel_true_lam: bool = True,
            stabsel_method: str = "first",
            stabsel_b: int = 50,
            stabsel_q: int = 10,
            stabsel_percent_ns: float = 0.5,
            stabsel_lamin: float = 1e-2,
            stabsel_threshold: float = 0.7,
            # may be unneeded here, but needed for visualisation:
            stabsel_threshold_label: float = 0.4,
            # LAMfixed parameters:
            lamfixed: bool = True,
            lamfixed_numerical_method: str = "not specified",
            lamfixed_lam: float = -1.0,  # if negative, it means 'theoretical'
            lamfixed_true_lam: bool = True,
            # Formulation parameters
            concomitant: bool = True,
            huber: bool = False,
            rho: float = 1.345,
            intercept: bool = True,
            ) -> classo_problem:

    complete_y = y.to_series()
    complete_y = complete_y[~complete_y.isna()]

    # align the feature table and the response on their common set of ids
    features, pdY = features.align(y.to_series(), join="inner", axis=0)

    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y".format(label_missing))
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]
    print(Y.shape, X.shape)

    if do_yshift:
        Y = Y - np.mean(Y)

    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    problem.formulation.huber = huber
    problem.formulation.concomitant = concomitant
    problem.formulation.rho = rho
    problem.formulation.intercept = intercept

    d = X.shape[1]
    if weights is not None:
        if len(weights) < d:
            # pad the feature weights with ones if too few were given
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        if stabsel_lam > 0.0:
            param.lam = stabsel_lam
        else:
            param.lam = "theoretical"

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        if lamfixed_lam > 0.0:
            param.lam = lamfixed_lam
        else:
            param.lam = "theoretical"

    print("start solve!")
    problem.solve()
    print("finished solve!")

    problem.data.complete_y = complete_y.values
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels
    return problem

def gtr_single_partition(alignment: qiime2.Metadata,
                         time: qiime2.NumericMetadataColumn,
                         n_generations: int,
                         sample_every: int,
                         time_uncertainty: qiime2.NumericMetadataColumn
                         = None,
                         base_freq: str = "estimated",
                         site_gamma: int = 4,
                         site_invariant: bool = True,
                         clock: str = 'ucln',
                         coalescent_model: str = 'skygrid',
                         skygrid_intervals: int = None,
                         skygrid_duration: float = None,
                         print_every: int = None,
                         use_gpu: bool = False,
                         n_threads: int = 1) -> BEASTPosteriorDirFmt:
    if coalescent_model == 'skygrid':
        if skygrid_duration is None or skygrid_intervals is None:
            raise ValueError(
                "The 'skygrid' coalescent model requires both "
                "`skygrid_intervals` and `skygrid_duration` to be set.")

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("`n_threads` must be 1 when `use_gpu` is True.")
        beast_call += ['-beagle_GPU', '-beagle_cuda',
                       '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances', str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())
    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into the control file
    seq_series = alignment.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        # no uncertainty provided: use an all-None series with the same ids
        uncertainty_series = time_series.copy()
        uncertainty_series[:] = None

    samples_df = pd.concat([seq_series, time_series, uncertainty_series],
                           axis='columns', join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq', 'time', 'time_uncertainty']
    # `pd.np` was removed from pandas; use numpy directly
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples,
                           base_freq=base_freq,
                           site_gamma=site_gamma,
                           site_invariant=site_invariant,
                           clock=clock,
                           coalescent_model=coalescent_model,
                           skygrid_duration=skygrid_duration,
                           skygrid_intervals=skygrid_intervals)

    template = _get_template("gtr_single_partition.xml")
    template.stream(**template_kwargs).dump(control_file)
    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result

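# Standalone sketch of the sample-alignment step used by the BEAST wrappers
# above and below: pd.concat(..., axis='columns', join='inner') keeps only
# the ids present in every series, so samples lacking a sequence or a
# collection time are dropped before templating. The ids and values below
# are invented.
_seqs = pd.Series({'a': 'ACGT', 'b': 'ACGA', 'c': 'ACGC'})
_times = pd.Series({'a': 2019.1, 'b': 2019.5})
_merged = pd.concat([_seqs, _times], axis='columns', join='inner')
_merged.columns = ['seq', 'time']
print(_merged)  # only ids 'a' and 'b' survive the inner join
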
def site_heterogeneous_hky(
        coding_regions: qiime2.Metadata,
        noncoding_regions: qiime2.Metadata,
        time: qiime2.NumericMetadataColumn,
        n_generations: int,
        sample_every: int,
        print_every: int = None,
        time_uncertainty: qiime2.NumericMetadataColumn = None,
        use_gpu: bool = False,
        n_threads: int = 1) -> BEASTPosteriorDirFmt:
    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("`n_threads` must be 1 when `use_gpu` is True.")
        beast_call += ['-beagle_GPU', '-beagle_cuda',
                       '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances', str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())
    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into the control file
    orf_series = coding_regions.get_column('Sequence').to_series()
    nc_series = noncoding_regions.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        # guard against the default of None (mirrors gtr_single_partition);
        # calling .to_series() on None would raise AttributeError
        uncertainty_series = time_series.copy()
        uncertainty_series[:] = None

    samples_df = pd.concat(
        [orf_series, nc_series, time_series, uncertainty_series],
        axis='columns', join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq_orf', 'seq_nc', 'time', 'time_uncertainty']
    # `pd.np` was removed from pandas; use numpy directly
    samples_df = samples_df.replace({np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples)

    template = _get_template("orf_and_nc.xml")
    template.stream(**template_kwargs).dump(control_file)
    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result