def test_type(self):
    """Exercise check.argument_type with a matching object, an allowed None, and a mismatch."""
    # This test case instance is itself a unittest.TestCase
    matches = check.argument_type(self, unittest.TestCase)
    self.assertTrue(matches)

    # None is passed together with allow_none=True
    none_ok = check.argument_type(None, unittest.TestCase, allow_none=True)
    self.assertTrue(none_ok)

    # A string is not a TestCase; the call is expected to raise ValueError
    with self.assertRaises(ValueError):
        self.assertTrue(check.argument_type("0", unittest.TestCase))
def set_gold_standard_and_priors(self):
    """
    Load the priors into priors_data and obtain the gold standard

    The gold standard is either split out of the priors (when split_priors_for_gold_standard is set)
    or read from gold_standard_file. It is then optionally resampled for crossvalidation.
    Finally both axes of the priors are checked for duplicated labels; duplicates are only
    reported through the debug printer and do not stop execution.
    """
    self.priors_data = self.input_dataframe(self.priors_file)

    if not self.split_priors_for_gold_standard:
        self.gold_standard = self.input_dataframe(self.gold_standard_file)
    else:
        self.split_priors_into_gold_standard()

    if self.split_gold_standard_for_crossvalidation:
        self.cross_validate_gold_standard()

    # (attribute of priors_data to check, headline to print when the check fails)
    duplicate_reports = (
        ("index", "Duplicate gene(s) in prior index"),
        ("columns", "Duplicate tf(s) in prior index"),
    )

    for axis_attr, headline in duplicate_reports:
        try:
            check.index_values_unique(getattr(self.priors_data, axis_attr))
        except ValueError as dup_err:
            utils.Debug.vprint(headline, level=0)
            utils.Debug.vprint(str(dup_err), level=0)
def _harmonize_paths(self):
    """
    Make the crossvalidation baseline paths and the workflow paths agree

    If _baseline is set, copy it to the workflow
    If _baseline is not set, copy to it from the workflow

    When check.argument_subpath rejects the workflow output path relative to the baseline
    output path, a warning is issued and the workflow output path is reset to the baseline.

    :raises ValueError: If neither the crossvalidation object nor the workflow has an output path,
        or if neither has an input path
    """

    # Output path: fill in whichever side is missing
    if self._baseline_output_dir is None and self.workflow.output_dir is None:
        raise ValueError("No output path has been provided to either crossvalidation or workflow")
    elif self._baseline_output_dir is None:
        self._baseline_output_dir = self.workflow.output_dir
    elif self.workflow.output_dir is None:
        self.workflow.output_dir = self._baseline_output_dir

    try:
        check.argument_subpath(self.workflow.output_dir, self._baseline_output_dir)
    except ValueError:
        # {p} is the path the workflow currently has; {a} is the baseline it is being reset to
        warnings.warn(
            "Workflow output path is {p}; resetting to {a}".format(
                p=self.workflow.output_dir,
                a=self._baseline_output_dir))
        self.workflow.output_dir = self._baseline_output_dir

    # Input path: fill in whichever side is missing
    if self._baseline_input_dir is None and self.workflow.input_dir is None:
        raise ValueError("No input path has been provided to either crossvalidation or workflow")

    if self._baseline_input_dir is None:
        self._baseline_input_dir = self.workflow.input_dir

    if self.workflow.input_dir is None:
        self.workflow.input_dir = self._baseline_input_dir
def _covariance_by_task(X, Y):
    """
    Build the per-task covariance terms used by the covariance update for an OLS fit

    cov_C holds transpose(Y) * X for each task
    cov_D holds transpose(X) * X for each task

    Reference: Friedman, Hastie, Tibshirani, 2010 in Journal of Statistical Software
    Regularization Paths for Generalized Linear Models via Coordinate Descent

    :param X: list(np.ndarray [N x K]) [T]
        Design values for each task. All tasks must have the same number of columns (K).
    :param Y: list(np.ndarray [N x 1]) [T]
        Response values for each task
    :return cov_C, cov_D: np.ndarray [T x K], np.ndarray [T x K x K]
        Predictor-to-response terms by task
        Predictor-to-predictor terms by task
    """

    assert check.argument_type(X, list)
    assert check.argument_type(Y, list)
    assert len(X) == len(Y)

    # Every task has to carry the same number of feature columns
    feature_counts = [design.shape[1] for design in X]
    assert max(feature_counts) == min(feature_counts)

    n_tasks = len(X)
    n_features = max(feature_counts)

    # Output arrays, filled one task at a time
    cov_C = np.zeros((n_tasks, n_features))
    cov_D = np.zeros((n_tasks, n_features, n_features))

    for task_id, (design, response) in enumerate(zip(X, Y)):
        cov_C[task_id] = np.dot(response.transpose(), design)  # yTx
        cov_D[task_id] = np.dot(design.transpose(), design)  # xTx

    return cov_C, cov_D
def summarize_network(self, output_dir, gold_standard, priors):
    """
    Score the rescaled betas against the gold standard, build the network table, and wrap it in a result object

    :param output_dir: str
        Path to write files into. Nothing is written if this is None (or if self.write_results is falsy).
    :param gold_standard: pd.DataFrame [G x K]
        Gold standard to test the network against
    :param priors: pd.DataFrame [G x K]
        Prior data
    :return result: InferelatorResult
        The object produced by self.result_object
    """

    assert check.argument_path(output_dir, allow_none=True)
    assert check.argument_type(gold_standard, pd.DataFrame)
    assert check.argument_type(priors, pd.DataFrame)

    # Scoring object built from the rescaled betas
    scorer = self.metric(self.rescaled_betas, gold_standard, filter_method=self.filter_method)

    # Only the threshold, the sign, and the median are used further down
    beta_threshold, beta_sign, _ = self.threshold_and_summarize(self.betas, self.threshold)
    _, rescaled_median = self.mean_and_median(self.rescaled_betas)

    extra_cols = {
        BETA_SIGN_COLUMN: beta_sign,
        MEDIAN_EXPLAIN_VAR_COLUMN: rescaled_median,
    }

    metric_name, metric_score = scorer.score()
    score_line = "Model {metric}:\t{score}".format(metric=metric_name, score=metric_score)
    utils.Debug.vprint(score_line, level=0)

    # Network edge table
    network_data = self.process_network(scorer, priors, beta_threshold=beta_threshold, extra_columns=extra_cols)

    # Result object; it writes its own output files when asked to
    result = self.result_object(network_data, beta_threshold, scorer.all_confidences, scorer)

    should_write = self.write_results and output_dir is not None
    if should_write:
        result.write_result_files(output_dir)

    return result
def elastic_net(X, Y, params):
    """
    Fit an ElasticNetCV model for one response and refit the selected predictors

    :param X: np.ndarray [K x N]
        Predictor values; transposed to [N x K] before fitting
    :param Y: np.ndarray [1 x N]
        Response values; flattened to [N, ] before fitting
    :param params: dict
        Keyword arguments passed to ElasticNetCV
    :return: dict
        Keys are pp (which predictors are in the model), betas, and betas_resc
    """

    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(Y, np.ndarray)

    n_predictors, _ = X.shape
    samples_by_predictors = X.T  # [N, K]
    Y = Y.flatten()  # [N, ]

    fitted = ElasticNetCV(**params).fit(samples_by_predictors, Y)

    # Zero out small coefficients (this edits the model's coefficient array in place)
    coefs = fitted.coef_
    coefs[np.abs(coefs) < MIN_COEF] = 0.
    selected = coefs != 0

    # Nothing survived the threshold: return all-zero betas
    if not selected.sum() > 0:
        return {
            "pp": np.repeat(True, n_predictors).tolist(),
            "betas": np.zeros(n_predictors),
            "betas_resc": np.zeros(n_predictors),
        }

    # Redo the regression with only the selected predictors and calculate beta_resc
    x = samples_by_predictors[:, selected]
    utils.make_array_2d(Y)
    betas = base_regression.recalculate_betas_from_selected(x, Y)
    betas_resc = base_regression.predict_error_reduction(x, Y, betas)

    return {"pp": selected, "betas": betas, "betas_resc": betas_resc}
def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split a dataframe into two by the labels on one axis

    The label positions are ordered by ManagePriors._make_shuffled_index. The first
    int((1 - split_ratio) * n) of them stay in the returned priors; the remaining labels
    make up the returned gold standard.

    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
        Must be between 0 and 1
    :param axis: [0, 1]
        0 splits on the index, 1 splits on the columns
    :param seed:
        Passed to ManagePriors._make_shuffled_index
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
    :raises ValueError: If axis is neither 0 nor 1
    """

    assert check.argument_numeric(split_ratio, 0, 1)
    assert check.argument_enum(axis, [0, 1])

    n_labels = priors.shape[axis]
    n_kept = int((1 - split_ratio) * n_labels)
    order = ManagePriors._make_shuffled_index(n_labels, seed=seed)

    if axis == 0:
        labels = priors.index
    elif axis == 1:
        labels = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    kept_labels = labels[order[0:n_kept]]
    held_out_labels = labels[order[n_kept:]]

    # Each output is the input with the other output's labels dropped
    return priors.drop(held_out_labels, axis=axis), priors.drop(kept_labels, axis=axis)
def _split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split a dataframe into a prior and a gold standard for crossvalidation

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the data that should go into the gold standard. Must be between 0 and 1.
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
        When this is None, the returned gold standard is all_data itself and only the
        returned priors are downsampled.
    :param seed: int
        Seed for the random generator
    :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
        The new prior and the new gold standard
    """

    assert check.argument_numeric(split_ratio, 0, 1)
    assert check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Axis split: both halves come from the axis splitter
    if split_axis is not None:
        return ManagePriors._split_axis(all_data, split_ratio, axis=split_axis, seed=seed)

    # Flattened split: keep only the prior half, the gold standard stays whole
    flattened_priors, _ = ManagePriors._split_flattened(all_data, split_ratio, seed=seed)
    return flattened_priors, all_data
def write_csv(data, pathname, filename):
    """
    Write a dataframe to a tab-separated file

    :param data: pd.DataFrame
        Data to write
    :param pathname: str
        Directory to write into. Nothing is written if this is None.
    :param filename: str
        File name within pathname. Nothing is written if this is None.
    """

    assert check.argument_path(pathname, allow_none=True)
    assert check.argument_type(filename, str, allow_none=True)
    assert check.argument_type(data, pd.DataFrame)

    # Both pieces of the destination are required
    if pathname is None or filename is None:
        return

    destination = os.path.join(pathname, filename)
    data.to_csv(destination, sep='\t')
def _split_flattened(data, split_ratio, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split the non-zero entries of a dataframe into two dataframes of the same shape

    Axis labels are ignored; only individual non-zero values are divided up. With the
    positions ordered by _make_shuffled_index, the first int(split_ratio * count) non-zero
    values are zeroed in the priors and all the others are zeroed in the gold standard.

    :param data: pd.DataFrame [M x N]
    :param split_ratio: float
        Must be between 0 and 1
    :param seed:
        Passed to _make_shuffled_index
    :return priors_data: pd.DataFrame [M x N]
    :return gold_standard: pd.DataFrame [M x N]
    """

    check.argument_numeric(split_ratio, 0, 1)

    values = data.values
    is_edge = values != 0

    n_edges = np.sum(is_edge)
    n_gold = int(split_ratio * n_edges)
    order = _make_shuffled_index(n_edges, seed=seed)

    # Two independent copies of the non-zero values, one per output
    edge_values = values[is_edge]
    prior_edges = edge_values.copy()
    gold_edges = edge_values.copy()

    prior_edges[order[0:n_gold]] = 0
    gold_edges[order[n_gold:]] = 0

    # Put the partially zeroed values back into full-size copies
    gold_matrix = values.copy()
    prior_matrix = values.copy()
    gold_matrix[is_edge] = gold_edges
    prior_matrix[is_edge] = prior_edges

    priors_data = pd.DataFrame(prior_matrix, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gold_matrix, index=data.index, columns=data.columns)

    return priors_data, gold_standard
def test_none(self):
    """Exercise check.arguments_not_none with passing calls and calls expected to raise ValueError."""
    # Calls that should return a truthy value
    no_nones = check.arguments_not_none(("A", "B"))
    self.assertTrue(no_nones)

    one_none_allowed = check.arguments_not_none(("A", None), num_none=1)
    self.assertTrue(one_none_allowed)

    # Calls that should raise, with the keyword arguments each one uses
    failing_calls = (
        ((None, None, "A"), {}),
        ((None, None, "A"), {"num_none": 0}),
    )

    for values, options in failing_calls:
        with self.assertRaises(ValueError):
            self.assertTrue(check.arguments_not_none(values, **options))
def test_enum(self):
    """Exercise check.argument_enum with a single value, a list of values, and a list with a bad value."""
    allowed = ("A", "B")

    # A single value and a list of values, all of them allowed
    for candidate in ("A", ["A", "B", "A"]):
        self.assertTrue(check.argument_enum(candidate, allowed))

    # "C" is not allowed; the call is expected to raise ValueError
    with self.assertRaises(ValueError):
        check.argument_enum(["A", "B", "C"], allowed)
def write_output_files(self, pr_calc, output_dir, priors, beta_threshold, extra_cols, threshold_network=True):
    """
    Write the confidence, threshold, curve, and network files and return the network table

    :param pr_calc: RankSummaryPR
        Object providing the combined confidences and the curve output
    :param output_dir: str
        Path to write into (may be None)
    :param priors: pd.DataFrame
        Prior data passed to process_network
    :param beta_threshold: pd.DataFrame
        Thresholded betas. Always written out; passed to process_network only if threshold_network is truthy.
    :param extra_cols: dict
        Extra columns passed to process_network
    :param threshold_network: bool
        Pass beta_threshold to process_network (True) or pass None instead (False)
    :return network_data: pd.DataFrame
        The table returned by process_network
    """

    assert check.argument_type(pr_calc, RankSummaryPR)
    assert check.argument_path(output_dir, allow_none=True, create_if_needed=True)

    confidences = pr_calc.combined_confidences()
    self.write_csv(confidences, output_dir, self.confidence_file_name)
    self.write_csv(beta_threshold, output_dir, self.threshold_file_name)
    pr_calc.output_pr_curve_pdf(output_dir, file_name=self.pr_curve_file_name)

    # The threshold is only handed on when thresholding of the network is requested
    if threshold_network:
        network_threshold = beta_threshold
    else:
        network_threshold = None

    network_data = self.process_network(pr_calc, priors, beta_threshold=network_threshold,
                                        extra_columns=extra_cols)
    self.save_network_to_tsv(network_data, output_dir, output_file_name=self.network_file_name)

    return network_data
def context_likelihood_mi(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE, return_mi=True):
    """
    Calculate the Context Likelihood of Relatedness and the Mutual Information for two data sets
    that share sample names.

    The y data is also compared against itself; that result is the background
    handed to calc_mixed_clr together with the x & y mutual information.

    :param x: An N x G InferelatorData object
    :type x: InferelatorData [N x G]
    :param y: An N x K InferelatorData object
    :type y: InferelatorData [N x K]
    :param bins: Number of bins, passed to mutual_information
    :type bins: int
    :param logtype: The logarithm function passed to mutual_information
    :type logtype: np.log func
    :param return_mi: Return the mutual information as the second value. Defaults to True
    :type return_mi: bool
    :return clr, mi: CLR and MI dataframes labeled with the gene names of x (rows) and y (columns).
        Returns (CLR, None) if return_mi is False.
    :rtype: pd.DataFrame, pd.DataFrame
    """

    assert check.argument_integer(bins, allow_none=True)
    assert min(x.shape) > 0
    assert min(y.shape) > 0
    assert check.indexes_align((x.sample_names, y.sample_names))

    # Labels for the outputs
    row_names = x.gene_names
    col_names = y.gene_names

    # x against y; array_set_diag is given 0. and both label sets
    mi_values = mutual_information(x.expression_data, y.expression_data, bins, logtype=logtype)
    array_set_diag(mi_values, 0., row_names, col_names)

    # y against itself (background)
    background = mutual_information(y.expression_data, y.expression_data, bins, logtype=logtype)
    array_set_diag(background, 0., col_names, col_names)

    clr_values = calc_mixed_clr(mi_values, background)

    MPControl.sync_processes(pref=SYNC_CLR_KEY)

    mi_frame = pd.DataFrame(mi_values, index=row_names, columns=col_names)
    clr_frame = pd.DataFrame(clr_values, index=row_names, columns=col_names)

    if return_mi:
        return clr_frame, mi_frame
    return clr_frame, None
def cross_validate_gold_standard(priors_data, gold_standard, cv_split_axis, cv_split_ratio, random_seed):
    """
    Resample the gold standard for crossvalidation and reconcile the priors with it

    When splitting on an axis, the priors and the new gold standard are passed through
    ManagePriors._remove_prior_circularity. When cv_split_axis is None, the priors are
    replaced by the downsampled part of the gold standard.

    :param priors_data: pd.DataFrame [G x K]
        Prior data
    :param gold_standard: pd.DataFrame [G x K]
        Gold standard data
    :param cv_split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param cv_split_ratio: float
        The proportion of the data that should go into the gold standard. Must be between 0 and 1.
    :param random_seed: int
        Random seed
    :return priors_data, gold_standard: pd.DataFrame [G x K], pd.DataFrame [G x K]
    """

    assert check.argument_enum(cv_split_axis, (0, 1), allow_none=True)
    assert check.argument_numeric(cv_split_ratio, low=0, high=1)

    if cv_split_axis == 1:
        utils.Debug.vprint("Selecting cv_split_axis of 1 is possible but a very bad idea", level=1)

    resample_msg = "Resampling GS ({gs}) for crossvalidation".format(gs=gold_standard.shape)
    utils.Debug.vprint(resample_msg, level=0)

    gs_to_prior, gold_standard = ManagePriors._split_for_cv(gold_standard,
                                                            cv_split_ratio,
                                                            split_axis=cv_split_axis,
                                                            seed=random_seed)

    if cv_split_axis is None:
        # Flattened split: the downsampled gold standard becomes the prior
        if priors_data is not None:
            utils.Debug.vprint("Existing prior is being replaced with a downsampled gold standard")
        priors_data = gs_to_prior
    else:
        # Axis split: remove circularity between the priors and the gold standard
        priors_data, gold_standard = ManagePriors._remove_prior_circularity(priors_data,
                                                                            gold_standard,
                                                                            split_axis=cv_split_axis)

    # Report the shapes and the number of non-zero entries of both outputs
    prior_shape = priors_data.shape
    gs_shape = gold_standard.shape
    prior_edges = (priors_data != 0).sum().sum()
    gs_edges = (gold_standard != 0).sum().sum()

    _msg = "CV prior {pr} [{pr_x}] and gold standard {gs} [{gs_x}]"
    utils.Debug.vprint(_msg.format(pr=prior_shape, gs=gs_shape, pr_x=prior_edges, gs_x=gs_edges), level=0)

    return priors_data, gold_standard
def set_processes(cls, process_count):
    """
    Store the number of processes to use on the class

    :param process_count: int
        Checked with check.argument_integer using a lower bound of 1
    :return:
    """
    requested = process_count

    # The check is called for its validation only; its return value is not used
    check.argument_integer(requested, low=1)

    cls.processes = requested
def separate_tasks_by_metadata(self, meta_data_column=None):
    """
    Break one expression matrix into several task objects based on a metadata column

    Each distinct value in the metadata column becomes one task. Every task object is a
    deep copy of this object carrying only that task's expression columns and metadata rows.
    This object's own expression_matrix and meta_data are set to None in the process.

    :param meta_data_column: Meta_data column which corresponds to task ID.
        Falls back to self.meta_data_task_column when None.
    :type meta_data_column: str
    :return new_task_objects: List of the task objects with only one task's data each
    :rtype: list(TaskData)
    :raises ValueError: If no metadata column is given and meta_data_task_column is None
    """

    assert check.argument_type(self.meta_data, pd.DataFrame)
    assert check.argument_type(self.expression_matrix, pd.DataFrame)
    assert self.meta_data.shape[0] == self.expression_matrix.shape[1]

    if meta_data_column is None:
        meta_data_column = self.meta_data_task_column

    if meta_data_column is None:
        raise ValueError("tasks_from_metadata is set but meta_data_task_column is not")

    new_task_objects = list()
    tasks = self.meta_data[meta_data_column].unique().tolist()

    utils.Debug.vprint("Creating {n} tasks from metadata column {col}".format(n=len(tasks), col=meta_data_column),
                       level=0)

    # Detach the data from self so that the deep copies below do not duplicate it
    full_expression, full_meta = self.expression_matrix, self.meta_data
    self.expression_matrix = None
    self.meta_data = None

    for task in tasks:
        task_obj = copy.deepcopy(self)

        # Boolean marker for the samples which belong to this task
        in_task = full_meta[meta_data_column] == task
        keep_positions = [position for position, keep in enumerate(in_task) if keep]

        # Samples are columns of the expression matrix and rows of the metadata
        task_obj.expression_matrix = full_expression.iloc[:, keep_positions]
        task_obj.meta_data = full_meta.loc[in_task, :]
        task_obj.task_name = task

        new_task_objects.append(task_obj)

    utils.Debug.vprint("Separated data into {ntask} tasks".format(ntask=len(new_task_objects)), level=0)

    return new_task_objects
def __init__(self, prior, expression_matrix, expression_matrix_halftau):
    """
    Store the prior and both expression matrices after checking that they line up

    :param prior: Prior data; its index is compared (ignoring order) to the expression matrix index
    :param expression_matrix: Expression data
    :param expression_matrix_halftau: Expression data checked for alignment with expression_matrix
    """
    # The two expression matrices have to align with each other
    expression_pair = [expression_matrix, expression_matrix_halftau]
    assert check.dataframes_align(expression_pair)

    # The prior and the expression matrix have to share an index (order is not checked)
    index_pair = (prior.index, expression_matrix.index)
    assert check.indexes_align(index_pair, check_order=False)

    self.prior = prior
    self.expression_matrix = expression_matrix
    self.expression_matrix_halftau = expression_matrix_halftau
def map(cls, func, *arg, **kwargs):
    """
    Map a function across iterable(s) and return a list of results

    :param func: function
        Mappable function
    :param arg: iterable
        Iterable(s) to map over
    :param kwargs:
        Accepted but not used
    """
    assert check.argument_callable(func)
    assert check.argument_list_type(arg, collections.abc.Iterable)

    # Materialize the lazy map so that a list is handed back
    lazy_results = map(func, *arg)
    return list(lazy_results)
def map(cls, func, *args, **kwargs):
    """
    Map a function across iterable(s) using the class client

    :param func: function
        Mappable function
    :param args: iterable
        Iterable(s) to map over
    :param kwargs:
        Accepted but not used
    :return:
        Whatever cls.client.map returns when called with chunksize=cls.chunk
    """
    # The ABC aliases (collections.Iterable etc.) were removed from the collections
    # namespace in Python 3.10; the abstract base classes live in collections.abc.
    from collections.abc import Iterable

    assert check.argument_callable(func)
    assert check.argument_list_type(args, Iterable)

    return cls.client.map(func, *args, chunksize=cls.chunk)
def _final_weights(X, y, TFs, gene):
    """
    Build an edge table with the model coefficient and a rescaled weight for each predictor

    The rescaled weight compares the full model to a model refit without that predictor.
    see: Greenfield et al., 2013. Robust data-driven incorporation of prior
    knowledge into the inference of dynamic regulatory networks.

    :param X: np.ndarray [N x k]
        A design matrix with N samples and k non-zero predictors
    :param y: np.ndarray [N x 1]
        A response matrix with N samples of a specific gene expression
    :param TFs: list() or np.ndarray or pd.Series
        A list of non-zero TFs (k) included in the model
    :param gene: str
        The gene modeled
    :return out_weights: pd.DataFrame [k x 4]
        An edge table with the columns regulator, target, weights, and resc_weights
    """

    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_type(TFs, (list, np.ndarray, pd.Series))

    n_preds = len(TFs)

    # Full model with every predictor
    full_model = LinearRegression().fit(X, y)
    weights = full_model.coef_[0]
    resc_weights = np.zeros(n_preds)

    # NOTE(review): this is the variance of the *squared* residuals, not of the residuals - confirm intent
    var_full = np.var((y - full_model.predict(X)) ** 2)

    if n_preds == 1:
        # With a single predictor the comparison is against the variance of the response
        resc_weights[0] = 1 - (var_full / np.var(y))
    else:
        all_columns = np.arange(n_preds)
        for j in range(n_preds):
            # Refit with predictor j left out
            X_noj = X[:, np.delete(all_columns, j)]
            reduced_model = LinearRegression().fit(X_noj, y)
            var_noj = np.var((y - reduced_model.predict(X_noj)) ** 2)
            resc_weights[j] = 1 - (var_full / var_noj)

    # One row per predictor
    edge_columns = [TFs, [gene] * len(TFs), weights, resc_weights]
    out_weights = pd.DataFrame(edge_columns).transpose()
    out_weights.columns = ['regulator', 'target', 'weights', 'resc_weights']

    return out_weights
def set_processes(cls, process_count):
    """
    Convert a process count into a number of jobs and store it on the class

    The job count is process_count divided by cls._job_n_workers, rounded up.
    Three messages are printed afterwards: two advising to use `set_job_size_params`
    instead, and one reporting the resulting configuration.

    :param process_count: int
        Checked with check.argument_integer using a lower bound of 1
    :return:
    """
    check.argument_integer(process_count, low=1)

    workers_per_job = cls._job_n_workers
    cls._job_n = math.ceil(process_count / workers_per_job)

    notices = (
        "Using `set_processes` is not advised for the DASK CLUSTER configuration",
        "Using `set_job_size_params` is highly preferred",
        "Configured {n} jobs with {w} workers per job".format(n=cls._job_n, w=cls._job_n_workers),
    )

    for notice in notices:
        utils.Debug.vprint(notice, level=0)
def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
    """
    Assert that the constructor arguments have the expected structure

    :param betas: list(list(pd.DataFrame))
        Only the first inner list has its element type checked
    :param rescaled_betas: list(list(pd.DataFrame))
        Only the first inner list has its element type checked
    :param threshold: numeric
        Must be between 0 and 1 (or None)
    :param filter_method: str
        Must be one of results_processor.FILTER_METHODS (or None)
    :param metric:
        Accepted but not checked here
    """
    # Both arguments get the same three structural checks, betas first
    for nested in (betas, rescaled_betas):
        assert check.argument_type(nested, list)
        assert check.argument_list_type(nested, list)
        assert check.argument_list_type(nested[0], pd.DataFrame)

    # Within each pair, the betas and rescaled betas are checked together for alignment
    aligned_by_task = [check.dataframes_align(task_betas + task_rescaled)
                       for task_betas, task_rescaled in zip(betas, rescaled_betas)]
    assert all(aligned_by_task)

    assert check.argument_enum(filter_method, results_processor.FILTER_METHODS, allow_none=True)
    assert check.argument_numeric(threshold, 0, 1, allow_none=True)
def get_random_samples(self, num_obs, with_replacement=False, random_seed=None, random_gen=None, inplace=False,
                       fix_names=True):
    """
    Randomly sample a specific number of observations from the entire data set

    :param num_obs: Number of observations to return
    :type num_obs: int
    :param with_replacement: Sample with replacement, defaults to False
    :type with_replacement: bool, optional
    :param random_seed: Seed for numpy random generator, defaults to None. Will be ignored if a generator itself is
        passed to random_gen.
    :type random_seed: int, optional
    :param random_gen: Numpy random generator to use, defaults to None.
    :type random_gen: np.random.Generator, optional
    :param inplace: Change this instance of the data structure inplace and return a reference to itself
    :type inplace: bool, optional
    :param fix_names: Make the observation names unique after sampling with replacement, defaults to True
    :type fix_names: bool, optional
    :return: This object (if inplace) or a new InferelatorData holding the sampled observations
    :rtype: InferelatorData
    :raises ValueError: If more observations are requested than exist and with_replacement is False
    """

    check.argument_integer(num_obs, low=1)
    check.argument_integer(random_seed, allow_none=True)

    oversampled = num_obs > self.num_obs
    if oversampled and not with_replacement:
        raise ValueError(
            "Unable to sample {x} from {y} observations without replacement".format(x=num_obs, y=self.num_obs)
        )

    # A seed of None gives an unseeded generator, so the seed can be passed straight through
    if random_gen is None:
        random_gen = np.random.default_rng(random_seed)

    if with_replacement:
        # Independent draws of row positions
        keeper_ilocs = random_gen.integers(self.num_obs, size=(num_obs,))
    else:
        # Distinct row positions
        keeper_ilocs = random_gen.choice(np.arange(self.num_obs), size=(num_obs,), replace=False)

    # Explicit copy, so that the result is not a view that keeps the old data alive
    sampled = self._adata[keeper_ilocs, :].copy()

    if inplace:
        self._adata = sampled
        return_obj = self
    else:
        return_obj = InferelatorData(sampled)

    # Sampling with replacement can repeat names
    if with_replacement and fix_names:
        return_obj._adata.obs_names_make_unique()

    return return_obj
def process_network(metric, priors, confidence_threshold=0, beta_threshold=None, extra_columns=None):
    """
    Build a network edge dataframe from a rank-summing metric object

    :param metric: RankSummingMetric
        The object which provides the confidence dataframe
    :param priors: pd.DataFrame [G x K]
        Prior data, joined to the network as a column. Skipped if None.
    :param confidence_threshold: numeric
        Edges are kept only if their confidence is greater than this value
    :param beta_threshold: pd.DataFrame [G x K]
        The thresholded betas. Type-checked, but currently not applied to the network (see the note in the code).
    :param extra_columns: dict(col_name: pd.DataFrame [G x K])
        Any additional data to include, keyed by column name. Joined in sorted key order.
    :return network_data: pd.DataFrame
        Network edge dataframe
    """

    assert check.argument_type(metric, RankSummingMetric)
    assert check.argument_type(priors, pd.DataFrame, allow_none=True)
    assert check.argument_type(beta_threshold, pd.DataFrame, allow_none=True)
    assert check.argument_numeric(confidence_threshold, 0, 1)

    edge_key = [TARGET_COLUMN, REGULATOR_COLUMN]

    # Keep the edges whose confidence is above the threshold
    all_edges = metric.confidence_dataframe()
    above_threshold = all_edges[CONFIDENCE_COLUMN] > confidence_threshold
    network_data = all_edges.loc[above_threshold, :]

    # NOTE(review): the `and False` makes this branch unreachable, so the network is never
    # filtered by beta_threshold. It is kept exactly as found; confirm whether this is intended.
    if beta_threshold is not None and False:
        beta_data = utils.melt_and_reindex_dataframe(beta_threshold, BETA_THRESHOLD_COLUMN)
        network_data = network_data.join(beta_data, on=edge_key)
        network_data = network_data.loc[network_data[BETA_THRESHOLD_COLUMN] == 1, :]
        del network_data[BETA_THRESHOLD_COLUMN]

    # Prior column
    if priors is not None:
        network_data = network_data.join(utils.melt_and_reindex_dataframe(priors, PRIOR_COLUMN), on=edge_key)

    # Extra columns, in sorted order of their names
    if extra_columns is not None:
        for column_name in sorted(extra_columns):
            melted = utils.melt_and_reindex_dataframe(extra_columns[column_name], column_name)
            network_data = network_data.join(melted, on=edge_key)

    # Make sure all missing values are NaN
    network_data[pd.isnull(network_data)] = np.nan

    return network_data
def format_prior(priors, gene, tasks, prior_weight, tfs=None):
    """
    Returns weighted priors for one gene

    :param priors: list(pd.DataFrame [G x K]) or pd.DataFrame [G x K]
        Either a list of prior data or a single data frame to use for all tasks
    :param gene: str
        The gene to select from the priors
    :param tasks: list(int)
        A list of task IDs. Used to index into priors when priors is a list with more than one entry.
    :param prior_weight: float, int
        How much to weight the priors
    :param tfs: list
        Column labels to reindex the priors to. The existing columns are kept if None.
    :return prior_out: np.ndarray [K x T]
        The weighted priors for a specific gene in each task. None if priors is None.
    """

    assert check.argument_type(priors, (list, pd.DataFrame), allow_none=True)
    assert check.argument_string(gene)
    assert check.argument_type(tasks, list)
    assert check.argument_numeric(prior_weight)

    if priors is None:
        return None

    def _weighted_gene_row(prior_frame):
        # Select the gene (and the TFs, if given), replace missing entries with 0, then weight the values
        gene_frame = prior_frame.reindex([gene])
        if tfs is not None:
            gene_frame = gene_frame.reindex(tfs, axis=1)
        gene_frame = gene_frame.fillna(0.0)
        return _weight_prior(gene_frame.loc[gene, :].values, prior_weight)

    has_task_priors = isinstance(priors, list) and len(priors) > 1

    # One prior for every task: repeat its column once per task
    if not has_task_priors:
        shared_prior = priors[0] if isinstance(priors, list) else priors
        return np.tile(_weighted_gene_row(shared_prior).reshape(-1, 1), (1, len(tasks)))

    # Task-specific priors: one row per task, transposed so that tasks are the columns
    per_task = [_weighted_gene_row(priors[k]) for k in tasks]
    return np.transpose(np.vstack(per_task))
def read_expression(self, file=None):
    """
    Read an expression matrix file into expression_matrix

    :param file: File to load. Falls back to self.expression_matrix_file when None.

    A ValueError from the finiteness check is printed through the debug printer, not raised.
    """
    if file is None:
        file = self.expression_matrix_file

    loading_msg = "Loading expression data file {file}".format(file=file)
    utils.Debug.vprint(loading_msg, level=1)

    self.expression_matrix = self.input_dataframe(file)

    try:
        check.dataframe_is_finite(self.expression_matrix)
    except ValueError as finite_err:
        # Report the problem and carry on
        utils.Debug.vprint("Expression Matrix " + str(finite_err), level=0)

    self.loaded_file_info("Expression Matrix", self.expression_matrix)
def meta_data(self, new_meta_data):
    """
    Replace or extend the observation metadata with new_meta_data

    Columns in the new metadata overwrite existing columns of the same name; existing columns
    which are not in the new metadata are kept.

    :param new_meta_data: New metadata, or an InferelatorData object whose meta_data is used
    :type new_meta_data: pd.DataFrame, InferelatorData
    :raises ValueError: If the new metadata shares no names with the existing samples
        and has a different number of rows than there are observations
    """

    if isinstance(new_meta_data, InferelatorData):
        new_meta_data = new_meta_data.meta_data

    # Work on a copy with string index labels
    new_meta_data = new_meta_data.copy()
    new_meta_data.index = new_meta_data.index.astype(str)

    # Force unique names by appending values
    if self._adata.obs_names.nunique() != self.num_obs:
        self._adata.obs_names_make_unique()

    # Drop duplicate names on the new meta data, keeping the first row for each name.
    # This has to look at the index labels: DataFrame.duplicated() compares the row *values*,
    # which would leave duplicated names in place (and reindex cannot handle those).
    if new_meta_data.index.nunique() != new_meta_data.shape[0]:
        new_meta_data = new_meta_data.loc[~new_meta_data.index.duplicated(), :]

    try:
        Validator.indexes_align((self.sample_names, new_meta_data.index), check_order=True)
    except ValueError:
        msg = "Metadata update for {n} is misaligned".format(n=str(self))
        name_overlap = len(set(new_meta_data.index).intersection(set(self.sample_names)))

        # If the new metadata has no overlapping index names and is the same length, just assume it's right order
        if (name_overlap == 0) and (new_meta_data.shape[0] == self.num_obs):
            msg += " (Metadata dimensions are correct; ignoring misalignment)"
            warnings.warn(msg)
        elif name_overlap == 0:
            msg = "Incorrectly sized metadata with no overlapping names was provided to {s}".format(s=str(self))
            raise ValueError(msg)
        else:
            # Some names match: align the new metadata to the existing sample names
            msg += " ({m} records are in both)".format(m=name_overlap)
            warnings.warn(msg)
            new_meta_data = new_meta_data.reindex(self.sample_names)

    # Join any new columns to any existing columns
    # Update (overwrite) any columns in the existing meta data if they are in the new meta data
    if len(self._adata.obs.columns) > 0:
        keep_columns = self._adata.obs.columns.difference(new_meta_data.columns)
        self._adata.obs = pd.concat((new_meta_data, self._adata.obs.loc[:, keep_columns]), axis=1)
    else:
        self._adata.obs = new_meta_data
def sklearn_gene(x, y, model, min_coef=None, **kwargs):
    """
    Use a scikit-learn model for regression

    :param x: Feature array
    :type x: np.ndarray [N x K]
    :param y: Response array
    :type y: np.ndarray [N x 1]
    :param model: Instance of a scikit BaseEstimator-derived model
    :type model: BaseEstimator
    :param min_coef: A minimum coefficient value to include in the model.
        Any coefficients with a smaller absolute value will be set to 0.
    :type min_coef: numeric
    :param kwargs: Passed to model.fit
    :return: A dict of results for this gene with the keys pp, betas, and betas_resc
    :rtype: dict
    """

    assert check.argument_type(x, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_is_subclass(model, BaseEstimator)

    _, n_features = x.shape

    model.fit(x, y, **kwargs)

    # Coefficients come from the model itself or, failing that, from its estimator_ attribute
    try:
        coefs = model.coef_
    except AttributeError:
        coefs = model.estimator_.coef_

    # Zero out small coefficients (this edits the coefficient array in place)
    if min_coef is not None:
        coefs[np.abs(coefs) < min_coef] = 0.

    selected = coefs != 0

    # Nothing was selected: return all-zero betas
    if not selected.sum() > 0:
        return {
            "pp": np.repeat(True, n_features).tolist(),
            "betas": np.zeros(n_features),
            "betas_resc": np.zeros(n_features),
        }

    # Redo the regression with only the selected features and calculate beta_resc
    x = x[:, selected]
    utils.make_array_2d(y)
    betas = base_regression.recalculate_betas_from_selected(x, y)
    betas_resc = base_regression.predict_error_reduction(x, y, betas)

    return {"pp": selected, "betas": betas, "betas_resc": betas_resc}
def add_size_subsampling(self, size_vector, stratified_column_name=None, with_replacement=False, seed=42,
                         size_sample_only=None):
    """
    Store the settings for resampling expression data to a ratio of the original data.

    :param size_vector: An iterable with numeric ratios for downsampling. These values must be between 0 and 1.
    :type size_vector: iterable(floats)
    :param stratified_column_name: Set this to stratify sampling (to maintain group size ratios).
        If None, do not maintain group size ratios. Default is None.
    :type stratified_column_name: str, None
    :param with_replacement: Do sampling with or without replacement. Defaults to False
    :type with_replacement: bool
    :param seed: The random seed to use when selecting observations
        (this is not the same as the seed passed to the workflow)
    :type seed: int
    :param size_sample_only: Replaces self.size_sample_only unless it is None
    :raises ValueError: If a value in size_vector fails the numeric check (the error is printed first)
    """

    # Every ratio is checked; the first failure is reported and re-raised
    try:
        for ratio in size_vector:
            check.argument_numeric(ratio, low=0, high=1)
    except ValueError as err:
        utils.Debug.vprint("Size sampling parameter error: {err}".format(err=str(err)), level=0)
        raise

    self.size_sample_vector = size_vector
    self.size_sample_stratified_column = stratified_column_name
    self.size_sample_with_replacement = with_replacement
    self.size_sample_seed = seed

    # Keep the current setting when no new one is given
    if size_sample_only is None:
        size_sample_only = self.size_sample_only
    self.size_sample_only = size_sample_only