Example #1
    def test_type(self):
        self.assertTrue(check.argument_type(self, unittest.TestCase))
        self.assertTrue(
            check.argument_type(None, unittest.TestCase, allow_none=True))

        with self.assertRaises(ValueError):
            self.assertTrue(check.argument_type("0", unittest.TestCase))
Example #2
    def set_gold_standard_and_priors(self):
        """
        Read priors file into priors_data and gold standard file into gold_standard
        """
        self.priors_data = self.input_dataframe(self.priors_file)

        if self.split_priors_for_gold_standard:
            self.split_priors_into_gold_standard()
        else:
            self.gold_standard = self.input_dataframe(self.gold_standard_file)

        if self.split_gold_standard_for_crossvalidation:
            self.cross_validate_gold_standard()

        try:
            check.index_values_unique(self.priors_data.index)
        except ValueError as v_err:
            utils.Debug.vprint("Duplicate gene(s) in prior index", level=0)
            utils.Debug.vprint(str(v_err), level=0)

        try:
            check.index_values_unique(self.priors_data.columns)
        except ValueError as v_err:
            utils.Debug.vprint("Duplicate tf(s) in prior index", level=0)
            utils.Debug.vprint(str(v_err), level=0)
Example #3
    def _harmonize_paths(self):
        """
        If _baseline is set, copy it to the workflow.
        If _baseline is not set, copy the workflow's path into _baseline.
        """
        if self._baseline_output_dir is None and self.workflow.output_dir is None:
            raise ValueError(
                "No output path has been provided to either crossvalidation or workflow"
            )
        elif self._baseline_output_dir is None:
            self._baseline_output_dir = self.workflow.output_dir
        elif self.workflow.output_dir is None:
            self.workflow.output_dir = self._baseline_output_dir

        try:
            check.argument_subpath(self.workflow.output_dir,
                                   self._baseline_output_dir)
        except ValueError:
            warnings.warn(
                "Workflow output path is {p}; resetting to {a}".format(
                    p=self.workflow.output_dir, a=self._baseline_output_dir))
            self.workflow.output_dir = self._baseline_output_dir

        if self._baseline_input_dir is None and self.workflow.input_dir is None:
            raise ValueError(
                "No input path has been provided to either crossvalidation or workflow"
            )

        if self._baseline_input_dir is None:
            self._baseline_input_dir = self.workflow.input_dir
        if self.workflow.input_dir is None:
            self.workflow.input_dir = self._baseline_input_dir
Example #4
def _covariance_by_task(X, Y):
    """
    Returns C and D, containing terms for covariance update for OLS fit
    C: transpose(X_j)*Y for each feature j
    D: transpose(X_j)*X_l for each feature j for each feature l
    Reference: Friedman, Hastie, Tibshirani, 2010 in Journal of Statistical Software
    Regularization Paths for Generalized Linear Models via Coordinate Descent
    :param X: list(np.ndarray [N x K]) [T]
        List of design values for each task. Must be aligned on the feature (K) axis.
    :param Y: list(np.ndarray [N x 1]) [T]
        List of response values for each task
    :return cov_C, cov_D: np.ndarray [T x K], np.ndarray [T x K x K]
        Covariance of the predictors K to the response gene by task
        Covariance of the predictors K to K by task
    """

    assert check.argument_type(X, list)
    assert check.argument_type(Y, list)
    assert len(X) == len(Y)
    assert max([xk.shape[1] for xk in X]) == min([xk.shape[1] for xk in X])

    # Calculate dimensionality for returned arrays
    n_tasks = len(X)
    n_features = max([xk.shape[1] for xk in X])

    # Build empty arrays
    cov_C = np.zeros((n_tasks, n_features))
    cov_D = np.zeros((n_tasks, n_features, n_features))

    # Populate arrays
    for task_id in range(n_tasks):
        cov_C[task_id] = np.dot(Y[task_id].transpose(), X[task_id])  # yTx
        cov_D[task_id] = np.dot(X[task_id].transpose(), X[task_id])  # xTx

    return cov_C, cov_D
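A minimal standalone sketch (toy NumPy data, not part of the package) of the two covariance terms described in the docstring: cov_C stacks transpose(X_j)*Y and cov_D stacks transpose(X_j)*X_l per task, mirroring the loop above.

import numpy as np

rng = np.random.default_rng(42)
X = [rng.standard_normal((10, 3)) for _ in range(2)]   # T = 2 tasks, N = 10, K = 3
Y = [rng.standard_normal((10, 1)) for _ in range(2)]

cov_C = np.stack([np.dot(y.T, x).ravel() for x, y in zip(X, Y)])   # [T x K]
cov_D = np.stack([np.dot(x.T, x) for x in X])                      # [T x K x K]

print(cov_C.shape, cov_D.shape)   # (2, 3) (2, 3, 3)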
Example #5
    def summarize_network(self, output_dir, gold_standard, priors):
        """
        Take the betas and rescaled beta_errors, construct a network, and test it against the gold standard
        :param output_dir: str
            Path to write files into. Don't write anything if this is None.
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard to test the network against
        :param priors: pd.DataFrame [G x K]
            Prior data
        :return result: InferelatorResult
            Returns an InferelatorResult
        """

        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(gold_standard, pd.DataFrame)
        assert check.argument_type(priors, pd.DataFrame)

        rs_calc = self.metric(self.rescaled_betas, gold_standard, filter_method=self.filter_method)
        beta_threshold, beta_sign, beta_nonzero = self.threshold_and_summarize(self.betas, self.threshold)
        resc_betas_mean, resc_betas_median = self.mean_and_median(self.rescaled_betas)
        extra_cols = {BETA_SIGN_COLUMN: beta_sign, MEDIAN_EXPLAIN_VAR_COLUMN: resc_betas_median}

        m_name, score = rs_calc.score()
        utils.Debug.vprint("Model {metric}:\t{score}".format(metric=m_name, score=score), level=0)

        # Process data into a network dataframe
        network_data = self.process_network(rs_calc, priors, beta_threshold=beta_threshold, extra_columns=extra_cols)

        # Create an InferelatorResult object and have it write output files
        result = self.result_object(network_data, beta_threshold, rs_calc.all_confidences, rs_calc)

        if self.write_results and output_dir is not None:
            result.write_result_files(output_dir)

        return result
Example #6
def elastic_net(X, Y, params):
    """

    :param X: np.ndarray [K x N]
    :param Y: np.ndarray [1 x N]
    :param params: dict
    :return:
    """
    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(Y, np.ndarray)

    (K, N) = X.shape
    X = X.T  # Make X into [N, K]
    Y = Y.flatten()  # Make Y into [N, ]

    # Fit the linear model using the elastic net
    model = ElasticNetCV(**params).fit(X, Y)

    # Set coefficients below threshold to 0
    coefs = model.coef_  # Get all model coefficients [K, ]
    coefs[np.abs(coefs) < MIN_COEF] = 0.  # Threshold coefficients
    coef_nonzero = coefs != 0  # Create a boolean array where coefficients are nonzero [K, ]

    # If there are non-zero coefficients, redo the linear regression with them alone
    # And calculate beta_resc
    if coef_nonzero.sum() > 0:
        x = X[:, coef_nonzero]
        utils.make_array_2d(Y)
        betas = base_regression.recalculate_betas_from_selected(x, Y)
        betas_resc = base_regression.predict_error_reduction(x, Y, betas)
        return dict(pp=coef_nonzero, betas=betas, betas_resc=betas_resc)
    else:
        return dict(pp=np.repeat(True, K).tolist(),
                    betas=np.zeros(K),
                    betas_resc=np.zeros(K))
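A hedged usage sketch of the same ElasticNetCV fit-and-threshold pattern on synthetic data; the params dict and the 0.1 cutoff are made-up stand-ins for whatever the workflow passes and for MIN_COEF.

import numpy as np
from sklearn.linear_model import ElasticNetCV

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 5))                        # design, already [N x K]
Y = X @ np.array([1.5, 0.0, -2.0, 0.0, 0.0]) + 0.1 * rng.standard_normal(50)

params = {"l1_ratio": [0.5, 0.7, 0.9], "cv": 3, "max_iter": 2000}   # assumed settings
model = ElasticNetCV(**params).fit(X, Y)

coefs = model.coef_.copy()
coefs[np.abs(coefs) < 0.1] = 0.0                        # MIN_COEF-style thresholding
print(coefs != 0)                                       # boolean mask of retained predictors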
Example #7
    def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
        """
        Split by axis labels on the chosen axis
        :param priors: pd.DataFrame [M x N]
        :param split_ratio: float
        :param axis: [0, 1]
        :param seed:
        :return:
        """

        assert check.argument_numeric(split_ratio, 0, 1)
        assert check.argument_enum(axis, [0, 1])

        pc = priors.shape[axis]
        gs_count = int((1 - split_ratio) * pc)
        idx = ManagePriors._make_shuffled_index(pc, seed=seed)

        if axis == 0:
            axis_idx = priors.index
        elif axis == 1:
            axis_idx = priors.columns
        else:
            raise ValueError("Axis can only be 0 or 1")

        pr_idx = axis_idx[idx[0:gs_count]]
        gs_idx = axis_idx[idx[gs_count:]]

        priors_data = priors.drop(gs_idx, axis=axis)
        gold_standard = priors.drop(pr_idx, axis=axis)

        return priors_data, gold_standard
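The core of the axis-label split above, restated as a self-contained pandas sketch; rng.permutation stands in for ManagePriors._make_shuffled_index and the toy frame is invented.

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
priors = pd.DataFrame(rng.integers(0, 2, size=(6, 3)),
                      index=["gene%d" % i for i in range(6)],
                      columns=["tf1", "tf2", "tf3"])

split_ratio, axis = 0.5, 0
pc = priors.shape[axis]
gs_count = int((1 - split_ratio) * pc)
idx = rng.permutation(pc)                       # stand-in for _make_shuffled_index

axis_idx = priors.index if axis == 0 else priors.columns
pr_idx = axis_idx[idx[0:gs_count]]
gs_idx = axis_idx[idx[gs_count:]]

priors_data = priors.drop(gs_idx, axis=axis)    # labels in pr_idx stay in the prior
gold_standard = priors.drop(pr_idx, axis=axis)  # labels in gs_idx go to the gold standard
print(priors_data.shape, gold_standard.shape)   # (3, 3) (3, 3)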
Example #8
    def _split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
        """
        Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
        crossvalidation splits of a gold standard.

        :param all_data: pd.DataFrame [G x K]
            Existing prior or gold standard data
        :param split_ratio: float
            The proportion of the priors that should go into the gold standard
        :param split_axis: int
            Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
            Note that if this is None, the returned gold standard will be the same as all_data, and the priors will have
            half of the data points of all_data
        :param seed: int
            Seed for the random generator
        :return prior_data, gold_standard: pd.DataFrame [G/2 x K], pd.DataFrame [G/2 x K]
            Returns a new prior and gold standard by splitting the old one in half
        """

        assert check.argument_numeric(split_ratio, 0, 1)
        assert check.argument_enum(split_axis, [0, 1], allow_none=True)

        # Split the priors into gold standard based on axis (flatten if axis=None)
        if split_axis is None:
            priors_data, _ = ManagePriors._split_flattened(all_data, split_ratio, seed=seed)
            gold_standard = all_data
        else:
            priors_data, gold_standard = ManagePriors._split_axis(all_data, split_ratio, axis=split_axis, seed=seed)

        return priors_data, gold_standard
Example #9
    def write_csv(data, pathname, filename):
        assert check.argument_path(pathname, allow_none=True)
        assert check.argument_type(filename, str, allow_none=True)
        assert check.argument_type(data, pd.DataFrame)

        if pathname is not None and filename is not None:
            data.to_csv(os.path.join(pathname, filename), sep='\t')
Example #10
def _split_flattened(data, split_ratio, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Instead of splitting by axis labels, split edges and ignore axes
    :param data: pd.DataFrame [M x N]
    :param split_ratio: float
    :param seed:
    :return priors_data: pd.DataFrame [M x N]
    :return gold_standard: pd.DataFrame [M x N]
    """

    check.argument_numeric(split_ratio, 0, 1)

    pc = np.sum(data.values != 0)
    gs_count = int(split_ratio * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    pr_idx = data.values[data.values != 0].copy()
    gs_idx = data.values[data.values != 0].copy()

    pr_idx[idx[0:gs_count]] = 0
    gs_idx[idx[gs_count:]] = 0

    gs = data.values.copy()
    pr = data.values.copy()

    gs[gs != 0] = gs_idx
    pr[pr != 0] = pr_idx

    priors_data = pd.DataFrame(pr, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gs, index=data.index, columns=data.columns)

    return priors_data, gold_standard
Example #11
    def test_none(self):
        self.assertTrue(check.arguments_not_none(("A", "B")))
        self.assertTrue(check.arguments_not_none(("A", None), num_none=1))
        with self.assertRaises(ValueError):
            self.assertTrue(check.arguments_not_none((None, None, "A")))
        with self.assertRaises(ValueError):
            self.assertTrue(check.arguments_not_none((None, None, "A"), num_none=0))
Example #12
    def test_enum(self):

        self.assertTrue(check.argument_enum("A", ("A", "B")))
        self.assertTrue(check.argument_enum(["A", "B", "A"], ("A", "B")))

        with self.assertRaises(ValueError):
            check.argument_enum(["A", "B", "C"], ("A", "B"))
Example #13
    def write_output_files(self,
                           pr_calc,
                           output_dir,
                           priors,
                           beta_threshold,
                           extra_cols,
                           threshold_network=True):

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_path(output_dir,
                                   allow_none=True,
                                   create_if_needed=True)

        self.write_csv(pr_calc.combined_confidences(), output_dir,
                       self.confidence_file_name)
        self.write_csv(beta_threshold, output_dir, self.threshold_file_name)
        pr_calc.output_pr_curve_pdf(output_dir,
                                    file_name=self.pr_curve_file_name)

        # Threshold the network with the boolean beta_threshold if threshold_network is True
        beta_threshold = beta_threshold if threshold_network else None

        # Process data into a network dataframe, write it out, and return it
        network_data = self.process_network(pr_calc,
                                            priors,
                                            beta_threshold=beta_threshold,
                                            extra_columns=extra_cols)
        self.save_network_to_tsv(network_data,
                                 output_dir,
                                 output_file_name=self.network_file_name)
        return network_data
Example #14
def context_likelihood_mi(x,
                          y,
                          bins=DEFAULT_NUM_BINS,
                          logtype=DEFAULT_LOG_TYPE,
                          return_mi=True):
    """
    Wrapper to calculate the Context Likelihood of Relatedness and Mutual Information for two data sets that have
    common condition rows. The y argument will be used to calculate background MI for the x & y MI.
    As an implementation detail, y will be cast to a dense array if it is sparse.
    X can be sparse with no internal copy.

    This function handles unpacking and packing the InferelatorData.

    :param x: An N x G InferelatorData object
    :type x: InferelatorData [N x G]
    :param y: An N x K InferelatorData object
    :type y: InferelatorData [N x K]
    :param logtype: The logarithm function to use when calculating information. Defaults to natural log (np.log)
    :type logtype: np.log func
    :param bins: Number of bins for discretizing continuous variables
    :type bins: int
    :param return_mi: Boolean for returning a MI object. Defaults to True
    :type return_mi: bool
    :return clr, mi: CLR and MI InferelatorData objects. Returns (CLR, None) if return_mi is False.
    :rtype InferelatorData, InferelatorData:
    """

    assert check.argument_integer(bins, allow_none=True)
    assert min(x.shape) > 0
    assert min(y.shape) > 0
    assert check.indexes_align((x.sample_names, y.sample_names))

    # Get the gene names to use as row and column labels for the output
    mi_r = x.gene_names
    mi_c = y.gene_names

    # Build a [G x K] mutual information array
    mi = mutual_information(x.expression_data,
                            y.expression_data,
                            bins,
                            logtype=logtype)
    array_set_diag(mi, 0., mi_r, mi_c)

    # Build a [K x K] mutual information array
    mi_bg = mutual_information(y.expression_data,
                               y.expression_data,
                               bins,
                               logtype=logtype)
    array_set_diag(mi_bg, 0., mi_c, mi_c)

    # Calculate CLR
    clr = calc_mixed_clr(mi, mi_bg)

    MPControl.sync_processes(pref=SYNC_CLR_KEY)

    mi = pd.DataFrame(mi, index=mi_r, columns=mi_c)
    clr = pd.DataFrame(clr, index=mi_r, columns=mi_c)

    return clr, mi if return_mi else None
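A rough sketch of the discretize-then-score step that an MI calculation like the one above relies on, using scikit-learn's mutual_info_score on made-up vectors; this is not the package's mutual_information implementation, and the CLR mixing step is not shown.

import numpy as np
from sklearn.metrics import mutual_info_score

rng = np.random.default_rng(1)
x_col = rng.standard_normal(200)                  # one gene column from x
y_col = x_col + 0.5 * rng.standard_normal(200)    # one regulator column from y

bins = 10
x_d = np.digitize(x_col, np.histogram_bin_edges(x_col, bins=bins))
y_d = np.digitize(y_col, np.histogram_bin_edges(y_col, bins=bins))

print(mutual_info_score(x_d, y_d))                # MI in nats (natural log)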
Example #15
    def cross_validate_gold_standard(priors_data, gold_standard, cv_split_axis,
                                     cv_split_ratio, random_seed):
        """
        Sample the gold standard for crossvalidation, and then remove the new gold standard from the priors (if split
        on an axis)

        :param priors_data: pd.DataFrame [G x K]
            Prior data
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard data
        :param cv_split_ratio: float
            The proportion of the priors that should go into the gold standard
        :param cv_split_axis: int
            Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
            Note that if this is None, the returned gold standard will be the same as all_data, and the priors will have
            half of the data points of all_data
        :param random_seed: int
            Random seed
        :return priors_data, gold_standard: pd.DataFrame [G x K], pd.DataFrame [G x K]
        """

        assert check.argument_enum(cv_split_axis, (0, 1), allow_none=True)
        assert check.argument_numeric(cv_split_ratio, low=0, high=1)

        if cv_split_axis == 1:
            utils.Debug.vprint(
                "Selecting cv_split_axis of 1 is possible but a very bad idea",
                level=1)

        utils.Debug.vprint("Resampling GS ({gs}) for crossvalidation".format(
            gs=gold_standard.shape),
                           level=0)
        gs_to_prior, gold_standard = ManagePriors._split_for_cv(
            gold_standard,
            cv_split_ratio,
            split_axis=cv_split_axis,
            seed=random_seed)

        # If the priors are split on an axis, remove circularity
        if cv_split_axis is not None:
            priors_data, gold_standard = ManagePriors._remove_prior_circularity(
                priors_data, gold_standard, split_axis=cv_split_axis)
        else:
            if priors_data is not None:
                utils.Debug.vprint(
                    "Existing prior is being replaced with a downsampled gold standard"
                )
            priors_data = gs_to_prior

        _msg = "CV prior {pr} [{pr_x}] and gold standard {gs} [{gs_x}]"
        utils.Debug.vprint(_msg.format(pr=priors_data.shape,
                                       gs=gold_standard.shape,
                                       pr_x=(priors_data != 0).sum().sum(),
                                       gs_x=(gold_standard != 0).sum().sum()),
                           level=0)

        return priors_data, gold_standard
Example #16
    def set_processes(cls, process_count):
        """
        Set the number of dask workers to use
        :param process_count: int
        :return:
        """
        check.argument_integer(process_count, low=1)

        cls.processes = process_count
Example #17
    def separate_tasks_by_metadata(self, meta_data_column=None):
        """
        Take a single expression matrix and break it into multiple dataframes based on meta_data. Return a list of
        TaskData objects which have the task-specific data loaded into them

        :param meta_data_column: Meta_data column which corresponds to task ID
        :type meta_data_column: str
        :return new_task_objects: List of the TaskData objects with only one task's data each
        :rtype: list(TaskData)

        """

        assert check.argument_type(self.meta_data, pd.DataFrame)
        assert check.argument_type(self.expression_matrix, pd.DataFrame)
        assert self.meta_data.shape[0] == self.expression_matrix.shape[1]

        meta_data_column = meta_data_column if meta_data_column is not None else self.meta_data_task_column
        if meta_data_column is None:
            raise ValueError(
                "tasks_from_metadata is set but meta_data_task_column is not"
            )

        new_task_objects = list()
        tasks = self.meta_data[meta_data_column].unique().tolist()

        utils.Debug.vprint(
            "Creating {n} tasks from metadata column {col}".format(
                n=len(tasks), col=meta_data_column),
            level=0)

        # Remove data references from self
        expr_data = self.expression_matrix
        meta_data = self.meta_data
        self.expression_matrix = None
        self.meta_data = None

        for task in tasks:
            # Copy this object
            task_obj = copy.deepcopy(self)

            # Get an index of the stuff to keep
            task_idx = meta_data[meta_data_column] == task

            # Reset expression matrix, metadata, and task_name in the copy
            task_obj.expression_matrix = expr_data.iloc[:, [
                i for i, j in enumerate(task_idx) if j
            ]]
            task_obj.meta_data = meta_data.loc[task_idx, :]
            task_obj.task_name = task
            new_task_objects.append(task_obj)

        utils.Debug.vprint("Separated data into {ntask} tasks".format(
            ntask=len(new_task_objects)),
                           level=0)

        return new_task_objects
Example #18
    def __init__(self, prior, expression_matrix, expression_matrix_halftau):

        assert check.dataframes_align(
            [expression_matrix, expression_matrix_halftau])
        assert check.indexes_align((prior.index, expression_matrix.index),
                                   check_order=False)

        self.prior = prior
        self.expression_matrix = expression_matrix
        self.expression_matrix_halftau = expression_matrix_halftau
Example #19
    def map(cls, func, *arg, **kwargs):
        """
        Map a function across iterable(s) and return a list of results

        :param func: function
            Mappable function
        :param args: iterable
            Iterator(s)
        """
        assert check.argument_callable(func)
        assert check.argument_list_type(arg, collections.abc.Iterable)
        return list(map(func, *arg))
Example #20
    def map(cls, func, *args, **kwargs):
        """
        Map a function across iterable(s) and return a list of results

        :param func: function
            Mappable function
        :param args: iterable
            Iterator(s)
        """
        assert check.argument_callable(func)
        assert check.argument_list_type(args, collections.abc.Iterable)
        return cls.client.map(func, *args, chunksize=cls.chunk)
Example #21
def _final_weights(X, y, TFs, gene):
    """
    returns reduction on variance explained for each predictor
    (model without each predictor compared to full model)
    see: Greenfield et al., 2013. Robust data-driven incorporation of prior
    knowledge into the inference of dynamic regulatory networks.
    :param X: np.ndarray [N x k]
        A design matrix with N samples and k non-zero predictors
    :param y: np.ndarray [N x 1]
        A response matrix with N samples of a specific gene expression
    :param TFs: list() or np.ndarray or pd.Series
        A list of non-zero TFs (k) included in the model
    :param gene: str
        The gene modeled
    :return out_weights: pd.DataFrame [k x 4]
        An edge table (regulator -> target) with the model coefficient and the variance explained by that predictor for
        each non-zero predictor
    """

    assert check.argument_type(X, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_type(TFs, (list, np.ndarray, pd.Series))

    n_preds = len(TFs)

    # Linear fit using sklearn
    ols = LinearRegression().fit(X, y)

    # save weights and initialize rescaled weights vector
    weights = ols.coef_[0]
    resc_weights = np.zeros(n_preds)

    # variance of residuals (full model)
    var_full = np.var((y - ols.predict(X))**2)

    # when there is only one predictor
    if n_preds == 1:
        resc_weights[0] = 1 - (var_full / np.var(y))
    # remove each at a time and calculate variance explained
    else:
        for j in range(len(TFs)):
            X_noj = X[:, np.setdiff1d(range(n_preds), j)]
            ols = LinearRegression().fit(X_noj, y)
            var_noj = np.var((y - ols.predict(X_noj))**2)
            resc_weights[j] = 1 - (var_full / var_noj)

    # Format output into an edge table
    out_weights = pd.DataFrame([TFs, [gene] * len(TFs), weights,
                                resc_weights]).transpose()
    out_weights.columns = ['regulator', 'target', 'weights', 'resc_weights']

    return out_weights
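A self-contained toy run of the leave-one-predictor-out calculation above; the data and coefficients are invented, and the variance terms mirror the same expressions used in the function.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(7)
X = rng.standard_normal((100, 2))                       # k = 2 non-zero predictors
y = (X @ np.array([2.0, 0.1]) + 0.1 * rng.standard_normal(100)).reshape(-1, 1)

ols = LinearRegression().fit(X, y)
var_full = np.var((y - ols.predict(X)) ** 2)            # same form as the function above

resc_weights = np.zeros(2)
for j in range(2):
    X_noj = X[:, np.setdiff1d(range(2), j)]
    ols_noj = LinearRegression().fit(X_noj, y)
    var_noj = np.var((y - ols_noj.predict(X_noj)) ** 2)
    resc_weights[j] = 1 - (var_full / var_noj)

print(resc_weights)                                     # the stronger predictor scores higher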
Example #22
    def set_processes(cls, process_count):
        """
        Set the number of dask workers to use
        :param process_count: int
        :return:
        """
        check.argument_integer(process_count, low=1)
        cls._job_n = math.ceil(process_count / cls._job_n_workers)

        utils.Debug.vprint("Using `set_processes` is not advised for the DASK CLUSTER configuration", level=0)
        utils.Debug.vprint("Using `set_job_size_params` is highly preferred", level=0)
        utils.Debug.vprint("Configured {n} jobs with {w} workers per job".format(n=cls._job_n, w=cls._job_n_workers),
                           level=0)
Example #23
    def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
        assert check.argument_type(betas, list)
        assert check.argument_list_type(betas, list)
        assert check.argument_list_type(betas[0], pd.DataFrame)
        assert check.argument_type(rescaled_betas, list)
        assert check.argument_list_type(rescaled_betas, list)
        assert check.argument_list_type(rescaled_betas[0], pd.DataFrame)
        assert all([check.dataframes_align(b_task + bresc_task) for b_task, bresc_task in zip(betas, rescaled_betas)])
        assert check.argument_enum(filter_method, results_processor.FILTER_METHODS, allow_none=True)
        assert check.argument_numeric(threshold, 0, 1, allow_none=True)
Example #24
    def get_random_samples(self, num_obs, with_replacement=False, random_seed=None, random_gen=None, inplace=False,
                           fix_names=True):
        """
        Randomly sample to a specific number of observations from the entire data set

        :param num_obs: Number of observations to return
        :type num_obs: int
        :param with_replacement: Sample with replacement, defaults to False
        :type with_replacement: bool, optional
        :param random_seed: Seed for numpy random generator, defaults to None. Will be ignored if a generator itself is
            passed to random_gen.
        :type random_seed: int, optional
        :param random_gen: Numpy random generator to use, defaults to None. 
        :type random_gen: np.random.Generator, optional
        :param inplace: Change this instance of the data structure inplace and return a reference to itself
        :type inplace: bool, optional
        """

        check.argument_integer(num_obs, low=1)
        check.argument_integer(random_seed, allow_none=True)

        if (num_obs > self.num_obs) and not with_replacement:
            _msg = "Unable to sample {x} from {y} observations without replacement".format(x=num_obs, y=self.num_obs)
            raise ValueError(_msg)

        # Make a new random generator if not provided
        if random_gen is None:
            random_gen = np.random.default_rng() if random_seed is None else np.random.default_rng(random_seed)

        # Sample with replacement using randint
        if with_replacement:
            keeper_ilocs = random_gen.integers(self.num_obs, size=(num_obs,))
        
        # Sample without replacement using choice
        else:
            keeper_ilocs = random_gen.choice(np.arange(self.num_obs), size=(num_obs,), replace=False)

        # Change this instance's _adata (explicit copy allows the old data to be dereferenced instead of held as view)
        if inplace:
            self._adata = self._adata[keeper_ilocs, :].copy()
            return_obj = self
        
        # Create a new InferelatorData instance with the _adata slice
        else:
            return_obj = InferelatorData(self._adata[keeper_ilocs, :].copy())

        # Fix names
        if with_replacement and fix_names:
            return_obj._adata.obs_names_make_unique()

        return return_obj
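The two sampling branches above, reduced to plain NumPy for illustration; the sizes are arbitrary and no InferelatorData object is involved.

import numpy as np

rng = np.random.default_rng(42)
n_total, num_obs = 100, 10

# With replacement: free draws of integer positions
ilocs_with = rng.integers(n_total, size=(num_obs,))

# Without replacement: choice over the positions with replace=False
ilocs_without = rng.choice(np.arange(n_total), size=(num_obs,), replace=False)

print(ilocs_with)
print(ilocs_without)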
Example #25
    def process_network(metric, priors, confidence_threshold=0, beta_threshold=None, extra_columns=None):
        """
        Process rank-summed results into a network data frame
        :param metric: RankSummingMetric
            The rank-sum object with the math in it
        :param priors: pd.DataFrame [G x K]
            Prior data
        :param confidence_threshold: numeric
            The minimum confidence score needed to write a network edge
        :param beta_threshold: pd.DataFrame [G x K]
            The thresholded betas to include in the network. If None, include everything.
        :param extra_columns: dict(col_name: pd.DataFrame [G x K])
            Any additional data to include, keyed by column name and indexable with row and column names
        :return network_data: pd.DataFrame [(G*K) x 7+]
            Network edge dataframe

        """

        assert check.argument_type(metric, RankSummingMetric)
        assert check.argument_type(priors, pd.DataFrame, allow_none=True)
        assert check.argument_type(beta_threshold, pd.DataFrame, allow_none=True)
        assert check.argument_numeric(confidence_threshold, 0, 1)

        # Get the combined confidences and subset for confidence threshold
        network_data = metric.confidence_dataframe()
        network_data = network_data.loc[network_data[CONFIDENCE_COLUMN] > confidence_threshold, :]

        # If beta_threshold has been provided, melt and join it to the network data
        # Then discard anything which isn't meeting the threshold
        if beta_threshold is not None:
            beta_data = utils.melt_and_reindex_dataframe(beta_threshold, BETA_THRESHOLD_COLUMN)
            network_data = network_data.join(beta_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
            network_data = network_data.loc[network_data[BETA_THRESHOLD_COLUMN] == 1, :]
            del network_data[BETA_THRESHOLD_COLUMN]

        if priors is not None:
            prior_data = utils.melt_and_reindex_dataframe(priors, PRIOR_COLUMN)
            network_data = network_data.join(prior_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

        # Add any extra columns as needed
        if extra_columns is not None:
            for k in sorted(extra_columns.keys()):
                extra_data = utils.melt_and_reindex_dataframe(extra_columns[k], k)
                network_data = network_data.join(extra_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

        # Make sure all missing values are NaN
        network_data[pd.isnull(network_data)] = np.nan

        return network_data
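A minimal pandas sketch (invented frames and column names) of the melt-and-join step process_network performs when attaching priors or extra columns to the long-format confidence table.

import pandas as pd

priors = pd.DataFrame([[1, 0], [0, 1]],
                      index=["gene1", "gene2"], columns=["tf1", "tf2"])
confidences = pd.DataFrame({"target": ["gene1", "gene1", "gene2", "gene2"],
                            "regulator": ["tf1", "tf2", "tf1", "tf2"],
                            "combined_confidences": [0.9, 0.1, 0.2, 0.8]})

# Melt the wide [G x K] prior matrix into a long series keyed by (target, regulator)
prior_long = priors.stack().rename("prior")
prior_long.index.names = ["target", "regulator"]

# Join on the (target, regulator) pair, as process_network does with TARGET_COLUMN / REGULATOR_COLUMN
network = confidences.join(prior_long, on=["target", "regulator"])
print(network)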
Example #26
def format_prior(priors, gene, tasks, prior_weight, tfs=None):
    """
    Returns weighted priors for one gene
    :param priors: list(pd.DataFrame [G x K]) or pd.DataFrame [G x K]
        Either a list of prior data or a single data frame to use for all tasks
    :param gene: str
        The gene to select from the priors
    :param tasks: list(int)
        A list of task IDs
    :param prior_weight: float, int
        How much to weight the priors
    :param tfs: list or None
        TF labels to reindex the prior columns to before weighting
    :return prior_out: np.ndarray [K x T]
        The weighted priors for a specific gene in each task
    """

    assert check.argument_type(priors, (list, pd.DataFrame), allow_none=True)
    assert check.argument_string(gene)
    assert check.argument_type(tasks, list)
    assert check.argument_numeric(prior_weight)

    if priors is None:
        return None

    def _reindex_to_gene(p):
        p = p.reindex([gene])
        p = p.reindex(tfs, axis=1) if tfs is not None else p
        p = p.fillna(0.0)
        return p

    # If the priors are a list, get the gene-specific prior from each task
    if isinstance(priors, list) and len(priors) > 1:

        priors_out = [
            _weight_prior(
                _reindex_to_gene(priors[k]).loc[gene, :].values, prior_weight)
            for k in tasks
        ]
        priors_out = np.transpose(np.vstack(priors_out))

    # Otherwise just use the same prior for each task
    else:

        priors = priors[0] if isinstance(priors, list) else priors
        priors_out = np.tile(
            _weight_prior(
                _reindex_to_gene(priors).loc[gene, :].values,
                prior_weight).reshape(-1, 1), (1, len(tasks)))

    return priors_out
Example #27
    def read_expression(self, file=None):
        """
        Read expression matrix file into expression_matrix
        """
        file = file if file is not None else self.expression_matrix_file
        utils.Debug.vprint(
            "Loading expression data file {file}".format(file=file), level=1)
        self.expression_matrix = self.input_dataframe(file)

        try:
            check.dataframe_is_finite(self.expression_matrix)
        except ValueError as err:
            utils.Debug.vprint("Expression Matrix " + str(err), level=0)

        self.loaded_file_info("Expression Matrix", self.expression_matrix)
Example #28
    def meta_data(self, new_meta_data):

        if isinstance(new_meta_data, InferelatorData):
            new_meta_data = new_meta_data.meta_data

        # Reindex the new metadata to match the existing sample names
        new_meta_data = new_meta_data.copy()
        new_meta_data.index = new_meta_data.index.astype(str)

        # Force unique names by appending values
        if self._adata.obs_names.nunique() != self.num_obs:
            self._adata.obs_names_make_unique()

        # Drop duplicate names on the new meta data
        if new_meta_data.index.nunique() != new_meta_data.shape[0]:
            new_meta_data = new_meta_data.loc[~new_meta_data.duplicated(), :]

        try:
            Validator.indexes_align((self.sample_names, new_meta_data.index),
                                    check_order=True)
        except ValueError:
            msg = "Metadata update for {n} is misaligned".format(n=str(self))

            name_overlap = len(
                set(new_meta_data.index).intersection(set(self.sample_names)))
            # If the new metadata has no overlapping index names and is the same length, just assume it's right order
            if (name_overlap == 0) and (new_meta_data.shape[0]
                                        == self.num_obs):
                msg += " (Metadata dimensions are correct; ignoring misalignment)"
                warnings.warn(msg)
            elif name_overlap == 0:
                msg = "Incorrectly sized metadata with no overlapping names was provided to {s}".format(
                    s=str(self))
                raise ValueError(msg)
            else:
                msg += " ({m} records are in both)".format(m=name_overlap)
                warnings.warn(msg)
                new_meta_data = new_meta_data.reindex(self.sample_names)

        # Join any new columns to any existing columns
        # Update (overwrite) any columns in the existing meta data if they are in the new meta data
        if len(self._adata.obs.columns) > 0:
            keep_columns = self._adata.obs.columns.difference(
                new_meta_data.columns)
            self._adata.obs = pd.concat(
                (new_meta_data, self._adata.obs.loc[:, keep_columns]), axis=1)
        else:
            self._adata.obs = new_meta_data
Example #29
def sklearn_gene(x, y, model, min_coef=None, **kwargs):
    """
    Use a scikit-learn model for regression

    :param x: Feature array
    :type x: np.ndarray [N x K]
    :param y: Response array
    :type y: np.ndarray [N x 1]
    :param model: Instance of a scikit BaseEstimator-derived model
    :type model: BaseEstimator
    :param min_coef: A minimum coefficient value to include in the model. Any values smaller will be set to 0.
    :type min_coef: numeric
    :return: A dict of results for this gene
    :rtype: dict
    """
    assert check.argument_type(x, np.ndarray)
    assert check.argument_type(y, np.ndarray)
    assert check.argument_is_subclass(model, BaseEstimator)

    (N, K) = x.shape

    # Fit the model
    model.fit(x, y, **kwargs)

    # Get all model coefficients [K, ]
    try:
        coefs = model.coef_
    except AttributeError:
        coefs = model.estimator_.coef_

    # Set coefficients below threshold to 0
    if min_coef is not None:
        coefs[np.abs(coefs) < min_coef] = 0.  # Threshold coefficients

    coef_nonzero = coefs != 0  # Create a boolean array where coefficients are nonzero [K, ]

    # If there are non-zero coefficients, redo the linear regression with them alone
    # And calculate beta_resc
    if coef_nonzero.sum() > 0:
        x = x[:, coef_nonzero]
        utils.make_array_2d(y)
        betas = base_regression.recalculate_betas_from_selected(x, y)
        betas_resc = base_regression.predict_error_reduction(x, y, betas)
        return dict(pp=coef_nonzero, betas=betas, betas_resc=betas_resc)
    else:
        return dict(pp=np.repeat(True, K).tolist(),
                    betas=np.zeros(K),
                    betas_resc=np.zeros(K))
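A hedged usage sketch of the same fit / threshold / mask pattern with a plain scikit-learn Lasso as the BaseEstimator-derived model; the alpha and the 0.1 min_coef value are arbitrary choices, and sklearn_gene itself is not called here.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(3)
x = rng.standard_normal((60, 4))                              # [N x K]
y = x @ np.array([1.0, 0.0, -1.0, 0.0]) + 0.05 * rng.standard_normal(60)

model = Lasso(alpha=0.05).fit(x, y)

coefs = model.coef_.copy()
coefs[np.abs(coefs) < 0.1] = 0.0                              # min_coef-style thresholding
print(coefs != 0)                                             # coef_nonzero boolean mask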
Example #30
    def add_size_subsampling(self,
                             size_vector,
                             stratified_column_name=None,
                             with_replacement=False,
                             seed=42,
                             size_sample_only=None):
        """
        Resample expression data to a ratio of the original data.

        :param size_vector: An iterable with numeric ratios for downsampling. These values must be between 0 and 1.
        :type size_vector: iterable(floats)
        :param stratified_column_name: Set this to stratify sampling (to maintain group size ratios). If None, do not
            maintain group size ratios. Default is None.
        :type stratified_column_name: str, None
        :param with_replacement: Do sampling with or without replacement. Defaults to False
        :type with_replacement: bool
        :param seed: The random seed to use when selecting observations
            (this is not the same as the seed passed to the workflow)
        :type seed: int
        """

        try:
            [check.argument_numeric(val, low=0, high=1) for val in size_vector]
        except ValueError as err:
            utils.Debug.vprint(
                "Size sampling parameter error: {err}".format(err=str(err)),
                level=0)
            raise

        self.size_sample_vector = size_vector
        self.size_sample_stratified_column = stratified_column_name
        self.size_sample_with_replacement = with_replacement
        self.size_sample_seed = seed
        self.size_sample_only = size_sample_only if size_sample_only is not None else self.size_sample_only