Example #1
    def test_ilr_basis_one_dimension_error(self):
        table = np.array([[1., 10.], [1.14141414, 9.90909091],
                          [1.28282828, 9.81818182], [1.42424242, 9.72727273],
                          [1.56565657, 9.63636364]])
        basis = np.array([0.80442968, 0.19557032])
        with self.assertRaises(ValueError):
            ilr(table, basis=basis)
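The error above is a shape check: `ilr` expects `basis` to be a 2-D array holding D-1 compositions for D parts, so a 1-D basis is rejected. A minimal sketch of the corresponding passing call, reusing the same numbers (assuming `ilr` from `skbio.stats.composition`):

import numpy as np
from skbio.stats.composition import ilr

table = np.array([[1., 10.],
                  [1.14141414, 9.90909091]])
basis = np.array([[0.80442968, 0.19557032]])  # shape (D-1, D), not (D,)
print(ilr(table, basis=basis))  # one ilr coordinate per sample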
Example #2
def _to_balances(table, tree):
    """ Converts a table of abundances to balances given a tree.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.

    Returns
    -------
    pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    np.array
        Orthonormal basis in the Aitchison simplex generated from `tree`.
    """
    non_tips = [n.name for n in tree.levelorder() if not n.is_tip()]
    basis, _ = balance_basis(tree)

    mat = ilr(table.values, basis=basis)
    ilr_table = pd.DataFrame(mat,
                             columns=non_tips,
                             index=table.index)
    return ilr_table, basis
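A sketch of how `_to_balances` might be called, assuming `balance_basis` resolves to `gneiss.balances.balance_basis` and `ilr` to `skbio.stats.composition.ilr` as the function body implies; the tree and table below are illustrative:

import pandas as pd
from skbio import TreeNode

tree = TreeNode.read(["((a,b)y1,c)y0;"])
table = pd.DataFrame([[1., 2., 4.], [2., 2., 2.]],
                     index=['s1', 's2'], columns=['a', 'b', 'c'])
balances, basis = _to_balances(table, tree)
print(balances.columns.tolist())  # ['y0', 'y1'], the internal node names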
Example #3
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    """Performs isometric logratio (ilr) transformation on feature-table.

    This creates a new table with balances (groups of features) that
    distinguish samples. Zeros must first be removed from the table
    (e.g. add-pseudocount). For source documentation check out:
    https://numpydoc.readthedocs.io/en/latest/

    Parameters
    ----------
    table : pd.DataFrame
        Dataframe of the feature table where rows correspond to samples
        and columns are features. The values within the table must be
        positive and nonzero.
    tree : skbio.TreeNode
        A tree relating all of the features to balances or
        log-contrasts (hierarchy). This tree must be bifurcating
        (i.e. every internal node has exactly two children). The
        internal nodes of the tree will be renamed.

    Returns
    -------
    balances : pd.DataFrame
        Balances calculated from the feature table. Each balance
        represents the log ratio between the two subtrees below the
        corresponding internal node.
    """
    _table, _tree = match_tips(table, tree)
    basis, nodes = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances,
                        columns=in_nodes,
                        index=table.index)
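A hedged usage sketch, under the same assumptions as the function body (`match_tips` and `balance_basis` importable from gneiss); the pseudocount makes the table zero-free, as the docstring requires:

import pandas as pd
import skbio

tree = skbio.TreeNode.read(["((a,b)n1,c)n0;"])
counts = pd.DataFrame([[0., 2., 4.], [2., 0., 2.]],
                      index=['s1', 's2'], columns=['a', 'b', 'c'])
balances = ilr_transform(counts + 1, tree)  # one column per internal node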
Example #4
    def normalize_transform(self, mode='clr'):
        """
        Some operations may require transformed data.
        This function performs normalization and
        a clr transform on all OTU tables in a Batch object.
        It returns a deep copy of the original Batch object,
        so the original file is not modified.

        :param mode: transformation mode; clr (centered log-ratio) or ilr (isometric log-ratio)
        :return: Transformed copy of Batch object.
        """
        batchcopy = copy.deepcopy(self)
        try:
            for x in list(self.otu):
                # normalizes the data by samples
                normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
                mat = csr_matrix.toarray(normbiom.matrix_data)
                # replaces all zeros with a small value
                # multiplicative replacement preserves ratios between values
                mat = multiplicative_replacement(mat)
                # string equality needs '==': 'is' tests object identity
                if mode == 'clr':
                    mat = clr(mat)
                elif mode == 'ilr':
                    mat = ilr(mat)
                else:
                    raise ValueError("Only CLR and ILR transformations are currently supported.")
                normbiom._data = csc_matrix(mat)
                batchcopy.otu[x] = normbiom
        except Exception:
            logger.error("Failed to normalize data", exc_info=True)
        return batchcopy
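The transform chain inside the loop can be exercised on its own; a small sketch using only `skbio.stats.composition`:

import numpy as np
from skbio.stats.composition import multiplicative_replacement, clr, ilr

counts = np.array([[10., 0., 5.], [3., 7., 0.]])
props = counts / counts.sum(axis=1, keepdims=True)  # normalize per sample
props = multiplicative_replacement(props)           # replace zeros, keep ratios
print(clr(props).shape)  # (2, 3): clr preserves the dimensionality
print(ilr(props).shape)  # (2, 2): ilr drops one dimension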
Example #5
def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame:
    _table, _tree = match_tips(table, tree)
    basis, _ = balance_basis(_tree)
    balances = ilr(_table.values, basis)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    return pd.DataFrame(balances,
                        columns=in_nodes,
                        index=table.index)
Example #6
def _regression(y, X, basis=None):
    """
    Performs a simplicial ordinary least squares on a set of
    compositions and a response variable

    Parameters
    ----------
    y : numpy.ndarray, float
       a matrix of proportions where
       rows correspond to samples and
       columns correspond to features.
    X : numpy.ndarray, float
       independent variable
    basis : numpy.ndarray, optional
       orthonormal basis passed through to `ilr`; if None, skbio's
       default Gram-Schmidt basis is used

    Returns
    -------
    predict: pd.DataFrame, float
       a predicted matrix of proportions where
       rows correspond to samples and
       columns correspond to features.
    b: pd.DataFrame, float
       a matrix of estimated coefficient compositions
    resid: pd.DataFrame, float
       a matrix of compositional residuals
    r2: float
       coefficient of determination
    """
    y = np.atleast_2d(y)
    X = np.atleast_2d(X)

    # Need to add constant for intercept
    r, c = X.shape

    y_ = ilr(y, basis=basis)

    # Now perform least squares to calculate unknown coefficients
    inv = np.linalg.pinv(np.dot(X.T, X))
    cross = np.dot(inv, X.T)
    b_ = np.dot(cross, y_)
    predict_ = np.dot(X, b_)
    resid = (y_ - predict_)
    sst = (y_ - y_.mean(axis=0))
    r2 = 1 - ((resid**2).sum() / (sst**2).sum())

    if len(b_.shape) == 1:
        b_ = np.atleast_2d(b_).T

    b = ilr_inv(b_)
    if len(predict_.shape) == 1:
        predict_ = np.atleast_2d(predict_).T
    predict = ilr_inv(predict_)

    if len(resid.shape) == 1:
        resid = np.atleast_2d(resid).T
    resid = ilr_inv(resid)
    return predict, b, resid, r2
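A minimal sketch of calling `_regression` with the default basis (None), in which case `ilr` and `ilr_inv` fall back to skbio's Gram-Schmidt basis; note that the design matrix already carries its intercept column:

import numpy as np
from skbio.stats.composition import closure

n = 20
X = np.hstack((np.ones((n, 1)), np.linspace(0., 1., n).reshape(-1, 1)))
y = closure(np.random.lognormal(size=(n, 3)))  # random compositions
predict, b, resid, r2 = _regression(y, X)
print(predict.shape, b.shape)  # (20, 3) (2, 3)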
Example #7
    def test_ilr_basis_isomorphism(self):
        # tests to make sure that the isomorphism holds
        # with the introduction of the basis.
        basis = np.array([[0.80442968, 0.19557032]])
        table = np.array([[
            np.log(1 / 10) * np.sqrt(1 / 2),
            np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
            np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
            np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
            np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2)
        ]]).T
        res = ilr(ilr_inv(table, basis=basis), basis=basis)
        npt.assert_allclose(res, table.squeeze())

        table = np.array([[1., 10.], [1.14141414, 9.90909091],
                          [1.28282828, 9.81818182], [1.42424242, 9.72727273],
                          [1.56565657, 9.63636364]])

        res = ilr_inv(np.atleast_2d(ilr(table, basis=basis)).T, basis=basis)
        npt.assert_allclose(res, closure(table.squeeze()))
Example #8
    def test_ilr_inv(self):
        mat = closure(self.cdata7)
        npt.assert_array_almost_equal(ilr_inv(ilr(mat)), mat)

        npt.assert_allclose(ilr_inv(np.identity(3)), self.ortho1,
                            rtol=1e-04, atol=1e-06)

        with self.assertRaises(ValueError):
            ilr_inv(self.cdata1, basis=self.cdata1)

        # make sure that inplace modification is not occurring
        ilr_inv(self.cdata1)
        npt.assert_allclose(self.cdata1,
                            np.array([[2, 2, 6],
                                      [4, 4, 2]]))
Example #9
    def test_ilr_basis(self):
        table = np.array([[1., 10.], [1.14141414, 9.90909091],
                          [1.28282828, 9.81818182], [1.42424242, 9.72727273],
                          [1.56565657, 9.63636364]])
        basis = np.array([[0.80442968, 0.19557032]])
        res = ilr(table, basis=basis)
        exp = np.array([
            np.log(1 / 10) * np.sqrt(1 / 2),
            np.log(1.14141414 / 9.90909091) * np.sqrt(1 / 2),
            np.log(1.28282828 / 9.81818182) * np.sqrt(1 / 2),
            np.log(1.42424242 / 9.72727273) * np.sqrt(1 / 2),
            np.log(1.56565657 / 9.63636364) * np.sqrt(1 / 2)
        ])

        npt.assert_allclose(res, exp)
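With a two-part composition the single ilr coordinate has a closed form, sqrt(1/2) * log(x1 / x2), which is exactly what the expected values above spell out element by element; the same expectation in one vectorized line:

exp = np.sqrt(1 / 2) * np.log(table[:, 0] / table[:, 1])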
Example #10
def train_compositional_parameters(data):
    """
    Given noisy compositional data, try to learn the compositional noise
    parameters.  It is assumed that noise follows a Gaussian distribution in
    the ilr space.

    Parameters
    ----------
    data : array_like
       A matrix of counts where there are `n` rows and `m` columns
       where `n` corresponds to the number of samples and `m`
       corresponds to the number of species.

    Returns
    -------
    mu : np.ndarray
       Mean of the ilr normal in the default Gram-Schmidt space
    cov : np.ndarray
       Covariance matrix of the ilr normal in the default Gram-Schmidt space
    """
    X = ilr(data)
    mu = np.mean(X, axis=0)
    cov = np.cov(X.T)
    return mu, cov
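A round-trip sketch: draw ilr-space Gaussian noise, map it onto the simplex with `ilr_inv` (which uses the same default Gram-Schmidt basis), and check that the estimates recover the truth:

import numpy as np
from skbio.stats.composition import ilr_inv

rng = np.random.RandomState(0)
mu_true = np.array([0.5, -0.2])
cov_true = np.array([[0.10, 0.02], [0.02, 0.20]])
data = ilr_inv(rng.multivariate_normal(mu_true, cov_true, size=5000))
mu, cov = train_compositional_parameters(data)
print(np.round(mu, 2))  # close to mu_true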
Example #11
    def test_ilr(self):
        mat = closure(self.cdata7)
        npt.assert_array_almost_equal(ilr(mat),
                                      np.array([0.70710678, 0.40824829]))

        # Should give same result as inner
        npt.assert_allclose(ilr(self.ortho1),
                            np.identity(3),
                            rtol=1e-04,
                            atol=1e-06)

        with self.assertRaises(ValueError):
            ilr(self.cdata1, basis=self.cdata1)

        # make sure that inplace modification is not occurring
        ilr(self.cdata1)
        npt.assert_allclose(self.cdata1, np.array([[2, 2, 6], [4, 4, 2]]))
Example #12
def balancetest(table, grouping, tree,
                significance_test=None,
                layout=None,
                normalize=True,
                mode='c'):
    """ Performs statistical test on ilr balances and plots on tree.

    Parameters
    ----------
    table : pd.DataFrame
        A 2D matrix of strictly positive values (i.e. counts or proportions)
        where the rows correspond to samples and the columns correspond to
        features.
    grouping : pd.Series
        Vector indicating the assignment of samples to groups.  For example,
        these could be strings or integers denoting which group a sample
        belongs to.  It must be the same length as the samples in `table`.
        The index must be the same on `table` and `grouping` but need not be
        in the same order.
    tree : skbio.TreeNode
        A strictly bifurcating tree defining a hierarchical relationship
        between all of the features within `table`.
    significance_test : function, optional
        A statistical significance function to test for significance between
        classes.  This function must be able to accept at least two 1D
        array_like arguments of floats and returns a test statistic and a
        p-value, or a single statistic. By default ``scipy.stats.f_oneway``
        is used.
    layout : function, optional
        A layout for formatting the tree visualization. Must take a
        `ete.tree` as a parameter.
    mode : str
        Type of display to show the tree. ('c': circular, 'r': rectangular).

    Returns
    -------
    ete_tree : ete.Tree
        ETE tree converted from the `skbio.TreeNode` object
    ts : ete.TreeStyle
        ETE tree style used for formatting the visualized tree,
        with the test statistic plotted on each of the internal nodes.

    Note
    ----
    The `skbio.TreeNode` is assumed to be strictly bifurcating and
    its tips are assumed to match `table`.  It is also assumed that
    none of the values in `table` are zero.  Replace them with a
    pseudocount if necessary.

    See also
    --------
    skbio.TreeNode.bifurcate
    skbio.stats.composition.ilr
    skbio.stats.multiplicative_replacement
    scipy.stats.f_oneway
    """

    if np.any(table <= 0):
        raise ValueError('Cannot handle zeros or negative values in `table`. '
                         'Use pseudo counts or ``multiplicative_replacement``.'
                         )

    if significance_test is None:
        significance_test = scipy.stats.f_oneway

    sorted_features = [n.name for n in tree.tips()][::-1]
    if len(sorted_features) != len(table.columns):
        raise ValueError('The number of tips (%d) in the tree must be equal '
                         'to the number features in the table (%d).' %
                         (len(sorted_features), len(table.columns)))
    table = table.reindex(columns=sorted_features)

    mat, cats = check_table_grouping(table, grouping)

    basis, nodes = phylogenetic_basis(tree)
    ilr_coords = ilr(mat, basis=basis)

    ete_tree = Tree(str(tree))

    _cats = set(cats)
    i = 0
    for n in ete_tree.traverse():
        if not n.is_leaf():
            diffs = [ilr_coords[(cats == x).values, i] for x in _cats]

            stat = significance_test(*diffs)
            if np.isscalar(stat):
                # a bare test statistic with no p-value
                n.add_features(weight=stat)
            elif len(stat) == 2:
                # a (statistic, p-value) pair: plot -log(p)
                n.add_features(weight=-np.log(stat[1]))
            elif len(stat) == 1:
                n.add_features(weight=stat[0])
            else:
                raise ValueError(
                    "Too many values returned by %s" %
                    significance_test.__name__)
            i += 1

    # Create an empty TreeStyle
    ts = TreeStyle()

    # Set our custom layout function
    if layout is None:
        ts.layout_fn = default_layout
    else:
        ts.layout_fn = layout

    # Draw a tree
    ts.mode = mode

    # We will add node names manually
    ts.show_leaf_name = False
    # Show branch data
    ts.show_branch_length = True
    ts.show_branch_support = True

    return ete_tree, ts
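A hedged usage sketch, assuming ete3 is installed and that `phylogenetic_basis` and `check_table_grouping` resolve as in the function body; the tree, counts, and grouping are illustrative:

import numpy as np
import pandas as pd
import skbio

tree = skbio.TreeNode.read(["((a,b)n1,(c,d)n2)n0;"])
table = pd.DataFrame(np.random.lognormal(size=(6, 4)),
                     columns=[t.name for t in tree.tips()])
grouping = pd.Series(['x', 'x', 'x', 'y', 'y', 'y'], index=table.index)
ete_tree, ts = balancetest(table, grouping, tree)
# ete_tree.show(tree_style=ts)  # opens the interactive ETE viewer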
Example #13
def _ilr_without_tree(X):
    return pd.DataFrame(ilr(X), index=X.index)
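Without a tree there are no balance names, so the frame's columns are just the default integer labels 0..D-2:

import pandas as pd
X = pd.DataFrame([[1., 2., 4.], [2., 2., 2.]], index=['s1', 's2'])
print(_ilr_without_tree(X).columns.tolist())  # [0, 1]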
Example #14
def run_preprocess(RNA_count_file,
                   ATAC_count_file,
                   ATAC_barcode_file,
                   ATAC_peak_file,
                   GTF_file,
                   max_threads=2,
                   velocity_file=None,
                   outname=None,
                   peak_per=None,
                   cell_per=None,
                   normalization='default',
                   skip=False):

    # Find common barcodes and write to a list

    # spinner = Halo(text='Organizing Cell Barcodes', spinner='dots',
    #                color='white', placement='right')
    # spinner.start()

    skip_atac = skip

    assert normalization in ['default', 'ilr', 'none']

    s = open(RNA_count_file).readline().replace(' ', '\t')
    RNA_barcodes = np.loadtxt(StringIO(s), dtype=str)[1:]
    ATAC_barcodes = np.loadtxt(ATAC_barcode_file, dtype=str)
    ATAC_peaks = np.loadtxt(ATAC_peak_file, dtype=str)

    spinner = Halo(text='Organizing Cell Barcodes',
                   spinner='dots',
                   color='white',
                   placement='right')
    spinner.start()

    # Barcode correction: restore the ',R'/',P' delimiters that appear
    # as '.R'/'.P' in the barcode file
    for j in range(0, np.size(ATAC_barcodes)):
        ATAC_barcodes[j] = ATAC_barcodes[j].replace('.R',
                                                    ',R').replace('.P', ',P')

    # OR
    # for j in range(0, np.size(ATAC_barcodes)):
    #     ATAC_barcodes[j] = ATAC_barcodes[j][:-5]
    # for j in range(0, np.size(RNA_barcodes)):
    #     RNA_barcodes[j] = RNA_barcodes[j][:-5]

    intersecting, rna_idx, atac_idx = np.intersect1d(RNA_barcodes,
                                                     ATAC_barcodes,
                                                     return_indices=True)
    common_idxs = np.hstack((rna_idx.reshape(-1, 1), atac_idx.reshape(-1, 1)))

    spinner.stop()
    print("Found Intersection of Barcodes")

    if skip_atac:

        # Now let's get the ATAC-seq counts

        n_threads = max_threads

        spinner = Halo(text='Loading ATAC-seq counts into memory',
                       spinner='dots',
                       color='white',
                       placement='right')
        spinner.start()

        txt_array = np.asarray(np.loadtxt(ATAC_count_file,
                                          skiprows=1,
                                          dtype=str)[1:, :],
                               dtype=int)

        spinner.stop()

        number_peaks = np.max(txt_array[:, 0])
        number_cells = np.max(txt_array[:, 1])

        # Split file across cores and write to a sparse matrix

        spinner = Halo(text='Normalizing ATAC-seq',
                       spinner='dots',
                       color='white',
                       placement='right')
        spinner.start()

        split_list = np.array_split(txt_array, n_threads)
        del txt_array
        gc.collect()

        ATAC_counts = scipy.sparse.lil_matrix((number_cells, number_peaks))

        #Looks like two cores is the fastest on my computer
        if __name__ != "__main__":
            with Manager() as manager:
                chunks = manager.list()
                processes = []
                for k in range(0, n_threads):

                    p = Process(target=make_sparse_chunk,
                                args=(number_cells, number_peaks,
                                      split_list[k], chunks))
                    p.start()
                    processes.append(p)

                for p in processes:
                    p.join()

                for chunk in chunks:
                    nonzero = chunk.nonzero()
                    ATAC_counts[nonzero] = chunk[nonzero]
                    del chunk
                    gc.collect()

            # Filter on depths and peak coverage if applicable
            # CELLS x PEAKS

            non_zero = ATAC_counts.nonzero()

            depths = np.sum(ATAC_counts, axis=1)

            if cell_per is not None:
                bad_depths = np.argwhere(
                    depths <= np.percentile(depths, cell_per))
            else:
                bad_depths = np.argwhere(depths == 0)

            bad_idx_in_common = np.intersect1d(bad_depths, common_idxs)
            common_idxs = np.delete(common_idxs, (bad_idx_in_common), axis=0)

            # print("shape of ATAC counts matrix")
            # print(np.shape(ATAC_counts))
            # print("Nonzero elements")
            # print(np.shape(non_zero))

            if peak_per is not None:
                peak_coverage = np.count_nonzero(ATAC_counts, axis=0)
                good_peaks = np.argwhere(
                    peak_coverage >= np.percentile(peak_coverage, peak_per))
                bad_peaks = np.argwhere(
                    peak_coverage < np.percentile(peak_coverage, peak_per))
                rem = np.sum(ATAC_counts[:, bad_peaks], axis=1)

                # COUNTS, BARCODES, PEAKS, DEPTHS
                if normalization == 'default':
                    pickle.dump([
                        normalize(scipy.sparse.hstack(
                            [ATAC_counts[common_idxs[:, 1], good_peaks], rem]),
                                  norm='l1',
                                  axis=1), ATAC_barcodes[common_idxs[:, 1]],
                        np.concatenate(
                            (ATAC_peaks[good_peaks], np.asarray(['rem']))),
                        depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                if normalization == 'ilr':
                    # densify before the pseudocount: adding a scalar to a
                    # scipy sparse matrix is not supported
                    pickle.dump([
                        ilr(
                            scipy.sparse.hstack([
                                ATAC_counts[common_idxs[:, 1], good_peaks], rem
                            ]).toarray() + 1), ATAC_barcodes[common_idxs[:, 1]],
                        np.concatenate(
                            (ATAC_peaks[good_peaks], np.asarray(['rem']))),
                        depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                if normalization == 'none':
                    pickle.dump([
                        scipy.sparse.hstack([
                            ATAC_counts[common_idxs[:, 1], good_peaks], rem
                        ]), ATAC_barcodes[common_idxs[:, 1]],
                        np.concatenate(
                            (ATAC_peaks[good_peaks], np.asarray(['rem']))),
                        depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                del ATAC_counts
                gc.collect()

            else:

                if normalization == 'default':
                    # COUNTS, BARCODES, PEAKS, DEPTHS
                    pickle.dump([
                        normalize(ATAC_counts[common_idxs[:, 1], :],
                                  norm='l1',
                                  axis=1), ATAC_barcodes[common_idxs[:, 1]],
                        ATAC_peaks, depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                if normalization == 'ilr':
                    # COUNTS, BARCODES, PEAKS, DEPTHS
                    pickle.dump([
                        ilr(ATAC_counts[common_idxs[:, 1], :].toarray() + 1),
                        ATAC_barcodes[common_idxs[:, 1]], ATAC_peaks,
                        depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                if normalization == 'none':
                    # COUNTS, BARCODES, PEAKS, DEPTHS
                    pickle.dump([
                        ATAC_counts[common_idxs[:, 1], :],
                        ATAC_barcodes[common_idxs[:, 1]], ATAC_peaks,
                        depths[common_idxs[:, 1]]
                    ], open("ATAC_data.p", "wb"))
                    print("Wrote Preprocessed ATAC-seq Data")

                del ATAC_counts
                gc.collect()

        spinner.stop()

    spinner = Halo(text='Normalizing RNA-seq',
                   spinner='dots',
                   color='white',
                   placement='right')
    spinner.start()

    # Load in the RNA-seq
    RNA_counts = scipy.sparse.lil_matrix(
        np.asarray(np.loadtxt(RNA_count_file, skiprows=1, dtype=str)[:, 1:],
                   dtype=int).transpose())
    gene_names = np.loadtxt(RNA_count_file, usecols=0, skiprows=1, dtype=str)

    # COUNTS, BARCODES, GENE NAMES

    GTF_info = np.loadtxt(GTF_file, dtype=str, skiprows=5, delimiter='\t')

    rna_depths = np.sum(RNA_counts, axis=1)

    if normalization == 'default':
        pickle.dump([
            normalize(RNA_counts[common_idxs[:, 0], :], norm='l1',
                      axis=1), RNA_barcodes[common_idxs[:, 0]], gene_names,
            rna_depths[common_idxs[:, 0]], GTF_info
        ], open("RNA_data.p", "wb"))
    if normalization == 'ilr':
        pickle.dump([
            ilr(RNA_counts[common_idxs[:, 0], :].toarray() + 1),
            RNA_barcodes[common_idxs[:, 0]], gene_names,
            rna_depths[common_idxs[:, 0]], GTF_info
        ], open("RNA_data.p", "wb"))
    if normalization == 'none':
        pickle.dump([
            RNA_counts[common_idxs[:, 0], :], RNA_barcodes[common_idxs[:, 0]],
            gene_names, rna_depths[common_idxs[:, 0]], GTF_info
        ], open("RNA_data.p", "wb"))

    del RNA_counts
    gc.collect()

    spinner.stop()
    print("Wrote Preprocessed RNA-seq Data")
    print("Preprocessing Completed")
Example #15
labels = [re.sub(regex, "", e) for e in labs]
# Remove first element "x"
labels.pop(0)

# Ensure that this is not the rarefied ASV table
sample_counts = unscaled_tab.sum(axis=1)  # per-sample totals

# Perform total sum scaling normalization (TSS)
scaled = unscaled_tab.div(unscaled_tab.sum(axis=1), axis=0)
# scaled.sum(axis=1) # check

# Substitute zeros with small pseudocounts, since log-ratio transforms
# are undefined at zero
zeros_scaled = comp.multiplicative_replacement(scaled)  # numpy.ndarray

# Isometric log-ratio (ilr) transform, mapping compositions onto
# unconstrained Euclidean coordinates
ilr_transformed = comp.ilr(zeros_scaled)

# Convert ndarray back to a dataframe to restore the sample index.
# Note: ilr yields D-1 coordinates for D features, so the original
# feature names no longer apply; column labels are left as defaults.
df_ilr_transformed = pd.DataFrame(ilr_transformed, index=scaled.index)

########################################################################################################
# Decision-tree methods tended to perform well.
# The HFE OTU feature-reduction method brought a substantial performance
# improvement for nearly all methods; after feature reduction most methods
# performed similarly, so the reduction step is the one that matters.
########################################################################################################

# Split data into test and training sets
# Do before feature selection so features selected from training set, not whole dataset
train, test, y_train, y_test = mod.train_test_split(df_ilr_transformed,
Example #16
def band_table(num_samples,
               num_features,
               tree=None,
               low=2,
               high=10,
               sigma=2,
               alpha=6,
               seed=0):
    """ Generates a simulated table of counts.

    Each organism is modeled as a Gaussian distribution.  Then counts
    are simulated using a Poisson distribution.

    Parameters
    ----------
    num_samples : int
        Number of samples to simulate
    num_features : int
        Number of features to simulate
    tree : skbio.TreeNode
        Tree used as a scaffold for the ilr transform.
        If None, then the gram_schmidt_basis will be used.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    sigma : float
        Variance of each species distribution
    alpha : int
        Global count bias.  This bias is added to every cell in the matrix.
    seed : int or np.random.RandomState
        Random seed

    Returns
    -------
    biom.Table
        Biom representation of the count table.
    pd.DataFrame
        DataFrame containing relevant metadata.
    beta : np.array
        Regression parameter estimates.
    theta : np.array
        Bias per sample.
    gamma : np.array
        Intercept estimates.
    """
    state = np.random.RandomState(seed)

    # measured gradient values for each sample
    gradient = np.linspace(low, high, num_samples)
    # optima for features (i.e. optimal ph for species)
    mu = np.linspace(low, high, num_features)
    sigma = np.array([sigma] * num_features)
    # construct species distributions
    table = chain_interactions(gradient, mu, sigma)
    samp_ids = ['S%d' % i for i in range(num_samples)]

    # obtain basis required to convert from balances to proportions.
    if tree is None:
        basis = _gram_schmidt_basis(num_features)
        feat_ids = ['F%d' % i for i in range(num_features)]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
    else:
        feat_ids = [n.name for n in tree.tips()]
        table = pd.DataFrame(table, index=samp_ids, columns=feat_ids)
        basis = sparse_balance_basis(tree)[0].todense()

    # construct balances from gaussian distribution.
    # this will be necessary when refitting parameters later.
    Y = ilr(table, basis=clr_inv(basis))
    X = gradient.reshape(-1, 1)
    X = np.hstack((np.ones(len(X)).reshape(-1, 1), X.reshape(-1, 1)))
    pY, resid, B = ols(Y, X)
    gamma = B[0]
    beta = B[1].reshape(1, -1)
    # parameter estimates
    r = beta.shape[1]
    # Normal distribution to simulate linear regression
    M = np.eye(r)
    # Generate covariance matrix from inverse wishart
    Sigma = invwishart.rvs(df=r + 2, scale=M.dot(M.T), random_state=state)
    w, v = eigsh(Sigma, k=2)
    # Low rank covariance matrix
    sim_L = (v @ np.diag(w)).T

    # sample
    y = X.dot(B)
    Ys = np.vstack(
        [state.multivariate_normal(y[i, :], Sigma) for i in range(y.shape[0])])
    Yp = Ys @ basis
    # calculate bias terms
    theta = -np.log(np.exp(Yp).sum(axis=1)) + alpha

    # multinomial sample the entries
    #table = np.vstack(multinomial(nd, Yp[i, :]) for i in range(y.shape[0]))

    # poisson sample the entries (materialize as a list: np.vstack
    # expects a sequence of arrays, not a generator)
    table = np.vstack(
        [state.poisson(np.exp(Yp[i, :] + theta[i]))
         for i in range(y.shape[0])]).T

    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame({'G': gradient}, index=samp_ids)
    return table, metadata, beta, theta, gamma
Example #17
def train(iterations, sample_size, reduce, positive_train, negative_train,
          test_data, test_label):
    f1_original_clr = []
    f1_original = []
    f1_dca = []
    f1_clr = []
    f1_ilr = []

    roc_original_clr = []
    roc_original = []
    roc_dca = []
    roc_clr = []
    roc_ilr = []
    for _ in range(iterations):
        # Select a random sample of the requested size from the training data
        train_sample_data, train_sample_label = split_train_test(
            positive_train, negative_train, sample_size)

        f1_original_data, roc_original_data = train_svm(train_sample_data, train_sample_label, test_data,
                                                            test_label)
        f1_original.append( f1_original_data )
        roc_original.append( roc_original_data )

        train_sample_data[train_sample_data == 0] = 0.1e-32
        test_data[test_data == 0] = 0.1e-32

        clr_original_train = clr(train_sample_data)
        clr_original_test = clr(test_data)

        scaler = StandardScaler()
        clr_original_train = np.nan_to_num(scaler.fit_transform(clr_original_train))
        clr_original_test = np.nan_to_num(scaler.fit_transform(clr_original_test))

        f1_original_data_clr, roc_original_data_clr = train_svm( clr_original_train, train_sample_label, clr_original_test, test_label )
        f1_original_clr.append ( f1_original_data_clr )
        roc_original_clr.append( roc_original_data_clr )

        matrices = genetic_algorithm( train_sample_data, reduce )
        roc_dca_iterations = []
        for br_matrix in matrices:
            #br_matrix = matrices[0]
            reduced_data = np.matmul(br_matrix, train_sample_data.transpose()).transpose()
            reduced_test = np.matmul(br_matrix, test_data.transpose()).transpose()

            f1_dca_data, roc_dca_data = train_svm( reduced_data, train_sample_label, reduced_test, test_label )
            #f1_dca.append( f1_dca_data )
            roc_dca_iterations.append( roc_dca_data )
        #print ("DCA max", max(roc_dca_iterations) )
        roc_dca.append( max(roc_dca_iterations) )
        #print ( " PCA CLR train shape ", train_sample_data.shape )
        # Do ILR and CLR transformation
        # Set zeros to small values
        train_sample_data[train_sample_data == 0] = 0.1e-32
        test_data[test_data == 0] = 0.1e-32

        clr_data_train = clr(train_sample_data)
        clr_test = clr(test_data)

        ilr_data_train = ilr( train_sample_data )
        ilr_test = ilr( test_data )
        np.savetxt("ilr_data.csv", ilr_data_train, delimiter=",")

        # Do PCA to reduce dimensions
        pca_clr = PCA(n_components = reduce)
        pca_ilr = PCA(n_components = reduce)
        #print ( "reduce ", reduce )

        fit_train_clr = np.ascontiguousarray( pca_clr.fit_transform(clr_data_train) )
        fit_test_clr = np.ascontiguousarray( pca_clr.transform(clr_test) )

        fit_train_ilr = np.ascontiguousarray( pca_ilr.fit_transform(ilr_data_train) )
        fit_test_ilr = np.ascontiguousarray( pca_ilr.transform(ilr_test) )
        np.savetxt("ilr_data_pca.csv", fit_train_ilr, delimiter=",")

        pca_clr_reduced_train = np.nan_to_num( fit_train_clr )
        pca_ilr_reduced_train = np.nan_to_num( fit_train_ilr )

        fit_test_clr = np.nan_to_num( fit_test_clr )
        fit_test_ilr = np.nan_to_num( fit_test_ilr )

        f1_pca_clr_data, roc_pca_clr_data = train_svm( pca_clr_reduced_train, train_sample_label, fit_test_clr, test_label )
        f1_pca_ilr_data, roc_pca_ilr_data = train_svm( pca_ilr_reduced_train, train_sample_label, fit_test_ilr, test_label )
        f1_clr.append( f1_pca_clr_data )
        roc_clr.append( roc_pca_clr_data )

        f1_ilr.append( f1_pca_ilr_data )
        roc_ilr.append( roc_pca_ilr_data )

        #print ( roc_original, roc_dca, roc_clr, roc_ilr)

    return (sum(roc_original) / iterations,
            sum(roc_original_clr) / iterations,
            sum(roc_dca) / iterations,
            sum(roc_clr) / iterations,
            sum(roc_ilr) / iterations)