Code example #1
 def test_anova(self):
     "Tests anova"
     (E, NOx, _, _, _, results) = self.d
     gas = loess(E, NOx, span=2. / 3.)
     gas.fit()
     gas_null = loess(E, NOx, span=1.0)
     gas_null.fit()
     gas_anova = loess_anova(gas, gas_null)
     gas_anova_theo = results[4]
     npt.assert_almost_equal(gas_anova.dfn, gas_anova_theo[0], 5)
     npt.assert_almost_equal(gas_anova.dfd, gas_anova_theo[1], 5)
     npt.assert_almost_equal(gas_anova.F_value, gas_anova_theo[2], 5)
     npt.assert_almost_equal(gas_anova.Pr_F, gas_anova_theo[3], 5)
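These tests unpack a fixture self.d (the gas/NOx data used by the test suite), which is not shown here. A minimal self-contained sketch of the same fit-and-ANOVA workflow on made-up synthetic data (array sizes and noise level are illustrative assumptions):

import numpy as np
from skmisc.loess import loess, loess_anova

rng = np.random.default_rng(0)
x = np.linspace(0., 1., 100)
y = np.sin(2 * np.pi * x) + rng.normal(scale=0.2, size=100)

# Fit two nested smoothers and compare them with an approximate F-test
fit_narrow = loess(x, y, span=2. / 3.)
fit_narrow.fit()
fit_wide = loess(x, y, span=1.0)
fit_wide.fit()
anova = loess_anova(fit_narrow, fit_wide)
print(anova.dfn, anova.dfd, anova.F_value, anova.Pr_F)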
Code example #2
File: hvf_selection.py Project: Jaimian/pegasus
from typing import List, Optional

import skmisc.loess as sl


def fit_loess(x: List[float], y: List[float], span: float, degree: int) -> Optional[object]:
    try:
        lobj = sl.loess(x, y, span=span, degree=degree)
        lobj.fit()
        return lobj
    except ValueError:
        # the fit can fail, e.g. on degenerate inputs
        return None
Code example #3
 def test_1dpredict(self):
     "Basic test 1d - prediction"
     (E, NOx, gas_fit_E, _, _, results) = self.d
     gas = loess(E, NOx, span=2. / 3.)
     gas.fit()
     prediction = gas.predict(gas_fit_E, stderror=False)
     npt.assert_almost_equal(prediction.values, results[2], 6)
Code example #4
    def test_failures(self):
        "Tests failures"
        (E, NOx, gas_fit_E, _, _, _) = self.d
        gas = loess(E, NOx, span=2. / 3.)
        # This one should fail (all parametric)
        gas.model.parametric = True
        with pytest.raises(ValueError):
            gas.fit()

        # This one also (all drop_square)
        gas.model.drop_square = True
        with pytest.raises(ValueError):
            gas.fit()

        gas.model.degree = 1
        with pytest.raises(ValueError):
            gas.fit()

        # This one should not (revert to std)
        gas.model.parametric = False
        gas.model.drop_square = False
        gas.model.degree = 2
        gas.fit()

        # Now, for predict .................
        prediction = gas.predict(gas_fit_E, stderror=False)
        # This one should fail (extrapolation & blending)
        with pytest.raises(ValueError):
            gas.predict(prediction.values, stderror=False)

        # But this one should not ..........
        gas.predict(gas_fit_E, stderror=False)
Code example #5
def plot_loess(x, y, plt_idx):

    # Sort data by x-coordinate for plotting
    ind = np.argsort(x)
    x = x[ind]
    y = y[ind]

    l = loess(x, y, surface='direct')
    l.fit()
    pred = l.predict(x, stderror=True)
    conf = pred.confidence(alpha=0.01)

    lowess = pred.values
    ll = np.maximum(0, conf.lower)
    ul = np.minimum(1, conf.upper)

    plt.subplot(2, 2, plt_idx)
    plt.plot(x, y, '+')
    plt.plot(x, lowess)
    plt.xlim(right=1100)
    # subsample_proportion is a module-level constant in the source project
    y_margin = subsample_proportion / 20
    plt.ylim(bottom=-y_margin, top=subsample_proportion + y_margin)
    if plt_idx % 2 == 1:
        plt.ylabel('Transition probability')
    if plt_idx > 2:
        plt.xlabel('Distance to object')
    plt.fill_between(x, ll, ul, alpha=.33)
Code example #6
def loess_curve(da_ts, time_dim='time', season=None, plot=True):
    from skmisc.loess import loess
    import matplotlib.pyplot as plt
    import xarray as xr
    import numpy as np
    if season is not None:
        da_ts = da_ts.sel({time_dim: da_ts[time_dim + '.season'] == season})
    x = da_ts.dropna(time_dim)[time_dim].values
    y = da_ts.dropna(time_dim).values
    l_obj = loess(x, y)
    l_obj.fit()
    pred = l_obj.predict(x, stderror=True)
    conf = pred.confidence()
    lowess = np.copy(pred.values)
    ll = np.copy(conf.lower)
    ul = np.copy(conf.upper)
    da_lowess = xr.Dataset()
    da_lowess['mean'] = xr.DataArray(lowess, dims=[time_dim])
    da_lowess['upper'] = xr.DataArray(ul, dims=[time_dim])
    da_lowess['lower'] = xr.DataArray(ll, dims=[time_dim])
    da_lowess[time_dim] = x
    if plot:
        plt.plot(x, y, '+')
        plt.plot(x, lowess)
        plt.fill_between(x, ll, ul, alpha=.33)
        plt.show()
    return da_lowess
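A hypothetical call to loess_curve, assuming a one-dimensional DataArray with a numeric time coordinate (skmisc's loess needs numeric x values, so a datetime coordinate would have to be converted first, and the season option, which needs a datetime accessor, is skipped):

import numpy as np
import xarray as xr

# Decimal years as a numeric time coordinate (illustrative data)
t = np.arange(2000., 2020., 1. / 12.)
da = xr.DataArray(np.sin(t) + np.random.randn(t.size) * 0.3,
                  dims=['time'], coords={'time': t})
smoothed = loess_curve(da, time_dim='time', plot=False)
print(smoothed['mean'].values[:5])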
Code example #7
def loessPlot(X, y, scatter=True, res=100, x_min=None, x_max=None, x_plot=None,
              ci_alpha=0.05, ax=None, scatter_kws={}, line_kws={}, fill_kws={},
              **loess_args):
    '''
    Plots a loess curve with shaded confidence confidence intervals using
    scikit-misc loess function (https://has2k1.github.io/scikit-misc/loess.html)
    -X can be a (n,) or (n,k) ndarray. If X is (n,k), the x-axis of the plot will
    correspond to the first covariate in the first column, with the other covariates
    entering as invisible controls.
    -y must be a (n,) ndarray
    -res,x_min,x_max set the resolution and domain for sampling the loess prediction.
    If x_min and x_max are not provided they are set to the min and max of the first
    dimension of x.
    -x_plot (optional) overrides res,x_min,x_max and sets the sampling points for
    the plot directly. Must be a 1-d ndarray.
    -ci_alpha sets the confidence interval alpha parameter (default=0.05)
    -ax (optional) sets the matplotlib axes to draw on; defaults to plt.gca()
    -Additional loess args can be passed as named parameters
    '''
    #Set default arguments for graphic elements
    scatter_args = {'s':10,'linewidth':0}
    scatter_args.update(scatter_kws)

    line_args = {}
    line_args.update(line_kws)

    fill_args = {'color':'k','alpha':0.25,'linewidth':0}
    fill_args.update(fill_kws)

    #Split off first dimension of X for plotting
    if len(X.shape) > 1:
        x = X[:,0]
    else:
        x = X
        X = X[:,np.newaxis]

    #Sort out plot range and sampling points
    if x_min is None:
        x_min = x.min()

    if x_max is None:
        x_max = x.max()

    if x_plot is None:
        x_plot = np.linspace(x_min,x_max,res)

    #Compute loess curve and confidence intervals
    loessObject = loess.loess(X,y,**loess_args)
    prediction = loessObject.predict(x_plot,True)
    confidence_intervals = prediction.confidence(alpha=ci_alpha)

    if ax is None:
        ax = plt.gca()

    #Plot
    if scatter:
        ax.scatter(x,y,**scatter_args)

    ax.plot(x_plot,prediction.values,**line_args)
    ax.fill_between(x_plot,confidence_intervals.upper,confidence_intervals.lower,**fill_args)

    return ax
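A hypothetical call, assuming the enclosing module imports numpy as np, matplotlib.pyplot as plt, and from skmisc import loess (the function resolves loess.loess); the data below is made up:

import numpy as np
import matplotlib.pyplot as plt

x = np.random.uniform(0., 10., 200)
y = np.log1p(x) + np.random.normal(scale=0.2, size=200)
ax = loessPlot(x, y, span=0.5, ci_alpha=0.05)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()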
Code example #8
 def test_1dbasic_alt(self):
     "Basic test 1d - part #2"
     (E, NOx, _, _, _, results) = self.d
     gas_null = loess(E, NOx)
     gas_null.model.span = 1.0
     gas_null.fit()
     npt.assert_almost_equal(gas_null.outputs.fitted_values, results[1], 6)
     npt.assert_almost_equal(gas_null.outputs.enp, 3.5, 1)
     npt.assert_almost_equal(gas_null.outputs.residual_scale, 0.5197, 4)
Code example #9
 def test_1dbasic(self):
     "Basic test 1d"
     (E, NOx, _, _, _, results) = self.d
     gas = loess(E, NOx)
     gas.model.span = 2. / 3.
     gas.fit()
     npt.assert_almost_equal(gas.outputs.fitted_values, results[0], 6)
     npt.assert_almost_equal(gas.outputs.enp, 5.5, 1)
     npt.assert_almost_equal(gas.outputs.residual_scale, 0.3404, 4)
Code example #10
def localPartialCorr(y1,y2,X,res=100,x_min=None,x_max=None,x_plot=None,ci_alpha=0.05,inner_bw=10,ax=None,line_kws={},fill_kws={},**loess_args):
    '''
    Computes the local partial correlation between y1 and y2 given controls X

    WARNING: Standard errors should not be taken seriously. The code needs to be
    updated to estimate them more precisely, or at least to correct for the
    intermediate smoothing step.
    '''
    #Split off first dimension of X for defining distances
    if len(X.shape) > 1:
        x = X[:,0]
    else:
        x = X
        X = X[:,np.newaxis]

    #Sort out plot range and sampling points
    if x_min is None:
        x_min = x.min()

    if x_max is None:
        x_max = x.max()

    if x_plot is None:
        x_plot = np.linspace(x_min,x_max,res)

    #Compute local residuals
    loessObjects = [loess.loess(X,y,**loess_args) for y in (y1,y2)]

    for loessObject in loessObjects:
        loessObject.fit()

    r1,r2 = [loessObject.outputs.fitted_residuals for loessObject in loessObjects]

    #Compute invariant components of weighted correlation
    r11 = r1**2
    r22 = r2**2
    r12 = r1*r2

    #Compute local kernel bandwidth for each plot point
    n_span = int(inner_bw)
    h = np.zeros(res)
    for i in range(res):
        d = np.abs(x - x_plot[i])
        h[i] = np.partition(d,n_span)[n_span]

    #Compute locally weighted correlation for each x_plot point
    corr = np.zeros(res)
    for i in range(res):
        #Construct weight matrix using tri-cube weight function
        d = np.abs(x - x_plot[i])
        w = (1 - (d/h[i])**3)**3
        w = np.clip(w,a_min=0,a_max=None)

        #Compute weighted correlation
        corr[i] = np.dot(w,r12) / np.sqrt(np.dot(w,r11)*np.dot(w,r22))

    return loessPlot(x_plot,corr,x_plot=x_plot,ax=ax,scatter=False,line_kws=line_kws,fill_kws=fill_kws,**loess_args)
Code example #11
File: utils.py Project: semir2/ST-mLiver
def smooth_fit(
    xs: np.ndarray,
    ys: np.ndarray,
    dist_thrs: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Smooth curve using loess

    will perform curve fitting using skmisc.loess,
    points above 'dist_thrs' will be excluded.

    Parameters:
    ----------

    xs : np.ndarray
        x values
    ys : np.ndarray
        y values
    dist_thrs : float
        exclude (x,y) tuples where x > dist_thrs

    Returns:
    -------
    A tuple with the included x and y-values (xs', ys'), the fitted
    y-values (ys_hat), and the associated standard errors. The tuple
    has the form (xs', ys', ys_hat, std_err)

    """

    srt = np.argsort(xs)
    xs = xs[srt]
    ys = ys[srt]

    if dist_thrs is None:
        dist_thrs = np.inf

    keep = np.abs(xs) < dist_thrs
    xs = xs[keep]
    ys = ys[keep]

    # generate loess class object
    ls = loess(
        xs,
        ys,
    )
    # fit loess class to data
    ls.fit()

    # predict on data
    pred = ls.predict(xs, stderror=True)
    # get predicted values
    ys_hat = pred.values
    # get standard error
    stderr = pred.stderr

    return (xs, ys, ys_hat, stderr)
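A hypothetical use of smooth_fit on synthetic data (only numpy and the function above are needed; the threshold value is arbitrary):

import numpy as np

xs = np.random.uniform(0., 5., 300)
ys = np.exp(-xs) + np.random.normal(scale=0.05, size=300)
xs_kept, ys_kept, ys_hat, stderr = smooth_fit(xs, ys, dist_thrs=4.0)
print(xs_kept.shape, ys_hat.shape, stderr.shape)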
Code example #12
 def test_2d_pred_nodata(self):
     "2D prediction - nodata"
     (x, y, _, _, _) = self.d
     madeup = loess(x, y)
     with pytest.raises(ValueError):
         madeup.predict(None)
Code example #13
 def test_2dbasic(self):
     "2D standard"
     (x, y, results, _, _) = self.d
     madeup = loess(x, y)
     madeup.model.span = 0.5
     madeup.model.normalize = True
     madeup.fit()
     npt.assert_almost_equal(madeup.outputs.fitted_values, results[0], 5)
     npt.assert_almost_equal(madeup.outputs.enp, 14.9, 1)
     npt.assert_almost_equal(madeup.outputs.residual_scale, 0.9693, 4)
Code example #14
 def test_2d_pred_confinv(self):
     "2D prediction - confidence"
     (x, y, results, _, newdata2) = self.d
     madeup = loess(x, y)
     madeup.model.span = 0.5
     madeup.model.normalize = True
     prediction = madeup.predict(newdata2, stderror=True)
     ci = prediction.confidence(alpha=0.01)
     npt.assert_almost_equal(ci.lower, results[6][::3], 5)
     npt.assert_almost_equal(ci.fit, results[6][1::3], 5)
     npt.assert_almost_equal(ci.upper, results[6][2::3], 5)
Code example #15
 def test_2d_pred_nostderr(self):
     "2D prediction - no stderr"
     (x, y, results, newdata1, _) = self.d
     madeup = loess(x, y)
     madeup.model.span = 0.5
     madeup.model.normalize = True
     prediction = madeup.predict(newdata1, stderror=False)
     npt.assert_almost_equal(prediction.values, results[4], 5)
     #
     prediction = madeup.predict(newdata1, stderror=False)
     npt.assert_almost_equal(prediction.values, results[4], 5)
Code example #16
 def test_1dpredict_2(self):
     "Basic test 1d - new predictions"
     (E, NOx, _, newdata, _, results) = self.d
     # gas = loess(E, NOx, span=2./3.)
     gas = loess(E, NOx)
     gas.model.span = 2. / 3.
     prediction = gas.predict(newdata, stderror=True)
     ci = prediction.confidence(alpha=0.01)
     npt.assert_almost_equal(ci.lower, results[3][0::3], 6)
     npt.assert_almost_equal(ci.fit, results[3][1::3], 6)
     npt.assert_almost_equal(ci.upper, results[3][2::3], 6)
Code example #17
 def test_2d_modflags(self):
     "2D - modification of model flags"
     (x, y, results, _, _) = self.d
     madeup = loess(x, y)
     madeup.model.span = 0.8
     madeup.model.drop_square = [True, False]
     madeup.model.parametric = [True, False]
     npt.assert_equal(madeup.model.parametric[:2], [1, 0])
     madeup.fit()
     npt.assert_almost_equal(madeup.outputs.fitted_values, results[1], 5)
     npt.assert_almost_equal(madeup.outputs.enp, 6.9, 1)
     npt.assert_almost_equal(madeup.outputs.residual_scale, 1.4804, 4)
Code example #18
 def test_2d_modfamily(self):
     "2D - family modification"
     (x, y, results, _, _) = self.d
     madeup = loess(x, y)
     madeup.model.span = 0.8
     madeup.model.drop_square = [True, False]
     madeup.model.parametric = [True, False]
     madeup.model.family = "symmetric"
     madeup.fit()
     npt.assert_almost_equal(madeup.outputs.fitted_values, results[2], 5)
     npt.assert_almost_equal(madeup.outputs.enp, 6.9, 1)
     npt.assert_almost_equal(madeup.outputs.residual_scale, 1.0868, 4)
Code example #19
def fit_lowess(y_pred, y_true):
    l = loess(y_pred, y_true)
    pred, conf, smlowess, ll, ul = None, None, None, None, None

    try:
        l.fit()
        pred = l.predict(y_pred, stderror=True)
        conf = pred.confidence()
        smlowess = pred.values
        ll = conf.lower
        ul = conf.upper
    except ValueError as e:
        print(e)

    return pred, conf, smlowess, ll, ul
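A hypothetical call (the enclosing module is assumed to import loess from skmisc.loess; the data is made up):

import numpy as np

y_true = np.random.normal(size=100)
y_pred = y_true + np.random.normal(scale=0.5, size=100)
pred, conf, smlowess, ll, ul = fit_lowess(y_pred, y_true)
if smlowess is not None:
    print(smlowess[:5], ll[:5], ul[:5])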
Code example #20
File: preprocess.py Project: jaydu1/VITAE
def feature_select(x, gene_num=2000):
    '''Select highly variable genes (HVGs)
    (See [Stuart *et al*, (2019)](https://www.nature.com/articles/nbt.4096) and its early version [preprint](https://www.biorxiv.org/content/10.1101/460147v1.full.pdf)
    Page 12-13: Data preprocessing - Feature selection for individual datasets).

    Parameters
    ----------
    x : np.array
        \([N, G^{raw}]\) The raw count data.
    gene_num : int, optional
        The number of genes to retain.

    Returns
    ----------
    x : np.array
        \([N, G]\) The count data after gene selection.
    index : np.array
        \([G, ]\) The selected index of genes.
    '''

    n, p = x.shape

    # mean and variance of each gene of the unnormalized data
    mean, var = np.mean(x, axis=0), np.var(x, axis=0, ddof=1)

    # model log10(var)~log10(mean) by local fitting of polynomials of degree 2
    loess_model = loess.loess(np.log10(mean),
                              np.log10(var),
                              span=0.3,
                              degree=2,
                              family='gaussian')
    loess_model.fit()
    fitted = loess_model.outputs.fitted_values

    # standardized feature
    z = (x - mean) / np.sqrt(10**fitted)

    # clip the standardized features to remove outliers
    z = np.clip(z, -np.inf, np.sqrt(n))

    # the variance of standardized features across all cells represents a measure of
    # single cell dispersion after controlling for mean expression
    feature_score = np.sum(z**2, axis=0) / (n - 1)

    # feature selection
    index = feature_score.argsort()[::-1][0:gene_num]

    return x[:, index], index
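A hypothetical call on a made-up count matrix (sizes are illustrative; gene means must be strictly positive for the log10 transform, which Poisson counts at this size virtually guarantee):

import numpy as np

# 500 cells x 3000 genes of raw counts (illustrative)
x = np.random.poisson(lam=1.0, size=(500, 3000)).astype(float)
x_selected, index = feature_select(x, gene_num=2000)
print(x_selected.shape)  # (500, 2000)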
Code example #21
    def test_2d_pred_stderr(self):
        "2D prediction - w/ stderr"
        (x, y, results, _, newdata2) = self.d
        madeup = loess(x, y)
        madeup.model.span = 0.5
        madeup.model.normalize = True
        prediction = madeup.predict(newdata2, stderror=True)
        npt.assert_almost_equal(prediction.values, results[5], 5)
        npt.assert_almost_equal(prediction.stderr, [0.276746, 0.278009], 5)
        npt.assert_almost_equal(prediction.residual_scale, 0.969302, 6)
        npt.assert_almost_equal(prediction.df, 81.2319, 4)

        # Direct access
        prediction = madeup.predict(newdata2, stderror=True)
        npt.assert_almost_equal(prediction.values, results[5], 5)
        npt.assert_almost_equal(prediction.stderr, [0.276746, 0.278009], 5)
        npt.assert_almost_equal(prediction.residual_scale, 0.969302, 6)
        npt.assert_almost_equal(prediction.df, 81.2319, 4)
Code example #22
File: hvf_selection.py Project: nealpsmith/pegasus
def select_hvf_pegasus(data: AnnData,
                       consider_batch: bool,
                       n_top: int = 2000,
                       span: float = 0.02) -> None:
    """ Select highly variable features using the pegasus method
    """
    if "robust" not in data.var:
        raise ValueError("Please run `qc_metrics` to identify robust genes")

    estimate_feature_statistics(data, consider_batch)

    robust_idx = data.var["robust"].values
    hvf_index = np.zeros(robust_idx.sum(), dtype=bool)

    mean = data.var.loc[robust_idx, "mean"]
    var = data.var.loc[robust_idx, "var"]

    lobj = sl.loess(mean, var, span=span, degree=2)
    lobj.fit()

    rank1 = np.zeros(hvf_index.size, dtype=int)
    rank2 = np.zeros(hvf_index.size, dtype=int)

    delta = var - lobj.outputs.fitted_values
    fc = var / lobj.outputs.fitted_values

    rank1[np.argsort(delta)[::-1]] = range(hvf_index.size)
    rank2[np.argsort(fc)[::-1]] = range(hvf_index.size)
    hvf_rank = rank1 + rank2

    hvf_index[np.argsort(hvf_rank)[:n_top]] = True

    data.var["hvf_loess"] = 0.0
    data.var.loc[robust_idx, "hvf_loess"] = lobj.outputs.fitted_values

    data.var["hvf_rank"] = -1
    data.var.loc[robust_idx, "hvf_rank"] = hvf_rank
    data.var["highly_variable_features"] = False
    data.var.loc[robust_idx, "highly_variable_features"] = hvf_index
Code example #23
File: em.py Project: HectorRDB/CS289_Final_Project
def EM_initial_guess(data, times, nulls):
    # Initialize the EM algorithm as indicated in the report.
    # Returns the estimated sigma, the fitted loess and the uniform probabilities.
    n_genes, n_times = data.shape

    sigmas = np.sqrt(np.var(data, axis=1))
    fit_loess = np.zeros(data.shape)
    # For every gene
    for ix, row in data.iterrows():
        # Fit a lowess.
        model = loess(x=times, y=row)
        model.fit()
        fit_loess[ix] = model.predict(newdata=times).values

    # Set the probabilities p_j uniformly.
    n_0 = np.sum(np.sum(nulls))
    prob = n_0 / (2 * n_times * n_genes)

    # p is 0.5 * probability to be 0
    p = [prob for _ in range(n_genes)]

    return sigmas, fit_loess, p
Code example #24
File: em.py Project: HectorRDB/CS289_Final_Project
def EM_M_step(data, q, times):
    fit_loess = np.zeros(data.shape)
    p = np.zeros(data.shape[0])

    for ix, row in data.iterrows():

        if np.var(row) == 0:
            pass

        else:

            # Update the function f_j for every gene by fitting a weighted loess.
            model = loess(x=times, y=row, weights=q[ix])
            model.fit()
            fit_loess[ix] = model.predict(times).values

            # Update the probabilities p_j
            p[ix] = np.mean(q[ix])

    # We know that our function cannot be negative.
    fit_loess[fit_loess < 0] = 0

    return fit_loess, p
Code example #25
File: estimate_semipar.py Project: fagan2888/grmpy
def generate_residuals(x, y, bandwidth=0.05):
    """
    This function runs a series of loess regressions for different
    response variables (y) on a single explanatory variable (x)
    and computes the corresponding residuals.
    """
    # Turn input data into np.ndarrays.
    y = np.array(y)
    x = np.array(x)

    # Determine number of observations and number of columns for the
    # outcome variable.
    n = len(y)
    col_len = len(y[0])

    res = np.zeros([n, col_len])

    for i in range(col_len):
        yfit = loess(x, y[:, i], span=bandwidth, degree=1)
        yfit.fit()
        res[:, i] = yfit.outputs.fitted_residuals

    return res
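A hypothetical call with two outcome columns (made-up data; note the function indexes y[:, i] after conversion, so y must be two-dimensional):

import numpy as np

x = np.random.uniform(size=200)
y = np.column_stack([2. * x + np.random.normal(size=200),
                     np.sin(x) + np.random.normal(size=200)])
res = generate_residuals(x, y, bandwidth=0.3)
print(res.shape)  # (200, 2)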
Code example #26
def loess_data(xs, ys):

    ixes = range(len(xs))

    sorted_xs = []
    sorted_ys = []

    for ix in sorted(ixes, key=lambda x: xs[x]):

        sorted_xs.append(xs[ix])
        sorted_ys.append(ys[ix])

    l = loess(sorted_xs, sorted_ys)
    l.fit()

    pred_x = sorted(list(set(sorted_xs)))
    pred = l.predict(pred_x, stderror=True)
    conf = pred.confidence()

    lowess = pred.values
    ll = conf.lower
    ul = conf.upper

    return pred_x, lowess, ll, ul
Code example #27
File: loess.py Project: FranckLejzerowicz/XDOC
def DOC_loess(do: list, p_pair: str, p_span: float, p_degree: float,
              p_family: str, p_iterations: int, p_surface: str):

    # Subset
    OL = do[0]
    DIS = do[1]
    OL_rows, OL_cols = OL.shape
    # Overlap values for loess prediction
    xs = np.linspace(start=0, stop=1, num=1001)

    # Vectorize
    if not p_pair:
        tril = np.tril_indices(OL_rows, k=-1)
        OL_tri = OL.values[tril]
        DIS_tri = DIS.values[tril]
    else:
        OL_tri = OL.values
        DIS_tri = DIS.values

    # To data frame
    DF_l = pd.DataFrame({'y': DIS_tri, 'x': OL_tri})
    DF_l = DF_l.loc[~DF_l.isna().any(axis=1)]

    # Lowess
    LOW = loess(y=DF_l.y,
                x=DF_l.x,
                span=p_span,
                degree=p_degree,
                family=p_family,
                iterations=p_iterations,
                surface=p_surface)
    xs = [round(x, 3) for x in xs if DF_l.x.min() < x < DF_l.x.max()]
    LOW_pred = LOW.predict(newdata=xs)
    LOW_P = pd.DataFrame({"Overlap": xs, "LOWESS": LOW_pred.values})
    LOW_P = LOW_P.loc[~LOW_P.isna().any(axis=1)]
    return LOW_P
Code example #28
def _highly_variable_genes_seurat_v3(
    adata: AnnData,
    layer: Optional[str] = None,
    n_top_genes: int = 2000,
    batch_key: Optional[str] = None,
    check_values: bool = True,
    span: float = 0.3,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `highly_variable_genes`.

    For further implementation details see https://www.overleaf.com/read/ckptrbgzzzpg

    Returns
    -------
    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
    updates `.var` with the following fields

    highly_variable : bool
        boolean indicator of highly-variable genes
    **means**
        means per gene
    **variances**
        variance per gene
    **variances_norm**
        normalized variance per gene, averaged in the case of multiple batches
    highly_variable_rank : float
        Rank of the gene according to normalized variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If batch_key is given, this denotes in how many batches genes are detected as HVG
    """

    try:
        from skmisc.loess import loess
    except ImportError:
        raise ImportError(
            'Please install the skmisc package via `pip install --user scikit-misc`'
        )

    X = adata.layers[layer] if layer is not None else adata.X
    if check_values and not check_nonnegative_integers(X):
        warnings.warn(
            "`flavor='seurat_v3'` expects raw count data, but non-integers were found.",
            UserWarning,
        )

    if batch_key is None:
        batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
    else:
        batch_info = adata.obs[batch_key].values

    norm_gene_vars = []
    for b in np.unique(batch_info):

        ad = adata[batch_info == b]
        X = ad.layers[layer] if layer is not None else ad.X

        mean, var = _get_mean_var(X)
        not_const = var > 0
        estimat_var = np.zeros(adata.shape[1], dtype=np.float64)

        y = np.log10(var[not_const])
        x = np.log10(mean[not_const])
        model = loess(x, y, span=span, degree=2)
        model.fit()
        estimat_var[not_const] = model.outputs.fitted_values
        reg_std = np.sqrt(10**estimat_var)

        batch_counts = X.astype(np.float64).copy()
        # clip large values as in Seurat
        N = np.sum(batch_info == b)
        vmax = np.sqrt(N)
        clip_val = reg_std * vmax + mean
        if sp_sparse.issparse(batch_counts):
            batch_counts = sp_sparse.csr_matrix(batch_counts)
            mask = batch_counts.data > clip_val[batch_counts.indices]
            batch_counts.data[mask] = clip_val[batch_counts.indices[mask]]
        else:
            clip_val_broad = np.broadcast_to(clip_val, batch_counts.shape)
            np.putmask(
                batch_counts,
                batch_counts > clip_val_broad,
                clip_val_broad,
            )

        if sp_sparse.issparse(batch_counts):
            squared_batch_counts_sum = np.array(
                batch_counts.power(2).sum(axis=0))
            batch_counts_sum = np.array(batch_counts.sum(axis=0))
        else:
            squared_batch_counts_sum = np.square(batch_counts).sum(axis=0)
            batch_counts_sum = batch_counts.sum(axis=0)

        norm_gene_var = (1 / ((N - 1) * np.square(reg_std))) * (
            (N * np.square(mean)) + squared_batch_counts_sum -
            2 * batch_counts_sum * mean)
        norm_gene_vars.append(norm_gene_var.reshape(1, -1))

    norm_gene_vars = np.concatenate(norm_gene_vars, axis=0)
    # argsort twice gives ranks, small rank means most variable
    ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1),
                                       axis=1)

    # this is done in SelectIntegrationFeatures() in Seurat v3
    ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32)
    num_batches_high_var = np.sum(
        (ranked_norm_gene_vars < n_top_genes).astype(int), axis=0)
    ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan
    ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
    median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

    df = pd.DataFrame(index=np.array(adata.var_names))
    df['highly_variable_nbatches'] = num_batches_high_var
    df['highly_variable_rank'] = median_ranked
    df['variances_norm'] = np.mean(norm_gene_vars, axis=0)
    df['means'] = mean
    df['variances'] = var

    df.sort_values(
        ['highly_variable_rank', 'highly_variable_nbatches'],
        ascending=[True, False],
        na_position='last',
        inplace=True,
    )
    df['highly_variable'] = False
    # mark the top-ranked genes positionally (df is sorted by rank above)
    df.iloc[:n_top_genes, df.columns.get_loc('highly_variable')] = True
    df = df.loc[adata.var_names]

    if inplace or subset:
        adata.uns['hvg'] = {'flavor': 'seurat_v3'}
        logg.hint('added\n'
                  '    \'highly_variable\', boolean vector (adata.var)\n'
                  '    \'highly_variable_rank\', float vector (adata.var)\n'
                  '    \'means\', float vector (adata.var)\n'
                  '    \'variances\', float vector (adata.var)\n'
                  '    \'variances_norm\', float vector (adata.var)')
        adata.var['highly_variable'] = df['highly_variable'].values
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['variances_norm'] = df['variances_norm'].values.astype(
            'float64', copy=False)
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'].values
        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)
    else:
        if batch_key is None:
            df = df.drop(['highly_variable_nbatches'], axis=1)
        return df
Code example #29
def get_significant_ev(ev1, ev2, ids, cond1, cond2, norm='loess', plot='all',
                       mid_point=0., EV_range=(-1.4, 1.4), confidence=0.95,
                       alpha=0.75, kernel_density=200):
    """
    Compare two eigenvectors (from two conditions), using as a null, or
       background, model the differences between each pair of neighboring
       bins. EigenVectors are Z-score normalized and their difference is
       LOESS normalized

    :param ev1: list of values in eigenvectors from first condition
    :param ev2: list of values in eigenvectors from second condition
    :param ids: list of names (e.g. [('chr1', 1294), ...]) corresponding to
       values in each eigenvector
    :param cond1: name of first experiment (corresponding to first eigenvector)
    :param cond2: name of second experiment (corresponding to second
       eigenvector)
    :param loess norm: normalization to perform on the difference of
       eigenvectors (options are loess or none)
    :param 0.75 alpha: smoothing parameter for LOESS normalization (0 pass
       through all points, 1, straight line)
    :param 200 kernel_density: density of the matrix for Gaussian kernel
       density estimation
    :param all plot: plots to be generated. If 'all', 6 plots will be generated:
       three for differences between conditions (before and after LOESS
       normalization, and a density map), the same two for the null model. If
       'final', only a single plot with density maps of observed data and null
       model. If 'none', no plot will be generated.
    :param 0.95 confidence: confidence level for definition of bins with
       significantly different compartments

    :returns: a dictionary with, as keys, the input ids, and as values, a tuple
       with two values: 1- the probabilities (or Cumulative Densities) of each
       compared pair of eigenvector values inside the Gaussian kernel density
       of the null model, and 2- the LOESS normalized difference between
       eigenvectors values
    """
    print('Getting EigenVectors')
    ev1 = np.array(ev1)
    ev2 = np.array(ev2)
    ids = np.array(ids)

    # ~normalize
    # ev1 = ev1 / np.std(ev1) / 3
    # ev2 = ev2 / np.std(ev2) / 3

    # plot
    axes = []
    if plot == 'all':
        _ = plt.figure(figsize=(18, 18))
    elif plot != 'none':
        _ = plt.figure(figsize=(10, 10))
        axe = plt.subplot(111)
    if plot in ['all', 'correlation']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 1)
        axes.append(axe)
        axe = compare_AB(ev1, ev2, axe=axe, xlabel='Eigenvector of ' + cond1,
                         ylabel='Eigenvector of ' + cond2, color_ab=True,
                         mid_point=mid_point, EV_range=EV_range)
        axe.set_title('Correlation between EigenVectors (%s vs %s)' % (
            cond1, cond2))

    # Definition of null model
    print('Defining Null model')
    ev3 = []
    ev4 = []
    for pos in range(0, len(ev1) - 1, 2):
        # we want same chromosome and true neighbors
        if (ids[pos][0] == ids[pos + 1][0] and
            ids[pos + 1][1] - ids[pos][1] == 1):
            ev3.append(ev1[pos])
            ev3.append(ev2[pos])
            ev4.append(ev1[pos + 1])
            ev4.append(ev2[pos + 1])

    if plot == 'all':
        axes.append(plt.subplot(2, 2, 2))
        axe = compare_AB(
            ev3, ev4, axe=axes[-1],
            xlabel='Eigenvector from %s and %s ($n$)' % (cond1, cond2),
            ylabel='Eigenvector from %s and %s ($n+1$)' % (cond1, cond2),
            color_ab=True, mid_point=mid_point, EV_range=EV_range)
        axe.set_title((
            'Correlation of EigenVectors (Null model)\n'
            '{0} bins $n$ vs $n+1$ and {1} '
            'bins $n$ vs $n+1$').format(cond1, cond2))

    ##########################################################################
    # Normalization

    # Z-scorish
    zev1 = ev1  # (ev1 - np.mean(ev1)) / np.std(ev1) / 3
    zev2 = ev2  # (ev2 - np.mean(ev2)) / np.std(ev2) / 3
    # prepare data for MA plot
    x = (zev1 + zev2) / 2
    y = (zev1 - zev2)
    # sort
    idx = np.argsort(x)
    x = x[idx]
    y = y[idx]
    ids = ids[idx]

    # for null model:
    zev3 = np.array(ev3)
    zev4 = np.array(ev4)
    x_cor = (zev3 + zev4) / 2
    y_cor = (zev3 - zev4)
    idx_cor = np.argsort(x_cor)
    x_cor = x_cor[idx_cor]
    y_cor = y_cor[idx_cor]

    # LOESS
    if norm == 'loess':
        print('Computing LOESS on observed data')
        lo = loess(x, y, span=alpha, weights=None)
        lo.fit()
        pred = lo.outputs.fitted_values.copy()
        df = lo.outputs.enp
    else:
        pred = np.zeros(len(y))
        df = 0

    # LOESS on Null model
    if norm == 'loess':
        print('Computing LOESS on Null model')
        lo = loess(x_cor, y_cor, span=alpha, weights=None)
        lo.fit()
        pred_cor = lo.outputs.fitted_values.copy()
        df = lo.outputs.enp
    else:
        pred_cor = np.zeros(len(y_cor))
        df = 0

    ##########################################################################
    # ordinary least square regression
    print('Perform OLS regression and outlier test')

    # for real data
    modelR = OLS(y - pred, x, )
    resultR = modelR.fit()

    # for null model
    modelN = OLS(y_cor - pred_cor, x_cor)
    resultN = modelN.fit()

    sigmaN = np.sqrt(resultN.mse_resid)

    hiiR = resultR.get_influence().hat_matrix_diag  # model leverage
    sigmaR = np.sqrt(resultR.mse_resid)
    residR = resultR.resid / sigmaN / np.sqrt(1 - hiiR)
    dfR = modelR.df_resid - 1

    unadj_pR = st.t.sf(np.abs(residR), dfR) * 2
    adj_pR = multipletests(unadj_pR, alpha=0.05, method='bonferroni')[1]

    if plot in ['all', 'difference']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 3)
        axe.set_title(('Bland-Altman plot of EigenVectors (%s vs %s)\n'
                       'with prediction bands based on null model') % (
                           cond1, cond2))
        axes.append(nice_ba_plot(x, y, unadj_pR, sigmaN, sigmaR, pred,
                                 cond1, cond2, alpha=alpha, ax=axe))

    ##########################################################################
    print('Perform the kernel density estimate for null model')
    y -= pred
    y_cor -= pred_cor
    # Perform the kernel density estimate for null model
    xmin = min(x_cor) - abs(min(x_cor)) * .5
    ymin = min(y_cor) - abs(min(y_cor)) * .5
    xmax = max(x_cor) * 1.5
    ymax = max(y_cor) * 1.5
    xx, yy = np.mgrid[xmin:xmax:complex(0, kernel_density),
                      ymin:ymax:complex(0, kernel_density)]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    z_cor = np.vstack([x_cor, y_cor])
    kernel_cor = st.gaussian_kde(z_cor)
    f_cor = np.reshape(kernel_cor(positions).T, xx.shape)
    f_cor_sum = f_cor.sum()
    f_cor /= f_cor_sum

    print('Perform the kernel density estimate for comparison')
    # Perform the kernel density estimate for comparison
    z = np.vstack([x, y])
    kernel = st.gaussian_kde(z)
    f = np.reshape(kernel(positions).T, xx.shape)
    f /= f.sum()

    # define probability lines
    n = 10000
    t = np.linspace(0, f_cor.max(), n)

    # kernel probability for null model
    integral = ((f_cor >= t[:, None, None]) * f_cor).sum(axis=(1, 2))
    # function to get kernel density at a given CDF
    ff = interpolate.interp1d(integral, t)
    steps = [0.99, 0.95, 0.9, 0.75, 0.50, 0.25]
    t_contours = ff(np.array(steps))
    # function to get CDF at a given kernel density
    invff = interpolate.interp1d(t, integral)

    # significant bins in Null model
    cut = confidence

    # significant bins observed data
    print('Computing significant changes in observed data')
    signx = []
    signy = []
    result = {}

    # get kernel density for each pair of eigenvectors
    pvals = kernel_cor.pdf((x, y)) / f_cor_sum

    ev1 = ev1[idx]
    ev2 = ev2[idx]
    for i, pv in enumerate(pvals):
        try:
            pv = invff(pv)  # convert PDF to CDF
        except ValueError:
            # pv falls outside the interpolation range of the null-model
            # CDF; skip this bin
            continue
        if pv > cut:
            signx.append(x[i])
            signy.append(y[i])
        result[ids[i][0], ids[i][1]] = (ev1[i], ev2[i], pv, y[i],
                                        unadj_pR[i], adj_pR[i])

    if plot in ['all', 'density']:
        if plot == 'all':
            axe = plt.subplot(2, 2, 4)
        plt.title(('LOESS normalized BA density plot of EigenVectors\n'
                   '({0} vs {1} plotted over null model)').format(
                       cond1, cond2))
        axes.append(nice_contour_plot(
            xx, yy, f, f_cor, cond1, cond2, ax=axe, total_len=len(x), cut=cut,
            signx=signx, signy=signy, t_contours=t_contours, steps=steps))

    if plot in ['all', 'correlation']:
        for axe in axes[:2]:
            axe.set_xlim(EV_range)
            axe.set_ylim(EV_range)
    if plot in ['all', 'density', 'difference']:
        xlim = (min(x.min(), x_cor.min()), max(x.max(), x_cor.max()))
        ylim = (min(y.min(), y_cor.min()) * 1.15,
                max(y.max(), y_cor.max()) * 1.15)
        for axe in (axes[2:] if plot == 'all' else axes):
            axe.set_xlim(xlim)
            axe.set_ylim(ylim)
    return result
Code example #30
    output = runByCaseSmooth(case, maf, genometot, data, span, IDs, nathres,
                             offby)
    return case, output


def runByCaseSmooth(case,
                    maf,
                    genometot,
                    data,
                    span,
                    IDs,
                    nathres=0.3,
                    offby=3):
    start_time = time.time()
    model = loess.loess(data['starts'],
                        data['counts'],
                        span=span,
                        surface='direct')
    model.fit()
    stored_all = {
        'mutdiff': [],
        'position': [],
        'mutrate': [],
        'mutrate_noadj': [],
        'patient': []
    }
    use_mean = True

    these = getMinDistByGenome(maf, case, IDs, offby=offby, use_mean=use_mean)

    if these.shape[0] == 0:
        logger.info(
Code example #31
File: seqff.py Project: haoziyeung/seqff
    def seqff(self):
        """
        Computes seqff, wrsc and enet scores and writes the result files

        supplementary files can be downloaded below:
        https://obgyn.onlinelibrary.wiley.com/doi/abs/10.1002/pd.4615


        :param bininfoData: location of supplementary table2.csv file
        :type bininfoData: String

        :param inputData: directory path or file location of inputdata ( ".sam" or ".bam" or ".newtemp")
        :type inputData: String

        :param rdata: location of supplementary .rdata file
        :type rdata: String

        :param output_lod: where result files are written (a total of 4 directories will be created)
        :type output_lod: String

        :return: None
        """

        start = time.time()

        # load bininfo
        bininfo = load_bininfo(self.bininfodata_loc)

        # load input files
        if os.path.isdir(self.input_loc):
            input_list = [
                self.input_loc + x for x in os.listdir(self.input_loc)
            ]

        elif os.path.isfile(self.input_loc):
            input_list = [self.input_loc]

        else:
            raise FileNotFoundError(
                "error occurred : inputData is not a Directory or File")

        for i, file in enumerate(input_list):
            filetype = file.split(".")[-1]
            # filetype : 'sam' or 'bam' or 'newtemp'
            if 'sam' in filetype:
                bincount = load_sam(file)

            elif 'newtemp' in filetype:
                bincount = load_counts(file)
                file = file.replace(".newtemp", "")  # TEMP .newtemp -> .bam

            elif 'bam' in filetype:
                bincount = load_bam(file)

            else:
                continue

            #CREATE newtemp file in "output_loc"/newtemp/
            create_newtemp(bincount, file, self.newtemp_loc)

            newtemp = pd.DataFrame.from_dict(bincount, orient='index')
            newtemp.reset_index(level=0, inplace=True)
            newtemp.rename(columns={
                'index': 'binName',
                0: 'counts'
            },
                           inplace=True)

            temp_bininfo = bininfo.copy(deep=True)
            temp_bininfo = temp_bininfo.merge(
                newtemp, on='binName',
                how='left')  # missing value : NaN, not NA in pandas
            temp_bininfo['counts'] = temp_bininfo['counts'].fillna(0)

            temp_bininfo.sort_values(by='binorder', inplace=True)
            temp_bininfo.reset_index(drop=True)

            ####DATA PROCESSING #######################
            autosomebinsonly = []
            for index in range(61927):
                boolean = (temp_bininfo['FRS'][index] != 'NA') and \
                          (float(temp_bininfo['GC'][index]) > 0.316) and \
                          (temp_bininfo['CHR'][index] != 'chrX') and \
                          (temp_bininfo['CHR'][index] != 'chrY')
                autosomebinsonly.append(boolean)
            autosomebinsonly = pd.Series(autosomebinsonly)

            alluseablebins = []
            for index in range(61927):
                boolean = (temp_bininfo['FRS'][index] != "NA") and (float(
                    temp_bininfo['GC'][index]) > 0.316)
                alluseablebins.append(boolean)
            alluseablebins = pd.Series(alluseablebins)

            #CREATE alluseablebins file in "output_loc"/alluseablebins
            #create_alluseablebins(alluseablebins, file, self.alluseablebins_loc)

            sum_counts = pd.Series(temp_bininfo['counts'])
            sum_counts = sum_counts[autosomebinsonly].sum(skipna=True)

            autoscaledtemp = pd.Series(
                temp_bininfo['counts'].loc[(autosomebinsonly)],
                copy=True) / sum_counts  # NA-related code removed
            allscaledtemp = pd.Series(
                temp_bininfo['counts'].loc[(alluseablebins)],
                copy=True) / sum_counts

            gc_index = {}
            cnt = 0
            for index, isauto in enumerate(autosomebinsonly):
                if isauto:
                    if temp_bininfo['GC'].iat[index] in gc_index:
                        gc_index[temp_bininfo['GC'].iat[index]].append(
                            float(autoscaledtemp.iat[cnt]))
                        cnt += 1

                    else:
                        gc_index[temp_bininfo['GC'].iat[index]] = [
                            float(autoscaledtemp.iat[cnt])
                        ]
                        cnt += 1

            key_list = []
            val_list = []
            for key, val in gc_index.items():
                key_list.append(key)
                val_list.append(np.median(val))

            loess_var = loess(key_list, val_list)  # default span : 0.75
            loess_var.fit()
            # y = loess.loess_prediction(newData, loessVar)
            # temp_loessPredict.loess_debugging(loessVar)

            ###prediction###
            loess_x = [
                float(gc) for index, gc in enumerate(temp_bininfo['GC'])
                if (alluseablebins[index])
            ]
            # print(temp_bininfo['GC'])
            loess_fitted = loess_var.predict(loess_x)
            loess_fitted = list(loess_fitted.values)
            # print(loess_fitted)

            median_autoscaledtemp = np.median(autoscaledtemp)
            median_autoscaledtemp = float(
                median_autoscaledtemp)  # for fixed constant

            normalizedbincount = [
                (x + (median_autoscaledtemp - loess_fitted[index]))
                for index, x in enumerate(allscaledtemp)
            ]

            #CREATE normalizedbincount in "output_loc"/normalizedbincount
            create_normalizedbincount(normalizedbincount, file,
                                      self.normalizedbincount_loc)

            bincounts = pd.Series(data=np.repeat(a=0.0, repeats=61927),
                                  index=temp_bininfo['binName'],
                                  dtype=np.float64)

            sum_normalizedbincount = sum(
                [val for val in normalizedbincount if not math.isnan(val)])
            sum_normalizedbincount = float(
                sum_normalizedbincount)  # deep copy temporarily

            cnt = 0
            for index, x in enumerate(alluseablebins):
                if x == True:
                    data = (normalizedbincount[cnt] /
                            sum_normalizedbincount) * len(normalizedbincount)
                    bincounts.iat[index] = data
                    cnt += 1

            #CREATE bincounts in "output_loc"/bincounts
            create_bincounts(bincounts, file, self.bincounts_loc)

            wrsc = self.prediction(bincounts, self.B, self.mu,
                                   self.parameter_1, self.parameter_2)
            enet = np.dot(bincounts, (self.elnetbeta)) + (self.elnetintercept)
            ff = (wrsc + enet) / 2

            result_lines = list()
            result_lines.append("SeqFF\tEnet\tWRSC")
            result_lines.append("{}\t{}\t{}".format(ff, enet, wrsc))

            #CREATE results of seqff (seqff paper result covered) in "output_loc"/results
            create_results(result_lines, file, self.results_loc)

            end = time.time()
            elapsed = end - start
            h = int(elapsed) // 3600
            m = (int(elapsed) - (h * 3600)) // 60
            s = (int(elapsed) % 60)
            print("elapsed time: %d hr %d min %d sec" % (h, m, s))
            print("elapsed :", elapsed)
            print("progress : {} / {}".format(i + 1, self.progress))