Example #1
import numpy as np
import pandas as p  # this example aliases pandas as `p`
from sklearn.preprocessing import StandardScaler


def factor_rets(stocks_ret, num_comps):
    # standardization, covariance, and eigen-decomposition happen inside the
    # function because they are recomputed for each trailing 252-day window
    stocks_ret_std = StandardScaler().fit_transform(stocks_ret)
    stocks_ret_std = p.DataFrame(stocks_ret_std, index=stocks_ret.index)

    # The correlation matrix of raw returns gives the same eigenvectors as
    # the covariance matrix of standardized returns (commented out below).
    # Both produced complex eigenvalues on a 252-day x 470-stock window,
    # where the matrix is rank-deficient; extending the window to two years
    # (504 days) was the original workaround. np.linalg.eigh, which is meant
    # for symmetric matrices and guarantees real eigenvalues and unit-norm
    # eigenvectors, avoids the complex values altogether.
    # cov_mat = np.cov(stocks_ret_std.T)
    # eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    cov_mat = np.corrcoef(stocks_ret.T)
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # per-stock return volatility, used to de-standardize the eigenvector weights
    sigma_bar = stocks_ret.apply(np.std, axis=0)
    # SVD is a good cross-check, modulo sign ambiguity:
    # u, s, v = np.linalg.svd(stocks_ret_std.T)
    factor_returns = []
    for i in range(num_comps):
        V = eig_pairs[i][1]  # unit-norm eigenvector (numpy normalizes automatically)
        eig_val = eig_pairs[i][0]
        # The eigenvector comes from standardized returns, so divide by each
        # stock's volatility to get weights that apply to raw returns, and
        # scale by 1/sqrt(eigenvalue) to normalize the factor's variance.
        portfolio_weights = (1 / np.sqrt(eig_val)) * (V / sigma_bar)
        # factor return: trailing-window raw returns projected onto the weights
        F = np.matmul(stocks_ret, portfolio_weights)
        factor_returns.append(F)

    factor_returns = p.DataFrame(factor_returns).T
    factor_returns.index = stocks_ret.index

    intercept = p.Series(np.ones(len(factor_returns)),
                         index=factor_returns.index)
    X = p.concat([intercept, factor_returns], axis=1)
    X = X[-60:]  # regress on the most recent 60 days only

    # X is identical for every stock, so the closure only needs y
    def matrix_regression(y):
        # OLS via the normal equations: b = (X'X)^-1 X'y
        b = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), y)
        e = y - np.matmul(X, b)
        return list(b), list(e)

    betas = []
    residuals = []

    for j in range(len(stocks_ret.columns)):
        b, e = matrix_regression(stocks_ret_std.iloc[-60:, j])
        betas.append(b)
        residuals.append(e)

    out_betas = p.DataFrame(betas, index=stocks_ret.columns)
    out_betas.columns = ['Intercept'] + ['PC{}'.format(i + 1)
                                         for i in range(num_comps)]
    out_resid = p.DataFrame(residuals, index=stocks_ret.columns)

    def autoregression(y):
        # AR(1) fit by OLS: y_t = a + b * y_{t-1} + e_t
        y_tminus1 = y.shift(1)
        intercept = p.Series(np.ones(len(y)), index=y.index)
        y_tminus1 = p.concat([intercept, y_tminus1], axis=1)
        y_tminus1 = y_tminus1[1:len(y_tminus1)]
        y = y[1:len(y)]
        b = np.matmul(
            np.matmul(np.linalg.inv(np.matmul(y_tminus1.T, y_tminus1)),
                      y_tminus1.T), y)
        e = y - np.matmul(y_tminus1, b)
        a = b[0]
        b = b[1:len(b)]
        return a, list(b), list(e)

    a = []
    b = []
    z = []

    for i in range(len(out_resid)):
        alpha, beta, zeta = autoregression(out_resid.iloc[i, :])
        a.append(alpha)
        b.append(beta)
        z.append(zeta)

    a = p.DataFrame(a)
    b = p.DataFrame(b)
    z = p.DataFrame(z)
    a.index = b.index = z.index = out_betas.index

    z_var = z.apply(np.var, axis=1)

    # s-score of each stock's AR(1) residual process:
    #   s = -a * sqrt(1 - b^2) / ((1 - b) * sqrt(var(z)))
    numer = -a * np.sqrt(1 - b**2)
    denom = (1 - b) * p.DataFrame(np.sqrt(z_var))
    s = numer / denom

    return s
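
A minimal usage sketch for the function above, assuming a wide DataFrame of daily returns (dates as the index, one column per ticker); the tickers and dates here are hypothetical stand-ins:

import numpy as np
import pandas as p

# hypothetical returns panel: 252 trading days x 5 tickers
dates = p.bdate_range("2020-01-01", periods=252)
tickers = ["AAA", "BBB", "CCC", "DDD", "EEE"]
rets = p.DataFrame(np.random.randn(252, 5) * 0.01,
                   index=dates, columns=tickers)

# one s-score per ticker from a 3-factor eigenportfolio model
s_scores = factor_rets(rets, num_comps=3)
print(s_scores)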
Example #2
import pandas as pd
from sklearn.preprocessing import StandardScaler


def get_labelled(X):
    # standardize features, keeping the original index and columns
    X_std = StandardScaler().fit_transform(X)
    X_std = pd.DataFrame(X_std, columns=X.columns, index=X.index)
    # anomalous_centres is assumed to exist in the enclosing scope
    y = X.index.to_series().apply(lambda x: x in anomalous_centres)
    return X_std, y
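
A minimal sketch of how the function above might be called; the feature frame and the anomalous_centres set are hypothetical stand-ins for whatever the enclosing module defines:

import numpy as np
import pandas as pd

# hypothetical features indexed by centre id
X = pd.DataFrame(np.random.rand(6, 3),
                 columns=["f1", "f2", "f3"],
                 index=["c1", "c2", "c3", "c4", "c5", "c6"])
anomalous_centres = {"c2", "c5"}  # hypothetical stand-in for the real collection

X_std, y = get_labelled(X)
print(y)  # boolean Series: True for c2 and c5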
Example #3
    def loadRawData(datadir,
                    puckid,
                    num_nmf_factors=100,
                    prep_for_benchmarking=False):
        """
Load data for a particular puck, clean it up a bit and store as AnnData. For later use, also performs a NMF and stores those.
Borrows code from autoNMFreg_windows.py, provided with the Slide-Seq raw data.
        """
        from sklearn.preprocessing import StandardScaler

        puckdir = "{0}/Puck_{1}".format(datadir, puckid)
        beadmapdir = max(glob.glob("{0}/BeadMapping_*-*_????".format(puckdir)),
                         key=os.path.getctime)
        schema_debug("Flag 314.001 ", beadmapdir)

        # gene exp
        gexp_file = "{0}/MappedDGEForR.csv".format(beadmapdir)
        dge = fast_csv_read(gexp_file, header=0, index_col=0)
        # for faster testing runs, use the line below; it holds just the
        # first 500 columns of the gexp_file
        ## dge = fast_csv_read("/tmp/a1_dge.csv", header = 0, index_col = 0)
        dge = dge.T
        dge = dge.reset_index()
        dge = dge.rename(columns={'index': 'barcode'})
        schema_debug("Flag 314.010 ", dge.shape, dge.columns)

        # spatial location
        beadloc_file = "{0}/BeadLocationsForR.csv".format(beadmapdir)
        coords = fast_csv_read(beadloc_file, header=0)
        coords = coords.rename(columns={'Barcodes': 'barcode'})
        coords = coords.rename(columns={'barcodes': 'barcode'})
        schema_debug("Flag 314.020 ", coords.shape, coords.columns)

        # Slide-Seq cluster assignments
        atlas_clusters_file = "{0}/AnalogizerClusterAssignments.csv".format(
            beadmapdir)
        clstrs = pd.read_csv(atlas_clusters_file, index_col=None)
        assert list(clstrs.columns) == ["Var1", "x"]
        clstrs.columns = ["barcode", "atlas_cluster"]
        clstrs = clstrs.set_index("barcode")
        schema_debug("Flag 314.030 ", clstrs.shape, clstrs.columns)

        df_merged = dge.merge(coords, right_on='barcode', left_on='barcode')
        df_merged = df_merged[df_merged.barcode.isin(clstrs.index)]
        schema_debug("Flag 314.040 ", df_merged.shape, df_merged.columns)

        # remove sparse gene exp
        counts = df_merged.drop(['xcoord', 'ycoord'], axis=1)
        counts2 = counts.copy(deep=True)
        counts2 = counts2.set_index('barcode')  #.drop('barcode',axis=1)
        counts2_okcols = counts2.sum(axis=0) > 0
        counts2 = counts2.loc[:, counts2_okcols]
        UMI_threshold = 5
        counts2_umis = counts2.sum(axis=1).values
        counts2 = counts2.loc[counts2_umis > UMI_threshold, :]
        schema_debug("Flag 314.0552 ", counts.shape, counts2.shape,
                     counts2_umis.shape, isinstance(counts2, pd.DataFrame))

        # the Slide-Seq authors normalize each bead's counts to sum to 1, rather than to 1e6
        cval = counts2_umis[counts2_umis > UMI_threshold]
        if not prep_for_benchmarking:
            counts2 = counts2.divide(cval, axis=0)
            # equivalently: counts2 = np.true_divide(counts2, counts2_umis[:, None])

            # this is also a little unusual, but I'm following their practice
            counts2.iloc[:, :] = StandardScaler(with_mean=False).fit_transform(
                counts2.values)
            schema_debug("Flag 314.0553 ", counts2.shape, counts2_umis.shape,
                         isinstance(counts2, pd.DataFrame))

        coords2 = df_merged.loc[df_merged.barcode.isin(counts2.index),
                                ["barcode", "xcoord", "ycoord"]].copy(
                                    deep=True)
        coords2 = coords2.set_index('barcode')  #.drop('barcode', axis=1)
        schema_debug("Flag 314.0555 ", coords2.shape,
                     isinstance(coords2, pd.DataFrame))

        ok_barcodes = set(coords2.index) & set(counts2.index) & set(
            clstrs.index)
        schema_debug("Flag 314.060 ", coords2.shape, counts2.shape,
                     clstrs.shape, len(ok_barcodes))

        if prep_for_benchmarking:
            return (counts2[counts2.index.isin(ok_barcodes)].sort_index(),
                    coords2[coords2.index.isin(ok_barcodes)].sort_index(),
                    clstrs[clstrs.index.isin(ok_barcodes)].sort_index())

        ## do NMF
        K1 = num_nmf_factors
        listK1 = ["P{}".format(i + 1) for i in range(K1)]
        random_state = 17  # fixed seed for repeatability
        # note: the `alpha` argument was deprecated and later removed in
        # scikit-learn (1.2+ uses alpha_W / alpha_H); this call targets the
        # older API the original project used
        model1 = sklearn.decomposition.NMF(n_components=K1,
                                           init='random',
                                           random_state=random_state,
                                           alpha=0,
                                           l1_ratio=0)
        # yes, the Slide-Seq code had Ho and Wo mixed up; just following their lead here
        Ho = model1.fit_transform(counts2.values)
        Wo = model1.components_

        schema_debug("Flag 314.070 ", Ho.shape, Wo.shape)

        Ho_norm = StandardScaler(with_mean=False).fit_transform(Ho)
        Ho_norm = pd.DataFrame(Ho_norm)
        Ho_norm.index = counts2.index
        Ho_norm.columns = listK1
        Wo = pd.DataFrame(Wo)
        Wo.index = listK1
        Wo.index.name = "Factor"
        Wo.columns = list(counts2.columns)

        Ho_norm = Ho_norm[Ho_norm.index.isin(ok_barcodes)]
        Ho_norm = Ho_norm / Ho_norm.std(axis=0)

        schema_debug("Flag 314.080 ", Ho_norm.shape, Wo.shape)

        genexp = counts2[counts2.index.isin(ok_barcodes)].sort_index()
        beadloc = coords2[coords2.index.isin(ok_barcodes)].sort_index()
        clstrs = clstrs[clstrs.index.isin(ok_barcodes)].sort_index()
        Ho_norm = Ho_norm.sort_index()

        schema_debug("Flag 314.090 ", genexp.shape, beadloc.shape,
                     clstrs.shape, Ho_norm.shape, genexp.index[:5],
                     beadloc.index[:5])

        beadloc["atlas_cluster"] = clstrs["atlas_cluster"]

        if "AnnData" not in dir():
            from anndata import AnnData

        adata = AnnData(X=genexp.values,
                        obs=beadloc,
                        uns={
                            "Ho": Ho_norm,
                            "Ho.index": list(Ho_norm.index),
                            "Ho.columns": list(Ho_norm.columns),
                            "Wo": Wo,
                            "Wo.index": list(Wo.index),
                            "Wo.columns": list(Wo.columns)
                        })
        return adata
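
A hedged sketch of a call to loadRawData, assuming the function is reachable at module level, that the surrounding module provides fast_csv_read and schema_debug, and that datadir points at the Slide-Seq layout the function expects (datadir/Puck_<id>/BeadMapping_*); the path and puck id below are hypothetical:

# hypothetical paths; requires the Slide-Seq BeadMapping outputs on disk
datadir = "/data/slideseq"
puckid = "180430_1"

adata = loadRawData(datadir, puckid, num_nmf_factors=100)
print(adata.shape)            # beads x genes
print(adata.uns["Ho"].shape)  # beads x NMF factors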
Example #4
                                           window)
            c_positions[currency] = pos['pos'].values.ravel()

        # Standardize
        if standardize:
            positions = pd.DataFrame(positions,
                                     columns=pair_list,
                                     index=np.arange(start, end + 1))
            c_positions = StandardScaler().fit_transform(
                pos['pos'].fillna(0).values.reshape(-1, 1))
            c_positions = pd.DataFrame(c_positions.ravel(),
                                       columns=[currency],
                                       index=np.arange(start, end + 1))
        else:
            positions.index = np.arange(start, end + 1)
            c_positions.index = np.arange(start, end + 1)

        # Plot parameters for only most recent values
        plt_end = cu.last_valid_index()
        plt_start = plt_end - interval

        # Plot most recent values
        positions = positions.loc[plt_start:plt_end]
        c_positions = c_positions.loc[plt_start:plt_end]
        positions.plot(figsize=(14, 6))
        plt.plot(positions.index.values, c_positions.values, color='black')
        plt.plot(positions.index.values,
                 np.zeros(positions.shape[0]),
                 color='grey')
        plt.plot(positions.index.values,
                 np.ones(positions.shape[0]) * 2,
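
The snippet above is truncated, but its core StandardScaler idiom is standardizing a single 1-D series, which requires reshaping the values to a one-column 2-D array before scaling and flattening afterwards. A minimal self-contained sketch with a hypothetical position series:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# hypothetical position series with gaps
pos = pd.Series([1.0, np.nan, 2.5, 3.0, np.nan, 1.5])

# fill gaps, reshape to (n_samples, 1) as StandardScaler expects,
# then flatten back to a 1-D Series
scaled = StandardScaler().fit_transform(pos.fillna(0).values.reshape(-1, 1))
scaled = pd.Series(scaled.ravel(), index=pos.index)
print(scaled)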