Example #1
import numpy as np
import pandas as p  # this example aliases pandas as `p`
from sklearn.preprocessing import StandardScaler


def factor_rets(stocks_ret, num_comps):
    # standardization, covariance, and eigen-decomposition happen inside the
    # function because they are recomputed for each trailing 252-day window
    stocks_ret_std = StandardScaler().fit_transform(stocks_ret)
    stocks_ret_std = p.DataFrame(stocks_ret_std, index=stocks_ret.index)

    # The correlation matrix of raw returns gives the same eigenvectors as
    # the covariance matrix of standardized returns (commented out below).
    # Both produced complex eigenvalues on a 252-day x 470-stock window,
    # where the matrix is rank-deficient; extending the window to two years
    # (504 days) was the original workaround. np.linalg.eigh, which is meant
    # for symmetric matrices and guarantees real eigenvalues and unit-norm
    # eigenvectors, avoids the complex values altogether.
    # cov_mat = np.cov(stocks_ret_std.T)
    # eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    cov_mat = np.corrcoef(stocks_ret.T)
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # per-stock return volatility, used to de-standardize the eigenvector weights
    sigma_bar = stocks_ret.apply(np.std, axis=0)
    # SVD is a good cross-check, modulo sign ambiguity:
    # u, s, v = np.linalg.svd(stocks_ret_std.T)
    factor_returns = []
    for i in range(num_comps):
        V = eig_pairs[i][1]  # unit-norm eigenvector (numpy normalizes automatically)
        eig_val = eig_pairs[i][0]
        # The eigenvector comes from standardized returns, so divide by each
        # stock's volatility to get weights that apply to raw returns, and
        # scale by 1/sqrt(eigenvalue) to normalize the factor's variance.
        portfolio_weights = (1 / np.sqrt(eig_val)) * (V / sigma_bar)
        # factor return: trailing-window raw returns projected onto the weights
        F = np.matmul(stocks_ret, portfolio_weights)
        factor_returns.append(F)

    factor_returns = p.DataFrame(factor_returns).T
    factor_returns.index = stocks_ret.index

    intercept = p.Series(np.ones(len(factor_returns)),
                         index=factor_returns.index)
    X = p.concat([intercept, factor_returns], axis=1)
    X = X[-60:]  # regress on the most recent 60 days only

    # X is identical for every stock, so the closure only needs y
    def matrix_regression(y):
        # OLS via the normal equations: b = (X'X)^-1 X'y
        b = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), y)
        e = y - np.matmul(X, b)
        return list(b), list(e)

    betas = []
    residuals = []

    for j in range(len(stocks_ret.columns)):
        b, e = matrix_regression(stocks_ret_std.iloc[-60:, j])
        betas.append(b)
        residuals.append(e)

    out_betas = p.DataFrame(betas, index=stocks_ret.columns)
    out_betas.columns = ['Intercept'] + ['PC{}'.format(i + 1)
                                         for i in range(num_comps)]
    out_resid = p.DataFrame(residuals, index=stocks_ret.columns)

    def autoregression(y):
        # AR(1) fit by OLS: y_t = a + b * y_{t-1} + e_t
        y_tminus1 = y.shift(1)
        intercept = p.Series(np.ones(len(y)), index=y.index)
        y_tminus1 = p.concat([intercept, y_tminus1], axis=1)
        y_tminus1 = y_tminus1[1:len(y_tminus1)]
        y = y[1:len(y)]
        b = np.matmul(
            np.matmul(np.linalg.inv(np.matmul(y_tminus1.T, y_tminus1)),
                      y_tminus1.T), y)
        e = y - np.matmul(y_tminus1, b)
        a = b[0]
        b = b[1:len(b)]
        return a, list(b), list(e)

    a = []
    b = []
    z = []

    for i in range(len(out_resid)):
        alpha, beta, zeta = autoregression(out_resid.iloc[i, :])
        a.append(alpha)
        b.append(beta)
        z.append(zeta)

    a = p.DataFrame(a)
    b = p.DataFrame(b)
    z = p.DataFrame(z)
    a.index = b.index = z.index = out_betas.index

    z_var = z.apply(np.var, axis=1)

    # s-score of each stock's AR(1) residual process:
    #   s = -a * sqrt(1 - b^2) / ((1 - b) * sqrt(var(z)))
    numer = -a * np.sqrt(1 - b**2)
    denom = (1 - b) * p.DataFrame(np.sqrt(z_var))
    s = numer / denom

    return s
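
A minimal usage sketch for the function above, assuming a wide DataFrame of daily returns (dates as the index, one column per ticker); the tickers and dates here are hypothetical stand-ins:

import numpy as np
import pandas as p

# hypothetical returns panel: 252 trading days x 5 tickers
dates = p.bdate_range("2020-01-01", periods=252)
tickers = ["AAA", "BBB", "CCC", "DDD", "EEE"]
rets = p.DataFrame(np.random.randn(252, 5) * 0.01,
                   index=dates, columns=tickers)

# one s-score per ticker from a 3-factor eigenportfolio model
s_scores = factor_rets(rets, num_comps=3)
print(s_scores)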
Example #2
import pandas as pd
from sklearn.preprocessing import StandardScaler


def get_labelled(X):
    # standardize features, keeping the original index and columns
    X_std = StandardScaler().fit_transform(X)
    X_std = pd.DataFrame(X_std, columns=X.columns, index=X.index)
    # anomalous_centres is assumed to exist in the enclosing scope
    y = X.index.to_series().apply(lambda x: x in anomalous_centres)
    return X_std, y
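
A minimal sketch of how the function above might be called; the feature frame and the anomalous_centres set are hypothetical stand-ins for whatever the enclosing module defines:

import numpy as np
import pandas as pd

# hypothetical features indexed by centre id
X = pd.DataFrame(np.random.rand(6, 3),
                 columns=["f1", "f2", "f3"],
                 index=["c1", "c2", "c3", "c4", "c5", "c6"])
anomalous_centres = {"c2", "c5"}  # hypothetical stand-in for the real collection

X_std, y = get_labelled(X)
print(y)  # boolean Series: True for c2 and c5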
Example #3
    def loadRawData(datadir,
                    puckid,
                    num_nmf_factors=100,
                    prep_for_benchmarking=False):
        """
Load data for a particular puck, clean it up a bit and store as AnnData. For later use, also performs a NMF and stores those.
Borrows code from autoNMFreg_windows.py, provided with the Slide-Seq raw data.
        """
        from sklearn.preprocessing import StandardScaler

        puckdir = "{0}/Puck_{1}".format(datadir, puckid)
        beadmapdir = max(glob.glob("{0}/BeadMapping_*-*_????".format(puckdir)),
                         key=os.path.getctime)
        schema_debug("Flag 314.001 ", beadmapdir)

        # gene exp
        gexp_file = "{0}/MappedDGEForR.csv".format(beadmapdir)
        dge = fast_csv_read(gexp_file, header=0, index_col=0)
        # for faster testing runs, use the line below; it holds just the
        # first 500 columns of the gexp_file
        ## dge = fast_csv_read("/tmp/a1_dge.csv", header = 0, index_col = 0)
        dge = dge.T
        dge = dge.reset_index()
        dge = dge.rename(columns={'index': 'barcode'})
        schema_debug("Flag 314.010 ", dge.shape, dge.columns)

        # spatial location
        beadloc_file = "{0}/BeadLocationsForR.csv".format(beadmapdir)
        coords = fast_csv_read(beadloc_file, header=0)
        coords = coords.rename(columns={'Barcodes': 'barcode'})
        coords = coords.rename(columns={'barcodes': 'barcode'})
        schema_debug("Flag 314.020 ", coords.shape, coords.columns)

        # Slide-Seq cluster assignments
        atlas_clusters_file = "{0}/AnalogizerClusterAssignments.csv".format(
            beadmapdir)
        clstrs = pd.read_csv(atlas_clusters_file, index_col=None)
        assert list(clstrs.columns) == ["Var1", "x"]
        clstrs.columns = ["barcode", "atlas_cluster"]
        clstrs = clstrs.set_index("barcode")
        schema_debug("Flag 314.030 ", clstrs.shape, clstrs.columns)

        df_merged = dge.merge(coords, right_on='barcode', left_on='barcode')
        df_merged = df_merged[df_merged.barcode.isin(clstrs.index)]
        schema_debug("Flag 314.040 ", df_merged.shape, df_merged.columns)

        # remove sparse gene exp
        counts = df_merged.drop(['xcoord', 'ycoord'], axis=1)
        counts2 = counts.copy(deep=True)
        counts2 = counts2.set_index('barcode')  #.drop('barcode',axis=1)
        counts2_okcols = counts2.sum(axis=0) > 0
        counts2 = counts2.loc[:, counts2_okcols]
        UMI_threshold = 5
        counts2_umis = counts2.sum(axis=1).values
        counts2 = counts2.loc[counts2_umis > UMI_threshold, :]
        schema_debug("Flag 314.0552 ", counts.shape, counts2.shape,
                     counts2_umis.shape, isinstance(counts2, pd.DataFrame))

        # the Slide-Seq authors normalize each bead's counts to sum to 1, rather than to 1e6
        cval = counts2_umis[counts2_umis > UMI_threshold]
        if not prep_for_benchmarking:
            counts2 = counts2.divide(cval, axis=0)
            # equivalently: counts2 = np.true_divide(counts2, counts2_umis[:, None])

            # this is also a little unusual, but I'm following their practice
            counts2.iloc[:, :] = StandardScaler(with_mean=False).fit_transform(
                counts2.values)
            schema_debug("Flag 314.0553 ", counts2.shape, counts2_umis.shape,
                         isinstance(counts2, pd.DataFrame))

        coords2 = df_merged.loc[df_merged.barcode.isin(counts2.index),
                                ["barcode", "xcoord", "ycoord"]].copy(
                                    deep=True)
        coords2 = coords2.set_index('barcode')  #.drop('barcode', axis=1)
        schema_debug("Flag 314.0555 ", coords2.shape,
                     isinstance(coords2, pd.DataFrame))

        ok_barcodes = set(coords2.index) & set(counts2.index) & set(
            clstrs.index)
        schema_debug("Flag 314.060 ", coords2.shape, counts2.shape,
                     clstrs.shape, len(ok_barcodes))

        if prep_for_benchmarking:
            return (counts2[counts2.index.isin(ok_barcodes)].sort_index(),
                    coords2[coords2.index.isin(ok_barcodes)].sort_index(),
                    clstrs[clstrs.index.isin(ok_barcodes)].sort_index())

        ## do NMF
        K1 = num_nmf_factors
        listK1 = ["P{}".format(i + 1) for i in range(K1)]
        random_state = 17  # fixed seed for repeatability
        # note: the `alpha` argument was deprecated and later removed in
        # scikit-learn (1.2+ uses alpha_W / alpha_H); this call targets the
        # older API the original project used
        model1 = sklearn.decomposition.NMF(n_components=K1,
                                           init='random',
                                           random_state=random_state,
                                           alpha=0,
                                           l1_ratio=0)
        # yes, the Slide-Seq code had Ho and Wo mixed up; just following their lead here
        Ho = model1.fit_transform(counts2.values)
        Wo = model1.components_

        schema_debug("Flag 314.070 ", Ho.shape, Wo.shape)

        Ho_norm = StandardScaler(with_mean=False).fit_transform(Ho)
        Ho_norm = pd.DataFrame(Ho_norm)
        Ho_norm.index = counts2.index
        Ho_norm.columns = listK1
        Wo = pd.DataFrame(Wo)
        Wo.index = listK1
        Wo.index.name = "Factor"
        Wo.columns = list(counts2.columns)

        Ho_norm = Ho_norm[Ho_norm.index.isin(ok_barcodes)]
        Ho_norm = Ho_norm / Ho_norm.std(axis=0)

        schema_debug("Flag 314.080 ", Ho_norm.shape, Wo.shape)

        genexp = counts2[counts2.index.isin(ok_barcodes)].sort_index()
        beadloc = coords2[coords2.index.isin(ok_barcodes)].sort_index()
        clstrs = clstrs[clstrs.index.isin(ok_barcodes)].sort_index()
        Ho_norm = Ho_norm.sort_index()

        schema_debug("Flag 314.090 ", genexp.shape, beadloc.shape,
                     clstrs.shape, Ho_norm.shape, genexp.index[:5],
                     beadloc.index[:5])

        beadloc["atlas_cluster"] = clstrs["atlas_cluster"]

        if "AnnData" not in dir():
            from anndata import AnnData

        adata = AnnData(X=genexp.values,
                        obs=beadloc,
                        uns={
                            "Ho": Ho_norm,
                            "Ho.index": list(Ho_norm.index),
                            "Ho.columns": list(Ho_norm.columns),
                            "Wo": Wo,
                            "Wo.index": list(Wo.index),
                            "Wo.columns": list(Wo.columns)
                        })
        return adata
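
A hedged sketch of a call to loadRawData, assuming the function is reachable at module level, that the surrounding module provides fast_csv_read and schema_debug, and that datadir points at the Slide-Seq layout the function expects (datadir/Puck_<id>/BeadMapping_*); the path and puck id below are hypothetical:

# hypothetical paths; requires the Slide-Seq BeadMapping outputs on disk
datadir = "/data/slideseq"
puckid = "180430_1"

adata = loadRawData(datadir, puckid, num_nmf_factors=100)
print(adata.shape)            # beads x genes
print(adata.uns["Ho"].shape)  # beads x NMF factors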
Example #4
                                           window)
            c_positions[currency] = pos['pos'].values.ravel()

        # Standardize
        if standardize:
            positions = pd.DataFrame(positions,
                                     columns=pair_list,
                                     index=np.arange(start, end + 1))
            c_positions = StandardScaler().fit_transform(
                pos['pos'].fillna(0).values.reshape(-1, 1))
            c_positions = pd.DataFrame(c_positions.ravel(),
                                       columns=[currency],
                                       index=np.arange(start, end + 1))
        else:
            positions.index = np.arange(start, end + 1)
            c_positions.index = np.arange(start, end + 1)

        # Plot parameters for only most recent values
        plt_end = cu.last_valid_index()
        plt_start = plt_end - interval

        # Plot most recent values
        positions = positions.loc[plt_start:plt_end]
        c_positions = c_positions.loc[plt_start:plt_end]
        positions.plot(figsize=(14, 6))
        plt.plot(positions.index.values, c_positions.values, color='black')
        plt.plot(positions.index.values,
                 np.zeros(positions.shape[0]),
                 color='grey')
        plt.plot(positions.index.values,
                 np.ones(positions.shape[0]) * 2,
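
The snippet above is truncated, but its core StandardScaler idiom is standardizing a single 1-D series, which requires reshaping the values to a one-column 2-D array before scaling and flattening afterwards. A minimal self-contained sketch with a hypothetical position series:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# hypothetical position series with gaps
pos = pd.Series([1.0, np.nan, 2.5, 3.0, np.nan, 1.5])

# fill gaps, reshape to (n_samples, 1) as StandardScaler expects,
# then flatten back to a 1-D Series
scaled = StandardScaler().fit_transform(pos.fillna(0).values.reshape(-1, 1))
scaled = pd.Series(scaled.ravel(), index=pos.index)
print(scaled)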