import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


def factor_rets(stocks_ret, num_comps):
    # Standardization, the correlation matrix, and the eigendecomposition all
    # happen inside the function because they must be recomputed for each
    # rolling 252-day window.
    stocks_ret_std = StandardScaler().fit_transform(stocks_ret)
    stocks_ret_std = pd.DataFrame(stocks_ret_std, index=stocks_ret.index)

    # The covariance matrix of standardized returns and the correlation matrix
    # of raw returns have the same eigendecomposition, so use the correlation
    # matrix directly:
    # cov_mat = np.cov(stocks_ret_std.T)
    # With ~470 names and only 252 observations the matrix is rank-deficient;
    # either use more granular data or extend the window (two years, 504 days,
    # for now). np.linalg.eigh (rather than eig) guarantees real eigenvalues
    # for a symmetric matrix, which avoids the spurious complex values.
    cov_mat = np.corrcoef(stocks_ret.T)
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Pair each eigenvalue with its eigenvector and sort from high to low.
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # Per-stock return volatility over the window.
    sigma_bar = stocks_ret.apply(np.std, axis=0)

    # SVD has sign ambiguity, but it is a good cross-check:
    # u, s, v = np.linalg.svd(stocks_ret_std.T)

    factor_returns = []
    for i in range(num_comps):
        V = eig_pairs[i][1]      # unit-norm eigenvector (numpy normalizes them)
        eig_val = eig_pairs[i][0]
        # Eigenportfolio weights: the eigenvector lives in standardized-return
        # space, so divide by each stock's volatility to map it back to raw
        # returns before multiplying.
        portfolio_weights = (1 / np.sqrt(eig_val)) * (V / sigma_bar)
        # portfolio_weights1 = V  # alternative: raw eigenvector on raw rets;
        # would standardizing afterwards give the same factor?
        F = np.matmul(stocks_ret, portfolio_weights)
        factor_returns.append(F)

    factor_returns = pd.DataFrame(factor_returns).T
    factor_returns.index = stocks_ret.index

    # Regression design: intercept plus factor returns, trailing 60 days.
    intercept = pd.Series(np.ones(len(factor_returns)),
                          index=factor_returns.index)
    X = pd.concat([intercept, factor_returns], axis=1)
    X = X.iloc[-60:]

    # X is fixed across stocks, so this helper only needs to take in y.
    def matrix_regression(y):
        # OLS via the normal equations: b = (X'X)^-1 X'y
        b = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), y)
        e = y - np.matmul(X, b)
        return list(b), list(e)

    betas, residuals = [], []
    for j in range(len(stocks_ret.columns)):
        b, e = matrix_regression(stocks_ret_std.iloc[-60:, j])
        betas.append(b)
        residuals.append(e)

    out_betas = pd.DataFrame(betas, index=stocks_ret.columns,
                             columns=['Intercept'] +
                                     ['PC{}'.format(i + 1)
                                      for i in range(num_comps)])
    out_resid = pd.DataFrame(residuals, index=stocks_ret.columns)

    def autoregression(y):
        # AR(1) fit by OLS: y_t = a + b * y_{t-1} + e_t
        y_tminus1 = y.shift(1)
        intercept = pd.Series(np.ones(len(y)))
        y_tminus1 = pd.concat([intercept, y_tminus1], axis=1)
        y_tminus1 = y_tminus1.iloc[1:]
        y = y.iloc[1:]
        b = np.matmul(np.matmul(np.linalg.inv(np.matmul(y_tminus1.T, y_tminus1)),
                                y_tminus1.T), y)
        e = y - np.matmul(y_tminus1, b)
        a = b[0]
        b = b[1:]
        return a, list(b), list(e)

    a, b, z = [], [], []
    for i in range(len(out_resid)):
        alpha, beta, zeta = autoregression(out_resid.iloc[i, :])
        a.append(alpha)
        b.append(beta)
        z.append(zeta)

    a = pd.DataFrame(a)
    b = pd.DataFrame(b)
    z = pd.DataFrame(z)
    a.index = b.index = z.index = out_betas.index

    # s-score: distance of each residual process from its AR(1) equilibrium,
    # in units of the equilibrium standard deviation:
    # s = -a * sqrt(1 - b^2) / ((1 - b) * sqrt(var(z)))
    z_var = z.apply(np.var, axis=1)
    part1 = 1 - b
    part2 = pd.DataFrame(np.sqrt(z_var))
    denom = part1 * part2
    numer = -a * np.sqrt(1 - b**2)
    s = numer / denom
    return s
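
# A minimal usage sketch (not from the original repo): it assumes `prices` is
# a pandas DataFrame of daily adjusted closes indexed by date, computes log
# returns, and scores the trailing 504-day window (two years, per the comment
# above) with three principal components. The thresholds at the bottom are
# illustrative only.
def score_latest_window(prices, window=504, num_comps=3):
    rets = np.log(prices / prices.shift(1)).dropna()  # daily log returns
    stocks_ret = rets.iloc[-window:]                  # trailing two-year window
    return factor_rets(stocks_ret, num_comps)         # s-score per stock

# s = score_latest_window(prices)
# longs, shorts = s[s < -1.25], s[s > 1.25]  # hypothetical entry thresholds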
def get_labelled(X):
    # Standardize the features and derive binary labels: a row is labelled
    # True if its index appears in the module-level `anomalous_centres`.
    X_std = StandardScaler().fit_transform(X)
    X_std = pd.DataFrame(X_std, columns=X.columns, index=X.index)
    y = X.index.to_series().apply(lambda x: x in anomalous_centres)
    return X_std, y
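
# A tiny illustrative call (all names below are made up for this example, not
# from the original code): `anomalous_centres` is the collection of index
# labels that get_labelled tests membership against.
import numpy as np
import pandas as pd

anomalous_centres = {"c17", "c42"}
features = pd.DataFrame(np.random.rand(5, 3), columns=["f1", "f2", "f3"],
                        index=["c1", "c17", "c23", "c42", "c99"])
X_std, y = get_labelled(features)  # y: True for "c17" and "c42", else False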
def loadRawData(datadir, puckid, num_nmf_factors=100, prep_for_benchmarking=False):
    """
    Load data for a particular puck, clean it up a bit and store as AnnData.
    For later use, also performs a NMF and stores those.
    Borrows code from autoNMFreg_windows.py, provided with the Slide-Seq raw data.
    """
    from sklearn.preprocessing import StandardScaler

    puckdir = "{0}/Puck_{1}".format(datadir, puckid)
    beadmapdir = max(glob.glob("{0}/BeadMapping_*-*_????".format(puckdir)),
                     key=os.path.getctime)
    schema_debug("Flag 314.001 ", beadmapdir)

    # gene exp
    gexp_file = "{0}/MappedDGEForR.csv".format(beadmapdir)
    dge = fast_csv_read(gexp_file, header=0, index_col=0)
    # for faster testing runs, use below; it has just the first 500 cols of the gexp_file
    ## dge = fast_csv_read("/tmp/a1_dge.csv", header=0, index_col=0)
    dge = dge.T
    dge = dge.reset_index()
    dge = dge.rename(columns={'index': 'barcode'})
    schema_debug("Flag 314.010 ", dge.shape, dge.columns)

    # spatial location
    beadloc_file = "{0}/BeadLocationsForR.csv".format(beadmapdir)
    coords = fast_csv_read(beadloc_file, header=0)
    coords = coords.rename(columns={'Barcodes': 'barcode'})
    coords = coords.rename(columns={'barcodes': 'barcode'})
    schema_debug("Flag 314.020 ", coords.shape, coords.columns)

    # Slide-Seq cluster assignments
    atlas_clusters_file = "{0}/AnalogizerClusterAssignments.csv".format(beadmapdir)
    clstrs = pd.read_csv(atlas_clusters_file, index_col=None)
    assert list(clstrs.columns) == ["Var1", "x"]
    clstrs.columns = ["barcode", "atlas_cluster"]
    clstrs = clstrs.set_index("barcode")
    schema_debug("Flag 314.030 ", clstrs.shape, clstrs.columns)

    df_merged = dge.merge(coords, right_on='barcode', left_on='barcode')
    df_merged = df_merged[df_merged.barcode.isin(clstrs.index)]
    schema_debug("Flag 314.040 ", df_merged.shape, df_merged.columns)

    # remove sparse gene exp
    counts = df_merged.drop(['xcoord', 'ycoord'], axis=1)
    counts2 = counts.copy(deep=True)
    counts2 = counts2.set_index('barcode')  #.drop('barcode',axis=1)
    counts2_okcols = counts2.sum(axis=0) > 0
    counts2 = counts2.loc[:, counts2_okcols]

    UMI_threshold = 5
    counts2_umis = counts2.sum(axis=1).values
    counts2 = counts2.loc[counts2_umis > UMI_threshold, :]
    schema_debug("Flag 314.0552 ", counts.shape, counts2.shape,
                 counts2_umis.shape, isinstance(counts2, pd.DataFrame))

    # slide-seq authors normalize to have sum=1 across each bead, rather than 1e6
    cval = counts2_umis[counts2_umis > UMI_threshold]
    if not prep_for_benchmarking:
        counts2 = counts2.divide(cval, axis=0)  #np.true_divide(counts2, counts2_umis[:,None])
        # this is also a little unusual, but I'm following their practice
        counts2.iloc[:, :] = StandardScaler(with_mean=False).fit_transform(counts2.values)
    schema_debug("Flag 314.0553 ", counts2.shape, counts2_umis.shape,
                 isinstance(counts2, pd.DataFrame))

    coords2 = df_merged.loc[df_merged.barcode.isin(counts2.index),
                            ["barcode", "xcoord", "ycoord"]].copy(deep=True)
    coords2 = coords2.set_index('barcode')  #.drop('barcode', axis=1)
    schema_debug("Flag 314.0555 ", coords2.shape, isinstance(coords2, pd.DataFrame))

    ok_barcodes = set(coords2.index) & set(counts2.index) & set(clstrs.index)
    schema_debug("Flag 314.060 ", coords2.shape, counts2.shape, clstrs.shape,
                 len(ok_barcodes))

    if prep_for_benchmarking:
        return (counts2[counts2.index.isin(ok_barcodes)].sort_index(),
                coords2[coords2.index.isin(ok_barcodes)].sort_index(),
                clstrs[clstrs.index.isin(ok_barcodes)].sort_index())

    ## do NMF
    K1 = num_nmf_factors
    listK1 = ["P{}".format(i + 1) for i in range(K1)]
    random_state = 17  # for repeatability, a fixed value
    model1 = sklearn.decomposition.NMF(n_components=K1, init='random',
                                       random_state=random_state,
                                       alpha=0, l1_ratio=0)
    # yes, slideseq code had Ho and Wo mixed up. Just following their lead here.
    Ho = model1.fit_transform(counts2.values)
    Wo = model1.components_
    schema_debug("Flag 314.070 ", Ho.shape, Wo.shape)

    Ho_norm = StandardScaler(with_mean=False).fit_transform(Ho)
    Ho_norm = pd.DataFrame(Ho_norm, index=counts2.index, columns=listK1)
    Wo = pd.DataFrame(Wo, index=listK1, columns=list(counts2.columns))
    Wo.index.name = "Factor"

    Ho_norm = Ho_norm[Ho_norm.index.isin(ok_barcodes)]
    Ho_norm = Ho_norm / Ho_norm.std(axis=0)
    schema_debug("Flag 314.080 ", Ho_norm.shape, Wo.shape)

    genexp = counts2[counts2.index.isin(ok_barcodes)].sort_index()
    beadloc = coords2[coords2.index.isin(ok_barcodes)].sort_index()
    clstrs = clstrs[clstrs.index.isin(ok_barcodes)].sort_index()
    Ho_norm = Ho_norm.sort_index()
    schema_debug("Flag 314.090 ", genexp.shape, beadloc.shape, clstrs.shape,
                 Ho_norm.shape, genexp.index[:5], beadloc.index[:5])

    beadloc["atlas_cluster"] = clstrs["atlas_cluster"]

    if "AnnData" not in dir():
        from anndata import AnnData
    adata = AnnData(X=genexp.values, obs=beadloc,
                    uns={"Ho": Ho_norm,
                         "Ho.index": list(Ho_norm.index),
                         "Ho.columns": list(Ho_norm.columns),
                         "Wo": Wo,
                         "Wo.index": list(Wo.index),
                         "Wo.columns": list(Wo.columns)})
    return adata
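
# A hedged usage sketch: the data directory and puck id below are placeholders,
# not paths from the source. It loads one puck into an AnnData object and pulls
# out the NMF factor matrices that loadRawData stashes in .uns.
adata = loadRawData("/path/to/slideseq_data", "180430_1", num_nmf_factors=100)
Ho = adata.uns["Ho"]  # beads x factors (standardized factor scores)
Wo = adata.uns["Wo"]  # factors x genes (loadings)
spatial = adata.obs[["xcoord", "ycoord", "atlas_cluster"]]  # per-bead metadata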
            window)
        c_positions[currency] = pos['pos'].values.ravel()

    # Standardize
    if standardize:
        positions = pd.DataFrame(positions, columns=pair_list,
                                 index=np.arange(start, end + 1))
        # standardizes the single-currency series left over from the loop above
        c_positions = StandardScaler().fit_transform(
            pos['pos'].fillna(0).values.reshape(-1, 1))
        c_positions = pd.DataFrame(c_positions.ravel(), columns=[currency],
                                   index=np.arange(start, end + 1))
    else:
        positions.index = np.arange(start, end + 1)
        c_positions.index = np.arange(start, end + 1)

    # Plot parameters for only the most recent values
    plt_end = cu.last_valid_index()
    plt_start = plt_end - interval

    # Plot most recent values
    positions = positions.loc[plt_start:plt_end]
    c_positions = c_positions.loc[plt_start:plt_end]
    positions.plot(figsize=(14, 6))
    plt.plot(positions.index.values, c_positions.values, color='black')
    plt.plot(positions.index.values, np.zeros(positions.shape[0]), color='grey')
    plt.plot(positions.index.values, np.ones(positions.shape[0]) * 2,
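
# A self-contained sketch of the scaling pattern used above (toy data, not
# from the source): StandardScaler expects a 2-D array, hence the
# reshape(-1, 1) before fitting and the ravel() when going back to a Series.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

raw = pd.Series([0.5, np.nan, 1.5, 2.0, np.nan, -1.0], name="pos")
scaled = StandardScaler().fit_transform(raw.fillna(0).values.reshape(-1, 1))
scaled = pd.Series(scaled.ravel(), index=raw.index, name="pos_std")
print(scaled.round(2))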