def _spearman_r(a, b, weights, axis, skipna):
    """ndarray implementation of scipy.stats.spearmanr.

    Parameters
    ----------
    a : ndarray
        Input array.
    b : ndarray
        Input array.
    weights : ndarray
        Input array of weights.
    axis : int
        The axis to apply the correlation along.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    res : ndarray
        Spearman's correlation coefficient.

    See Also
    --------
    scipy.stats.spearmanr
    """
    if skipna:
        a, b, weights = _match_nans(a, b, weights)
    _a = rankdata(a, axis=axis)
    _b = rankdata(b, axis=axis)
    return _pearson_r(_a, _b, weights, axis, skipna)
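# A minimal sanity-check sketch, assuming `rankdata` is bottleneck's (as in the
# surrounding snippets) and that `_match_nans` / `_pearson_r` come from the same
# module; with uniform weights the weighted statistic should reduce to the
# plain Spearman coefficient.
import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
a = rng.normal(size=100)
b = a + rng.normal(scale=0.5, size=100)
w = np.ones_like(a)  # uniform weights

r = _spearman_r(a, b, w, axis=0, skipna=False)
print(r, scipy.stats.spearmanr(a, b).correlation)  # should agree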
def spearmanr(array2d):
    """Spearman correlation coefficient on the columns of a matrix with
    missing values and ties.

    Should give the same result as cor(x, method="spearman", use="pairwise") in R.
    """
    ra = np.ma.masked_invalid(bn.rankdata(array2d, axis=0))
    ncols = ra.shape[1]
    cor_mat = np.empty((ncols, ncols), dtype=float)
    for j in range(ncols):
        x = ra[:, j]
        xm = ra.mask[:, j]
        for k in range(ncols):
            if j == k:
                cor_mat[j, k] = 1
                continue
            if k > j:
                # fill the upper triangle from the symmetric entry below
                continue
            y = ra[:, k]
            ym = ra.mask[:, k]
            both_valid = np.logical_not(np.logical_or(xm, ym))
            r = np.corrcoef(x[both_valid], y[both_valid])[1, 0]
            cor_mat[j, k] = r
            cor_mat[k, j] = r
    return cor_mat
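# Quick check on complete data (no missing values), where the pairwise logic
# above reduces to ordinary column-wise Spearman correlation.
import numpy as np
import bottleneck as bn
from scipy import stats

X = np.random.default_rng(0).normal(size=(30, 4))
assert np.allclose(spearmanr(X), stats.spearmanr(X).correlation)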
def normalize_cells(X, ranked=True):
    """Scale the matrix so that all cells (rows) are mean-centered and have
    an l2-norm of 1.

    Arguments:
        X {array} -- Cell x gene matrix (sparse or dense)

    Keyword Arguments:
        ranked {bool} -- Whether to rank cells first (default: {True})

    Returns:
        np.ndarray -- Cells x genes matrix of normalized cells
    """
    if sparse.issparse(X):
        res = X.toarray()
    else:
        res = X
    if ranked:
        res = bottleneck.rankdata(res, axis=1)
    avg = np.mean(res, axis=1)
    res -= avg[:, None]
    norm = np.sqrt(bottleneck.nansum(res**2, axis=1))[:, None]
    res /= norm
    return res
def combine_fcast_and_mcli(fcast, mcli):
    big_ds = xr.concat([
        mcli['Pressure'].drop('timestr'),
        fcast['Pressure'].expand_dims('time')
    ], dim='time')
    # rank along 'time' and convert to a percentile in (0, 1]
    percentile = bottleneck.rankdata(big_ds, axis=0) / len(big_ds['time'])
    return percentile
def roc_auc_score_bottleneck(actual, predicted, approx=False):
    if approx:
        # ordinal ranks via double argsort; faster but ignores ties
        # (the original used a single argsort, which gives sort indices,
        # not ranks)
        r = np.argsort(np.argsort(predicted)) + 1
    else:
        r = rankdata(predicted)
    n_pos = np.sum(actual)
    n_neg = len(actual) - n_pos
    sum1 = np.sum(r[actual == 1]) - n_pos * (n_pos + 1) / 2
    print(f"bottleneck nPos {n_pos} nNeg {n_neg}, sum {sum1}")
    return sum1 / (n_pos * n_neg)
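# The function above exploits the Mann-Whitney rank-sum identity
# AUC = (R+ - n+(n+ + 1)/2) / (n+ * n-), where R+ is the sum of the ranks of
# the positives. A quick check against scikit-learn, assuming `rankdata` is
# bottleneck's:
import numpy as np
from bottleneck import rankdata
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(42)
actual = rng.integers(0, 2, size=1000)
predicted = actual + rng.normal(scale=2.0, size=1000)

assert np.isclose(roc_auc_score_bottleneck(actual, predicted),
                  roc_auc_score(actual, predicted))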
def rank(data, nan_val):
    """Rank-normalize data to [0, 1].

    Standardizes in place; does not return a value.

    Arguments:
        data {np.array} -- Array of data
        nan_val -- Value assigned to non-finite entries
    """
    finite = np.isfinite(data)
    ranks = bottleneck.rankdata(data[finite]).astype(data.dtype)
    ranks -= 1
    top = np.max(ranks)
    ranks /= top
    data[...] = nan_val
    data[np.where(finite)] = ranks
    del ranks, finite
    gc.collect()
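# A small usage sketch: finite entries are mapped onto an even grid in [0, 1]
# (smallest -> 0, largest -> 1) and non-finite entries become `nan_val`.
import gc
import numpy as np
import bottleneck

x = np.array([3.0, np.nan, 1.0, 2.0, np.inf])
rank(x, nan_val=0.0)
print(x)  # [1.0, 0.0, 0.0, 0.5, 0.0]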
def lilliefors_Dcrit_gev(n_obs, significance_levels, shape, n_sample=10000):
    """Estimate the critical values of the KS test using statistical simulation.

    See also:
    Wilks, D. S. (2011). Frequentist Statistical Inference.
    International Geophysics, 100, 133–186.
    https://doi.org/10.1016/B978-0-12-385022-5.00005-1
    """
    q_levels = [1 - i for i in significance_levels]
    D_list = []
    for i in range(n_sample):
        ams_sim = scipy.stats.genextreme.rvs(c=shape, size=n_obs)
        rank = bottleneck.rankdata(ams_sim)
        ecdf = rank / n_obs
        # keep the fitted shape in a separate name so the true `shape` is
        # reused on every iteration (the original rebound `shape` here)
        loc, scale, shape_fit = ev_fit.gev_pwm(ams_sim, ecdf, n_obs,
                                               ax_year=0,
                                               shape=np.full((1), shape))
        cdf = ev_fit.gev_cdf(ams_sim, loc, scale, shape_fit)
        D = np.abs(ecdf - cdf).max()
        D_list.append(D)
    return np.quantile(D_list, q_levels)
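# Hypothetical usage sketch (`ev_fit` is the project-specific module used
# above): critical D values at the 5% and 1% levels for samples of 30
# observations from a GEV with shape 0.1.
Dcrit_05, Dcrit_01 = lilliefors_Dcrit_gev(30, [0.05, 0.01], shape=0.1,
                                          n_sample=1000)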
def create_nw_spearman(data):
    """Create a co-expression network.

    Computes the co-expression network using Spearman correlation,
    then rank-normalizes the network.

    Arguments:
        data {array} -- Cells x Genes array of floats, either dense or sparse

    Returns:
        np.ndarray -- co-expression network (2-D dense array)
    """
    if sparse.issparse(data):
        data = data.toarray()
    data = bottleneck.rankdata(data, axis=0)
    nw = np.corrcoef(data, rowvar=False)
    np.fill_diagonal(nw, 1)
    rank(nw, nan_val=0)
    np.fill_diagonal(nw, 1)
    return nw
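# Minimal usage sketch on dense random data. Pearson correlation of column
# ranks is exactly Spearman correlation, which is what the function exploits;
# `rank(nw, nan_val=0)` reuses the in-place rank-normalization helper defined
# earlier, so the result lies in [0, 1] with a unit diagonal.
import numpy as np

expr = np.random.default_rng(1).normal(size=(50, 10))  # 50 cells x 10 genes
nw = create_nw_spearman(expr)  # 10 x 10 rank-normalized network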
def compute_aurocs(votes, positives=None, compute_p=False):
    """Compute AUROCs based on neighbor voting and candidate identities.

    Arguments:
        votes {pd.DataFrame} -- DataFrame with votes for cell types

    Keyword Arguments:
        positives {Vector} -- Vector of assignments for positive values.
            If left empty, cells are assumed to be the row names of the
            votes matrix (default: {None})
        compute_p {bool} -- Whether to compute the p-value for the
            AUROC (default: {False})

    Returns:
        pd.DataFrame -- DataFrame of testing cell types x training cell types
            (plus a DataFrame of p-values when compute_p is True)
    """
    res_col = votes.columns
    if positives is None:
        positives = design_matrix(votes.index)
    res_idx = positives.columns
    positives = positives.values
    n_pos = bottleneck.nansum(positives, axis=0)
    n_neg = positives.shape[0] - n_pos
    sum_pos_ranks = positives.T @ bottleneck.rankdata(votes.values, axis=0)
    roc = sum_pos_ranks / n_pos[:, None]
    roc -= (n_pos[:, None] + 1) / 2
    roc /= n_neg[:, None]
    if compute_p:
        n_pos = n_pos[:, None]
        n_neg = n_neg[:, None]
        U = roc * n_pos * n_neg
        Z = np.abs(U - (n_pos * n_neg / 2)) / np.sqrt(
            n_pos * n_neg * (n_pos + n_neg + 1) / 12)
        p = stats.norm.sf(Z)
        p = pd.DataFrame(p, index=res_idx, columns=res_col)
        return pd.DataFrame(roc, index=res_idx, columns=res_col), p
    return pd.DataFrame(roc, index=res_idx, columns=res_col)
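# This is the matrix form of the same rank-sum AUC identity used earlier: the
# matmul `positives.T @ rankdata(votes)` sums positive ranks for all label /
# column pairs at once. A sanity check against scikit-learn, using
# pd.get_dummies as a stand-in for the module's `design_matrix`:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
labels = pd.Index(["A", "B"] * 50)
votes = pd.DataFrame(rng.random((100, 3)), index=labels)
positives = pd.get_dummies(labels).astype(float)  # assumed one-hot encoding

aurocs = compute_aurocs(votes, positives=positives)
ref = roc_auc_score((labels == "A").astype(int), votes.values[:, 0])
assert np.isclose(aurocs.loc["A", 0], ref)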
def spearman_correlation(x, y):
    """Spearman's rank correlation coefficient.

    From http://xarray.pydata.org/en/stable/dask.html#automatic-parallelization.
    """
    x_ranks = bn.rankdata(x, axis=-1)
    y_ranks = bn.rankdata(y, axis=-1)
    return pearson_correlation(x_ranks, y_ranks)
def run_cm_mixomics_test(sample, feature, random_states, root):
    """Compare corrmapper with mixomics and marginal correlation networks."""
    import os
    import numpy as np
    import copy
    import pandas as pd
    import scipy as sp
    import bottleneck as bn
    from sklearn.datasets import make_sparse_spd_matrix
    from sklearn.preprocessing import StandardScaler
    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr
    from cmTest import test_utils as tu
    from cmTest import hugeR

    for random_state in random_states:
        # ------------------------------------------------------------------
        # SIMULATE DATASET
        # ------------------------------------------------------------------
        informative = int(feature * .05)
        n_features = feature
        n_informative = informative
        n_redundant = 0
        n_relevant = n_informative + n_redundant
        s = sample
        f = feature
        i = informative
        i_half = n_informative // 2
        r = n_redundant

        prec_real = make_sparse_spd_matrix(n_informative, smallest_coef=.4,
                                           alpha=.98, largest_coef=.8,
                                           random_state=random_state)
        cov_real = sp.linalg.inv(prec_real)
        d = np.sqrt(np.diag(cov_real))
        # divide through cols
        cov_real /= d
        # divide through rows
        cov_real /= d[:, np.newaxis]
        prec_real *= d
        prec_real *= d[:, np.newaxis]
        covs = [cov_real, cov_real]
        X, y = tu.make_classification(n_samples=s, n_features=f,
                                      n_clusters_per_class=1, n_informative=i,
                                      n_redundant=r, shuffle=False,
                                      class_sep=.25, cov=covs,
                                      random_state=random_state)

        # ------------------------------------------------------------------
        # SPLIT DATASET INTO TWO
        # ------------------------------------------------------------------
        # split relevant features equally and randomly
        X1_rel_feats = set(np.random.choice(n_relevant, n_relevant // 2,
                                            replace=False))
        X2_rel_feats = set(range(n_relevant)) - X1_rel_feats
        # now split all other features randomly and equally
        other_n = n_features - n_relevant
        X1_other_feats = set(np.random.choice(range(n_relevant, n_features),
                                              other_n // 2, replace=False))
        X2_other_feats = set(range(n_relevant, n_features)) - X1_other_feats
        # merge relevant and irrelevant features
        X1_feats = np.array(sorted(X1_other_feats.union(X1_rel_feats)))
        X2_feats = np.array(sorted(X2_other_feats.union(X2_rel_feats)))
        # check we have each feature only once
        assert sorted(set(X1_feats).union(X2_feats)) == list(range(n_features))
        # define X1 and X2
        X1 = X[:, X1_feats]
        X2 = X[:, X2_feats]
        datasets_original = [X1, X2]
        # these two lists will keep track of the features we are left with from
        # the original data matrix. feature nums < n_informative are informative.
        X12_feats_original = [X1_feats, X2_feats]

        # ------------------------------------------------------------------
        # START PIPELINE
        # ------------------------------------------------------------------
        file_id = "samp_%d_feat_%d_rand_%d" % (s, f, random_state)
        results_folder = root + "cmTest/results2/"
        # this ensures that we only run analyses that aren't already finished
        result_file = results_folder + file_id + '.txt'
        file_exist = os.path.isfile(result_file)
        fs_methods = ["fdr", "l1svc", "boruta", "jmi"]
        ss = StandardScaler()

        if not file_exist or (file_exist and os.stat(result_file).st_size == 0):
            o = open(result_file, 'w')

            # --------------------------------------------------------------
            # CORRMAPPER
            o.write('Method, Prec, Recall\n')
            for fs_method in fs_methods:
                no_feat = False
                X12_feats = [copy.deepcopy(X12_feats_original[0]),
                             copy.deepcopy(X12_feats_original[1])]
                datasets = [copy.deepcopy(datasets_original[0]),
                            copy.deepcopy(datasets_original[1])]
                for i, dataset in enumerate(datasets):
                    # variance filtering: keep the 2n highest-variance features
                    dataset = pd.DataFrame(dataset)
                    two_n = int(2 * dataset.shape[0])
                    top_var_ix = np.array(sorted(np.argsort(dataset.var())[-two_n:]))
                    dataset = dataset[top_var_ix].values
                    # update features of the datasets
                    X12_feats[i] = X12_feats[i][top_var_ix]

                    # FS
                    try:
                        sel = tu.do_fs(dataset, y, fs_method)
                        if len(sel) == 0:
                            no_feat = True
                        else:
                            # update features of the datasets
                            X12_feats[i] = X12_feats[i][sel]
                            datasets[i] = dataset[:, sel]
                    except Exception:
                        no_feat = True

                # default to nan so a failed FS round cannot reuse stale values
                p, r = np.nan, np.nan
                if not no_feat:
                    # concatenate datasets
                    dataset1 = pd.DataFrame(datasets[0])
                    dataset2 = pd.DataFrame(datasets[1])
                    merged_datasets_df = dataset1.join(dataset2, how='inner',
                                                       lsuffix='_data1',
                                                       rsuffix='_data2')
                    X_fs = merged_datasets_df.values
                    # standardise
                    X_fs = ss.fit_transform(X_fs)
                    # run hugeR's glasso and StARS
                    cov, prec = hugeR.hugeR(X_fs, 0.05)

                    # match features to original informative ones, check docstring
                    # of translate_estimated_matrix_into_original for explanation
                    if prec.shape[0] > 1:
                        prec = tu.translate_estimated_m_into_original(
                            prec, X12_feats, informative)
                        # we only compare the N12 network to make it fair for
                        # mixomics
                        p, r = tu.quality_of_graph(prec_real[i_half:, :i_half],
                                                   prec[i_half:, :i_half],
                                                   sym=False)
                o.write(','.join(map(str, [fs_method, p, r])) + '\n')

            # reorder real precision matrix for the other two methods
            all_rel_feats = list(X1_rel_feats) + list(X2_rel_feats)
            prec_real2 = prec_real
            prec_real2 = prec_real2[all_rel_feats, :]
            prec_real2 = prec_real2[:, all_rel_feats]
            prec_real2 = prec_real2[i_half:, :i_half]

            # --------------------------------------------------------------
            # GRAPH LASSO
            try:
                dataset = pd.DataFrame(X)
                two_n = int(2 * dataset.shape[0])
                top_var_ix = np.array(sorted(np.argsort(dataset.var())[-two_n:]))
                X_gl = X[:, top_var_ix]
                X_gl = ss.fit_transform(X_gl)
                cov, prec = hugeR.hugeR(X_gl, 0.05)
                p, r = tu.quality_of_graph(
                    prec_real[i_half:, :i_half],
                    prec[X1.shape[1]:X1.shape[1] + i_half, :i_half],
                    sym=False)
                o.write(','.join(map(str, ["glasso", p, r])) + '\n')
            except Exception:
                o.write(','.join(map(str, ["glasso", np.nan, np.nan])) + '\n')

            # --------------------------------------------------------------
            # MARGINAL CORR NETWORK
            cov_thresholds = [.05, .1, .2, .3, .5, .7, .8]
            # Pearson on column ranks = Spearman correlation
            rX = bn.rankdata(X, axis=0)
            marg_cov = np.corrcoef(rX, rowvar=0)
            marg_cov = np.abs(marg_cov[X1.shape[1]:X1.shape[1] + i_half, :i_half])
            for thresh in cov_thresholds:
                tmp_cov = (marg_cov > thresh).astype(int)
                p, r = tu.quality_of_graph(prec_real2, tmp_cov, sym=False)
                method = "marginal %f" % thresh
                o.write(','.join(map(str, [method, p, r])) + '\n')

            # --------------------------------------------------------------
            # MIXOMICS
            base = importr('base')
            # this allows us to send numpy to R directly, neat
            numpy2ri.activate()
            mo = importr('mixOmics')
            mo_spls_model = mo.spls(X1, X2, ncomp=3, keepX=i_half, keepY=i_half)
            mo_network = mo.network(mo_spls_model)
            mo_cov = np.array(base.as_data_frame(mo_network.rx('M')))
            numpy2ri.deactivate()
            mo_cov = np.abs(mo_cov[:i_half, :i_half])
            for thresh in cov_thresholds:
                tmp_cov = (mo_cov > thresh).astype(int)
                p, r = tu.quality_of_graph(prec_real2, tmp_cov, sym=False)
                method = "mixomics %f" % thresh
                o.write(','.join(map(str, [method, p, r])) + '\n')
            o.close()
def spearman_correlation_gufunc(x, y):
    x_ranks = bottleneck.rankdata(x, axis=-1)
    y_ranks = bottleneck.rankdata(y, axis=-1)
    return pearson_correlation_gufunc(x_ranks, y_ranks)
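# A sketch of how this gufunc is typically used: wrapped with xarray's
# apply_ufunc for automatic dask parallelization, following the xarray docs
# cited a few snippets above. Assumes `pearson_correlation_gufunc` is defined
# as in those docs; renamed here to avoid clashing with the earlier
# `spearman_correlation`.
import xarray as xr

def spearman_correlation_xr(x, y, dim):
    return xr.apply_ufunc(
        spearman_correlation_gufunc, x, y,
        input_core_dims=[[dim], [dim]],
        dask='parallelized',
        output_dtypes=[float],
    )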
def ev_centrality_dist(net, core, show_plot=False, save_plot=True, outfile=None):
    '''
    Calculate eigenvector centrality, an empirical CDF, and ranking for each
    vertex. Plot both centrality x empirical CDF and centrality x ranking,
    highlighting core vertices.
    Note that the plot is saved as a file only if *both* `save_plot` is true
    and an output filename is given.

    :param net: The network whose degree distribution we'd like to plot
    :param core: The property map of core vertices
    :param show_plot: Show the plot on the screen?
    :param save_plot: Save the plot as a file?
    :param outfile: Filename to use to save the plot

    :return: The CDF and ranking plots.
    '''
    # Calculate eigenvector centrality and write it into the graph
    print('Calculating eigenvector centrality')
    net.vp['evc'] = gt.eigenvector(net, epsilon=1e-03)[1]
    print('Done')
    # Extract them into a useful format
    eigen_central = net.vp['evc'].get_array().tolist()
    # x values: centralities
    centralities = list(set(eigen_central))
    # Use the ecdf to build the y values
    eigen_central_ecdf = ecdf(eigen_central)
    # Use 1-ecdf for legibility when most nodes have centrality near 0
    centrality_distribution = \
        [1 - eigen_central_ecdf(centrality) for centrality in centralities]
    # Write 1-ecdf into the graph
    net.vp['evc ecdf'] = \
        net.new_vertex_property('float',
                                vals=[1 - eigen_central_ecdf(net.vp['evc'][vertex])
                                      for vertex in net.vertices()])
    # Rank the vertices by eigenvector centrality
    vertex_ranking = len(eigen_central) - bn.rankdata(eigen_central) + 1
    # Write them into the graph
    net.vp['evc rank'] = net.new_vertex_property('int', vals=vertex_ranking)
    #print(vertex_ranking)

    print('Mapping rankings to centralities')
    # Map these against `centralities`:
    # for each centrality, get the index of its first occurrence in the
    # vertex-level list `eigen_central`; that index corresponds to the
    # index in `vertex_ranking`
    ranking = [vertex_ranking[eigen_central.index(centrality)]
               for centrality in centralities]

    # Combine into a single data frame
    centrality_dist = pd.DataFrame({'centrality': centralities,
                                    'density': centrality_distribution,
                                    'rank': ranking})
    #print(centrality_dist.head())

    # Grab centralities and rankings for the core vertices
    centralities_core = [net.vp['evc'][vertex] for vertex in core]
    centrality_distribution_core = [net.vp['evc ecdf'][vertex] for vertex in core]
    ranking_core = [net.vp['evc rank'][vertex] for vertex in core]
    centrality_dist_core = \
        pd.DataFrame({'centrality': centralities_core,
                      'density': centrality_distribution_core,
                      'rank': ranking_core})
    #print(centrality_dist_core)
    print('Summary statistics for core vertex centralities:')
    print(pd.DataFrame({k: summary(centrality_dist_core[k])
                        for k in centrality_dist_core}))

    # Build the plot
    density_plot = ggplot(aes(x='centrality'), data=centrality_dist) +\
        geom_area(aes(ymin=0, ymax='density', fill='blue'), alpha=.3) +\
        geom_line(aes(y='density'), color='blue', alpha=.8) +\
        xlab('Eigenvector centrality') +\
        ylab('1 - Cumulative probability density') +\
        scale_x_log10() + scale_y_log10() +\
        theme_bw()
    # Add a rug for the core vertices
    density_plot = density_plot + \
        geom_point(aes(x='centrality', y='density'),
                   shape='+', size=250, alpha=.8, color='red',
                   data=centrality_dist_core)
    # If requested, show the plot
    if show_plot:
        print(density_plot)
    # Save to disk
    if outfile is not None and save_plot:
        ggsave(filename=outfile + '.evc_density' + '.pdf', plot=density_plot)

    # Same thing for centrality x ranking
    ranking_plot = ggplot(aes(x='centrality'), data=centrality_dist) +\
        geom_area(aes(ymin=0, ymax='rank', fill='blue'), alpha=.3) +\
        geom_line(aes(y='rank'), color='blue', alpha=.8) +\
        xlab('Eigenvector centrality') +\
        ylab('Rank') +\
        scale_x_log10() + scale_y_log10() +\
        theme_bw()
    ranking_plot = ranking_plot +\
        geom_point(aes(x='centrality', y='rank'),
                   shape='+', size=250, alpha=.8, color='red',
                   data=centrality_dist_core)
    if show_plot:
        print(ranking_plot)
    if outfile is not None and save_plot:
        ggsave(filename=outfile + '.evc_rank' + '.pdf', plot=ranking_plot)
    return (density_plot, ranking_plot)
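# Both `ev_centrality_dist` above and `degree_dist` below rank vertices with
# the expression `len(x) - bn.rankdata(x) + 1`. Since bn.rankdata gives rank 1
# to the smallest value, this flips the order so rank 1 goes to the largest
# (most central or highest-degree) vertex:
import numpy as np
import bottleneck as bn

x = np.array([0.1, 0.9, 0.5])
print(len(x) - bn.rankdata(x) + 1)  # [3. 1. 2.]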
def degree_dist(net, core, show_plot=False, save_plot=True, outfile=None):
    '''
    Calculate out-degree, an empirical CDF, and ranking for each vertex.
    Plot both degree x empirical CDF and degree x ranking, highlighting
    core vertices.
    Note that the plot is saved as a file only if *both* `save_plot` is true
    and an output filename is given.

    :param net: The network whose degree distribution we'd like to plot
    :param core: The property map of core vertices
    :param show_plot: Show the plot on the screen?
    :param save_plot: Save the plot as a file?
    :param outfile: Filename to use to save the plot

    :return: The CDF and ranking plots.
    '''
    # Build degree distribution
    # Out-degree for every vertex
    out_degrees = [vertex.out_degree() for vertex in net.vertices()]
    # Write them into the graph
    net.vp['out-degree'] = net.new_vertex_property('int', vals=out_degrees)
    # x values: degrees
    degrees = list(set(out_degrees))
    # Use the ecdf to build the y values
    out_degree_ecdf = ecdf(out_degrees)
    # Use 1-ecdf for legibility when most nodes have degree near 0
    out_degree_dist = [1 - out_degree_ecdf(degree) for degree in degrees]
    # Write 1-ecdf into the graph
    net.vp['out-degree ecdf'] = \
        net.new_vertex_property('float',
                                vals=[1 - out_degree_ecdf(net.vp['out-degree'][vertex])
                                      for vertex in net.vertices()])
    # Rank the vertices by out-degree
    vertex_ranking = len(out_degrees) - bn.rankdata(out_degrees) + 1
    # Write them into the graph
    net.vp['out-degree rank'] = net.new_vertex_property('int', vals=vertex_ranking)

    # Map these against `degrees`:
    # for each degree, get the index of its first occurrence in the
    # vertex-level list `out_degrees`; that index corresponds to the
    # index in `vertex_ranking`
    ranking = [vertex_ranking[out_degrees.index(degree)] for degree in degrees]

    # Combine into a single data frame
    degree_dist = pd.DataFrame({'degree': degrees,
                                'density': out_degree_dist,
                                'rank': ranking})

    # Grab the degrees and rankings for the core vertices
    out_degrees_core = [net.vp['out-degree'][vertex] for vertex in core]
    out_degree_dist_core = [net.vp['out-degree ecdf'][vertex] for vertex in core]
    ranking_core = [net.vp['out-degree rank'][vertex] for vertex in core]
    degree_dist_core = \
        pd.DataFrame({'degree': out_degrees_core,
                      'density': out_degree_dist_core,
                      'rank': ranking_core})
    #print(degree_dist_core)
    print('Summary statistics for core vertex out-degrees:')
    print(pd.DataFrame({k: summary(degree_dist_core[k])
                        for k in degree_dist_core}))

    # Build the degree x density plot
    density_plot = ggplot(aes(x='degree'), data=degree_dist) +\
        geom_area(aes(ymin=0, ymax='density', fill='blue'), alpha=.3) +\
        geom_line(aes(y='density', color='blue'), alpha=.8) +\
        xlab('Out-degree') +\
        ylab('1 - Cumulative probability density') +\
        scale_x_log10() + scale_y_log10() +\
        theme_bw()
    # Add a rug for the core vertices
    density_plot = density_plot + \
        geom_point(aes(x='degree', y='density'),
                   shape='+', size=250, alpha=.8, color='red',
                   data=degree_dist_core)
    # If requested, show the plot
    if show_plot:
        print(density_plot)
    # Save to disk
    if outfile is not None and save_plot:
        ggsave(filename=outfile + '.degree_density' + '.pdf', plot=density_plot)

    # Same thing for degree x ranking
    ranking_plot = ggplot(aes(x='degree'), data=degree_dist) +\
        geom_area(aes(ymin=0, ymax='rank', fill='blue'), alpha=.3) +\
        geom_line(aes(y='rank', color='blue'), alpha=.8) +\
        xlab('Out-degree') +\
        ylab('Rank') +\
        scale_x_log10() + scale_y_log10() +\
        theme_bw()
    ranking_plot = ranking_plot +\
        geom_point(aes(x='degree', y='rank'),
                   shape='+', size=250, alpha=.8, color='red',
                   data=degree_dist_core)
    if show_plot:
        print(ranking_plot)
    if outfile is not None and save_plot:
        ggsave(filename=outfile + '.degree_rank' + '.pdf', plot=ranking_plot)
    return (density_plot, ranking_plot)
def allPairs_sharpeWeightedRank_2D(datearray, symbols, adjClose, signal2D,
                                   LongPeriod, rankthreshold,
                                   riskDownside_min, riskDownside_max,
                                   rankThresholdPct):
    # adjClose      -- 2D array with adjusted closing prices
    #                  (axes are stock number, date)
    # rankthreshold -- select this many funds with best recent performance
    import os
    import numpy as np
    try:
        import bottleneck as bn
        from bottleneck import rankdata as rd  # was `from bn import ...`, a bug
    except ImportError:
        import scipy.stats.mstats as bn

    gainloss = np.ones((adjClose.shape[0], adjClose.shape[1]), dtype=float)
    gainloss[:, 1:] = adjClose[:, 1:] / adjClose[:, :-1]
    gainloss[np.isnan(gainloss)] = 1.

    # convert signal2D to contain either 1 or 0 for weights
    signal2D -= signal2D.min()
    signal2D *= signal2D.max()

    # apply signal to daily gainloss
    gainloss = gainloss * signal2D
    gainloss[gainloss == 0] = 1.0
    value = 10000. * np.cumprod(gainloss, axis=1)

    # calculate gainloss over period of "LongPeriod" days
    monthgainloss = np.ones((adjClose.shape[0], adjClose.shape[1]), dtype=float)
    monthgainloss[:, LongPeriod:] = adjClose[:, LongPeriod:] / adjClose[:, :-LongPeriod]
    monthgainloss[np.isnan(monthgainloss)] = 1.

    monthgainlossweight = np.zeros((adjClose.shape[0], adjClose.shape[1]),
                                   dtype=float)
    rankweight = 1. / rankthreshold

    ########################################################################
    ## Calculate change in rank of active stocks each day
    ## (without duplicates as ties)
    ########################################################################
    monthgainlossPrevious = np.zeros((adjClose.shape[0], adjClose.shape[1]),
                                     dtype=float)

    ###
    monthgainlossRank = allPairsRanking(adjClose, symbols, datearray,
                                        span=LongPeriod)
    ###
    # (previous implementation: monthgainlossRank = bn.rankdata(monthgainloss, axis=0))

    # reverse the ranks (low ranks are biggest gainers); the three in-place
    # operations below compute maxrank - rank + 1
    maxrank = np.max(monthgainlossRank)
    monthgainlossRank -= maxrank - 1
    monthgainlossRank *= -1
    monthgainlossRank += 2

    monthgainlossPrevious[:, LongPeriod:] = monthgainloss[:, :-LongPeriod]
    monthgainlossPreviousRank = bn.rankdata(monthgainlossPrevious, axis=0)
    # reverse the ranks (low ranks are biggest gainers)
    maxrank = np.max(monthgainlossPreviousRank)
    monthgainlossPreviousRank -= maxrank - 1
    monthgainlossPreviousRank *= -1
    monthgainlossPreviousRank += 2

    # weight deltaRank for best and worst performers differently
    rankoffsetchoice = rankthreshold
    delta = -(monthgainlossRank.astype('float') -
              monthgainlossPreviousRank.astype('float')) / \
            (monthgainlossRank.astype('float') + float(rankoffsetchoice))

    # if rank is outside the acceptable threshold, set deltarank low so the
    # stock will not be chosen (remember that low ranks are biggest gainers)
    rankThreshold = (1. - rankThresholdPct) * (monthgainlossRank.max() -
                                               monthgainlossRank.min())
    for ii in range(monthgainloss.shape[0]):
        for jj in range(monthgainloss.shape[1]):
            if monthgainloss[ii, jj] > rankThreshold:
                delta[ii, jj] = -monthgainloss.shape[0] / 2
                if jj == monthgainloss.shape[1] - 1:  # was shape[1], never true
                    print("*******setting delta (Rank) low..."
                          " Stock has rank outside acceptable range... ",
                          ii, symbols[ii], monthgainloss[ii, jj])

    # if adjClose is nan, set deltarank low so the stock will not be chosen
    for ii in range(monthgainloss.shape[0]):
        if np.isnan(adjClose[ii, -1]):
            delta[ii, :] = -monthgainloss.shape[0] / 2
            numisnans = adjClose[ii, :]
            # NaN in the last value usually means the stock was removed from
            # the index so it is not updated, but its history is still in the
            # HDF file
            print("*******setting delta (Rank) low..."
                  " Stock has NaN for last value... ",
                  ii, symbols[ii], numisnans[np.isnan(numisnans)].shape)

    deltaRank = bn.rankdata(delta, axis=0)
    # reverse the ranks (low deltaRank have the fastest improving rank)
    maxrank = np.max(deltaRank)
    deltaRank -= maxrank - 1
    deltaRank *= -1
    deltaRank += 2

    for ii in range(monthgainloss.shape[1]):
        if deltaRank[:, ii].min() == deltaRank[:, ii].max():
            deltaRank[:, ii] = 0.

    ########################################################################
    ## Hold values constant for calendar month (gains, ranks, deltaRanks)
    ########################################################################
    for ii in range(1, monthgainloss.shape[1]):
        if datearray[ii].month == datearray[ii - 1].month:
            monthgainloss[:, ii] = monthgainloss[:, ii - 1]
            delta[:, ii] = delta[:, ii - 1]
            deltaRank[:, ii] = deltaRank[:, ii - 1]

    ########################################################################
    ## Calculate number of active stocks each day
    ########################################################################
    # TODO: activeCount can be computed before the loop to save CPU cycles
    activeCount = np.zeros(adjClose.shape[1], dtype=float)
    for ii in np.arange(0, monthgainloss.shape[0]):
        firsttradedate = np.argmax(np.clip(np.abs(gainloss[ii, :] - 1.),
                                           0., .00001))
        activeCount[firsttradedate:] += 1

    minrank = np.min(deltaRank, axis=0)
    maxrank = np.max(deltaRank, axis=0)
    # convert rank threshold to equivalent percent of rank range
    rankthresholdpercentequiv = np.round(float(rankthreshold) *
                                         (activeCount - minrank + 1) /
                                         adjClose.shape[0])
    ranktest = deltaRank <= rankthresholdpercentequiv

    ########################################################################
    ### Calculate downside risk measure for weighting stocks.
    ### Use 1. / movingwindow_sharpe_ratio for the risk measure.
    ### Modify weights with 1./riskDownside and scale so they sum to 1.0
    ########################################################################
    riskDownside = 1. / move_sharpe_2D(adjClose, gainloss, LongPeriod)
    riskDownside = np.clip(riskDownside, riskDownside_min, riskDownside_max)
    riskDownside[np.isnan(riskDownside)] = np.max(
        riskDownside[~np.isnan(riskDownside)])
    for ii in range(riskDownside.shape[0]):
        riskDownside[ii] = riskDownside[ii] / np.sum(riskDownside, axis=0)

    ########################################################################
    ### calculate equal weights for ranks below threshold
    ########################################################################
    elsecount = 0
    elsedate = 0
    for ii in np.arange(1, monthgainloss.shape[1]):
        if activeCount[ii] > minrank[ii] and rankthresholdpercentequiv[ii] > 0:
            for jj in range(value.shape[0]):
                test = deltaRank[jj, ii] <= rankthresholdpercentequiv[ii]
                if test:
                    monthgainlossweight[jj, ii] = 1. / rankthresholdpercentequiv[ii]
                    monthgainlossweight[jj, ii] = (monthgainlossweight[jj, ii] /
                                                   riskDownside[jj, ii])
                else:
                    monthgainlossweight[jj, ii] = 0.
        elif activeCount[ii] == 0:
            monthgainlossweight[:, ii] *= 0.
            monthgainlossweight[:, ii] += 1. / adjClose.shape[0]
        else:
            elsedate = datearray[ii]
            elsecount += 1
            monthgainlossweight[:, ii] = 1. / activeCount[ii]

    allzerotest = np.sum(monthgainlossweight, axis=0)
    # count all-zero columns (the original compared a shape tuple to 0)
    sumallzerotest = allzerotest[allzerotest == 0].shape[0]
    if sumallzerotest > 0:
        print("")
        print(" invoking correction to monthgainlossweight.....")
        print("")
        for ii in np.arange(1, monthgainloss.shape[1]):
            if np.sum(monthgainlossweight[:, ii]) == 0:
                monthgainlossweight[:, ii] = 1. / activeCount[ii]

    print(" weights calculation else clause encountered :", elsecount,
          " times. last date encountered is ", elsedate)

    rankweightsum = np.sum(monthgainlossweight, axis=0)
    monthgainlossweight[np.isnan(monthgainlossweight)] = 0.  # changed result from 1 to 0
    monthgainlossweight = monthgainlossweight / np.sum(monthgainlossweight, axis=0)
    monthgainlossweight[np.isnan(monthgainlossweight)] = 0.  # changed result from 1 to 0

    # input symbols and company names from text file
    companyName_file = os.path.join(os.getcwd(), "symbols", "companyNames.txt")
    with open(companyName_file, "r") as f:
        companyNames = f.read()
    print("\n\n\n")
    companyNames = companyNames.split("\n")
    ii = companyNames.index("")
    del companyNames[ii]
    companySymbolList = []
    companyNameList = []
    for iname, name in enumerate(companyNames):
        name = name.replace("amp;", "")
        testsymbol, testcompanyName = name.split(";")
        companySymbolList.append(testsymbol)
        companyNameList.append(testcompanyName)

    # print list showing current rankings and weights:
    # symbol, rank, weight from sharpe ratio, price
    rank_text = ("<div id='rank_table_container'><h3>"
                 "<p>Current stocks, with ranks, weights, and prices are :</p></h3>"
                 "<font face='courier new' size=3><table border='1'>"
                 "<tr><td>Rank</td><td>Symbol</td><td>Company</td>"
                 "<td>Weight</td><td>Price</td><td>Trend</td></tr>\n")
    for i, isymbol in enumerate(symbols):
        for j in range(len(symbols)):
            if int(deltaRank[j, -1]) == i:
                if signal2D[j, -1] == 1.:
                    trend = 'up'
                else:
                    trend = 'down'
                # search for company name
                try:
                    symbolIndex = companySymbolList.index(symbols[j])
                    companyName = companyNameList[symbolIndex]
                except ValueError:
                    companyName = ""
                rank_text = rank_text + \
                    "<tr><td>" + format(deltaRank[j, -1], '6.0f') + \
                    "<td>" + format(symbols[j], '5s') + \
                    "<td>" + format(companyName, '15s') + \
                    "<td>" + format(monthgainlossweight[j, -1], '5.03f') + \
                    "<td>" + format(adjClose[j, -1], '6.2f') + \
                    "<td>" + trend + \
                    "</td></tr> \n"
    rank_text = rank_text + "</table></div>\n"
    filepath = os.path.join(os.getcwd(), "pyTAAA_web", "pyTAAAweb_RankList.txt")
    with open(filepath, "w") as f:
        f.write(rank_text)

    print("leaving function sharpeWeightedRank_2D...")
    return monthgainlossweight
def rollingRankArgsort(array):
    # rank of the newest (last) element within the window; NaN if ranking
    # fails (`bd` in the original was a typo for the bottleneck module)
    try:
        return bn.rankdata(array)[-1]
    except Exception:
        return np.nan
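# Typical use: a rolling rank via pandas' rolling().apply, where the helper
# returns the rank of the newest observation within each window.
import numpy as np
import pandas as pd
import bottleneck as bn

s = pd.Series([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0])
rolling_rank = s.rolling(window=3).apply(rollingRankArgsort, raw=True)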
# Hold values constant within each calendar month
for ii in np.arange(1, monthgainloss.shape[1]):
    if datearray[ii].month == datearray[ii - 1].month:
        monthgainloss[:, ii] = monthgainloss[:, ii - 1]

for ii in range(monthgainloss.shape[0]):
    plt.plot(datearray, monthgainloss[ii, :])
print(" monthgainloss check: ", monthgainloss[np.isnan(monthgainloss)].shape)

monthgainlossrange = np.ones(x.shape[0], dtype=float)
monthgainlossweight = np.zeros((x.shape[0], x.shape[2]), dtype=float)

rankthreshold = 4  # select this many funds with best recent performance
rankweight = 1. / rankthreshold

monthgainlossrank = bn.rankdata(monthgainloss, axis=0)
rankmin = np.min(monthgainlossrank, axis=0)
rankmax = np.max(monthgainlossrank, axis=0)
rankcutoff = float(x.shape[0] - rankthreshold) / (x.shape[0] - 1) * \
    (rankmax - rankmin) * rankmin
ranktest = monthgainlossrank > rankcutoff
monthgainlossweight[ranktest] = rankweight

rankweightsum = np.sum(monthgainlossweight, axis=0)
print(" 2a - rankweightsum check isnan: ",
      rankweightsum[np.isnan(rankweightsum)].shape[0])
print(" 2b - rankweightsum check isinf: ",
      rankweightsum[np.isinf(rankweightsum)].shape[0])
print(" 2c - rankweightsum check: zero ",
      rankweightsum[np.where(rankweightsum == 0)].shape[0])
print(" 2d - shape of rankweightsum : ", rankweightsum.shape[0])

plt.figure(22)
plt.grid()
plt.title('rankweightsum')
plt.plot(datearray, rankweightsum)
def time_rankdata(self, dtype, shape):
    bn.rankdata(self.arr)
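# This reads like an asv (airspeed velocity) benchmark method: `dtype` and
# `shape` are benchmark parameters, and `self.arr` would be built in a `setup`
# method along these (assumed) lines:
import numpy as np
import bottleneck as bn

class TimeRankdata:
    params = [["float64", "int64"], [(10000,), (1000, 1000)]]
    param_names = ["dtype", "shape"]

    def setup(self, dtype, shape):
        self.arr = np.ones(shape, dtype=dtype)

    def time_rankdata(self, dtype, shape):
        bn.rankdata(self.arr)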
def corr_main(params):
    """
    This is the main function, which performs the following steps:
    - open dataset(s), load selected features, merge datasets
    - perform GLASSO with the huge R package
    - calculate permuted p-values with GPD approximation in parallel
    - correct for multiple testing
    - save r and p value matrices for users
    - save networks from r values for users
    - write variables and datasets for visualisation in JS
    """
    # --------------------------------------------------------------------------
    # CALCULATE GRAPHLASSO AND PERMUTED P-VALS
    # --------------------------------------------------------------------------
    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape

    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        # if two features have the same name we need prefixes
        merged_datasets_df = dataset1.join(dataset2, how='inner',
                                           lsuffix='_data1', rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    ss = StandardScaler()
    X = ss.fit_transform(X)

    # perform GLASSO with huge in R
    lambda_threshold = params['lambda_val']
    cov, prec = hugeR.hugeR(X, lambda_threshold)

    # create column-ranked X for corr_permutation
    rX = bn.rankdata(X, axis=0)

    # get GPD approximated p-values
    perm_num = 10000
    rs, p_vals, p_mask = cp.gpd_spearman(rX, perm_num=perm_num, prec=prec,
                                         mc_method=params['multi_corr_method'],
                                         mc_alpha=params['alpha_val'])

    # delete correlations that did not pass the multiple testing correction
    rs[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # CHECK IF GENOMIC FILTERING IS NEEDED
    # --------------------------------------------------------------------------
    # if fs, load metadata column for fold_change calculation later
    if params['fs']:
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[params['fs_cols']].iloc[1:].dropna()
    else:
        y = None

    # if genomic, check whether overlapping and distant corrs need filtering
    discard_or_constrain = params['discard_overlap'] or params['constrain_corr']
    if params['annotation'] and discard_or_constrain:
        genomic = True
    else:
        genomic = False

    # --------------------------------------------------------------------------
    # GENERATE PAIRWISE PLOTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------
    generate_pair_plots(params, rs, p_vals, merged_datasets_df, p)

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------
    params = write_results(params, rs[:p, :p], p_vals[:p, :p], genomic,
                           (dataset1, dataset1), 'dataset1', y, True)
    if not params['autocorr']:
        params = write_results(params, rs[p:, p:], p_vals[p:, p:], genomic,
                               (dataset2, dataset2), 'dataset2', y, True)
        params = write_results(params, rs[:p, p:], p_vals[:p, p:], genomic,
                               (dataset1, dataset2), 'dataset1_2', y)

    # write_results sets 'corr_done' to False if a writing step failed;
    # otherwise mark the run as done
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params