def compute_ir_metrics(ge_wrapper, ge_loader, smiles_wrapper, smiles_loader, split="train", train_smiles=None):
    """Compute retrieval metrics between gene-expression and chemical embeddings.

    Both modalities are embedded with ``get_embeddings``; pairwise distances
    (metric read from ``config['retrieval']['metric']``) are turned into
    per-row ranks, and the rank of the first correct match for every
    gene-expression sample is summarised with ``prepare_metrics``.

    Args:
        ge_wrapper, ge_loader, smiles_wrapper, smiles_loader: forwarded
            unchanged to ``get_embeddings``.
        split: when equal to ``"val"`` additional metric sets are computed
            (see Returns).
        train_smiles: container queried with ``in``; only read when
            ``split == "val"``.

    Returns:
        list: one ``prepare_metrics`` result over all samples.  For
        ``split == "val"`` this is followed by results for samples whose
        label is in ``train_smiles``, for samples whose label is not, and
        finally for the latter samples re-ranked against only those
        chemicals that are not in ``train_smiles``.
    """
    gex_emb, chem_emb, gex_labels, chem_labels = get_embeddings(
        ge_wrapper, ge_loader, smiles_wrapper, smiles_loader)
    distances = cdist(gex_emb, chem_emb, metric=config['retrieval']['metric'])
    full_ranks = rankdata(distances, axis=1)
    first_match = get_ranks_first_match(full_ranks, gex_labels, chem_labels)

    is_val = split == "val"
    # The first group always covers every gene-expression sample.
    index_groups = [[pos for pos, _ in enumerate(gex_labels)]]
    if is_val:
        seen, unseen = [], []
        for pos, label in enumerate(gex_labels):
            if label in train_smiles:
                seen.append(pos)
            else:
                unseen.append(pos)
        index_groups.extend([seen, unseen])
        unseen_chem = [pos for pos, label in enumerate(chem_labels)
                       if label not in train_smiles]

    ir_results = [prepare_metrics(first_match, group) for group in index_groups]

    if is_val:
        # Restrict to unseen chemicals (columns) and unseen samples (rows),
        # then re-rank inside that sub-matrix.
        sub_ranks = rankdata(full_ranks[:, unseen_chem][unseen, :], axis=1)
        sub_first_match = get_ranks_first_match(
            sub_ranks, gex_labels[unseen], chem_labels[unseen_chem])
        ir_results.append(
            prepare_metrics(sub_first_match, list(range(len(unseen)))))
    return ir_results
def rank_data(r, rground): # we checked this heavily, and is correct, e.g. rground will go from largest rank to smallest r = rankdata(r) rground = rankdata(rground) if np.sum(r) != np.sum(rground): raise AssertionError("ranks should add up to the same") return r, rground
def run_split(root, print_all=False):
    """Print cumulative positive counts per rank cutoff for ``*.scored`` files.

    Every ``<root>/*.scored`` file is read with ``read_scores``; positives
    and negatives are ranked together (highest score gets rank 1) and, for
    each cutoff ``1 .. len(ranks) - 1``, the number of positives ranked at or
    above the cutoff is counted.

    Args:
        root: directory that contains the ``*.scored`` files.
        print_all: if true, print one count column per file; otherwise print
            the mean count over all files.

    Raises:
        AssertionError: if the files do not all contain the same number of
            positives.
    """
    cum_scores = []
    n_pos = None
    for filename in glob("%s/*.scored" % root):
        pos, neg = read_scores(filename)
        scores = concatenate((pos, neg))
        # Negate so that the largest score receives rank 1.
        ranks = rankdata(-scores)
        if n_pos is None:
            n_pos = len(pos)
        else:
            assert n_pos == len(pos)
        # Positives were concatenated first, so they occupy the leading slots.
        pos_ranks = ranks[:n_pos]
        counts = [(pos_ranks <= i).sum() for i in range(1, len(ranks))]
        cum_scores.append(tuple(counts))
    # Rows become cutoffs, columns become files.
    cum_scores = array(cum_scores).transpose()
    if print_all:
        print('#cutoff\tcounts...')
        for i, row in enumerate(cum_scores):
            print('%d\t%s' % (i + 1, '\t'.join(['%d' % count for count in row])))
    else:
        print('#cutoff\trecall')
        # With no input files the array is one-dimensional and has no axis 1;
        # print the header only instead of failing.
        means = cum_scores.mean(axis=1) if cum_scores.ndim == 2 else []
        for i, mean in enumerate(means):
            print('%d\t%.1f' % (i + 1, mean))
def run_loo(root, print_all=False):
    """Print leave-one-out recall per rank cutoff for ``*.scored`` files.

    File names are expected to look like ``<major>-<rep>.<ext>``.  Each file
    must hold exactly one positive; its rank among all scores (highest score
    gets rank 1) is collected per ``rep``.

    Args:
        root: directory that contains the ``*.scored`` files.
        print_all: if true, print one count column per rep; otherwise print
            the count averaged over reps.

    Raises:
        AssertionError: if a file does not hold exactly one positive, or if
            the reps do not all have the same number of files.
        ValueError: if a file name does not contain exactly one ``-`` before
            its first ``.``.
    """
    cum_scores = defaultdict(list)
    for filename in glob("%s/*.scored" % root):
        id_ = os.path.basename(filename).split('.')[0]
        _major, rep = id_.split('-')
        pos, neg = read_scores(filename)
        scores = concatenate((pos, neg))
        # Negate so that the largest score receives rank 1.
        ranks = rankdata(-scores)
        assert len(pos) == 1
        # The single positive was concatenated first.
        cum_scores[rep].append(ranks[0])
    reps = len(cum_scores)
    assert len(set(map(len, cum_scores.values()))) == 1
    sys.stderr.write("Found %d reps per iter...\n" % reps)
    # Rows are files within a rep, columns are reps (sorted by name).
    ranks = array([cum_scores[rep] for rep in sorted(cum_scores)]).transpose()
    if print_all:
        print('#cutoff\trecall...')
        for cutoff in range(1, ranks.shape[0] + 1):
            counts = (ranks <= cutoff).sum(axis=0)
            print('%d\t%s' % (cutoff, '\t'.join(
                ['%.1f' % count for count in counts])))
    else:
        print('#cutoff\trecall')
        for cutoff in range(1, ranks.shape[0] + 1):
            count = float((ranks <= cutoff).sum()) / reps
            print('%d\t%.1f' % (cutoff, count))
def quantile_normalization(self):
    """Quantile-normalise the columns of ``self.all_table``.

    Each value is replaced by the mean, taken across columns, of the values
    that occupy the same position after sorting every column.  The position
    is the value's within-column rank rounded with ``numpy.around``.  The
    result is stored in ``self.norm_table``; nothing is returned.
    """
    table = self.all_table
    n_rows, n_cols = table.shape[0], table.shape[1]

    # Rank every column independently, then put the ranks back in table layout.
    rank_matrix = [mstats.rankdata(table[:, c]) for c in range(n_cols)]
    trans_rank = numpy.transpose(numpy.array(rank_matrix))

    # Calculate for means of ranks
    print(" Calculating for the mean of ranked data...")
    sort_matrix = numpy.sort(table, axis=0)
    means = numpy.array(
        [numpy.mean(sort_matrix[r, :]) for r in range(n_rows)], dtype=float)

    # Replace the value by new means
    print(" Replacing the data value by normalized mean...")
    rounded = numpy.around(trans_rank)
    positions = rounded.astype(int)
    normalized_table = numpy.array(rounded, dtype=float)
    # Look every rank up in one step.  Substituting rank after rank in place
    # would overwrite an already substituted mean whenever that mean happens
    # to equal a later integer rank.
    has_rank = (positions >= 1) & (positions <= n_rows)
    normalized_table[has_rank] = means[positions[has_rank] - 1]
    self.norm_table = normalized_table
def rank_rows(M):
    """Replace every row of ``M`` in place by its zero-based ranks.

    Masks are preserved: for a masked array the underlying data of a row is
    ranked and the row's mask is restored afterwards.  The fill value of
    ``M`` must be the maximum value for that array.

    Args:
        M: two-dimensional ``ndarray`` or masked array; modified in place.

    Raises:
        AssertionError: if ``M`` is masked and ``M == M.fill_value`` holds
            for any unmasked entry.
    """
    # Anything exposing ``.mask`` is handled as a masked array.
    is_masked = hasattr(M, 'mask')
    for i in range(np.size(M, 0)):
        if not is_masked:
            M[i, :] = mstats.rankdata(M[i, :]) - 1
            continue
        assert np.sum(M == M.fill_value) == 0
        # Assigning plain ranks clears the row mask, so save it first and
        # restore it afterwards.
        mask = np.copy(M[i, :].mask)
        M[i, :] = mstats.rankdata(M[i, :].data) - 1
        M[i, :].mask = mask
def rankData(x):
    '''
    Rank the data along the last dimension (the node dimension).

    NaN/inf entries are masked before ranking.  A sample (one vector along
    the last dimension) is only ranked when its sum is non-zero; otherwise
    its ranks stay 0 and its entry in the second return value stays NaN.

    Returns
    -------
    ranking : ndarray, same shape as x
        Ranks within every sample.
    maxim : ndarray, shape ``x.shape[:-1]``
        Index of the highest-ranked node of every sample (NaN when the
        sample was skipped).

    It is not the most efficient method used
    '''
    from scipy.stats.mstats import rankdata
    s = x.shape  # extract shape
    ranking = np.zeros(s).reshape((-1, s[-1]))  # one row per sample
    # np.full also works for the 0-d case produced by 1-D input.
    maxim = np.full(s[:-1], np.nan).reshape(-1)
    noneFound = 0
    mask = np.ma.masked_invalid(x)
    for idx, sample in enumerate(mask.reshape(-1, s[-1])):
        # only rank samples whose sum is non-zero
        if sample.sum() != 0:
            rank = rankdata(sample)
            ranking[idx] = rank
            maxim[idx] = rank.argmax()
        else:
            noneFound += 1
    ranking = ranking.reshape(s)  # reshape back
    maxim = maxim.reshape(s[:-1])
    print(f'In {noneFound} trials no max are found')
    return ranking, maxim
def main(kernel, ranks_file, stats_dir, metric='acc'):
    """Write and print the average rank of every technique.

    For each entry of ``TECHNIQUES`` the stats file (its name is formatted
    with ``metric``) is read from ``stats_dir`` and every comma-separated
    line is handed to the technique's parser.  Only datasets that have a
    statistic for every technique are ranked; a larger statistic gets a
    better (smaller) rank.

    Args:
        kernel: passed through to every parser.
        ranks_file: output path; one ``technique,n_datasets,avg_rank`` line
            is written per technique.
        stats_dir: directory that holds the stats files.
        metric: substituted into the stats file name templates.
    """
    techniques = list(TECHNIQUES.keys())
    stats = dict()
    stat_count = defaultdict(int)
    for technique, (stats_file, parser) in TECHNIQUES.items():
        stats_file = (stats_file % metric)
        with open(os.path.join(stats_dir, stats_file), 'r') as f:
            for line in f:
                parts = line.strip().split(',')
                results = parser(parts, kernel)
                if results is None:
                    continue
                dset, stat = results
                # Count a dataset once per technique; a repeated line must
                # not make it look complete while another technique lacks it.
                if (technique, dset) not in stats:
                    stat_count[dset] += 1
                stats[technique, dset] = stat
    good_datasets = [dset for dset in stat_count
                     if stat_count[dset] == len(techniques)]
    data = np.array([[stats[t, d] for d in good_datasets] for t in techniques])
    # Rank the techniques within every dataset; negate so larger is better.
    ranks = rankdata(-data, axis=0)
    avg_ranks = np.average(ranks, axis=1)
    with open(ranks_file, 'w+') as f:
        for t, r in zip(techniques, avg_ranks.flat):
            line = '%s,%d,%f\n' % (t, ranks.shape[1], r)
            f.write(line)
            # ``line`` already ends with a newline.
            print(line, end='')
def compute_rank(data):
    """Print per-model rank statistics for the SSIM and PSNR scores.

    Args:
        data: mapping with keys ``'ssim'`` and ``'psnr'`` (two-dimensional
            score tables whose columns line up with ``data['models']``) and
            ``'models'`` (model names).
    """
    print('\nRANK\n')
    # rankdata assigns rank 1 to the lowest element, so
    # we need to negate before ranking.
    ssim_rank = rankdata(np.array(data['ssim']) * -1.0, axis=1)
    psnr_rank = rankdata(np.array(data['psnr']) * -1.0, axis=1)
    # Rank mean + std.
    for i, m in enumerate(data['models']):
        print('%30s ssim-rank %.2f ± %.2f psnr-rank %.2f ± %.2f' % (
            m, np.mean(ssim_rank[:, i]), np.std(ssim_rank[:, i]),
            np.mean(psnr_rank[:, i]), np.std(psnr_rank[:, i])))
    # Rank frequencies
    print('\n SSIM rank freqs')
    print_rank_freqs(data, ssim_rank)
    print('\n PSNR rank freqs')
    print_rank_freqs(data, psnr_rank)
def _ranks(data): """ This function computes ranks for data in the table along axis=0. Parameters ---------- data : np.ndarray Array of data to be ranked Returns ------- np.ndarray Table of data ranks """ x_len = data.shape[0] x_mask = data.sum(axis=0) > 0 # create a matrix of ranges - init with average rank # for columns without nonzero expressions data_ge_ranked = np.ones(data.shape) * (1 + data.shape[0]) / 2 # compute ranks only for nonzero columns for i in np.where(x_mask)[0]: mask = data[:, i] > 0 col = np.ones(x_len) * (1 + (x_len - mask.sum())) / 2 col[mask] = rankdata(data[mask, i]) + (x_len - mask.sum()) data_ge_ranked[:, i] = col return data_ge_ranked
def fit(self, signal) -> "CostRank":
    """Set parameters of the instance.

    Stores the centred ranks of the signal in ``self.ranks`` and the
    pseudo-inverse of their covariance matrix in ``self.inv_cov``.

    Args:
        signal (array): signal. Shape (n_samples,) or (n_samples, n_features)

    Returns:
        self

    Raises:
        LinAlgError: if the pseudo-inverse computation does not converge.
    """
    data = signal.reshape(-1, 1) if signal.ndim == 1 else signal
    n_samples, n_dims = data.shape

    # Ranks lie in [1, n]; subtracting (n + 1) / 2 centres them around zero.
    midpoint = (n_samples + 1) / 2
    centered = rankdata(data, axis=0) - midpoint

    # np.cov returns a scalar for a single feature, hence the reshape.
    covariance = np.cov(centered, rowvar=False, bias=True)
    covariance = covariance.reshape(n_dims, n_dims)

    # Use the pseudoinverse to handle linear dependencies
    # see Lung-Yut-Fong, A., Lévy-Leduc, C., & Cappé, O. (2015)
    try:
        inverse = pinv(covariance)
    except LinAlgError as err:
        raise LinAlgError(
            "The covariance matrix of the rank signal is not invertible and the "
            "pseudo-inverse computation did not converge.") from err
    self.inv_cov = inverse
    self.ranks = centered
    return self
def main(kernel, ranks_file, stats_dir, metric='acc'):
    """Write and print the average rank of every technique.

    For each entry of ``TECHNIQUES`` the stats file (its name is formatted
    with ``metric``) is read from ``stats_dir`` and every comma-separated
    line is handed to the technique's parser.  Only datasets that have a
    statistic for every technique are ranked; a larger statistic gets a
    better (smaller) rank.

    Args:
        kernel: passed through to every parser.
        ranks_file: output path; one ``technique,n_datasets,avg_rank`` line
            is written per technique.
        stats_dir: directory that holds the stats files.
        metric: substituted into the stats file name templates.
    """
    techniques = list(TECHNIQUES.keys())
    stats = dict()
    stat_count = defaultdict(int)
    for technique, (stats_file, parser) in TECHNIQUES.items():
        stats_file = (stats_file % metric)
        with open(os.path.join(stats_dir, stats_file), 'r') as f:
            for line in f:
                parts = line.strip().split(',')
                results = parser(parts, kernel)
                if results is None:
                    continue
                dset, stat = results
                # Count a dataset once per technique; a repeated line must
                # not make it look complete while another technique lacks it.
                if (technique, dset) not in stats:
                    stat_count[dset] += 1
                stats[technique, dset] = stat
    good_datasets = [
        dset for dset in stat_count
        if stat_count[dset] == len(techniques)
    ]
    data = np.array([[stats[t, d] for d in good_datasets] for t in techniques])
    # Rank the techniques within every dataset; negate so larger is better.
    ranks = rankdata(-data, axis=0)
    avg_ranks = np.average(ranks, axis=1)
    with open(ranks_file, 'w+') as f:
        for t, r in zip(techniques, avg_ranks.flat):
            line = '%s,%d,%f\n' % (t, ranks.shape[1], r)
            f.write(line)
            # ``line`` already ends with a newline.
            print(line, end='')
def get_rank_vector(self, x):
    """Get ranking with explicit handling of missing values, tagged as nan.

    Valid entries are ranked among themselves; every invalid (NaN/inf)
    entry receives the same rank ``self.threshold + 1`` so that missing
    values are not ordered by appearance.

    Args:
        x: one-dimensional array of values, NaN marking missing entries.

    Returns:
        ndarray of ranks with the same length as ``x``.
    """
    missing_rank = self.threshold + 1
    if np.all(np.isnan(x)):
        # Nothing to rank: every entry gets the missing-value rank.
        return np.ones(len(x)) * missing_rank
    ranks = mstats.rankdata(np.ma.masked_invalid(x))
    # mstats.rankdata reports masked entries with rank 0.
    ranks[ranks == 0] = missing_rank
    return ranks
def run_loo(root, print_all=False):
    """Print leave-one-out recall per rank cutoff for ``*.scored`` files.

    File names are expected to look like ``<major>-<rep>.<ext>``.  Each file
    must hold exactly one positive; its rank among all scores (highest score
    gets rank 1) is collected per ``rep``.

    Args:
        root: directory that contains the ``*.scored`` files.
        print_all: if true, print one count column per rep; otherwise print
            the count averaged over reps.

    Raises:
        AssertionError: if a file does not hold exactly one positive, or if
            the reps do not all have the same number of files.
        ValueError: if a file name does not contain exactly one ``-`` before
            its first ``.``.
    """
    cum_scores = defaultdict(list)
    for filename in glob("%s/*.scored" % root):
        id_ = os.path.basename(filename).split('.')[0]
        _major, rep = id_.split('-')
        pos, neg = read_scores(filename)
        scores = concatenate((pos, neg))
        # Negate so that the largest score receives rank 1.
        ranks = rankdata(-scores)
        assert len(pos) == 1
        # The single positive was concatenated first.
        cum_scores[rep].append(ranks[0])
    reps = len(cum_scores)
    assert len(set(map(len, cum_scores.values()))) == 1
    sys.stderr.write("Found %d reps per iter...\n" % reps)
    # Rows are files within a rep, columns are reps (sorted by name).
    ranks = array([cum_scores[rep] for rep in sorted(cum_scores)]).transpose()
    cutoffs = range(1, ranks.shape[0] + 1)
    if print_all:
        print('#cutoff\trecall...')
        for cutoff in cutoffs:
            counts = (ranks <= cutoff).sum(axis=0)
            print('%d\t%s' % (cutoff,
                              '\t'.join(['%.1f' % count for count in counts])))
    else:
        print('#cutoff\trecall')
        for cutoff in cutoffs:
            count = float((ranks <= cutoff).sum()) / reps
            print('%d\t%.1f' % (cutoff, count))
def kde_privacy(self, x_train, x_test, y_train, y_test):
    """Rank-based privacy statistics from per-label kernel density estimates.

    One KDE per label is obtained from ``self.create_kdes``.  For every test
    point the labels are ranked by estimated density (rank 1 = highest) and
    the rank of the true label is recorded.

    Args:
        x_train, y_train: forwarded to ``self.create_kdes``.
        x_test: test points; ``x_test.T`` is passed to each KDE's
            ``evaluate``.
        y_test: true labels of the test points; they must be keys of the
            mapping returned by ``self.create_kdes``.

    Returns:
        list: ``[num_labels, opt_bayes, mean(rank), std(rank),
        mean(log rank), std(log rank)]`` where ``opt_bayes`` is the fraction
        of test points whose true label is not ranked first.
    """
    num_test = y_test.shape[0]
    kdes = self.create_kdes(x_train, y_train)
    p_vals = []
    maps = {}  # label -> row index in p_vals
    for row, (label, kde) in enumerate(kdes.items()):
        p_vals.append(kde.evaluate(x_test.T))
        maps[label] = row
    # num_label(num_kde) * num_data  p(i,j)=p(data j comes from kde i)
    p_vals = np.asarray(p_vals)
    num_labels = p_vals.shape[0]
    probs = p_vals / np.max(p_vals, 0)
    # rankdata ranks ascending; flip so the most likely label gets rank 1.
    # ``axis`` is passed by keyword because the second positional parameter
    # of scipy.stats.rankdata is ``method``, not ``axis``.
    ranks = 1 + num_labels - rankdata(probs, axis=0)
    trueranks = np.zeros(num_test)
    for i in range(num_test):
        trueranks[i] = ranks[maps[y_test[i]], i]
    rank_priv = trueranks
    log_rank_priv = np.log(rank_priv)
    # float() keeps this a true division on every Python version.
    opt_bayes = 1 - np.count_nonzero(trueranks == 1) / float(num_test)
    priv = [
        num_labels, opt_bayes,
        np.mean(rank_priv), np.std(rank_priv),
        np.mean(log_rank_priv), np.std(log_rank_priv)
    ]
    return priv
def cdf(self, x, c=0.0, sigma=1.0, theta=0.0, nu=1.0):
    """
    Cumulative distribution function at x of the VarianceGamma distribution.

    The pdf is integrated piecewise: ``quad`` is used between consecutive
    break points returned by ``self._breaks`` and a 101-point trapezoidal
    rule from the nearest break point to each quantile.

    Parameters
    ----------
    x : array_like
        quantiles
    c, sigma, theta, nu : array_like
        The shape parameter(s) for the distribution (see docstring of the
        instance object for more information) (default=0, 1, 0, 1)

    Returns
    -------
    cdf : ndarray
        Cumulative distribution function evaluated at x

    Raises
    ------
    ValueError
        If ``sigma`` or ``nu`` is not positive.
    """
    if sigma <= 0 or nu <= 0:
        raise ValueError("The value of sigma and nu must be positive")
    x = np.atleast_1d(x)
    # Work on sorted quantiles; the original order is restored on return.
    xs = np.sort(x)
    # Get the breaks and add -inf and inf
    xi = self._breaks(c, sigma, theta, nu)
    xi = np.concatenate([[-inf], xi, [inf]])
    # Evaluate cdf at break points
    # int_xi accumulates the integral of the pdf between successive breaks.
    int_xi = np.zeros(1)
    for j in range(1, xi.size - 3):
        int_xi = np.append(
            int_xi,
            quad(self.pdf, xi[j], xi[j + 1],
                 args=(c, sigma, theta, nu))[0] + int_xi[j - 1])
    # Create masks that contains index of a range
    # mx[k] is the index j of the interval [xi[j], xi[j + 1]) holding xs[k].
    mx = np.full(x.shape, xi.size - 2, dtype=np.int32)
    for j in range(xi.size - 1):
        mx[(xs >= xi[j]) & (xs < xi[j + 1])] = j
    # Integrate
    # Each row pairs a quantile with the lower break of its interval.
    interval = np.sort(np.stack((xi[mx], xs), axis=-1))
    xint = np.full(x.shape + (101, ), nan)
    for index in np.ndindex(xs.shape):
        # Rows with an infinite bound keep their NaN grid; their result is
        # overwritten in the accumulation step below.
        if not isinf(interval[index][0]) and not isinf(interval[index][1]):
            xint[index] = np.linspace(interval[index][0], interval[index][1],
                                      xint.shape[-1])
    yint = self.pdf(xint, c, sigma, theta, nu)
    resint = trapz(yint, xint)
    # Accumulate
    resint[mx == 0] = 0
    for j in range(1, xi.size - 2):
        resint[mx == j] = int_xi[j - 1] + resint[mx == j]
    resint[mx == (xi.size - 2)] = 1
    # Undo the sort: the rank of each original x selects its sorted result.
    return resint.flatten()[rankdata(x).astype(int) - 1].reshape(
        resint.shape)
def ranking_filter(score):
    """Turn scores into centred, rank-based positions (lecture 4 slide 35).

    Valid scores are ranked, the mean rank is subtracted and the result is
    scaled so that the absolute positions sum to 2.  Invalid (NaN/inf)
    scores get position 0.

    Args:
        score: array of scores.

    Returns:
        ndarray of positions, same length as ``score``.
    """
    ranks = mstats.rankdata(np.ma.masked_invalid(score))
    # mstats.rankdata reports masked entries with rank 0.
    ranks[ranks == 0] = np.nan
    # Count exactly the entries that were ranked.  masked_invalid also drops
    # +/-inf, so counting only non-NaN scores would shift the mean rank.
    n = np.sum(~np.isnan(ranks))
    pos = np.nan_to_num(ranks - np.nansum(ranks) / n)
    pos /= np.nansum(np.abs(pos)) / 2
    return pos
def long_ranking_filter(score):
    """Turn scores into long-only weights proportional to their rank.

    Valid scores are ranked and every rank is divided by the sum of all
    ranks, so the weights sum to 1.  Invalid (NaN/inf) scores get weight 0.
    If ranking fails, equal weights ``1 / len(score)`` are returned.

    Args:
        score: array of scores.

    Returns:
        ndarray of weights, same length as ``score``.
    """
    try:
        ranks = mstats.rankdata(np.ma.masked_invalid(score))
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit still propagate.
        return score * 0 + 1.0 / len(score)
    # mstats.rankdata reports masked entries with rank 0.
    ranks[ranks == 0] = np.nan
    pos = np.nan_to_num(ranks / np.nansum(ranks))
    return pos
def _get_rank_for_one_correct_label(self, prob, y): ranks = rankdata(-prob, axis=1) inverse_ranks = self.num_community - ranks target_inverse_ranks = inverse_ranks * y best_target_inverse_ranks = target_inverse_ranks.max(axis=1) best_target_ranks = self.num_community - best_target_inverse_ranks rounded_best_target_ranks = np.rint(best_target_ranks).astype(int) return rounded_best_target_ranks
def evaluate_one_comparison(data, prior_count, pair, rel_rank_ZV):
    """
    Internal function: Evaluates the rank score of all proteins in one
    comparison.

    The rank score is small for proteins with large positive logarithmic
    fold change values and proteins whose count changes from zero to a
    non-zero value. The rank score is close to one for proteins with large
    negative logarithmic fold change values and proteins whose count changes
    from a non-zero value to zero.

    Parameters
    ----------
    data : ndarray
        NumPy array with the analyzed protein counts.
    prior_count : float
        This count is added to all actual protein counts when the
        logarithmic fold changes are computed. It attenuates the impact of
        proteins with low counts.
    pair : list with two integers
        The integers define the columns involved in the comparison. The
        first element is the reference column, the second element is the
        column with a modified condition.
    rel_rank_ZV : float
        Score given to proteins whose count goes from zero to a non-zero
        value; proteins going from non-zero to zero get ``1 - rel_rank_ZV``.

    Returns
    -------
    s : ndarray with the rank score of all proteins (for the given
        comparison); -1 marks proteins that are zero in both columns
    lfc : ndarray with the logarithmic fold change for all proteins (for
        the given comparison); NaN unless both counts are positive
    """
    ref_col, mod_col = pair[0], pair[1]
    ref_counts = data[:, ref_col]
    mod_counts = data[:, mod_col]

    # V = positive count, Z = zero count; first letter is the reference column.
    VV_proteins = np.flatnonzero((ref_counts > 0) & (mod_counts > 0))
    VZ_proteins = np.flatnonzero((ref_counts > 0) & (mod_counts == 0))
    ZV_proteins = np.flatnonzero((ref_counts == 0) & (mod_counts > 0))
    ZZ_proteins = np.flatnonzero((ref_counts == 0) & (mod_counts == 0))

    n_proteins = data.shape[0]
    # The fold change is only defined where both counts are positive.
    lfc_vals = np.full(n_proteins, np.nan)
    for row in VV_proteins:
        ratio = float(prior_count + data[row, mod_col]) / (
            prior_count + data[row, ref_col])
        lfc_vals[row] = np.log2(ratio)

    # the largest lfc values get the smallest rank; inf and nan entries get
    # zero rank
    rank_lfc = mstats.rankdata(np.ma.masked_invalid(-lfc_vals))

    score_vector = np.full(n_proteins, -1.0)
    score_vector[VV_proteins] = (rank_lfc[VV_proteins] - 0.5) / VV_proteins.size
    score_vector[ZV_proteins] = rel_rank_ZV
    score_vector[VZ_proteins] = 1 - rel_rank_ZV
    # by setting negative score here, we mark that this is a ZZ pair
    score_vector[ZZ_proteins] = -1
    return score_vector, lfc_vals
def nanrankdata(arr):
    '''
    Ranks data ignoring invalid (NaN/inf) values.

    Invalid entries are returned as NaN; if every entry is NaN a copy of
    the input is returned unchanged.
    '''
    if np.isnan(arr).all():
        return arr.copy()
    ranks = mstats.rankdata(np.ma.masked_invalid(arr))
    # mstats.rankdata reports masked entries with rank 0.
    return np.where(ranks == 0, np.nan, ranks)
def testMcmcCoplulaFit(self):
    """Fit a Gaussian copula by MLE and by MCMC and compare the results.

    The two columns of ``stocks.csv`` are rank-transformed, the copula is
    fitted with both methods, and the MCMC estimate is checked against the
    MLE estimate and against the expected value 0.7387 (within ``tol``).
    """
    print(
        "--------------------- MCMC COPULA FIT TEST --------------------------"
    )
    # Load matlab data set
    stocks = np.loadtxt(dataDir + 'stocks.csv', delimiter=',')
    x, y = stocks[:, 0], stocks[:, 1]

    # Rank transform the data
    u = rankdata(x) / (len(x) + 1)
    v = rankdata(y) / (len(y) + 1)

    # Fit the gaussian copula with both estimators
    thetag0 = [0.2]
    rho_bounds = ((-0.99, 0.99), )
    g_copula = gc()
    theta_mle = g_copula.fitMLE(u, v, *thetag0, bounds=rho_bounds)[0]
    aic_mle = g_copula._AIC(u, v, 0, *theta_mle)
    theta_mcmc = g_copula.fitMcmc(
        u, v, *thetag0, bounds=rho_bounds, ngen=500, nburn=200)[0]
    aic_mcmc = g_copula._AIC(u, v, 0, *theta_mcmc)
    print("Gaussian copula MLE paramter [rho]: " + str(theta_mle) +
          " AIC =" + str(aic_mle))
    print("Gaussian copula MCMC paramter [rho]: " + str(theta_mcmc) +
          " AIC =" + str(aic_mcmc))

    # check MLE and MCMC solution are the same in this case
    self.assertAlmostEqual(theta_mle[0], theta_mcmc[0], delta=tol)
    # check againt expected
    true_rho_ranked = 0.7387
    self.assertAlmostEqual(theta_mcmc[0], true_rho_ranked, delta=tol)
def friedman(results, alpha=0.05):
    """
    Performs the Friedman test on the given results determining if there is
    a difference between configurations

    results: list of list of numbers representing results for parameters
             for a number of problems (one row per problem, one column per
             configuration)
    alpha: 1 - confidence of outcome

    Returns the test statistic and the chi-square critical value with
    ``n - 1`` degrees of freedom, ``n`` being the number of configurations.
    """
    ranks = rankdata(array(results), axis=1)
    k, n = ranks.shape
    # Rank total of every configuration versus its expectation under H0.
    column_totals = ranks.sum(axis=0)
    expected_total = k * (n + 1) / 2.0
    between = ((column_totals - expected_total) ** 2).sum()
    within = (ranks ** 2 - (n + 1) * (n + 1) / 4.0).sum()
    T = (n - 1) * between / within
    critical_value = chi2.ppf(1 - alpha, n - 1)
    return T, critical_value
def select_attributes(data, z_threshold=1):
    """
    Function selects "over"-expressed attributes for items with Mann-Whitney
    U test.

    Parameters
    ----------
    data : Orange.data.Table
        Tabular data with gene expressions
    z_threshold : float
        The threshold for selecting the attribute. For each item the
        attributes with z-value above this value are selected.

    Returns
    -------
    :obj:`list`
        Sets of selected attributes (Entrez IDs as strings) for each cell
    Table
        z-values of every item/attribute.  Two empty lists are returned
        instead when the data has at most one row.
    """
    if len(data.X) <= 1:
        return [], []

    # rank data within every attribute
    ranked = rankdata(data.X, axis=0)

    # compute U, mu, sigma
    n = ranked.shape[0]
    n2 = n - 1
    mu = n2 / 2
    sigma = np.zeros(ranked.shape[1])
    for col in range(ranked.shape[1]):
        # Sizes of the tie groups feed the tie correction of the variance.
        counts = np.unique(ranked[:, col], return_counts=True)[1]
        tie_term = np.sum((counts ** 3 - counts)) / (n * (n - 1))
        sigma[col] = np.sqrt(1 * n2 / 12 * ((n + 1) - tie_term))

    # compute z; the small constant guards against division by zero
    z = ((ranked - 1) - mu) / (sigma + 1e-16)

    # gene selection
    entrez_ids = np.array(
        [attr.attributes.get("Entrez ID") for attr in data.domain.attributes])
    attributes_sets = []
    for row in z:
        selected = set(entrez_ids[row > z_threshold]) - {None}
        # map to string was added since there seems to be no guarantee that
        # Entrez ID is a string.
        attributes_sets.append(set(map(str, selected)))

    # pack z values to data table
    z_table = Table(Domain([x for x in data.domain.attributes]), z)
    return attributes_sets, z_table
def rank(self, method=0):
    """!
    @brief rank transform the data; results are stored in self.u and self.v
    @param method <b>int</b>
           if == 0: use standard rank transform,
           else: use CDF data transform (gaussian KDE integrated up to each
           sample).
    """
    self.rankMethod = method
    if method != 0:
        # use alternate CDF rank transform method
        kde_x = gaussian_kde(self.x)
        kde_y = gaussian_kde(self.y)
        u_hat = np.zeros(len(self.x))
        v_hat = np.zeros(len(self.y))
        # Samples are processed pairwise; entries beyond the shorter input
        # stay zero.
        pairs = list(zip(self.x, self.y))
        u_hat[:len(pairs)] = [
            kde_x.integrate_box_1d(-np.inf, xp) for xp, _ in pairs]
        v_hat[:len(pairs)] = [
            kde_y.integrate_box_1d(-np.inf, yp) for _, yp in pairs]
        self.u = u_hat
        self.v = v_hat
        return
    # Dividing by n + 1 keeps the transformed values strictly below 1.
    self.u = rankdata(self.x) / (len(self.x) + 1)
    self.v = rankdata(self.y) / (len(self.y) + 1)
def generate_correlation_map(x: np.ndarray, y: np.ndarray, method: str = 'pearson') -> np.ndarray:
    """
    Correlate each row in matrix X against each row in matrix Y.

    Parameters
    ----------
    x
        Shape N X T.
    y
        Shape M X T.
    method
        Method use to compute the correlation. Must be one of 'pearson' or
        'spearman' (case-insensitive)

    Returns
    -------
    np.array
        N X M array in which each element is a correlation coefficient.

    Raises
    ------
    NotImplementedError
        If ``method`` is neither 'pearson' nor 'spearman'.
    ValueError
        If x and y differ in their number of columns.
    """
    mode = method.lower()
    if mode not in ['spearman', 'pearson']:
        raise NotImplementedError(
            f'Method {method} not understood, must be one of "pearson", "spearman"'
        )
    if mode == 'spearman':
        # Spearman is Pearson computed on the within-row ranks.
        x, y = rankdata(x, axis=1), rankdata(y, axis=1)
    mean_x, mean_y = x.mean(axis=1), y.mean(axis=1)
    n = x.shape[1]
    if n != y.shape[1]:
        raise ValueError('x and y must have the same number of timepoints.')
    # With ddof = n - 1 the divisor is 1, so ``std`` yields the square root
    # of the summed squared deviations, matching the unnormalised
    # cross-deviation sums computed below.
    dof = n - 1
    norm_x = x.std(axis=1, ddof=dof)
    norm_y = y.std(axis=1, ddof=dof)
    cross = np.dot(x, y.T) - n * np.outer(mean_x, mean_y)
    return cross / np.outer(norm_x, norm_y)
def rank_all_stocks(self, metrics, update=True):
    """Rank every stock for each requested fundamental indicator.

    Reads ``Statistics_<sector>.csv`` (first row: indicator names, first
    column: stock identifiers), ranks the stocks for every indicator listed
    in ``metrics`` and writes the result to ``Rank_<sector>.csv`` with one
    row per stock.  Invalid values (NaN/inf) get a NaN rank.  For indicators
    marked ``'H'`` in ``best`` the ranking is reversed so that the highest
    value gets rank 1.

    Args:
        metrics: names of the indicators to rank.
        update: if true, regenerate the statistics file first with
            ``self.export_csv_data(metrics)``.

    Returns:
        None.  If the statistics file cannot be opened a message is printed
        and nothing is written.
    """
    if update:
        self.export_csv_data(metrics)
    try:
        with open('Statistics_{}.csv'.format(self.sector), 'r') as f:
            Stats = list(csv.reader(f, delimiter=','))
    except (IOError, OSError):
        print('Cannot find file...probably does not exists in the directory')
        # Without the statistics there is nothing to rank.
        return

    # First output column: the header cell followed by the stock identifiers.
    ranks = [c[0] for c in Stats]
    for i1, name in enumerate(Stats[0]):
        if name not in metrics:
            continue
        values = np.array([float(c[i1]) for c in Stats[1:]])
        rank = mstats.rankdata(np.ma.masked_invalid(values))
        # mstats.rankdata reports masked entries with rank 0.
        rank[rank == 0] = np.nan
        # The rank depends on if we want the value of the fundamental
        # indicator to be high or low
        if name in best and best[name] == 'H':
            rank = np.nanmax(rank) + 1 - rank
        # We stack the ranks for all fundamental indicators in a matrix
        ranks = np.vstack((ranks, [name] + list(rank)))

    # We extract the rank of each stock for each fundamental indicator in a
    # csv file named 'Rank_?.csv', where "?" is the name of the sector
    with open('Rank_{}.csv'.format(self.sector), "w", newline='') as f:
        writer = csv.writer(f)
        # atleast_2d keeps one name per row even when no indicator matched.
        writer.writerows(np.transpose(np.atleast_2d(ranks)))
def test_ranking(self): x = ma.array([ 0, 1, 1, 1, 2, 3, 4, 5, 5, 6, ]) assert_almost_equal(mstats.rankdata(x), [1, 3, 3, 3, 5, 6, 7, 8.5, 8.5, 10]) x[[3, 4]] = masked assert_almost_equal(mstats.rankdata(x), [1, 2.5, 2.5, 0, 0, 4, 5, 6.5, 6.5, 8]) assert_almost_equal(mstats.rankdata(x, use_missing=True), [1, 2.5, 2.5, 4.5, 4.5, 4, 5, 6.5, 6.5, 8]) x = ma.array([ 0, 1, 5, 1, 2, 4, 3, 5, 1, 6, ]) assert_almost_equal(mstats.rankdata(x), [1, 3, 8.5, 3, 5, 7, 6, 8.5, 3, 10]) x = ma.array([[0, 1, 1, 1, 2], [ 3, 4, 5, 5, 6, ]]) assert_almost_equal(mstats.rankdata(x), [[1, 3, 3, 3, 5], [6, 7, 8.5, 8.5, 10]]) assert_almost_equal(mstats.rankdata(x, axis=1), [[1, 3, 3, 3, 5], [1, 2, 3.5, 3.5, 5]]) assert_almost_equal(mstats.rankdata(x, axis=0), [[1, 1, 1, 1, 1], [ 2, 2, 2, 2, 2, ]])
def test_ranking(self): x = ma.array([0, 1, 1, 1, 2, 3, 4, 5, 5, 6]) assert_almost_equal(mstats.rankdata(x), [1, 3, 3, 3, 5, 6, 7, 8.5, 8.5, 10]) x[[3, 4]] = masked assert_almost_equal(mstats.rankdata(x), [1, 2.5, 2.5, 0, 0, 4, 5, 6.5, 6.5, 8]) assert_almost_equal(mstats.rankdata(x, use_missing=True), [1, 2.5, 2.5, 4.5, 4.5, 4, 5, 6.5, 6.5, 8]) x = ma.array([0, 1, 5, 1, 2, 4, 3, 5, 1, 6]) assert_almost_equal(mstats.rankdata(x), [1, 3, 8.5, 3, 5, 7, 6, 8.5, 3, 10]) x = ma.array([[0, 1, 1, 1, 2], [3, 4, 5, 5, 6]]) assert_almost_equal(mstats.rankdata(x), [[1, 3, 3, 3, 5], [6, 7, 8.5, 8.5, 10]]) assert_almost_equal(mstats.rankdata(x, axis=1), [[1, 3, 3, 3, 5], [1, 2, 3.5, 3.5, 5]]) assert_almost_equal(mstats.rankdata(x, axis=0), [[1, 1, 1, 1, 1], [2, 2, 2, 2, 2]])
def calc_percentile(values):
    """Calculates the percentile values for a sample of numbers.

    NOTE: There are multiple ways to calculate the percentile. The problem
    lies in how to deal with duplicate entries. We use the simplest
    definition: the (zero-based) rank of an entry divided by the total
    number of entries, where entries with the same value all receive the
    mean of their ranks. The calculated percentiles are in [0, 100).
    """
    zero_based_ranks = rankdata(values) - 1
    fraction = zero_based_ranks / len(values)
    return fraction * 100
def __prepare_inputs(self, inputs): scaled_inputs = self.scaler.transform(inputs) standardized_inputs = self.standard_scaler.transform(inputs) outputs = np.concatenate((scaled_inputs, standardized_inputs), axis=1) if self.log and np.sum(inputs <= 0) == 0: outputs = np.concatenate((outputs, np.log(inputs)), axis=1) if self.sqrt and np.sum(inputs <= 0) == 0: outputs = np.concatenate((outputs, np.sqrt(inputs)), axis=1) if self.square: outputs = np.concatenate((outputs, np.square(inputs)), axis=1) if self.percentile: outputs = np.concatenate((outputs, rankdata(inputs, axis=0) / len(inputs)), axis=1) inputs = outputs.copy() return inputs
def get_ranks(np_array):
    """Return zero-based ranks of the valid entries of ``np_array``.

    Invalid entries (NaN/inf) are excluded from the ranking and reported as
    NaN.  Two summary lines are printed.

    Args:
        np_array: array of floats.

    Returns:
        ndarray of ranks starting at 0, NaN for invalid entries.
    """
    n_valid = np_array.size - np.count_nonzero(np.isnan(np_array))
    print("original array has {0} total and {1} valid elements".format(
        np_array.size, n_valid))
    rank_array = mstats.rankdata(np.ma.masked_invalid(np_array))
    # mstats.rankdata reports masked entries with rank 0.
    rank_array[rank_array == 0] = np.nan
    # Shift to zero-based ranks.
    rank_array -= 1
    print("rank array has {0} max rank and {1} valid ranks".format(
        np.nanmax(rank_array),
        rank_array.size - np.count_nonzero(np.isnan(rank_array))))
    return rank_array
def rank(self, method=0):
    """!
    @brief Compute ranks of the data; also stored in self.UU and self.VV
    @param method <b>int</b>
           if == 0: use standard rank transform,
           else: use CDF data transform (gaussian KDE integrated up to each
           sample).
    @return (u, v) tuple of <b>np_1darray</b> ranked samples
    """
    def standard_transform(samples):
        # Dividing by n + 1 keeps the transformed values strictly below 1.
        return rankdata(samples) / (len(samples) + 1)

    if method == 0:
        u, v = standard_transform(self.x), standard_transform(self.y)
    else:
        # use alternate CDF rank transform method
        kde_x = gaussian_kde(self.x)
        kde_y = gaussian_kde(self.y)
        u = np.zeros(len(self.x))
        v = np.zeros(len(self.y))
        # Samples are processed pairwise; entries beyond the shorter input
        # stay zero.
        for pos, (x_val, y_val) in enumerate(zip(self.x, self.y)):
            u[pos] = kde_x.integrate_box_1d(-np.inf, x_val)
            v[pos] = kde_y.integrate_box_1d(-np.inf, y_val)
    self.UU, self.VV = u, v
    return u, v
def get_ranks(np_array):
    """Return zero-based ranks of the valid entries of ``np_array``.

    Invalid entries (NaN/inf) are excluded from the ranking and reported as
    NaN.  Two summary lines are printed.

    Args:
        np_array: array of floats.

    Returns:
        ndarray of ranks starting at 0, NaN for invalid entries.
    """
    print("original array has {0} total and {1} valid elements".format(
        np_array.size,
        np_array.size - np.count_nonzero(np.isnan(np_array))))
    rank_array = mstats.rankdata(np.ma.masked_invalid(np_array))
    # mstats.rankdata reports masked entries with rank 0.
    rank_array[rank_array == 0] = np.nan
    # Shift to zero-based ranks.
    rank_array -= 1
    n_ranked = rank_array.size - np.count_nonzero(np.isnan(rank_array))
    print("rank array has {0} max rank and {1} valid ranks".format(
        np.nanmax(rank_array), n_ranked))
    return rank_array
def post_hoc(results, alpha, stat): ''' Performs a post-hoc test on the given results to determine the index configurations which are not statistically worse than the best configuration results: list of list of numbers representing results for parameters for a number of problems alpha: 1 - confidence of outcome stat: statistic obtained from friedman test ''' ranks = rankdata(array(results), axis=1) (k,n) = ranks.shape rank_sum = list(sum(ranks)) best = min(rank_sum) rhs = ((2*k*(1-stat/(k*(n-1)))*sum(sum(ranks**2 - (n+1)*(n+1) / 4.)))/((k-1)*(n-1)))**0.5 * t.ppf(1-alpha/2, n-1) return [rank_sum.index(i) for i in rank_sum if abs(best - i) < rhs]
def _mannwhitneyu(x, y, use_continuity=True): """ Computes the Mann-Whitney statistic Missing values in `x` and/or `y` are discarded. Parameters ---------- x : ndarray, Input, vector or observations x features matrix y : ndarray, Input, vector or observations x features matrix. If matrix, must have same number of features as x use_continuity : {True, False}, optional Whether a continuity correction (1/2.) should be taken into account. Returns ------- statistic : float The Mann-Whitney statistic approx z : float The normal-approximated z-score for U. pvalue : float Approximate p-value assuming a normal distribution. """ if x.ndim == 1 and y.ndim == 1: x, y = x[:, np.newaxis], y[:, np.newaxis] ranks = rankdata(np.concatenate([x, y]), axis=0) nx, ny = x.shape[0], y.shape[0] nt = nx + ny U = ranks[:nx].sum(0) - nx * (nx + 1) / 2. mu = (nx * ny) / 2. u = np.amin([U, nx*ny - U], axis=0) # get smaller U by convention sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12. for i in np.arange(len(sigsq)): ties = count_tied_groups(ranks[:, i]) sigsq[i] -= np.sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12. sigsq *= nx * ny / float(nt * (nt - 1)) if use_continuity: z = (U - 1 / 2. - mu) / np.sqrt(sigsq) else: z = (U - mu) / np.sqrt(sigsq) prob = erfc(abs(z) / np.sqrt(2)) return np.vstack([u, z, prob]).T
def empirical_copula_uniform(x):
    '''
    Evaluate the empirical copula-uniform dual representation of x as
    rank(x)/n.

    NaN entries are ignored: the remaining values of a column are ranked
    among themselves and divided by the number of non-NaN values in that
    column; NaN positions stay NaN in the output.

    Parameters
    ----------
    x : (n, d) np.array
        n i.i.d. draws from a d-dimensional distribution.

    Returns
    -------
    np.array
        Array of floats with the same shape as x.
    '''
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x.copy()
    # Treat every trailing dimension as an independent column.
    columns = x.reshape(x.shape[0], -1)
    u = np.full(columns.shape, np.nan)
    for j in range(columns.shape[1]):
        valid = ~np.isnan(columns[:, j])
        n_valid = valid.sum()
        if n_valid:
            # Rank only the valid values: recent SciPy versions propagate
            # NaN through rankdata, which would blank the whole column.
            u[valid, j] = rankdata(columns[valid, j]) / n_valid
    return u.reshape(x.shape)
def _correlation_star(data: np.ndarray, method: str) -> np.ndarray: """ Calculates the correlation* ((S* i,j) or (R* i,j)) similarity component of the YS1 and YR1 dissimilarity metrics. \ For every pair of samples (i,j), returns (corr(i,j) + 1) / 2, \ where corr is either the Pearson correlation or Spearman correlation. :param data: an n-by-p numpy array of n samples by p features, to calculate pairwise distance on. :type data: np.ndarray :param method: the correlation metric to use when calculating correlation* :type method: 'pearson' or 'spearman' :return: an n-by-n numpy array of correlation* similarity scores. :rtype: np.ndarray """ assert isinstance(method, str), f"'method' must be a string. Instead got {type(method)}." method = method.lower() assert method in {'spearman', 'pearson'}, f"'method' must be 'spearman' or 'pearson'. Instead got '{method}'." if method == 'spearman': return (np.corrcoef(rankdata(data, axis=1)) + 1) / 2 return (np.corrcoef(data) + 1) / 2
def ecdf(data):
    '''
    Computes the empirical CDF of a data array.

    The CDF at a value is the fraction of observations less than or equal
    to it, so tied observations contribute their full count.  Masked
    entries of a masked array are ignored.

    Parameters
    ----------
    data: array of numbers
        The data to compute the cdf

    Returns
    -------
    Two arrays corresponding to the x axis (sorted distinct values) and the
    y axis (CDF at those values).
    '''
    obs = np.asanyarray(data)
    if isinstance(obs, np.ma.MaskedArray):
        samples = obs.compressed()
    else:
        samples = obs.ravel()
    return_val_x, counts = np.unique(samples, return_counts=True)
    # Cumulative counts give the number of observations <= each value;
    # average ranks would understate the CDF at tied values.
    return_val_y = np.cumsum(counts) / float(samples.size)
    return (return_val_x, return_val_y)
def mann_whitney_u(x, y):
    """Mann-Whitney U test with a tie-corrected normal approximation.

    Args:
        x, y: the two samples (array-like).

    Returns:
        tuple: ``(u1, u2, z, p)`` with the U statistic of ``x``, the U
        statistic of ``y``, the z-score of the smaller U (0 when all values
        are identical) and the one-sided p-value ``norm.sf(|z|)``.
    """
    first, second = asarray(x), asarray(y)
    n1, n2 = len(first), len(second)
    ranked = rankdata(np.concatenate((first, second)))
    # The x values were concatenated first, so their ranks come first.
    rank_sum_x = np.sum(ranked[:n1], axis=0)
    pair_count = n1 * n2
    u1 = pair_count + (n1 * (n1 + 1)) / 2.0 - rank_sum_x  # calc U for x
    u2 = pair_count - u1  # remainder is U for y

    T = tiecorrect(ranked)  # correction factor for tied scores
    if T == 0:
        # All numbers are identical: no evidence either way.
        z = 0
    else:
        sd = np.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0)
        # normal approximation for prob calc
        z = (min(u1, u2) - pair_count / 2.0) / sd
    return u1, u2, z, distributions.norm.sf(abs(z))
def __init__(self, y, regime, permutations=999):
    """
    Rank the columns of ``y``, evaluate ``self._calc`` for the observed
    regimes and, optionally, build a permutation reference distribution.

    :param y: 2-D array; ranks are computed down each column (``axis=0``)
        and differenced between consecutive columns.
    :param regime: regime label per row of ``y``; passed to ``self._calc``
        as given and in randomly permuted order.
    :param permutations: number of random permutations of ``regime`` to
        evaluate; a falsy value skips the permutation step entirely.

    Sets ``ranks``, ``ranks_d``, ``regimes``, ``total``, ``max_total``,
    ``theta`` and ``permutations``; with permutations also ``sim``,
    ``pvalue_left``, ``pvalue_right`` and ``z``.
    """
    ranks = rankdata(y, axis=0)
    self.ranks = ranks
    n, k = y.shape
    # Rank change between consecutive columns (column j+1 minus column j).
    ranks_d = ranks[:, 1:] - ranks[:, :-1]
    self.ranks_d = ranks_d
    self.regimes = np.unique(regime)
    self.total = sum(abs(ranks_d))
    self.max_total = sum([abs(i - n + i - 1) for i in range(1, n + 1)])
    self.theta = self._calc(regime)
    self.permutations = permutations
    if permutations:
        # Keep the alias local instead of attaching it to the numpy module.
        permute = np.random.permutation
        sim = np.array([self._calc(permute(regime))
                        for _ in range(permutations)])
        # Make the observed value a single row so it can be stacked on top
        # of the simulated rows; row 0 of ``sim`` is the observed statistic.
        self.theta.shape = (1, len(self.theta))
        sim = np.concatenate((self.theta, sim))
        self.sim = sim
        den = permutations + 1.
        # The observed row is part of ``sim``, so it counts towards the
        # left-tail (<=) proportion but not the right-tail (>) one.
        self.pvalue_left = (sim <= sim[0]).sum(axis=0) / den
        self.pvalue_right = (sim > sim[0]).sum(axis=0) / den
        self.z = (sim[0] - sim.mean(axis=0)) / sim.std(axis=0)
def normalize_filter_quantilemap(v, r):
    """
    Quantile-map ``v`` onto the distribution of ``r`` and filter the result.

    Each value of ``v`` is replaced by its rank divided by ``len(v)``; those
    fractions are the probabilities at which the quantiles of ``r`` are
    evaluated. The mapped values are then passed through ``savitzky_golay``.
    """
    rank_fractions = rankdata(v) / len(v)
    mapped = mquantiles(r, prob=rank_fractions)
    # NOTE(review): the four numeric arguments are forwarded positionally to
    # the savitzky_golay helper defined elsewhere; confirm their meaning there.
    return savitzky_golay(mapped, 5, 2, 1, 1)
#!/home/paulk/software/bin/python from __future__ import division from sys import argv,exit,stderr,stdout from scipy import * from scipy.stats.mstats import rankdata ranks = dict() data = dict() f = open(argv[1]) unwanted = ['I','S','\t'] for row in f: if row[0] in unwanted: continue l = row.strip().split('\t') ranks[l[1]] = rankdata(map(float,l[2:])) f.close() c = 0 for r in ranks: if c > 20: break print "\t".join([r] + map(str,list(ranks[r]))) c += 0
def prioritize_gurobi(input_rasters, output_rank_raster, step=0.05,
                      save_intermediate=False, compress='DEFLATE',
                      ol_normalize=False, weights=None, verbose=False,
                      logger=None):
    """ Solve (multiple) maximum coverage problems for a set of input rasters.

    Create a hierarchical spatial prioritization using Gurobi solver to solve
    multiple optimization problems with different budget levels. Each budget
    level must be in range [0.0, 1.0] and corresponds to the area of interest.
    In other words, value of 0.1 would correspond to best 10% of the
    landscape. Budget levels will be automatically created using step value
    defined by the step arguments. Gurobi solver will then solve each problem
    using the representation levels in input_rasters. All the (binary)
    results are then summed together forming a selection frequency. Finally,
    the selection frequency is rescaled into range [0, 1] forming a rank
    priority raster.

    It is possible to provide a list (vector) of weights for each features.
    These values are used as simple multipliers for each feature when summing
    the values over all features. If provided, the list must match the number
    of input rasters exactly.

    The process exits with status 1 when input_rasters is empty or step
    cannot be converted to float.

    :param input_rasters: List of String paths of input rasters.
    :param output_rank_raster: String path to the rank raster file to be
                               created.
    :param step: numeric value in (0, 1) defining the step length for budget
                 levels.
    :param save_intermediate: should intermediate optimization results be
                              saved?
    :param compress: String compression level used for the output raster.
    :param ol_normalize: Boolean setting OL (Occurrence Level) normalization.
                         NOTE: this value is currently not forwarded; the
                         summation is always requested with OL normalization.
    :param weights: list of weights. Length must match the number of input
                    rasters.
    :param verbose: Boolean indicating how much information is printed out.
    :param logger: logger object to be used.
    """
    # 1. Setup --------------------------------------------------------------

    all_start = timer()
    load_start = timer()

    if not logger:
        logging.basicConfig()
        llogger = logging.getLogger('optimize_gurobi')
        llogger.setLevel(logging.DEBUG if verbose else logging.INFO)
    else:
        llogger = logger

    if len(input_rasters) < 1:
        llogger.error("Input rasters list cannot be empty")
        sys.exit(1)

    # Check inputs
    assert len(input_rasters) > 0, "Input rasters list cannot be empty"
    assert len(output_rank_raster) > 0, "Output raster path cannot be empty"
    try:
        step = float(step)
    except ValueError:
        llogger.error("'step' must be coercible to float")
        sys.exit(1)
    assert step > 0.0 and step < 1.0, "Step argument must be in range (0, 1)"

    # Construct budget levels based on the step provided. The first level is
    # `step` and the last level is 1.0. linspace needs an integer number of
    # samples, so the level count is truncated to an int.
    budget_levels = np.linspace(0.0+step, 1.0, int(1/step))

    # 2. Pre-processing -----------------------------------------------------

    llogger.info(" [** PRE-PROCESSING **]")

    # Create a sum array, i.e. sum all (occurrence level normalized) raster
    # values in input_rasters together. NOTE: sum_raster() returns a masked
    # array.
    # NOTE(review): olnormalize is hard-coded to True, so the ol_normalize
    # argument has no effect -- confirm whether it should be forwarded.
    sum_array_masked = spatutils.sum_raster(input_rasters, olnormalize=True,
                                            weights=weights, logger=llogger)
    # To speed up things, do 2 things: 1) save mask (NoData) and get rid of
    # NoData cells for now, 2) flatten the array.
    (height, width) = sum_array_masked.shape
    # getmaskarray always yields a full boolean array, also when nothing is
    # masked, so the boolean indexing below stays valid.
    mask = ma.getmaskarray(sum_array_masked)
    # Get all the non-masked data as a 1-D array.
    sum_array = ma.compressed(sum_array_masked)

    # Create equal cost array
    cost = np.ones(sum_array.size)

    load_end = timer()
    load_elapsed = round(load_end - load_start, 2)
    llogger.info(" [TIME] Pre-processing took {} sec".format(load_elapsed))

    # 3. Optimize -----------------------------------------------------------

    opt_start = timer()
    llogger.info(" [** OPTIMIZING **]")
    blevels_str = ", ".join([str(level) for level in budget_levels])
    llogger.info(" [NOTE] Target budget levels: {}".format(blevels_str))

    # Construct a ndarray (matrix) that will hold the selection frequency.
    # It is a float array because float selections are added to it in place.
    sel_freq = np.zeros((height, width), dtype=float)

    # Get the raster profile from the input raster files (loop-invariant).
    profile = spatutils.get_profile(input_rasters, logger=llogger)

    # Define budget and optimize_maxcover
    for i, blevel in enumerate(budget_levels):
        no_blevel = i + 1
        prefix = utils.get_iteration_prefix(no_blevel, len(budget_levels))

        budget = blevel * cost.size
        llogger.info("{} Optimizing with ".format(prefix) +
                     "budget level {}...".format(blevel))
        x = optimize_maxcover(cost, budget, sum_array, verbose=verbose,
                              logger=llogger)
        # Create a full (filled with 0s) raster template
        x_selection = np.full((height, width), 0.0)
        # Place the values of result array (it's binary = {0, 1}) into template
        # elements that are False in the original mask
        x_selection[~mask] = x
        # Add the selected elements (planning units) into the selection
        # frequency matrix
        sel_freq += x_selection

        if save_intermediate:
            # Replace the real nodata values with a proper NoData value
            nodata_value = 255
            x_selection[mask] = nodata_value
            # Create a masked array
            output_x = ma.masked_values(x_selection, nodata_value)
            # Construct the output raster name
            btoken = str(blevel).replace('.', '_')
            # Construct a subdir name based on the basename
            output_bname = os.path.basename(output_rank_raster).split('.')[0]
            output_subir = os.path.join(os.path.dirname(output_rank_raster),
                                        output_bname)
            if not os.path.exists(output_subir):
                os.makedirs(output_subir)
            output_raster = os.path.join(output_subir,
                                         "budget_level_{}.tif".format(btoken))
            # Write out the data
            llogger.debug(" Writing intermediate output to " +
                          "{}".format(output_raster))
            profile.update(dtype=rasterio.uint8, compress=compress,
                           nodata=nodata_value)

            with rasterio.open(output_raster, 'w', **profile) as dst:
                dst.write_mask(mask)
                dst.write(output_x.astype(np.uint8), 1)

    opt_end = timer()
    opt_elapsed = round(opt_end - opt_start, 2)
    llogger.info(" [TIME] Optimization took {} sec".format(opt_elapsed))

    # 4. Rank values ---------------------------------------------------------

    post_start = timer()
    llogger.info(" [** POST-PROCESSING **]")
    llogger.info(" [1/3] Ranking selection frequencies")
    # Use 0s from summation as a mask
    rank_array = ma.masked_values(sel_freq, 0.0)
    rank_array = rankdata(rank_array)

    # 5. Rescale data into range [0, 1] --------------------------------------

    llogger.info(" [2/3] Rescaling ranks")
    rank_array = spatutils.normalize(rank_array)

    # 6. Prepare and write output --------------------------------------------

    llogger.info(" [3/3] Writing output to {}".format(output_rank_raster))
    # Replace the real nodata values with a proper NoData value
    nodata_value = -3.4e+38
    rank_array[mask] = nodata_value
    # Create a masked array
    rank_array = ma.masked_values(rank_array, nodata_value)

    profile.update(dtype=rasterio.float64, compress=compress,
                   nodata=nodata_value)

    with rasterio.open(output_rank_raster, 'w', **profile) as dst:
        dst.write_mask(mask)
        dst.write(rank_array.astype(np.float64), 1)

    post_end = timer()
    post_elapsed = round(post_end - post_start, 2)
    llogger.info(" [TIME] Post-processing took {} sec".format(post_elapsed))

    all_end = timer()
    all_elapsed = round(all_end - all_start, 2)
    llogger.info(" [TIME] All processing took {} sec".format(all_elapsed))
def run_full_analysis(self, inputfile, user_params=None):
    """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report)

    The default parameters from ``self.config`` are updated with
    ``user_params``. The input is treated as FASTA when ``Fasta(inputfile)``
    can load it and as BED otherwise.

    The process exits (``sys.exit``) when no genome index is found, when no
    valid background remains, and when no motifs or no significant motifs
    are found.

    :param inputfile: path of the BED or FASTA input file.
    :param user_params: optional dict of parameter overrides.
    """
    self.logger.info("Starting full motif analysis")
    self.logger.info("Using temporary directory {0}".format(mytmpdir()))

    if user_params is None:
        user_params = {}
    params = self.config.get_default_params()
    params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.info("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.info("Using multiprocessing")

    self.params = params
    #self.weird = params["weird_option"]

    background = [x.strip() for x in params["background"].split(",")]

    self.logger.info("Parameters:")
    for param, value in params.items():
        self.logger.info(" %s: %s" % (param, value))

    # Checking input
    self.input_type = "BED"
    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
        self.logger.info("Inputfile is a FASTA file")
        self.input_type = "FASTA"
    except:
        # Leave it to BED.
        # NOTE(review): deliberately broad -- which exceptions Fasta raises
        # for non-FASTA input is not visible here.
        pass

    if self.input_type == "FASTA":
        for bg in background:
            if not bg in FA_VALID_BGS:
                self.logger.info("Input type is FASTA, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in FA_VALID_BGS]

    elif self.input_type == "BED":
        # Does the index_dir exist? #bed-specific
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        if not os.path.exists(index_dir):
            self.logger.error("No index found for genome %s! Has GimmeMotifs been configured correctly and is the genome indexed?" % params["genome"])
            sys.exit(1)

        # is it a valid bed-file etc.
        self._check_input(inputfile) # bed-specific

        # Check for valid background
        for bg in background:
            if not bg in BED_VALID_BGS:
                self.logger.info("Input type is BED, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if len(background) == 0:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    self.max_time = None
    max_time = None
    # Maximum time?
    if params["max_time"]:
        try:
            max_time = float(params["max_time"])
        except (TypeError, ValueError):
            self.logger.info("Could not parse max_time value, setting to no limit")
            self.max_time = None

        # An unparsable value leaves max_time as None and falls through to
        # the "invalid" branch without comparing None to a number.
        if max_time is not None and max_time > 0:
            self.logger.info("Time limit for motif prediction: %0.2f hours" % max_time)
            max_time = 3600 * max_time
            self.max_time = max_time
            self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
        else:
            self.logger.info("Invalid time limit for motif prediction, setting to no limit")
            self.max_time = None
    else:
        self.logger.info("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"], params["fraction"], params["abs_max"], params["use_strand"])

        # Create file for location plots
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        extend = (lwidth - width) / 2

        genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"], ignore_missing=True)

    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        fa = Fasta(self.location_fa)
        seqs = fa.seqs
        lwidth = len(seqs[0])
        all_same_width = all(len(seq) == lwidth for seq in seqs)
        if not all_same_width:
            self.logger.warn("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Map every available tool name to whether it was requested.
    tools = dict([(x.strip(), x in [y.strip() for y in params["tools"].split(",")]) for x in params["available_tools"].split(",")])

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs; input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("Starting motif prediction (%s) using %s" % (analysis, ", ".join([x for x in tools.keys() if tools[x]])))

    # Use the background with the lowest BG_RANK value for significance
    # (the first such background when several share that value).
    bg_file = self.bg_file["fa"][min(background, key=lambda bg: BG_RANK[bg])]
    self.logger.info("Using bg_file %s for significance" % bg_file)
    result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis, params["genome"], params["use_strand"], self.prediction_bg, tools, self.job_server(), logger=self.logger, max_time=self.max_time, fg_file=self.validation_fa, bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("Predicted %s motifs, written to %s" % (len(motifs), self.predicted_pfm))

    if len(motifs) == 0:
        self.logger.info("No motifs found. Done.")
        sys.exit()

    # Write stats output to file
    with open(self.stats_file, "w") as f:
        stat_keys = list(list(result.stats.values())[0].keys())
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        print(result.stats)

        # Iterate over a snapshot: motifs without stats are removed from the
        # live list, and removing while iterating it would skip elements.
        for motif in list(motifs):
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
                motifs.remove(motif)

    self.motifs_with_stats = motifs

    # Per metric, write the best value of each tool and the rank of that
    # value among the tools. Tool names are the motif id prefix before "_".
    with open(self.ranks_file, "w") as f:
        tools = list(dict((m.id.split("_")[0],1) for m in motifs).keys())
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

    #self.logger.debug("RANK: %s" % stat)
    #self.logger.debug("\t".join([str(x) for x in names]))
    #self.logger.debug("\t".join([str(x) for x in vals]))
    #self.logger.debug("\t".join([str(x) for x in rank]))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant, written to %s" % (nsig, self.significant_pfm))

    if nsig == 0:
        self.logger.info("No significant motifs found. Done.")
        sys.exit()

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa, self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm, self.outdir, params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    ### Enable parallel and modular evaluation of results
    # Scan (multiple) files with motifs
    # Define callback functions once scanning is finished:
    #   - ROC plot
    #   - Statistics
    #   - Location plots (histogram)
    #   -

    # Stars
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp, logger=self.logger, job_server=self.server, fg_file = self.validation_fa, bg_file = bg_file)
    p.add_motifs(("Clustering", (pwmfile_to_motifs(self.final_pwm), "","")))
    # Poll until there is a stats entry for every motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    # Three threshold values per statistic, passed to star() for scoring.
    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.info("Creating localization plots")
    motifs = pwmfile_to_motifs(self.final_pwm)
    for motif in motifs:
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

    if not(params["keep_intermediate"]):
        self.logger.info("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(self.tmpdir)

    self.logger.info("Done")
def main(folder):
    """
    Summarise per-tool scores stored as pickles in ``folder`` and plot them.

    Every file in ``folder`` is one tool's pickle holding a ``(reads,
    results)`` pair. ``reads`` maps a filename to a dict with ``"length"``
    and ``"info_content"``; ``results`` maps a filename to a dict whose
    ``"scores"`` entry is a sequence of scores (four are ranked here).

    Prints a tab-separated table with, per tool, the number of tests, the
    mean of each score and the mean rank of each score, then sends a grouped
    box plot of the first two scores to plotly.
    """
    read_scores = {}
    read_ranks = {}
    read_lengths = {}
    read_info_content = {}
    final_scores = {}
    final_ranks = {}
    num_tests = {}
    filenames = []
    num_ranks = 0

    tools = os.listdir(folder)

    # get general info (the read metadata is taken from the first pickle)
    tool_0 = tools[0]
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at folders whose contents are trusted.
    with open(os.path.join(folder, tool_0), 'rb') as pkl_file:
        (reads, results) = pickle.load(pkl_file)
    for filename, values in reads.items():
        read_scores[filename] = {}
        read_ranks[filename] = {}
        read_lengths[filename] = values["length"]
        read_info_content[filename] = values["info_content"]/float(values["length"])
        filenames.append(filename)

    # get specific info
    for tool in tools:
        with open(os.path.join(folder, tool), 'rb') as pkl_file:
            (reads, results) = pickle.load(pkl_file)
        final_scores[tool] = [0] * 4
        final_ranks[tool] = [0] * 4
        num_tests[tool] = len(results)
        for filename, values in results.items():
            read_scores[filename][tool] = values["scores"]
            for i in range(len(values["scores"])):
                final_scores[tool][i] += values["scores"][i]

    for filename, tool_scores in read_scores.items():
        # We only rank on the ones that didn't error for anyone
        if len(tool_scores) < len(tools):
            continue
        num_ranks += 1
        raw_scores = list(tool_scores.items())
        # Rank on (1 - score) so that the highest score gets rank 1.
        ranks = [rankdata([1 - r[1][score] for r in raw_scores]) for score in range(4)]
        for p, (tool, _) in enumerate(raw_scores):
            for score_type in range(4):
                final_ranks[tool][score_type] += ranks[score_type][p]

    print("Tool\ttests\tq\ttc\tcline\tmodeler\tq\ttc\tcline\tmodeler")
    for tool in tools:
        results = [tool, num_tests[tool]]
        results.extend([s/float(num_tests[tool]) for s in final_scores[tool]])
        results.extend([s/float(num_ranks) for s in final_ranks[tool]])
        print("%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % tuple(results))

    # Quintile boundaries of read length and per-base information content;
    # computed here but not used further in this function.
    length_bins = [round(x) for x in stats.mstats.mquantiles([read_lengths[f] for f in filenames], [0, 0.2, 0.4, 0.6, 0.8, 1])]
    info_bins = [round(x, 2) for x in stats.mstats.mquantiles([read_info_content[f] for f in filenames], [0, 0.2, 0.4, 0.6, 0.8, 1])]

    #information_traces = []
    #length_traces = []
    traces = []

    #pp.pprint(read_scores)
    for tool in tools:
        chosen_files = [f for f in filenames if tool in read_scores[f]]
        qscore_data = [read_scores[f][tool][0] for f in chosen_files]
        tc_data = [read_scores[f][tool][1] for f in chosen_files]
        labels = ["Sum of pairs" for _ in range(len(qscore_data))]
        labels.extend(["Total column" for _ in range(len(tc_data))])
        all_data = qscore_data + tc_data
        traces.append(Box(
            y=all_data,
            x=labels,
            name=tool,
            opacity=0.8
        ))

    layout = Layout(
        boxmode='group',
        showlegend=True,
        title="Sum of pairs (SP) and total column (TC) scores for 300 randomly selected PANDIT alignments",
        titlefont = Font(
            size = 20),
        yaxis=YAxis(
            title='Score',
            showgrid=False,
            autorange = True,
            titlefont = Font(
                size = 20)
        ),
        legend = Legend(
            font = Font(
                size = 18
            ),
            yanchor = "bottom",
            xanchor = "left")
    )

    data = Data(traces)
    fig = Figure(data=data, layout=layout)
    # NOTE(review): hard-coded account credentials; these should come from
    # configuration or the environment rather than source code.
    py.sign_in("imogen", "mtf1tawct2")
    py.plot(fig)
plt.plot( [ beginTimeOfCloud , endTimeOfCloud ], [ cloudTopHeight, cloudTopHeight ], 'k' ) plt.title('Radar reflectivity') plt.xlabel('Time [' + time_offset_radar_refl.units + ']') plt.ylabel('Altitude [m]') plt.figure() #plt.show() #pdb.set_trace() # uniformCloudBlock = close-up selection of contiguous cloud values. # Each column is a different altitude. uniformCloudBlock = zeros((lenTimestepRangeCloud,lenLevelRangeCloud)) for col in range(0,lenLevelRangeCloud): uniformCloudBlock[:,col] = rankdata(reflCloudBlock[:,col]) / \ MaskedArray.count(reflCloudBlock[:,col]) uniformCloudBlock = masked_where( uniformCloudBlock == 0, uniformCloudBlock ) # I'm not sure if it's appropriate to rank first, then fill. # So I'm not sure if this is correct. uniformCloudBlockFilled = filled(uniformCloudBlock,fill_value=0) plt.clf() for idx in range(1,5): plt.subplot(2,2,idx) plt.plot(uniformCloudBlockFilled[:,5],uniformCloudBlockFilled[:,idx],'.') plt.title('Copula') plt.figure() #pdb.set_trace()
def main(folder):
    """
    Summarise per-tool scores stored as pickles in ``folder`` and build a plot.

    Every file in ``folder`` is one tool's pickle holding a ``(reads,
    results)`` pair. The keys of ``reads`` are filenames; ``results`` maps a
    filename to a dict whose ``"scores"`` entry is a sequence of scores (two
    are ranked here).

    Prints a tab-separated table with, per tool, the number of tests, the
    mean of each score and the mean rank of each score, then builds a plotly
    box-plot figure of the first score and signs in to plotly.
    """
    read_scores = {}
    read_ranks = {}
    final_scores = {}
    final_ranks = {}
    num_tests = {}
    filenames = []
    num_ranks = 0

    tools = os.listdir(folder)

    # get general info (the filenames are taken from the first pickle)
    tool_0 = tools[0]
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at folders whose contents are trusted.
    with open(os.path.join(folder, tool_0), 'rb') as pkl_file:
        (reads, results) = pickle.load(pkl_file)
    for filename, values in reads.items():
        read_scores[filename] = {}
        read_ranks[filename] = {}
        filenames.append(filename)

    # get specific info
    for tool in tools:
        with open(os.path.join(folder, tool), 'rb') as pkl_file:
            (reads, results) = pickle.load(pkl_file)
        final_scores[tool] = [0] * 2
        final_ranks[tool] = [0] * 2
        num_tests[tool] = len(results)
        for filename, values in results.items():
            read_scores[filename][tool] = values["scores"]
            for i in range(len(values["scores"])):
                final_scores[tool][i] += values["scores"][i]

    #for filename, tool_scores in read_scores.iteritems():
    #    print filename, tool_scores

    for filename, tool_scores in read_scores.items():
        # We only rank on the ones that didn't error for anyone
        if len(tool_scores) < len(tools):
            continue
        num_ranks += 1
        raw_scores = list(tool_scores.items())
        # Rank on (1 - score) so that the highest score gets rank 1.
        ranks = [rankdata([1 - r[1][score] for r in raw_scores]) for score in range(2)]
        for p, (tool, _) in enumerate(raw_scores):
            for score_type in range(2):
                final_ranks[tool][score_type] += ranks[score_type][p]

    print("Tool\ttests\tsp\ttc\tsp\ttc")
    for tool in tools:
        results = [tool, num_tests[tool]]
        results.extend([s/float(num_tests[tool]) for s in final_scores[tool]])
        results.extend([s/float(num_ranks) for s in final_ranks[tool]])
        print("%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f" % tuple(results))

    traces = []

    # Display names for the known tool scripts.
    tool_names = {"clustalo.sh":"Clustal Omega",
                  "clustalw.sh":"Clustal W",
                  "muscle.sh":"MUSCLE",
                  "mafft.sh":"MAFFT L-INS-I",
                  "prank.sh":"PRANK",
                  "prank_f.sh":"PRANK+F",
                  "noah_full.sh":"NOAH (full algorithm)",
                  "noah_basic.sh":"NOAH (chained guide tree, no FB traversal)",
                  "noah_no_fb.sh":"NOAH (guide arc, no FB traversal)",
                  "noah_no_arc.sh":"NOAH (chained guide tree, FB traversal)"
                  }

    for tool in tools:
        chosen_files = [f for f in filenames if tool in read_scores[f]]
        qscore_data = [read_scores[f][tool][0] for f in chosen_files]
        #tc_data = [read_scores[f][tool][1] for f in chosen_files]
        all_data = qscore_data
        traces.append(Box(
            y=all_data,
            # Fall back to the raw file name for tools without a display name.
            name=tool_names.get(tool, tool),
            opacity=0.8
        ))

    layout = Layout(
        showlegend=True,
        title="Sum of pairs (SP) scores",
        titlefont = Font(
            size = 20),
        yaxis=YAxis(
            title='Score',
            showgrid=False,
            autorange = True,
            titlefont = Font(
                size = 20)
        ),
        xaxis=XAxis(
            showticklabels=False),
        legend = Legend(
            font = Font(
                size = 18
            ),
            yanchor = "bottom",
            xanchor = "left")
    )

    data = Data(traces)
    fig = Figure(data=data, layout=layout)
    # NOTE(review): hard-coded account credentials; these should come from
    # configuration or the environment rather than source code.
    py.sign_in("imogen", "mtf1tawct2")
def sort_pars(results, pars):
    '''
    Orders the parameters by performance according to results.

    The entries within each row of results are ranked, the ranks are added
    up over the rows to give one total per column, and pars is returned
    sorted by ascending total (ties fall back to comparing the parameters
    themselves).
    '''
    row_ranks = rankdata(array(results), axis=1)
    totals = list(sum(row_ranks))
    ordered = sorted(zip(totals, pars))
    return [par for _, par in ordered]
def ranktransform(X):
    """
    Replace the values of X by their ranks divided by ``X.shape[0]``.

    Ranks come from ``scipy.stats.mstats.rankdata``, so tied values share
    their average rank.
    """
    from scipy.stats.mstats import rankdata as masked_rankdata

    ranked = masked_rankdata(X)
    n_first_axis = X.shape[0]
    return ranked / n_first_axis