def ssea_run(counts, size_factors, membership, rng, config):
    '''
    counts: numpy array of float values
    size_factors: normalization factors for counts
    membership: int array (0 or 1) with set membership
    rng: RandomState object
    config: Config object
    '''
    # run kernel to generate a range of observed enrichment scores
    resample_rand_seeds = np.empty(config.resampling_iterations, dtype=np.int)
    resample_count_ranks = np.empty((config.resampling_iterations,
                                     counts.shape[0]), dtype=np.int)
    resample_es_vals = np.zeros(config.resampling_iterations, dtype=np.float)
    resample_es_ranks = np.zeros(config.resampling_iterations, dtype=np.int)
    for i in xrange(config.resampling_iterations):
        # save random number generator seed before running kernel
        resample_rand_seeds[i] = rng.seed
        k = ssea_kernel(counts, size_factors, membership, rng,
                        resample_counts=True,
                        permute_samples=False,
                        add_noise=True,
                        noise_loc=config.noise_loc,
                        noise_scale=config.noise_scale,
                        method_miss=config.weight_miss,
                        method_hit=config.weight_hit,
                        method_param=config.weight_param)
        k = KernelResult._make(k)
        resample_count_ranks[i] = k.ranks
        resample_es_vals[i] = k.es_val
        resample_es_ranks[i] = k.es_rank
    # find the median ES value
    median_index = int(config.resampling_iterations / 2)
    median_index = resample_es_vals.argsort()[median_index]
    median_es_val = resample_es_vals[median_index]
    # choose whether to use the positive or negative side of the
    # distribution based on the median ES value
    if median_es_val == 0:
        return Result.default(), np.zeros((0,), dtype=np.float)
    elif median_es_val < 0:
        signfunc = np.less
    else:
        signfunc = np.greater
    # subset to include only the corresponding side of the distribution
    resample_sign_inds = signfunc(resample_es_vals, 0)
    resample_rand_seeds = resample_rand_seeds[resample_sign_inds]
    resample_count_ranks = resample_count_ranks[resample_sign_inds]
    resample_es_vals = resample_es_vals[resample_sign_inds]
    resample_es_ranks = resample_es_ranks[resample_sign_inds]
    # determine the median value
    median_index = int(resample_es_vals.shape[0] / 2)
    median_index = resample_es_vals.argsort()[median_index]
    # select the kernel run representing the median ES value
    rand_seed = resample_rand_seeds[median_index]
    es_val = resample_es_vals[median_index]
    es_rank = resample_es_ranks[median_index]
    ranks = resample_count_ranks[median_index]
    es_sign = cmp(es_val, 0)
    # permute samples and determine ES null distribution; keep drawing
    # permutations until config.perms scores matching the sign of the
    # observed ES have been collected
    null_es_vals = np.zeros(config.perms, dtype=np.float)
    null_es_ranks = np.zeros(config.perms, dtype=np.float)
    i = 0
    while i < config.perms:
        k = ssea_kernel(counts, size_factors, membership, rng,
                        resample_counts=True,
                        permute_samples=True,
                        add_noise=True,
                        noise_loc=config.noise_loc,
                        noise_scale=config.noise_scale,
                        method_miss=config.weight_miss,
                        method_hit=config.weight_hit,
                        method_param=config.weight_param)
        k = KernelResult._make(k)
        if cmp(k.es_val, 0) == es_sign:
            null_es_vals[i] = k.es_val
            null_es_ranks[i] = k.es_rank
            i += 1
    # subset the null ES scores to only positive or negative
    null_sign_inds = signfunc(null_es_vals, 0)
    null_es_vals = null_es_vals[null_sign_inds]
    null_es_ranks = null_es_ranks[null_sign_inds]
    # Adjust for variation in gene set size. Normalize ES(S,null)
    # and the observed ES(S), separately rescaling the positive and
    # negative scores by dividing by the mean of the ES(S,null) to
    # yield normalized scores NES(S,null)
    null_es_mean = np.fabs(null_es_vals.mean())
    null_nes_vals = null_es_vals / null_es_mean
    # Normalize the observed ES(S) by rescaling by the mean of
    # the ES(S,null) separately for positive and negative ES(S)
    nes_val = es_val / null_es_mean
    # estimate nominal p value for S from ES(S,null) by using the
    # positive or negative portion of the distribution corresponding
    # to the sign of the observed ES(S)
    p_value = (np.fabs(null_es_vals) >= np.fabs(es_val)).sum().astype(np.float)
    p_value /= null_es_vals.shape[0]
    # create result object for this SSEA test
    res = Result()
    res.rand_seed = int(rand_seed)
    res.es = round(es_val, FLOAT_PRECISION)
    res.es_rank = int(es_rank)
    res.nominal_p_value = round(p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.nes = round(nes_val, FLOAT_PRECISION)
    # save some of the resampled es points
    res.resample_es_vals = np.around(resample_es_vals[:Result.MAX_POINTS],
                                     FLOAT_PRECISION)
    res.resample_es_ranks = resample_es_ranks[:Result.MAX_POINTS]
    # save null distribution points
    res.null_es_mean = null_es_mean
    res.null_es_vals = np.around(null_es_vals[:Result.MAX_POINTS],
                                 FLOAT_PRECISION)
    res.null_es_ranks = null_es_ranks[:Result.MAX_POINTS]
    # get indexes of hits in this set
    m = membership[ranks]
    hit_inds = (m > 0).nonzero()[0]
    num_hits = hit_inds.shape[0]
    num_misses = m.shape[0] - num_hits
    # calculate leading edge stats
    if es_val < 0:
        core_hits = sum(i >= es_rank for i in hit_inds)
        core_misses = (m.shape[0] - es_rank) - core_hits
    else:
        core_hits = sum(i <= es_rank for i in hit_inds)
        core_misses = 1 + es_rank - core_hits
    null_hits = num_hits - core_hits
    null_misses = num_misses - core_misses
    # fisher exact test (one-sided hypothesis that LE is enriched)
    fisher_p_value = fisher.pvalue(core_hits, core_misses, null_hits,
                                   null_misses).right_tail
    # odds ratio, with explicit handling of zero denominators
    n = np.inf if null_hits == 0 else float(core_hits) / null_hits
    d = np.inf if null_misses == 0 else float(core_misses) / null_misses
    if np.isfinite(n) and np.isfinite(d):
        if n == 0 and d == 0:
            odds_ratio = np.nan
        else:
            odds_ratio = np.inf if d == 0 else (n / d)
    elif np.isfinite(d):
        odds_ratio = np.inf
    else:
        odds_ratio = np.nan if n == 0 else 0.0
    # populate leading-edge contingency table fields of the result
    res.core_hits = int(core_hits)
    res.core_misses = int(core_misses)
    res.null_hits = int(null_hits)
    res.null_misses = int(null_misses)
    res.fisher_p_value = np.round(fisher_p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.odds_ratio = odds_ratio
    # return result and null distribution for subsequent fdr calculations
    return res, null_nes_vals
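
# The NES normalization and nominal p value above can be exercised in
# isolation. The sketch below is illustrative only: it assumes a
# precomputed observed ES and a same-sign null ES distribution (as
# produced by the permutation loop in ssea_run) and reproduces the
# rescale-by-mean and tail-count logic. The helper name
# `normalize_and_score` is hypothetical and not part of SSEA.
import numpy as np

def normalize_and_score(es_val, null_es_vals):
    # rescale by the mean magnitude of the same-sign null distribution
    null_es_mean = np.fabs(null_es_vals.mean())
    nes_val = es_val / null_es_mean
    null_nes_vals = null_es_vals / null_es_mean
    # nominal p value: fraction of null scores at least as extreme as
    # the observed score (same-sign comparison via absolute values)
    p_value = (np.fabs(null_es_vals) >= np.fabs(es_val)).mean()
    return nes_val, null_nes_vals, p_value

# example: a positive observed ES scored against 1000 synthetic
# positive null ES values (toy data, not SSEA kernel output)
_rng = np.random.RandomState(0)
_null_es = np.abs(_rng.normal(0.3, 0.1, size=1000))
_nes, _null_nes, _p = normalize_and_score(0.55, _null_es)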
def ssea_serial(config, sample_set, output_basename,
                startrow=None, endrow=None):
    '''
    main SSEA loop (single processor)

    config: Config object (config.matrix_dir locates the numpy memmap
        matrix containing numeric count data)
    sample_set: SampleSet object
    output_basename: prefix for writing result files
    startrow, endrow: optional row range of the matrix to process
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    # determine range of matrix to process
    if startrow is None:
        startrow = 0
    if endrow is None:
        endrow = bm.shape[0]
    assert startrow < endrow
    # get membership array for sample set
    membership = sample_set.get_array(bm.colnames)
    valid_samples = (membership >= 0)
    # setup histograms
    hists = _init_hists()
    # setup report file
    unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
    outfileh = open(unsorted_json_file, 'wb')
    for i in xrange(startrow, endrow):
        logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
        # read from memmap
        counts = np.array(bm.counts[i, :], dtype=np.float)
        # remove 'nan' values
        valid_inds = np.logical_and(valid_samples, np.isfinite(counts))
        # subset counts, size_factors, and membership array
        counts = counts[valid_inds]
        size_factors = bm.size_factors[valid_inds]
        valid_membership = membership[valid_inds]
        # write dummy results for invalid rows
        if (valid_inds.sum() == 0) or np.all(counts == 0):
            res = Result.default()
        else:
            # run ssea
            res, null_nes_vals = ssea_run(counts, size_factors,
                                          valid_membership, rng, config)
            # update histograms
            null_keys = []
            obs_keys = []
            if res.es < 0:
                null_keys.append('null_nes_neg')
                obs_keys.append('obs_nes_neg')
            elif res.es > 0:
                null_keys.append('null_nes_pos')
                obs_keys.append('obs_nes_pos')
            for k in xrange(len(null_keys)):
                null_nes = np.clip(np.fabs(null_nes_vals), NES_MIN, NES_MAX)
                obs_nes = np.clip(np.fabs(res.nes), NES_MIN, NES_MAX)
                hists[null_keys[k]] += np.histogram(null_nes, NES_BINS)[0]
                hists[obs_keys[k]] += np.histogram(obs_nes, NES_BINS)[0]
        # save t_id
        res.t_id = i
        # convert to json and write
        print >>outfileh, res.to_json()
    # close report file
    outfileh.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # cleanup
    bm.close()
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename)
    # call batch sort python function
    sorted_json_file = output_basename + JSON_SORTED_SUFFIX
    batch_sort(input=unsorted_json_file,
               output=sorted_json_file,
               key=_cmp_json_nes,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[output_basename])
    # remove tmp dir
    shutil.rmtree(output_basename)
    # remove unsorted json file
    os.remove(unsorted_json_file)
    logging.debug("Worker %s: done" % (output_basename))
    return 0
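
# The per-row masking in ssea_serial combines a static sample-level
# mask (membership >= 0, where negative values mark excluded samples)
# with a per-row finiteness check before subsetting counts, size
# factors, and membership. A minimal self-contained illustration with
# toy arrays rather than a BigCountMatrix row:
import numpy as np

_membership = np.array([1, 0, -1, 1, 0])        # -1: excluded sample
_counts = np.array([5.0, np.nan, 2.0, 0.0, 7.0])
_size_factors = np.array([1.0, 0.9, 1.1, 1.2, 0.8])

_valid_samples = (_membership >= 0)
_valid_inds = np.logical_and(_valid_samples, np.isfinite(_counts))
# rows where no samples survive (or all counts are zero) receive a
# default result instead of running ssea_run
_counts = _counts[_valid_inds]
_size_factors = _size_factors[_valid_inds]
_valid_membership = _membership[_valid_inds]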
def test_default_result(self):
    result = Result.default()
    self.assertTrue(result.t_id is None)
    self.assertTrue(result.ss_rank is None)