コード例 #1
0
def ssea_run(counts, size_factors, membership, rng, config):
    '''
    Run a single enrichment test.

    Repeatedly runs the kernel on resampled counts to obtain a range of
    observed enrichment scores (ES), keeps the run holding the median
    score on the dominant side (positive or negative) of that range,
    builds a null distribution of same-signed scores from permuted
    runs, and summarizes everything in a Result object.

    counts: numpy array of float values
    size_factors: normalization factors for counts
    membership: int array (0 or 1) with set membership
    rng: RandomState object (its 'seed' attribute is recorded before
        each resampling run)
    config: Config object (reads resampling_iterations, perms,
        noise_loc, noise_scale, weight_miss, weight_hit, weight_param)

    returns: tuple (res, null_nes_vals) where res is a Result and
        null_nes_vals is the array of null ES values divided by the
        absolute mean of the null ES values. When the median resampled
        ES is exactly zero, (Result.default(), empty float array) is
        returned instead.
    '''
    n_resample = config.resampling_iterations
    # keyword arguments shared by the resampling and permutation runs
    kernel_kwargs = dict(resample_counts=True,
                         add_noise=True,
                         noise_loc=config.noise_loc,
                         noise_scale=config.noise_scale,
                         method_miss=config.weight_miss,
                         method_hit=config.weight_hit,
                         method_param=config.weight_param)
    # run kernel to generate a range of observed enrichment scores
    # (builtin int/float are used as dtypes; the np.int/np.float aliases
    # they replace were removed from recent NumPy releases)
    resample_rand_seeds = np.empty(n_resample, dtype=int)
    resample_count_ranks = np.empty((n_resample, counts.shape[0]),
                                    dtype=int)
    resample_es_vals = np.zeros(n_resample, dtype=float)
    resample_es_ranks = np.zeros(n_resample, dtype=int)
    for i in xrange(n_resample):
        # save random number generator seed before running kernel
        resample_rand_seeds[i] = rng.seed
        k = KernelResult._make(ssea_kernel(counts,
                                           size_factors,
                                           membership,
                                           rng,
                                           permute_samples=False,
                                           **kernel_kwargs))
        resample_count_ranks[i] = k.ranks
        resample_es_vals[i] = k.es_val
        resample_es_ranks[i] = k.es_rank
    # find the median ES value
    median_index = int(n_resample / 2)
    median_index = resample_es_vals.argsort()[median_index]
    median_es_val = resample_es_vals[median_index]
    # choose whether to use the positive or negative side of the
    # distribution based on the median ES value
    if median_es_val == 0:
        return Result.default(), np.zeros((0, ), dtype=float)
    elif median_es_val < 0:
        signfunc = np.less
    else:
        signfunc = np.greater
    # subset to include only the corresponding side of the distribution
    resample_sign_inds = signfunc(resample_es_vals, 0)
    resample_rand_seeds = resample_rand_seeds[resample_sign_inds]
    resample_count_ranks = resample_count_ranks[resample_sign_inds]
    resample_es_vals = resample_es_vals[resample_sign_inds]
    resample_es_ranks = resample_es_ranks[resample_sign_inds]
    # determine the median value within the retained side
    median_index = int(resample_es_vals.shape[0] / 2)
    median_index = resample_es_vals.argsort()[median_index]
    # select the kernel run representing the median ES value
    rand_seed = resample_rand_seeds[median_index]
    es_val = resample_es_vals[median_index]
    es_rank = resample_es_ranks[median_index]
    ranks = resample_count_ranks[median_index]
    es_sign = cmp(es_val, 0)
    # permute samples and determine ES null distribution
    null_es_vals = np.zeros(config.perms, dtype=float)
    null_es_ranks = np.zeros(config.perms, dtype=float)
    # NOTE(review): a slot is only filled when the permuted score has the
    # same sign as the observed score, so this loop runs until the kernel
    # has produced config.perms matching scores -- confirm that the
    # kernel cannot keep returning the opposite sign (or zero) forever.
    i = 0
    while i < config.perms:
        k = KernelResult._make(ssea_kernel(counts,
                                           size_factors,
                                           membership,
                                           rng,
                                           permute_samples=True,
                                           **kernel_kwargs))
        if cmp(k.es_val, 0) == es_sign:
            null_es_vals[i] = k.es_val
            null_es_ranks[i] = k.es_rank
            i += 1
    # Subset the null ES scores to only positive or negative
    null_sign_inds = signfunc(null_es_vals, 0)
    null_es_vals = null_es_vals[null_sign_inds]
    null_es_ranks = null_es_ranks[null_sign_inds]
    # Adjust for variation in gene set size. Normalize ES(S,null)
    # and the observed ES(S), separately rescaling the positive and
    # negative scores by dividing by the mean of the ES(S,null) to
    # yield normalized scores NES(S,null)
    null_es_mean = np.fabs(null_es_vals.mean())
    null_nes_vals = null_es_vals / null_es_mean
    # Normalize the observed ES(S) by rescaling by the mean of
    # the ES(S,null) separately for positive and negative ES(S)
    nes_val = es_val / null_es_mean
    # estimate nominal p value for S from ES(S,null) by using the
    # positive or negative portion of the distribution corresponding
    # to the sign of the observed ES(S)
    p_value = (np.fabs(null_es_vals) >= np.fabs(es_val)).sum().astype(float)
    p_value /= null_es_vals.shape[0]
    # Create result object for this SSEA test
    res = Result()
    res.rand_seed = int(rand_seed)
    res.es = round(es_val, FLOAT_PRECISION)
    res.es_rank = int(es_rank)
    res.nominal_p_value = round(p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.nes = round(nes_val, FLOAT_PRECISION)
    # save some of the resampled es points
    res.resample_es_vals = np.around(resample_es_vals[:Result.MAX_POINTS],
                                     FLOAT_PRECISION)
    res.resample_es_ranks = resample_es_ranks[:Result.MAX_POINTS]
    # save null distribution points
    res.null_es_mean = null_es_mean
    res.null_es_vals = np.around(null_es_vals[:Result.MAX_POINTS],
                                 FLOAT_PRECISION)
    res.null_es_ranks = null_es_ranks[:Result.MAX_POINTS]
    # get indexes of hits in this set (membership reordered by rank)
    m = membership[ranks]
    hit_inds = (m > 0).nonzero()[0]
    num_hits = hit_inds.shape[0]
    num_misses = m.shape[0] - num_hits
    # calculate leading edge stats: for a negative score the leading
    # edge is everything from es_rank to the end, otherwise everything
    # from the start up to and including es_rank
    if es_val < 0:
        core_hits = int((hit_inds >= es_rank).sum())
        core_misses = (m.shape[0] - es_rank) - core_hits
    else:
        core_hits = int((hit_inds <= es_rank).sum())
        core_misses = 1 + es_rank - core_hits
    null_hits = num_hits - core_hits
    null_misses = num_misses - core_misses
    # fisher exact test (one-sided hypothesis that LE is enriched)
    fisher_p_value = fisher.pvalue(core_hits, core_misses, null_hits,
                                   null_misses).right_tail
    # odds ratio (core_hits / null_hits) / (core_misses / null_misses),
    # with explicit handling of zero denominators
    n = np.inf if null_hits == 0 else float(core_hits) / null_hits
    d = np.inf if null_misses == 0 else float(core_misses) / null_misses
    if np.isfinite(n) and np.isfinite(d):
        if n == 0 and d == 0:
            odds_ratio = np.nan
        else:
            odds_ratio = np.inf if d == 0 else (n / d)
    elif np.isfinite(d):
        odds_ratio = np.inf
    else:
        odds_ratio = np.nan if n == 0 else 0.0
    # store leading edge statistics on the result object
    res.core_hits = int(core_hits)
    res.core_misses = int(core_misses)
    res.null_hits = int(null_hits)
    res.null_misses = int(null_misses)
    res.fisher_p_value = np.round(fisher_p_value,
                                  SCIENTIFIC_NOTATION_PRECISION)
    res.odds_ratio = odds_ratio
    # return result and null distribution for subsequent fdr calculations
    return res, null_nes_vals
コード例 #2
0
def ssea_serial(config,
                sample_set,
                output_basename,
                startrow=None,
                endrow=None):
    '''
    main SSEA loop (single processor)

    Runs ssea_run on every row in [startrow, endrow) of the data matrix,
    writes one JSON line per row, saves the NES histograms, and finally
    sorts the JSON output.

    config: Config object (config.matrix_dir is opened as the data matrix)
    sample_set: SampleSet object
    output_basename: prefix for writing result files; the same path is
        also used (and removed) as a temporary directory while sorting
    startrow: first row to process (defaults to 0)
    endrow: one past the last row to process (defaults to all rows)

    returns: 0
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    try:
        # determine range of matrix to process
        if startrow is None:
            startrow = 0
        if endrow is None:
            endrow = bm.shape[0]
        assert startrow < endrow
        # get membership array for sample set
        membership = sample_set.get_array(bm.colnames)
        valid_samples = (membership >= 0)
        # setup histograms
        hists = _init_hists()
        # setup report file; the 'with' block guarantees the file is
        # closed even if processing a row raises
        unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
        with open(unsorted_json_file, 'wb') as outfileh:
            for i in xrange(startrow, endrow):
                logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
                # read from memmap
                counts = np.array(bm.counts[i, :], dtype=float)
                # remove 'nan' values
                valid_inds = np.logical_and(valid_samples,
                                            np.isfinite(counts))
                # subset counts, size_factors, and membership array
                counts = counts[valid_inds]
                size_factors = bm.size_factors[valid_inds]
                valid_membership = membership[valid_inds]
                # write dummy results for invalid rows
                if (valid_inds.sum() == 0) or (np.all(counts == 0)):
                    res = Result.default()
                else:
                    # run ssea
                    res, null_nes_vals = ssea_run(counts, size_factors,
                                                  valid_membership, rng,
                                                  config)
                    # pick the histograms matching the sign of the score;
                    # a score of exactly zero updates no histogram
                    if res.es < 0:
                        hist_keys = ('null_nes_neg', 'obs_nes_neg')
                    elif res.es > 0:
                        hist_keys = ('null_nes_pos', 'obs_nes_pos')
                    else:
                        hist_keys = None
                    if hist_keys is not None:
                        null_key, obs_key = hist_keys
                        null_nes = np.clip(np.fabs(null_nes_vals),
                                           NES_MIN, NES_MAX)
                        obs_nes = np.clip(np.fabs(res.nes),
                                          NES_MIN, NES_MAX)
                        hists[null_key] += np.histogram(null_nes,
                                                        NES_BINS)[0]
                        hists[obs_key] += np.histogram(obs_nes,
                                                       NES_BINS)[0]
                # save t_id
                res.t_id = i
                # convert to json and write
                print >> outfileh, res.to_json()
    finally:
        # cleanup: release the matrix whether or not processing succeeded
        bm.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename)
    try:
        # call batch sort python function
        sorted_json_file = output_basename + JSON_SORTED_SUFFIX
        batch_sort(input=unsorted_json_file,
                   output=sorted_json_file,
                   key=_cmp_json_nes,
                   buffer_size=SORT_BUFFER_SIZE,
                   tempdirs=[output_basename])
    finally:
        # remove tmp dir even when sorting fails
        shutil.rmtree(output_basename)
    # remove unsorted json file (kept on failure, since sorting raised)
    os.remove(unsorted_json_file)
    logging.debug("Worker %s: done" % (output_basename))
    return 0
コード例 #3
0
ファイル: test_algo.py プロジェクト: BioXiao/ssea
 def test_default_result(self):
     '''A default Result has neither a t_id nor an ss_rank assigned.'''
     result = Result.default()
     # assertIsNone reports the offending value when the check fails
     self.assertIsNone(result.t_id)
     self.assertIsNone(result.ss_rank)
コード例 #4
0
 def test_default_result(self):
     '''A default Result has neither a t_id nor an ss_rank assigned.'''
     result = Result.default()
     # assertIsNone reports the offending value when the check fails
     self.assertIsNone(result.t_id)
     self.assertIsNone(result.ss_rank)
コード例 #5
0
ファイル: algo.py プロジェクト: BioXiao/ssea
def ssea_run(counts, size_factors, membership, rng, config):
    '''
    Run a single enrichment test.

    Repeatedly runs the kernel on resampled counts to obtain a range of
    observed enrichment scores (ES), keeps the run holding the median
    score on the dominant side (positive or negative) of that range,
    builds a null distribution of same-signed scores from permuted
    runs, and summarizes everything in a Result object.

    counts: numpy array of float values
    size_factors: normalization factors for counts
    membership: int array (0 or 1) with set membership
    rng: RandomState object (its 'seed' attribute is recorded before
        each resampling run)
    config: Config object (reads resampling_iterations, perms,
        noise_loc, noise_scale, weight_miss, weight_hit, weight_param)

    returns: tuple (res, null_nes_vals) where res is a Result and
        null_nes_vals is the array of null ES values divided by the
        absolute mean of the null ES values. When the median resampled
        ES is exactly zero, (Result.default(), empty float array) is
        returned instead.
    '''
    n_resample = config.resampling_iterations
    # keyword arguments shared by the resampling and permutation runs
    kernel_kwargs = dict(resample_counts=True,
                         add_noise=True,
                         noise_loc=config.noise_loc,
                         noise_scale=config.noise_scale,
                         method_miss=config.weight_miss,
                         method_hit=config.weight_hit,
                         method_param=config.weight_param)
    # run kernel to generate a range of observed enrichment scores
    # (builtin int/float are used as dtypes; the np.int/np.float aliases
    # they replace were removed from recent NumPy releases)
    resample_rand_seeds = np.empty(n_resample, dtype=int)
    resample_count_ranks = np.empty((n_resample, counts.shape[0]),
                                    dtype=int)
    resample_es_vals = np.zeros(n_resample, dtype=float)
    resample_es_ranks = np.zeros(n_resample, dtype=int)
    for i in xrange(n_resample):
        # save random number generator seed before running kernel
        resample_rand_seeds[i] = rng.seed
        k = KernelResult._make(ssea_kernel(counts, size_factors,
                                           membership, rng,
                                           permute_samples=False,
                                           **kernel_kwargs))
        resample_count_ranks[i] = k.ranks
        resample_es_vals[i] = k.es_val
        resample_es_ranks[i] = k.es_rank
    # find the median ES value
    median_index = int(n_resample / 2)
    median_index = resample_es_vals.argsort()[median_index]
    median_es_val = resample_es_vals[median_index]
    # choose whether to use the positive or negative side of the
    # distribution based on the median ES value
    if median_es_val == 0:
        return Result.default(), np.zeros((0,), dtype=float)
    elif median_es_val < 0:
        signfunc = np.less
    else:
        signfunc = np.greater
    # subset to include only the corresponding side of the distribution
    resample_sign_inds = signfunc(resample_es_vals, 0)
    resample_rand_seeds = resample_rand_seeds[resample_sign_inds]
    resample_count_ranks = resample_count_ranks[resample_sign_inds]
    resample_es_vals = resample_es_vals[resample_sign_inds]
    resample_es_ranks = resample_es_ranks[resample_sign_inds]
    # determine the median value within the retained side
    median_index = int(resample_es_vals.shape[0] / 2)
    median_index = resample_es_vals.argsort()[median_index]
    # select the kernel run representing the median ES value
    rand_seed = resample_rand_seeds[median_index]
    es_val = resample_es_vals[median_index]
    es_rank = resample_es_ranks[median_index]
    ranks = resample_count_ranks[median_index]
    es_sign = cmp(es_val, 0)
    # permute samples and determine ES null distribution
    null_es_vals = np.zeros(config.perms, dtype=float)
    null_es_ranks = np.zeros(config.perms, dtype=float)
    # NOTE(review): a slot is only filled when the permuted score has the
    # same sign as the observed score, so this loop runs until the kernel
    # has produced config.perms matching scores -- confirm that the
    # kernel cannot keep returning the opposite sign (or zero) forever.
    i = 0
    while i < config.perms:
        k = KernelResult._make(ssea_kernel(counts, size_factors,
                                           membership, rng,
                                           permute_samples=True,
                                           **kernel_kwargs))
        if cmp(k.es_val, 0) == es_sign:
            null_es_vals[i] = k.es_val
            null_es_ranks[i] = k.es_rank
            i += 1
    # Subset the null ES scores to only positive or negative
    null_sign_inds = signfunc(null_es_vals, 0)
    null_es_vals = null_es_vals[null_sign_inds]
    null_es_ranks = null_es_ranks[null_sign_inds]
    # Adjust for variation in gene set size. Normalize ES(S,null)
    # and the observed ES(S), separately rescaling the positive and
    # negative scores by dividing by the mean of the ES(S,null) to
    # yield normalized scores NES(S,null)
    null_es_mean = np.fabs(null_es_vals.mean())
    null_nes_vals = null_es_vals / null_es_mean
    # Normalize the observed ES(S) by rescaling by the mean of
    # the ES(S,null) separately for positive and negative ES(S)
    nes_val = es_val / null_es_mean
    # estimate nominal p value for S from ES(S,null) by using the
    # positive or negative portion of the distribution corresponding
    # to the sign of the observed ES(S)
    p_value = (np.fabs(null_es_vals) >= np.fabs(es_val)).sum().astype(float)
    p_value /= null_es_vals.shape[0]
    # Create result object for this SSEA test
    res = Result()
    res.rand_seed = int(rand_seed)
    res.es = round(es_val, FLOAT_PRECISION)
    res.es_rank = int(es_rank)
    res.nominal_p_value = round(p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.nes = round(nes_val, FLOAT_PRECISION)
    # save some of the resampled es points
    res.resample_es_vals = np.around(resample_es_vals[:Result.MAX_POINTS], FLOAT_PRECISION)
    res.resample_es_ranks = resample_es_ranks[:Result.MAX_POINTS]
    # save null distribution points
    res.null_es_mean = null_es_mean
    res.null_es_vals = np.around(null_es_vals[:Result.MAX_POINTS], FLOAT_PRECISION)
    res.null_es_ranks = null_es_ranks[:Result.MAX_POINTS]
    # get indexes of hits in this set (membership reordered by rank)
    m = membership[ranks]
    hit_inds = (m > 0).nonzero()[0]
    num_hits = hit_inds.shape[0]
    num_misses = m.shape[0] - num_hits
    # calculate leading edge stats: for a negative score the leading
    # edge is everything from es_rank to the end, otherwise everything
    # from the start up to and including es_rank
    if es_val < 0:
        core_hits = int((hit_inds >= es_rank).sum())
        core_misses = (m.shape[0] - es_rank) - core_hits
    else:
        core_hits = int((hit_inds <= es_rank).sum())
        core_misses = 1 + es_rank - core_hits
    null_hits = num_hits - core_hits
    null_misses = num_misses - core_misses
    # fisher exact test (one-sided hypothesis that LE is enriched)
    fisher_p_value = fisher.pvalue(core_hits, core_misses, null_hits, null_misses).right_tail
    # odds ratio (core_hits / null_hits) / (core_misses / null_misses),
    # with explicit handling of zero denominators
    n = np.inf if null_hits == 0 else float(core_hits) / null_hits
    d = np.inf if null_misses == 0 else float(core_misses) / null_misses
    if np.isfinite(n) and np.isfinite(d):
        if n == 0 and d == 0:
            odds_ratio = np.nan
        else:
            odds_ratio = np.inf if d == 0 else (n / d)
    elif np.isfinite(d):
        odds_ratio = np.inf
    else:
        odds_ratio = np.nan if n == 0 else 0.0
    # store leading edge statistics on the result object
    res.core_hits = int(core_hits)
    res.core_misses = int(core_misses)
    res.null_hits = int(null_hits)
    res.null_misses = int(null_misses)
    res.fisher_p_value = np.round(fisher_p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.odds_ratio = odds_ratio
    # return result and null distribution for subsequent fdr calculations
    return res, null_nes_vals
コード例 #6
0
ファイル: algo.py プロジェクト: BioXiao/ssea
def ssea_serial(config, sample_set, output_basename,
                startrow=None, endrow=None):
    '''
    main SSEA loop (single processor)

    Runs ssea_run on every row in [startrow, endrow) of the data matrix,
    writes one JSON line per row, saves the NES histograms, and finally
    sorts the JSON output.

    config: Config object (config.matrix_dir is opened as the data matrix)
    sample_set: SampleSet object
    output_basename: prefix for writing result files; the same path is
        also used (and removed) as a temporary directory while sorting
    startrow: first row to process (defaults to 0)
    endrow: one past the last row to process (defaults to all rows)

    returns: 0
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    try:
        # determine range of matrix to process
        if startrow is None:
            startrow = 0
        if endrow is None:
            endrow = bm.shape[0]
        assert startrow < endrow
        # get membership array for sample set
        membership = sample_set.get_array(bm.colnames)
        valid_samples = (membership >= 0)
        # setup histograms
        hists = _init_hists()
        # setup report file; the 'with' block guarantees the file is
        # closed even if processing a row raises
        unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
        with open(unsorted_json_file, 'wb') as outfileh:
            for i in xrange(startrow, endrow):
                logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
                # read from memmap
                counts = np.array(bm.counts[i,:], dtype=float)
                # remove 'nan' values
                valid_inds = np.logical_and(valid_samples, np.isfinite(counts))
                # subset counts, size_factors, and membership array
                counts = counts[valid_inds]
                size_factors = bm.size_factors[valid_inds]
                valid_membership = membership[valid_inds]
                # write dummy results for invalid rows
                if (valid_inds.sum() == 0) or (np.all(counts == 0)):
                    res = Result.default()
                else:
                    # run ssea
                    res, null_nes_vals = ssea_run(counts, size_factors,
                                                  valid_membership, rng, config)
                    # pick the histograms matching the sign of the score;
                    # a score of exactly zero updates no histogram
                    if res.es < 0:
                        hist_keys = ('null_nes_neg', 'obs_nes_neg')
                    elif res.es > 0:
                        hist_keys = ('null_nes_pos', 'obs_nes_pos')
                    else:
                        hist_keys = None
                    if hist_keys is not None:
                        null_key, obs_key = hist_keys
                        null_nes = np.clip(np.fabs(null_nes_vals), NES_MIN, NES_MAX)
                        obs_nes = np.clip(np.fabs(res.nes), NES_MIN, NES_MAX)
                        hists[null_key] += np.histogram(null_nes, NES_BINS)[0]
                        hists[obs_key] += np.histogram(obs_nes, NES_BINS)[0]
                # save t_id
                res.t_id = i
                # convert to json and write
                print >>outfileh, res.to_json()
    finally:
        # cleanup: release the matrix whether or not processing succeeded
        bm.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename)
    try:
        # call batch sort python function
        sorted_json_file = output_basename + JSON_SORTED_SUFFIX
        batch_sort(input=unsorted_json_file,
                   output=sorted_json_file,
                   key=_cmp_json_nes,
                   buffer_size=SORT_BUFFER_SIZE,
                   tempdirs=[output_basename])
    finally:
        # remove tmp dir even when sorting fails
        shutil.rmtree(output_basename)
    # remove unsorted json file (kept on failure, since sorting raised)
    os.remove(unsorted_json_file)
    logging.debug("Worker %s: done" % (output_basename))
    return 0