def compute_ir_metrics(ge_wrapper, ge_loader, smiles_wrapper, smiles_loader,
                           split="train", train_smiles=None):
        gex_embeddings, chem_embeddings, smiles_gex_labels, smiles_chem_labels = get_embeddings(ge_wrapper, ge_loader,
                                                                            smiles_wrapper, smiles_loader)
        gex_chem_distances = cdist(gex_embeddings, chem_embeddings, metric=config['retrieval']['metric'])
        gex_chem_ranks = rankdata(gex_chem_distances, axis=1)
        rank_first_match = get_ranks_first_match(gex_chem_ranks, smiles_gex_labels, smiles_chem_labels)

        # smiles_wrapper.pert_smiles == smiles_chem_labels

        list_of_inds = [[i for i,j in enumerate(smiles_gex_labels)]]
        if split == "val":
            ge_inds_in_train = [i for i, j in enumerate(smiles_gex_labels) if j in train_smiles]
            ge_inds_not_in_train = [i for i, j in enumerate(smiles_gex_labels) if j not in train_smiles]
            list_of_inds.append(ge_inds_in_train)
            list_of_inds.append(ge_inds_not_in_train)

            chem_inds_not_in_train = [i for i,j in enumerate(smiles_chem_labels) if j not in train_smiles]

        ir_results = []
        for inds in list_of_inds:
            ir_results.append(prepare_metrics(rank_first_match, inds))

        if split == "val":
            ranks_subset = rankdata(gex_chem_ranks[:, chem_inds_not_in_train][ge_inds_not_in_train,:], axis=1)
            rank_first_match = get_ranks_first_match(ranks_subset,
                                                     smiles_gex_labels[ge_inds_not_in_train],
                                                     smiles_chem_labels[chem_inds_not_in_train])
            ir_results.append(prepare_metrics(rank_first_match,
                                              [k for k in range(len(ge_inds_not_in_train))]
                                              ))
        return ir_results
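A self-contained sketch of the core retrieval step above (pairwise distances, then per-query ranks); the embeddings and metric are hypothetical stand-ins for what get_embeddings and config provide:

import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import rankdata

rng = np.random.default_rng(0)
gex_embeddings = rng.normal(size=(4, 8))    # e.g. gene-expression embeddings
chem_embeddings = rng.normal(size=(6, 8))   # e.g. chemical embeddings
dist = cdist(gex_embeddings, chem_embeddings, metric='cosine')
ranks = rankdata(dist, axis=1)              # rank 1 = closest chemical for each query
print(ranks.shape)                          # (4, 6)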
Example #2
def rank_data(r, rground):
    # we checked this heavily, and it is correct; e.g. rground will go from largest rank to smallest
    r = rankdata(r)
    rground = rankdata(rground)
    if np.sum(r) != np.sum(rground):
        raise AssertionError("ranks should add up to the same")
    return r, rground
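A minimal usage sketch for rank_data (hypothetical inputs, assuming numpy and rankdata are imported as in the snippet's source); both rank vectors are permutations of 1..n, so their sums agree and the assertion passes:

import numpy as np

r, rground = rank_data(np.array([0.2, 0.9, 0.5]), np.array([3, 1, 2]))
print(r, rground)   # [1. 3. 2.] [3. 1. 2.]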
Example #3
def run_split(root, print_all=False):
    cum_scores = []
    n_pos = None
    for filename in glob("%s/*.scored" % root):
        pos, neg = read_scores(filename)
        scores = concatenate((pos, neg))
        ranks = rankdata(-scores)
        if n_pos is None:
            n_pos = len(pos)
        else:
            assert n_pos == len(pos)

        pos_ranks = ranks[:n_pos]
        counts = [(pos_ranks <= i).sum() for i in range(1, len(ranks))]
        cum_scores.append(tuple(counts))

    cum_scores = array(cum_scores).transpose()
    if print_all:
        print '#cutoff\tcounts...'
        for i, row in enumerate(cum_scores):
            print '%d\t%s' % (i + 1, 
                              '\t'.join(['%d' % count for count in row]))
    else:
        means = cum_scores.mean(axis=1)
        print '#cutoff\trecall'
        for i, mean in enumerate(means):
            print '%d\t%.1f' % (i + 1, mean)
Example #4
def run_loo(root, print_all=False):
    cum_scores = defaultdict(list)
    for filename in glob("%s/*.scored" % root):
        id_ = os.path.basename(filename).split('.')[0]
        major, rep = id_.split('-')

        pos, neg = read_scores(filename)
        scores = concatenate((pos, neg))
        ranks = rankdata(-scores)
        assert len(pos) == 1
        cum_scores[rep].append(ranks[0])

    reps = len(cum_scores)
    assert len(set(map(len, cum_scores.values()))) == 1

    print >> sys.stderr, "Found %d reps per iter..." % reps
    ranks = array([cum_scores[rep] for rep in sorted(cum_scores)]).transpose()
    if print_all:
        print '#cutoff\trecall...'
        for cutoff in range(1, ranks.shape[0] + 1):
            counts = (ranks <= cutoff).sum(axis=0)
            print '%d\t%s' % (cutoff, '\t'.join(
                ['%.1f' % count for count in counts]))
    else:
        print '#cutoff\trecall'
        for cutoff in range(1, ranks.shape[0] + 1):
            count = float((ranks <= cutoff).sum()) / reps
            print '%d\t%.1f' % (cutoff, count)
Example #5
    def quantile_normalization(self):
        """ Return the np.array which contains the normalized values
        """
        rank_matrix = []
        for c in range(self.all_table.shape[1]):
            col = self.all_table[:, c]
            rank_col = mstats.rankdata(col)
            rank_matrix.append(rank_col)

        ranks = numpy.array(rank_matrix)
        trans_rank = numpy.transpose(ranks)

        # Calculate for means of ranks
        print("    Calculating for the mean of ranked data...")
        sort_matrix = numpy.sort(self.all_table, axis=0)
        means = []
        for r in range(self.all_table.shape[0]):
            row = [x for x in sort_matrix[r, :]]
            means.append(numpy.mean(row))

        # Replace the value by new means
        print("    Replacing the data value by normalized mean...")
        normalized_table = numpy.around(trans_rank)
        for i, v in enumerate(means):
            normalized_table[normalized_table == i + 1] = v
        # print(rounded_rank)
        self.norm_table = normalized_table
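For reference, a compact sketch of the same quantile-normalization idea on a toy matrix; the values are hypothetical and method='ordinal' is used so the ranks can index the row means directly:

import numpy as np
from scipy.stats import rankdata

X = np.array([[5., 4., 3.],
              [2., 1., 4.],
              [3., 6., 6.],
              [4., 2., 8.]])
ranks = rankdata(X, axis=0, method='ordinal')    # rank within each column
row_means = np.sort(X, axis=0).mean(axis=1)      # mean of each sorted row
X_norm = row_means[ranks.astype(int) - 1]        # substitute rank i with the i-th mean
print(X_norm)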
def rank_rows(M):
  """Rank order rows of M. Preserve masks.

  fill value for M must be the maximum value for that array.
  """
  #scipy.stats.mstats.rankdata
  for i in xrange(np.size(M,0)):
    try:
      mask = M.mask
    except AttributeError:
      M[i,:] = mstats.rankdata(M[i,:]) -1
    else:
      assert np.sum(M == M.fill_value) == 0
      mask = np.copy(M[i,:].mask)
      M[i,:] = mstats.rankdata(M[i,:].data) -1
      M[i,:].mask = mask
def rankData(x):
    '''
    Assumes the nodes dimension is the last axis of x.
    Returns the ranked data.
    This is not the most efficient method.
    '''
    from scipy.stats.mstats import rankdata
    s = x.shape  # extract shape
    ranking = np.zeros(s)  # ranking data
    maxim = np.zeros(s[:-1])  # largest driver node rank
    maxim[:] = np.nan  # assume nans

    # reshape (not really productive)
    maxim = maxim.reshape(-1)
    ranking = ranking.reshape((-1, s[-1]))
    noneFound = 0
    mask = np.ma.masked_invalid(x)
    for idx, sample in enumerate(mask.reshape(-1, s[-1])):
        # allow only if there is variance in the data
        if sample.sum() != 0:
            rank = rankdata(sample)
            ranking[idx] = rank
            maxim[idx] = rank.argmax()
        else:
            noneFound += 1
    ranking = ranking.reshape(s)  # reshape back
    maxim = maxim.reshape(s[:-1])
    print(f'In {noneFound} trials no max was found')
    return ranking, maxim
Example #10
def main(kernel, ranks_file, stats_dir, metric='acc'):
    techniques = list(TECHNIQUES.keys())
    stats = dict()
    stat_count = defaultdict(int)
    for technique, (stats_file, parser) in TECHNIQUES.items():
        stats_file = (stats_file % metric)
        with open(os.path.join(stats_dir, stats_file), 'r') as f:
            for line in f:
                parts = line.strip().split(',')
                results = parser(parts, kernel)
                if results is None: continue
                dset, stat = results
                stats[technique, dset] = stat
                stat_count[dset] += 1

    good_datasets = [dset for dset in stat_count.keys()
                     if stat_count[dset] == len(techniques)]

    data = np.array([[stats[t, d] for d in good_datasets] for t in techniques])
    ranks = rankdata(-data, axis=0)
    avg_ranks = np.average(ranks, axis=1)
    with open(ranks_file, 'w+') as f:
        for t, r in zip(techniques, avg_ranks.flat):
            line = '%s,%d,%f\n' % (t, ranks.shape[1], r)
            f.write(line)
            print line,
Example #11
def compute_rank(data):
    print '\nRANK\n'
    # rankdata assigns rank 1 to the lowest element, so
    # we need to negate before ranking.
    ssim_rank = rankdata(np.array(data['ssim']) * -1.0, axis=1)
    psnr_rank = rankdata(np.array(data['psnr']) * -1.0, axis=1)
    # Rank mean + std.
    for i, m in enumerate(data['models']):
        print '%30s    ssim-rank %.2f ± %.2f    psnr-rank %.2f ± %.2f' % (
            m, np.mean(ssim_rank[:, i]), np.std(ssim_rank[:, i]),
            np.mean(psnr_rank[:, i]), np.std(psnr_rank[:, i]))
    # Rank frequencies
    print '\n    SSIM rank freqs'
    print_rank_freqs(data, ssim_rank)
    print '\n    PSNR rank freqs'
    print_rank_freqs(data, psnr_rank)
    def _ranks(data):
        """
        This function computes ranks for data in the table along axis=0.

        Parameters
        ----------
        data : np.ndarray
            Array of data to be ranked

        Returns
        -------
        np.ndarray
            Table of data ranks
        """
        x_len = data.shape[0]
        x_mask = data.sum(axis=0) > 0

        # create a matrix of ranks - init with average rank
        # for columns without nonzero expressions
        data_ge_ranked = np.ones(data.shape) * (1 + data.shape[0]) / 2

        # compute ranks only for nonzero columns
        for i in np.where(x_mask)[0]:
            mask = data[:, i] > 0
            col = np.ones(x_len) * (1 + (x_len - mask.sum())) / 2
            col[mask] = rankdata(data[mask, i]) + (x_len - mask.sum())
            data_ge_ranked[:, i] = col
        return data_ge_ranked
Example #13
    def fit(self, signal) -> "CostRank":
        """Set parameters of the instance.

        Args:
            signal (array): signal. Shape (n_samples,) or (n_samples, n_features)

        Returns:
            self
        """
        if signal.ndim == 1:
            signal = signal.reshape(-1, 1)

        obs, vars = signal.shape

        # Convert signal data into ranks in the range [1, n]
        ranks = rankdata(signal, axis=0)
        # Center the ranks into the range [-(n+1)/2, (n+1)/2]
        centered_ranks = ranks - ((obs + 1) / 2)
        # Sigma is the covariance of these ranks.
        # If it's a scalar, reshape it into a 1x1 matrix
        cov = np.cov(centered_ranks, rowvar=False,
                     bias=True).reshape(vars, vars)

        # Use the pseudoinverse to handle linear dependencies
        # see Lung-Yut-Fong, A., Lévy-Leduc, C., & Cappé, O. (2015)
        try:
            self.inv_cov = pinv(cov)
        except LinAlgError as e:
            raise LinAlgError(
                "The covariance matrix of the rank signal is not invertible and the "
                "pseudo-inverse computation did not converge.") from e
        self.ranks = centered_ranks

        return self
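A standalone sketch of the rank-and-center transform this cost uses, outside the CostRank class (toy signal, not tied to any particular change-point library API):

import numpy as np
from numpy.linalg import pinv
from scipy.stats import rankdata

signal = np.random.default_rng(0).normal(size=(100, 3))
ranks = rankdata(signal, axis=0)              # per-column ranks in [1, n]
centered = ranks - (signal.shape[0] + 1) / 2  # subtract the mean rank (n + 1) / 2
inv_cov = pinv(np.cov(centered, rowvar=False, bias=True))
print(inv_cov.shape)                          # (3, 3)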
Example #15
    def get_rank_vector(self, x):
        """Get ranking with explicit handling of missing values, tagged as nan."""

        n = len(x)

        if np.all(np.isnan(x)):
            #return np.ones(len(x)) * n
            return np.ones(len(x)) * (self.threshold + 1)

        ranks = mstats.rankdata(np.ma.masked_invalid(x))

        maxrank = np.max(ranks)

        # Make all missing data have the same rank, not ordered by appearance
        #ranks[ranks == 0] = maxrank + 1
        ranks[ranks == 0] = self.threshold + 1
        """
    #ranks[ranks == 0] = n + 1
    ranks[ranks == 0] = np.nan

    # Convert nan to max rank, penalize missing submissions
    maxrank = np.nanmax(ranks)
    if np.isnan(maxrank):
      maxrank = 0 # All values missing, set to equal rank

    ranks[np.isnan(ranks)] = maxrank + 1
    """

        return ranks
Example #18
    def kde_privacy(self, x_train, x_test, y_train, y_test):
        num_test = y_test.shape[0]
        kdes = self.create_kdes(x_train, y_train)
        p_vals = []
        maps = {}
        cnt = 0
        for (l, k) in kdes.iteritems():
            p_vals.append(k.evaluate(x_test.T))
            maps[l] = cnt
            cnt += 1
        p_vals = np.asarray(
            p_vals
        )  # num_label(num_kde) * num_data p(i,j)=p(data j comes from kde i)
        num_labels = p_vals.shape[0]
        probs = p_vals / np.max(p_vals, 0)
        #tops = np.sum(probs>self.eps,0)
        ranks = 1 + num_labels - rankdata(probs, 0)  #+1 is ok?
        trueranks = np.zeros(num_test)
        for i in range(num_test):
            trueranks[i] = ranks[maps[y_test[i]], i]
        #ek_priv = np.maximum(tops/num_labels,trueranks/num_labels)
        #e_priv = tops/num_labels
        rank_priv = trueranks  #/num_labels
        log_rank_priv = np.log(rank_priv)
        opt_bayes = 1 - sum(trueranks == 1) / num_test
        priv = [
            num_labels, opt_bayes,
            np.mean(rank_priv),
            np.std(rank_priv),
            np.mean(log_rank_priv),
            np.std(log_rank_priv)
        ]

        return priv
Example #19
    def cdf(self, x, c=0.0, sigma=1.0, theta=0.0, nu=1.0):
        """
        Cumulative distribution function of the VarianceGamma distribution evaluated at x.
        
        Parameters
        ----------
        x : array_like
            quantiles
        c, sigma, theta, nu : array_like
            The shape parameter(s) for the distribution (see docstring of the
            instance object for more information) (default=0, 1, 0, 1)
            
        Returns
        -------
        cdf : ndarray
            Cumulative distribution function evaluated at x
            
        """
        if sigma <= 0 or nu <= 0:
            raise ValueError("The value of sigma and nu must be positive")

        x = np.atleast_1d(x)
        xs = np.sort(x)

        # Get the breaks and add -inf and inf
        xi = self._breaks(c, sigma, theta, nu)
        xi = np.concatenate([[-inf], xi, [inf]])

        # Evaluate cdf at break points
        int_xi = np.zeros(1)
        for j in range(1, xi.size - 3):
            int_xi = np.append(
                int_xi,
                quad(self.pdf, xi[j], xi[j + 1], args=(c, sigma, theta, nu))[0]
                + int_xi[j - 1])

        # Create masks that contains index of a range
        mx = np.full(x.shape, xi.size - 2, dtype=np.int32)
        for j in range(xi.size - 1):
            mx[(xs >= xi[j]) & (xs < xi[j + 1])] = j

        # Integrate
        interval = np.sort(np.stack((xi[mx], xs), axis=-1))
        xint = np.full(x.shape + (101, ), nan)
        for index in np.ndindex(xs.shape):
            if not isinf(interval[index][0]) and not isinf(interval[index][1]):
                xint[index] = np.linspace(interval[index][0],
                                          interval[index][1], xint.shape[-1])
        yint = self.pdf(xint, c, sigma, theta, nu)
        resint = trapz(yint, xint)

        # Accumulate
        resint[mx == 0] = 0
        for j in range(1, xi.size - 2):
            resint[mx == j] = int_xi[j - 1] + resint[mx == j]
        resint[mx == (xi.size - 2)] = 1

        return resint.flatten()[rankdata(x).astype(int) - 1].reshape(
            resint.shape)
Example #20
def ranking_filter(score):
    """lecture 4 slide 35"""
    n = np.sum(~np.isnan(score))
    ranks = mstats.rankdata(np.ma.masked_invalid(score))
    ranks[ranks == 0] = np.nan
    pos = np.nan_to_num(ranks - np.nansum(ranks) / n)
    pos /= np.nansum(np.abs(pos)) / 2
    return pos
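A hypothetical usage of ranking_filter, assuming the imports from its source (numpy, scipy.stats.mstats) are in scope: the output is a rank-based long/short weight vector with zero net exposure and a gross exposure of 2 (1 long, 1 short):

import numpy as np

score = np.array([0.3, np.nan, -1.2, 0.7, 0.1])
pos = ranking_filter(score)
print(pos)                            # [ 0.25  0.   -0.75  0.75 -0.25]
print(pos.sum(), np.abs(pos).sum())   # 0.0 net, 2.0 gross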
Example #21
def long_ranking_filter(score):
    try:
        ranks = mstats.rankdata(np.ma.masked_invalid(score))
    except:
        return score * 0 + 1 / len(score)
    ranks[ranks == 0] = np.nan
    pos = np.nan_to_num(ranks / np.nansum(ranks))
    return pos
 def _get_rank_for_one_correct_label(self, prob, y):
     ranks = rankdata(-prob, axis=1)
     inverse_ranks = self.num_community - ranks
     target_inverse_ranks = inverse_ranks * y
     best_target_inverse_ranks = target_inverse_ranks.max(axis=1)
     best_target_ranks = self.num_community - best_target_inverse_ranks
     rounded_best_target_ranks = np.rint(best_target_ranks).astype(int)
     return rounded_best_target_ranks
Example #23
def evaluate_one_comparison(data, prior_count, pair, rel_rank_ZV):
    """
    Internal function: Evaluates the rank score of all proteins in one comparison.
    The rank score is small for proteins with large positive logarithmic fold change values and proteins whose count changes from zero to a non-zero value.
    The rank score is close to one for proteins with large negative logarithmic fold change values and proteins whose count changes from a non-zero value to zero.

    Parameters
    ----------
    data : ndarray
        NumPy array with the analyzed protein counts.
    prior_count : float, optional
        This count is added to all actual protein counts when the logarithmic fold
        changes are computed. It attenuates the impact of proteins with low counts.
        The default is 2.
    pair : list with two integers
        The integers define the columns involved in the comparison. The first element
        is the reference column, the second element is the column with the modified
        condition.
    rel_rank_ZV : float
        Rank score assigned to proteins whose count changes from zero to a non-zero
        value; proteins changing from a non-zero value to zero get 1 - rel_rank_ZV.

    Returns
    -------
    s : ndarray
        Rank score of all proteins (for the given comparison).
    lfc : ndarray
        Logarithmic fold change of all proteins (for the given comparison).
    """

    # compute the fraction of VV, VZ, ZV, and ZZ pairs for the given pair of columns
    VV_proteins = np.where((data[:, pair[0]] > 0)
                           & (data[:, pair[1]] > 0) == True)[0]
    #xVV = VV_proteins.size / float(data.shape[0])
    VZ_proteins = np.where((data[:, pair[0]] > 0)
                           & (data[:, pair[1]] == 0) == True)[0]
    #xVZ = VZ_proteins.size / float(data.shape[0])
    ZV_proteins = np.where((data[:, pair[0]] == 0)
                           & (data[:, pair[1]] > 0) == True)[0]
    #xZV = ZV_proteins.size / float(data.shape[0])
    ZZ_proteins = np.where((data[:, pair[0]] == 0)
                           & (data[:, pair[1]] == 0) == True)[0]
    #xZZ = ZZ_proteins.size / float(data.shape[0])
    score_vector = np.zeros(data.shape[0])
    score_vector[:] = -1
    lfc_vals = np.zeros(data.shape[0])
    for row in range(data.shape[0]):
        if data[row, pair[1]] > 0 and data[row, pair[0]] > 0:
            lfc_vals[row] = np.log2(
                float(prior_count + data[row, pair[1]]) /
                (prior_count + data[row, pair[0]]))
        else:
            lfc_vals[row] = np.nan
    rank_lfc = mstats.rankdata(
        np.ma.masked_invalid(-lfc_vals)
    )  # the largest lfc values get the smallest rank; inf and nan entries get zero rank
    score_vector[VV_proteins] = (rank_lfc[VV_proteins] -
                                 0.5) / VV_proteins.size
    score_vector[ZV_proteins] = rel_rank_ZV
    score_vector[VZ_proteins] = 1 - rel_rank_ZV
    score_vector[
        ZZ_proteins] = -1  # by setting negative score here, we mark that this is a ZZ pair
    return score_vector, lfc_vals
Example #24
def nanrankdata(arr):
    '''
    Ranks data ignoring NaN values
    '''
    if np.all(np.isnan(arr)):
        return arr.copy()

    ranks = mstats.rankdata(np.ma.masked_invalid(arr))
    ranks[ranks == 0] = np.nan

    return ranks
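A small usage sketch of nanrankdata (hypothetical input, assuming numpy and scipy.stats.mstats are imported as in the source); NaN entries keep NaN instead of the 0 that mstats.rankdata assigns to masked values:

import numpy as np

print(nanrankdata(np.array([3.0, np.nan, 1.0, 2.0])))   # [ 3. nan  1.  2.]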
Example #25
    def testMcmcCoplulaFit(self):
        print(
            "--------------------- MCMC COPULA FIT TEST --------------------------"
        )
        # Load matlab data set
        stocks = np.loadtxt(dataDir + 'stocks.csv', delimiter=',')
        x = stocks[:, 0]
        y = stocks[:, 1]

        # Rank transform the data
        u = rankdata(x) / (len(x) + 1)
        v = rankdata(y) / (len(y) + 1)

        # Fit t copula and gaussian copula
        thetag0 = [0.2]
        g_copula = gc()
        theta_g_fit_mle = g_copula.fitMLE(u,
                                          v,
                                          *thetag0,
                                          bounds=((-0.99, 0.99), ))[0]
        aic_g_fit_mle = g_copula._AIC(u, v, 0, *theta_g_fit_mle)
        theta_g_fit_mcmc = g_copula.fitMcmc(u,
                                            v,
                                            *thetag0,
                                            bounds=((-0.99, 0.99), ),
                                            ngen=500,
                                            nburn=200)[0]
        aic_g_fit_mcmc = g_copula._AIC(u, v, 0, *theta_g_fit_mcmc)
        print("Gaussian copula MLE paramter [rho]: " + str(theta_g_fit_mle) +
              " AIC =" + str(aic_g_fit_mle))
        print("Gaussian copula MCMC paramter [rho]: " + str(theta_g_fit_mcmc) +
              " AIC =" + str(aic_g_fit_mcmc))

        # check that the MLE and MCMC solutions agree in this case
        self.assertAlmostEqual(theta_g_fit_mle[0],
                               theta_g_fit_mcmc[0],
                               delta=tol)

        # check against the expected value
        true_rho_ranked = 0.7387
        self.assertAlmostEqual(theta_g_fit_mcmc[0], true_rho_ranked, delta=tol)
Example #26
def friedman(results, alpha=0.05):
    """
    Performs the Friedman test on the given results determining if there is a difference between configurations

    results: list of list of numbers representing results for parameters for a number of problems
    alpha: 1 - confidence of outcome
    """

    ranks = rankdata(array(results), axis=1)
    (k, n) = ranks.shape
    T = (n - 1) * sum((sum(ranks) - k * (n + 1) / 2.0) ** 2) / sum(sum(ranks ** 2 - (n + 1) * (n + 1) / 4.0))
    return T, chi2.ppf(1 - alpha, n - 1)
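A hypothetical usage of the friedman function above (toy accuracy table with one row per problem and one column per configuration, assuming the imports from the snippet's source); the null of equal performance is rejected when T exceeds the chi-squared critical value:

results = [[0.82, 0.75, 0.60],
           [0.79, 0.71, 0.65],
           [0.90, 0.80, 0.72]]
T, critical = friedman(results, alpha=0.05)
print(T, critical)   # reject equal performance if T > critical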
    def select_attributes(data, z_threshold=1):
        """
        Function selects "over"-expressed attributes for items with Mann-Whitney
        U test.

        Parameters
        ----------
        data : Orange.data.Table
            Tabular data with gene expressions
        z_threshold : float
            The threshold for selecting the attribute. For each item the
            attributes with z-value above this value are selected.

        Returns
        -------
        :obj:`list`
            Sets of selected attributes for each cell
        """
        if len(data.X) <= 1:
            return [], []
        # rank data
        data_ge_ranked = rankdata(data.X, axis=0)

        # compute U, mu, sigma
        n = data_ge_ranked.shape[0]
        n2 = n - 1
        u = data_ge_ranked - 1
        mu = n2 / 2
        sigma = np.zeros(data_ge_ranked.shape[1])
        for i in range(data_ge_ranked.shape[1]):
            _, counts = np.unique(data_ge_ranked[:, i], return_counts=True)
            sigma[i] = np.sqrt(
                1 * n2 / 12 * ((n + 1) - np.sum((counts ** 3 - counts)) /
                               (n * (n - 1))))

        # compute z
        z = (u - mu) / (sigma + 1e-16)

        # gene selection
        attributes_np = np.array([
            a.attributes.get("Entrez ID") for a in data.domain.attributes])
        attributes_sets = [
            set(map(str, set(attributes_np[row > z_threshold]) - {None}))
            for row in z]
        # map to string was added since there seems to be no guarantee that
        # Entrez ID is a string.

        # pack z values to data table
        # take only attributes in new domain
        domain = Domain([x for x in data.domain.attributes])
        z_table = Table(domain, z)

        return attributes_sets, z_table
Example #28
 def rank(self, method=0):
     """!
     @brief rank transform the data
     @param method <b>int</b>
            if == 0: use standard rank transform,
            else: use CDF data transform.
     """
     self.rankMethod = method
     if method == 0:
         self.u = rankdata(self.x) / (len(self.x) + 1)
         self.v = rankdata(self.y) / (len(self.y) + 1)
     else:
         # use alternate CDF rank transform method
         kde_x = gaussian_kde(self.x)
         kde_y = gaussian_kde(self.y)
         u_hat = np.zeros(len(self.x))
         v_hat = np.zeros(len(self.y))
         for i, (xp, yp) in enumerate(zip(self.x, self.y)):
             u_hat[i] = kde_x.integrate_box_1d(-np.inf, xp)
             v_hat[i] = kde_y.integrate_box_1d(-np.inf, yp)
         self.u = u_hat
         self.v = v_hat
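The standard rank transform used above maps each sample to a pseudo-observation in (0, 1); a tiny standalone check:

import numpy as np
from scipy.stats import rankdata

x = np.array([2.0, 0.5, 1.7, 3.1])
print(rankdata(x) / (len(x) + 1))   # [0.6 0.2 0.4 0.8]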
def generate_correlation_map(x: np.ndarray,
                             y: np.ndarray,
                             method: str = 'pearson') -> np.ndarray:
    """
    Correlate each row in matrix X against each row in matrix Y.

    Parameters
    ----------
    x
      Shape N X T.
    y
      Shape M X T.
    method
        Method used to compute the correlation. Must be one of 'pearson' or 'spearman'.

    Returns
    -------
    np.array
      N X M array in which each element is a correlation coefficient.

    """
    if method.lower() not in ['spearman', 'pearson']:
        raise NotImplementedError(
            f'Method {method} not understood, must be one of "pearson", "spearman"'
        )

    if method.lower() == 'spearman':
        x = rankdata(x, axis=1)
        y = rankdata(y, axis=1)

    mu_x = x.mean(axis=1)
    mu_y = y.mean(axis=1)
    n = x.shape[1]
    if n != y.shape[1]:
        raise ValueError('x and y must have the same number of timepoints.')
    s_x = x.std(axis=1, ddof=n - 1)
    s_y = y.std(axis=1, ddof=n - 1)
    cov = np.dot(x, y.T) - n * np.dot(mu_x[:, np.newaxis], mu_y[np.newaxis, :])
    return cov / np.dot(s_x[:, np.newaxis], s_y[np.newaxis, :])
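A hypothetical usage of generate_correlation_map: each entry of the result is the correlation between one row of x and one row of y:

import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=(4, 50))    # 4 signals, 50 timepoints
y = rng.normal(size=(3, 50))    # 3 signals, 50 timepoints
corr = generate_correlation_map(x, y, method='spearman')
print(corr.shape)               # (4, 3)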
    def rank_all_stocks(self, metrics, update=True):
        # We load the fundamental indicators from a csv file (if update==False),
        # or from the export_csv_data method (if update==True)
        if update:
            self.export_csv_data(metrics)

        try:
            with open('Statistics_{}.csv'.format(self.sector), 'r') as f:
                Stats = []
                reader = csv.reader(f, delimiter=',')
                for row in reader:
                    Stats.append(row)
        except:
            print(
                'Cannot find file... probably does not exist in the directory')

        #
        ranks = [c[0] for c in Stats]
        # For each fundamental indicator in the csv file, if that indicator is
        # in our list of metrics, we store its value for each stock and rank
        # the stocks on it.
        for i1 in range(len(Stats[0])):
            if Stats[0][i1] in metrics:
                array = np.array([float(c[i1]) for c in Stats[1:]])
                # We rank each stock for this fundamental indicator
                rank = mstats.rankdata(np.ma.masked_invalid(array))
                rank[rank == 0] = np.nan

                # The direction of the rank depends on whether we want the value
                # of the fundamental indicator to be high or low
                if Stats[0][i1] in best:
                    if best[Stats[0][i1]] == 'H':
                        rank = np.nanmax(rank) + 1 - rank

                rank = list(rank)
                #                for i in range(len(rank)):
                #                    if not np.isnan(rank[i]):
                #                        rank[i] = int(rank[i])

                # We stack the ranks for all fundamental indicators in the ranks matrix
                rank = [Stats[0][i1]] + rank
                ranks = np.vstack((ranks, rank))

        # We extract the rank of each stock for each fundamental indicator in a csv file named
        # 'Rank_?.csv', where "?" is the name of the sector
        with open('Rank_{}.csv'.format(self.sector), "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(np.transpose(ranks))
Example #32
 def test_ranking(self):
     x = ma.array([0, 1, 1, 1, 2, 3, 4, 5, 5, 6])
     assert_almost_equal(mstats.rankdata(x), [1, 3, 3, 3, 5, 6, 7, 8.5, 8.5, 10])
     x[[3, 4]] = masked
     assert_almost_equal(mstats.rankdata(x), [1, 2.5, 2.5, 0, 0, 4, 5, 6.5, 6.5, 8])
     assert_almost_equal(mstats.rankdata(x, use_missing=True), [1, 2.5, 2.5, 4.5, 4.5, 4, 5, 6.5, 6.5, 8])
     x = ma.array([0, 1, 5, 1, 2, 4, 3, 5, 1, 6])
     assert_almost_equal(mstats.rankdata(x), [1, 3, 8.5, 3, 5, 7, 6, 8.5, 3, 10])
     x = ma.array([[0, 1, 1, 1, 2], [3, 4, 5, 5, 6]])
     assert_almost_equal(mstats.rankdata(x), [[1, 3, 3, 3, 5], [6, 7, 8.5, 8.5, 10]])
     assert_almost_equal(mstats.rankdata(x, axis=1), [[1, 3, 3, 3, 5], [1, 2, 3.5, 3.5, 5]])
     assert_almost_equal(mstats.rankdata(x, axis=0), [[1, 1, 1, 1, 1], [2, 2, 2, 2, 2]])
Example #33
def calc_percentile(values):
    """Calculates the percentile values for a sample of numbers.

    NOTE:
        There are multiple ways to calculate the percentile. The problem lies in
        how to deal with duplicate entries.

        We will use the simplest definition of percentile, which is the rank of
        an entry divided by the total number of entries. When calculating the
        rank, we take the mean ranks of all entries with the same value.

        The calculated percentiles are in [0, 100).
    """
    return (rankdata(values) - 1) / len(values) * 100
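A quick check of calc_percentile on a toy sample (assuming rankdata is imported as in the source): tied values share the mean rank, so they share the same percentile:

print(calc_percentile([1, 2, 2, 4]))
# ranks [1, 2.5, 2.5, 4] -> percentiles [0, 37.5, 37.5, 75]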
Example #34
    def __prepare_inputs(self, inputs):
        scaled_inputs = self.scaler.transform(inputs)
        standardized_inputs = self.standard_scaler.transform(inputs)
        outputs = np.concatenate((scaled_inputs, standardized_inputs), axis=1)
        if self.log and np.sum(inputs <= 0) == 0:
            outputs = np.concatenate((outputs, np.log(inputs)), axis=1)
        if self.sqrt and np.sum(inputs <= 0) == 0:
            outputs = np.concatenate((outputs, np.sqrt(inputs)), axis=1)
        if self.square:
            outputs = np.concatenate((outputs, np.square(inputs)), axis=1)
        if self.percentile:
            outputs = np.concatenate((outputs, rankdata(inputs, axis=0) / len(inputs)), axis=1)

        inputs = outputs.copy()
        return inputs
def get_ranks(np_array):
    print "original array has {0} total and {1} valid elements".format(
        np_array.size, np_array.size-np.count_nonzero(np.isnan(np_array)))

    #temp = np_array.argsort()
    #rank_array = np.empty(len(np_array), int)
    #rank_array[temp] = np.arange(len(np_array))

    rank_array = mstats.rankdata(np.ma.masked_invalid(np_array))
    rank_array[rank_array==0] = np.nan
    rank_array -= 1
    print "rank array has {0} max rank and {1} valid ranks".format(
        np.nanmax(rank_array),
        rank_array.size-np.count_nonzero(np.isnan(rank_array)))
    return rank_array
Example #36
 def rank(self, method=0):
     """!
     @brief Compute ranks of the data
     @param method <b>int</b>
            if == 0: use standard rank transform,
            else: use CDF data transform.
     @return (u, v) tuple of <b>np_1darray</b> ranked samples
     """
     if method == 0:
         u = rankdata(self.x) / (len(self.x) + 1)
         v = rankdata(self.y) / (len(self.y) + 1)
     else:
         # use alternate CDF rank transform method
         kde_x = gaussian_kde(self.x)
         kde_y = gaussian_kde(self.y)
         u_hat = np.zeros(len(self.x))
         v_hat = np.zeros(len(self.y))
         for i, (xp, yp) in enumerate(zip(self.x, self.y)):
             u_hat[i] = kde_x.integrate_box_1d(-np.inf, xp)
             v_hat[i] = kde_y.integrate_box_1d(-np.inf, yp)
         u = u_hat
         v = v_hat
     self.UU, self.VV = u, v
     return u, v
Example #38
def post_hoc(results, alpha, stat):
    '''
    Performs a post-hoc test on the given results to determine the indices of the configurations
    that are not statistically worse than the best configuration

    results: list of list of numbers representing results for parameters for a number of problems
    alpha: 1 - confidence of outcome
    stat: statistic obtained from friedman test
    '''

    ranks = rankdata(array(results), axis=1)
    (k,n) = ranks.shape
    rank_sum = list(sum(ranks))
    best = min(rank_sum)
    rhs = ((2*k*(1-stat/(k*(n-1)))*sum(sum(ranks**2 - (n+1)*(n+1) / 4.)))/((k-1)*(n-1)))**0.5 * t.ppf(1-alpha/2, n-1)
    return [rank_sum.index(i) for i in rank_sum if abs(best - i) < rhs]
Example #39
def _mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic
    Missing values in `x` and/or `y` are discarded.
    Parameters
    ----------
    x : ndarray,
        Input, vector or observations x features matrix
    y : ndarray,
        Input, vector or observations x features matrix. If matrix, must have
        same number of features as x
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.
    Returns
    -------
    statistic : float
        The Mann-Whitney statistic
    approx z : float
        The normal-approximated z-score for U.
    pvalue : float
        Approximate p-value assuming a normal distribution.
    """
    if x.ndim == 1 and y.ndim == 1:
        x, y = x[:, np.newaxis], y[:, np.newaxis]
    ranks = rankdata(np.concatenate([x, y]), axis=0)
    nx, ny = x.shape[0], y.shape[0]
    nt = nx + ny
    U = ranks[:nx].sum(0) - nx * (nx + 1) / 2.

    mu = (nx * ny) / 2.
    u = np.amin([U, nx*ny - U], axis=0)  # get smaller U by convention

    sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12.

    for i in np.arange(len(sigsq)):
        ties = count_tied_groups(ranks[:, i])
        sigsq[i] -= sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12.
    sigsq *= nx * ny / float(nt * (nt - 1))

    if use_continuity:
        z = (U - 1 / 2. - mu) / np.sqrt(sigsq)
    else:
        z = (U - mu) / np.sqrt(sigsq)

    prob = erfc(abs(z) / np.sqrt(2))
    return np.vstack([u, z, prob]).T
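A hypothetical call of _mannwhitneyu on two feature matrices, assuming its helpers (rankdata, count_tied_groups, erfc) are imported as in the source; each output row holds (U, z, p) for one feature:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(0.0, 1.0, size=(30, 2))
y = rng.normal(0.5, 1.0, size=(30, 2))
print(_mannwhitneyu(x, y))   # shape (2, 3): one (U, z, p) row per feature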
Example #40
def empirical_copula_uniform(x):
    '''
	Evaluate the empirical copula-uniform dual representation of x as rank(x)/n.

	Parameters
	----------
	x : (n, d) np.array
		n i.i.d. draws from a d-dimensional distribution.

	'''
    mask = np.isnan(x).copy()
    valid_mask = np.logical_not(mask).astype(int)
    r = rankdata(x, axis=0)
    np.copyto(r, np.nan, where=mask)
    non_nan_ns = valid_mask.astype(float).sum(axis=0)
    u = r / non_nan_ns

    return u
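A small usage sketch of empirical_copula_uniform (hypothetical data, assuming the imports used above); NaN positions stay NaN and ranks are divided by the per-column count of non-NaN entries:

import numpy as np

x = np.array([[0.1, 10.0],
              [0.5, np.nan],
              [0.3, 7.0]])
print(empirical_copula_uniform(x))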
Example #41
def _correlation_star(data: np.ndarray, method: str) -> np.ndarray:
    """
    Calculates the correlation* ((S* i,j) or (R* i,j)) similarity component of the YS1 and YR1 dissimilarity metrics. \
    For every pair of samples (i,j), returns (corr(i,j) + 1) / 2, \
    where corr is either the Pearson correlation or Spearman correlation.

    :param data: an n-by-p numpy array of n samples by p features, to calculate pairwise distance on.
    :type data: np.ndarray
    :param method: the correlation metric to use when calculating correlation*
    :type method: 'pearson' or 'spearman'
    :return: an n-by-n numpy array of correlation* similarity scores.
    :rtype: np.ndarray
    """
    assert isinstance(method, str), f"'method' must be a string. Instead got {type(method)}."
    method = method.lower()
    assert method in {'spearman', 'pearson'}, f"'method' must be 'spearman' or 'pearson'. Instead got '{method}'."
    if method == 'spearman':
        return (np.corrcoef(rankdata(data, axis=1)) + 1) / 2
    return (np.corrcoef(data) + 1) / 2
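A hypothetical usage of _correlation_star: the result is an n-by-n matrix of (corr + 1) / 2 values, so every entry lies in [0, 1]:

import numpy as np

data = np.random.default_rng(1).normal(size=(5, 20))   # 5 samples, 20 features
S = _correlation_star(data, 'spearman')
print(S.shape, float(S.min()) >= 0.0, float(S.max()) <= 1.0)   # (5, 5) True True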
Example #42
def ecdf(data):
    '''
    Computes the empirical CDF of a data array.

    Parameters
    ----------
    data: array of numbers
        The data from which to compute the CDF.

    Returns
    -------
    Two arrays corresponding to the x and y axes.
    '''
    
    obs = np.asanyarray(data)
    rank = mstats.rankdata(obs)

    return_val_x = np.unique(obs)
    return_val_y = np.unique(rank) / len(rank)

    return (return_val_x, return_val_y)
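A minimal usage sketch of ecdf on an untied toy sample, assuming numpy and scipy.stats.mstats are imported as in the source:

xs, ys = ecdf([3, 1, 2, 5])
print(xs)   # [1 2 3 5]              unique sorted values
print(ys)   # [0.25 0.5  0.75 1.  ]  ranks divided by n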
def mann_whitney_u(x, y):
    x = asarray(x)
    y = asarray(y)
    n1 = len(x)
    n2 = len(y)
    ranked = rankdata(np.concatenate((x,y)))
    rankx = ranked[0:n1]       # get the x-ranks
    # ranky = ranked[n1:]        # the rest are y-ranks
    u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0)  # calc U for x
    u2 = n1*n2 - u1                            # remainder is U for y
    #bigu = max(u1,u2)
    smallu = min(u1,u2)
    # T = np.sqrt(tiecorrect(ranked))  # correction factor for tied scores
    T = tiecorrect(ranked)
    #print T
    if T == 0:
        #raise ValueError('All numbers are identical in amannwhitneyu')
        z = 0
    else:
        sd = np.sqrt(T*n1*n2*(n1+n2+1)/12.0)
        z = (smallu-n1*n2/2.0) / sd  # normal approximation for prob calc
    
    return u1, u2, z, distributions.norm.sf(abs(z))  # (1.0 - zprob(z))
Example #44
 def __init__(self, y, regime, permutations=999):
     ranks = rankdata(y, axis=0)
     self.ranks = ranks
     n, k = y.shape
     ranks_d = ranks[:, range(1, k)] - ranks[:, range(k - 1)]
     self.ranks_d = ranks_d
     regimes = sp.unique(regime)
     self.regimes = regimes
     self.total = sum(abs(ranks_d))
     self.max_total = sum([abs(i - n + i - 1) for i in range(1, n + 1)])
     self._calc(regime)
     self.theta = self._calc(regime)
     self.permutations = permutations
     if permutations:
         np.perm = np.random.permutation
         sim = np.array([self._calc(
             np.perm(regime)) for i in xrange(permutations)])
         self.theta.shape = (1, len(self.theta))
         sim = np.concatenate((self.theta, sim))
         self.sim = sim
         den = permutations + 1.
         self.pvalue_left = (sim <= sim[0]).sum(axis=0) / den
         self.pvalue_right = (sim > sim[0]).sum(axis=0) / den
         self.z = (sim[0] - sim.mean(axis=0)) / sim.std(axis=0)
Example #45
def normalize_filter_quantilemap(v,r):
    ranks = rankdata(v)/len(v)
    v2 = mquantiles(r,prob=ranks)
    v3 = savitzky_golay(v2,5,2,1,1)
    return v3
#!/home/paulk/software/bin/python
from __future__ import division
from sys import argv,exit,stderr,stdout
from scipy import *
from scipy.stats.mstats import rankdata

ranks = dict()
data = dict()
f = open(argv[1])
unwanted = ['I','S','\t']
for row in f:
	if row[0] in unwanted: continue
	l = row.strip().split('\t')
	ranks[l[1]] = rankdata(map(float,l[2:]))
f.close()


c = 0
for r in ranks:
	if c > 20: break
	print "\t".join([r] + map(str,list(ranks[r])))
	c += 1
Example #47
def prioritize_gurobi(input_rasters, output_rank_raster, step=0.05,
                      save_intermediate=False, compress='DEFLATE',
                      ol_normalize=False, weights=None, verbose=False,
                      logger=None):
    """ Solve (multiple) maximum coverage problems for a set of input rasters.

    Create a hierarchical spatial prioritization using Gurobi solver to solve
    multiple optimization problems with different budget levels. Each budget
    level must be in range [0.0, 1.0] and corresponds to the area of interest.
    In other words, a value of 0.1 corresponds to the best 10% of the landscape.
    Budget levels will be automatically created using the step value defined by
    the step argument. The Gurobi solver will then solve each problem using the
    representation levels in input_rasters. All the (binary) results are then
    summed together forming a selection frequency. Finally, the selection
    frequency is rescaled into range [0, 1] forming a rank priority raster.

    It is possible to provide a list (vector) of weights for each features.
    These values are used as simple multipliers for each feature when summing
    the values over all features. If provided, the list must match the number
    of input rasters exactly.

    :param input_rasters: List of String paths of input rasters.
    :param output_rank_raster: String path to the rank raster file to be created.
    :param step: numeric value in (0, 1) defining the step length for budget
                 levels.
    :param save_intermediate: should intermediate optimization results be
                              saved?
    :param compress: String compression level used for the output raster.
    :param ol_normalize: Boolean setting OL (Occurrence Level) normalization.
    :param weights: list of weights. Length must match the number of input
                    rasters.
    :param verbose: Boolean indicating how much information is printed out.
    :param logger: logger object to be used.
    """
    # 1. Setup  --------------------------------------------------------------

    all_start = timer()
    load_start = timer()

    if not logger:
        logging.basicConfig()
        llogger = logging.getLogger('optimize_gurobi')
        llogger.setLevel(logging.DEBUG if verbose else logging.INFO)
    else:
        llogger = logger

    if len(input_rasters) < 1:
        llogger.error("Input rasters list cannot be empty")
        sys.exit(1)

    # Check inputs
    assert len(input_rasters) > 0, "Input rasters list cannot be empty"
    assert output_rank_raster != "", "Output raster path cannot be empty"
    try:
        step = float(step)
    except ValueError:
        llogger.error("'step' must be coercible to float")
        sys.exit(1)
    assert step > 0.0 and step < 1.0, "Step argument must be in range (0, 1)"
    # Construct budget levels based on the step provided. 0.0 (nothing) and
    # 1.0 (everything) are not needed.
    budget_levels = np.linspace(0.0 + step, 1.0, int(round(1 / step)))

    # 2. Pre-processing  -----------------------------------------------------

    llogger.info(" [** PRE-PROCESSING **]")

    # Create a sum array, i.e. sum all (occurrence level normalized) raster
    # values in input_rasters together.  NOTE: sum_raster() returns a masked
    # array.
    sum_array_masked = spatutils.sum_raster(input_rasters, olnormalize=True,
                                            weights=weights, logger=llogger)
    # To speed up things, do 2 things: 1) save mask (NoData) and get rid of
    # NoData cells for now, 2) flatten the array.
    (height, width) = sum_array_masked.shape
    mask = ma.getmask(sum_array_masked)
    # Get all the non-masked data as a 1-D array.
    sum_array = ma.compressed(sum_array_masked)
    # Create equal cost array
    cost = np.ones(sum_array.size)

    load_end = timer()
    load_elapsed = round(load_end - load_start, 2)
    llogger.info(" [TIME] Pre-processing took {} sec".format(load_elapsed))

    # 3. Optimize  -----------------------------------------------------------

    opt_start = timer()
    llogger.info(" [** OPTIMIZING **]")
    blevels_str = ", ".join([str(level) for level in budget_levels])
    llogger.info(" [NOTE] Target budget levels: {}".format(blevels_str))

    # Construct a ndarray (matrix) that will hold the selection frequency.
    # Populate it with 0s
    sel_freq = np.full((height, width), 0)

    # Define budget and optimize_maxcover
    for i, blevel in enumerate(budget_levels):
        no_blevel = i + 1
        prefix = utils.get_iteration_prefix(no_blevel, len(budget_levels))

        budget = blevel * cost.size
        llogger.info("{} Optimizing with ".format(prefix) +
                     "budget level {}...".format(blevel))
        x = optimize_maxcover(cost, budget, sum_array, verbose=verbose,
                              logger=llogger)
        # Create a full (filled with 0s) raster template
        x_selection = np.full((height, width), 0.0)
        # Place the values of result array (it's binary = {0, 1}) into template
        # elements that are False in the original mask
        x_selection[~mask] = x
        # Add the selected elements (planning units) into the selection
        # frequency matrix
        sel_freq += x_selection

        # Get the raster profile from the input raster files
        profile = spatutils.get_profile(input_rasters, logger=llogger)

        if save_intermediate:
            # Replace the real nodata values with a proper NoData value
            nodata_value = 255
            x_selection[mask] = nodata_value
            # Create a masked array
            output_x = ma.masked_values(x_selection, nodata_value)
            # Construct the output raster name
            btoken = str(blevel).replace('.', '_')
            # Construct a subdir name based on the basename
            output_bname = os.path.basename(output_rank_raster).split('.')[0]
            output_subir = os.path.join(os.path.dirname(output_rank_raster),
                                        output_bname)
            if not os.path.exists(output_subir):
                os.makedirs(output_subir)
            output_raster = os.path.join(output_subir,
                                         "budget_level_{}.tif".format(btoken))
            # Write out the data
            llogger.debug(" Writing intermediate output to " +
                          "{}".format(output_raster))
            profile.update(dtype=rasterio.uint8, compress=compress,
                           nodata=nodata_value)

            with rasterio.open(output_raster, 'w', **profile) as dst:
                dst.write_mask(mask)
                dst.write(output_x.astype(np.uint8), 1)

    opt_end = timer()
    opt_elapsed = round(opt_end - opt_start, 2)
    llogger.info(" [TIME] Optimization took {} sec".format(opt_elapsed))

    # 4. Rank values ---------------------------------------------------------

    post_start = timer()
    llogger.info(" [** POST-PROCESSING **]")

    llogger.info(" [1/3] Ranking selection frequencies")
    # Use 0s from summation as a mask
    rank_array = ma.masked_values(sel_freq, 0.0)
    rank_array = rankdata(rank_array)

    # 5. Rescale data into range [0, 1] --------------------------------------

    llogger.info(" [2/3] Rescaling ranks")
    rank_array = spatutils.normalize(rank_array)

    # 6. Prepare and write output --------------------------------------------

    llogger.info(" [3/3] Writing output to {}".format(output_rank_raster))
    # Replace the real nodata values with a proper NoData value
    nodata_value = -3.4e+38
    rank_array[mask] = nodata_value
    # Create a masked array
    rank_array = ma.masked_values(rank_array, nodata_value)
    profile.update(dtype=rasterio.float64, compress=compress,
                   nodata=nodata_value)

    with rasterio.open(output_rank_raster, 'w', **profile) as dst:
        dst.write_mask(mask)
        dst.write(rank_array.astype(np.float64), 1)

    post_end = timer()
    post_elapsed = round(post_end - post_start, 2)
    llogger.info(" [TIME] Post-processing took {} sec".format(post_elapsed))

    all_end = timer()
    all_elapsed = round(all_end - all_start, 2)
    llogger.info(" [TIME] All processing took {} sec".format(all_elapsed))
Example #48
    def run_full_analysis(self, inputfile, user_params=None):
        """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report) """
        self.logger.info("Starting full motif analysis")
        self.logger.info("Using temporary directory {0}".format(mytmpdir()))
    
        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)
        
        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.info("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.info("Using multiprocessing")

        self.params = params
        #self.weird = params["weird_option"]

        background = [x.strip() for x in params["background"].split(",")]
        
        self.logger.info("Parameters:")
        for param, value in params.items():
            self.logger.info("  %s: %s" % (param, value))

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.info("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except:
            # Leave it to BED
            pass

        if self.input_type == "FASTA":
            for bg in background:
                if not bg in FA_VALID_BGS:
                    self.logger.info("Input type is FASTA, can't use background type '%s'" % bg)
            background = [bg for bg in background if bg in FA_VALID_BGS]
            
        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            if not os.path.exists(index_dir):
                self.logger.error("No index found for genome %s! Has GimmeMotifs been configured correctly and is the genome indexed?" % params["genome"])
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)    # bed-specific

            # Check for valid background
            for bg in background:
                if not bg in BED_VALID_BGS:
                    self.logger.info("Input type is BED, can't use background type '%s'" % bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]
    
        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        self.max_time = None
        max_time = None
        # Maximum time?
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except:
                self.logger.info("Could not parse max_time value, setting to no limit")
                self.max_time = None

            if max_time > 0:
                self.logger.info("Time limit for motif prediction: %0.2f hours" % max_time)
                max_time = 3600 * max_time
                self.max_time = max_time
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.info("Invalid time limit for motif prediction, setting to no limit")
                self.max_time = None
        else:
            self.logger.info("No time limit for motif prediction")
            
        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"], params["width"], params["fraction"], params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            extend = (lwidth - width) / 2
            genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"], ignore_missing=True)
        
        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])
        
            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0]) 
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warn("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")
        
        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        # Map each available tool to True/False depending on whether it was selected in params["tools"]
        tools = dict([(x.strip(), x.strip() in [y.strip() for y in params["tools"].split(",")]) for x in params["available_tools"].split(",")])
    
        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs
        analysis = params["analysis"]
        """ Predict motifs, input is a FASTA-file"""
        self.logger.info("Starting motif prediction (%s) using %s" % 
            (analysis, ", ".join([x for x in tools.keys() if tools[x]])))

        # use the background with the highest priority (lowest BG_RANK value) for significance testing
        bg_file = self.bg_file["fa"][sorted(background, key=lambda bg: BG_RANK[bg])[0]]
        self.logger.info("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis, params["genome"], params["use_strand"], self.prediction_bg, tools, self.job_server(), logger=self.logger, max_time=self.max_time, fg_file=self.validation_fa, bg_file=bg_file)
    
        motifs = result.motifs
        self.logger.info("Predicted %s motifs, written to %s" % (len(motifs), self.predicted_pfm))
        
        if len(motifs) == 0:
            self.logger.info("No motifs found. Done.")
            sys.exit()
        
        # Write stats output to file
        f = open(self.stats_file, "w")
        stat_keys = result.stats.values()[0].keys()
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))
        print result.stats
        for motif in motifs[:]:  # iterate over a copy so that removing motifs below is safe
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
                motifs.remove(motif)
        f.close()
    
        self.motifs_with_stats = motifs

        f = open(self.ranks_file, "w")
        tools = dict((m.id.split("_")[0], 1) for m in motifs).keys()  # unique tool names that produced motifs
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = best_motif.keys()
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]
            
            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))
        f.close()

        #self.logger.debug("RANK: %s" % stat)
        #self.logger.debug("\t".join([str(x) for x in names]))
        #self.logger.debug("\t".join([str(x) for x in vals]))
        #self.logger.debug("\t".join([str(x) for x in rank]))

        # Determine significant motifs
        nsig = 0 
        f = open(self.significant_pfm, "w")
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
        f.close()        
        self.logger.info("%s motifs are significant, written to %s" % (nsig, self.significant_pfm))
        
        if nsig == 0:
            self.logger.info("No significant motifs found. Done.")
            sys.exit()
        
        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa, self.bg_file["fa"][bg], self.bg_file["roc"][bg])
        
        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm, self.outdir, params["cluster_threshold"])
        
        # Determine best motif in cluster
        num_cluster, best_id = self._determine_best_motif_in_cluster(clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)
        
        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #    - Statistics
        #    - Location plots (histogram)
        
        # Stars
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp, logger=self.logger, job_server=self.server, fg_file = self.validation_fa, bg_file = bg_file) 
        p.add_motifs(("Clustering",  (pwmfile_to_motifs(self.final_pwm), "","")))
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        all_stats = {
            "mncp": [2, 5, 8],                
            "roc_auc": [0.6, 0.75, 0.9],    
            "maxenr": [10, 20, 30],         
            "enr_fdr": [4, 8, 12],         
            "fraction": [0.4, 0.6, 0.8],    
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }
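        # Assumed reading of the thresholds above (not verified against the star()
        # implementation): each metric maps to three ascending cut-offs, and star()
        # presumably returns how many of them a motif's value reaches (0-3); the
        # star rating assigned further down averages this over all metrics and
        # rounds to the nearest integer.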

        
        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)
        
        # Location plots
        self.logger.info("Creating localization plots")
        motifs = pwmfile_to_motifs(self.final_pwm)
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])
    
            s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report    
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
        self._create_text_report(self.final_pwm, background)
        self.logger.info("Open %s in your browser to see your results." % (self.motif_report))
        
        if not(params["keep_intermediate"]):
            
            self.logger.info("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
            shutil.rmtree(self.tmpdir)

        self.logger.info("Done")
Beispiel #49
0
def main(folder):

    read_scores = {}
    read_ranks = {}
    read_lengths = {}
    read_info_content = {}

    final_scores = {}
    final_ranks = {}

    num_tests = {}

    filenames = []

    num_ranks = 0

    # get general info
    tool_0 = os.listdir(folder)[0]
    pkl_file = open(os.path.join(folder, tool_0), 'rb')
    (reads, results) = pickle.load(pkl_file)
    for filename, values in reads.iteritems():
        read_scores[filename] = {}
        read_ranks[filename] = {}
        read_lengths[filename] = values["length"]
        read_info_content[filename] = values["info_content"]/float(values["length"])
        filenames.append(filename)

    tools = os.listdir(folder)

    # get specific info
    for tool in tools:
        pkl_file = open(os.path.join(folder, tool), 'rb')
        (reads, results) = pickle.load(pkl_file)
        final_scores[tool] = [0] * 4
        final_ranks[tool] = [0] * 4

        num_tests[tool] = len(results)
        for filename, values in results.iteritems():

            read_scores[filename][tool] = values["scores"]
            for i in range(len(values["scores"])):
                final_scores[tool][i] += values["scores"][i]

    for filename, tool_scores in read_scores.iteritems():

        # We only rank on the ones that didn't error for anyone
        if len(tool_scores) < len(tools):
            continue
        num_ranks += 1

        raw_scores = [(tool, scores) for tool, scores in tool_scores.iteritems()]
        # rank each score type across tools; use 1 - score so the best (highest) score gets rank 1
        ranks = [rankdata([1 - scores[score] for _, scores in raw_scores])
                 for score in range(4)]
        for p, (tool, _) in enumerate(raw_scores):
            for score_type in range(4):    
                final_ranks[tool][score_type] += ranks[score_type][p]

    results = []
    
    print "Tool\ttests\tq\ttc\tcline\tmodeler\tq\ttc\tcline\tmodeler"
    for tool in tools:
        results = [tool, num_tests[tool]]
        results.extend([s/float(num_tests[tool]) for s in final_scores[tool]])
        results.extend([s/float(num_ranks) for s in final_ranks[tool]])
        print "%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f" % (
                tuple(results) 
        )

    length_bins = [round(x) for x in stats.mstats.mquantiles([read_lengths[f] for f in filenames], [0, 0.2, 0.4, 0.6, 0.8, 1])]
    info_bins = [round(x, 2) for x in stats.mstats.mquantiles([read_info_content[f] for f in filenames], [0, 0.2, 0.4, 0.6, 0.8, 1])]
    
    #information_traces = []
    #length_traces = []
    traces = []
    #pp.pprint(read_scores)
    for tool in tools:
        chosen_files = [f for f in filenames if tool in read_scores[f].keys()]
        qscore_data = [read_scores[f][tool][0] for f in chosen_files]
        tc_data = [read_scores[f][tool][1] for f in chosen_files]
        labels = ["Sum of pairs" for _ in range(len(qscore_data))]
        labels.extend(["Total column" for _ in range(len(tc_data))])
        all_data = qscore_data
        all_data.extend(tc_data)
       
        traces.append(Box(
                y=all_data,
                x=labels,
                name=tool,
                opacity=0.8 
            )
        )
    layout = Layout(
            boxmode='group',
            showlegend=True,
            title="Sum of pairs (SP) and total column (TC) scores for 300 randomly selected PANDIT alignments",
            titlefont = Font(
                size = 20),
            yaxis=YAxis(
                title='Score',
                showgrid=False,
                autorange = True,
                titlefont = Font(
                    size = 20)
            ),
            legend = Legend(
                font = Font(
                    size = 18
                ),
                yanchor = "bottom",
                xanchor = "left")
    )
    data = Data(traces)
    fig = Figure(data=data, layout=layout)
    py.sign_in("imogen", "mtf1tawct2")
    py.plot(fig)
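For clarity, a minimal, self-contained sketch (hypothetical tools and scores) of the rank aggregation used in main() above: scores lie in [0, 1] with higher being better, so ranking 1 - score gives the best tool rank 1, and the ranks are then summed per tool across test files.

from scipy.stats import rankdata

tool_scores = {"toolA": [0.91, 0.80], "toolB": [0.85, 0.88], "toolC": [0.85, 0.70]}
final_ranks = dict((tool, [0, 0]) for tool in tool_scores)

raw_scores = list(tool_scores.items())
for score_type in range(2):
    ranks = rankdata([1 - scores[score_type] for _, scores in raw_scores])
    for p, (tool, _) in enumerate(raw_scores):
        final_ranks[tool][score_type] += ranks[p]
# final_ranks now holds per-tool rank sums; lower means consistently better.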
Beispiel #50
0
plt.plot([beginTimeOfCloud, endTimeOfCloud], [cloudTopHeight, cloudTopHeight], 'k')

plt.title('Radar reflectivity')
plt.xlabel('Time  [' + time_offset_radar_refl.units + ']')
plt.ylabel('Altitude  [m]')
plt.figure()
#plt.show()

#pdb.set_trace()


# uniformCloudBlock = per-column rank transform of the close-up selection of
# contiguous cloud values.  Each column is a different altitude.
uniformCloudBlock = zeros((lenTimestepRangeCloud,lenLevelRangeCloud))
for col in range(0,lenLevelRangeCloud):
    uniformCloudBlock[:,col] = rankdata(reflCloudBlock[:,col]) /    \
                                MaskedArray.count(reflCloudBlock[:,col])

uniformCloudBlock = masked_where( uniformCloudBlock == 0, uniformCloudBlock )
# I'm not sure if it's appropriate to rank first, then fill.
# So I'm not sure if this is correct.
uniformCloudBlockFilled = filled(uniformCloudBlock,fill_value=0)

plt.clf()
for idx in range(1,5):
    plt.subplot(2,2,idx)
    plt.plot(uniformCloudBlockFilled[:,5],uniformCloudBlockFilled[:,idx],'.')
plt.title('Copula')
plt.figure()

#pdb.set_trace()
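A minimal, self-contained sketch (synthetic numbers, made-up shape) of the per-column transform above: rank each altitude column of a masked array and divide by the number of unmasked values, giving empirical percentiles in (0, 1].

import numpy as np
from scipy.stats.mstats import rankdata

refl = np.ma.masked_invalid(np.array([[1.0, np.nan, 3.0],
                                      [2.0, 5.0,   1.0],
                                      [4.0, 2.0,   np.nan],
                                      [3.0, 1.0,   2.0]]))
uniform = np.zeros(refl.shape)
for col in range(refl.shape[1]):
    # masked entries get rank 0, so they can be re-masked afterwards as above
    uniform[:, col] = rankdata(refl[:, col]) / refl[:, col].count()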
Beispiel #51
0
def main(folder):

    read_scores = {}
    read_ranks = {}
    read_lengths = {}
    read_info_content = {}

    final_scores = {}
    final_ranks = {}

    num_tests = {}

    filenames = []

    num_ranks = 0

    # get general info
    tool_0 = os.listdir(folder)[0]
    pkl_file = open(os.path.join(folder, tool_0), 'rb')
    (reads, results) = pickle.load(pkl_file)
    for filename, values in reads.iteritems():
        read_scores[filename] = {}
        read_ranks[filename] = {}
        filenames.append(filename)

    tools = os.listdir(folder)

    # get specific info
    for tool in tools:
        pkl_file = open(os.path.join(folder, tool), 'rb')
        (reads, results) = pickle.load(pkl_file)
        final_scores[tool] = [0] * 2
        final_ranks[tool] = [0] * 2

        num_tests[tool] = len(results)
        for filename, values in results.iteritems():

            read_scores[filename][tool] = values["scores"]
            for i in range(len(values["scores"])):
                final_scores[tool][i] += values["scores"][i]


    #for filename, tool_scores in read_scores.iteritems():
    #    print filename, tool_scores 

    for filename, tool_scores in read_scores.iteritems():

        # We only rank on the ones that didn't error for anyone
        if len(tool_scores) < len(tools):
            continue
        num_ranks += 1

        raw_scores = [(tool, scores) for tool, scores in tool_scores.iteritems()]
        # rank each score type across tools; use 1 - score so the best (highest) score gets rank 1
        ranks = [rankdata([1 - scores[score] for _, scores in raw_scores])
                 for score in range(2)]
        for p, (tool, _) in enumerate(raw_scores):
            for score_type in range(2):    
                final_ranks[tool][score_type] += ranks[score_type][p]

    results = []
    
    print "Tool\ttests\tsp\ttc\tsp\ttc"
    for tool in tools:
        results = [tool, num_tests[tool]]
        results.extend([s/float(num_tests[tool]) for s in final_scores[tool]])
        results.extend([s/float(num_ranks) for s in final_ranks[tool]])
        print "%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f" % (
                tuple(results) 
        )

    traces = []
    tool_names = {"clustalo.sh":"Clustal Omega",
        "clustalw.sh":"Clustal W",
        "muscle.sh":"MUSCLE",
        "mafft.sh":"MAFFT L-INS-I",
        "prank.sh":"PRANK",
        "prank_f.sh":"PRANK+F",
        "noah_full.sh":"NOAH (full algorithm)",
        "noah_basic.sh":"NOAH (chained guide tree, no FB traversal)",
        "noah_no_fb.sh":"NOAH (guide arc, no FB traversal)",
        "noah_no_arc.sh":"NOAH (chained guide tree, FB traversal)"
    }

    for tool in tools:
        chosen_files = [f for f in filenames if tool in read_scores[f].keys()]
        qscore_data = [read_scores[f][tool][0] for f in chosen_files]
        #tc_data = [read_scores[f][tool][1] for f in chosen_files]
        labels = ["Sum of pairs" for _ in range(len(qscore_data))]
        #labels.extend(["Total column" for _ in range(len(tc_data))])
        all_data = qscore_data
        
        traces.append(Box(
                y=all_data,
                name=tool_names[tool],
                opacity=0.8 
            )
        )
    layout = Layout(
            showlegend=True,
            title="Sum of pairs (SP) scores",
            titlefont = Font(
                size = 20),
            yaxis=YAxis(
                title='Score',
                showgrid=False,
                autorange = True,
                titlefont = Font(
                    size = 20)
            ),
            xaxis=XAxis(
                showticklabels=False),
            legend = Legend(
                font = Font(
                    size = 18
                ),
                yanchor = "bottom",
                xanchor = "left")
    )
    data = Data(traces)
    fig = Figure(data=data, layout=layout)
    py.sign_in("imogen", "mtf1tawct2")
Beispiel #52
0
def sort_pars(results, pars):
    '''
    Sorts the parameters in order of performance according to results
    '''

    return [i[1] for i in sorted(zip(list(sum(rankdata(array(results), axis=1))), pars))]


def ranktransform(X):
    from scipy.stats.mstats import rankdata
    rankX = rankdata(X)
    Xpercentile = rankX / X.shape[0]
    return Xpercentile
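Example usage of ranktransform, as a sketch with made-up numbers: each value is replaced by its rank divided by the sample size, i.e. its empirical percentile.

import numpy as np

X = np.array([0.2, 5.0, 1.3, 0.7])
print(ranktransform(X))   # ranks [1, 4, 3, 2] / 4 -> [0.25, 1.0, 0.75, 0.5]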