def get_lambda_pvalues(self, plam_mat, nlam_mat, bip_set=False):
    """Return the p-values for the :math:`\\Lambda`-motifs in ``nlam_mat``.

    Calculate the p-values for the numbers of observed
    :math:`\\Lambda`-motifs as given in the parameter ``nlam_mat`` for the
    bipartite node layer ``bip_set``. The probabilities for the single
    :math:`\\Lambda`-motifs are given in ``plam_mat``.

    If ``bip_set`` corresponds to the constrained bipartite node set
    (``self.const_set``), the :math:`\\Lambda`-motifs follow a Binomial
    probability distribution. Otherwise, all the node pairs follow the same
    Poisson Binomial probability distribution, which is built from the
    diagonal of ``plam_mat``.

    The p-values are calculated as

    .. math::

        p_{val}(k) = Pr(X >= k) = 1 - Pr(X < k) = 1 - cdf(k) + pmf(k)

    .. note::
        The lower triangular part (including the diagonal) of the returned
        matrix is set to zero.

    :param plam_mat: matrix of :math:`\\Lambda`-motif probabilities
    :type plam_mat: numpy.array
    :param nlam_mat: matrix of observed number of Lambda motifs
    :type nlam_mat: numpy.array
    :param bip_set: selects row-nodes (``True``) or column-nodes (``False``)
    :type bip_set: bool
    :returns: matrix of the p-values for the :math:`\\Lambda`-motifs
    :rtype: numpy.array

    :raise NameError: raise an error if the parameter ``bip_set`` is
        neither ``True`` nor ``False``
    :raise AssertionError: raise an error if, in the Binomial case, the
        first dimensions of the probability matrix and of the matrix with
        the number of :math:`\\Lambda`-motifs are not equal
    """
    # Validate explicitly: a plain truthiness test would silently accept any
    # object and the error branch could never be reached.
    if not isinstance(bip_set, (bool, np.bool_)):
        errmsg = "'" + str(bip_set) + "' " + 'not supported.'
        raise NameError(errmsg)

    # size of the opposite layer; handed to ``binom`` as its first argument
    m = self.num_columns if bip_set else self.num_rows

    n = nlam_mat.shape[0]
    pval_mat = np.zeros(nlam_mat.shape)

    if bip_set != self.const_set:
        # one common Poisson Binomial distribution for all node pairs
        pb = PoiBin(plam_mat[np.diag_indices_from(plam_mat)])
        for i in range(n):
            pval_mat[i, i + 1:] = pb.pval(nlam_mat[i, i + 1:])
    else:
        # if the sets correspond, the matrix dimensions should be the same
        # (explicit raise instead of ``assert`` so it survives ``python -O``)
        if plam_mat.shape[0] != nlam_mat.shape[0]:
            raise AssertionError(
                'plam_mat and nlam_mat must have the same number of rows.')
        # Fill the strict upper triangle one row at a time; the last row has
        # no entries right of the diagonal, hence ``n - 1``.
        for i in range(n - 1):
            observed = nlam_mat[i, i + 1:]
            bn = binom(m, plam_mat[i, i + 1:])
            pval_mat[i, i + 1:] = 1. - bn.cdf(observed) + bn.pmf(observed)
    return pval_mat
def pval_process_worker(self):
    """Consume jobs from the input queue and publish their p-values.

    Items are fetched from ``self.input_queue`` until the string ``"STOP"``
    is received. For every item, element ``1`` is handed to ``PoiBin`` and
    the p-value of element ``2`` (cast to ``int``) is computed. The pair
    ``(item[0], p-value)`` is then put on ``self.output_queue``.

    When the input is exhausted, a final ``"STOP"`` is put on the output
    queue to signal that this worker has finished.
    """
    # ``iter`` with a sentinel keeps calling ``get`` until "STOP" shows up
    pending_jobs = iter(self.input_queue.get, "STOP")
    for job in pending_jobs:
        distribution = PoiBin(job[1])
        observed = int(job[2])
        p_value = distribution.pval(observed)
        # hand the result back together with the job identifier
        result = (job[0], p_value)
        self.output_queue.put(result)
    # tell the consumer of the output queue that this worker is done
    self.output_queue.put("STOP")
def lambda_motifs(self, bip_set, parallel=True, filename=None,
                  delim='\t', binary=True, num_chunks=4):
    """Calculate and save the p-values of the :math:`\\Lambda`-motifs.

    For each node couple in the bipartite layer specified by ``bip_set``,
    calculate the p-values of the corresponding :math:`\\Lambda`-motifs
    according to the link probabilities in the biadjacency matrix of the
    BiCM null model.

    The results can be saved either as a binary ``.npy`` or a
    human-readable ``.csv`` file, depending on ``binary``.

    .. note::

        * The total number of p-values that are calculated is split into
          ``num_chunks`` chunks, which are processed sequentially in order
          to avoid memory allocation errors. Note that a larger value of
          ``num_chunks`` will lead to less memory occupation, but comes at
          the cost of slower processing speed. The split is only applied
          if more than 100 p-values have to be calculated.

        * The output consists of a one-dimensional array of p-values. If
          the bipartite layer ``bip_set`` contains ``n`` nodes, this means
          that the array will contain :math:`\\binom{n}{2}` entries. The
          indices ``(i, j)`` of the nodes corresponding to entry ``k`` in
          the array can be reconstructed using the method
          :func:`BiCM.flat2_triumat_idx`. The number of nodes ``n`` can be
          recovered from the length of the array with
          :func:`BiCM.flat2_triumat_dim`

        * If ``binary == False``, the ``filename`` should end with
          ``.csv``. If ``binary == True``, it will be saved in binary NumPy
          ``.npy`` format and the suffix ``.npy`` will be appended
          automatically. By default, the file is saved in binary format.

        * The machine epsilon is added to every p-value before saving.

    :param bip_set: select row-nodes (``True``) or column-nodes (``False``)
    :type bip_set: bool
    :param parallel: select whether the calculation of the p-values should
        be run in parallel (``True``) or not (``False``).
        NOTE(review): this method never reads ``parallel``; confirm whether
        it is meant to be forwarded to ``get_pvalues_q``.
    :type parallel: bool
    :param filename: name of the output file; if ``None``, the name
        ``'p_values_' + str(bip_set)`` is used (with ``.csv`` appended for
        non-binary output)
    :type filename: str
    :param delim: delimiter between entries in the ``.csv`` file, default is
        ``\\t``
    :type delim: str
    :param binary: if ``True``, the file will be saved in the binary NumPy
        format ``.npy``, otherwise as ``.csv``
    :type binary: bool
    :param num_chunks: number of chunks of p-value calculations that are
        performed sequentially
    :type num_chunks: int

    :raise NameError: raise an error if the parameter ``bip_set`` is
        neither ``True`` nor ``False``
    """
    if not isinstance(bip_set, (bool, np.bool_)):
        errmsg = "'" + str(bip_set) + "' " + 'not supported.'
        raise NameError(errmsg)

    # work on the transposed matrices when the column layer is selected
    if bip_set:
        biad_mat = self.adj_matrix
        bin_mat = self.bin_mat
    else:
        biad_mat = np.transpose(self.adj_matrix)
        bin_mat = np.transpose(self.bin_mat)

    n = self.get_triup_dim(bip_set)
    # -0.1 is an impossible p-value and marks entries not yet calculated
    pval = np.full(shape=(n, ), fill_value=-0.1, dtype=float)

    if n == 1:
        # handle layers of dimension 2 (a single node pair) separately
        nlam = np.dot(bin_mat[0, :], bin_mat[1, :].T)
        plam = biad_mat[0, :] * biad_mat[1, :]
        pb = PoiBin(plam)
        pval[0] = pb.pval(nlam)
    else:
        # if the dimension of the network is too large, split the
        # calculations of the p-values in ``num_chunks`` intervals to avoid
        # memory allocation errors
        if n > 100:
            kk = self.split_range(n, m=num_chunks)
        else:
            kk = [0]
        # calculate p-values for consecutive index intervals [k1, k2)
        for k1, k2 in zip(kk[:-1], kk[1:]):
            nlam = self.get_lambda_motif_block(bin_mat, k1, k2)
            plam = self.get_plambda_block(biad_mat, k1, k2)
            pval[k1:k2] = self.get_pvalues_q(plam, nlam, k1, k2)
        # last interval
        k1 = kk[-1]
        k2 = n - 1
        nlam = self.get_lambda_motif_block(bin_mat, k1, k2)
        plam = self.get_plambda_block(biad_mat, k1, k2)
        # for the last entry we have to INCLUDE k2, thus k2 + 1
        pval[k1:] = self.get_pvalues_q(plam, nlam, k1, k2 + 1)

    if filename is None:
        fname = 'p_values_' + str(bip_set)
        if not binary:
            fname += '.csv'
    else:
        fname = filename

    # account for machine precision (``np.float`` was removed from NumPy,
    # the builtin ``float`` is the equivalent dtype)
    pval += np.finfo(float).eps
    self.save_array(pval, filename=fname, delim=delim, binary=binary)