Example #1
import random

import numpy as np


def simulate_relatedness(genotypes, relatedness=.5, n_iter=1000, copy=True):
    """
    Simulate relatedness by randomly copying genotypes between individuals.

    Parameters
    ----------

    genotypes : array_like
        An array of shape (n_variants, n_samples, ploidy) where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = first alternate allele,
        2 = second alternate allele, etc.).
    relatedness : float, optional
        Fraction of variants to copy genotypes for.
    n_iter : int, optional
        Number of times to randomly copy genotypes between individuals.
    copy : bool, optional
        If False, modify `genotypes` in place.

    Returns
    -------

    genotypes : ndarray, shape (n_variants, n_samples, ploidy)
        The genotype array with relatedness simulated (a copy of the input
        unless `copy` is False).

    """

    # check genotypes array
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim >= 2
    n_variants = genotypes.shape[0]
    n_samples = genotypes.shape[1]

    # copy the input array unless modifying in place
    if copy:
        genotypes = genotypes.copy()

    # determine the number of variants to copy genotypes for
    n_copy = int(relatedness * n_variants)

    # iteratively introduce relatedness
    for i in range(n_iter):

        # randomly choose donor and recipient
        donor_index = random.randint(0, n_samples-1)
        donor = genotypes[:, donor_index]
        recip_index = random.randint(0, n_samples-1)
        recip = genotypes[:, recip_index]

        # randomly pick a set of variants to copy
        variant_indices = random.sample(range(n_variants), n_copy)

        # copy across genotypes
        recip[variant_indices] = donor[variant_indices]

    return genotypes
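
A brief usage sketch, not part of the original example: the array shape and parameter values below are illustrative only.

# illustrative usage: random diploid genotypes, 100 variants x 10 samples
g = np.random.randint(0, 2, size=(100, 10, 2)).astype('i1')
g_related = simulate_relatedness(g, relatedness=.5, n_iter=100)
assert g_related.shape == g.shape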
Example #2
import random

import numpy as np
import scipy.stats


def simulate_biallelic_genotypes(n_variants, n_samples, af_dist,
                                 p_missing=.1,
                                 ploidy=2):
    """Simulate genotypes at biallelic variants for a population in
    Hardy-Weinberg equilibrium.

    Parameters
    ----------

    n_variants : int
        The number of variants.
    n_samples : int
        The number of samples.
    af_dist : frozen continuous random variable
        The distribution of allele frequencies.
    p_missing : float, optional
        The fraction of missing genotype calls.
    ploidy : int, optional
        The sample ploidy.

    Returns
    -------

    genotypes : ndarray, int8
        An array of shape (n_variants, n_samples, ploidy) where each
        element of the array is an integer corresponding to an allele index
        (-1 = missing, 0 = reference allele, 1 = alternate allele).

    """

    # initialise output array
    genotypes = np.empty((n_variants, n_samples, ploidy), dtype='i1')

    # generate allele frequencies under the given distribution
    af = af_dist.rvs(n_variants)

    # freeze binomial distribution to model missingness
    miss_dist = scipy.stats.binom(p=p_missing, n=n_samples)

    # iterate over variants
    for i, p in enumerate(af):

        # randomly generate alleles under the given allele frequency
        # ensure p is a valid probability (clip into [0, 1])
        p = min(max(p, 0), 1)
        alleles = scipy.stats.bernoulli.rvs(p, size=n_samples*ploidy)

        # reshape alleles as genotypes under the given ploidy
        genotypes[i] = alleles.reshape(n_samples, ploidy)

        # simulate some missingness
        n_missing = miss_dist.rvs()
        missing_indices = random.sample(range(n_samples),
                                        n_missing)
        genotypes[i, missing_indices] = (-1,) * ploidy

    return genotypes
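
A usage sketch, illustrative rather than from the original source: any frozen continuous scipy.stats distribution can be passed as `af_dist`; a beta distribution skewed towards rare alleles is used here purely as an example.

# illustrative allele frequency distribution
af_dist = scipy.stats.beta(a=.4, b=.6)
g = simulate_biallelic_genotypes(n_variants=1000, n_samples=100,
                                 af_dist=af_dist)
# g has shape (1000, 100, 2) and dtype int8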
Example #3
import random

import numpy as np


def simulate_genotypes_with_ld(n_variants, n_samples, correlation=0.2):
    """A very simple function to simulate a set of genotypes, where
    variants are in some degree of linkage disequilibrium with their
    neighbours.

    Parameters
    ----------

    n_variants : int
        The number of variants to simulate data for.
    n_samples : int
        The number of individuals to simulate data for.
    correlation : float, optional
        The fraction of samples for which genotypes are copied between
        neighbouring variants, controlling the strength of linkage
        disequilibrium.

    Returns
    -------

    gn : ndarray, int8
        A 2-dimensional array of shape (n_variants, n_samples) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.

    """

    # initialise an array of random genotypes
    gn = np.random.randint(size=(n_variants, n_samples), low=0, high=3)
    gn = gn.astype('i1')

    # determine the number of samples to copy genotypes for
    n_copy = int(correlation * n_samples)

    # introduce linkage disequilibrium by copying genotypes from one sample to
    # the next
    for i in range(1, n_variants):

        # randomly pick the samples to copy from
        sample_indices = random.sample(range(n_samples), n_copy)

        # view genotypes from the previous variant for the selected samples
        c = gn[i-1, sample_indices]

        # randomly choose whether to invert the correlation
        inv = random.randint(0, 1)
        if inv:
            c = 2-c

        # copy across genotypes
        gn[i, sample_indices] = c

    return gn
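
An illustrative usage sketch: simulate a small matrix and inspect the correlation between a pair of neighbouring variants. Because the copy direction is randomly inverted at each variant, the sign of any particular pairwise correlation may be positive or negative.

gn = simulate_genotypes_with_ld(n_variants=500, n_samples=200,
                                correlation=.3)
# correlation between one pair of neighbouring variants
r = np.corrcoef(gn[10], gn[11])[0, 1]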
Example #4
import numpy as np


def block_apply(f, dataset, block_size=None, out=None):
    """Apply function `f` to `dataset` split along the first axis into
    contiguous slices of `block_size`. The result should be equivalent to
    calling ``f(dataset)`` directly, however may require less total memory,
    especially if `dataset` is an HDF5 dataset.

    Parameters
    ----------

    f : function
        The function to apply.
    dataset : array_like or HDF5 dataset
        The input dataset.
    block_size : int, optional
        The size (in number of items along the first axis) of the blocks
        passed to `f`.
    out : array_like or HDF5 dataset, optional
        If given, used to store the output.

    Returns
    -------

    out : ndarray
        The result of applying `f` to `dataset` blockwise.

    """

    # determine block size
    if block_size is None:
        if hasattr(dataset, 'chunks') and dataset.chunks is not None:
            # use dataset chunk size along slice axis
            block_size = dataset.chunks[0]
        else:
            # use arbitrary number
            block_size = 1000

    # determine total size along slice axis
    dim_size = dataset.shape[0]

    # iterate over blocks
    for block_start in range(0, dim_size, block_size):
        block_stop = min(block_start + block_size, dim_size)

        # load input block
        x = dataset[block_start:block_stop, ...]

        # compute output block
        y = f(x)

        if out is None:
            # initialise output array
            out_shape = list(y.shape)
            out_shape[0] = dim_size
            out = np.empty(out_shape, y.dtype)

        # store output block
        out[block_start:block_stop, ...] = y

    return out
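
A usage sketch with made-up data: compute per-call non-reference allele counts blockwise. With an h5py dataset as `dataset` (and optionally `out`), the same call keeps only one block in memory at a time.

# illustrative input: random diploid genotypes
g = np.random.randint(0, 2, size=(10000, 100, 2)).astype('i1')
# count non-reference alleles per genotype call, 1000 rows at a time
gn = block_apply(lambda block: (block > 0).sum(axis=2), g, block_size=1000)
assert gn.shape == (10000, 100)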
Example #5
import numpy as np


def block_take2d(dataset, row_indices, col_indices=None, block_size=None):
    """Select rows and optionally columns from a Numpy array or HDF5
    dataset with 2 or more dimensions.

    Parameters
    ----------

    dataset : array_like or HDF5 dataset
        The input dataset.
    row_indices : sequence of ints
        The indices of the selected rows. N.B., will be sorted in ascending
        order.
    col_indices : sequence of ints, optional
        The indices of the selected columns. If not provided, all columns
        will be returned.
    block_size : int, optional
        The size (in number of rows) of the block of data to process at a time.

    Returns
    -------

    out : ndarray
        An array containing the selected rows and columns.

    See Also
    --------

    anhima.util.block_compress2d, anhima.h5.take2d_pointsel

    Notes
    -----

    This function is mainly a work-around for the fact that fancy indexing via
    h5py is currently slow, and fancy indexing along more than one axis is not
    supported. The function works by reading the entire dataset in blocks of
    `block_size` rows, and processing each block in memory using numpy.

    """

    # N.B., make sure row_indices are sorted
    row_indices = np.asarray(row_indices)
    row_indices.sort()

    # how many rows are we selecting?
    n_rows_in = dataset.shape[0]
    n_rows_out = len(row_indices)

    # how many columns are we selecting?
    n_cols_in = dataset.shape[1]
    if col_indices is not None:
        n_cols_out = len(col_indices)
    else:
        n_cols_out = n_cols_in

    # setup output array
    out_shape = (n_rows_out, n_cols_out) + dataset.shape[2:]
    out = np.empty(out_shape, dtype=dataset.dtype)

    # determine block size
    if block_size is None:
        if hasattr(dataset, 'chunks') and dataset.chunks is not None:
            # use dataset chunk height
            block_size = dataset.chunks[0]
        else:
            # use arbitrary number
            block_size = 1000

    # iterate block-wise
    offset = 0
    for block_start in range(0, n_rows_in, block_size):
        block_stop = min(block_start+block_size, n_rows_in)

        # how many indices to process in this block?
        i = np.searchsorted(row_indices, block_start)
        j = np.searchsorted(row_indices, block_stop)
        n = j-i
        ridx = row_indices[i:j]

        # only do anything if there are indices for this block
        if n:

            # load data for this block
            a = dataset[block_start:block_stop]

            # take rows
            b = np.take(a, ridx-block_start, axis=0)

            # take columns
            if col_indices is not None:
                b = np.take(b, col_indices, axis=1)

            # store output
            out[offset:offset+n, ...] = b

            # keep track of offset
            offset += n

    return out
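
A usage sketch against a plain numpy array (an h5py dataset would be handled the same way); values are illustrative. Note that rows come back in ascending index order regardless of the order in which they are requested.

a = np.arange(100).reshape(20, 5)
sub = block_take2d(a, row_indices=[7, 1, 3], col_indices=[0, 2],
                   block_size=8)
# sub has shape (3, 2): rows 1, 3 and 7 (sorted), columns 0 and 2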
Example #6
import itertools

import h5py
import numpy as np


def take2d_pointsel(dataset, row_indices=None, col_indices=None,
                    block_size=1000):
    """
    Load selected rows and optionally columns from an HDF5 dataset with 2 or
    more dimensions, using HDF5 point selections.

    Parameters
    ----------

    dataset : HDF5 dataset
        The dataset to load data from.
    row_indices : sequence of ints, optional
        The indices of the selected rows. If not provided, all rows will be
        returned.
    col_indices : sequence of ints, optional
        The indices of the selected columns. If not provided, all columns
        will be returned.
    block_size : int, optional
        The size (in number of points) of the block of data to load and
        process at a time.

    Returns
    -------

    out : ndarray
        An array containing the selected rows and columns.

    See Also
    --------

    anhima.util.take2d

    Notes
    -----

    This function is similar to :func:`anhima.util.take2d` but uses an HDF5
    point selection under the hood. Performance characteristics will be
    different and may be much better or much worse, depending on the size,
    shape and configuration of the dataset, and depending on the number of
    points to be selected.

    """

    n_rows_in = dataset.shape[0]
    if row_indices is not None:
        row_indices = sorted(row_indices)
        n_rows_out = len(row_indices)
    else:
        # select all rows
        row_indices = range(n_rows_in)
        n_rows_out = n_rows_in

    n_cols_in = dataset.shape[1]
    if col_indices is not None:
        col_indices = sorted(col_indices)
        n_cols_out = len(col_indices)
    else:
        # select all columns
        col_indices = range(n_cols_in)
        n_cols_out = n_cols_in

    n_items_out = n_rows_out * n_cols_out

    # initialise output array
    out = np.empty((n_items_out,), dtype=dataset.dtype)

    # convert indices into coordinates
    coords = itertools.product(row_indices, col_indices)

    # set up selection
    sel = h5py._hl.selections.PointSelection(dataset.shape)
    typ = h5py.h5t.py_create(dataset.dtype)

    # process blocks at a time
    for block_start in range(0, n_items_out, block_size):

        # materialise a block of coordinates
        selection = np.asarray(list(itertools.islice(coords, block_size)))

        # set selection
        sel.set(selection)

        # read data
        block_stop = block_start + len(selection)
        space = h5py.h5s.create_simple(sel.mshape)
        dataset.id.read(space,
                        sel._id,
                        out[block_start:block_stop],
                        typ)

    # reshape output array
    out = out.reshape(n_rows_out, n_cols_out)

    return out
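
A usage sketch with a throwaway in-memory HDF5 file; the file name and dataset layout are invented for illustration. Note that the function reaches into h5py internals (`h5py._hl.selections.PointSelection` and the low-level `dataset.id.read`), so it is tied to the h5py version the original code targeted and may need adjustment for current releases.

with h5py.File('example.h5', 'w', driver='core', backing_store=False) as f:
    dset = f.create_dataset('calldata', data=np.arange(200).reshape(20, 10))
    sub = take2d_pointsel(dset, row_indices=[2, 5], col_indices=[1, 3, 7])
    # sub has shape (2, 3)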