Beispiel #1
0
def load_region(callset, chrom, start_position=0, stop_position=None,
                variants_fields=None,
                calldata_fields=None,
                variants_query=None,
                samples=None):
    """Load data into memory from `callset` for the given region.

    Parameters
    ----------

    callset : HDF5 file or group
        A file or group containing a variant call set.
    chrom : string
        The chromosome to extract data for.
    start_position : int, optional
        The start position for the region to extract data for.
    stop_position : int, optional
        The stop position for the region to extract data for.
    variants_fields : string or sequence of strings, optional
        Names of the variants datasets to extract. A single string is
        treated as a one-element sequence.
    calldata_fields : string or sequence of strings, optional
        Names of the calldata datasets to extract. A single string is
        treated as a one-element sequence.
    variants_query : string, optional
        A query to filter variants. Note that this query is applied
        after data for the region has been loaded, so any fields
        referenced in this query need to be included in `variants_fields`.
    samples : sequence of strings, optional
        Selected samples to extract.

    Returns
    -------

    variants : dict
        A dictionary mapping dataset identifiers to ndarrays.
    calldata : dict
        A dictionary mapping dataset identifiers to ndarrays.

    Raises
    ------

    Exception
        If `samples` is given but no 'samples' entry is found either at
        the root of `callset` or within the chromosome group.
    ValueError
        If `samples` contains a sample that is not in the list of all
        samples stored in the callset.

    """

    # obtain chromosome group
    grp_chrom = callset[chrom]

    # setup output variables
    variants = dict()
    calldata = dict()

    # obtain variant positions
    pos = grp_chrom['variants']['POS']

    # Resolve the sample selection to column indices *before* any data is
    # read, so that a missing sample list or an unknown sample name fails
    # early instead of after the whole region has been loaded.
    sample_indices = None
    if samples is not None:
        # the list of all samples may be stored at the root of the callset
        # or within the chromosome group; the root takes precedence
        if 'samples' in callset.keys():
            all_samples = list(callset['samples'])
        elif 'samples' in grp_chrom.keys():
            all_samples = list(grp_chrom['samples'])
        else:
            raise Exception('list of all samples not found in callset')

        # TODO check dtype of all_samples
        sample_indices = []
        for s in force_bytes(samples):
            try:
                sample_indices.append(all_samples.index(s))
            except ValueError:
                # same exception type as a bare list.index() failure, but
                # with a message that says what was being looked up
                raise ValueError('sample %r not found in callset' % (s,))

    # locate region
    loc = anhima.loc.locate_interval(pos, start_position, stop_position)

    # extract variants data
    if variants_fields:
        if isinstance(variants_fields, string_types):
            variants_fields = [variants_fields]
        for f in variants_fields:
            variants[f] = grp_chrom['variants'][f][loc, ...]

    # extract calldata
    if calldata_fields:
        if isinstance(calldata_fields, string_types):
            calldata_fields = [calldata_fields]
        for f in calldata_fields:
            calldata[f] = grp_chrom['calldata'][f][loc, ...]

    # select variants; the query is evaluated against the loaded variants
    # fields only, and the resulting condition is applied along the first
    # axis of every loaded array
    if variants_query is not None:
        condition = numexpr.evaluate(variants_query, local_dict=variants)
        for f in variants:
            variants[f] = np.compress(condition, variants[f], axis=0)
        for f in calldata:
            calldata[f] = np.compress(condition, calldata[f], axis=0)

    # select samples along the second axis of each calldata array, in the
    # order they were requested
    if sample_indices is not None:
        for f in calldata:
            calldata[f] = np.take(calldata[f], sample_indices, axis=1)

    return variants, calldata
Beispiel #2
0
def save_tped(path, callset, chrom,
              start_position=0,
              stop_position=None,
              samples=None):

    """Save genotype data from an HDF5 callset to a Plink transposed format
    file (TPED).

    Parameters
    ----------

    path : string or file-like
        Path of file to write, or file-like object to write to.
    callset : HDF5 file or group
        A file or group containing a variant call set.
    chrom : string
        The chromosome to extract data for.
    start_position : int, optional
        The start position for the region to extract data for.
    stop_position : int, optional
        The stop position for the region to extract data for.
    samples : sequence of strings, optional
        Selection of samples to extract genotypes for, defaults to all samples.

    Notes
    -----

    Note that the current implementation loads all data from the requested
    region into memory before writing out to TPED, so may not be applicable
    to very large datasets.

    Sample selection is delegated to `load_region`, so the list of all
    samples may be stored either at the root of `callset` or within the
    chromosome group.

    """

    # Let load_region perform the sample selection rather than repeating it
    # here, so both functions agree on where the sample list is looked up
    # and on how sample names are normalised.
    variants, calldata = load_region(callset,
                                     chrom,
                                     start_position,
                                     stop_position,
                                     variants_fields=['POS', 'REF', 'ALT'],
                                     calldata_fields=['genotype'],
                                     samples=samples)

    genotypes = calldata['genotype']

    ref = variants['REF']

    # if ALT has more than one dimension, only its first column is written
    alt = variants['ALT']
    if alt.ndim > 1:
        alt = alt[:, 0]

    pos = variants['POS']

    anhima.io.save_tped(path,
                        genotypes=genotypes,
                        ref=ref,
                        alt=alt,
                        pos=pos,
                        chromosome=chrom,
                        identifier=None,
                        genetic_distance=None)