def load_region(callset, chrom, start_position=0, stop_position=None,
                variants_fields=None, calldata_fields=None,
                variants_query=None, samples=None):
    """Read the variants and calldata for one genomic region into memory.

    Parameters
    ----------
    callset : HDF5 file or group
        A file or group containing a variant call set.
    chrom : string
        Name of the chromosome group to read from.
    start_position : int, optional
        Start position of the region to extract.
    stop_position : int, optional
        Stop position of the region to extract.
    variants_fields : string or sequence of strings, optional
        Names of the datasets under ``variants`` to load. A single string
        is treated as a one-element list.
    calldata_fields : string or sequence of strings, optional
        Names of the datasets under ``calldata`` to load. A single string
        is treated as a one-element list.
    variants_query : string, optional
        Expression used to filter variants. It is evaluated against the
        arrays already loaded for the region, so every field it mentions
        must also appear in `variants_fields`.
    samples : sequence of strings, optional
        Samples to keep in the calldata arrays.

    Returns
    -------
    variants : dict
        Maps each requested variants field name to an ndarray.
    calldata : dict
        Maps each requested calldata field name to an ndarray.

    Raises
    ------
    Exception
        If `samples` is given but no ``samples`` dataset exists at the
        callset root or under the chromosome group.

    """

    chrom_group = callset[chrom]
    variants_group = chrom_group['variants']
    positions = variants_group['POS']

    # Sample selection needs the full list of sample names; look it up
    # before any data is read so a missing list fails early. The callset
    # root takes precedence over the chromosome group.
    sample_names = None
    if samples is not None:
        for holder in (callset, chrom_group):
            if 'samples' in holder.keys():
                sample_names = list(holder['samples'])
                break
        else:
            raise Exception('list of all samples not found in callset')

    # Index selection covering the requested interval.
    region = anhima.loc.locate_interval(positions, start_position,
                                        stop_position)

    # Load the requested variants datasets, restricted to the region.
    variants = {}
    if variants_fields:
        if isinstance(variants_fields, string_types):
            variants_fields = [variants_fields]
        variants = {name: variants_group[name][region, ...]
                    for name in variants_fields}

    # Load the requested calldata datasets, restricted to the region.
    calldata = {}
    if calldata_fields:
        if isinstance(calldata_fields, string_types):
            calldata_fields = [calldata_fields]
        calldata = {name: chrom_group['calldata'][name][region, ...]
                    for name in calldata_fields}

    # Apply the variants filter along the first axis of every array.
    if variants_query is not None:
        keep = numexpr.evaluate(variants_query, local_dict=variants)
        variants = {name: np.compress(keep, values, axis=0)
                    for name, values in variants.items()}
        calldata = {name: np.compress(keep, values, axis=0)
                    for name, values in calldata.items()}

    # Keep only the requested samples (second axis of calldata arrays).
    if samples is not None:
        # TODO check dtype of all_samples
        samples = force_bytes(samples)
        columns = [sample_names.index(name) for name in samples]
        calldata = {name: np.take(values, columns, axis=1)
                    for name, values in calldata.items()}

    return variants, calldata
def save_tped(path, callset, chrom, start_position=0, stop_position=None,
              samples=None):
    """Save genotype data from an HDF5 callset to a Plink transposed format
    file (TPED).

    Parameters
    ----------
    path : string or file-like
        Path of file to write, or file-like object to write to.
    callset : HDF5 file or group
        A file or group containing a variant call set.
    chrom : string
        The chromosome to extract data for.
    start_position : int, optional
        The start position for the region to extract data for.
    stop_position : int, optional
        The stop position for the region to extract data for.
    samples : sequence of strings, optional
        Selection of samples to extract genotypes for, defaults to all
        samples.

    Raises
    ------
    Exception
        If `samples` is given but `load_region` cannot find a ``samples``
        dataset in the callset.
    ValueError
        If a requested sample is not present in the callset's sample list.

    Notes
    -----
    Note that the current implementation loads all data from the requested
    region into memory before writing out to TPED, so may not be applicable
    to very large datasets.

    Sample selection is delegated to `load_region`, so the ``samples``
    dataset is looked up at the callset root first and then under the
    chromosome group, exactly as for any other region load.

    """

    # Let load_region do the sample selection rather than repeating the
    # lookup here; this keeps the two functions in agreement about where
    # the list of sample names may be stored.
    variants, calldata = load_region(callset, chrom,
                                     start_position, stop_position,
                                     variants_fields=['POS', 'REF', 'ALT'],
                                     calldata_fields=['genotype'],
                                     samples=samples)

    genotypes = calldata['genotype']
    ref = variants['REF']
    alt = variants['ALT']
    if alt.ndim > 1:
        # ALT has more than one column; only the first column is written.
        alt = alt[:, 0]
    pos = variants['POS']

    anhima.io.save_tped(path, genotypes=genotypes, ref=ref, alt=alt,
                        pos=pos, chromosome=chrom, identifier=None,
                        genetic_distance=None)