def concat_chunks_into_dset(matrices, group, dset_name, rows_in_chunk=SNPS_PER_CHUNK):
    """Concatenate matrices along axis 0 into one resizable HDF5 dataset.

    Creates *dset_name* inside *group* using ``DEF_DSET_PARAMS`` defaults,
    sized and typed from the first matrix, then streams every matrix into it,
    resizing the first dimension as needed.

    :param matrices: iterable of array-like chunks; all must share the
        trailing dimensions and dtype of the first chunk.
    :param group: h5py group in which the dataset is created.
    :param dset_name: name of the new dataset.
    :param rows_in_chunk: HDF5 chunk length along axis 0.
    :raises ValueError: if *matrices* yields no items.
    :returns: the created h5py dataset.
    """
    matrices = iter(matrices)
    fst_mat = _first_item(matrices)
    # BUG fix: the original tested `matrices is None`, which is never true
    # right after iter() — it never caught an empty input. Test the first
    # item instead (assumes _first_item returns None on an exhausted
    # iterator — TODO confirm against its definition).
    if fst_mat is None:
        raise ValueError('There were no matrices to concatenate')

    mats = chain([fst_mat], matrices)
    size = fst_mat.shape

    kwargs = DEF_DSET_PARAMS.copy()
    kwargs['dtype'] = fst_mat.dtype
    # Unlimited along axis 0 so the dataset can grow as chunks arrive.
    kwargs['maxshape'] = (None,) + size[1:]
    # BUG fix: honor the rows_in_chunk parameter (the original ignored it
    # and always used SNPS_PER_CHUNK, which remains the default).
    kwargs['chunks'] = (rows_in_chunk,) + size[1:]
    dset = group.create_dataset(dset_name, size, **kwargs)

    current_snp_index = 0
    for mat in mats:
        num_snps = mat.shape[0]
        start = current_snp_index
        stop = current_snp_index + num_snps
        # Grow the dataset only when the next chunk would overflow it.
        if dset.shape[0] < stop:
            dset.resize((stop,) + size[1:])
        dset[start:stop] = mat
        current_snp_index += num_snps
    return dset
def _create_matrix(self, path, *args, **kwargs):
    """Create a new HDF5 dataset at *path*, filling in project defaults.

    Splits *path* into group and dataset names, refuses to overwrite an
    existing dataset, creates the parent group on demand, merges
    ``DEF_DSET_PARAMS`` into *kwargs*, derives ``fillvalue`` and
    ``maxshape`` when absent, and delegates to ``create_dataset``.

    :raises ValueError: if *path* lacks a dataset name or the dataset
        already exists.
    :returns: the created h5py dataset.
    """
    h5file = self._h5file
    group_name, dset_name = posixpath.split(path)
    if not dset_name:
        raise ValueError('The path should include a dset name: ' + path)

    # Reject the path if something already lives there.
    try:
        h5file[path]
    except KeyError:
        pass
    else:
        raise ValueError('The dataset already exists: ' + path)

    # Fetch the parent group, creating it on first use.
    try:
        group = h5file[group_name]
    except KeyError:
        group = h5file.create_group(group_name)

    # Project-wide dataset defaults, without clobbering caller overrides.
    for key, value in DEF_DSET_PARAMS.items():
        kwargs.setdefault(key, value)

    if 'fillvalue' not in kwargs:
        # NOTE(review): args[2] is assumed to be the positional dtype —
        # confirm against the callers' positional-argument convention.
        if 'dtype' in kwargs:
            dtype = kwargs['dtype']
        elif len(args) > 2:
            dtype = args[2]
        else:
            dtype = None
        if dtype is not None:
            kwargs['fillvalue'] = MISSING_VALUES[dtype]

    if 'maxshape' not in kwargs:
        # Fully resizable by default, one None per dimension.
        kwargs['maxshape'] = (None,) * len(kwargs['shape'])

    return group.create_dataset(dset_name, *args, **kwargs)