def column_from_arrow_array(arrow_array):
    """Convert a pyarrow array into a column vaex can use.

    Primitive arrays become numpy arrays, string arrays become a
    ColumnStringArrow built on the raw arrow buffers, and list arrays
    are materialized via ``np.array``. Raises TypeError otherwise.
    """
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    n_buffers = len(buffers)
    if n_buffers == 2:
        # primitive layout: validity bitmap + data buffer
        return numpy_array_from_arrow_array(arrow_array)
    if n_buffers == 3 and isinstance(arrow_type, type(pyarrow.string())):
        # string layout: validity bitmap + int32 offsets + utf8 bytes
        bitmap_buffer, offset_buffer, data_buffer = arrow_array.buffers()
        if arrow_array.null_count == 0:
            # no nulls present, so the bitmap (if any) carries no information
            null_bitmap = None
        else:
            null_bitmap = np.frombuffer(bitmap_buffer, 'uint8', len(bitmap_buffer))
        offsets = np.frombuffer(offset_buffer, np.int32, len(offset_buffer) // 4)
        if data_buffer is None:
            # an all-empty/all-null string array has no data buffer at all
            string_bytes = np.array([], dtype='S1')
        else:
            string_bytes = np.frombuffer(data_buffer, 'S1', len(data_buffer))
        return ColumnStringArrow(offsets, string_bytes, len(arrow_array), null_bitmap=null_bitmap)
    if n_buffers == 4 and isinstance(arrow_type, pyarrow.lib.ListType):
        return np.array(arrow_array)
    raise TypeError('type unsupported: %r' % arrow_type)
def _map_column(self, column: h5py.Group, as_arrow=False):
    """Map one hdf5 column group to a usable column object.

    Numeric columns are mapped directly (optionally with a mask for
    format version > 1); string columns are reassembled from their
    offsets/bytes/null-bitmap datasets.
    """
    data = column["data"]
    is_str = "dtype" in data.attrs and data.attrs["dtype"] == "str"
    if not is_str:
        # numeric path; v2+ files may carry a separate mask dataset
        if self._version > 1 and 'mask' in column:
            return self._map_hdf5_array(data, column['mask'], as_arrow=as_arrow)
        return self._map_hdf5_array(data, as_arrow=as_arrow)
    # string path: offsets + raw bytes (+ optional null bitmap)
    indices = self._map_hdf5_array(column['indices'])
    string_bytes = self._map_hdf5_array(data)
    null_bitmap = self._map_hdf5_array(column['null_bitmap']) if "null_bitmap" in column else None
    if isinstance(indices, np.ndarray):
        # a real mmappable file: build an arrow string array from the buffers
        return vaex.arrow.convert.arrow_string_array_from_buffers(string_bytes, indices, null_bitmap)
    # not truly mmappable; fall back to the generic string column wrapper
    return ColumnStringArrow(indices, string_bytes, null_bitmap=null_bitmap)
def mmap(self, mmap, file):
    """Re-point the backing hdf5 datasets at the memory map and rebuild ``to_array``.

    From now on we only work with the mmapped arrays; USE_MMAP=False is
    not supported for strings yet.
    """
    self.array = h5mmap(mmap, file, self.array)
    self.index_array = h5mmap(mmap, file, self.index_array)
    if self.null_bitmap_array is not None:
        self.null_bitmap_array = h5mmap(mmap, file, self.null_bitmap_array)
    if isinstance(self.index_array, np.ndarray):
        # a real mmappable file: build an arrow string array from the buffers
        self.to_array = vaex.arrow.convert.arrow_string_array_from_buffers(
            self.array, self.index_array, self.null_bitmap_array)
    else:
        # otherwise fall back to the generic string column wrapper
        self.to_array = ColumnStringArrow(
            self.index_array, self.array, null_bitmap=self.null_bitmap_array)
    # normalize so to_array is always a ColumnStringArrow
    self.to_array = ColumnStringArrow.from_arrow(self.to_array)
def _export_column(dataset_input, dataset_output, column_name, shuffle, sort, selection, N, order_array, order_array_inverse, progress_status, parallel=True):
    """Copy one column from dataset_input into dataset_output, chunk by chunk.

    Handles selections/filtering, shuffling/sorting (via order_array /
    order_array_inverse), masked numeric arrays, datetimes and strings.
    Progress is reported through the shared ``progress_status`` object,
    guarded by the module-level ``progress_lock``.

    NOTE(review): ``max_length`` (chunk size) and ``progress_lock`` are
    module-level names not visible in this chunk — confirm their definitions.
    """
    if 1:
        block_scope = dataset_input._block_scope(
            0, vaex.execution.buffer_size_default)
        to_array = dataset_output.columns[column_name]
        dtype = dataset_input.data_type(column_name, array_type='numpy')
        is_string = vaex.array_types.is_string_type(dtype)
        if is_string:
            assert isinstance(to_array, pa.Array)  # we don't support chunked arrays here
            # TODO legacy: we still use ColumnStringArrow to write, find a way to do this with arrow
            to_array = ColumnStringArrow.from_arrow(to_array)
        if shuffle or sort:
            # we need to create an in-memory copy, otherwise we will do random
            # writes to disk which is VERY inefficient
            to_array_disk = to_array
            if np.ma.isMaskedArray(to_array):
                to_array = np.empty_like(to_array_disk)
            else:
                if vaex.array_types.is_string_type(dtype):
                    # we create an empty column copy
                    to_array = to_array._zeros_like()
                else:
                    to_array = np.zeros_like(to_array_disk)
        to_offset = 0  # we need this for selections
        to_offset_unselected = 0  # we need this for filtering
        count = len(
            dataset_input
        )  # if not selection else dataset_input.length_unfiltered()
        # TODO: if no filter, selection or mask, we can choose the quick path for str
        string_byte_offset = 0
        for i1, i2, values in dataset_input.evaluate(column_name, chunk_size=max_length, filtered=True, parallel=parallel, selection=selection, array_type='numpy-arrow'):
            logger.debug("from %d to %d (total length: %d, output length: %d)", i1, i2, len(dataset_input), N)
            no_values = len(values)
            if no_values:
                if is_string:
                    # for strings, we don't take sorting/shuffling into account
                    # when building the structure (reordering happens afterwards)
                    to_column = to_array
                    from_sequence = _to_string_sequence(values)
                    to_sequence = to_column.string_sequence.slice(
                        to_offset, to_offset + no_values, string_byte_offset)
                    # fill_from returns the number of bytes written, which
                    # advances the running byte offset into the data buffer
                    string_byte_offset += to_sequence.fill_from(from_sequence)
                    to_offset += no_values
                else:
                    # masked float values are filled with NaN; other dtypes use
                    # None (numpy picks the dtype's default fill)
                    fill_value = np.nan if dtype.kind == "f" else None
                    # assert np.ma.isMaskedArray(to_array) == np.ma.isMaskedArray(values), "to (%s) and from (%s) array are not of both masked or unmasked (%s)" %\
                    #     (np.ma.isMaskedArray(to_array), np.ma.isMaskedArray(values), column_name)
                    if shuffle or sort:
                        # scatter this chunk to its final (shuffled/sorted) positions
                        target_set_item = order_array[i1:i2]
                    else:
                        target_set_item = slice(to_offset, to_offset + no_values)
                    if dtype.is_datetime:
                        # datetimes are stored as their int64 representation
                        values = values.view(np.int64)
                    if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(
                            values):
                        to_array.data[target_set_item] = values.filled(
                            fill_value)
                        to_array.mask[target_set_item] = values.mask
                    elif not np.ma.isMaskedArray(
                            to_array) and np.ma.isMaskedArray(values):
                        # destination has no mask: masked entries collapse to fill_value
                        to_array[target_set_item] = values.filled(fill_value)
                    else:
                        to_array[target_set_item] = values
                    to_offset += no_values
            with progress_lock:
                progress_status.value += i2 - i1
            if progress_status.cancelled:
                break
            #if not progress(progress_value / float(progress_total)):
            #    break
        if is_string:
            # write out the last index, closing the final string's offset range
            to_column = to_array
            if selection:
                to_column.indices[to_offset] = string_byte_offset
            else:
                to_column.indices[count] = string_byte_offset
        if shuffle or sort:
            # write the in-memory copy to disk in one go
            if is_string:
                # strings are sorted afterwards via a lazy reordering view
                view = to_array.string_sequence.lazy_index(order_array_inverse)
                to_array_disk.string_sequence.fill_from(view)
            else:
                if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(
                        to_array_disk):
                    to_array_disk.data[:] = to_array.data
                    to_array_disk.mask[:] = to_array.mask
                else:
                    to_array_disk[:] = to_array
def _load_columns(self, h5data, first=[]):
    """Load all columns from an opened vaex hdf5 group into ``self._columns``.

    Handles sparse csr_matrix groups, string columns (offsets/bytes/null
    bitmap), plain numeric columns (with optional mask), plus per-column
    metadata (alias, ucd, description, unit — including legacy Amuse unit
    strings). Finally reorders ``self._columns`` to honour the stored
    ``column_order`` attribute.
    """
    # print h5data
    # make sure x y x etc are first
    finished = set()
    if "description" in h5data.attrs:
        self.description = ensure_string(h5data.attrs["description"])
    # hdf5, or h5py doesn't keep the order of columns, so manually track that,
    # also enables reordering later
    h5columns = h5data if self._version == 1 else h5data['columns']
    if "column_order" in h5columns.attrs:
        column_order = ensure_string(
            h5columns.attrs["column_order"]).split(",")
    else:
        column_order = []
    # for name in list(h5columns):
    #     if name not in column_order:
    #         column_order.append(name)
    # for column_name in column_order:
    #     if column_name in h5columns and column_name not in finished:
    for group_name in list(h5columns):
        logger.debug('loading column: %s', group_name)
        group = h5columns[group_name]
        if 'type' in group.attrs:
            if group.attrs['type'] in ['csr_matrix']:
                from scipy.sparse import csc_matrix, csr_matrix

                # subclass that skips validation: the buffers are mmapped and
                # checking would touch (page in) all the data
                class csr_matrix_nocheck(csr_matrix):
                    def check_format(self, *args, **kwargs):
                        pass
                data = self._map_hdf5_array(group['data'])
                indptr = self._map_hdf5_array(group['indptr'])
                indices = self._map_hdf5_array(group['indices'])
                #column_names = ensure_string(group.attrs["column_names"]).split(",")
                # make sure we keep the original order
                groups = [(name, value) for name, value in group.items() if isinstance(value, h5py.Group)]
                column_names = [None] * len(groups)
                for name, column in groups:
                    column_names[column.attrs['column_index']] = name
                matrix = csr_matrix_nocheck(
                    (data, indices, indptr), shape=(len(indptr) - 1, len(column_names)))
                # the constructor must not have copied the mmapped buffers
                assert matrix.data is data
                # assert matrix.indptr is indptr
                assert matrix.indices is indices
                self.add_columns(column_names, matrix)
        else:
            column_name = group_name
            column = h5columns[column_name]
            if "alias" in column.attrs:
                column_name = column.attrs["alias"]
            if "ucd" in column.attrs:
                self.ucds[column_name] = ensure_string(column.attrs["ucd"])
            if "description" in column.attrs:
                self.descriptions[column_name] = ensure_string(
                    column.attrs["description"])
            if "unit" in column.attrs:
                try:
                    unitname = ensure_string(column.attrs["unit"])
                    if unitname and unitname != "None":
                        self.units[column_name] = _try_unit(unitname)
                except:
                    logger.exception("error parsing unit: %s", column.attrs["unit"])
            if "units" in column.attrs:  # Amuse case
                # legacy Amuse files store units as S.I.-expression strings;
                # map the known ones to astropy units
                unitname = ensure_string(column.attrs["units"])
                logger.debug("amuse unit: %s", unitname)
                if unitname == "(0.01 * system.get('S.I.').base('length'))":
                    self.units[column_name] = astropy.units.Unit("cm")
                if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                    self.units[column_name] = astropy.units.Unit("cm/s")
                if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                    self.units[column_name] = astropy.units.Unit("gram")
                if unitname == "system.get('S.I.').base('length')":
                    self.units[column_name] = astropy.units.Unit("m")
                if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                    self.units[column_name] = astropy.units.Unit("m/s")
                if unitname == "system.get('S.I.').base('mass')":
                    self.units[column_name] = astropy.units.Unit("kg")
            data = column if self._version == 1 else column['data']
            if hasattr(data, "dtype"):
                if "dtype" in data.attrs and data.attrs["dtype"] == "str":
                    # string column: offsets + bytes (+ optional null bitmap)
                    indices = self._map_hdf5_array(column['indices'])
                    bytes = self._map_hdf5_array(data)
                    if "null_bitmap" in column:
                        null_bitmap = self._map_hdf5_array(
                            column['null_bitmap'])
                    else:
                        null_bitmap = None
                    if isinstance(
                            indices,
                            np.ndarray):  # this is a real mmappable file
                        self.add_column(
                            column_name,
                            vaex.arrow.convert.
                            arrow_string_array_from_buffers(
                                bytes, indices, null_bitmap))
                    else:
                        # if not a real mmappable array, we fall back to this,
                        # maybe we can generalize this
                        self.add_column(
                            column_name,
                            ColumnStringArrow(indices, bytes, null_bitmap=null_bitmap))
                else:
                    shape = data.shape
                    if True:  # len(shape) == 1:
                        dtype = data.dtype
                        if "dtype" in data.attrs:
                            dtype = data.attrs["dtype"]
                        logger.debug("adding column %r with dtype %r", column_name, dtype)
                        # self.addColumn(column_name, offset, len(data), dtype=dtype)
                        if self._version > 1 and 'mask' in column:
                            self.add_column(
                                column_name,
                                self._map_hdf5_array(data, column['mask']))
                        else:
                            self.add_column(column_name, self._map_hdf5_array(data))
                    else:
                        # NOTE(review): dead branch (guarded by `if True:` above);
                        # `offset` is not defined in this scope — would raise if
                        # ever re-enabled
                        transposed = shape[1] < shape[0]
                        self.addRank1(column_name, offset, shape[1], length1=shape[0], dtype=data.dtype, stride=1, stride1=1, transposed=transposed)
    all_columns = dict(**self._columns)
    # in case the column_order refers to non-existing columns
    column_order = [k for k in column_order if k in all_columns]
    column_names = []
    self._columns = {}
    # columns named in column_order come first, in that order
    for name in column_order:
        self._columns[name] = all_columns.pop(name)
    # add the rest
    for name, col in all_columns.items():
        self._columns[name] = col