Ejemplo n.º 1
0
 def mmap(self, mmap, file):
     """Swap this column's buffers for their ``h5mmap`` results.

     ``array``, ``index_array`` and (when it is not None)
     ``null_bitmap_array`` are each replaced by the value ``h5mmap``
     returns for them, after which ``self.to_array`` is rebuilt on top of
     those buffers and finally passed through
     ``ColumnStringArrow.from_arrow``.

     From this point on only the mmapped arrays are used;
     USE_MMAP=False is not supported for strings yet.
     """
     self.array = h5mmap(mmap, file, self.array)
     self.index_array = h5mmap(mmap, file, self.index_array)
     if self.null_bitmap_array is not None:
         self.null_bitmap_array = h5mmap(mmap, file, self.null_bitmap_array)

     if not isinstance(self.index_array, np.ndarray):
         # The index buffer is not a plain numpy array: wrap the buffers
         # in a ColumnStringArrow directly.
         self.to_array = ColumnStringArrow(self.index_array,
                                           self.array,
                                           null_bitmap=self.null_bitmap_array)
     else:
         # This is a real mmappable file: build an arrow string array
         # straight from the buffers.
         self.to_array = vaex.arrow.convert.arrow_string_array_from_buffers(
             self.array, self.index_array, self.null_bitmap_array)

     # Both branches are funnelled through from_arrow unconditionally.
     self.to_array = ColumnStringArrow.from_arrow(self.to_array)
Ejemplo n.º 2
0
def _export_column(dataset_input,
                   dataset_output,
                   column_name,
                   shuffle,
                   sort,
                   selection,
                   N,
                   order_array,
                   order_array_inverse,
                   progress_status,
                   parallel=True):
    """Copy one column of ``dataset_input`` into ``dataset_output``.

    The column is read chunk by chunk through ``dataset_input.evaluate``
    (filtered, honouring ``selection``) and written into
    ``dataset_output.columns[column_name]``.  When ``shuffle`` or ``sort``
    is truthy the chunks are first collected in an in-memory copy and
    written to the output array in one go at the end, to avoid random
    writes.

    Uses ``max_length``, ``progress_lock`` and ``logger`` from the
    enclosing module scope.

    :param dataset_input: source dataset; must provide ``_block_scope``,
        ``data_type``, ``evaluate`` and ``len()``.
    :param dataset_output: destination; ``columns[column_name]`` is the
        array that gets filled.
    :param column_name: name of the column to copy.
    :param shuffle: truthy to stage in memory and reorder.
    :param sort: truthy to stage in memory and reorder.
    :param selection: forwarded to ``evaluate``; also decides which index
        slot receives the final string byte offset.
    :param N: output length; only used in the debug log message here.
    :param order_array: indexed with ``[start:stop]`` to get the target
        positions of non-string values when shuffling/sorting.
    :param order_array_inverse: handed to ``string_sequence.lazy_index``
        to reorder strings at the end.
    :param progress_status: its ``value`` is increased by the length of
        every processed chunk; a truthy ``cancelled`` stops the loop.
    :param parallel: forwarded to ``evaluate``.
    """
    # The returned scope object is not used further in this function.
    block_scope = dataset_input._block_scope(
        0, vaex.execution.buffer_size_default)

    target = dataset_output.columns[column_name]
    dtype = dataset_input.data_type(column_name, array_type='numpy')
    is_string = vaex.array_types.is_string_type(dtype)
    if is_string:
        # we don't support chunked arrays here
        assert isinstance(target, pa.Array)
        # TODO legacy: we still use ColumnStringArrow to write, find a way to do this with arrow
        target = ColumnStringArrow.from_arrow(target)

    reorder = bool(shuffle or sort)
    if reorder:
        # Work on an in-memory copy, otherwise we would do random writes,
        # which is VERY inefficient.
        disk_target = target
        if np.ma.isMaskedArray(target):
            target = np.empty_like(disk_target)
        elif vaex.array_types.is_string_type(dtype):
            # an empty copy of the string column
            target = target._zeros_like()
        else:
            target = np.zeros_like(disk_target)

    write_pos = 0  # running output row; needed for selections
    # if not selection else dataset_input.length_unfiltered()
    total_rows = len(dataset_input)
    # TODO: if no filter, selection or mask, we can choose the quick path for str
    byte_pos = 0  # running byte offset into the string buffer

    chunks = dataset_input.evaluate(column_name,
                                    chunk_size=max_length,
                                    filtered=True,
                                    parallel=parallel,
                                    selection=selection,
                                    array_type='numpy-arrow')
    for start, stop, chunk in chunks:
        logger.debug("from %d to %d (total length: %d, output length: %d)",
                     start, stop, len(dataset_input), N)
        chunk_len = len(chunk)
        if chunk_len:
            if is_string:
                # For strings, sorting/shuffling is not taken into account
                # while building the structure; it is applied afterwards.
                source_seq = _to_string_sequence(chunk)
                dest_seq = target.string_sequence.slice(
                    write_pos, write_pos + chunk_len, byte_pos)
                byte_pos += dest_seq.fill_from(source_seq)
            else:
                fill_value = np.nan if dtype.kind == "f" else None
                if reorder:
                    where = order_array[start:stop]
                else:
                    where = slice(write_pos, write_pos + chunk_len)
                if dtype.is_datetime:
                    chunk = chunk.view(np.int64)

                chunk_is_masked = np.ma.isMaskedArray(chunk)
                target_is_masked = np.ma.isMaskedArray(target)
                if not chunk_is_masked:
                    target[where] = chunk
                elif target_is_masked:
                    # masked -> masked: copy data and mask separately
                    target.data[where] = chunk.filled(fill_value)
                    target.mask[where] = chunk.mask
                else:
                    # masked -> plain: masked slots become fill_value
                    target[where] = chunk.filled(fill_value)
            write_pos += chunk_len

        with progress_lock:
            progress_status.value += stop - start
        if progress_status.cancelled:
            break

    if is_string:
        # write out the last index
        last_slot = write_pos if selection else total_rows
        target.indices[last_slot] = byte_pos

    if reorder:
        # write to disk in one go
        if is_string:
            # strings are sorted afterwards
            reordered = target.string_sequence.lazy_index(order_array_inverse)
            disk_target.string_sequence.fill_from(reordered)
        elif np.ma.isMaskedArray(target) and np.ma.isMaskedArray(disk_target):
            disk_target.data[:] = target.data
            disk_target.mask[:] = target.mask
        else:
            disk_target[:] = target