Example #1
def encode(self, encoding):
    return {
        'expression': self.expressions[0],
        'dtype': encoding.encode('dtype', self.dtype),
        'dtype_item': encoding.encode('dtype', self.dtype_item),
        'flatten': self.flatten,
        'unique_limit': self.unique_limit,
        'selection': self.selection
    }
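
Each of these encode methods returns a plain, JSON-style spec in which any nested object is delegated to the passed-in encoding registry. A minimal round-trip sketch, using only the Encoding/serialize/deserialize API demonstrated in the tests further below (tag and obj are placeholders):

import vaex.encoding

def roundtrip(tag, obj):
    # encode to a spec and pack spec + binary blobs into wire data ...
    encoding = vaex.encoding.Encoding()
    wiredata = vaex.encoding.serialize(encoding.encode(tag, obj), encoding)
    # ... then decode with a fresh Encoding on the receiving side
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    return encoding.decode(tag, data)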
Example #2
def encode(self, encoding):
    return {
        'expression': self.expressions[0],
        'dtype': encoding.encode('dtype', self.dtype),
        'dtype_item': encoding.encode('dtype', self.dtype_item),
        'flatten': self.flatten,
        'limit': self.limit,
        'limit_raise': self.limit_raise,
        'selection': self.selection,
        'return_inverse': self.return_inverse
    }
Example #3
def encode(self, encoding):
    return {
        'expressions': self.expressions,
        'shape': self.shape,
        'selections': self.selections,
        'op': encoding.encode('_op', self.op),
        'weights': self.weights,
        'dtype': encoding.encode('dtype', DataType(self.dtype)),
        'minima': self.minima,
        'maxima': self.maxima,
        'edges': self.edges,
        'selection_waslist': self.selection_waslist
    }
Example #4
def encode(self, encoding):
    # TODO: get rid of dtypes
    return {
        'binners': encoding.encode_list('binner', self.binners),
        'aggregation': encoding.encode("aggregation", self.aggregation_description),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }
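
The encode_list and encode_dict helpers used here apply the named encoder element-wise over a list or over a dict's values. A hedged sketch of the resulting spec (the column names are invented for illustration):

import numpy as np
import vaex.encoding

encoding = vaex.encoding.Encoding()
dtypes = {'x': np.dtype('f8'), 'y': np.dtype('i4')}  # hypothetical per-column dtypes
spec = encoding.encode_dict('dtype', dtypes)  # a plain dict of encoded dtype specs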
Example #5
def test_encoding():
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('blobtest', {'someblob': b'1234'})
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    values = encoding.decode('blobtest', data)
    assert values['someblob'] == b'1234'
Example #6
def rebuild_dataframe_vaex(df):
    # encoding and decode
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dataframe', df)
    blob = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataframe', data)
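
A usage sketch for this helper, assuming a trivial dataframe built with vaex.from_arrays:

import numpy as np
import vaex

df = vaex.from_arrays(x=np.arange(10))
df2 = rebuild_dataframe_vaex(df)
assert df2.x.tolist() == df.x.tolist()  # the round trip should preserve the data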
Example #7
def rebuild_dataset_vaex(ds):
    # encoding and decode
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dataset', ds)
    blob = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataset', data)
Example #8
def test_encoding_numpy():
    x = np.arange(10, dtype='>f4')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)
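
Note the deliberately big-endian '>f4' input: the round trip must preserve byte order as well as values. A stricter assertion one could add (an assumption, not part of the original test):

assert value.dtype == x.dtype  # byte order survives serialization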
Example #9
def test_encoding_arrow(array_factory_arrow):
    x = array_factory_arrow(np.arange(10, dtype='f4'))
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('arrow-array', x)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('arrow-array', data)
    assert value.to_pylist() == x.to_pylist()
Example #10
def encode(self, encoding):
    # TODO: get rid of dtypes
    encoded = {
        'grid': encoding.encode('grid', self.grid),
        'aggregations': encoding.encode_list("aggregation", self.aggregation_descriptions),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }
    if self.has_values:
        encoded['values'] = encoding.encode_list2('ndarray', self.get_values())
    return encoded
Example #11
def test_encoding_numpy_string_objects():
    x = np.array(['vaex', 'is', None, 'fast'])
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)
Example #12
def test_encoding_numpy_datetime():
    x = np.arange('2001', '2005', dtype='M')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)
Example #13
def test_encoding_dtype():
    dtype = np.dtype('>f8')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dtype', dtype)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    print(data)
    value = encoding.decode('dtype', data)
    assert value == dtype
    assert value.is_numpy
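
The final assertion shows that decoding yields vaex's DataType wrapper (which exposes .is_numpy) rather than a bare numpy dtype, while still comparing equal to it. A hedged extra check, assuming the wrapper lives at vaex.datatype.DataType:

from vaex.datatype import DataType

assert isinstance(value, DataType)  # assumption: decode returns the wrapper type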
Example #14
def encode(self, encoding):
    # TODO: get rid of dtypes
    return {
        'task': type(self).name,
        'grid': encoding.encode('grid', self.parent_grid),
        'aggregations': encoding.encode_list("aggregation", self.aggregation_descriptions),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }
Example #15
def test_encoding_numpy_masked():
    x = np.arange(10, dtype='>f4')
    mask = x > 4
    x = np.ma.array(x, mask=mask)
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)
    assert np.all(value.mask == x.mask)
Example #16
def rebuild_with_skip(ds, skip):
    repr(ds)  # for coverage
    # encoding and decode
    encoding = vaex.encoding.Encoding()
    encoding.set_object_spec(skip.id, None)  # this will cause it to skip serialization
    data = encoding.encode('dataset', ds)
    assert encoding._object_specs[skip.id] is None
    del encoding._object_specs[skip.id]
    blob = vaex.encoding.serialize(data, encoding)

    encoding = vaex.encoding.Encoding()
    encoding.set_object(skip.id, skip)
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataset', data)
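
This exercises the object-spec escape hatch: pre-registering None under an object's id makes the encoder skip serializing it, and the receiving side re-attaches the real object with set_object before decoding. A hedged usage sketch (ds and skip are placeholders for objects the caller already holds on both sides):

ds2 = rebuild_with_skip(ds, skip)  # e.g. skip a column the receiver already has
assert ds2 == ds  # assumption: datasets compare equal after the round trip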
Example #17
    async def execute_async(self):
        logger.debug("starting with execute")

        with self.lock:  # setup thread local initial values
            if not hasattr(self.local, 'executing'):
                self.local.executing = False

        # we don't allow any thread from our thread pool to enter (a computation should never produce a new task)
        # and we explicitly disallow reentry (this usually means a bug in vaex, or bad usage)
        chunk_executor_thread = threading.current_thread() in self.thread_pool._threads
        import traceback
        trace = ''.join(traceback.format_stack())
        if chunk_executor_thread or self.local.executing:
            logger.error("nested execute call")
            raise RuntimeError(
                "nested execute call: %r %r\nlast trace:\n%s\ncurrent trace:\n%s"
                % (chunk_executor_thread, self.local.executing,
                   self.local.last_trace, trace))
        else:
            self.local.last_trace = trace

        self.local.executing = True
        try:
            t0 = time.time()
            self.local.cancelled = False
            self.signal_begin.emit()
            cancelled = False
            # keep getting a list of tasks
            # we currently process tasks (grouped) per df
            # but also, tasks can add new tasks
            while not cancelled:
                tasks = self.local.tasks = self._pop_tasks()
                if not tasks:
                    break
                tasks = _merge(tasks, tasks[0].df)
                run = Run(tasks)
                self.passes += 1

                # (re)throw exceptions as soon as possible to avoid complicated stack traces
                for task in tasks:
                    if task.isRejected:
                        task.get()
                    if hasattr(task, "check"):
                        try:
                            task.check()
                        except Exception as e:
                            task.reject(e)
                            raise

                for task in run.tasks:
                    task._results = []
                    if not any(task.signal_progress.emit(0)):
                        logger.debug("task cancelled immediately")
                        task.cancelled = True
                row_count = run.df._index_end - run.df._index_start
                chunk_size = self.chunk_size_for(row_count)
                run.block_scopes = [
                    run.df._block_scope(0, chunk_size)
                    for i in range(self.thread_pool.nthreads)
                ]
                encoding = vaex.encoding.Encoding()
                for task in tasks:
                    spec = encoding.encode('task', task)
                    spec['task-part-cpu-type'] = spec.pop('task-type')

                    def create_task_part():
                        return encoding.decode('task-part-cpu',
                                               spec,
                                               df=run.df)

                    # We want at least 1 task part (otherwise we cannot do any work)
                    # then we ask for the task part how often we should split
                    # This means that we can have 100 threads, but only 2 task parts
                    # In this case, evaluation of expressions is still multithreaded,
                    # but aggregation is reduced to effectively 2 threads.
                    task_part_0 = create_task_part()
                    ideal_task_splits = task_part_0.ideal_splits(
                        self.thread_pool.nthreads)
                    assert ideal_task_splits <= self.thread_pool.nthreads, f'Cannot have more splits {ideal_task_splits} than threads {self.thread_pool.nthreads}'
                    if ideal_task_splits == self.thread_pool.nthreads:
                        # in the simple case, we just use a list
                        task._parts = [task_part_0] + [
                            create_task_part()
                            for i in range(1, self.thread_pool.nthreads)
                        ]
                    else:
                        # otherwise a queue
                        task._parts = queue.Queue()
                        task._parts.put(task_part_0)
                        for i in range(1, ideal_task_splits):
                            task._parts.put(create_task_part())
                length = run.df.active_length()
                if vaex.cache.is_on():
                    key_df = run.df.fingerprint()
                # TODO: in the future we might want to enable the zigzagging again, but this requires all datasets to implement it
                # if self.zigzag:
                #     self.zig = not self.zig
                dataset = run.df.dataset[run.df._index_start:run.df._index_end]
                # find the columns from the dataset we need
                variables = set()
                for expression in run.expressions:
                    variables |= run.df._expr(expression).expand().variables(
                        ourself=True)
                columns = list(variables - set(run.df.variables) -
                               set(run.df.virtual_columns))
                logger.debug('Using columns %r from dataset, chunk_size=%r',
                             columns, chunk_size)
                for column in columns:
                    if column not in dataset:
                        raise RuntimeError(
                            f'Oops, requesting column {column} from dataset, but it does not exist'
                        )
                async for element in self.thread_pool.map_async(
                        self.process_part,
                        dataset.chunk_iterator(columns, chunk_size),
                        dataset.row_count,
                        progress=lambda p: all(self.signal_progress.emit(p))
                        and all([
                            all(task.signal_progress.emit(p)) for task in tasks
                        ]) and all([not task.cancelled for task in tasks]),
                        cancel=lambda: self._cancel(run),
                        unpack=True,
                        run=run):
                    pass  # just consume all elements
                duration_wallclock = time.time() - t0
                logger.debug("executing took %r seconds", duration_wallclock)
                cancelled = self.local.cancelled or any(
                    task.cancelled for task in tasks) or run.cancelled
                logger.debug("cancelled: %r", cancelled)
                if cancelled:
                    logger.debug("execution aborted")
                    for task in tasks:
                        task.reject(UserAbort("cancelled"))
                        # remove references
                        task._result = None
                        task._results = None
                        cancelled = True
                        if isinstance(task, vaex.tasks.TaskAggregations):
                            for subtask in task.original_tasks:
                                subtask.reject(UserAbort("cancelled"))
                else:
                    for task in tasks:
                        logger.debug("fulfill task: %r", task)
                        if not task.cancelled:
                            parts = task._parts
                            if not isinstance(parts, list):
                                parts_queue = parts
                                parts = []
                                while not parts_queue.empty():
                                    parts.append(parts_queue.get())
                            parts[0].reduce(parts[1:])
                            logger.debug("wait for task: %r", task)
                            task._result = parts[0].get_result()
                            task.end()
                            task.fulfill(task._result)
                            logger.debug("got result for: %r", task)
                            if task._result is not None and task.cacheable:  # we don't want to store None
                                if vaex.cache.is_on():
                                    # we only want to store the original task results into the cache
                                    tasks_cachable = task.original_tasks if isinstance(
                                        task,
                                        vaex.tasks.TaskAggregations) else [
                                            task
                                        ]
                                    for task_cachable in tasks_cachable:
                                        key_task = task_cachable.fingerprint()
                                        # tasks' fingerprints don't include the dataframe
                                        key = f'{key_task}-{key_df}'
                                        previous_result = vaex.cache.get(
                                            key, type='task')
                                        if previous_result is not None:  # and (previous_result != task_cachable.get()):
                                            try:
                                                if previous_result != task_cachable.get():
                                                    # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                    logger.warning(
                                                        "calculated new result: %r, while cache had value: %r",
                                                        previous_result,
                                                        task_cachable.get())
                                            except ValueError:  # when comparing numpy results
                                                # warn only when the results actually differ
                                                if not np.array_equal(previous_result, task_cachable.get(), equal_nan=True):
                                                    # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                    logger.warning(
                                                        "calculated new result: %r, while cache had value: %r",
                                                        previous_result,
                                                        task_cachable.get())
                                        vaex.cache.set(key, task_cachable.get(), type='task',
                                                       duration_wallclock=duration_wallclock)
                                        logger.info(
                                            "added result: %r in cache under key: %r",
                                            task_cachable.get(), key)

                        else:
                            task.reject(UserAbort("Task was cancelled"))
                            # remove references
                            cancelled = True
                        task._result = None
                        task._results = None
                    self.signal_end.emit()
        except:  # noqa
            self.signal_cancel.emit()
            logger.exception("error in task, flush task queue")
            raise
        finally:
            self.local.executing = False
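
The heart of the scheduling above is that a task is cloned into independent per-thread parts by round-tripping its encoded spec with a swapped type tag. Isolated, the pattern looks like this (task, df and nthreads are assumed to be bound; a sketch, not the executor itself):

encoding = vaex.encoding.Encoding()
spec = encoding.encode('task', task)
# re-tag the spec so it decodes as a CPU task *part* instead of a task
spec['task-part-cpu-type'] = spec.pop('task-type')
parts = [encoding.decode('task-part-cpu', spec, df=df) for _ in range(nthreads)]
# each thread then fills its own part; parts[0].reduce(parts[1:]) combines them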
Example #18
    def execute_generator(self, use_async=False):
        logger.debug("starting with execute")

        with self.lock:  # setup thread local initial values
            if not hasattr(self.local, 'executing'):
                self.local.executing = False

        try:
            t0 = time.time()
            self.local.cancelled = False
            self.signal_begin.emit()
            # keep getting a list of tasks
            # we currently process tasks (grouped) per df
            # but also, tasks can add new tasks
            while True:
                tasks = self.local.tasks = self._pop_tasks()

                # we don't allow any thread from our thread pool to enter (a computation should never produce a new task)
                # and we explicitly disallow reentry (this usually means a bug in vaex, or bad usage)
                chunk_executor_thread = threading.current_thread() in self.thread_pool._threads
                import traceback
                trace = ''.join(traceback.format_stack())
                if chunk_executor_thread or (self.local.executing and (
                        has_contextvars is False or self.isnested.get() is True)):
                    logger.error("nested execute call")
                    raise RuntimeError(
                        "nested execute call: %r %r\nlast trace:\n%s\ncurrent trace:\n%s"
                        % (chunk_executor_thread, self.local.executing,
                           self.local.last_trace, trace))
                else:
                    self.local.last_trace = trace

                self.local.executing = True
                if has_contextvars:
                    self.isnested.set(True)

                if not tasks:
                    break
                tasks = _merge(tasks)
                run = Run(tasks)
                self.passes += 1
                dataset = run.dataset

                run.variables = {}
                for df in run.tasks_per_df.keys():
                    run.variables[df] = {
                        key: df.evaluate_variable(key)
                        for key in df.variables.keys()
                    }

                # (re)throw exceptions as soon as possible to avoid complicated stack traces
                for task in tasks:
                    if task.isRejected:
                        task.get()
                    if hasattr(task, "check"):
                        try:
                            task.check()
                        except Exception as e:
                            task.reject(e)
                            raise

                for task in run.tasks:
                    task.signal_start.emit(self)

                for task in run.tasks:
                    task._results = []
                    if not any(task.signal_progress.emit(0)):
                        logger.debug("task cancelled immediately")
                        task.cancelled = True
                row_count = dataset.row_count
                chunk_size = self.chunk_size_for(row_count)
                encoding = vaex.encoding.Encoding()
                run.nthreads = nthreads = self.thread_pool.nthreads
                task_checkers = vaex.tasks.create_checkers()
                memory_tracker = vaex.memory.create_tracker()
                vaex.memory.local.agg = memory_tracker
                # we track this for consistency
                memory_usage = 0
                for task in tasks:
                    for task_checker in task_checkers:
                        task_checker.add_task(task)
                    spec = encoding.encode('task', task)
                    spec['task-part-cpu-type'] = spec.pop('task-type')

                    def create_task_part():
                        nonlocal memory_usage
                        task_part = encoding.decode('task-part-cpu',
                                                    spec,
                                                    df=task.df,
                                                    nthreads=nthreads)
                        memory_usage += task_part.memory_usage()
                        for task_checker in task_checkers:
                            task_checker.add_task(task)
                        if task.requires_fingerprint:
                            task_part.fingerprint = task.fingerprint()
                        return task_part

                    # We want at least 1 task part (otherwise we cannot do any work)
                    # then we ask for the task part how often we should split
                    # This means that we can have 100 threads, but only 2 task parts
                    # In this case, evaluation of expressions is still multithreaded,
                    # but aggregation is reduced to effectively 2 threads.
                    task_part_0 = create_task_part()
                    ideal_task_splits = task_part_0.ideal_splits(
                        self.thread_pool.nthreads)
                    assert ideal_task_splits <= self.thread_pool.nthreads, f'Cannot have more splits {ideal_task_splits} than threads {self.thread_pool.nthreads}'
                    if ideal_task_splits == self.thread_pool.nthreads or task.see_all:
                        # in the simple case, we just use a list
                        task._parts = [task_part_0] + [
                            create_task_part()
                            for i in range(1, ideal_task_splits)
                        ]
                    else:
                        # otherwise a queue
                        task._parts = queue.Queue()
                        task._parts.put(task_part_0)
                        for i in range(1, ideal_task_splits):
                            task._parts.put(create_task_part())
                if memory_usage != memory_tracker.used:
                    raise RuntimeError(
                        f"Reported memory usage by tasks was {memory_usage}, while tracker listed {memory_tracker.used}"
                    )
                vaex.memory.local.agg = None

                # TODO: in the future we might want to enable the zigzagging again, but this requires all datasets to implement it
                # if self.zigzag:
                #     self.zig = not self.zig
                def progress(p):
                    # if there is no global cancel and at least one task wants to continue, we continue
                    ok_tasks = any([task.progress(p) for task in tasks])
                    all_stopped = all([task.stopped for task in tasks])
                    ok_executor = all(self.signal_progress.emit(p))
                    if all_stopped:
                        logger.debug(
                            "Pass cancelled because all tasks are stopped: %r",
                            tasks)
                    if not ok_tasks:
                        logger.debug(
                            "Pass cancelled because all tasks cancelled: %r",
                            tasks)
                    if not ok_executor:
                        logger.debug(
                            "Pass cancelled because of the global progress event: %r",
                            self.signal_progress.callbacks)
                    return ok_tasks and ok_executor and not all_stopped

                yield from self.thread_pool.map(
                    self.process_part,
                    dataset.chunk_iterator(run.dataset_deps, chunk_size),
                    dataset.row_count,
                    progress=progress,
                    cancel=lambda: self._cancel(run),
                    unpack=True,
                    run=run,
                    use_async=use_async)
                duration_wallclock = time.time() - t0
                logger.debug("executing took %r seconds", duration_wallclock)
                self.local.executing = False
                if has_contextvars:
                    self.isnested.set(False)
                if True:  # kept to keep the diff small
                    for task in tasks:
                        if not task.cancelled:
                            logger.debug("fulfill task: %r", task)
                            parts = task._parts
                            if not isinstance(parts, list):
                                parts_queue = parts
                                parts = []
                                while not parts_queue.empty():
                                    parts.append(parts_queue.get())
                            parts[0].reduce(parts[1:])
                            logger.debug("wait for task: %r", task)
                            task._result = parts[0].get_result()
                            task.end()
                            task.fulfill(task._result)
                            logger.debug("got result for: %r", task)
                            if task._result is not None and task.cacheable:  # we don't want to store None
                                if vaex.cache.is_on():
                                    # we only want to store the original task results into the cache
                                    tasks_cachable = task.original_tasks if isinstance(
                                        task,
                                        vaex.tasks.TaskAggregations) else [
                                            task
                                        ]
                                    for task_cachable in tasks_cachable:
                                        key = task_cachable.fingerprint()
                                        previous_result = vaex.cache.get(
                                            key, type='task')
                                        if previous_result is not None:  # and (previous_result != task_cachable.get()):
                                            try:
                                                if previous_result != task_cachable.get():
                                                    # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                    logger.warning(
                                                        "calculated new result: %r, while cache had value: %r",
                                                        previous_result,
                                                        task_cachable.get())
                                            except ValueError:  # when comparing numpy results
                                                if not np.array_equal(previous_result, task_cachable.get(), equal_nan=True):
                                                    # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                    logger.warning(
                                                        "calculated new result: %r, while cache had value: %r",
                                                        previous_result,
                                                        task_cachable.get())
                                        vaex.cache.set(key, task_cachable.get(), type='task',
                                                       duration_wallclock=duration_wallclock)
                                        logger.info(
                                            "added result: %r in cache under key: %r",
                                            task_cachable.get(), key)

                        else:
                            logger.debug("rejecting task: %r", task)
                            # we now reject, in the main thread
                            if task._toreject:
                                task.reject(task._toreject)
                            else:
                                task.reject(UserAbort("Task was cancelled"))
                            # remove references
                        task._result = None
                        task._results = None
                    self.signal_end.emit()
        except:  # noqa
            self.signal_cancel.emit()
            raise
        finally:
            self.local.executing = False
            if has_contextvars:
                self.isnested.set(False)
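
Both executors end with the same caching discipline: a result is keyed by the task fingerprint (the async variant appends the dataframe fingerprint), compared against any previous cache entry, and written back. Reduced to its core, and simplified to skip the ValueError handling for numpy comparisons (a sketch using only the vaex.cache calls seen above):

key = task.fingerprint()
previous = vaex.cache.get(key, type='task')
if previous is not None and previous != task.get():
    # can happen when two executors compute the same task concurrently
    logger.warning("calculated new result: %r, while cache had value: %r", previous, task.get())
vaex.cache.set(key, task.get(), type='task', duration_wallclock=duration_wallclock)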