def encode(self, encoding):
    return {
        'expression': self.expressions[0],
        'dtype': encoding.encode('dtype', self.dtype),
        'dtype_item': encoding.encode('dtype', self.dtype_item),
        'flatten': self.flatten,
        'unique_limit': self.unique_limit,
        'selection': self.selection
    }

def encode(self, encoding):
    return {
        'expression': self.expressions[0],
        'dtype': encoding.encode('dtype', self.dtype),
        'dtype_item': encoding.encode('dtype', self.dtype_item),
        'flatten': self.flatten,
        'limit': self.limit,
        'limit_raise': self.limit_raise,
        'selection': self.selection,
        'return_inverse': self.return_inverse
    }

def encode(self, encoding):
    return {
        'expressions': self.expressions,
        'shape': self.shape,
        'selections': self.selections,
        'op': encoding.encode('_op', self.op),
        'weights': self.weights,
        'dtype': encoding.encode('dtype', DataType(self.dtype)),
        'minima': self.minima,
        'maxima': self.maxima,
        'edges': self.edges,
        'selection_waslist': self.selection_waslist
    }

def encode(self, encoding):
    # TODO: get rid of dtypes
    return {
        'binners': encoding.encode_list('binner', self.binners),
        'aggregation': encoding.encode("aggregation", self.aggregation_description),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }

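# The 'blobtest' type used in test_encoding below is registered elsewhere; what
# follows is a minimal sketch of the kind of registration it assumes, using
# vaex.encoding.register and assuming Encoding exposes add_blob/get_blob helpers
# for out-of-band binary data (a sketch, not necessarily the exact test code):
import vaex.encoding


@vaex.encoding.register('blobtest')
class blobtest_encoding:
    @staticmethod
    def encode(encoding, obj):
        # store each bytes value as a blob, keep only the blob reference in the spec
        return {key: encoding.add_blob(value) for key, value in obj.items()}

    @staticmethod
    def decode(encoding, obj):
        # resolve the blob references back into bytes
        return {key: encoding.get_blob(value) for key, value in obj.items()}
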
def test_encoding():
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('blobtest', {'someblob': b'1234'})
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    values = encoding.decode('blobtest', data)
    assert values['someblob'] == b'1234'

def rebuild_dataframe_vaex(df):
    # encode and decode round trip
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dataframe', df)
    blob = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataframe', data)

def rebuild_dataset_vaex(ds):
    # encode and decode round trip
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dataset', ds)
    blob = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataset', data)

def test_encoding_numpy():
    x = np.arange(10, dtype='>f4')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)

def test_encoding_arrow(array_factory_arrow):
    x = array_factory_arrow(np.arange(10, dtype='f4'))
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('arrow-array', x)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('arrow-array', data)
    assert value.to_pylist() == x.to_pylist()

def encode(self, encoding):
    # TODO: get rid of dtypes
    encoded = {
        'grid': encoding.encode('grid', self.grid),
        'aggregations': encoding.encode_list("aggregation", self.aggregation_descriptions),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }
    if self.has_values:
        encoded['values'] = encoding.encode_list2('ndarray', self.get_values())
    return encoded

def test_encoding_numpy_string_objects():
    x = np.array(['vaex', 'is', None, 'fast'])
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)

def test_encoding_numpy_datetime():
    x = np.arange('2001', '2005', dtype='M')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)

def test_encoding_dtype():
    dtype = np.dtype('>f8')
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('dtype', dtype)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    print(data)
    value = encoding.decode('dtype', data)
    # the decoded value is vaex's DataType wrapper, which compares equal to the numpy dtype
    assert value == dtype
    assert value.is_numpy

def encode(self, encoding):
    # TODO: get rid of dtypes
    return {
        'task': type(self).name,
        'grid': encoding.encode('grid', self.parent_grid),
        'aggregations': encoding.encode_list("aggregation", self.aggregation_descriptions),
        'dtypes': encoding.encode_dict("dtype", self.dtypes)
    }

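# Each encode method above is paired with a decode counterpart that rebuilds the
# object from the spec (the executor below calls encoding.decode('task-part-cpu',
# spec, ...)). A minimal sketch of what such a counterpart could look like for
# the spec produced above, assuming Encoding exposes decode_list/decode_dict
# mirroring encode_list/encode_dict; the constructor signature is hypothetical:
@classmethod
def decode(cls, encoding, spec, df):
    grid = encoding.decode('grid', spec['grid'])
    aggregations = encoding.decode_list('aggregation', spec['aggregations'])
    dtypes = encoding.decode_dict('dtype', spec['dtypes'])
    return cls(df, grid, aggregations, dtypes)
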
def test_encoding_numpy_masked():
    x = np.arange(10, dtype='>f4')
    mask = x > 4
    x = np.ma.array(x, mask=mask)
    encoding = vaex.encoding.Encoding()
    data = encoding.encode('ndarray', x)
    wiredata = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    data = vaex.encoding.deserialize(wiredata, encoding)
    value = encoding.decode('ndarray', data)
    assert np.all(value == x)
    assert np.all(value.mask == x.mask)

def rebuild_with_skip(ds, skip):
    repr(ds)  # for coverage
    # encode and decode round trip
    encoding = vaex.encoding.Encoding()
    encoding.set_object_spec(skip.id, None)  # this will cause it to skip serialization
    data = encoding.encode('dataset', ds)
    assert encoding._object_specs[skip.id] is None
    del encoding._object_specs[skip.id]
    blob = vaex.encoding.serialize(data, encoding)
    encoding = vaex.encoding.Encoding()
    encoding.set_object(skip.id, skip)  # provide the skipped object on the decoding side
    data = vaex.encoding.deserialize(blob, encoding)
    return encoding.decode('dataset', data)

async def execute_async(self):
    logger.debug("starting with execute")

    with self.lock:  # setup thread local initial values
        if not hasattr(self.local, 'executing'):
            self.local.executing = False

    # we don't allow any thread from our thread pool to enter (a computation should never produce a new task)
    # and we explicitly disallow reentry (this usually means a bug in vaex, or bad usage)
    chunk_executor_thread = threading.current_thread() in self.thread_pool._threads
    import traceback
    trace = ''.join(traceback.format_stack())
    if chunk_executor_thread or self.local.executing:
        logger.error("nested execute call")
        raise RuntimeError("nested execute call: %r %r\nlast trace:\n%s\ncurrent trace:\n%s" % (chunk_executor_thread, self.local.executing, self.local.last_trace, trace))
    else:
        self.local.last_trace = trace
        self.local.executing = True
    try:
        t0 = time.time()
        self.local.cancelled = False
        self.signal_begin.emit()
        cancelled = False
        # keep getting a list of tasks
        # we currently process tasks (grouped) per df
        # but also, tasks can add new tasks
        while not cancelled:
            tasks = self.local.tasks = self._pop_tasks()
            if not tasks:
                break
            tasks = _merge(tasks, tasks[0].df)
            run = Run(tasks)
            self.passes += 1
            # (re)throw exceptions as soon as possible to avoid complicated stack traces
            for task in tasks:
                if task.isRejected:
                    task.get()
                if hasattr(task, "check"):
                    try:
                        task.check()
                    except Exception as e:
                        task.reject(e)
                        raise
            for task in run.tasks:
                task._results = []
                if not any(task.signal_progress.emit(0)):
                    logger.debug("task cancelled immediately")
                    task.cancelled = True
            row_count = run.df._index_end - run.df._index_start
            chunk_size = self.chunk_size_for(row_count)
            run.block_scopes = [run.df._block_scope(0, chunk_size) for i in range(self.thread_pool.nthreads)]
            encoding = vaex.encoding.Encoding()
            for task in tasks:
                spec = encoding.encode('task', task)
                spec['task-part-cpu-type'] = spec.pop('task-type')

                def create_task_part():
                    return encoding.decode('task-part-cpu', spec, df=run.df)

                # We want at least 1 task part (otherwise we cannot do any work),
                # then we ask the task part how often we should split.
                # This means that we can have 100 threads, but only 2 task parts.
                # In this case, evaluation of expressions is still multithreaded,
                # but aggregation is reduced to effectively 2 threads.
                task_part_0 = create_task_part()
                ideal_task_splits = task_part_0.ideal_splits(self.thread_pool.nthreads)
                assert ideal_task_splits <= self.thread_pool.nthreads, f'Cannot have more splits {ideal_task_splits} than threads {self.thread_pool.nthreads}'
                if ideal_task_splits == self.thread_pool.nthreads:
                    # in the simple case, we just use a list
                    task._parts = [task_part_0] + [create_task_part() for i in range(1, self.thread_pool.nthreads)]
                else:
                    # otherwise a queue
                    task._parts = queue.Queue()
                    task._parts.put(task_part_0)
                    for i in range(1, ideal_task_splits):
                        task._parts.put(create_task_part())
            length = run.df.active_length()
            if vaex.cache.is_on():
                key_df = run.df.fingerprint()
            # TODO: in the future we might want to enable the zigzagging again, but this requires all datasets to implement it
            # if self.zigzag:
            #     self.zig = not self.zig
            dataset = run.df.dataset[run.df._index_start:run.df._index_end]
            # find the columns from the dataset we need
            variables = set()
            for expression in run.expressions:
                variables |= run.df._expr(expression).expand().variables(ourself=True)
            columns = list(variables - set(run.df.variables) - set(run.df.virtual_columns))
            logger.debug('Using columns %r from dataset, chunk_size=%r', columns, chunk_size)
            for column in columns:
                if column not in dataset:
                    raise RuntimeError(f'Oops, requesting column {column} from dataset, but it does not exist')
            async for element in self.thread_pool.map_async(
                    self.process_part, dataset.chunk_iterator(columns, chunk_size), dataset.row_count,
                    progress=lambda p: all(self.signal_progress.emit(p))
                        and all([all(task.signal_progress.emit(p)) for task in tasks])
                        and all([not task.cancelled for task in tasks]),
                    cancel=lambda: self._cancel(run), unpack=True, run=run):
                pass  # just consume all elements
            duration_wallclock = time.time() - t0
            logger.debug("executing took %r seconds", duration_wallclock)
            cancelled = self.local.cancelled or any(task.cancelled for task in tasks) or run.cancelled
            logger.debug("cancelled: %r", cancelled)
            if cancelled:
                logger.debug("execution aborted")
                for task in tasks:
                    task.reject(UserAbort("cancelled"))
                    # remove references
                    task._result = None
                    task._results = None
                    cancelled = True
                    if isinstance(task, vaex.tasks.TaskAggregations):
                        for subtask in task.original_tasks:
                            subtask.reject(UserAbort("cancelled"))
            else:
                for task in tasks:
                    logger.debug("fulfill task: %r", task)
                    if not task.cancelled:
                        parts = task._parts
                        if not isinstance(parts, list):
                            parts_queue = parts
                            parts = []
                            while not parts_queue.empty():
                                parts.append(parts_queue.get())
                        parts[0].reduce(parts[1:])
                        logger.debug("wait for task: %r", task)
                        task._result = parts[0].get_result()
                        task.end()
                        task.fulfill(task._result)
                        logger.debug("got result for: %r", task)
                        if task._result is not None and task.cacheable:  # we don't want to store None
                            if vaex.cache.is_on():
                                # we only want to store the original task results into the cache
                                tasks_cachable = task.original_tasks if isinstance(task, vaex.tasks.TaskAggregations) else [task]
                                for task_cachable in tasks_cachable:
                                    key_task = task_cachable.fingerprint()
                                    # tasks' fingerprints don't include the dataframe
                                    key = f'{key_task}-{key_df}'
                                    previous_result = vaex.cache.get(key, type='task')
                                    if previous_result is not None:
                                        try:
                                            if previous_result != task_cachable.get():
                                                # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                logger.warning("calculated new result: %r, while cache had value: %r", previous_result, task_cachable.get())
                                        except ValueError:  # when comparing numpy results
                                            if not np.array_equal(previous_result, task_cachable.get(), equal_nan=True):
                                                # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                logger.warning("calculated new result: %r, while cache had value: %r", previous_result, task_cachable.get())
                                    vaex.cache.set(key, task_cachable.get(), type='task', duration_wallclock=duration_wallclock)
                                    logger.info("added result: %r in cache under key: %r", task_cachable.get(), key)
                    else:
                        task.reject(UserAbort("Task was cancelled"))
                        # remove references
                        cancelled = True
                        task._result = None
                        task._results = None
        self.signal_end.emit()
    except:  # noqa
        self.signal_cancel.emit()
        logger.exception("error in task, flush task queue")
        raise
    finally:
        self.local.executing = False

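# The part/reduce pattern above in isolation: each worker owns a task part,
# processes chunks independently, and at the end all parts are merged into
# part 0 via reduce() before get_result(). A minimal self-contained sketch
# (SumPart is a hypothetical task part, not vaex API):
from concurrent.futures import ThreadPoolExecutor


class SumPart:
    def __init__(self):
        self.total = 0

    def process(self, chunk):
        # per-thread partial work, no shared state
        self.total += sum(chunk)

    def reduce(self, others):
        # merge the other parts into this one
        for other in others:
            self.total += other.total

    def get_result(self):
        return self.total


chunks = [range(0, 100), range(100, 200), range(200, 300), range(300, 400)]
parts = [SumPart() for _ in range(4)]
with ThreadPoolExecutor(4) as pool:
    list(pool.map(lambda part_chunk: part_chunk[0].process(part_chunk[1]), zip(parts, chunks)))
parts[0].reduce(parts[1:])
assert parts[0].get_result() == sum(range(400))
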
def execute_generator(self, use_async=False):
    logger.debug("starting with execute")

    with self.lock:  # setup thread local initial values
        if not hasattr(self.local, 'executing'):
            self.local.executing = False
    try:
        t0 = time.time()
        self.local.cancelled = False
        self.signal_begin.emit()
        # keep getting a list of tasks
        # we currently process tasks (grouped) per df
        # but also, tasks can add new tasks
        while True:
            tasks = self.local.tasks = self._pop_tasks()
            # we don't allow any thread from our thread pool to enter (a computation should never produce a new task)
            # and we explicitly disallow reentry (this usually means a bug in vaex, or bad usage)
            chunk_executor_thread = threading.current_thread() in self.thread_pool._threads
            import traceback
            trace = ''.join(traceback.format_stack())
            if chunk_executor_thread or self.local.executing and (has_contextvars is False or self.isnested.get() is True):
                logger.error("nested execute call")
                raise RuntimeError("nested execute call: %r %r\nlast trace:\n%s\ncurrent trace:\n%s" % (chunk_executor_thread, self.local.executing, self.local.last_trace, trace))
            else:
                self.local.last_trace = trace
                self.local.executing = True
                if has_contextvars:
                    self.isnested.set(True)
            if not tasks:
                break
            tasks = _merge(tasks)
            run = Run(tasks)
            self.passes += 1
            dataset = run.dataset
            run.variables = {}
            for df in run.tasks_per_df.keys():
                run.variables[df] = {key: df.evaluate_variable(key) for key in df.variables.keys()}
            # (re)throw exceptions as soon as possible to avoid complicated stack traces
            for task in tasks:
                if task.isRejected:
                    task.get()
                if hasattr(task, "check"):
                    try:
                        task.check()
                    except Exception as e:
                        task.reject(e)
                        raise
            for task in run.tasks:
                task.signal_start.emit(self)
            for task in run.tasks:
                task._results = []
                if not any(task.signal_progress.emit(0)):
                    logger.debug("task cancelled immediately")
                    task.cancelled = True
            row_count = dataset.row_count
            chunk_size = self.chunk_size_for(row_count)
            encoding = vaex.encoding.Encoding()
            run.nthreads = nthreads = self.thread_pool.nthreads
            task_checkers = vaex.tasks.create_checkers()
            memory_tracker = vaex.memory.create_tracker()
            vaex.memory.local.agg = memory_tracker
            memory_usage = 0  # we track this for consistency
            for task in tasks:
                for task_checker in task_checkers:
                    task_checker.add_task(task)
                spec = encoding.encode('task', task)
                spec['task-part-cpu-type'] = spec.pop('task-type')

                def create_task_part():
                    nonlocal memory_usage
                    task_part = encoding.decode('task-part-cpu', spec, df=task.df, nthreads=nthreads)
                    memory_usage += task_part.memory_usage()
                    for task_checker in task_checkers:
                        task_checker.add_task(task)
                    if task.requires_fingerprint:
                        task_part.fingerprint = task.fingerprint()
                    return task_part

                # We want at least 1 task part (otherwise we cannot do any work),
                # then we ask the task part how often we should split.
                # This means that we can have 100 threads, but only 2 task parts.
                # In this case, evaluation of expressions is still multithreaded,
                # but aggregation is reduced to effectively 2 threads.
                task_part_0 = create_task_part()
                ideal_task_splits = task_part_0.ideal_splits(self.thread_pool.nthreads)
                assert ideal_task_splits <= self.thread_pool.nthreads, f'Cannot have more splits {ideal_task_splits} than threads {self.thread_pool.nthreads}'
                if ideal_task_splits == self.thread_pool.nthreads or task.see_all:
                    # in the simple case, we just use a list
                    task._parts = [task_part_0] + [create_task_part() for i in range(1, ideal_task_splits)]
                else:
                    # otherwise a queue
                    task._parts = queue.Queue()
                    task._parts.put(task_part_0)
                    for i in range(1, ideal_task_splits):
                        task._parts.put(create_task_part())
            if memory_usage != memory_tracker.used:
                raise RuntimeError(f"Reported memory usage by tasks was {memory_usage}, while tracker listed {memory_tracker.used}")
            vaex.memory.local.agg = None
            # TODO: in the future we might want to enable the zigzagging again, but this requires all datasets to implement it
            # if self.zigzag:
            #     self.zig = not self.zig

            def progress(p):
                # no global cancel and at least 1 task wants to continue, then we continue
                ok_tasks = any([task.progress(p) for task in tasks])
                all_stopped = all([task.stopped for task in tasks])
                ok_executor = all(self.signal_progress.emit(p))
                if all_stopped:
                    logger.debug("Pass cancelled because all tasks are stopped: %r", tasks)
                if not ok_tasks:
                    logger.debug("Pass cancelled because all tasks cancelled: %r", tasks)
                if not ok_executor:
                    logger.debug("Pass cancelled because of the global progress event: %r", self.signal_progress.callbacks)
                return ok_tasks and ok_executor and not all_stopped

            yield from self.thread_pool.map(
                self.process_part, dataset.chunk_iterator(run.dataset_deps, chunk_size), dataset.row_count,
                progress=progress, cancel=lambda: self._cancel(run), unpack=True, run=run, use_async=use_async)
            duration_wallclock = time.time() - t0
            logger.debug("executing took %r seconds", duration_wallclock)
            self.local.executing = False
            if has_contextvars:
                self.isnested.set(False)
            if True:  # kept to keep the diff small
                for task in tasks:
                    if not task.cancelled:
                        logger.debug("fulfill task: %r", task)
                        parts = task._parts
                        if not isinstance(parts, list):
                            parts_queue = parts
                            parts = []
                            while not parts_queue.empty():
                                parts.append(parts_queue.get())
                        parts[0].reduce(parts[1:])
                        logger.debug("wait for task: %r", task)
                        task._result = parts[0].get_result()
                        task.end()
                        task.fulfill(task._result)
                        logger.debug("got result for: %r", task)
                        if task._result is not None and task.cacheable:  # we don't want to store None
                            if vaex.cache.is_on():
                                # we only want to store the original task results into the cache
                                tasks_cachable = task.original_tasks if isinstance(task, vaex.tasks.TaskAggregations) else [task]
                                for task_cachable in tasks_cachable:
                                    key = task_cachable.fingerprint()
                                    previous_result = vaex.cache.get(key, type='task')
                                    if previous_result is not None:
                                        try:
                                            if previous_result != task_cachable.get():
                                                # this can happen with multithreading, where two threads enter the same tasks in parallel (IF using different executors)
                                                logger.warning("calculated new result: %r, while cache had value: %r", previous_result, task_cachable.get())
                                        except ValueError:  # when comparing numpy results
                                            if not np.array_equal(previous_result, task_cachable.get(), equal_nan=True):
                                                logger.warning("calculated new result: %r, while cache had value: %r", previous_result, task_cachable.get())
                                    vaex.cache.set(key, task_cachable.get(), type='task', duration_wallclock=duration_wallclock)
                                    logger.info("added result: %r in cache under key: %r", task_cachable.get(), key)
                    else:
                        logger.debug("rejecting task: %r", task)
                        # we now reject, in the main thread
                        if task._toreject:
                            task.reject(task._toreject)
                        else:
                            task.reject(UserAbort("Task was cancelled"))
                        # remove references
                        task._result = None
                        task._results = None
        self.signal_end.emit()
    except:  # noqa
        self.signal_cancel.emit()
        raise
    finally:
        self.local.executing = False
        if has_contextvars:
            self.isnested.set(False)
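
# execute_generator yields as chunks are processed; a synchronous execute()
# can therefore be built by simply draining the generator. A minimal sketch of
# how a caller could drive it (executor stands in for an ExecutorLocal-like
# instance; this is an assumption, not a documented API contract):
def execute(executor):
    for _ in executor.execute_generator():
        pass  # each yielded element comes from processing one chunk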