from multiprocessing import Pool, RLock
from multiprocessing.sharedctypes import RawArray, RawValue, synchronized


def Value(typecode_or_type, *args, lock=None):
    '''
    Return a synchronization wrapper for a Value
    '''
    obj = RawValue(typecode_or_type, *args)
    if lock is False:
        return obj
    if lock in (True, None):
        lock = RLock()
    if not hasattr(lock, 'acquire'):
        raise AttributeError("'%r' has no method 'acquire'" % lock)
    return synchronized(obj, lock)
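# A minimal usage sketch (illustrative, not part of the module): four worker
# processes increment one shared counter. The default RLock wrapper makes the
# read-modify-write atomic; the helper name `_bump` is an assumption.
from multiprocessing import Process

def _bump(counter, n):
    for _ in range(n):
        counter.acquire()       # acquire/release are re-exported by the wrapper
        try:
            counter.value += 1  # `value` is forwarded to the underlying RawValue
        finally:
            counter.release()

if __name__ == '__main__':
    counter = Value('i', 0)  # shared 32-bit signed int, guarded by an RLock
    workers = [Process(target=_bump, args=(counter, 1000)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    assert counter.value == 4000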
def map(self, fn, lazy=True, batched=False, num_workers=0):
    """
    Performs a specific function on the dataset to transform and update
    every sample.

    Args:
        fn (callable): Transformations to be performed. It receives a single
            sample as argument if `batched` is False. Otherwise it receives
            all examples.
        lazy (bool, optional): If True, transformations would be delayed and
            performed on demand. Otherwise, transforms all samples at once.
            Note that if `fn` is stochastic, `lazy` should be True or you
            will get the same result on all epochs. Default: True.
        batched (bool, optional): If True, transformations would take all
            examples as input and return a collection of transformed
            examples. Note that if set to True, `lazy` option would be
            ignored. Default: False.
        num_workers (int, optional): Number of processes for multiprocessing.
            If set to 0, it doesn't use multiprocessing. Note that if set to
            a value greater than 1, `lazy` option would be ignored.
            Default: 0.
    """
    assert num_workers >= 0, "num_workers should be a non-negative value"
    if num_workers > 1:
        # A locally defined closure cannot be pickled by Pool.apply_async,
        # so shard the dataset first and dispatch the unbound `_map` with
        # each shard bound as `self`, mirroring `filter` below.
        shards = [
            self._shard(num_shards=num_workers, index=index, contiguous=True)
            for index in range(num_workers)
        ]
        kwds_per_shard = [
            dict(self=shards[rank], fn=fn, lazy=False, batched=batched)
            for rank in range(num_workers)
        ]
        pool = Pool(num_workers, initargs=(RLock(), ))
        results = [
            pool.apply_async(self.__class__._map, kwds=kwds)
            for kwds in kwds_per_shard
        ]
        transformed_shards = [r.get() for r in results]
        pool.close()
        pool.join()
        # Stitch the transformed shards back together, preserving order.
        self.new_data = []
        for i in range(num_workers):
            self.new_data += transformed_shards[i].new_data
        return self
    else:
        return self._map(fn, lazy=lazy, batched=batched)
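# Hypothetical usage sketch: `map`/`filter` here follow the MapDataset API in
# paddlenlp.datasets; the dataset name and the 'text' field are illustrative
# assumptions.
def _lowercase(example):
    example['text'] = example['text'].lower()
    return example

if __name__ == '__main__':
    from paddlenlp.datasets import load_dataset
    train_ds = load_dataset('chnsenticorp', splits='train')
    train_ds.map(_lowercase, lazy=False)     # eager, in-process
    train_ds.map(_lowercase, num_workers=4)  # eager, sharded over 4 processes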
def Array(typecode_or_type, size_or_initializer, **kwds):
    '''
    Return a synchronization wrapper for a RawArray
    '''
    lock = kwds.pop('lock', None)
    if kwds:
        raise ValueError(
            'unrecognized keyword argument(s): %s' % list(kwds.keys()))
    obj = RawArray(typecode_or_type, size_or_initializer)
    if lock is False:
        return obj
    if lock in (True, None):
        lock = RLock()
    if not hasattr(lock, 'acquire'):
        raise AttributeError("'%r' has no method 'acquire'" % lock)
    return synchronized(obj, lock)
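# A minimal usage sketch (illustrative, not part of the module): two worker
# processes fill disjoint halves of a shared double array. Since the writes
# are disjoint, `Array('d', 8, lock=False)` would also be safe here; the
# helper name `_fill` is an assumption.
from multiprocessing import Process

def _fill(arr, start, stop):
    for i in range(start, stop):
        arr[i] = 2.0 * i  # item assignment is forwarded to the RawArray

if __name__ == '__main__':
    arr = Array('d', 8)  # eight shared doubles, zero-initialized
    halves = [
        Process(target=_fill, args=(arr, k * 4, (k + 1) * 4)) for k in range(2)
    ]
    for p in halves:
        p.start()
    for p in halves:
        p.join()
    assert list(arr) == [2.0 * i for i in range(8)]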
def filter(self, fn, num_workers=0):
    """
    Filters samples by the filter function and uses the filtered data to
    update this dataset.

    Args:
        fn (callable): A filter function that takes a sample as input and
            returns a boolean. Samples that return False would be discarded.
        num_workers (int, optional): Number of processes for multiprocessing.
            If set to 0, it doesn't use multiprocessing. Defaults to `0`.
    """
    assert num_workers >= 0, "num_workers should be a non-negative value"
    if num_workers > 1:
        # Filter each contiguous shard in its own worker process, then
        # stitch the surviving samples back together, preserving order.
        shards = [
            self._shard(num_shards=num_workers, index=index, contiguous=True)
            for index in range(num_workers)
        ]
        kwds_per_shard = [
            dict(self=shards[rank], fn=fn) for rank in range(num_workers)
        ]
        pool = Pool(num_workers, initargs=(RLock(), ))
        results = [
            pool.apply_async(self.__class__._filter, kwds=kwds)
            for kwds in kwds_per_shard
        ]
        transformed_shards = [r.get() for r in results]
        pool.close()
        pool.join()
        self.new_data = []
        for i in range(num_workers):
            self.new_data += transformed_shards[i].new_data
        return self
    else:
        return self._filter(fn)
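# Hypothetical usage sketch, continuing the MapDataset assumption above:
# discard examples whose (assumed) 'text' field is empty.
def _non_empty(example):
    return len(example['text']) > 0

if __name__ == '__main__':
    from paddlenlp.datasets import load_dataset
    train_ds = load_dataset('chnsenticorp', splits='train')
    train_ds.filter(_non_empty)                 # single process
    train_ds.filter(_non_empty, num_workers=4)  # sharded over 4 processes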
def __init__(self, obj, lock=None):
    self._obj = obj
    self._lock = lock or RLock()
    self.acquire = self._lock.acquire
    self.release = self._lock.release
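# A minimal sketch (illustrative assumption): because the wrapper stores
# whatever lock it is given, two shared values created with the same RLock
# can be updated together in one critical section.
if __name__ == '__main__':
    shared_lock = RLock()
    x = Value('i', 0, lock=shared_lock)
    y = Value('i', 0, lock=shared_lock)
    x.acquire()          # same as shared_lock.acquire()
    try:
        x.value += 1
        y.value -= 1     # both updates land under the one shared lock
    finally:
        x.release()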