import datetime
import concurrent.futures

# Cache is the class under test; its import path is not shown in this excerpt.


def test_threading():
    cache = Cache({"maxCacheSize": 4})
    # fill cache
    startTime = datetime.datetime.now()
    for i in range(10000):
        values = (str(i), {"bytes": bytearray(500)}, 500)
        cache._cache[values[0]] = {
            'timestamp': startTime + datetime.timedelta(0, 1),
            'size': values[2],
            'value': values[1],
            'header': None
        }
    request = []
    for i in range(1000):
        request.append(('key' + str(i), {"bytes": bytearray(50000)}, 50000))
    res = []

    def process(item):
        cache.update(item[0], item[1])
        return item[0]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for out in executor.map(process, request):
            res.append(out)
    assert len(res) == 1000
class DataServer:
    def __init__(self, config):
        self._cache = Cache(config)
        self._host = config['host']
        self._port = config['port']
        self._encodingType = config['encoding']
        self._encoding = Encoding(self._encodingType)
        self._adapter = ZMQServers(self._port, self._createReply, config)
        # pre-encoded (header, payload) reply for unknown task ids
        self.notAvailable = self._encoding.encode(
            self._createError('notAvailable', 'taskId notAvailable'))

    def listen(self):
        log.info('discovery serving on {host}:{port} with {encoding} encoding',
                 host=self._host, port=self._port, encoding=self._encodingType)
        self._adapter.listen()

    @timing
    def _createReply(self, message):
        try:
            decoded = self._encoding.decode(value=message, plainEncode=True)
            tasks = decoded.get('tasks')
            resultsAsTuple = self.getDataByTaskId(tasks)
        except Exception as e:
            result = self._createError('unknown', str(e))
            header, encoded = self._encoding.encode(result)
            return [header, encoded]
        # flatten the (header, content) tuples into a single multipart reply
        parts = []
        for header, content in resultsAsTuple:
            parts.append(header)
            parts.append(content)
        return parts

    def getDataByTaskId(self, tasks):
        results = []
        for task in tasks:
            if (task not in self._cache):
                result = self.notAvailable
            else:
                result = self._cache.getWithHeader(task)
            results.append(result)
        return results

    def setSendingState(self, taskId, header, encoded, size):
        return self._cache.update(taskId, encoded, size=size, header=header)

    def _createError(self, code, message):
        return {'hkube_error': {'code': code, 'message': message}}

    def isLocal(self, host, port):
        return host == self._host and port == self._port

    def isServing(self):
        return self._adapter.isServing()

    def shutDown(self):
        self._adapter.close()
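
# Usage sketch (illustrative, not part of the source): bringing up a DataServer.
# The config keys below are inferred from __init__ above; the concrete values,
# and any extra keys ZMQServers or Cache may require, are assumptions.
def dataserver_usage_sketch():
    serverConfig = {
        'host': '127.0.0.1',    # assumed bind address
        'port': 9020,           # assumed port
        'encoding': 'msgpack',  # assumed name accepted by Encoding
        'maxCacheSize': 4,      # consumed by the embedded Cache (see tests)
    }
    server = DataServer(serverConfig)
    server.listen()             # serves _createReply via the ZMQ adapter
    # server.shutDown()         # closes the underlying adapter when done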
def test_too_large_message():
    cache = Cache({"maxCacheSize": 5})
    value1 = {"bytes": bytearray(1000000)}
    value2 = {"bytes": bytearray(5000000)}
    result = cache.update("task1", value1, 1000000)
    assert result is True
    assert len(cache._cache) == 1
    result = cache.update("task2", value2, 6000000)
    assert result is False
    assert len(cache._cache) == 1
def test_reaching_limit():
    cache = Cache({"maxCacheSize": 4})
    value1 = {"bytes": bytearray(1000000)}
    cache.update("task1", value1, 1000000)
    assert len(cache._cache) == 1
    cache.update("task2", value1, 1000000)
    cache.update("task3", value1, 1000000)
    assert len(cache._cache) == 3
    # inserting past the limit evicts the oldest entry (task1) to make room
    cache.update("task4", value1, 1500000)
    assert len(cache._cache) == 3
    assert "task4" in cache
    assert "task1" not in cache
def test_get_all():
    cache = Cache({"maxCacheSize": 5})
    value1 = {"data": "1"}
    value2 = {"data": "2"}
    result = cache.update("task1", value1)
    assert result is True
    assert len(cache._cache) == 1
    cache.update("task2", value2)
    assert len(cache._cache) == 2
    tasksNotInCache, valuesInCache = cache.getAll(
        ['task1', 'task2', 'task3', 'task4'])
    assert len(tasksNotInCache) == 2
    assert "task3" in set(tasksNotInCache)
    assert "task4" in set(tasksNotInCache)
    assert len(valuesInCache) == 2
    data = list(map(lambda value: value.get('data'), valuesInCache))
    assert "1" in set(data)
    assert "2" in set(data)
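
# Summary sketch of the Cache contract exercised by the tests above (not the
# implementation): update() returns True on success and False when a single
# value is too large for the cache; a full cache evicts its oldest entries;
# getAll() splits the requested ids into misses and cached values. The size
# argument is optional (test_get_all omits it).
def cache_contract_sketch():
    cache = Cache({"maxCacheSize": 4})
    assert cache.update("a", {"bytes": bytearray(100)}, 100) is True
    assert "a" in cache                        # membership test by task id
    missing, found = cache.getAll(["a", "b"])  # -> (misses, cached values)
    assert set(missing) == {"b"}
    assert len(found) == 1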
class DataAdapter:
    def __init__(self, options, dataServer=None):
        self._dataServer = dataServer
        self._storageCache = Cache(config.storage)
        self._encoding = Encoding(options.storage['encoding'])
        self._storageManager = StorageManager(options.storage)
        self._requestEncoding = options.storage['encoding']
        self._requestTimeout = options.discovery['timeout']
        self._networkTimeout = options.discovery['networkTimeout']
        self._maxWorkers = min(32, (multiprocessing.cpu_count() or 1) + 4)
        log.info('using {workers} workers for DataAdapter', workers=self._maxWorkers)

    def encode(self, value):
        return self._encoding.encode(value)

    def decode(self, header=None, value=None):
        return self._encoding.decode(header=header, value=value)

    @trace()
    def getData(self, options):
        jobId = options.get('jobId')
        inputArgs = options.get('input')
        flatInput = options.get('flatInput')
        storage = options.get('storage')
        if (not flatInput):
            return inputArgs
        for k, v in flatInput.items():
            if self._isStorage(v):
                key = v[2:]
                link = storage.get(key, None)
                if (link is None):
                    raise Exception('unable to find storage key')
                if (typeCheck.isList(link)):
                    data = self.batchRequest(link, jobId)
                else:
                    data = self.tryGetDataFromPeerOrStorage(link)
                setPath(inputArgs, k, data)
        return inputArgs

    def _isStorage(self, value):
        return typeCheck.isString(value) and value.startswith('$$')

    def setAlgorithmStorage(self, jobId, input):
        storage = {}
        mappedInput = []
        for item in input:
            taskId = uid(8)
            (header, data) = self.encode(item)
            storageInfo = self.setData({'jobId': jobId, 'taskId': taskId, 'header': header, 'data': data})
            storage[taskId] = {'storageInfo': storageInfo}
            mappedInput.append('$${taskId}'.format(taskId=taskId))
        return (storage, mappedInput)

    @trace()
    def setData(self, options):
        jobId = options.get('jobId')
        taskId = options.get('taskId')
        header = options.get('header')
        data = options.get('data')
        result = self._storageManager.hkube.put(jobId, taskId, header=header, value=data)
        return result

    @timing
    def batchRequest(self, options, jobId):
        batchResponse = []
        for d in options:
            d.update({"jobId": jobId})
        with concurrent.futures.ThreadPoolExecutor(max_workers=self._maxWorkers) as executor:
            for out in executor.map(self._batchRequest, options):
                batchResponse += out
        return batchResponse

    def _batchRequest(self, options):
        batchResponse = []
        jobId = options.get('jobId')
        tasks = options.get('tasks')
        dataPath = options.get('path')
        storageInfo = options.get('storageInfo')
        if (storageInfo):
            storageResult = self._getFromCacheOrStorage(storageInfo, dataPath, storageInfo.get("path"))
            batchResponse.append(storageResult)
            return batchResponse
        tasksNotInCache, batchResponse = self._storageCache.getAll(tasks)
        if (tasksNotInCache):
            options['tasks'] = tasksNotInCache
            results = self._getFromPeer(options)
            for i, item in enumerate(results):
                size, content = item
                peerError = self._getPeerError(content)
                taskId = tasksNotInCache[i]
                if (peerError):
                    # the peer could not serve this task; fall back to storage
                    storageData = self._getDataForTask(jobId, taskId, dataPath)
                    batchResponse.append(storageData)
                else:
                    self._storageCache.update(taskId, content, size)
                    content = self._getPath(content, dataPath)
                    batchResponse.append(content)
        return batchResponse

    def _getDataForTask(self, jobId, taskId, dataPath):
        path = self._storageManager.hkube.createPath(jobId, taskId)
        return self._getFromCacheOrStorage({'path': path}, dataPath, taskId)

    def tryGetDataFromPeerOrStorage(self, options):
        dataPath = options.get('path')
        storageInfo = options.get('storageInfo')
        discovery = options.get('discovery')
        data = None
        hasResponse = False
        if (discovery):
            cacheId = options.get('taskId')
        else:
            cacheId = storageInfo.get('path')
        data = self._getFromCache(cacheId, dataPath)
        if not (data):
            if (discovery):
                size, data = self._getFromPeer(options)[0]
                peerError = self._getPeerError(data)
                hasResponse = not peerError
                data = None if peerError else data
                if (hasResponse):
                    self._setToCache(cacheId, data, size)
                    data = self._getPath(data, dataPath)
            if (not hasResponse and storageInfo):
                data = self._getFromCacheOrStorage(storageInfo, dataPath, cacheId)
        return data

    @trace(name='getFromPeer')
    @timing
    def _getFromPeer(self, options):
        taskId = options.get('taskId')
        tasks = [taskId] if taskId else options.get('tasks')
        discovery = options.get('discovery')
        port = discovery.get('port')
        host = discovery.get('host')
        if (self._dataServer and self._dataServer.isLocal(host, port)):
            # short-circuit: the data lives in this process, skip the network
            dataList = self._dataServer.getDataByTaskId(tasks)
            responses = []
            for header, payload in dataList:
                responses.append((len(payload), self.decode(header=header, value=payload)))
        else:
            request = {
                'address': {
                    'port': port,
                    'host': host
                },
                'tasks': tasks,
                'encoding': self._requestEncoding,
                'timeout': self._requestTimeout,
                'networkTimeout': self._networkTimeout
            }
            dataRequest = DataRequest(request)
            responses = dataRequest.invoke()
        return responses

    def _getPeerError(self, options):
        error = None
        if (typeCheck.isDict(options)):
            error = options.get('hkube_error')
        return error

    def _getFromCacheOrStorage(self, options, dataPath, cacheId):
        data = self._getFromCache(cacheId, dataPath)
        if (data is None):
            size, data = self._getFromStorage(options)
            self._setToCache(cacheId, data, size)
            data = self._getPath(data, dataPath)
        return data

    @trace(name='getFromCache')
    @timing
    def _getFromCache(self, cacheId, dataPath):
        data = self._storageCache.get(cacheId)
        data = self._getPath(data, dataPath)
        return data

    def _setToCache(self, cacheId, data, size):
        self._storageCache.update(cacheId, data, size)

    @trace(name='getFromStorage')
    @timing
    def _getFromStorage(self, options):
        (header, payload) = self._storageManager.storage.get(options)
        decoded = self.decode(header=header, value=payload)
        size = len(payload)
        return (size, decoded)

    def createStorageInfo(self, options):
        jobId = options.get('jobId')
        taskId = options.get('taskId')
        encodedData = options.get('encodedData')
        path = self._storageManager.hkube.createPath(jobId, taskId)
        metadata = self.createMetadata(options)
        storageInfo = {
            'storageInfo': {
                'path': path,
                'size': len(encodedData) if encodedData else 0
            },
            'metadata': metadata
        }
        return storageInfo

    def createMetadata(self, options):
        nodeName = options.get('nodeName')
        data = options.get('data')
        savePaths = options.get('savePaths', [])
        metadata = dict()
        objData = dict()
        objData[nodeName] = data
        for path in savePaths:
            try:
                value = getPath(objData, path)
                if (value != 'DEFAULT'):
                    meta = self._getMetadata(value)
                    metadata[path] = meta
            except Exception:
                pass
        return metadata

    def _getMetadata(self, value):
        if (typeCheck.isDict(value)):
            meta = {'type': 'object'}
        elif (typeCheck.isList(value)):
            meta = {'type': 'array', 'size': len(value)}
        else:
            meta = {'type': str(type(value).__name__)}
        return meta

    def _getPath(self, data, dataPath):
        if (data and dataPath):
            newData = getPath(data, dataPath)
            if (newData == 'DEFAULT'):
                newData = None
        else:
            newData = data
        return newData
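
# Sketch of the '$$' storage-reference convention that getData() resolves
# (illustrative values only; the key names are taken from the code above, the
# concrete values and the flatInput key format are assumptions). flatInput
# values starting with '$$' index into the 'storage' map; each resolved value
# is written back into 'input' via setPath.
exampleOptions = {
    'jobId': 'job-1',
    'input': [None],
    'flatInput': {'0': '$$3f2a1c9d'},  # placeholder pointing at input[0]
    'storage': {
        '3f2a1c9d': {                  # key after stripping the '$$' prefix
            'storageInfo': {'path': 'hkube/job-1/some-task'},
            'path': None,              # optional sub-path into the payload
        },
    },
}
# adapter.getData(exampleOptions) would fetch the payload (from a peer when
# 'discovery' is present, otherwise from storage) and place it at input[0].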