def test_find_artifact(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    payload = "SHRIM"
    artifact.item = Item(payload=payload)
    backend.save_artifact(artifact)

    loaded_artifact = backend._find_cached_artifact(artifact)

    # Ensure that we found the artifact
    self.assertNotEqual(None, loaded_artifact)

    # Ensure that the cached artifact has an item, but no payload loaded
    self.assertNotEqual(None, loaded_artifact.item)
    self.assertEqual(None, loaded_artifact.item.payload)

    # Ensure that meta properties are correctly set on the artifact
    self.assertEqual(loaded_artifact._specific_hash, artifact._specific_hash)
    self.assertEqual(loaded_artifact._dependency_hash, artifact._dependency_hash)
    self.assertEqual(loaded_artifact._definition_hash, artifact._definition_hash)
    self.assertEqual(loaded_artifact._pipeline_stage, artifact._pipeline_stage)
    self.assertEqual(loaded_artifact.item.type, artifact.item.type)

def test_pipeline_stage_status(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    payload = "SHRIM"
    artifact.item = Item(payload=payload)

    status = backend.pipeline_stage_run_status(self.stage_config,
                                               artifact._dependency_hash)
    self.assertEqual(status, STAGE_DOES_NOT_EXIST)

    backend.save_artifact(artifact)
    status = backend.pipeline_stage_run_status(self.stage_config,
                                               artifact._dependency_hash)
    self.assertEqual(status, STAGE_IN_PROGRESS)

    backend.log_pipeline_stage_run_complete(self.stage_config,
                                            artifact._dependency_hash)
    status = backend.pipeline_stage_run_status(self.stage_config,
                                               artifact._dependency_hash)
    self.assertEqual(status, STAGE_COMPLETE)

    meta = backend._get_pipeline_stage_run_meta(self.stage_config,
                                                artifact._dependency_hash)
    self.assertEqual(len(meta['artifacts']), 1)

def test_metadata_from_dict(self):
    stage_a = PipelineStageConfig('some_name', {
        "A": 1,
        "B": 2,
        "type": "ExecutorPipelineStage"
    })
    art_a = Artifact(stage_a)
    d = {
        "antecedents": {},
        "creation_time": 124566722.3,
        "definition_hash": "dac9630aec642a428cd73f4be0a03569",
        "specific_hash": "bc1687bbb3b97214d46b7c30ab307cc1",
        "dependency_hash": "ecad5fc98abf66565e009155f5e57dda",
        "pipeline_stage": "some_stage",
        "item": {
            "meta": {"loss": 0.2},
            "tags": ["my_pipeline_run"],
            "type": "my_item_type"
        }
    }
    art_a.meta_from_dict(d)
    for prop in d:
        if prop == "item":
            for iprop in d['item']:
                value = getattr(art_a.item, iprop)
                self.assertEqual(d['item'][iprop], value)
        else:
            value = getattr(art_a, "_" + prop)
            self.assertEqual(d[prop], value)

def find_pipeline_stage_run_artifacts(self, stage_config, dependency_hash):
    """
    Finds all artifacts for a given pipeline run, loading their metadata.
    """
    # Look up the pipeline stage run metadata
    stage_run_key = {
        'stage_config_hash': stage_config.hash(),
        'dependency_hash': dependency_hash,
    }
    response = self._stage_run_table.get_item(Key=stage_run_key)
    if 'Item' not in response:
        return None
    else:
        res = []
        meta = json.loads(response['Item']['metadata'])
        for obj in meta['artifacts']:
            art = Artifact(stage_config)
            art.item.type = obj['type']
            art._specific_hash = obj['specific_hash']
            art._dependency_hash = dependency_hash
            art._definition_hash = stage_config.hash()
            res.append(self._find_cached_artifact(art))
        return res

def test_load_artifact(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    payload = "SHRIM"
    artifact.item = Item(payload=payload)
    backend.save_artifact(artifact)

    loaded_artifact = backend.load_artifact(artifact)
    self.assertEqual(loaded_artifact.item.payload, artifact.item.payload)

def test_generate_metadata(self):
    stage_a = PipelineStageConfig('some_name', {
        "A": 1,
        "B": 2,
        "type": "ExecutorPipelineStage"
    })
    art_a = Artifact(stage_a)
    d = art_a.meta_to_dict()
    for m in art_a._meta_properties:
        if m not in d:
            self.fail()

def test_stage_definition_hash_uniqueness(self):
    stage_a = PipelineStageConfig('some_name', {
        "foo": "bar",
        "type": "ExecutorPipelineStage"
    })
    stage_b = PipelineStageConfig('some_name', {
        "foo": "quux",
        "type": "ExecutorPipelineStage"
    })
    art_a = Artifact(stage_a)
    art_b = Artifact(stage_b)
    self.assertNotEqual(art_a._definition_hash, art_b._definition_hash)

def test_metadata_from_bad_dict(self):
    stage_a = PipelineStageConfig('some_name', {
        "A": 1,
        "B": 2,
        "type": "ExecutorPipelineStage"
    })
    art_a = Artifact(stage_a)
    try:
        art_a.meta_from_dict({})
        self.fail()
    except InvalidArtifactMetadataError:
        pass

def test_stage_definition_hash_idempotence(self):
    stage_a = PipelineStageConfig('some_name', {
        "A": 1,
        "B": 2,
        "type": "ExecutorPipelineStage"
    })
    stage_b = PipelineStageConfig('some_name', {
        "B": 2,
        "A": 1,
        "type": "ExecutorPipelineStage"
    })
    art_a = Artifact(stage_a)
    art_b = Artifact(stage_b)
    self.assertEqual(art_a._definition_hash, art_b._definition_hash)

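# The idempotence test above relies on the stage definition hash being
# independent of key ordering. A minimal sketch of one way to get that
# property, assuming canonical JSON serialization; canonical_definition_hash
# is hypothetical and not part of PipelineStageConfig's actual API.
import hashlib
import json


def canonical_definition_hash(config_dict):
    # sort_keys=True makes {"A": 1, "B": 2} and {"B": 2, "A": 1}
    # serialize to the same string, so their digests match.
    canonical = json.dumps(config_dict, sort_keys=True)
    return hashlib.md5(canonical.encode("utf-8")).hexdigest()
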
def test_pipeline_stage_run_meta(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    payload = "SHRIM"
    artifact.item = Item(payload=payload)
    backend.save_artifact(artifact)
    backend.log_pipeline_stage_run_complete(self.stage_config,
                                            artifact._dependency_hash)

    arts = backend.find_pipeline_stage_run_artifacts(
        self.stage_config, artifact._dependency_hash)
    self.assertEqual(len(arts), 1)
    self.assertEqual(arts[0].get_uid(), artifact.get_uid())

def _yield_artifact(self, artifact_name):
    artifact_path = os.path.join(os.getcwd(), self._root, artifact_name)
    if self.read_content:
        with open(artifact_path, 'rb') as f:
            art = Artifact(self._stage_config)
            art.item.payload = f.read()
            return art
    return artifact_path

def find_pipeline_stage_run_artifacts(self, stage_config, dependency_hash):
    """
    Finds all artifacts for a given pipeline run.
    """
    meta = self._get_pipeline_stage_run_meta(stage_config, dependency_hash)
    if 'artifacts' not in meta:
        return []
    else:
        res = []
        for uid in meta['artifacts']:
            artDict = meta['artifacts'][uid]
            art = Artifact(stage_config)
            art.item.type = artDict['item_type']
            art._specific_hash = artDict['specific_hash']
            res.append(self._find_cached_artifact(art))
        return res

def _yield_artifact(self):
    artifact_path = os.path.join(os.getcwd(), self._path)
    content = ""
    with open(artifact_path, 'r') as f:
        content = f.read()
    art = Artifact(self._stage_config)
    art.item.payload = content
    return art

def test_save_missing_payload(self):
    artifact = Artifact(self.stage_config)
    try:
        self._default_backend.save_artifact(artifact)
        self.fail("save_artifact should raise ArtifactMissingPayloadError "
                  "when the artifact has no payload")
    except ArtifactMissingPayloadError:
        pass

def _sorted_artifacts(self, artifact):
    """
    Returns a sorted list of artifacts, based upon pruning ordering
    """
    item_meta = self._load_item_meta(artifact._pipeline_stage,
                                     artifact.item.type)
    result = []
    for k in item_meta:
        result.append(item_meta[k])
    sorted_metadata = sorted(result, key=lambda x: x["creation_time"])
    sorted_artifacts = []
    for x in sorted_metadata:
        a = Artifact(artifact._config, artifact.item.type)
        a.meta_from_dict(x)
        sorted_artifacts.append(a)
    return sorted_artifacts

def test_save_missing_payload(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    try:
        backend.save_artifact(artifact)
        self.fail("save_artifact should raise ArtifactMissingPayloadError "
                  "when the artifact has no payload")
    except ArtifactMissingPayloadError:
        pass

def test_load_artifact_from_s3(self):
    backend = self._default_backend
    artifact = Artifact(self.stage_config)
    payload = "SHRIM"
    artifact.item = Item(payload=payload)
    backend.save_artifact(artifact)

    # Now we'll delete the local artifact cache, forcing retrieval from S3
    path = backend._localArtifactBackend.path
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            os.rmdir(os.path.join(root, name))
    distutils.dir_util.mkpath(path)

    loaded_artifact = backend.load_artifact(artifact)
    self.assertEqual(loaded_artifact.item.payload.decode('utf-8'), payload)
    self.assertEqual(True, loaded_artifact._loaded_from_s3_cache)
    self.cleanup_test_tables(self._default_backend)

async def _run_job(self, job):
    # Get stage from pipeline
    pf = PipelineStageFactory()
    config = PipelineStageConfig(job['stage_name'], job['stage_config'])
    stage = pf.create_pipeline_stage(config)

    # Load artifact payloads from cache
    loaded_artifacts = []
    for artifact in job['artifacts']:
        art_obj = Artifact(stage._config)
        art_obj.meta_from_dict(artifact)
        print(art_obj._pipeline_stage)
        loaded = self._backend.load_artifact(art_obj)
        if loaded is None:
            raise Exception("Could not find payload for artifact")
        loaded_artifacts.append(loaded)

    # Execute the task
    exec_task = self._executor.create_task(stage, loaded_artifacts)
    result = await exec_task.generate_artifacts()
    return result

async def _run_job(self, job):
    # Get stage from pipeline
    pf = PipelineStageFactory()
    config = PipelineStageConfig(job['stage_name'], job['stage_config'])
    stage = pf.create_pipeline_stage(config)

    # Load input artifact payloads from cache
    loaded_artifacts = []
    for artifact in job['artifacts']:
        art_obj = Artifact(stage._config)
        art_obj.meta_from_dict(artifact)
        loaded = self._backend.load_artifact(art_obj)
        if loaded is None:
            self._log("Could not find payload for artifact")
            raise Exception("Could not find payload for artifact")
        loaded_artifacts.append(loaded)

    # Execute the task
    exec_task = self._executor.create_task(stage, loaded_artifacts)
    result = await exec_task.generate_artifacts()
    for art in result:
        art._creation_time = float(time.time())
        art._dependency_hash = Artifact.dependency_hash(loaded_artifacts)
        self._backend.save_artifact(art)
    self._backend.log_pipeline_stage_run_complete(
        config, Artifact.dependency_hash(loaded_artifacts))
    return result

def test_create_tasks(self):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    executor = RemoteSQSExecutor(
        aws_profile="testing",
        task_queue_name=self.test_queue_task_name,
        result_queue_name=self.test_queue_result_name,
        loop=loop)
    pf = PipelineStageFactory()
    stage = pf.create_pipeline_stage(self.stage_config)
    input_artifacts = [Artifact(self.stage_config)]
    executor.create_task(stage, input_artifacts)

    async def process_loop(executor, stage, input_artifacts):
        exit_loop = False
        while not exit_loop:
            await asyncio.sleep(2.0)
            for message in executor._task_queue.receive_messages(
                    MessageAttributeNames=[
                        'stage_config_hash', 'dependency_hash'
                    ]):
                print("Retrieved message")
                print(message.body)
                print(message.message_attributes)
                if message.message_attributes is None:
                    self.fail("Message attributes absent")
                m_config_hash = message.message_attributes.\
                    get('stage_config_hash').get('StringValue')
                m_dependency_hash = message.message_attributes.\
                    get('dependency_hash').get('StringValue')
                config_hash = stage._config.hash()
                dependency_hash = Artifact.dependency_hash(input_artifacts)
                self.assertEqual(config_hash, m_config_hash)
                self.assertEqual(dependency_hash, m_dependency_hash)
                message.delete()
                exit_loop = True
        for task in asyncio.Task.all_tasks():
            task.cancel()
        raise CancelledError

    try:
        loop.run_until_complete(
            asyncio.wait([
                executor._process_queue(),
                process_loop(executor, stage, input_artifacts)
            ]))
    except CancelledError:
        print('CancelledError raised: closing event loop.')

async def _process_queue(self):
    while True:
        task = await self._queue.get()
        self._log('Acquired Task: %s with %d inputs' %
                  (task._stage._config.name, len(task._input_artifacts)))

        # Push task to queue
        config_hash = task._stage._config.hash()
        dependency_hash = Artifact.dependency_hash(task._input_artifacts)
        self._queue_push(task, config_hash, dependency_hash)

        # Wait until task is complete
        message = await self._await_result(config_hash, dependency_hash)
        result = message.body
        message.delete()
        self._complete_task(task, config_hash, dependency_hash)

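# The loop above hands each task to _queue_push along with its config hash and
# dependency hash, which the test in test_create_tasks reads back as SQS message
# attributes. A minimal sketch of what such a push could look like with boto3's
# SQS message attributes; this is an assumption about _queue_push, not the
# library's actual implementation, and the message body format is hypothetical.
import json


def _queue_push_sketch(task_queue, task, config_hash, dependency_hash):
    # task_queue is assumed to be a boto3 SQS Queue resource
    task_queue.send_message(
        MessageBody=json.dumps({"stage_name": task._stage._config.name}),
        MessageAttributes={
            "stage_config_hash": {"StringValue": config_hash,
                                  "DataType": "String"},
            "dependency_hash": {"StringValue": dependency_hash,
                                "DataType": "String"},
        })
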
def _get_cached_artifacts(self, stage_name, input_artifacts, backend):
    """
    Attempts to retrieve cached artifacts for the stage run, identified
    uniquely by its definition and the hash of its input artifacts.
    """
    stage = self._stages[stage_name]
    dependency_hash = Artifact.dependency_hash(input_artifacts)
    status = backend.pipeline_stage_run_status(stage, dependency_hash)
    if status == STAGE_COMPLETE or status == STAGE_IN_PROGRESS:
        cached_arts = backend.find_pipeline_stage_run_artifacts(
            stage._config, dependency_hash)
        self._log("Loaded %d cached artifacts for stage %s" %
                  (len(cached_arts), stage_name))
        loaded_arts = []
        for art in cached_arts:
            loaded = backend.load_artifact(art)
            loaded._loaded_from_cache = True
            loaded_arts.append(loaded)
        return loaded_arts
    else:
        return None

async def _run_stage(self, stage_name, input_artifacts, executor, backend):
    """
    Run a stage once we've acquired the input artifacts
    """
    # Check if the stage has already been run with the given
    # input artifacts and pipeline definition. If so,
    # return the cached run.
    cached_arts = self._get_cached_artifacts(stage_name,
                                             input_artifacts,
                                             backend)
    if cached_arts is not None:
        self._log("Found %d cached artifacts for stage %s" %
                  (len(cached_arts), stage_name))
        return cached_arts

    # We need to generate fresh artifacts.
    # We'll feed the input artifacts to the executor,
    # returning generated artifacts
    stage = self._stages[stage_name]
    result = []
    dependency_hash = Artifact.dependency_hash(input_artifacts)
    task = executor.create_task(self._stages[stage_name], input_artifacts)
    artifacts = await task.generate_artifacts()
    for art in artifacts:
        if hasattr(art, "_remotely_produced"):
            self._log("Remotely produced artifact for %s" % stage_name)
            result.append(art)
        else:
            self._log("Yielding fresh artifact for stage %s" % stage_name)
            self._log("\tPayload: %s " % str(art.item.payload)[0:50])
            art = self._ensure_artifact_meta(art, dependency_hash)
            backend.save_artifact(art)
            result.append(art)
    self._log("Done generating stage %s" % stage_name)
    backend.log_pipeline_stage_run_complete(stage, dependency_hash)
    return result

def test_save_artifact(self):
    backend = LocalArtifactBackend(config={"path": "./test_storage/"})
    artifact = Artifact(self.stage_config)
    artifact.item = Item(payload="SHRIM")
    backend.save_artifact(artifact)

def yield_artifacts(self, input_artifacts):
    for artifact in input_artifacts:
        new_artifact = Artifact(self._config, artifact.item)
        new_artifact._specific_hash = artifact._specific_hash
        yield new_artifact

def test_executor_server_integration(self):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    executor = RemoteSQSExecutor(
        aws_profile=self.test_profile,
        task_queue_name=self.test_queue_task_name,
        result_queue_name=self.test_queue_result_name,
        s3_bucket_name=self.test_bucket_name,
        dynamodb_artifact_table_name=self.test_dynamodb_artifact_table_name,
        dynamodb_stage_run_table_name=self.test_dynamodb_stage_run_name,
        loop=loop)
    server = RemoteSQSServer(
        aws_profile=self.test_profile,
        aws_region=self.test_region,
        s3_bucket_name=self.test_bucket_name,
        task_queue_name=self.test_queue_task_name,
        result_queue_name=self.test_queue_result_name,
        dynamodb_artifact_table_name=self.test_dynamodb_artifact_table_name,
        dynamodb_stage_run_table_name=self.test_dynamodb_stage_run_name,
        loop=loop)

    # Create task. Its input will be itself because that's just great.
    pf = PipelineStageFactory()
    stage = pf.create_pipeline_stage(self.stage_config)
    input_artifacts = []
    for art in stage.yield_artifacts():
        input_artifacts.append(art)
    executor.create_task(stage, input_artifacts)

    # Save input artifacts so they're available for the remote server
    executor._backend.save_artifact(input_artifacts[0])

    # Run our local RemoteExecutor and the remote RemoteSQSServer
    # for 10 seconds.
    async def timeout():
        await asyncio.sleep(10.0)
        for task in asyncio.Task.all_tasks():
            task.cancel()
        raise CancelledError

    try:
        loop.run_until_complete(
            asyncio.wait([
                executor._process_queue(),
                server._process_tasks(),
                server._executor_server._listen_to_queue(),
                timeout()
            ]))
    except CancelledError:
        print('CancelledError raised: closing event loop.')

    # Load our remotely generated artifact(s) and ensure they
    # have the correct payload.
    arts = executor._backend.find_pipeline_stage_run_artifacts(
        self.stage_config, Artifact.dependency_hash(input_artifacts))
    loaded = []
    for art in arts:
        loaded.append(executor._backend.load_artifact(art))
    self.assertEqual(1, len(loaded))
    self.assertEqual(loaded[0].item.payload['param_a'],
                     "string parameter value")

def _yield_artifact(self):
    art = Artifact(self._stage_config)
    art.item.payload = self._parameters
    return art

def test_save_artifact(self):
    s3_backend = self._default_backend
    artifact = Artifact(self.stage_config)
    artifact.item.payload = "foobs"
    s3_backend.save_artifact(artifact)
    self.cleanup_test_tables(self._default_backend)