def __init__(
    self,
    artifact_type,
    path,
    accesskey_id=None,
    accesskey_secret=None,
    bucket=None,
    key=None,
    endpoint="",
    is_global=False,
):
    """Populate the artifact's attributes and, optionally, its secret.

    Args:
        artifact_type: Stored as ``self.type`` and embedded in the
            generated ``self.id``.
        path: Local path of the artifact.
        accesskey_id: Access key id; only used when ``accesskey_secret``
            is also truthy.
        accesskey_secret: Access key secret; only used when
            ``accesskey_id`` is also truthy.
        bucket: Stored unchanged on the instance.
        key: Stored unchanged on the instance.
        endpoint: Stored unchanged on the instance.
        is_global: Stored unchanged on the instance.
    """
    self.type = artifact_type
    self.id = f"output-{self.type}-{utils._get_uuid()}"
    # `path` always refers to the local path.
    self.path = path
    self.is_global = is_global
    self.bucket = bucket
    self.key = key
    self.endpoint = endpoint

    # Without a complete credential pair there is nothing to register.
    if not (accesskey_id and accesskey_secret):
        self.secret = None
        return

    # TODO: check whether this secret already exists.
    self.secret = couler.create_secret(
        {"accessKey": accesskey_id, "secretKey": accesskey_secret}
    )
def __init__(
    self,
    artifact_type,
    path,
    accesskey_id,
    accesskey_secret,
    bucket,
    key=None,
    endpoint="",
    is_global=False,
):
    """Populate the artifact's attributes and create its credentials secret.

    Args:
        artifact_type: Stored as ``self.type`` and embedded in the
            generated ``self.id`` and in the error message.
        path: Local path of the artifact; also used as ``self.key`` when
            ``key`` is None.
        accesskey_id: Access key id; must not be None.
        accesskey_secret: Access key secret; must not be None.
        bucket: Bucket name; must not be None.
        key: Remote key; defaults to ``path`` when None.
        endpoint: Stored unchanged on the instance.
        is_global: Stored unchanged on the instance.

    Raises:
        SyntaxError: If ``accesskey_secret``, ``accesskey_id`` or
            ``bucket`` is None.
    """
    self.type = artifact_type
    self.id = f"output-{self.type}-{utils._get_uuid()}"
    # `path` always refers to the local path.
    self.path = path
    self.is_global = is_global

    # NOTE(review): SyntaxError is an unusual type for argument validation;
    # it is kept because callers may already catch it.
    mandatory = (accesskey_secret, accesskey_id, bucket)
    if any(value is None for value in mandatory):
        raise SyntaxError(
            f"need to input the correct config for {self.type}")

    self.bucket = bucket
    # With no explicit key, the remote key mirrors the local path.
    self.key = path if key is None else key
    self.endpoint = endpoint

    # TODO: check whether this secret already exists.
    self.secret = couler.create_secret(
        {"accessKey": accesskey_id, "secretKey": accesskey_secret}
    )
def test_create_secret(self):
    """Run two containers with different secrets and verify both secret YAMLs."""
    # First job: a secret created from literal data.
    credentials = {"uname": "abc", "passwd": "def"}
    created = couler.create_secret(secret_data=credentials, name="dummy1")
    couler.run_container(
        image="python:3.6", secret=created, command="echo $uname"
    )

    # Second job: a reference to a secret that already exists.
    existing_keys = ["access_key", "access_value"]
    obtained = couler.obtain_secret(
        secret_keys=existing_keys, namespace="test", name="dummy2"
    )
    couler.run_container(
        image="python:3.6", secret=obtained, command="echo $access_value"
    )

    # Both secrets must be registered and render the expected YAML.
    registered = couler.states._secrets
    self.assertEqual(len(registered), 2)
    created_yaml = registered[created].to_yaml()
    obtained_yaml = registered[obtained].to_yaml()

    self.assertEqual(created_yaml["metadata"]["name"], "dummy1")
    self.assertEqual(len(created_yaml["data"]), 2)
    # Each literal value is expected in base64-encoded form.
    for field, plain_value in (("uname", "abc"), ("passwd", "def")):
        self.assertEqual(
            created_yaml["data"][field], utils.encode_base64(plain_value)
        )

    obtained_meta = obtained_yaml["metadata"]
    self.assertEqual(obtained_meta["namespace"], "test")
    self.assertEqual(obtained_meta["name"], "dummy2")
    self.assertEqual(len(obtained_yaml["data"]), 2)
def __init__(
    self,
    path,
    accesskey_id,
    accesskey_secret,
    bucket,
    key=None,
    endpoint="http://oss-cn-hangzhou-zmf.aliyuncs.com",
    is_global=False,
):
    """Populate the OSS artifact's attributes and create its credentials secret.

    Args:
        path: Local path of the artifact; also used as ``self.key`` when
            ``key`` is None.
        accesskey_id: Access key id; must not be None.
        accesskey_secret: Access key secret; must not be None.
        bucket: Bucket name; must not be None.
        key: Remote key; defaults to ``path`` when None.
        endpoint: Stored unchanged on the instance.
        is_global: Stored unchanged on the instance.

    Raises:
        SyntaxError: If ``accesskey_secret``, ``accesskey_id`` or
            ``bucket`` is None.
    """
    self.id = "output-oss-%s" % utils._get_uuid()
    # `path` always refers to the local path.
    self.path = path
    self.type = "OSS"
    self.is_global = is_global

    # NOTE(review): SyntaxError is an unusual type for argument validation;
    # it is kept because callers may already catch it.
    mandatory = (accesskey_secret, accesskey_id, bucket)
    if any(value is None for value in mandatory):
        raise SyntaxError("need to input the correct config for oss")

    self.bucket = bucket
    # With no explicit key, the remote key mirrors the local path.
    self.key = path if key is None else key
    self.endpoint = endpoint

    # Imported at call time, only after validation has passed.
    import couler.argo as couler

    # TODO: check whether this secret already exists.
    self.secret = couler.create_secret(
        {"accessKey": accesskey_id, "secretKey": accesskey_secret}
    )
def __init__(
    self,
    artifact_type,
    path,
    accesskey_id=None,
    accesskey_secret=None,
    bucket=None,
    key=None,
    endpoint="",
    is_global=False,
    insecure=False,
):
    """Populate the artifact's attributes and, optionally, its secret.

    Args:
        artifact_type: Stored as ``self.type`` and embedded in the
            generated ``self.id``.
        path: Local path of the artifact.
        accesskey_id: Access key id; only used when ``accesskey_secret``
            is also truthy.
        accesskey_secret: Access key secret; only used when
            ``accesskey_id`` is also truthy.
        bucket: Stored unchanged on the instance.
        key: Stored unchanged on the instance.
        endpoint: Stored unchanged on the instance.
        is_global: Stored unchanged on the instance.
        insecure: Stored unchanged on the instance.
    """
    self.type = artifact_type
    self.id = f"output-{self.type}-{utils._get_uuid()}"
    # `path` always refers to the local path.
    self.path = path
    self.is_global = is_global
    self.bucket = bucket
    self.key = key
    self.endpoint = endpoint
    self.insecure = insecure

    # Without a complete credential pair there is nothing to register.
    if not (accesskey_id and accesskey_secret):
        self.secret = None
        return

    # The artifact_secret flag causes the secret to be created only when a
    # secret with the same name doesn't exist in the namespace.
    self.secret = couler.create_secret(
        {"accessKey": accesskey_id, "secretKey": accesskey_secret},
        artifact_secret=True,
    )
def test_tensorflow_train(self):
    """Verify the workflow and TFJob manifest generated by ``tf.train``.

    Checks the created secret, the two workflow templates, the resource
    template's conditions, and the Chief, PS, Worker and Evaluator replica
    specs in the embedded TFJob manifest.
    """
    access_key_secret = {"access_key": "key1234"}
    secret = couler.create_secret(secret_data=access_key_secret)
    tf.train(
        num_ps=2,
        num_workers=3,
        num_evaluators=1,
        image="tensorflow:1.13",
        command="python tf.py",
        no_chief=False,
        worker_resources="cpu=0.5,memory=1024",
        ps_restart_policy="Never",
        worker_restart_policy="OnFailure",
        evaluator_resources="cpu=2,memory=4096",
        clean_pod_policy="Running",
        secret=secret,
    )

    # The secret value is expected in base64-encoded form.
    secret_yaml = list(couler.states._secrets.values())[0].to_yaml()
    self.assertEqual(
        secret_yaml["data"]["access_key"], utils.encode_base64("key1234")
    )

    wf = couler.workflow_yaml()
    self.assertEqual(len(wf["spec"]["templates"]), 2)

    # Check steps template
    template0 = wf["spec"]["templates"][0]
    self.assertEqual(len(template0["steps"]), 1)
    self.assertEqual(len(template0["steps"][0]), 1)

    # Check train template
    template1 = wf["spec"]["templates"][1]
    self.assertEqual(template1["name"], "test-tensorflow-train")
    resource = template1["resource"]
    self.assertEqual(resource["action"], "create")
    self.assertEqual(resource["setOwnerReference"], "true")
    self.assertEqual(
        resource["successCondition"],
        "status.replicaStatuses.Worker.succeeded == 3",
    )
    self.assertEqual(
        resource["failureCondition"],
        "status.replicaStatuses.Worker.failed > 0",
    )

    # Check the tfjob spec (the manifest is a YAML string in the template)
    tfjob = yaml.load(
        StringIO(resource["manifest"]), Loader=yaml.FullLoader
    )
    self.assertEqual(tfjob["kind"], "TFJob")
    self.assertEqual(tfjob["spec"]["cleanPodPolicy"], "Running")

    # Chief: one replica whose first env var references the secret.
    chief = tfjob["spec"]["tfReplicaSpecs"]["Chief"]
    self.assertEqual(chief["replicas"], 1)
    chief_container = chief["template"]["spec"]["containers"][0]
    self.assertEqual(chief_container["env"][0]["name"], "access_key")
    self.assertEqual(
        chief_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
        secret_yaml["metadata"]["name"],
    )

    # Parameter servers
    ps = tfjob["spec"]["tfReplicaSpecs"]["PS"]
    self.assertEqual(ps["replicas"], 2)
    self.assertEqual(ps["restartPolicy"], "Never")
    self.assertEqual(len(ps["template"]["spec"]["containers"]), 1)
    ps_container = ps["template"]["spec"]["containers"][0]
    self.assertEqual(ps_container["image"], "tensorflow:1.13")
    self.assertEqual(ps_container["command"], "python tf.py")

    # Workers
    worker = tfjob["spec"]["tfReplicaSpecs"]["Worker"]
    self.assertEqual(worker["replicas"], 3)
    self.assertEqual(worker["restartPolicy"], "OnFailure")
    self.assertEqual(len(worker["template"]["spec"]["containers"]), 1)
    # Read the worker's own container: taking it from the PS spec here
    # would only re-check the PS image/command and leave the worker's
    # unverified.
    worker_container = worker["template"]["spec"]["containers"][0]
    self.assertEqual(worker_container["image"], "tensorflow:1.13")
    self.assertEqual(worker_container["command"], "python tf.py")
    self.assertEqual(worker_container["env"][0]["name"], "access_key")
    self.assertEqual(
        worker_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
        secret_yaml["metadata"]["name"],
    )
    self.assertEqual(worker_container["resources"]["limits"]["cpu"], 0.5)
    self.assertEqual(
        worker_container["resources"]["limits"]["memory"], 1024
    )

    # Evaluator
    evaluator = tfjob["spec"]["tfReplicaSpecs"]["Evaluator"]
    self.assertEqual(evaluator["replicas"], 1)
    self.assertEqual(len(evaluator["template"]["spec"]["containers"]), 1)
    evaluator_container = evaluator["template"]["spec"]["containers"][0]
    self.assertEqual(evaluator_container["image"], "tensorflow:1.13")
    self.assertEqual(
        evaluator_container["resources"]["limits"]["cpu"], 2
    )
    self.assertEqual(
        evaluator_container["resources"]["limits"]["memory"], 4096
    )
def job_2():
    """Run a ``python:3.6`` container that echoes ``$uname`` with a dry-run secret."""
    credentials = {"uname": "abc", "passwd": "def"}
    # The secret is created with dry_run=True.
    dry_run_secret = couler.create_secret(
        secret_data=credentials,
        dry_run=True,
    )
    couler.run_container(
        image="python:3.6",
        secret=dry_run_secret,
        command="echo $uname",
    )
def test_pytorch_train(self):
    """Verify the workflow and PyTorchJob manifest generated by ``pytorch.train``."""
    secret = couler.create_secret(secret_data={"access_key": "key1234"})
    pytorch.train(
        num_workers=3,
        image="pytorch:1.13",
        command="python pytorch.py",
        worker_resources="cpu=0.5,memory=1024",
        worker_restart_policy="OnFailure",
        clean_pod_policy="Running",
        secret=secret,
    )

    # The secret value is expected in base64-encoded form.
    secret_yaml = list(couler.states._secrets.values())[0].to_yaml()
    self.assertEqual(
        secret_yaml["data"]["access_key"], utils.encode_base64("key1234")
    )

    workflow = couler.workflow_yaml()
    self.assertEqual(len(workflow["spec"]["templates"]), 2)

    # Steps template: a single step group holding a single step.
    steps_template = workflow["spec"]["templates"][0]
    self.assertEqual(len(steps_template["steps"]), 1)
    self.assertEqual(len(steps_template["steps"][0]), 1)

    # Train template: a resource template that creates the job.
    train_template = workflow["spec"]["templates"][1]
    self.assertEqual(train_template["name"], "test-pytorch-train")
    resource = train_template["resource"]
    self.assertEqual(resource["action"], "create")
    self.assertEqual(resource["setOwnerReference"], "true")
    self.assertEqual(
        resource["successCondition"],
        "status.pytorchReplicaStatuses.Worker.succeeded > 0",
    )
    self.assertEqual(
        resource["failureCondition"],
        "status.pytorchReplicaStatuses.Worker.failed > 0",
    )

    # PyTorchJob spec, parsed from the manifest string in the template.
    manifest_stream = StringIO(resource["manifest"])
    pytorch_job = yaml.load(manifest_stream, Loader=yaml.FullLoader)
    self.assertEqual(pytorch_job["kind"], "PyTorchJob")
    self.assertEqual(pytorch_job["spec"]["cleanPodPolicy"], "Running")

    # Master: one replica whose first env var references the secret.
    master = pytorch_job["spec"]["pytorchReplicaSpecs"]["Master"]
    self.assertEqual(master["replicas"], 1)
    master_env = master["template"]["spec"]["containers"][0]["env"][0]
    self.assertEqual(master_env["name"], "access_key")
    self.assertEqual(
        master_env["valueFrom"]["secretKeyRef"]["name"],
        secret_yaml["metadata"]["name"],
    )

    # Workers: replica settings, then the single container's details.
    worker = pytorch_job["spec"]["pytorchReplicaSpecs"]["Worker"]
    self.assertEqual(worker["replicas"], 3)
    self.assertEqual(worker["restartPolicy"], "OnFailure")
    self.assertEqual(len(worker["template"]["spec"]["containers"]), 1)
    worker_container = worker["template"]["spec"]["containers"][0]
    self.assertEqual(worker_container["image"], "pytorch:1.13")
    self.assertEqual(worker_container["command"], "python pytorch.py")
    self.assertEqual(worker_container["env"][0]["name"], "access_key")
    self.assertEqual(
        worker_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
        secret_yaml["metadata"]["name"],
    )
    self.assertEqual(worker_container["resources"]["limits"]["cpu"], 0.5)
    self.assertEqual(
        worker_container["resources"]["limits"]["memory"], 1024
    )