def test_is_peer_job_inheritance_matched(self, mock_get_workflow):
    """Exercise is_peer_job_inheritance_matched on a forked workflow.

    mock_get_workflow is primed with a peer config holding 'raw-data-job'
    and a federated 'train-job'; the local config holds only the federated
    'train-job'. With the peer flags fixed at [NEW, REUSE], the check is
    asserted true for a local [REUSE] flag and false for a local [NEW] flag.
    """
    # Peer side: the response returned by the mock.
    peer_definition = WorkflowDefinition()
    peer_definition.job_definitions.extend([
        JobDefinition(name='raw-data-job'),
        JobDefinition(name='train-job', is_federated=True),
    ])
    mock_get_workflow.return_value = GetWorkflowResponse(
        config=peer_definition)

    # Local side: one federated job, shared by the base and forked workflow.
    local_definition = WorkflowDefinition(job_definitions=[
        JobDefinition(name='train-job', is_federated=True),
    ])
    owner = Project()
    owner.set_config(
        project_pb2.Project(participants=[project_pb2.Participant()]))

    # The base workflow is persisted so that its id can be forked from.
    base_workflow = Workflow(project=owner)
    base_workflow.set_config(local_definition)
    db.session.add(base_workflow)
    db.session.commit()
    db.session.flush()

    forked_workflow = Workflow(project=owner, forked_from=base_workflow.id)
    forked_workflow.set_config(local_definition)
    forked_workflow.set_create_job_flags([CreateJobFlag.REUSE])
    forked_workflow.set_peer_create_job_flags(
        [CreateJobFlag.NEW, CreateJobFlag.REUSE])
    self.assertTrue(is_peer_job_inheritance_matched(forked_workflow))

    # Switching the local flag to NEW is expected to break the match.
    forked_workflow.set_create_job_flags([CreateJobFlag.NEW])
    self.assertFalse(is_peer_job_inheritance_matched(forked_workflow))
def test_patch_create_job_flags(self):
    """PATCHing create_job_flags toggles is_disabled on the workflow's job.

    A READY workflow with one stopped, enabled job is stored first. The test
    then sends create_job_flags [3] and expects the job to be disabled,
    followed by [1] and expects it to be enabled again.
    """
    definition = WorkflowDefinition()
    job_definition = definition.job_definitions.add()
    stored_workflow = Workflow(
        name='test-workflow',
        project_id=123,
        config=definition.SerializeToString(),
        forkable=False,
        state=WorkflowState.READY,
    )
    db.session.add(stored_workflow)
    # Flush so the workflow id exists before the job references it.
    db.session.flush()

    stored_job = Job(
        name='test_job',
        job_type=JobType(1),
        config=job_definition.SerializeToString(),
        workflow_id=stored_workflow.id,
        project_id=123,
        state=JobState.STOPPED,
        is_disabled=False,
    )
    db.session.add(stored_job)
    db.session.flush()
    stored_workflow.job_ids = str(stored_job.id)
    db.session.commit()

    # (flag sent in the request, is_disabled value expected afterwards)
    for flag, expected_disabled in ((3, True), (1, False)):
        response = self.patch_helper(
            f'/api/v2/workflows/{stored_workflow.id}',
            data={'create_job_flags': [flag]})
        self.assertEqual(response.status_code, HTTPStatus.OK)
        patched_job = Job.query.get(stored_job.id)
        self.assertEqual(patched_job.is_disabled, expected_disabled)
def test_patch_batch_update_interval(self, mock_collect, mock_finish,
                                     mock_patch_item, mock_get_item_status):
    """PATCHing batch_update_interval drives the cron-job mocks.

    For a stopped is_left=True workflow the test expects, in order:
    mock_collect on the first interval, mock_patch_item on a changed
    interval, and mock_finish when the interval is set to -1. For a stopped
    is_left=False workflow the same PATCH must return BAD_REQUEST.
    """
    mock_get_item_status.side_effect = [None, ItemStatus.ON]

    left_workflow = Workflow(
        name='test-workflow-left',
        project_id=123,
        config=WorkflowDefinition(is_left=True).SerializeToString(),
        forkable=False,
        state=WorkflowState.STOPPED,
    )
    db.session.add(left_workflow)
    db.session.commit()
    db.session.refresh(left_workflow)

    left_id = left_workflow.id
    left_url = f'/api/v2/workflows/{left_id}'
    cron_job_name = f'workflow_cron_job_{left_id}'

    # test create cronjob
    first_interval = 1
    response = self.patch_helper(
        left_url, data={'batch_update_interval': first_interval})
    self.assertEqual(response.status_code, HTTPStatus.OK)
    mock_collect.assert_called_with(
        name=cron_job_name,
        items=[WorkflowCronJobItem(left_id)],
        metadata={},
        interval=first_interval * 60)

    # patch new interval time for cronjob
    second_interval = 2
    response = self.patch_helper(
        left_url, data={'batch_update_interval': second_interval})
    self.assertEqual(response.status_code, HTTPStatus.OK)
    mock_patch_item.assert_called_with(
        name=cron_job_name,
        key='interval_time',
        value=second_interval * 60)

    # test stop cronjob
    response = self.patch_helper(
        left_url, data={'batch_update_interval': -1})
    self.assertEqual(response.status_code, HTTPStatus.OK)
    mock_finish.assert_called_with(name=cron_job_name)

    # An is_left=False workflow must be rejected.
    right_workflow = Workflow(
        name='test-workflow-right',
        project_id=456,
        config=WorkflowDefinition(is_left=False).SerializeToString(),
        forkable=False,
        state=WorkflowState.STOPPED,
    )
    db.session.add(right_workflow)
    db.session.commit()
    db.session.refresh(right_workflow)
    response = self.patch_helper(
        f'/api/v2/workflows/{right_workflow.id}',
        data={'batch_update_interval': 1})
    self.assertEqual(response.status_code, HTTPStatus.BAD_REQUEST)
def test_post_successfully(self):
    """POSTing a new workflow template stores it and echoes it back.

    Confirms the template name is absent beforehand, posts it, expects
    CREATED, then compares the stored row's name, comment and serialized
    config and checks that the response data equals the row's to_dict().
    """
    template_name = 'test-nb-template'
    lookup = WorkflowTemplate.query.filter_by(name=template_name)
    self.assertIsNone(lookup.first())

    payload = {
        'name': template_name,
        'comment': 'test-comment',
        'config': {
            'group_alias': 'g222',
            'is_left': True
        }
    }
    response = self.post_helper('/api/v2/workflow_templates', data=payload)
    self.assertEqual(response.status_code, HTTPStatus.CREATED)
    returned = json.loads(response.data).get('data')

    # Checks DB
    stored = WorkflowTemplate.query.filter_by(name=template_name).first()
    self.assertEqual(stored.name, template_name)
    self.assertEqual(stored.comment, 'test-comment')
    expected_config = WorkflowDefinition(group_alias='g222', is_left=True)
    self.assertEqual(stored.config, expected_config.SerializeToString())
    self.assertEqual(returned, stored.to_dict())
def test_patch_invalid_target_state(self, mock_wakeup):
    """A PATCH is rejected while the workflow already has a target state.

    The stored workflow is READY with target_state RUNNING. Requesting
    target_state 'READY' must return BAD_REQUEST with the expected details
    message, leave both stored states untouched and never call mock_wakeup.
    """
    pending_workflow = Workflow(
        name='test-workflow',
        project_id=123,
        config=WorkflowDefinition().SerializeToString(),
        forkable=False,
        state=WorkflowState.READY,
        target_state=WorkflowState.RUNNING,
    )
    db.session.add(pending_workflow)
    db.session.commit()
    db.session.refresh(pending_workflow)

    response = self.patch_helper(
        f'/api/v2/workflows/{pending_workflow.id}',
        data={'target_state': 'READY'})
    self.assertEqual(response.status_code, HTTPStatus.BAD_REQUEST)
    details = json.loads(response.data).get('details')
    self.assertEqual(details, 'Another transaction is in progress [1]')

    # Checks DB
    reloaded = Workflow.query.get(pending_workflow.id)
    self.assertEqual(reloaded.state, WorkflowState.READY)
    self.assertEqual(reloaded.target_state, WorkflowState.RUNNING)

    # Checks scheduler
    mock_wakeup.assert_not_called()
def test_patch_successfully(self, mock_wakeup):
    """PATCHing target_state on a READY workflow succeeds.

    Expects OK, a response body showing state 'READY' and target_state
    'RUNNING', the stored row's target_state set to RUNNING, and exactly one
    mock_wakeup call with the workflow id.
    """
    ready_workflow = Workflow(
        name='test-workflow',
        project_id=123,
        config=WorkflowDefinition().SerializeToString(),
        forkable=False,
        state=WorkflowState.READY,
    )
    db.session.add(ready_workflow)
    db.session.commit()
    db.session.refresh(ready_workflow)

    response = self.patch_helper(
        f'/api/v2/workflows/{ready_workflow.id}',
        data={'target_state': 'RUNNING'})
    self.assertEqual(response.status_code, HTTPStatus.OK)
    body = json.loads(response.data).get('data')
    self.assertEqual(body['id'], ready_workflow.id)
    self.assertEqual(body['state'], 'READY')
    self.assertEqual(body['target_state'], 'RUNNING')

    # Checks DB
    reloaded = Workflow.query.get(ready_workflow.id)
    self.assertEqual(reloaded.target_state, WorkflowState.RUNNING)

    # Checks scheduler
    mock_wakeup.assert_called_once_with(ready_workflow.id)
def setUp(self):
    """Run the parent setUp, then insert two workflow templates.

    't1' (group 'g1', is_left=True, with a comment) and 't2' (group 'g2',
    is_left=False) are each given a WorkflowDefinition carrying the same
    group_alias and is_left values, and are committed together.
    """
    super().setUp()
    # Inserts data
    template_specs = (
        {'name': 't1', 'comment': 'comment for t1',
         'group_alias': 'g1', 'is_left': True},
        {'name': 't2', 'group_alias': 'g2', 'is_left': False},
    )
    for spec in template_specs:
        template = WorkflowTemplate(**spec)
        template.set_config(WorkflowDefinition(
            group_alias=spec['group_alias'],
            is_left=spec['is_left'],
        ))
        db.session.add(template)
    db.session.commit()
def add_fake_workflow(session):
    """Store a READY workflow with a single stopped job and return both.

    Args:
        session: the session used to add, flush and commit the rows.

    Returns:
        A (workflow, job) tuple; the workflow's job_ids is the job id as a
        string.
    """
    definition = WorkflowDefinition()
    job_definition = definition.job_definitions.add()

    fake_workflow = Workflow(
        name='test-workflow',
        project_id=123,
        config=definition.SerializeToString(),
        forkable=False,
        state=WorkflowState.READY,
    )
    session.add(fake_workflow)
    # Flush so the workflow id exists before the job references it.
    session.flush()

    fake_job = Job(
        name='test_job',
        job_type=JobType(1),
        config=job_definition.SerializeToString(),
        workflow_id=fake_workflow.id,
        project_id=123,
        state=JobState.STOPPED,
        is_disabled=False,
    )
    session.add(fake_job)
    session.flush()

    fake_workflow.job_ids = str(fake_job.id)
    session.commit()
    return fake_workflow, fake_job
def test_put_resetting(self):
    """A PUT on a workflow that already has a config returns CONFLICT.

    The stored workflow is in state NEW and already carries a serialized
    config with group_alias 'test-template'.
    """
    existing_config = WorkflowDefinition(group_alias='test-template')
    configured_workflow = Workflow(
        name='test-workflow',
        project_id=123,
        config=existing_config.SerializeToString(),
        state=WorkflowState.NEW,
    )
    db.session.add(configured_workflow)
    db.session.commit()
    db.session.refresh(configured_workflow)

    payload = {
        'forkable': True,
        'config': {'group_alias': 'test-template'},
    }
    response = self.put_helper(
        f'/api/v2/workflows/{configured_workflow.id}', data=payload)
    self.assertEqual(response.status_code, HTTPStatus.CONFLICT)
# Builds and returns a WorkflowDefinition fixture with group_alias
# 'psi_join_tree_model' and is_left=False.
#
# Workflow-level variables: image_version ('v1.5-rc3', PEER_READABLE) and
# num_partitions ('2', PEER_WRITABLE).
#
# Job definitions, in order:
#   * 'raw-data-job'  - RAW_DATA, not federated.
#   * 'data-join-job' - PSI_DATA_JOIN, federated, depends on 'raw-data-job'.
#   * 'train-job'     - TREE_MODEL_TRAINING, federated, depends on
#                       'data-join-job'.
# Each job carries a yaml_template string describing an FLApp resource with
# ${...} placeholders (system.*, project.*, workflow.*).
#
# NOTE(review): in the 'data-join-job' Worker env, RSA_KEY_PATH and
# RSA_PRIVATE_KEY_PATH use
# ${workflow.jobs.data-join-job.rsa_private_key_path}, without the
# '.variables.' segment that every other job-variable placeholder in this
# template has - confirm whether that is intended.
# NOTE(review): the 'data-join-job' Master container has a trailing comma
# after its "resources" entry; a strict JSON parser would reject it - confirm
# the template consumer tolerates it.
def make_workflow_template(): workflow = WorkflowDefinition( group_alias='psi_join_tree_model', is_left=False, variables=[ Variable(name='image_version', value='v1.5-rc3', access_mode=Variable.PEER_READABLE), Variable(name='num_partitions', value='2', access_mode=Variable.PEER_WRITABLE), ], job_definitions=[ JobDefinition( name='raw-data-job', job_type=JobDefinition.RAW_DATA, is_federated=False, variables=[ Variable( name='input_dir', value='/app/deploy/integrated_test/tfrecord_raw_data', access_mode=Variable.PRIVATE), Variable(name='file_wildcard', value='*.rd', access_mode=Variable.PRIVATE), Variable(name='batch_size', value='1024', access_mode=Variable.PEER_WRITABLE), Variable(name='input_format', value='TF_RECORD', access_mode=Variable.PRIVATE), Variable(name='worker_cpu', value='2000m', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_mem', value='4Gi', access_mode=Variable.PEER_WRITABLE), ], yaml_template='''{ "apiVersion": "fedlearner.k8s.io/v1alpha1", "kind": "FLApp", "metadata": { "name": "${workflow.jobs.raw-data-job.name}", "namespace": "${project.variables.namespace}" }, "spec": { "cleanPodPolicy": "All", "flReplicaSpecs": { "Master": { "template": { "spec": { "containers": [ { "resources": { "limits": { "cpu": "1000m", "memory": "2Gi" }, "requests": { "cpu": "1000m", "memory": "2Gi" } }, "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "command": [ "/app/deploy/scripts/data_portal/run_data_portal_master.sh" ], "args": [], "env": [ ${system.basic_envs}, { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" }, { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" }, { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" }, { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" }, { "name": "APPLICATION_ID", "value": 
"${workflow.jobs.raw-data-job.name}" }, { "name": "DATA_PORTAL_NAME", "value": "${workflow.jobs.raw-data-job.name}" }, { "name": "OUTPUT_PARTITION_NUM", "value": "${workflow.variables.num_partitions}" }, { "name": "INPUT_BASE_DIR", "value": "${workflow.jobs.raw-data-job.variables.input_dir}" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/raw_data/${workflow.jobs.raw-data-job.name}" }, { "name": "RAW_DATA_PUBLISH_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" }, { "name": "DATA_PORTAL_TYPE", "value": "PSI" }, { "name": "FILE_WILDCARD", "value": "${workflow.jobs.raw-data-job.variables.file_wildcard}" } ], "volumeMounts": [ { "mountPath": "/data", "name": "data" } ], "imagePullPolicy": "IfNotPresent", "name": "tensorflow" } ], "imagePullSecrets": [ { "name": "regcred" } ], "volumes": [ { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" } ], "restartPolicy": "Never" } }, "pair": false, "replicas": 1 }, "Worker": { "replicas": ${workflow.variables.num_partitions}, "template": { "spec": { "containers": [ { "resources": { "limits": { "cpu": "${workflow.jobs.raw-data-job.variables.worker_cpu}", "memory": "${workflow.jobs.raw-data-job.variables.worker_mem}" }, "requests": { "cpu": "${workflow.jobs.raw-data-job.variables.worker_cpu}", "memory": "${workflow.jobs.raw-data-job.variables.worker_mem}" } }, "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}", "command": [ "/app/deploy/scripts/data_portal/run_data_portal_worker.sh" ], "args": [], "env": [ ${system.basic_envs}, { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" }, { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" }, { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" }, { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" }, { "name": "APPLICATION_ID", 
"value": "${workflow.jobs.raw-data-job.name}" }, { "name": "BATCH_SIZE", "value": "${workflow.jobs.raw-data-job.variables.batch_size}" }, { "name": "INPUT_DATA_FORMAT", "value": "${workflow.jobs.raw-data-job.variables.input_format}" }, { "name": "COMPRESSED_TYPE", "value": "" }, { "name": "OUTPUT_DATA_FORMAT", "value": "TF_RECORD" } ], "volumeMounts": [ { "mountPath": "/data", "name": "data" } ], "imagePullPolicy": "IfNotPresent", "name": "tensorflow" } ], "imagePullSecrets": [ { "name": "regcred" } ], "volumes": [ { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" } ], "restartPolicy": "Never" } }, "pair": false } } } } '''), JobDefinition(name='data-join-job', job_type=JobDefinition.PSI_DATA_JOIN, is_federated=True, variables=[ Variable(name='worker_cpu', value='4000m', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_mem', value='4Gi', access_mode=Variable.PEER_WRITABLE), Variable(name='rsa_private_key_path', value='', access_mode=Variable.PRIVATE), ], dependencies=[JobDependency(source='raw-data-job')], yaml_template=''' { "apiVersion": "fedlearner.k8s.io/v1alpha1", "kind": "FLApp", "metadata": { "name": "${workflow.jobs.data-join-job.name}", "namespace": "${project.variables.namespace}" }, "spec": { "role": "Leader", "cleanPodPolicy": "All", "peerSpecs": { "Follower": { "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80", "authority": "${project.participants[0].egress_domain}", "extraHeaders": { "x-host": "default.fedlearner.operator" } } }, "flReplicaSpecs": { "Master": { "template": { "spec": { "restartPolicy": "Never", "containers": [ { "env": [ ${system.basic_envs}, { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" }, { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" }, { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" }, { "name": "APPLICATION_ID", "value": 
"${workflow.jobs.data-join-job.name}" }, { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" }, { "name": "ROLE", "value": "leader" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data-join-job.name}" }, { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" }, { "name": "START_TIME", "value": "0" }, { "name": "END_TIME", "value": "999999999999" }, { "name": "NEGATIVE_SAMPLING_RATE", "value": "1.0" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" } ], "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "volumeMounts": [ { "mountPath": "/data", "name": "data" } ], "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ], "args": [ "/app/deploy/scripts/rsa_psi/run_psi_data_join_master.sh" ], "resources": { "limits": { "cpu": "2000m", "memory": "3Gi" }, "requests": { "cpu": "2000m", "memory": "3Gi" } }, } ], "imagePullSecrets": [ { "name": "regcred" } ], "volumes": [ { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" } ] } }, "pair": true, "replicas": 1 }, "Worker": { "template": { "spec": { "restartPolicy": "Never", "containers": [ { "env": [ ${system.basic_envs}, { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" }, { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" }, { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" }, { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" }, { "name": "ROLE", "value": "follower" }, { "name": "APPLICATION_ID", "value": "${workflow.jobs.data-join-job.name}" }, { "name": "OUTPUT_BASE_DIR", "value": 
"${project.variables.storage_root_dir}/data_source/${workflow.jobs.data-join-job.name}" }, { "name": "RSA_KEY_PATH", "value": "${workflow.jobs.data-join-job.rsa_private_key_path}" }, { "name": "RSA_PRIVATE_KEY_PATH", "value": "${workflow.jobs.data-join-job.rsa_private_key_path}" }, { "name": "PSI_RAW_DATA_ITER", "value": "TF_RECORD" }, { "name": "PSI_OUTPUT_BUILDER", "value": "TF_RECORD" }, { "name": "DATA_BLOCK_BUILDER", "value": "TF_RECORD" }, { "name": "DATA_BLOCK_DUMP_INTERVAL", "value": "600" }, { "name": "DATA_BLOCK_DUMP_THRESHOLD", "value": "524288" }, { "name": "EXAMPLE_ID_DUMP_INTERVAL", "value": "600" }, { "name": "EXAMPLE_ID_DUMP_THRESHOLD", "value": "524288" }, { "name": "EXAMPLE_JOINER", "value": "SORT_RUN_JOINER" }, { "name": "SIGN_RPC_TIMEOUT_MS", "value": "128000" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" }, { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" } ], "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "volumeMounts": [ { "mountPath": "/data", "name": "data" } ], "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ], "args": [ "/app/deploy/scripts/rsa_psi/run_psi_data_join_worker.sh" ], "resources": { "limits": { "cpu": "${workflow.jobs.data-join-job.variables.worker_cpu}", "memory": "${workflow.jobs.data-join-job.variables.worker_mem}" }, "requests": { "cpu": "${workflow.jobs.data-join-job.variables.worker_cpu}", "memory": "${workflow.jobs.data-join-job.variables.worker_mem}" } } } ], "imagePullSecrets": [ { "name": "regcred" } ], "volumes": [ { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" } ] } }, "pair": true, "replicas": ${workflow.variables.num_partitions} } } } } '''), JobDefinition(name='train-job', job_type=JobDefinition.TREE_MODEL_TRAINING, 
is_federated=True, variables=[ Variable(name='worker_cpu', value='4000m', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_mem', value='8Gi', access_mode=Variable.PEER_WRITABLE), Variable(name='send_scores_to_follower', value='True', access_mode=Variable.PEER_WRITABLE), Variable(name='send_metrics_to_follower', value='True', access_mode=Variable.PEER_WRITABLE), Variable(name='num_parallel', value='4', access_mode=Variable.PEER_WRITABLE), ], dependencies=[JobDependency(source='data-join-job')], yaml_template=''' { "apiVersion": "fedlearner.k8s.io/v1alpha1", "kind": "FLApp", "metadata": { "name": "${workflow.jobs.train-job.name}", "namespace": "${project.variables.namespace}" }, "spec": { "role": "Leader", "cleanPodPolicy": "All", "peerSpecs": { "Leader": { "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80", "authority": "${project.participants[0].egress_domain}", "extraHeaders": { "x-host": "default.fedlearner.operator" } } }, "flReplicaSpecs": { "Worker": { "template": { "spec": { "restartPolicy": "Never", "containers": [ { "env": [ ${system.basic_envs}, { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" }, { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" }, { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" }, { "name": "APPLICATION_ID", "value": "${workflow.jobs.train-job.name}" }, { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" }, { "name": "ROLE", "value": "leader" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/job_output/${workflow.jobs.train-job.name}" }, { "name": "MODE", "value": "train" }, { "name": "SEND_SCORES_TO_FOLLOWER", "value": "${workflow.jobs.train-job.variables.send_scores_to_follower}" }, { "name": "SEND_METRICS_TO_FOLLOWER", "value": "${workflow.jobs.train-job.variables.send_metrics_to_follower}" }, { "name": "NUM_PARALLEL", 
"value": "${workflow.jobs.train-job.variables.num_parallel}" }, { "name": "DATA_SOURCE", "value": "${workflow.jobs.data-join-job.name}" } ], "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "volumeMounts": [ { "mountPath": "/data", "name": "data" } ], "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ], "args": [ "/app/deploy/scripts/trainer/run_tree_worker.sh" ], "resources": { "limits": { "cpu": "${workflow.jobs.train-job.variables.worker_cpu}", "memory": "${workflow.jobs.train-job.variables.worker_mem}" }, "requests": { "cpu": "${workflow.jobs.train-job.variables.worker_cpu}", "memory": "${workflow.jobs.train-job.variables.worker_mem}" } } } ], "imagePullSecrets": [ { "name": "regcred" } ], "volumes": [ { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" } ] } }, "pair": true, "replicas": 1 } } } } ''') ]) return workflow
# Builds and returns a WorkflowDefinition fixture with group_alias
# 'test_template' and is_left=True.
#
# Workflow-level variables: image_version ('v1.5-rc3', PEER_READABLE) and
# num_partitions ('4', PEER_WRITABLE).
#
# Job definitions, in order:
#   * 'raw_data_job'  - RAW_DATA, not federated, not manual.
#   * 'data_join_job' - DATA_JOIN, federated, not manual, depends on
#                       'raw_data_job'.
# Each job carries a yaml_template string describing an FLApp resource with
# ${...} placeholders (system.*, project.*, workflow.*).
#
# NOTE(review): the 'data_join_job' template reads
# ${workflow.jobs.raw_data_job.variables.num_partitions}, but num_partitions
# is declared here only as a workflow-level variable and 'raw_data_job'
# declares no variable of that name - confirm whether that is intended.
# NOTE(review): the 'data_join_job' env lists contain RAW_DATA_SUB_DIR and
# PARTITION_NUM more than once. Its Worker container also uses the
# master_cpu/master_mem variables and a fixed image tag ('5b499dd') rather
# than worker_cpu/worker_mem and image_version - confirm whether that is
# intended.
def make_workflow_template(): workflow = WorkflowDefinition( group_alias='test_template', is_left=True, variables=[ Variable(name='image_version', value='v1.5-rc3', access_mode=Variable.PEER_READABLE), Variable(name='num_partitions', value='4', access_mode=Variable.PEER_WRITABLE), ], job_definitions=[ JobDefinition( name='raw_data_job', job_type=JobDefinition.RAW_DATA, is_federated=False, is_manual=False, variables=[ Variable( name='input_dir', value='/app/deploy/integrated_test/tfrecord_raw_data', access_mode=Variable.PRIVATE), Variable(name='file_wildcard', value='*.rd', access_mode=Variable.PRIVATE), Variable(name='batch_size', value='1024', access_mode=Variable.PEER_WRITABLE), Variable(name='input_format', value='TF_RECORD', access_mode=Variable.PRIVATE), Variable(name='output_format', value='TF_RECORD', access_mode=Variable.PRIVATE), Variable(name='master_cpu', value='2', access_mode=Variable.PEER_WRITABLE), Variable(name='master_mem', value='3Gi', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_cpu', value='2', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_mem', value='3Gi', access_mode=Variable.PEER_WRITABLE), ], yaml_template='''{ "apiVersion": "fedlearner.k8s.io/v1alpha1", "kind": "FLApp", "metadata": { "name": "${workflow.jobs.raw_data_job.name}", "namespace": "${project.variables.namespace}" }, "spec": { "cleanPodPolicy": "All", "flReplicaSpecs": { "Master": { "pair": false, "replicas": 1, "template": { "spec": { "containers": [ { "command": [ "/app/deploy/scripts/data_portal/run_data_portal_master.sh" ], "env": [ { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } }, { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } }, ${system.basic_envs}, ${project.variables.basic_envs}, { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw_data_job.name}" }, { "name": "DATA_PORTAL_NAME", "value": "${workflow.jobs.raw_data_job.name}" }, { "name": "OUTPUT_PARTITION_NUM", "value": 
"${workflow.variables.num_partitions}" }, { "name": "INPUT_BASE_DIR", "value": "${workflow.jobs.raw_data_job.variables.input_dir}" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/raw_data/${workflow.jobs.raw_data_job.name}" }, { "name": "RAW_DATA_PUBLISH_DIR", "value": "portal_publish_dir/${workflow.jobs.raw_data_job.name}" }, { "name": "DATA_PORTAL_TYPE", "value": "Streaming" }, { "name": "FILE_WILDCARD", "value": "${workflow.jobs.raw_data_job.variables.file_wildcard}" } ], "image": "hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}", "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "resources": { "limits": { "cpu": "${workflow.jobs.raw_data_job.variables.master_cpu}", "memory": "${workflow.jobs.raw_data_job.variables.master_mem}" }, "requests": { "cpu": "${workflow.jobs.raw_data_job.variables.master_cpu}", "memory": "${workflow.jobs.raw_data_job.variables.master_mem}" } }, "volumeMounts": [ { "mountPath": "/data", "name": "data" } ] } ], "imagePullSecrets": [ { "name": "regcred" } ], "restartPolicy": "Never", "volumes": [ { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } } ] } } }, "Worker": { "pair": false, "replicas": ${workflow.variables.num_partitions}, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "command": [ "/app/deploy/scripts/data_portal/run_data_portal_worker.sh" ], "env": [ { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } }, { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } }, ${system.basic_envs}, ${project.variables.basic_envs}, { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } }, { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } }, { "name": "CPU_LIMIT", "valueFrom": { 
"resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } }, { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.memory" } } }, { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw_data_job.name}" }, { "name": "BATCH_SIZE", "value": "${workflow.jobs.raw_data_job.variables.batch_size}" }, { "name": "INPUT_DATA_FORMAT", "value": "${workflow.jobs.raw_data_job.variables.input_format}" }, { "name": "COMPRESSED_TYPE" }, { "name": "OUTPUT_DATA_FORMAT", "value": "${workflow.jobs.raw_data_job.variables.output_format}" } ], "image": "hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}", "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "resources": { "limits": { "cpu": "${workflow.jobs.raw_data_job.variables.worker_cpu}", "memory": "${workflow.jobs.raw_data_job.variables.worker_mem}" }, "requests": { "cpu": "${workflow.jobs.raw_data_job.variables.worker_cpu}", "memory": "${workflow.jobs.raw_data_job.variables.worker_mem}" } }, "volumeMounts": [ { "mountPath": "/data", "name": "data" } ] } ], "imagePullSecrets": [ { "name": "regcred" } ], "restartPolicy": "Never", "volumes": [ { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } } ] } } } }, "peerSpecs": { "Leader": { "peerURL": "" } }, "role": "Follower" } } '''), JobDefinition(name='data_join_job', job_type=JobDefinition.DATA_JOIN, is_federated=True, is_manual=False, variables=[ Variable(name='master_cpu', value='2', access_mode=Variable.PEER_WRITABLE), Variable(name='master_mem', value='3Gi', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_cpu', value='2', access_mode=Variable.PEER_WRITABLE), Variable(name='worker_mem', value='3Gi', access_mode=Variable.PEER_WRITABLE), Variable(name='role', value='Follower', access_mode=Variable.PEER_WRITABLE), ], dependencies=[JobDependency(source='raw_data_job')], yaml_template=''' { "apiVersion": "fedlearner.k8s.io/v1alpha1", "kind": "FLApp", "metadata": { 
"name": "${workflow.jobs.data_join_job.name}", "namespace": "${project.variables.namespace}" }, "spec": { "cleanPodPolicy": "All", "flReplicaSpecs": { "Master": { "pair": true, "replicas": 1, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "args": [ "/app/deploy/scripts/data_join/run_data_join_master.sh" ], "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ], "env": [ { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } }, { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } }, ${system.basic_envs}, ${project.variables.basic_envs}, { "name": "ROLE", "value": "${workflow.jobs.data_join_job.variables.role}" }, { "name": "APPLICATION_ID", "value": "${workflow.jobs.data_join_job.name}" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data_join_job.name}" }, { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } }, { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } }, { "name": "CPU_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } }, { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.memory" } } }, { "name": "BATCH_MODE", "value": "--batch_mode" }, { "name": "PARTITION_NUM", "value": "${workflow.jobs.raw_data_job.variables.num_partitions}" }, { "name": "START_TIME", "value": "0" }, { "name": "END_TIME", "value": "999999999999" }, { "name": "NEGATIVE_SAMPLING_RATE", "value": "1.0" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.data_join_job.name}" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.data_join_job.name}" }, { "name": "PARTITION_NUM", "value": "${workflow.jobs.raw_data_job.variables.num_partitions}" } ], "image": 
"hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}", "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "resources": { "limits": { "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}", "memory": "${workflow.jobs.data_join_job.variables.master_mem}" }, "requests": { "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}", "memory": "${workflow.jobs.data_join_job.variables.master_mem}" } }, "volumeMounts": [ { "mountPath": "/data", "name": "data" } ] } ], "imagePullSecrets": [ { "name": "regcred" } ], "restartPolicy": "Never", "volumes": [ { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } } ] } } }, "Worker": { "pair": true, "replicas": ${workflow.jobs.raw_data_job.variables.num_partitions}, "template": { "metadata": { "creationTimestamp": null }, "spec": { "containers": [ { "args": [ "/app/deploy/scripts/data_join/run_data_join_worker.sh" ], "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ], "env": [ { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } }, { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } }, ${system.basic_envs}, ${project.variables.basic_envs}, { "name": "ROLE", "value": "${workflow.jobs.data_join_job.variables.role}" }, { "name": "APPLICATION_ID", "value": "${workflow.jobs.data_join_job.name}" }, { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data_join_job.name}" }, { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } }, { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } }, { "name": "CPU_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } }, { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": 
"limits.memory" } } }, { "name": "PARTITION_NUM", "value": "${workflow.jobs.raw_data_job.variables.num_partitions}" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.data_join_job.name}" }, { "name": "DATA_BLOCK_DUMP_INTERVAL", "value": "600" }, { "name": "DATA_BLOCK_DUMP_THRESHOLD", "value": "65536" }, { "name": "EXAMPLE_ID_DUMP_INTERVAL", "value": "600" }, { "name": "EXAMPLE_ID_DUMP_THRESHOLD", "value": "65536" }, { "name": "EXAMPLE_ID_BATCH_SIZE", "value": "4096" }, { "name": "MAX_FLYING_EXAMPLE_ID", "value": "307152" }, { "name": "MIN_MATCHING_WINDOW", "value": "2048" }, { "name": "MAX_MATCHING_WINDOW", "value": "8192" }, { "name": "RAW_DATA_ITER", "value": "${workflow.jobs.raw_data_job.variables.output_format}" }, { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw_data_job.name}" }, { "name": "PARTITION_NUM", "value": "${workflow.jobs.raw_data_job.variables.num_partitions}" } ], "image": "artifact.bytedance.com/fedlearner/fedlearner:5b499dd", "imagePullPolicy": "IfNotPresent", "name": "tensorflow", "ports": [ { "containerPort": 50051, "name": "flapp-port" } ], "resources": { "limits": { "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}", "memory": "${workflow.jobs.data_join_job.variables.master_mem}" }, "requests": { "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}", "memory": "${workflow.jobs.data_join_job.variables.master_mem}" } }, "volumeMounts": [ { "mountPath": "/data", "name": "data" } ] } ], "imagePullSecrets": [ { "name": "regcred" } ], "restartPolicy": "Never", "volumes": [ { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } } ] } } } }, "peerSpecs": { "Follower": { "authority": "external.name", "extraHeaders": { "x-host": "leader.flapp.operator" }, "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" } }, "role": "Leader" } } ''') ]) return workflow