def get_container(train_op, train_env, train_num_gpus, drive='coco-headset-vol-1'):
    """Configure resources, scheduling, and volumes for a GPU training op.

    Args:
        train_op: container op produced by a loaded train component.
        train_env: environment variables injected via ``add_env``.
        train_num_gpus: number of GPUs to request; also picks the
            ``p3.{2*n}xlarge`` instance type.
        drive: name of the persistent volume claim mounted at ``/data/``.
    """
    container = train_op.container
    container.set_memory_request('56Gi')
    container.set_memory_limit('56Gi')
    container.set_cpu_request('7.5')
    container.set_cpu_limit('7.5')
    container.set_gpu_limit(str(train_num_gpus))
    container.add_volume_mount(
        V1VolumeMount(name='tensorboard', mount_path='/shared/tensorboard'))
    container.add_volume_mount(
        V1VolumeMount(name='data', mount_path='/data/'))
    container.add_volume_mount(
        V1VolumeMount(name='shm', mount_path='/dev/shm'))

    op = add_env(add_ssh_volume(train_op), train_env)
    op.add_toleration(
        V1Toleration(key='nvidia.com/gpu', operator='Exists',
                     effect='NoSchedule'))
    op.add_node_selector_constraint('beta.kubernetes.io/instance-type',
                                    f'p3.{2 * train_num_gpus}xlarge')
    op.add_volume(
        V1Volume(name='tensorboard',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'tensorboard-research-kf')))
    op.add_volume(
        V1Volume(name='data',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     drive)))
    # A host-path mount of /dev/shm was considered previously; an in-memory
    # emptyDir is used instead.
    op.add_volume(
        V1Volume(name='shm',
                 empty_dir=V1EmptyDirVolumeSource(medium='Memory')))
def use_preemptible_nodepool(toleration: V1Toleration = None,
                             hard_constraint: bool = False):
    """An operator that configures the GKE preemptible in a container op.

    Args:
        toleration (V1Toleration): toleration to pods, default is the
            preemptible label.
        hard_constraint (bool): the constraint of scheduling the pods on
            preemptible nodepools is hard. (Default: False)
    """
    if toleration is None:
        # Build the default per call: a default-argument V1Toleration would
        # be a single mutable object shared by every call site.
        toleration = V1Toleration(effect='NoSchedule',
                                  key='preemptible',
                                  operator='Equal',
                                  value='true')

    def _set_preemptible(task):
        # Tolerate the preemptible taint, then steer (or pin) scheduling
        # onto preemptible nodes via node affinity.
        task.add_toleration(toleration)
        node_selector_term = V1NodeSelectorTerm(match_expressions=[
            V1NodeSelectorRequirement(key='cloud.google.com/gke-preemptible',
                                      operator='In',
                                      values=['true'])
        ])
        if hard_constraint:
            # Hard requirement: pods are only schedulable on preemptible nodes.
            node_affinity = V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=
                V1NodeSelector(node_selector_terms=[node_selector_term]))
        else:
            # Soft preference (weight 50): prefer preemptible nodes but allow
            # fallback to regular ones.
            node_affinity = V1NodeAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    V1PreferredSchedulingTerm(preference=node_selector_term,
                                              weight=50)
                ])
        affinity = V1Affinity(node_affinity=node_affinity)
        task.add_affinity(affinity=affinity)
        return task

    return _set_preemptible
def use_preemptible_nodepool(toleration: V1Toleration = None):
    """An operator that configures the GKE preemptible in a container op.

    Args:
        toleration (V1Toleration): toleration applied to the task, default is
            the preemptible label.
    """
    if toleration is None:
        # Construct the default lazily: a default-argument V1Toleration would
        # be one mutable object shared across all calls.
        toleration = V1Toleration(effect='NoSchedule',
                                  key='preemptible',
                                  operator='Equal',
                                  value='true')

    def _set_preemptible(task):
        task.add_toleration(toleration)
        task.add_node_selector_constraint("cloud.google.com/gke-preemptible",
                                          "true")
        return task

    return _set_preemptible
def tolerations():
    """A pipeline with tolerations"""
    gpu_toleration = V1Toleration(effect='NoSchedule',
                                  key='gpu',
                                  operator='Equal',
                                  value='run')
    op1 = dsl.ContainerOp(
        name='download',
        image='busybox',
        command=['sh', '-c'],
        arguments=['sleep 10; wget localhost:5678 -O /tmp/results.txt'],
        file_outputs={'downloaded': '/tmp/results.txt'})
    op1.add_toleration(gpu_toleration)
def add_gpu_toleration(toleration: V1Toleration = None):
    """An operator that configures the GKE GPU nodes in a container op.

    Args:
        toleration: toleration to pods, default is the nvidia.com/gpu label.
    """
    if toleration is None:
        # Build the default per call: a default-argument V1Toleration would
        # be a single mutable instance shared by every call site.
        toleration = V1Toleration(effect='NoSchedule',
                                  key='nvidia.com/gpu',
                                  operator='Equal',
                                  value='true')

    def _set_toleration(task):
        # NOTE: unlike similar helpers, this intentionally does not return
        # the task (preserved from the original behavior).
        task.add_toleration(toleration)

    return _set_toleration
def train_eval_epic(owner, project, experiment, model, git_rev, pretrained_s3,
                    mode, train_additional_args='', eval_additional_args=''):
    """Build and configure the EPIC-Kitchens training op for the pipeline.

    Loads the train component, sizes its container for a single-GPU p3
    instance, and attaches the tensorboard, dataset, and shared-memory
    volumes.
    """
    train_env = {}
    train_num_gpus = 1
    train_op = components.load_component_from_file('components/train.yaml')(
        owner=owner,
        project=project,
        experiment=experiment,
        model=model,
        git_rev=git_rev,
        pretrained_s3=pretrained_s3,
        mode=mode,
        additional_args=train_additional_args)

    container = train_op.container
    container.set_memory_request('56Gi')
    container.set_memory_limit('56Gi')
    container.set_cpu_request('7.5')
    container.set_cpu_limit('7.5')
    container.set_gpu_limit(str(train_num_gpus))
    container.add_volume_mount(
        V1VolumeMount(name='tensorboard', mount_path='/shared/tensorboard'))
    container.add_volume_mount(
        V1VolumeMount(name='data', mount_path='/data/'))
    container.add_volume_mount(
        V1VolumeMount(name='shm', mount_path='/dev/shm'))

    op = add_env(add_ssh_volume(train_op), train_env)
    op.add_toleration(
        V1Toleration(key='nvidia.com/gpu', operator='Exists',
                     effect='NoSchedule'))
    op.add_node_selector_constraint('beta.kubernetes.io/instance-type',
                                    f'p3.{2*train_num_gpus}xlarge')
    op.add_volume(
        V1Volume(name='tensorboard',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'tensorboard-research-kf')))
    op.add_volume(
        V1Volume(name='data',
                 persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                     'dataset-epic-kitchen')))
    # A host-path mount of /dev/shm was considered previously; an in-memory
    # emptyDir is used instead.
    op.add_volume(
        V1Volume(name='shm',
                 empty_dir=V1EmptyDirVolumeSource(medium='Memory')))
def test_tolerations(self):
    """Test a pipeline with a tolerations."""
    toleration = V1Toleration(effect='NoSchedule',
                              key='gpu',
                              operator='Equal',
                              value='run')
    op1 = dsl.ContainerOp(
        name='download',
        image='busybox',
        command=['sh', '-c'],
        arguments=['sleep 10; wget localhost:5678 -O /tmp/results.txt'],
        file_outputs={'downloaded': '/tmp/results.txt'})
    op1.add_toleration(toleration)
    self._test_op_to_template_yaml(op1, file_base_name='tolerations')
def tolerations(preemptible: bool = True) -> List[V1Toleration]:
    """Creates tolerations for a pod spec.

    Args:
        preemptible: tolerate preemptible vm instances

    Returns:
        list of tolerations; empty when preemptible instances are not
        tolerated. (Both paths return a list, never ``None``, so the return
        annotation is narrowed from ``Optional[List[...]]``.)
    """
    if not preemptible:
        return []
    return [
        V1Toleration(key=k.NODE_SELECTOR_PREEMPTIBLE,
                     operator='Equal',
                     value='true',
                     effect='NoSchedule')
    ]
def start_fat_pod(self, node_taint_key, node_taint_value):
    """ Start fat pod """
    core_api = kubernetes.client.CoreV1Api(_build_client())

    # Requests equal limits: the pod claims exactly FAT_POD_CPU/FAT_POD_MEMORY.
    pod_resources = V1ResourceRequirements(
        limits={'cpu': FAT_POD_CPU, 'memory': FAT_POD_MEMORY},
        requests={'cpu': FAT_POD_CPU, 'memory': FAT_POD_MEMORY})
    fat_container = V1Container(name=FAT_POD_NAME,
                                image=FAT_POD_IMAGE,
                                resources=pod_resources,
                                command=["echo"],
                                args=["I am a fat :("])
    # Tolerate the given node taint so the pod can land on the tainted node.
    taint_toleration = V1Toleration(key=node_taint_key,
                                    operator="Equal",
                                    value=node_taint_value,
                                    effect="NoSchedule")
    pod_metadata = V1ObjectMeta(
        name=FAT_POD_NAME,
        # Keep the istio sidecar out of this throwaway pod.
        annotations={"sidecar.istio.io/inject": "false"})
    pod_spec = V1PodSpec(restart_policy='Never',
                         priority=0,
                         tolerations=[taint_toleration],
                         containers=[fat_container])
    pod = V1Pod(api_version='v1',
                kind='Pod',
                metadata=pod_metadata,
                spec=pod_spec)

    core_api.create_namespaced_pod(self._namespace, pod)
# Load custom components ####################################### ################### # Train Op comp_train_fname = op.join('components', 'od_train', 'component.yaml') train_component = components.load_component(filename=comp_train_fname) ################### # Export Op comp_export_fname = op.join('components', 'od_export', 'component.yaml') export_component = components.load_component(filename=comp_export_fname) ######################################## # Define a toleration to a ML node taint ml_tol = V1Toleration(effect='NoSchedule', key='mlUseOnly', operator='Equal', value='true') ml_tol2 = V1Toleration(effect='NoSchedule', key='nvidia.com/gpu', operator='Equal', value='present') @dsl.pipeline(name='OD API training/export', description='A pipeline to train/export an instance segmentation model.') def divot_detect_pipeline( pipeline_config_path, model_dir, eval_dir, inference_output_directory, checkpoint_every_n=5000, num_train_steps=200000, sample_1_of_n_eval_examples=10, inference_input_type='encoded_image_string_tensor', eval_checkpoint_metric='loss', metric_objective_type='min'):
####################################### ################### # Train Op comp_train_fname = op.join('components', 'od_train', 'component.yaml') train_component = components.load_component(filename=comp_train_fname) ################### # Export Op comp_export_fname = op.join('components', 'od_export', 'component.yaml') export_component = components.load_component(filename=comp_export_fname) ######################################## # Define a toleration to a ML node taint ml_tol = V1Toleration(effect='NoSchedule', key='mlUseOnly', operator='Equal', value='true') @dsl.pipeline( name='OD API training/export', description='A pipeline to train/export an instance segmentation model.') def divot_detect_pipeline(pipeline_config_path, model_dir, eval_dir, inference_output_directory, num_train_steps=200000, sample_1_of_n_eval_examples=10, inference_input_type='encoded_image_string_tensor', eval_checkpoint_metric='loss', metric_objective_type='min'):