def run_test(args):
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(args.spec)

  loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

  if not args.image_tag:
    raise ValueError("--image_tag must be provided.")
  logging.info("Loading spec from %s with image_tag=%s", args.spec,
               args.image_tag)
  spec_contents = jinja2.Environment(loader=loader).get_template(
      os.path.basename(args.spec)).render(image_tag=args.image_tag)

  spec = yaml.load(spec_contents)

  # Make the job name unique.
  spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]
  try:
    start = time.time()
    api_response = tf_job_client.create_tf_job(api_client, spec)
    namespace = api_response["metadata"]["namespace"]
    name = api_response["metadata"]["name"]
    logging.info("Created job %s in namespace %s", name, namespace)
    results = tf_job_client.wait_for_job(
        api_client, namespace, name, status_callback=tf_job_client.log_status)

    if results["status"]["state"] != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
          name, namespace, results["status"]["state"])

    # TODO(jlewi): Here are some validation checks to run:
    # 1. Check tensorboard is created if it's part of the job spec.
    # 2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
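
# Illustrative only: a minimal sketch of how run_test() above could be wired
# to a CLI. The flag names below are inferred from the attributes run_test()
# reads off `args` (project, cluster, zone, spec, image_tag, junit_path) and
# may not match the real test runner; treat this as an assumption, not the
# original module's entry point.
def _example_parse_test_args(argv=None):
  import argparse  # Local import so the sketch is self-contained.
  parser = argparse.ArgumentParser(description='Run a TFJob E2E test (sketch).')
  parser.add_argument('--project', required=True, help='GCP project.')
  parser.add_argument('--cluster', required=True, help='GKE cluster name.')
  parser.add_argument('--zone', required=True, help='GKE cluster zone.')
  parser.add_argument('--spec', required=True,
                      help='Path to the TFJob spec (a Jinja2 template).')
  parser.add_argument('--image_tag', required=True,
                      help='Image tag rendered into the spec.')
  parser.add_argument('--junit_path', default='',
                      help='Optional path for the JUnit XML report.')
  return parser.parse_args(argv)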

def main(argv=None):
  parser = argparse.ArgumentParser(description='Kubeflow TFJob launcher')
  parser.add_argument(
      '--container-image',
      type=str,
      help='Container image to run using KubeFlow TFJob. '
      'The command line should be added after --.')
  parser.add_argument('--workers', type=int, default=0)
  parser.add_argument('--pss', type=int, default=0)
  parser.add_argument(
      '--cluster',
      type=str,
      help='GKE cluster set up for kubeflow. If set, zone must be provided. '
      'If not set, it is assumed this runs in a GKE container and the '
      'current cluster is used.')
  parser.add_argument('--zone', type=str, help='Zone of the kubeflow cluster.')
  parser.add_argument(
      '--kfversion',
      type=str,
      default='v1alpha2',
      help='The version of the deployed kubeflow. '
      'If not set, the default version is v1alpha2.')
  parser.add_argument(
      '--tfjob-ns',
      type=str,
      default='default',
      help='The namespace where the tfjob is submitted. '
      'If not set, the default namespace is default.')
  parser.add_argument(
      '--tfjob-timeout-minutes',
      type=int,
      default=10,
      help='Time in minutes to wait for the TFJob to complete.')
  parser.add_argument('--output-dir', type=str)
  parser.add_argument('--ui-metadata-type', type=str, default='tensorboard')

  import sys
  all_args = sys.argv[1:]
  separator_idx = all_args.index('--')
  launcher_args = all_args[:separator_idx]
  remaining_args = all_args[separator_idx + 1:]

  args = parser.parse_args(launcher_args)

  logging.getLogger().setLevel(logging.INFO)
  args_dict = vars(args)
  if args.cluster and args.zone:
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
  else:
    # Get cluster name and zone from metadata.
    metadata_server = "http://metadata/computeMetadata/v1/instance/"
    metadata_flavor = {'Metadata-Flavor': 'Google'}
    cluster = requests.get(
        metadata_server + "attributes/cluster-name",
        headers=metadata_flavor).text
    zone = requests.get(
        metadata_server + "zone",
        headers=metadata_flavor).text.split('/')[-1]

  logging.info('Getting credentials for GKE cluster %s.' % cluster)
  subprocess.call([
      'gcloud', 'container', 'clusters', 'get-credentials', cluster, '--zone',
      zone
  ])

  workers = args_dict.pop('workers')
  pss = args_dict.pop('pss')
  kf_version = args_dict.pop('kfversion')
  tfjob_ns = args_dict.pop('tfjob_ns')
  tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
  trainer_image = args.container_image or os.environ['TRAINER_IMAGE_NAME']
  command = remaining_args

  logging.info('Generating training template.')
  template_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), 'train.template.yaml')
  content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss,
                                      trainer_image, command)

  logging.info('Start training.')
  # Set up handler for k8s clients.
  config.load_incluster_config()
  api_client = k8s_client.ApiClient()
  create_response = tf_job_client.create_tf_job(
      api_client, content_yaml, version=kf_version)
  job_name = create_response['metadata']['name']

  if args.output_dir:
    # Create metadata.json file for visualization.
    metadata = {
        'outputs': [{
            'type': args.ui_metadata_type,
            'source': args.output_dir,
        }]
    }
    with open('/mlpipeline-ui-metadata.json', 'w') as f:
      json.dump(metadata, f)

  wait_response = tf_job_client.wait_for_job(
      api_client,
      tfjob_ns,
      job_name,
      kf_version,
      timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  succ = True
  # TODO: update this failure checking after tf-operator has the condition
  # checking function.
  if 'Worker' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['Worker']:
      logging.error('Training failed since workers failed.')
      succ = False
  if 'PS' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['PS']:
      logging.error('Training failed since PSs failed.')
      succ = False
  if 'MASTER' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['MASTER']:
      logging.error('Training failed since MASTER failed.')
      succ = False

  # TODO: remove this after kubeflow fixes the wait_for_job issue,
  # because wait_for_job returns when the worker finishes but the master
  # might not be complete yet.
  if ('MASTER' in wait_response['status']['tfReplicaStatuses'] and
      'active' in wait_response['status']['tfReplicaStatuses']['MASTER']):
    master_active = True
    while master_active:
      # Wait for the master to finish.
      time.sleep(2)
      wait_response = tf_job_client.wait_for_job(
          api_client,
          tfjob_ns,
          job_name,
          kf_version,
          timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
      if 'active' not in wait_response['status']['tfReplicaStatuses']['MASTER']:
        master_active = False

  if succ:
    logging.info('Training success.')

  tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
  with open('/output.txt', 'w') as f:
    f.write(args.output_dir)
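
# Illustrative only (an assumption, not taken from the tf-operator API docs):
# the failure checks in main() above index into a v1alpha2-style status
# payload shaped roughly like the dict below. The replica names ('MASTER',
# 'Worker', 'PS') and counter keys ('active', 'Failed') mirror exactly what
# the code reads; real payloads depend on the deployed tf-operator version.
_EXAMPLE_TF_REPLICA_STATUSES = {
    'MASTER': {'active': 1},  # main() keeps polling until 'active' disappears here.
    'Worker': {'Failed': 1},  # A 'Failed' key marks the whole run as failed.
    'PS': {'active': 2},
}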

def main(argv=None):
  parser = argparse.ArgumentParser(description='ML Trainer')
  parser.add_argument(
      '--working-dir', help='Training job working directory.', required=True)
  parser.add_argument(
      '--train-files-dir', help='Path to training data.', required=True)
  parser.add_argument(
      '--train-files-prefix',
      help='The prefix of the training input files.',
      required=True)
  parser.add_argument(
      '--tf-transform-dir',
      help='Tf-transform directory with model from preprocessing step.',
      required=True)
  parser.add_argument(
      '--output-dir',
      help='Directory under which the serving model (under /serving_model_dir) '
      'and the tf-model-analysis model (under /eval_model_dir) will be written.',
      required=True)
  parser.add_argument(
      '--eval-files-dir', help='Path to evaluation data.', required=True)
  parser.add_argument(
      '--eval-files-prefix',
      help='The prefix of the eval input files.',
      required=True)

  # Training arguments
  parser.add_argument(
      '--job-dir',
      help='GCS location to write checkpoints and export models.',
      required=True)

  # Argument to turn on all logging
  parser.add_argument(
      '--verbosity',
      choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
      default='INFO',
  )

  # Experiment arguments
  parser.add_argument(
      '--train-steps',
      help='Count of steps to run the training job for.',
      required=True,
      type=int)
  parser.add_argument(
      '--eval-steps',
      help='Number of steps to run evaluation for at each checkpoint.',
      default=100,
      type=int)

  parser.add_argument('--workers', type=int, default=0)
  parser.add_argument('--pss', type=int, default=0)
  parser.add_argument(
      '--cluster',
      type=str,
      help='GKE cluster set up for kubeflow. If set, zone must be provided. '
      'If not set, it is assumed this runs in a GKE container and the '
      'current cluster is used.')
  parser.add_argument('--zone', type=str, help='Zone of the kubeflow cluster.')
  parser.add_argument(
      '--kfversion',
      type=str,
      default='v1alpha2',
      help='The version of the deployed kubeflow. '
      'If not set, the default version is v1alpha2.')
  parser.add_argument(
      '--tfjob-ns',
      type=str,
      default='kubeflow',
      help='The namespace where the tfjob is submitted. '
      'If not set, the namespace is kubeflow.')
  parser.add_argument(
      '--tfjob-timeout-minutes',
      type=int,
      default=10,
      help='Time in minutes to wait for the TFJob to complete.')
  args = parser.parse_args()

  # KUBEFLOW_NAMESPACE = 'default'

  logging.getLogger().setLevel(logging.INFO)
  args_dict = vars(args)
  if args.cluster and args.zone:
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
  else:
    # Get cluster name and zone from metadata.
    metadata_server = "http://metadata/computeMetadata/v1/instance/"
    metadata_flavor = {'Metadata-Flavor': 'Google'}
    cluster = requests.get(
        metadata_server + "attributes/cluster-name",
        headers=metadata_flavor).text
    zone = requests.get(
        metadata_server + "zone",
        headers=metadata_flavor).text.split('/')[-1]

  logging.info('Getting credentials for GKE cluster %s.' % cluster)
  subprocess.call([
      'gcloud', 'container', 'clusters', 'get-credentials', cluster, '--zone',
      zone
  ])

  # Create metadata.json file for visualization.
  tb_dir = args_dict.pop('working_dir')  # Don't pass this arg to the training module.
  metadata = {
      'outputs': [{
          'type': 'tensorboard',
          'source': tb_dir,
      }]
  }
  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)

  workers = args_dict.pop('workers')
  pss = args_dict.pop('pss')
  kf_version = args_dict.pop('kfversion')
  tfjob_ns = args_dict.pop('tfjob_ns')
  tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
  args_list = [
      '--%s=%s' % (k.replace('_', '-'), v)
      for k, v in six.iteritems(args_dict) if v is not None
  ]

  logging.info('Generating training template.')
  template_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), 'train.template.yaml')
  content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss,
                                      args_list)

  logging.info('Start training.')
  # Set up handler for k8s clients.
  config.load_incluster_config()
  api_client = k8s_client.ApiClient()
  create_response = tf_job_client.create_tf_job(
      api_client, content_yaml, version=kf_version)
  job_name = create_response['metadata']['name']

  wait_response = tf_job_client.wait_for_job(
      api_client,
      tfjob_ns,
      job_name,
      kf_version,
      timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  succ = True
  # TODO: update this failure checking after tf-operator has the condition
  # checking function.
  if 'Worker' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['Worker']:
      logging.error('Training failed since workers failed.')
      succ = False
  if 'PS' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['PS']:
      logging.error('Training failed since PSs failed.')
      succ = False
  if 'MASTER' in wait_response['status']['tfReplicaStatuses']:
    if 'Failed' in wait_response['status']['tfReplicaStatuses']['MASTER']:
      logging.error('Training failed since MASTER failed.')
      succ = False

  # TODO: remove this after kubeflow fixes the wait_for_job issue,
  # because wait_for_job returns when the worker finishes but the master
  # might not be complete yet.
  if ('MASTER' in wait_response['status']['tfReplicaStatuses'] and
      'active' in wait_response['status']['tfReplicaStatuses']['MASTER']):
    master_active = True
    while master_active:
      # Wait for the master to finish.
      time.sleep(2)
      wait_response = tf_job_client.wait_for_job(
          api_client,
          tfjob_ns,
          job_name,
          kf_version,
          timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
      if 'active' not in wait_response['status']['tfReplicaStatuses']['MASTER']:
        master_active = False

  if succ:
    logging.info('Training success.')

  tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
  with open('/output.txt', 'w') as f:
    f.write(args.job_dir)
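
# Illustrative only: a hypothetical helper (not in the original file) that
# mirrors how main() above rebuilds CLI flags for the training module from
# the leftover parsed arguments.
def _example_args_to_flags(args_dict):
  """E.g. {'train_steps': 10000, 'verbosity': 'INFO'}
  -> ['--train-steps=10000', '--verbosity=INFO']."""
  return [
      '--%s=%s' % (k.replace('_', '-'), v)
      for k, v in six.iteritems(args_dict) if v is not None
  ]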

def main(argv=None):
  parser = argparse.ArgumentParser(description='ML Trainer')
  parser.add_argument(
      '--working-dir', help='Training job working directory.', required=True)
  parser.add_argument(
      '--train-files-dir', help='Path to training data.', required=True)
  parser.add_argument(
      '--train-files-prefix',
      help='The prefix of the training input files.',
      required=True)
  parser.add_argument(
      '--tf-transform-dir',
      help='Tf-transform directory with model from preprocessing step.',
      required=True)
  parser.add_argument(
      '--output-dir',
      help='Directory under which the serving model (under /serving_model_dir) '
      'and the tf-model-analysis model (under /eval_model_dir) will be written.',
      required=True)
  parser.add_argument(
      '--eval-files-dir', help='Path to evaluation data.', required=True)
  parser.add_argument(
      '--eval-files-prefix',
      help='The prefix of the eval input files.',
      required=True)

  # Training arguments
  parser.add_argument(
      '--job-dir',
      help='GCS location to write checkpoints and export models.',
      required=True)

  # Argument to turn on all logging
  parser.add_argument(
      '--verbosity',
      choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
      default='INFO',
  )

  # Experiment arguments
  parser.add_argument(
      '--train-steps',
      help='Count of steps to run the training job for.',
      required=True,
      type=int)
  parser.add_argument(
      '--eval-steps',
      help='Number of steps to run evaluation for at each checkpoint.',
      default=100,
      type=int)

  parser.add_argument('--workers', type=int, default=0)
  parser.add_argument('--pss', type=int, default=0)
  parser.add_argument(
      '--cluster',
      type=str,
      help='GKE cluster set up for kubeflow. If set, zone must be provided. '
      'If not set, it is assumed this runs in a GKE container and the '
      'current cluster is used.')
  parser.add_argument('--zone', type=str, help='Zone of the kubeflow cluster.')
  parser.add_argument(
      '--kfversion',
      type=str,
      default='v1beta1',
      help='The version of the deployed kubeflow. '
      'If not set, the default version is v1beta1.')
  parser.add_argument(
      '--tfjob-ns',
      type=str,
      default='kubeflow',
      help='The namespace where the tfjob is submitted. '
      'If not set, the namespace is kubeflow.')
  parser.add_argument(
      '--tfjob-timeout-minutes',
      type=int,
      default=20,
      help='Time in minutes to wait for the TFJob to complete.')
  args = parser.parse_args()

  logging.getLogger().setLevel(logging.INFO)
  args_dict = vars(args)
  if args.cluster and args.zone:
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
  else:
    # Get cluster name and zone from metadata.
    metadata_server = "http://metadata/computeMetadata/v1/instance/"
    metadata_flavor = {'Metadata-Flavor': 'Google'}
    cluster = requests.get(
        metadata_server + "attributes/cluster-name",
        headers=metadata_flavor).text
    zone = requests.get(
        metadata_server + "zone",
        headers=metadata_flavor).text.split('/')[-1]

  # logging.info('Getting credentials for GKE cluster %s.' % cluster)
  # subprocess.call(['gcloud', 'container', 'clusters', 'get-credentials', cluster,
  #                  '--zone', zone])

  # Create metadata.json file for visualization.
  tb_dir = args_dict.pop('working_dir')  # Don't pass this arg to the training module.
  metadata = {
      'outputs': [{
          'type': 'tensorboard',
          'source': tb_dir,
      }]
  }
  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)

  workers = args_dict.pop('workers')
  pss = args_dict.pop('pss')
  kf_version = args_dict.pop('kfversion')
  tfjob_ns = args_dict.pop('tfjob_ns')
  tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
  args_list = [
      '--%s=%s' % (k.replace('_', '-'), v)
      for k, v in six.iteritems(args_dict) if v is not None
  ]

  logging.info('Generating training template.')
  template_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), 'train.template.yaml')
  content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss,
                                      args_list)

  logging.info('Start training.')
  # Set up handler for k8s clients.
  config.load_incluster_config()
  api_client = k8s_client.ApiClient()
  create_response = tf_job_client.create_tf_job(
      api_client, content_yaml, version=kf_version)
  job_name = create_response['metadata']['name']

  wait_response = tf_job_client.wait_for_job(
      api_client,
      tfjob_ns,
      job_name,
      kf_version,
      timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  succ = True
  # TODO: update this failure checking after tf-operator has the condition
  # checking function.
  if 'Worker' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['Worker']:
      logging.error('Training failed since workers failed.')
      succ = False
  if 'PS' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['PS']:
      logging.error('Training failed since PSs failed.')
      succ = False
  if 'Master' in wait_response['status']['replicaStatuses']:
    if 'Failed' in wait_response['status']['replicaStatuses']['Master']:
      logging.error('Training failed since Master failed.')
      succ = False

  # TODO: remove this after kubeflow fixes the wait_for_job issue,
  # because wait_for_job returns when the worker finishes but the master
  # might not be complete yet.
  # if ('Master' in wait_response['status']['replicaStatuses'] and
  #     'active' in wait_response['status']['replicaStatuses']['Master']):
  #   master_active = True
  #   while master_active:
  #     # Wait for the master to finish.
  #     time.sleep(2)
  #     wait_response = tf_job_client.wait_for_job(
  #         api_client, tfjob_ns, job_name, kf_version,
  #         timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
  #     if 'active' not in wait_response['status']['replicaStatuses']['Master']:
  #       master_active = False

  if succ:
    logging.info('Training success.')

  tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
  with open('/output.txt', 'w') as f:
    f.write(args.job_dir)
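
# Entry-point sketch (assumed; the module's actual __main__ guard is not shown
# in this excerpt).
if __name__ == '__main__':
  main()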