def gh_summ(  #pylint: disable=unused-argument
  train_steps: 'Integer' = 2019300,
  project: str = 'YOUR_PROJECT_HERE',
  github_token: str = 'YOUR_GITHUB_TOKEN_HERE',
  working_dir: 'GCSPath' = 'gs://YOUR_GCS_DIR_HERE',
  checkpoint_dir: 'GCSPath' = 'gs://aju-dev-demos-codelabs/kubecon/model_output_tbase.bak2019000/',
  deploy_webapp: str = 'true',
  data_dir: 'GCSPath' = 'gs://aju-dev-demos-codelabs/kubecon/t2t_data_gh_all/'
  ):
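  # Note: the string annotations ('Integer', 'GCSPath') appear to be kfp v1
  # pipeline-parameter type names recorded by the compiler, not Python type hints.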


  copydata = copydata_op(
    data_dir=data_dir,
    checkpoint_dir=checkpoint_dir,
    model_dir='%s/%s/model_output' % (working_dir, dsl.RUN_ID_PLACEHOLDER),
    action=COPY_ACTION,
    )


  train = train_op(
    data_dir=data_dir,
    model_dir=copydata.outputs['copy_output_path'],
    action=TRAIN_ACTION, train_steps=train_steps,
    deploy_webapp=deploy_webapp
    )

  serve = dsl.ContainerOp(
      name='serve',
      image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve:v5',
      arguments=["--model_name", 'ghsumm-%s' % (dsl.RUN_ID_PLACEHOLDER,),
          "--model_path", train.outputs['train_output_path']
          ]
      )

  # Request one GPU for training, schedule on a preemptible node pool, and
  # retry up to 10 times if the node is preempted.
  train.set_gpu_limit(1).apply(gcp.use_preemptible_nodepool()).set_retry(10)


  with dsl.Condition(train.outputs['launch_server'] == 'true'):
    webapp = dsl.ContainerOp(
        name='webapp',
        image='gcr.io/google-samples/ml-pipeline-webapp-launcher:v7ap',
        arguments=["--model_name", 'ghsumm-%s' % (dsl.RUN_ID_PLACEHOLDER,),
            "--github_token", github_token]
        )
    webapp.after(serve)
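
# Hedged sketch (not part of the example above): with the kfp v1 SDK, a
# pipeline function such as gh_summ normally carries a @dsl.pipeline(name=...,
# description=...) decorator and is compiled into an Argo workflow package, or
# submitted directly from Python. The package filename, pipeline arguments, and
# experiment name below are illustrative assumptions.
import kfp
import kfp.compiler as compiler

if __name__ == '__main__':
    # Compile the pipeline function into a package that can be uploaded to a
    # Kubeflow Pipelines cluster.
    compiler.Compiler().compile(gh_summ, 'gh_summ.pipeline.tar.gz')

    # Or submit a run directly (assumes a reachable, configured KFP endpoint).
    kfp.Client().create_run_from_pipeline_func(
        gh_summ,
        arguments={
            'project': 'my-gcp-project',              # illustrative
            'working_dir': 'gs://my-bucket/gh-summ',  # illustrative
        },
        experiment_name='gh-summ-demo')
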
def gh_summ(  #pylint: disable=unused-argument
        train_steps=2019300,
        project='YOUR_PROJECT_HERE',
        github_token='YOUR_GITHUB_TOKEN_HERE',
        working_dir='YOUR_GCS_DIR_HERE',
        checkpoint_dir='gs://aju-dev-demos-codelabs/kubecon/model_output_tbase.bak2019000',
        deploy_webapp='true',
        data_dir='gs://aju-dev-demos-codelabs/kubecon/t2t_data_gh_all/'):

    copydata = copydata_op(
        working_dir=working_dir,
        data_dir=data_dir,
        checkpoint_dir=checkpoint_dir,
        model_dir='%s/%s/model_output' % (working_dir, '{{workflow.name}}'),
        action=COPY_ACTION).apply(gcp.use_gcp_secret('user-gcp-sa'))

    log_dataset = metadata_log_op(log_type=DATASET,
                                  workspace_name=WORKSPACE_NAME,
                                  run_name='{{workflow.name}}',
                                  data_uri=data_dir)

    train = train_op(
        working_dir=working_dir,
        data_dir=data_dir,
        checkpoint_dir=checkpoint_dir,
        model_dir='%s/%s/model_output' % (working_dir, '{{workflow.name}}'),
        action=TRAIN_ACTION,
        train_steps=train_steps,
        deploy_webapp=deploy_webapp).apply(gcp.use_gcp_secret('user-gcp-sa'))

    log_model = metadata_log_op(log_type=MODEL,
                                workspace_name=WORKSPACE_NAME,
                                run_name='{{workflow.name}}',
                                model_uri='%s/%s/model_output' %
                                (working_dir, '{{workflow.name}}'))

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve',
        arguments=[
            "--model_name",
            'ghsumm-%s' % ('{{workflow.name}}', ), "--model_path",
            '%s/%s/model_output/export' % (working_dir, '{{workflow.name}}')
        ])
    log_dataset.after(copydata)
    train.after(copydata)
    log_model.after(train)
    serve.after(train)
    train.set_gpu_limit(4).apply(gcp.use_preemptible_nodepool()).set_retry(5)
    train.set_memory_limit('48G')

    with dsl.Condition(train.output == 'true'):
        webapp = dsl.ContainerOp(
            name='webapp',
            image='gcr.io/google-samples/ml-pipeline-webapp-launcher:v2ap',
            arguments=[
                "--model_name",
                'ghsumm-%s' % ('{{workflow.name}}', ), "--github_token",
                github_token
            ])
        webapp.after(serve)
def flipcoin():
    flip = FlipCoinOp().apply(gcp.use_preemptible_nodepool()).set_gpu_limit(
        1, 'nvidia').set_retry(5)
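
# Hedged sketch (not part of the example above): FlipCoinOp is assumed to be
# defined elsewhere. The upstream Kubeflow Pipelines flip-coin sample defines
# it roughly like this; the image and output path are illustrative.
import kfp.dsl as dsl

class FlipCoinOp(dsl.ContainerOp):
    """Flip a coin and output 'heads' or 'tails' as the op result."""

    def __init__(self):
        super(FlipCoinOp, self).__init__(
            name='flip',
            image='python:alpine3.6',
            command=['sh', '-c'],
            arguments=['python -c "import random; '
                       'print(\'heads\' if random.randint(0, 1) == 0 else \'tails\')" '
                       '| tee /tmp/output'],
            file_outputs={'output': '/tmp/output'})

# The result can then gate downstream steps, e.g.:
#   with dsl.Condition(flip.output == 'heads'):
#       ...
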
def baseline_repro_pipeline(
    data_bucket: str = 'voxsrc-2020-voxceleb-v4',
    test_list: str = 'vox1_full.txt',
    # @note test_utterances_list is in the same format as train_list, but for
    #       the test data, whereas test_list contains the utterance pairs used
    #       for evaluation.
    test_utterances_list: str = 'vox1_full_utterances.txt',
    train_list: str = 'vox2_full.txt',
    test_path: str = 'vox1_full.tar.gz',
    train_path: str = 'vox2_full.tar.gz',
    checkpoint_bucket: str = 'voxsrc-2020-checkpoints',
    batch_size: int = 750,
    max_epoch: int = 21,
    n_speakers: int = 2,
    test_interval: int = 3,
    feature_extraction_threads: int = 16,
    data_loader_threads: int = 7,
    # @note This run ID contains "full" pre-extracted features for vox1 and vox2
    reuse_run_with_id: str = "milo_webster-19rvuxfu",
    gaussian_noise_std: float = .9,
):
    # Set prod_hw=True to run training on production hardware (a preemptible
    # V100 node pool). This is kept as a compile-time constant because odd
    # issues were encountered when node resource constraints aren't known at
    # "compile time" of the Kubeflow pipeline file.
    prod_hw = True
    run_id = '{{workflow.uid}}'

    feature_extraction_task = feature_extraction_op(
        data_bucket=data_bucket,
        test_utterances_list=test_utterances_list,
        train_list=train_list,
        test_path=test_path,
        train_path=train_path,
        run_id=run_id,
        num_threads=feature_extraction_threads,
        reuse_run_with_id=reuse_run_with_id)

    # default feature extractor to high-perf pool if not in pass-through mode
    # if in pass-through mode, there's no reason to use a beefy node
    if not reuse_run_with_id:
        feature_extraction_task.set_cpu_request("9").set_cpu_limit("16")

    train_task = train_op(
        data_bucket=data_bucket,
        test_list=test_list,
        train_list=train_list,
        test_path=feature_extraction_task.outputs['test_feats_tar_path'],
        train_path=feature_extraction_task.outputs['train_feats_tar_path'],
        batch_size=batch_size,
        max_epoch=max_epoch,
        checkpoint_bucket=checkpoint_bucket,
        run_id=run_id,
        n_speakers=n_speakers,
        test_interval=test_interval,
        gaussian_noise_std=gaussian_noise_std,
        n_data_loader_thread=data_loader_threads,
    )

    train_task.add_pvolumes({'/dev/shm': ipc_shared_mem_volume})
    train_task.after(feature_extraction_task)
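    # @note ipc_shared_mem_volume is assumed to be defined at module scope; one
    #       way such a /dev/shm-backed volume could be built (a sketch, assuming
    #       the kfp v1 SDK and the kubernetes client k8s_client used below):
    #       ipc_shared_mem_volume = dsl.PipelineVolume(
    #           volume=k8s_client.V1Volume(
    #               name='ipc-shared-mem',
    #               empty_dir=k8s_client.V1EmptyDirVolumeSource(
    #                   medium='Memory')))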

    # add Weights & Biases credentials
    if "WANDB_API_KEY" in os.environ:
        train_task.add_env_variable(
            k8s_client.V1EnvVar(name='WANDB_API_KEY',
                                value=os.environ["WANDB_API_KEY"]))
    else:
        raise RuntimeError('No WandB API key set in environment')

    # @note These resource requests autoscale an autoscalable node pool from
    #       0->1 that matches the corresponding config. Autoscaled nodes will be
    #       deactivated on GCP after 10 minutes of inactivity
    if prod_hw:
        # require training to run on a preemptible node pool
        train_task\
            .apply(gcp.use_preemptible_nodepool(hard_constraint=True))\
            .set_retry(5)
        # require training to run on a node with a gpu of type 'train_gpu_type'
        train_task\
            .set_gpu_limit(1)\
            .add_node_selector_constraint('cloud.google.com/gke-accelerator',
                    'nvidia-tesla-v100')