Example #1
0
def wait_training_finish(timeout: int, wait: bool, mt_id: str,
                         mt_client: ModelTrainingClient):
    """
    Block until the training reaches a final state, streaming its logs meanwhile

    Returns immediately when ``wait`` is false. Otherwise the training is
    polled through ``mt_client``; while it is neither succeeded nor failed,
    its logs are streamed through a second client built from ``mt_client``.

    :param timeout: maximum number of seconds to wait; must be positive
    :param wait: when false, return at once without calling the API
    :param mt_id: Model Training name
    :param mt_client: Model Training Client
    :raises Exception: if ``timeout`` is not positive, if the time budget is
        exceeded, or if the training ends up in the failed state
    """
    if not wait:
        return

    started_at = time.time()
    if timeout <= 0:
        raise Exception(
            'Invalid --timeout argument: should be positive integer')

    # Logs are read through a dedicated client so that its timeout settings
    # can differ from the ones used for regular API calls
    streaming_client = ModelTrainingClient.construct_from_other(mt_client)
    streaming_client.timeout = (mt_client.timeout, LOG_READ_TIMEOUT_SECONDS)

    click.echo("Logs streaming...")

    while True:
        if time.time() - started_at > timeout:
            raise Exception(TIMEOUT_ERROR_MESSAGE)

        try:
            training = mt_client.get(mt_id)
            state = training.status.state

            if state == TRAINING_SUCCESS_STATE:
                duration = round(time.time() - started_at)
                click.echo(
                    f'Model {mt_id} was trained. Training took {duration} seconds'
                )
                return

            if state == TRAINING_FAILED_STATE:
                # Not one of the exception types handled below, so it
                # propagates to the caller
                raise Exception(f'Model training {mt_id} was failed.')

            if state == "":
                click.echo(
                    f"Can't determine the state of {training.id}. Sleeping...")
            else:
                for message in streaming_client.log(training.id, follow=True):
                    print_logs(message)

        except (WrongHttpStatusCode, HTTPException, RequestException,
                APIConnectionException) as error:
            # Transport/API errors are only logged; the loop retries after the pause
            LOGGER.info(
                'Callback have not confirmed completion of the operation. Exception: %s',
                str(error))

        LOGGER.debug('Sleep before next request')
        time.sleep(DEFAULT_WAIT_TIMEOUT)
Example #2
0
def logs(client: ModelTrainingClient, train_id: str, file: str, follow: bool):
    """
    \b
    Stream training logs.
    For this command, you must provide a training ID or path to file with one training.
    The file must contain only one training.
    \b
    Usage example:
        * odahuflowctl train logs --id examples-git
        * odahuflowctl train logs -f train.yaml
    \f
    :param follow: Follow logs stream
    :param client: Model training HTTP client
    :param train_id: Model training ID
    :param file: Path to the file with only one training
    :raises ValueError: if the file holds a resource that is not a ModelTraining
    """
    check_id_or_file_params_present(train_id, file)

    if file:
        # The ID is taken from the manifest when a file is given
        train = parse_resources_file_with_one_item(file).resource
        if not isinstance(train, ModelTraining):
            raise ValueError(
                f'Model training expected, but {type(train)} provided')

        train_id = train.id

    for msg in client.log(train_id, follow):
        print_logs(msg)
Example #3
0
def edit(client: ModelTrainingClient, train_id: str, file: str, wait: bool,
         timeout: int):
    """
    \b
    Rerun a training.
    You should specify a path to file with a training. The file must contain only one training.
    For now, CLI supports YAML and JSON file formats.
    If you want to update multiple trainings, you should use "odahuflowctl bulk apply" instead.
    If you provide the training id parameter, it will override before sending to API server.
    \b
    Usage example:
        * odahuflowctl train update -f train.yaml --id examples-git
    \f
    :param client: Model training HTTP client
    :param train_id: Model training ID; when set it replaces the ID from the file
    :param file: Path to the file with only one training
    :param wait: forwarded to wait_training_finish; when false no waiting is done
    :param timeout: forwarded to wait_training_finish as the waiting limit in seconds
    :raises ValueError: if the file holds a resource that is not a ModelTraining
    """
    manifest = parse_resources_file_with_one_item(file)
    train = manifest.resource
    if not isinstance(train, ModelTraining):
        raise ValueError(
            f'Model training expected, but {type(train)} provided')

    # An explicitly passed ID takes precedence over the one in the manifest
    if train_id:
        train.id = train_id

    updated = client.edit(train)
    click.echo(f"Rerun training: {updated}")

    wait_training_finish(timeout, wait, updated.id, client)
def build_client(resource: OdahuflowCloudResourceUpdatePair,
                 api_client: RemoteAPIClient) -> typing.Optional[object]:
    """
    Pick and construct the API client that matches the wrapped resource type

    :param resource: update pair whose ``resource`` attribute selects the client
    :param api_client: base API client to extract connection options from
    :return: client constructed from ``api_client`` for the matched type
    :raises InvalidResourceType: if the resource matches none of the known types
    """
    # Pairs are probed in this order and the first isinstance match wins
    client_by_resource_type = (
        (ModelTraining, ModelTrainingClient),
        (ModelDeployment, ModelDeploymentClient),
        (Connection, ConnectionClient),
        (ToolchainIntegration, ToolchainIntegrationClient),
        (ModelRoute, ModelRouteClient),
        (ModelPackaging, ModelPackagingClient),
        (PackagingIntegration, PackagingIntegrationClient),
    )

    target = resource.resource
    for resource_type, client_type in client_by_resource_type:
        if isinstance(target, resource_type):
            return client_type.construct_from_other(api_client)

    raise InvalidResourceType('{!r} is invalid resource '.format(target))
Example #5
0
def get(client: ModelTrainingClient, train_id: str, output_format: str):
    """
    \b
    Get trainings.
    The command without id argument retrieve all trainings.
    \b
    Get all trainings in json format:
        odahuflowctl train get --output-format json
    \b
    Get training with "git-repo" id:
        odahuflowctl train get --id git-repo
    \b
    Using jsonpath:
        odahuflowctl train get -o 'jsonpath=[*].spec.reference'
    \f
    :param client: Model training HTTP client
    :param train_id: Model training ID; when empty every training is fetched
    :param output_format: Output format handed to format_output
    """
    if train_id:
        # A single training is wrapped in a list before formatting
        trains = [client.get(train_id)]
    else:
        trains = client.get_all()

    format_output(trains, output_format)
Example #6
0
def run(client: ModelTrainingClient, train_id: str, manifest_file: List[str],
        manifest_dir: List[str], output_dir: str):
    """
    \b
    Start a training process locally.
    \b
    Usage example:
        * odahuflowctl local train run --id examples-git
    \f
    :param client: Model training HTTP client, used when an entity is missing locally
    :param train_id: ID of the training to run
    :param manifest_file: paths to manifest files to look the entities up in
    :param manifest_dir: paths to directories with manifest files
    :param output_dir: forwarded to start_train
    """
    entities: List[OdahuflowCloudResourceUpdatePair] = []
    for file_path in manifest_file:
        entities.extend(parse_resources_file(file_path).changes)

    for dir_path in manifest_dir:
        entities.extend(parse_resources_dir(dir_path))

    mt: Optional[ModelTraining] = None

    # Collect every toolchain by ID and pick the training with the requested ID
    toolchains: Dict[str, ToolchainIntegration] = {}
    for entity in (pair.resource for pair in entities):
        if isinstance(entity, ToolchainIntegration):
            toolchains[entity.id] = entity
        elif isinstance(entity, ModelTraining) and entity.id == train_id:
            mt = entity

    if not mt:
        click.echo(
            f'{train_id} training not found. Trying to retrieve it from API server'
        )
        mt = client.get(train_id)

    toolchain_id = mt.spec.toolchain
    toolchain = toolchains.get(toolchain_id)
    if not toolchain:
        # Report the requested toolchain ID: the looked-up value is empty here
        click.echo(
            f'{toolchain_id} toolchain not found. Trying to retrieve it from API server'
        )
        toolchain = ToolchainIntegrationClient.construct_from_other(
            client).get(toolchain_id)

    trainer = K8sTrainer(
        model_training=mt,
        toolchain_integration=toolchain,
    )

    start_train(trainer, output_dir)
Example #7
0
def delete(client: ModelTrainingClient, train_id: str, file: str,
           ignore_not_found: bool):
    """
    \b
    Delete a training.
    For this command, you must provide a training ID or path to file with one training.
    The file must contain only one training.
    For now, CLI supports YAML and JSON file formats.
    If you want to delete multiple trainings, you should use "odahuflowctl bulk delete" instead.
    The command will fail if you provide both arguments.
    \b
    Usage example:
        * odahuflowctl train delete --id examples-git
        * odahuflowctl train delete -f train.yaml
    \f
    :param client: Model training HTTP client
    :param train_id: Model training ID
    :param file: Path to the file with only one training
    :param ignore_not_found: when true, a 404 response is reported instead of raised
    :raises ValueError: if the file holds a resource that is not a ModelTraining
    :raises WrongHttpStatusCode: on any API error other than an ignored 404
    """
    check_id_or_file_params_present(train_id, file)

    if file:
        parsed = parse_resources_file_with_one_item(file).resource
        if not isinstance(parsed, ModelTraining):
            raise ValueError(
                f'Model training expected, but {type(parsed)} provided')

        train_id = parsed.id

    try:
        click.echo(client.delete(train_id))
    except WrongHttpStatusCode as e:
        # Only a 404 combined with the ignore flag is tolerated
        tolerated = e.status_code == 404 and ignore_not_found
        if not tolerated:
            raise e

        not_found_text = IGNORE_NOT_FOUND_ERROR_MESSAGE.format(
            kind=ModelTraining.__name__, id=train_id)
        click.echo(not_found_text)
Example #8
0
def create(client: ModelTrainingClient, train_id: str, file: str, wait: bool,
           timeout: int, ignore_if_exists: bool):
    """
    \b
    Create a training.
    You should specify a path to file with a training. The file must contain only one training.
    For now, CLI supports YAML and JSON file formats.
    If you want to create multiple trainings, you should use "odahuflowctl bulk apply" instead.
    If you provide the training id parameter, it will override before sending to API server.
    \b
    Usage example:
        * odahuflowctl train create -f train.yaml --id examples-git
    \f
    :param client: Model training HTTP client
    :param train_id: Model training ID; when set it replaces the ID from the file
    :param file: Path to the file with only one training
    :param wait: forwarded to wait_training_finish; when false no waiting is done
    :param timeout: forwarded to wait_training_finish as the waiting limit in seconds
    :param ignore_if_exists: when true, an already existing training is reported
        and the command returns without raising
    :raises ValueError: if the file holds a resource that is not a ModelTraining
    :raises EntityAlreadyExists: if the training exists and ignore_if_exists is false
    """
    manifest = parse_resources_file_with_one_item(file)
    train = manifest.resource
    if not isinstance(train, ModelTraining):
        raise ValueError(f'ModelTraining expected, but {type(train)} provided')

    # An explicitly passed ID takes precedence over the one in the manifest
    if train_id:
        train.id = train_id

    try:
        created = client.create(train)
    except EntityAlreadyExists as e:
        if not ignore_if_exists:
            raise

        LOGGER.debug(
            f'--ignore-if-exists was passed: {e} will be suppressed')
        click.echo('Training already exists')
        return

    click.echo(f"Start training: {created}")
    wait_training_finish(timeout, wait, created.id, client)
Example #9
0
def training(ctx: click.core.Context, url: str, token: str):
    """
    Local training process.\n
    Alias for the command is train.
    """
    # The client is stored on the context object
    training_client = ModelTrainingClient(url, token)
    ctx.obj = training_client
Example #10
0
def training(ctx: click.core.Context, url: str, token: str):
    """
    Allow you to perform actions on trainings.\n
    Alias for the command is train.
    """
    # The client is stored on the context object
    training_client = ModelTrainingClient(url, token)
    ctx.obj = training_client
Example #11
0
 def training_get_log(train_id):
     """
     Fetch the logs of a training and return them as one newline-joined string

     :param train_id: Model training ID
     :return: every item yielded by the client's log call, joined with newlines
     """
     # The log call is made with follow=False; its items are joined as-is
     log_lines = ModelTrainingClient().log(train_id, follow=False)
     return "\n".join(log_lines)
Example #12
0
 def training_delete(train_id: str):
     """
     Delete a training through a freshly constructed ModelTrainingClient

     :param train_id: Model training ID
     :return: whatever the client's delete call returns
     """
     client = ModelTrainingClient()
     return client.delete(train_id)
Example #13
0
 def training_post(payload_file):
     """
     Create the training described by a single-item resource file

     :param payload_file: path to the file parsed for exactly one resource
     :return: whatever the client's create call returns
     """
     training_resource = parse_resources_file_with_one_item(
         payload_file).resource
     client = ModelTrainingClient()
     return client.create(training_resource)
Example #14
0
 def training_get_id(train_id: str):
     """
     Fetch one training through a freshly constructed ModelTrainingClient

     :param train_id: Model training ID
     :return: whatever the client's get call returns
     """
     client = ModelTrainingClient()
     return client.get(train_id)
Example #15
0
 def training_get():
     """
     Fetch all trainings through a freshly constructed ModelTrainingClient

     :return: whatever the client's get_all call returns
     """
     client = ModelTrainingClient()
     return client.get_all()
Example #16
0
# Test data for the Connection entity: a default-constructed ConnectionClient,
# a sample Connection identified by ENTITY_ID with mock spec values,
# the `connection.connection` object and the 'Connection' label.
# NOTE(review): the last two are presumably the CLI group under test and the
# entity kind name — confirm against the EntityTestData definition.
CONNECTION = EntityTestData(
    ConnectionClient(),
    Connection(
        id=ENTITY_ID,
        spec=ConnectionSpec(
            key_secret="mock-key-secret",
            uri="mock-url",
        ),
    ),
    connection.connection,
    'Connection',
)

# Test data for the ModelTraining entity: a default-constructed
# ModelTrainingClient, a sample ModelTraining identified by ENTITY_ID whose
# status is SUCCEEDED_STATE, the `training.training` object and the
# 'ModelTraining' label.
# NOTE(review): the last two are presumably the CLI group under test and the
# entity kind name — confirm against the EntityTestData definition.
TRAINING = EntityTestData(
    ModelTrainingClient(),
    ModelTraining(
        id=ENTITY_ID,
        spec=ModelTrainingSpec(work_dir="/1/2/3",
                               model=ModelIdentity(name="name",
                                                   version="version")),
        status=ModelTrainingStatus(state=SUCCEEDED_STATE)), training.training,
    'ModelTraining')

TOOLCHAIN = EntityTestData(
    ToolchainIntegrationClient(),
    ToolchainIntegration(
        id=ENTITY_ID,
        spec=ToolchainIntegrationSpec(
            default_image="mock-image",
            entrypoint="default-entrypoint",