Example #1
def verify_purge(fset, targets):
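    """check that purge_targets removes each target's data and its status entry"""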
    fset.reload(update_spec=False)
    orig_status_targets = list(fset.status.targets.keys())
    target_names = [t.name for t in targets]

    for target in targets:
        driver = get_target_driver(target_spec=target, resource=fset)
        filesystem = driver._get_store().get_filesystem(False)
        assert filesystem.exists(driver._target_path)

    fset.purge_targets(target_names=target_names)

    for target in targets:
        driver = get_target_driver(target_spec=target, resource=fset)
        filesystem = driver._get_store().get_filesystem(False)
        assert not filesystem.exists(driver._target_path)

    fset.reload(update_spec=False)
    assert set(fset.status.targets.keys()) == set(orig_status_targets) - set(
        target_names
    )
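
A minimal usage sketch for this helper, assuming a feature set that was already ingested with explicit targets; the feature set name, entity, and quotes_df DataFrame are illustrative, not part of the original test:

import mlrun.feature_store as fstore
from mlrun.datastore.targets import CSVTarget, ParquetTarget

# assumed setup: a small feature set ingested from a pandas DataFrame
quotes_set = fstore.FeatureSet("stock-quotes", entities=[fstore.Entity("ticker")])
quotes_set.set_targets(targets=[CSVTarget(), ParquetTarget()], with_defaults=False)
fstore.ingest(quotes_set, quotes_df)  # quotes_df: assumed pandas DataFrame

# purge both targets and check that the files and status entries are gone
verify_purge(quotes_set, quotes_set.spec.targets)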
Example #2
def init_featureset_graph(source,
                          featureset,
                          namespace,
                          targets=None,
                          return_df=True,
                          verbose=False):
    """create storey ingestion graph/DAG from feature set object"""

    cache = ResourceCache()
    graph = featureset.spec.graph.copy()

    # init targets (and table)
    targets = targets or []
    server = create_graph_server(graph=graph, parameters={}, verbose=verbose)
    server.init_states(context=None, namespace=namespace, resource_cache=cache)

    if graph.engine != "sync":
        _add_data_steps(
            graph,
            cache,
            featureset,
            targets=targets,
            source=source,
            return_df=return_df,
            context=server.context,
        )

    server.init_object(namespace)

    if graph.engine != "sync":
        return graph.wait_for_completion()

    if hasattr(source, "to_dataframe"):
        source = source.to_dataframe()
    elif not hasattr(source, "to_csv"):
        raise mlrun.errors.MLRunInvalidArgumentError("illegal source")

    event = MockEvent(body=source)
    data = server.run(event, get_body=True)
    for target in targets:
        target = get_target_driver(target, featureset)
        size = target.write_dataframe(data)
        target_status = target.update_resource_status("ready", size=size)
        if verbose:
            logger.info(f"wrote target: {target_status}")

    return data
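
A sketch of how this graph initializer could be driven directly for a sync-engine feature set; in mlrun it is normally invoked from the ingestion flow, and the featureset and measurements_df names here are assumptions:

from mlrun.datastore.sources import DataFrameSource
from mlrun.datastore.targets import ParquetTarget

# assumed: `featureset` is an existing FeatureSet whose graph uses the sync engine
source = DataFrameSource(df=measurements_df)  # measurements_df: assumed DataFrame
df = init_featureset_graph(
    source,
    featureset,
    namespace=globals(),
    targets=[ParquetTarget()],
    return_df=True,
)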
Example #3
def context_to_ingestion_params(context):
    """extract the ingestion task params from job/serving context"""

    featureset_uri = context.get_param("featureset")
    featureset = context.get_store_resource(featureset_uri)
    infer_options = context.get_param("infer_options", InferOptions.Null)

    source = context.get_param("source")
    if source:
        source = get_source_from_dict(source)
    elif featureset.spec.source.to_dict():
        source = get_source_from_dict(featureset.spec.source.to_dict())

    targets = context.get_param("targets", None)
    if not targets:
        targets = featureset.spec.targets
    targets = [get_target_driver(target, featureset) for target in targets]
    return featureset, source, targets, infer_options
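
A sketch of a job handler that unpacks its task parameters through the helper above; the handler name and log message are illustrative:

def ingest_handler(context):
    # unpack the ingestion task parameters set on the run/serving context
    featureset, source, targets, infer_options = context_to_ingestion_params(context)
    context.logger.info(f"ingesting into {featureset.uri} from {source}")
    # ... continue with schema inference and running the ingestion graph ...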
Example #4
def init_featureset_graph(
    source,
    featureset,
    namespace,
    targets=None,
    return_df=True,
    verbose=False,
    rows_limit=None,
):
    """create storey ingestion graph/DAG from feature set object"""

    cache = ResourceCache()
    graph = featureset.spec.graph.copy()

    # init targets (and table)
    targets = targets or []
    server = create_graph_server(graph=graph, parameters={}, verbose=verbose)
    server.init_states(context=None, namespace=namespace, resource_cache=cache)

    if graph.engine != "sync":
        # todo: support rows_limit in storey sources
        _add_data_steps(
            graph,
            cache,
            featureset,
            targets=targets,
            source=source,
            return_df=return_df,
            context=server.context,
        )
        server.init_object(namespace)
        return graph.wait_for_completion()

    server.init_object(namespace)

    # if the source is a dataframe iterator we load/write it in chunks
    chunk_id = 0
    if hasattr(source, "to_dataframe"):
        if source.is_iterator():
            chunk_id = 1
            chunks = source.to_dataframe()
        else:
            chunks = [source.to_dataframe()]
    elif not hasattr(source, "to_csv"):
        raise mlrun.errors.MLRunInvalidArgumentError("illegal source")
    else:
        chunks = [source]

    entity_columns = list(featureset.spec.entities.keys())
    key_fields = entity_columns if entity_columns else None

    sizes = [0] * len(targets)
    data_result = None
    total_rows = 0
    targets = [get_target_driver(target, featureset) for target in targets]
    for chunk in chunks:
        event = MockEvent(body=chunk)
        data = server.run(event, get_body=True)
        if data is not None:
            for i, target in enumerate(targets):
                size = target.write_dataframe(
                    data,
                    key_column=key_fields,
                    timestamp_key=featureset.spec.timestamp_key,
                    chunk_id=chunk_id,
                )
                if size:
                    sizes[i] += size
        chunk_id += 1
        if data_result is None:
            # in case of multiple chunks only return the first chunk (last may be too small)
            data_result = data
        total_rows += data.shape[0]
        if rows_limit and total_rows >= rows_limit:
            break

    # todo: fire termination event if iterator

    for i, target in enumerate(targets):
        target_status = target.update_resource_status("ready", size=sizes[i])
        if verbose:
            logger.info(f"wrote target: {target_status}")

    return data_result
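
A hedged usage sketch for the rows_limit variant; the CSV path, the chunksize attribute, and the featureset object are assumptions for illustration:

from mlrun.datastore.sources import CSVSource

# assumed: the CSV is read in chunks via the source `attributes` dict; whether
# chunked reading is configured exactly this way is an assumption, not confirmed API
source = CSVSource(path="./measurements.csv", attributes={"chunksize": 10_000})
df = init_featureset_graph(
    source,
    featureset,  # assumed existing FeatureSet with a sync-engine graph
    namespace=globals(),
    targets=list(featureset.spec.targets),
    rows_limit=50_000,  # stop after ~50k rows even if the CSV is larger
)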