Example 1
def list_runs(page_offset: int, page_size: int, include_soft_deletes: bool) -> None:
    """List recent ingestion runs to datahub"""

    session, gms_host = get_session_and_host()

    url = f"{gms_host}/runs?action=list"

    payload_obj = {
        "pageOffset": page_offset,
        "pageSize": page_size,
        "includeSoft": include_soft_deletes,
    }

    payload = json.dumps(payload_obj)

    response = session.post(url, data=payload)

    rows = parse_restli_response(response)
    local_timezone = datetime.now().astimezone().tzinfo

    structured_rows = [
        [
            row.get("runId"),
            row.get("rows"),
            datetime.fromtimestamp(row.get("timestamp") / 1000).strftime(
                "%Y-%m-%d %H:%M:%S"
            )
            + f" ({local_timezone})",
        ]
        for row in rows
    ]

    click.echo(tabulate(structured_rows, RUNS_TABLE_COLUMNS, tablefmt="grid"))
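
The helpers used above (get_session_and_host, parse_restli_response, RUNS_TABLE_COLUMNS) come from the DataHub CLI's cli_utils module. For readers who want to reproduce the call without those helpers, here is a minimal sketch against a local GMS; the host, token, and the exact shape of the RESTli envelope are assumptions here, not guaranteed by the snippet above.

import json
from datetime import datetime

import requests

# Assumptions: a local DataHub GMS and a personal access token.
GMS_HOST = "http://localhost:8080"
TOKEN = "<personal-access-token>"

session = requests.Session()
session.headers.update(
    {"Authorization": f"Bearer {TOKEN}", "Content-Type": "application/json"}
)

# Same RESTli action that list_runs above posts to.
response = session.post(
    f"{GMS_HOST}/runs?action=list",
    data=json.dumps({"pageOffset": 0, "pageSize": 10, "includeSoft": False}),
)
response.raise_for_status()

# Assumption: the RESTli envelope nests the result rows under "value".
for row in response.json().get("value", {}).get("rows", []):
    ts = datetime.fromtimestamp(row["timestamp"] / 1000).astimezone()
    print(row["runId"], row.get("rows"), ts.strftime("%Y-%m-%d %H:%M:%S %Z"))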
Example 2
def list_runs(page_offset: int, page_size: int) -> None:
    """List recent ingestion runs to datahub"""
    session, gms_host = get_session_and_host()

    url = f"{gms_host}/runs?action=list"

    payload_obj = {
        "pageOffset": page_offset,
        "pageSize": page_size,
    }

    payload = json.dumps(payload_obj)

    response = session.post(url, data=payload)

    rows = parse_restli_response(response)

    structured_rows = [
        [
            row.get("runId"),
            row.get("rows"),
            datetime.utcfromtimestamp(row.get("timestamp") / 1000).strftime(
                "%Y-%m-%d %H:%M:%S"
            ),
        ]
        for row in rows
    ]

    click.echo(tabulate(structured_rows, RUNS_TABLE_COLUMNS, tablefmt="grid"))
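
The only functional difference from Example 1 is the timezone: Example 1 renders run timestamps in the machine's local timezone, while this variant renders them in UTC via datetime.utcfromtimestamp. A quick standalone illustration of the two conversions (the epoch value is arbitrary):

from datetime import datetime, timezone

millis = 1_650_000_000_000  # arbitrary epoch milliseconds

local = datetime.fromtimestamp(millis / 1000)  # naive, local wall-clock time
utc = datetime.utcfromtimestamp(millis / 1000)  # naive UTC (deprecated in 3.12+)
aware_utc = datetime.fromtimestamp(millis / 1000, tz=timezone.utc)  # aware UTC

print(local.strftime("%Y-%m-%d %H:%M:%S"))
print(utc.strftime("%Y-%m-%d %H:%M:%S"))
print(aware_utc.strftime("%Y-%m-%d %H:%M:%S %Z"))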
Example 3
def _delete_one_urn(
    urn: str,
    soft: bool = False,
    dry_run: bool = False,
    entity_type: str = "dataset",
    cached_session_host: Optional[Tuple[sessions.Session, str]] = None,
    cached_emitter: Optional[rest_emitter.DatahubRestEmitter] = None,
    run_id: str = "delete-run-id",
    deletion_timestamp: Optional[int] = None,
) -> DeletionResult:

    # Default arguments are evaluated once at function definition time, so
    # compute the deletion timestamp per call rather than in the signature.
    if deletion_timestamp is None:
        deletion_timestamp = _get_current_time()

    deletion_result = DeletionResult()
    deletion_result.num_entities = 1
    deletion_result.num_records = UNKNOWN_NUM_RECORDS  # Default is unknown

    if soft:
        # Add removed aspect
        if not cached_emitter:
            _, gms_host = cli_utils.get_session_and_host()
            token = cli_utils.get_token()
            emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token)
        else:
            emitter = cached_emitter
        if not dry_run:
            emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=urn,
                    aspectName="status",
                    aspect=StatusClass(removed=True),
                    systemMetadata=SystemMetadataClass(
                        runId=run_id, lastObserved=deletion_timestamp
                    ),
                )
            )
        else:
            logger.info(f"[Dry-run] Would soft-delete {urn}")
    else:
        if not dry_run:
            payload_obj = {"urn": urn}
            urn, rows_affected = cli_utils.post_delete_endpoint(
                payload_obj,
                "/entities?action=delete",
                cached_session_host=cached_session_host,
            )
            deletion_result.num_records = rows_affected
        else:
            logger.info(f"[Dry-run] Would hard-delete {urn}")
            deletion_result.num_records = UNKNOWN_NUM_RECORDS  # since we don't know how many rows will be affected

    deletion_result.end()
    return deletion_result
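
The soft-delete branch above boils down to upserting a "status" aspect with removed=True. A minimal, self-contained sketch of that one operation using the DataHub Python emitter follows; the GMS address and the dataset urn are placeholders.

import time

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    StatusClass,
    SystemMetadataClass,
)

# Assumptions: a reachable GMS and an existing dataset urn.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)"

# Soft delete = upsert a "status" aspect with removed=True, exactly as
# _delete_one_urn does above.
emitter.emit_mcp(
    MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=urn,
        aspectName="status",
        aspect=StatusClass(removed=True),
        systemMetadata=SystemMetadataClass(
            runId="manual-soft-delete", lastObserved=int(time.time() * 1000)
        ),
    )
)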
Example 4
def delete_with_filters(
    dry_run: bool,
    soft: bool,
    force: bool,
    include_removed: bool,
    search_query: str = "*",
    entity_type: str = "dataset",
    env: Optional[str] = None,
    platform: Optional[str] = None,
) -> DeletionResult:

    session, gms_host = cli_utils.get_session_and_host()
    token = cli_utils.get_token()

    logger.info(f"datahub configured with {gms_host}")
    emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token)
    batch_deletion_result = DeletionResult()
    urns = list(
        cli_utils.get_urns_by_filter(
            env=env,
            platform=platform,
            search_query=search_query,
            entity_type=entity_type,
            include_removed=include_removed,
        )
    )
    logger.info(
        f"Filter matched {len(urns)} entities. Sample: {choices(urns, k=min(5, len(urns)))}"
    )
    if not force:
        click.confirm(
            f"This will delete {len(urns)} entities. Are you sure?", abort=True
        )

    for urn in progressbar.progressbar(urns, redirect_stdout=True):
        one_result = _delete_one_urn(
            urn,
            soft=soft,
            entity_type=entity_type,
            dry_run=dry_run,
            cached_session_host=(session, gms_host),
            cached_emitter=emitter,
        )
        batch_deletion_result.merge(one_result)
    batch_deletion_result.end()

    return batch_deletion_result
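
Because the session and emitter are created once and handed into _delete_one_urn as cached_session_host and cached_emitter, the per-urn loop avoids re-authenticating against GMS on every iteration. A hedged usage sketch for the function itself (the platform and env values are placeholders):

# Dry-run first: evaluates the filter and counts matches without deleting.
result = delete_with_filters(
    dry_run=True,
    soft=True,
    force=False,  # keep the interactive confirmation prompt
    include_removed=False,
    search_query="*",
    entity_type="dataset",
    env="PROD",
    platform="snowflake",
)
print(f"{result.num_entities} entities matched")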
Example 5
def delete(
    urn: str,
    force: bool,
    soft: bool,
    env: str,
    platform: str,
    entity_type: str,
    query: str,
    registry_id: str,
    dry_run: bool,
) -> None:
    """Delete metadata from datahub using a single urn or a combination of filters"""

    cli_utils.test_connectivity_complain_exit("delete")
    # one of urn / platform / env / query / registry_id must be provided
    if not urn and not platform and not env and not query and not registry_id:
        raise click.UsageError(
            "You must provide an urn, a platform, an env, a query, or a registry-id for me to delete anything"
        )

    # default query is set to "*" if not provided
    query = "*" if query is None else query

    if not force and not soft and not dry_run:
        click.confirm(
            "This will permanently delete data from DataHub. Do you want to continue?",
            abort=True,
        )

    if urn:
        # Single urn based delete
        session, host = cli_utils.get_session_and_host()
        entity_type = guess_entity_type(urn=urn)
        logger.info(f"DataHub configured with {host}")
        deletion_result: DeletionResult = delete_one_urn_cmd(
            urn,
            soft=soft,
            dry_run=dry_run,
            entity_type=entity_type,
            cached_session_host=(session, host),
        )

        if not dry_run:
            if deletion_result.num_records == 0:
                click.echo(f"Nothing deleted for {urn}")
            else:
                click.echo(
                    f"Successfully deleted {urn}. {deletion_result.num_records} rows deleted"
                )
    elif registry_id:
        # Registry-id based delete
        if soft and not dry_run:
            raise click.UsageError(
                "Soft-deleting with a registry-id is not yet supported. Try --dry-run to see what you will be deleting, before issuing a hard-delete using the --hard flag"
            )
        deletion_result = delete_for_registry(
            registry_id=registry_id, soft=soft, dry_run=dry_run
        )
    else:
        # Filter based delete
        deletion_result = delete_with_filters(
            env=env,
            platform=platform,
            dry_run=dry_run,
            soft=soft,
            entity_type=entity_type,
            search_query=query,
            force=force,
        )

    if not dry_run:
        message = "soft delete" if soft else "hard delete"
        click.echo(
            f"Took {(deletion_result.end_time_millis-deletion_result.start_time_millis)/1000.0} seconds to {message} {deletion_result.num_records} rows for {deletion_result.num_entities} entities"
        )
    else:
        click.echo(
            f"{deletion_result.num_entities} entities with {deletion_result.num_records if deletion_result.num_records != UNKNOWN_NUM_RECORDS else 'unknown'} rows will be affected. Took {(deletion_result.end_time_millis-deletion_result.start_time_millis)/1000.0} seconds to evaluate."
        )
    if deletion_result.sample_records:
        click.echo(
            tabulate(
                deletion_result.sample_records, RUN_TABLE_COLUMNS, tablefmt="grid"
            )
        )
Example 6
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform={platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")