Beispiel #1
0
def _fix_artifact_tags_duplications(db: mlrun.api.db.sqldb.db.SQLDB,
                                    db_session: sqlalchemy.orm.Session):
    logger.info("Fixing artifact tags duplications")
    # get all artifacts
    artifacts = db._find_artifacts(db_session, None, "*")
    # get all artifact tags
    tags = db._query(db_session, mlrun.api.db.sqldb.models.Artifact.Tag).all()
    # artifact record id -> artifact
    artifact_record_id_map = {artifact.id: artifact for artifact in artifacts}
    tags_to_delete = []
    projects = {artifact.project for artifact in artifacts}
    for project in projects:
        artifact_keys = {
            artifact.key
            for artifact in artifacts if artifact.project == project
        }
        for artifact_key in artifact_keys:
            artifact_key_tags = []
            for tag in tags:
                # sanity
                if tag.obj_id not in artifact_record_id_map:
                    logger.warning("Found orphan tag, deleting",
                                   tag=tag.to_dict())
                if artifact_record_id_map[tag.obj_id].key == artifact_key:
                    artifact_key_tags.append(tag)
            tag_name_tags_map = collections.defaultdict(list)
            for tag in artifact_key_tags:
                tag_name_tags_map[tag.name].append(tag)
            for tag_name, _tags in tag_name_tags_map.items():
                if len(_tags) == 1:
                    continue
                tags_artifacts = [
                    artifact_record_id_map[tag.obj_id] for tag in _tags
                ]
                last_updated_artifact = _find_last_updated_artifact(
                    tags_artifacts)
                for tag in _tags:
                    if tag.obj_id != last_updated_artifact.id:
                        tags_to_delete.append(tag)
    if tags_to_delete:
        logger.info(
            "Found duplicated artifact tags. Removing duplications",
            tags_to_delete=[
                tag_to_delete.to_dict() for tag_to_delete in tags_to_delete
            ],
            tags=[tag.to_dict() for tag in tags],
            artifacts=[artifact.to_dict() for artifact in artifacts],
        )
        for tag in tags_to_delete:
            db_session.delete(tag)
        db_session.commit()
Beispiel #2
0
def _fix_datasets_large_previews(
    db: mlrun.api.db.sqldb.db.SQLDB,
    db_session: sqlalchemy.orm.Session,
    leader_session: typing.Optional[str] = None,
):
    # get all artifacts
    artifacts = db._find_artifacts(db_session, None, "*")
    for artifact in artifacts:
        try:
            artifact_dict = artifact.struct
            if (artifact_dict and artifact_dict.get("kind")
                    == mlrun.artifacts.DatasetArtifact.kind):
                header = artifact_dict.get("header", [])
                if header and len(
                        header) > mlrun.artifacts.dataset.max_preview_columns:
                    logger.debug(
                        "Found dataset artifact with more than allowed columns in preview fields. Fixing",
                        artifact=artifact_dict,
                    )
                    columns_to_remove = header[mlrun.artifacts.dataset.
                                               max_preview_columns:]

                    # align preview
                    if artifact_dict.get("preview"):
                        new_preview = []
                        for preview_row in artifact_dict["preview"]:
                            # sanity
                            if (len(preview_row) <
                                    mlrun.artifacts.dataset.max_preview_columns
                                ):
                                logger.warning(
                                    "Found artifact with more than allowed columns in header definition, "
                                    "but preview data is valid. Leaving preview as is",
                                    artifact=artifact_dict,
                                )
                            new_preview.append(
                                preview_row[:mlrun.artifacts.dataset.
                                            max_preview_columns])

                        artifact_dict["preview"] = new_preview

                    # align stats
                    for column_to_remove in columns_to_remove:
                        if column_to_remove in artifact_dict.get("stats", {}):
                            del artifact_dict["stats"][column_to_remove]

                    # align schema
                    if artifact_dict.get("schema", {}).get("fields"):
                        new_schema_fields = []
                        for field in artifact_dict["schema"]["fields"]:
                            if field.get("name") not in columns_to_remove:
                                new_schema_fields.append(field)
                        artifact_dict["schema"]["fields"] = new_schema_fields

                    # lastly, align headers
                    artifact_dict["header"] = header[:mlrun.artifacts.dataset.
                                                     max_preview_columns]
                    logger.debug(
                        "Fixed dataset artifact preview fields. Storing",
                        artifact=artifact_dict,
                    )
                    db._store_artifact(
                        db_session,
                        artifact.key,
                        artifact_dict,
                        artifact.uid,
                        project=artifact.project,
                        tag_artifact=False,
                        ensure_project=False,
                        leader_session=leader_session,
                    )
        except Exception as exc:
            logger.warning(
                "Failed fixing dataset artifact large preview. Continuing",
                exc=exc,
            )