Example #1
def get_object_artifacts_for_node(node: schemas.StorageNodeDB,
                                  root: schemas.WorkspaceRootDB,
                                  workspace: schemas.WorkspaceDB,
                                  obj: minio.Object) -> Iterable[minio.Object]:
    """List an object's artifacts directly from MinIO"""
    path = s3utils.getWorkspaceKey(workspace, root)
    # assumes artifacts live under the object's own key within the workspace;
    # the trailing slash scopes the listing to that prefix
    prefix = posixpath.join(root.base_path, path, obj.object_name, "")
    return clientCache.get_minio_sdk_client(node).list_objects_v2(
        root.bucket, prefix=prefix)
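A hypothetical caller might iterate the listing directly; the node, root, workspace, and obj values below are assumed to have been loaded elsewhere (for example from the database), and the names are illustrative only.

def print_artifacts(node, root, workspace, obj):
    # Sketch only: walks the listing returned by get_object_artifacts_for_node().
    for artifact in get_object_artifacts_for_node(node, root, workspace, obj):
        print(artifact.object_name, artifact.size, artifact.etag)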
Example #2
def minio_recursive_generate_objects(
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
) -> Iterable[minio.Object]:
    """Generate a flat list of minio objects from a workspace"""
    b3client = clientCache.get_minio_sdk_client(node)
    bucket = root.bucket
    prefix = posixpath.join(root.base_path, s3utils.getWorkspaceKey(workspace))
    return b3client.list_objects_v2(
        bucket,
        prefix=prefix,
        recursive=True,
    )
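For reference, the raw minio-py call this helper wraps looks roughly like the following; the endpoint, credentials, bucket, and prefix are placeholders, and recent minio-py releases expose list_objects rather than list_objects_v2.

import minio

client = minio.Minio("minio.example.com",             # placeholder endpoint
                     access_key="PLACEHOLDER-KEY",     # placeholder credential
                     secret_key="PLACEHOLDER-SECRET")  # placeholder credential
for obj in client.list_objects("some-bucket", prefix="base-path/workspace-key/",
                               recursive=True):
    print(obj.object_name, obj.last_modified, obj.size)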
Example #3
def minio_transform_object(
        workspace: schemas.WorkspaceDB, root: schemas.WorkspaceRootDB,
        obj: minio.Object) -> indexing_schemas.IndexDocumentBase:
    """Turn an object into a index document"""
    common = s3utils.getWorkspaceKey(workspace)
    assert (posixpath.commonprefix([common, obj.object_name]) is
            common), f"{common} not in {obj.object_name}"
    inner = obj.object_name.lstrip(common)
    return indexing_schemas.IndexDocumentBase(
        time=obj.last_modified,
        size=obj.size,
        etag=obj.etag,
        path=inner,
        extension=posixpath.splitext(inner)[-1],
        content_type=obj.content_type,
    )
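The two helpers above compose directly: list the workspace, then turn each object into an index document. A minimal sketch, assuming the node, root, and workspace records were fetched elsewhere.

def build_index_documents(node, root, workspace):
    # Sketch: pair the recursive listing with the per-object transform.
    return [
        minio_transform_object(workspace, root, obj)
        for obj in minio_recursive_generate_objects(node, root, workspace)
    ]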
Example #4
def probe(
    doc: indexing_schemas.IndexDocumentBase,
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
):
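    """Probe a media object over HTTP with ffprobe and fill in stream metadata on the document"""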
    endpoint = node.api_url
    parsed = urllib.parse.urlparse(endpoint)
    host = parsed.netloc
    uri = posixpath.join(
        "/",
        root.bucket,
        s3utils.getWorkspaceKey(workspace, root),
        doc.path.lstrip("/"),
    )
    headers = s3utils.get_s3v4_headers(
        access_key=node.access_key_id,
        secret_key=node.secret_access_key,
        region=node.region_name,
        host=host,
        endpoint=endpoint,
        uri=uri,
    )
    headerstring = "\r\n".join(
        [f"{key}:{val}" for key, val in headers.items()])
    url = urllib.parse.urljoin(endpoint, uri)
    try:
        data = ffmpeg.probe(url, headers=headerstring)
        if data["streams"]:
            stream = data["streams"][0]
            doc.codec_tag_string = stream["codec_tag_string"]
            doc.r_frame_rate = stream["r_frame_rate"]
            doc.width = stream["width"]
            doc.height = stream["height"]
            doc.duration_ts = stream["duration_ts"]
            try:
                doc.bit_rate = int(stream["bit_rate"])
            except (TypeError, ValueError):
                # keep the raw value when ffprobe reports a non-integer bit rate
                doc.bit_rate = stream["bit_rate"]
        doc.duration_sec = data["format"]["duration"]
        doc.format_name = data["format"]["format_name"]
    except ffmpeg.Error as e:
        raise indexing_schemas.ProducerError(e) from e
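ffprobe accepts extra HTTP headers as a single CRLF-joined string, which is what the headers kwarg above forwards to it. A hedged usage sketch follows; it assumes the surrounding module's imports, that each doc came from minio_transform_object, and that probe() mutates the document in place.

def probe_documents(docs, node, root, workspace):
    # Sketch only: enrich each index document with media metadata where possible.
    for doc in docs:
        try:
            probe(doc, node, root, workspace)
        except indexing_schemas.ProducerError:
            # non-media objects keep their basic metadata
            continue
    return docs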
Example #5
def mc(ctx, args):
    """Rewrite workspace references in an mc command, then exec mc with scoped credentials"""
    r = ctx["session"].post(
        "token/search",
        json={
            "search_terms": args,
        },
    )
    if r.ok:
        response = r.json()
        assembled = " ".join(args)
        mc_env = ""
        for arg, match in response["workspaces"].items():
            workspace = schemas.WorkspaceDB(**match["workspace"])
            key = s3utils.getWorkspaceKey(workspace)
            path = "/".join([
                "myalias",
                workspace.root.bucket,
                key,
                match["path"].lstrip("/"),
            ])
            assembled = assembled.replace(arg, path)
        if len(response["tokens"]) == 1:
            token = response["tokens"][0]["token"]
            access_key = token["access_key_id"]
            secret = token["secret_access_key"]
            session_token = token["session_token"]
            api_url = response["tokens"][0]["node"]["api_url"]
            url = urllib.parse.urlparse(api_url)
            mc_env = (
                f"{url.scheme}://{access_key}:{secret}:{session_token}@{url.netloc}"
            )
        command = (
            "mc",
            *assembled.split(" "),
        )
        os.execvpe(command[0], command,
                   dict(os.environ, MC_HOST_myalias=mc_env))
    else:
        exit_with(handle_request_error(r))
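mc resolves per-alias credentials from MC_HOST_<alias> environment variables shaped like scheme://ACCESSKEY:SECRETKEY[:SESSIONTOKEN]@host, which is what the command above exports. A minimal standalone sketch with placeholder values:

import os

# Placeholder credentials, host, and paths; "myalias" mirrors the alias used above.
mc_env = "https://PLACEHOLDER-KEY:PLACEHOLDER-SECRET:PLACEHOLDER-TOKEN@minio.example.com"
os.execvpe("mc", ("mc", "ls", "myalias/some-bucket/workspace-key/"),
           dict(os.environ, MC_HOST_myalias=mc_env))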
Example #6
def bulk_index_add(
    db: Session,
    ec: elasticsearch.Elasticsearch,
    user: schemas.UserDB,
    workspace_id: uuid.UUID,
    docs: indexing_schemas.IndexBulkAdd,
):
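    """Add a batch of documents to the root's index and update the open crawl round"""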
    workspace: models.Workspace = db.query(
        models.Workspace).get_or_404(workspace_id)
    last_crawl: indexing_models.WorkspaceCrawlRound = (db.query(
        indexing_models.WorkspaceCrawlRound).filter(
            indexing_models.WorkspaceCrawlRound.workspace_id == workspace.id
        ).order_by(desc(
            indexing_models.WorkspaceCrawlRound.start_time)).first_or_404())
    if last_crawl.succeeded:
        raise ValueError(
            "no outstanding crawl round found for this workspace")
    root: models.WorkspaceRoot = workspace.root
    verify_root_permissions(user, root)
    bulk_operations = ""
    index: indexing_models.RootIndex = (db.query(
        indexing_models.RootIndex).filter(
            indexing_models.RootIndex.root_id == root.id).first())
    if index is None:
        raise ValueError(
            f"index does not exist for workspace {workspace.name}::{workspace.id}"
        )
    object_count = len(docs.documents)
    object_size_sum = 0
    for doc in docs.documents:
        workspacekey = s3utils.getWorkspaceKey(workspace)
        upsertdoc = indexing_schemas.IndexDocument(
            **doc.dict(),
            workspace_id=workspace.id,
            workspace_name=workspace.name,
            owner_id=workspace.owner_id,
            owner_name=workspace.owner.username,
            bucket=root.bucket,
            server=root.storage_node.api_url,
            root_path=workspacekey,
            root_id=root.id,
            user_shares=[share.sharee.id for share in workspace.shares],
            # TODO: group shares
        )
        object_size_sum += doc.size
        bulk_operations += (json.dumps({
            "update": {
                "_index": index.index_type,
                "_id": make_record_primary_key(
                    root.storage_node.api_url,
                    root.bucket,
                    workspacekey,
                    doc.path,
                ),
            }
        }) + "\n")
        bulk_operations += (
            indexing_schemas.ElasticUpsertIndexDocument(doc=upsertdoc).json() +
            "\n")
    last_crawl.total_objects += object_count
    last_crawl.total_size += object_size_sum
    last_crawl.last_indexed_key = docs.documents[-1].path
    if docs.succeeded:
        last_crawl.succeeded = True
        last_crawl.end_time = datetime.datetime.utcnow()
    db.add(last_crawl)
    db.commit()
    ec.bulk(bulk_operations)
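For context, the string handed to ec.bulk() is newline-delimited JSON: each document contributes an action line naming the target index and _id, followed by a payload line. A minimal sketch of one such pair with placeholder values; the doc/doc_as_upsert shape is an assumption about what ElasticUpsertIndexDocument serializes to.

import json

# Placeholder values; real ids come from make_record_primary_key(...) above.
action = {"update": {"_index": "some-index", "_id": "api-url/bucket/workspace-key/video.mp4"}}
payload = {"doc": {"path": "video.mp4", "size": 1024}, "doc_as_upsert": True}  # assumed shape
bulk_pair = json.dumps(action) + "\n" + json.dumps(payload) + "\n"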