def get_object_artifacts_for_node(
    node: schemas.StorageNodeDB,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
    obj: minio.Object,
) -> Iterable[minio.Object]:
    """List artifacts directly from MinIO"""
    path = s3utils.getWorkspaceKey(workspace, root)
    # join with "" to force a trailing slash on the prefix
    prefix = posixpath.join(root.base_path, path, "")
    return clientCache.get_minio_sdk_client(node).list_objects_v2(
        root.bucket, prefix=prefix)

def minio_recursive_generate_objects(
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
) -> Iterable[minio.Object]:
    """Generate a flat list of minio objects from a workspace"""
    b3client = clientCache.get_minio_sdk_client(node)
    bucket = root.bucket
    prefix = posixpath.join(root.base_path, s3utils.getWorkspaceKey(workspace))
    return b3client.list_objects_v2(
        bucket,
        prefix=prefix,
        recursive=True,
    )

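# A minimal usage sketch for the listing helper above, assuming node, root, and
# workspace records have already been loaded by the caller; the function name
# and the count/size tally are illustrative, not part of this module's API.
def example_count_workspace_objects(
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
) -> tuple:
    """Total up object count and size for a workspace (example only)."""
    count, total_size = 0, 0
    for obj in minio_recursive_generate_objects(node, root, workspace):
        count += 1
        total_size += obj.size or 0
    return count, total_size
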
def minio_transform_object(
    workspace: schemas.WorkspaceDB,
    root: schemas.WorkspaceRootDB,
    obj: minio.Object,
) -> indexing_schemas.IndexDocumentBase:
    """Turn an object into an index document"""
    common = s3utils.getWorkspaceKey(workspace)
    assert (
        posixpath.commonprefix([common, obj.object_name]) == common
    ), f"{common} not in {obj.object_name}"
    # strip the workspace key as a prefix (lstrip would strip a character set)
    inner = obj.object_name[len(common):]
    return indexing_schemas.IndexDocumentBase(
        time=obj.last_modified,
        size=obj.size,
        etag=obj.etag,
        path=inner,
        extension=posixpath.splitext(inner)[-1],
        content_type=obj.content_type,
    )

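# Sketch of how the two MinIO helpers compose during a crawl: list every object
# under the workspace prefix, then map each one to an index document. The
# wrapper below is illustrative; only minio_recursive_generate_objects and
# minio_transform_object come from this module.
def example_generate_index_documents(
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
) -> Iterable[indexing_schemas.IndexDocumentBase]:
    for obj in minio_recursive_generate_objects(node, root, workspace):
        yield minio_transform_object(workspace, root, obj)
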
def probe(
    doc: indexing_schemas.IndexDocumentBase,
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
):
    endpoint = node.api_url
    parsed = urllib.parse.urlparse(endpoint)
    host = parsed.netloc
    uri = posixpath.join(
        "/",
        root.bucket,
        s3utils.getWorkspaceKey(workspace, root),
        doc.path.lstrip("/"),
    )
    headers = s3utils.get_s3v4_headers(
        access_key=node.access_key_id,
        secret_key=node.secret_access_key,
        region=node.region_name,
        host=host,
        endpoint=endpoint,
        uri=uri,
    )
    # ffprobe expects custom headers as a single CRLF-delimited string
    headerstring = "\r\n".join(f"{key}:{val}" for key, val in headers.items())
    url = urllib.parse.urljoin(endpoint, uri)
    try:
        data = ffmpeg.probe(url, headers=headerstring)
        if len(data["streams"]):
            streams = data["streams"][0]
            doc.codec_tag_string = streams["codec_tag_string"]
            doc.r_frame_rate = streams["r_frame_rate"]
            doc.width = streams["width"]
            doc.height = streams["height"]
            doc.duration_ts = streams["duration_ts"]
            try:
                doc.bit_rate = int(streams["bit_rate"])
            except (TypeError, ValueError):
                # ffprobe can report non-numeric values such as "N/A"
                doc.bit_rate = streams["bit_rate"]
            doc.duration_sec = data["format"]["duration"]
            doc.format_name = data["format"]["format_name"]
    except ffmpeg.Error as e:
        raise indexing_schemas.ProducerError(e) from e

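# Hedged sketch of how probe() might be driven over a whole workspace: enrich
# each crawled document with media metadata, tolerating files ffprobe cannot
# read. probe, ProducerError, and the helpers are from this module; the
# skip-on-error policy and the wrapper itself are assumptions for illustration.
def example_probe_workspace(
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
) -> list:
    docs = []
    for doc in example_generate_index_documents(node, root, workspace):
        try:
            probe(doc, node, root, workspace)
        except indexing_schemas.ProducerError:
            pass  # leave the document without stream/format metadata
        docs.append(doc)
    return docs
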
def mc(ctx, args):
    r = ctx["session"].post(
        "token/search",
        json={
            "search_terms": args,
        },
    )
    if r.ok:
        response = r.json()
        assembled = " ".join(args)
        mc_env = ""
        for arg, match in response["workspaces"].items():
            workspace = schemas.WorkspaceDB(**match["workspace"])
            key = s3utils.getWorkspaceKey(workspace)
            path = "/".join([
                "myalias",
                workspace.root.bucket,
                key,
                match["path"].lstrip("/"),
            ])
            assembled = assembled.replace(arg, path)
        if len(response["tokens"]) == 1:
            token = response["tokens"][0]["token"]
            access_key = token["access_key_id"]
            secret = token["secret_access_key"]
            session_token = token["session_token"]
            api_url = response["tokens"][0]["node"]["api_url"]
            url = urllib.parse.urlparse(api_url)
            mc_env = (
                f"{url.scheme}://{access_key}:{secret}:{session_token}@{url.netloc}"
            )
        command = ("mc", *assembled.split(" "))
        os.execvpe(command[0], command, dict(os.environ, MC_HOST_myalias=mc_env))
    else:
        exit_with(handle_request_error(r))

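# Illustration of the MC_HOST_<alias> convention the command above relies on:
# mc reads per-alias credentials from an environment variable of the form
#   MC_HOST_<alias>=<scheme>://<access_key>:<secret_key>[:<session_token>]@<host>
# so no prior `mc alias set` is needed. The helper and its argument names are
# illustrative; the alias "myalias" matches the one used above.
def example_mc_host_env(scheme: str, netloc: str, access_key: str,
                        secret_key: str, session_token: str) -> dict:
    """Return a copy of the environment with a temporary MC_HOST alias set."""
    return dict(
        os.environ,
        MC_HOST_myalias=f"{scheme}://{access_key}:{secret_key}:{session_token}@{netloc}",
    )
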
def bulk_index_add(
    db: Session,
    ec: elasticsearch.Elasticsearch,
    user: schemas.UserDB,
    workspace_id: uuid.UUID,
    docs: indexing_schemas.IndexBulkAdd,
):
    workspace: models.Workspace = db.query(
        models.Workspace).get_or_404(workspace_id)
    last_crawl: indexing_models.WorkspaceCrawlRound = (
        db.query(indexing_models.WorkspaceCrawlRound)
        .filter(indexing_models.WorkspaceCrawlRound.workspace_id == workspace.id)
        .order_by(desc(indexing_models.WorkspaceCrawlRound.start_time))
        .first_or_404()
    )
    if last_crawl.succeeded:
        raise ValueError("no outstanding crawl round for this workspace found")
    root: models.WorkspaceRoot = workspace.root
    verify_root_permissions(user, root)
    bulk_operations = ""
    index: indexing_models.RootIndex = (
        db.query(indexing_models.RootIndex)
        .filter(indexing_models.RootIndex.root_id == root.id)
        .first()
    )
    if index is None:
        raise ValueError(
            f"index does not exist for workspace {workspace.name}::{workspace.id}"
        )
    object_count = len(docs.documents)
    object_size_sum = 0
    workspacekey = s3utils.getWorkspaceKey(workspace)
    for doc in docs.documents:
        upsertdoc = indexing_schemas.IndexDocument(
            **doc.dict(),
            workspace_id=workspace.id,
            workspace_name=workspace.name,
            owner_id=workspace.owner_id,
            owner_name=workspace.owner.username,
            bucket=root.bucket,
            server=root.storage_node.api_url,
            root_path=workspacekey,
            root_id=root.id,
            user_shares=[share.sharee.id for share in workspace.shares],
            # TODO: group shares
        )
        object_size_sum += doc.size
        # each document contributes an action line followed by an upsert body
        bulk_operations += (
            json.dumps({
                "update": {
                    "_index": index.index_type,
                    "_id": make_record_primary_key(
                        root.storage_node.api_url,
                        root.bucket,
                        workspacekey,
                        doc.path,
                    ),
                }
            })
            + "\n"
        )
        bulk_operations += (
            indexing_schemas.ElasticUpsertIndexDocument(doc=upsertdoc).json() + "\n"
        )
    last_crawl.total_objects += object_count
    last_crawl.total_size += object_size_sum
    last_crawl.last_indexed_key = docs.documents[-1].path
    if docs.succeeded:
        last_crawl.succeeded = True
        last_crawl.end_time = datetime.datetime.utcnow()
    db.add(last_crawl)
    db.commit()
    ec.bulk(bulk_operations)

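# Hedged usage sketch tying the pieces together: crawl a workspace with the
# MinIO helpers, then ship the documents in a single bulk call. The db/ec/user
# handles are assumed to come from the caller's request context, workspace.id
# is assumed to carry the UUID expected by bulk_index_add, and IndexBulkAdd is
# constructed with the documents/succeeded fields consumed above; everything
# else here is illustrative.
def example_crawl_and_index(
    db: Session,
    ec: elasticsearch.Elasticsearch,
    user: schemas.UserDB,
    node: schemas.StorageNodeOperator,
    root: schemas.WorkspaceRootDB,
    workspace: schemas.WorkspaceDB,
):
    documents = list(example_generate_index_documents(node, root, workspace))
    if not documents:
        return  # bulk_index_add expects at least one document
    bulk_index_add(
        db,
        ec,
        user,
        workspace.id,
        indexing_schemas.IndexBulkAdd(documents=documents, succeeded=True),
    )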