def upload(
    paths,
    girder_collection,
    girder_top_folder,
    local_top_path,
    girder_instance,
    existing,
    validation_,
    fake_data,
    develop_debug,
):
    # Ensure that we have all Folders created as well
    assert local_top_path, "--local-top-path must be specified for now"
    assert girder_collection, "--collection must be specified"

    if not girder_top_folder:
        # TODO: UI
        # Most often it would be the same directory name as the local top dir
        girder_top_folder = op.basename(local_top_path)
        if girder_top_folder in (op.pardir, op.curdir):
            girder_top_folder = op.basename(op.realpath(local_top_path))

    import multiprocessing

    from .. import girder
    from ..pynwb_utils import get_metadata
    from ..pynwb_utils import validate as pynwb_validate
    from ..pynwb_utils import ignore_benign_pynwb_warnings
    from ..support.generatorify import generator_from_callback
    from ..support.pyout import naturalsize
    from pathlib import Path, PurePosixPath

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    client = girder.authenticate(girder_instance)

    collection_rec = girder.ensure_collection(client, girder_collection)
    lgr.debug("Working with collection %s", collection_rec)

    local_top_path = Path(local_top_path)
    girder_top_folder = PurePosixPath(girder_top_folder)

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    # https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()
    uploaded_paths = {}  # path: uploaded size

    def skip_file(msg):
        return {"status": "skipped", "message": msg}

    lock = multiprocessing.Lock()

    # TODO: we might want to always yield a full record so that a missing
    # field does not cause pyout to halt
    def process_path(path, relpath):
        try:
            try:
                stat = os.stat(path)
                yield {"size": stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            yield {"status": "checking girder"}

            girder_folder = girder_top_folder / relpath.parent

            while True:
                try:
                    lock.acquire(timeout=60)
                    # TODO: we need to make this all thread safe all the way
                    # until uploading the file since multiple threads would
                    # create multiple
                    folder_rec = girder.ensure_folder(
                        client, collection_rec, girder_collection, girder_folder
                    )

                    # Get (if already exists) or create an item
                    item_rec = client.createItem(
                        folder_rec["_id"], name=relpath.name, reuseExisting=True
                    )
                finally:
                    lock.release()

                file_recs = list(client.listFile(item_rec["_id"]))
                if len(file_recs) > 1:
                    raise NotImplementedError(
                        f"Item {item_rec} contains multiple files: {file_recs}"
                    )
                elif file_recs:  # there is a file already
                    if existing == "skip":
                        yield skip_file("exists already")
                        return
                    elif existing == "reupload":
                        yield {
                            "message": "exists - reuploading",
                            "status": "deleting old item",
                        }
                        # TODO: delete an item here
                        raise NotImplementedError(
                            "yarik did not find deleteItem API"
                        )
                        continue
                    else:
                        raise ValueError(existing)
                break  # no need to loop

            # we need to delete it first??? I do not see a method TODO
            if validation_ != "skip":
                yield {"status": "validating"}
                validation_errors = pynwb_validate(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation_ == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    # yielding empty causes pyout to get stuck or crash
                    # https://github.com/pyout/pyout/issues/91
                    # yield {"errors": '',}
                    pass

            # Extract metadata before actual upload and skip if it fails
            # TODO: allow for non-nwb files to skip this step
            yield {"status": "extracting metadata"}
            try:
                metadata = get_metadata(path)
            except Exception as exc:
                yield skip_file("failed to extract metadata: %s" % str(exc))
                return

            yield {"status": "uploading"}
            # Upload file to an item
            # XXX TODO progress reporting back to pyout is actually tricky
            #     if possible to implement via callback since
            #     callback would need to yield somehow from the context here.
            #     yoh doesn't see how that could be done yet. In the worst
            #     case we would copy uploadFileToItem and _uploadContents
            #     and make them into generators to relay progress instead of
            #     via callback
            # https://stackoverflow.com/questions/9968592/turn-functions-with-a-callback-into-python-generators
            # has some solutions but all IMHO are a bit too complex
            for r in generator_from_callback(
                lambda c: client.uploadFileToItem(
                    item_rec["_id"], path, progressCallback=c
                )
            ):
                uploaded_paths[str(path)] = r["current"]
                yield {
                    "upload": 100.0
                    * ((r["current"] / r["total"]) if r["total"] else 1.0)
                }

            # Provide metadata for the item from the file, could be done via
            # a callback to be triggered upon successful upload, or we could
            # just do it "manually"
            metadata_ = {}
            for k, v in metadata.items():
                if v in ("", None):
                    continue  # degenerate, why bother
                # XXX TODO: remove this -- it is only temporary, search should handle
                if isinstance(v, str):
                    metadata_[k] = v.lower()
                elif isinstance(v, datetime.datetime):
                    metadata_[k] = str(v)
            # we will add some fields which would help us with deciding to
            # reupload or not
            metadata_["uploaded_size"] = os.stat(str(path)).st_size
            metadata_["uploaded_mtime"] = os.stat(str(path)).st_mtime

            yield {"status": "uploading metadata"}
            client.addMetadataToItem(item_rec["_id"], metadata_)

            yield {"status": "done"}

        except Exception as exc:
            if develop_debug:
                raise
            yield {"status": "ERROR", "message": str(exc)}
        finally:
            process_paths.remove(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout

    from ..support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        dt = time.time() - t0
        total = sum(uploaded_paths.values())
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ("path", "size", "errors", "upload", "status", "message")
    out = pyout.Tabular(style=pyout_style, columns=rec_fields)

    with out:
        for path in paths:
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)
            process_paths.add(path)

            rec = {"path": path}
            path = Path(path)
            try:
                relpath = path.relative_to(local_top_path)
                rec["path"] = str(relpath)
                if develop_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(v)
                else:
                    rec[rec_fields[1:]] = process_path(path, relpath)
            except ValueError as exc:
                # typically if local_top_path is not the top path for the path
                rec.update(skip_file(exc))
            out(rec)
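# Hedged usage sketch (not part of the original source): how the legacy,
# girder-based upload() above might be driven.  The collection name, path, and
# instance are hypothetical placeholders, and module-level names such as `op`,
# `os`, `time`, `datetime`, and `lgr` are assumed to come from the package.
# Note that in this excerpt a later upload() definition with a different
# signature shadows this one at module level, so this sketch only documents
# the legacy keyword arguments.
def _example_legacy_upload_call():  # illustration only
    upload(
        paths=["sub-01/sub-01_ses-01.nwb"],  # hypothetical file
        girder_collection="drafts",          # hypothetical collection
        girder_top_folder=None,
        local_top_path=".",
        girder_instance="dandi",
        existing="skip",
        validation_="require",
        fake_data=False,
        develop_debug=False,
    )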
def upload(
    paths,
    existing="refresh",
    validation="require",
    dandiset_path=None,
    # Development options should come as kwargs
    girder_collection=collection_drafts,
    girder_top_folder=None,
    dandi_instance="dandi",
    fake_data=False,  # TODO: not implemented, prune?
    allow_any_path=False,
    devel_debug=False,
):
    """Upload dandiset (files) to DANDI archive.

    Target dandiset to upload to must already be registered in the archive, and
    locally "dandiset.yaml" should exist in `--dandiset-path`.  If you have not
    yet created a dandiset in the archive, use the 'dandi register' command
    first.

    Local dandiset should pass validation.  For that it should first be
    organized using the 'dandi organize' command.

    By default all files in the dandiset (not following directories starting
    with a period) will be considered for the upload.  You can point to
    specific files you would like to validate and have uploaded.
    """
    from pathlib import Path, PurePosixPath

    from ..dandiset import Dandiset
    from ..support.digests import Digester

    dandiset = Dandiset.find(dandiset_path)
    if not dandiset:
        raise RuntimeError(
            f"Found no {dandiset_metadata_file} anywhere. "
            "Use 'dandi register', 'download', or 'organize' first"
        )

    # Should no longer be needed
    # dandiset_path = Path(dandiset_path).resolve()

    # Girder side details:
    if not girder_collection:
        girder_collection = collection_drafts
    if not girder_top_folder:
        # We upload to staging/dandiset_id
        ds_identifier = dandiset.identifier
        if not ds_identifier:
            raise ValueError(
                "No 'identifier' set for the dandiset yet. Use 'dandi register'"
            )
        if not re.match(dandiset_identifier_regex, ds_identifier):
            raise ValueError(
                f"Dandiset identifier {ds_identifier} does not follow expected "
                f"convention {dandiset_identifier_regex!r}. Use "
                f"'dandi register' to get a legit identifier"
            )
        # this is a path not a girder id
        girder_top_folder = ds_identifier

    girder_top_folder = PurePosixPath(girder_top_folder)

    if str(girder_top_folder) in (".", "..", "", "/"):
        raise ValueError(
            f"Got folder {girder_top_folder}, but files cannot be uploaded "
            f"into a collection directly."
        )

    # TODO: check that the folder already exists
    if False:
        raise ValueError(
            f"There is no {girder_top_folder} in {girder_collection}. "
            f"Did you use 'dandi register'?"
        )

    import multiprocessing

    from .. import girder
    from ..pynwb_utils import ignore_benign_pynwb_warnings, get_object_id
    from ..metadata import get_metadata
    from ..validate import validate_file
    from ..utils import (
        find_dandi_files,
        find_files,
        path_is_subpath,
        get_utcnow_datetime,
    )
    from ..support.generatorify import generator_from_callback
    from ..support.pyout import naturalsize

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    client = girder.get_client(girder.known_instances[dandi_instance].girder)

    try:
        collection_rec = girder.ensure_collection(client, girder_collection)
    except girder.gcl.HttpError as exc:
        if devel_debug:
            raise
        # provide a bit less intimidating error reporting
        lgr.error(
            "Failed to assure presence of the %s collection: %s",
            girder_collection,
            (girder.get_HttpError_response(exc) or {}).get("message", str(exc)),
        )
        sys.exit(1)

    lgr.debug("Working with collection %s", collection_rec)

    #
    # Treat paths
    #
    if not paths:
        paths = [dandiset.path]

    # Expand and validate all paths -- they should reside within the dandiset
    orig_paths = paths
    paths = list(find_files(".*", paths) if allow_any_path else find_dandi_files(paths))
    npaths = len(paths)
    lgr.info(f"Found {npaths} files to consider")
    for path in paths:
        path_basename = op.basename(path)
        if not (
            allow_any_path
            or path_basename == dandiset_metadata_file
            or path_basename.endswith(".nwb")
        ):
            raise NotImplementedError(
                f"ATM only .nwb and dandiset.yaml should be in the paths to upload. "
                f"Got {path}"
            )
        fullpath = path if op.isabs(path) else op.abspath(path)
        if not path_is_subpath(fullpath, dandiset.path):
            raise ValueError(f"{path} is not under {dandiset.path}")

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    # https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()

    from collections import defaultdict

    uploaded_paths = defaultdict(lambda: {"size": 0, "errors": []})

    def skip_file(msg):
        return {"status": "skipped", "message": str(msg)}

    lock = multiprocessing.Lock()

    # TODO: we might want to always yield a full record so that a missing
    # field does not cause pyout to halt
    def process_path(path, relpath):
        try:
            try:
                stat = os.stat(path)
                yield {"size": stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            yield {"status": "checking girder"}

            girder_folder = girder_top_folder / relpath.parent

            # we will add some fields which would help us with deciding to
            # reupload or not
            path_stat = os.stat(str(path))
            file_metadata_ = {
                "uploaded_size": path_stat.st_size,
                "uploaded_mtime": ensure_strtime(path_stat.st_mtime),
                # "uploaded_date": None,  # to be filled out upon upload completion
            }

            # A girder delete API target to .delete before uploading a file
            # (e.g. if decided to reupload)
            delete_before_upload = None

            def ensure_item():
                """This function might need to be called twice, e.g. if we are
                to reupload the entire item.  ATM new versions of the files
                would create new items since the policy is one File per Item
                """
                try:
                    lock.acquire(timeout=60)
                    # TODO: we need to make this all thread safe all the way
                    # until uploading the file since multiple threads would
                    # create multiple
                    # ATM it even fails with "No such folder: 5e33658d6eb14e0bf49e97d5",
                    # so will first upload one file and then the rest... not sure why
                    # locking doesn't work
                    folder_rec = girder.ensure_folder(
                        client, collection_rec, girder_collection, girder_folder
                    )

                    # Get (if already exists) or create an item
                    item_rec = client.createItem(
                        folder_rec["_id"], name=relpath.name, reuseExisting=True
                    )
                finally:
                    lock.release()
                return item_rec

            def ensure_folder():
                try:
                    lock.acquire(timeout=60)
                    folder_rec = girder.ensure_folder(
                        client, collection_rec, girder_collection, girder_folder
                    )
                finally:
                    lock.release()
                return folder_rec

            #
            # 1. Validate first, so we do not bother girder at all if not kosher
            #
            if validation != "skip":
                yield {"status": "validating"}
                validation_errors = validate_file(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    # yielding empty causes pyout to get stuck or crash
                    # https://github.com/pyout/pyout/issues/91
                    # yield {"errors": '',}
                    pass

            #
            # Special handling for dandiset.yaml
            # Yarik hates it but that is life for now. TODO
            #
            if op.basename(path) == dandiset_metadata_file:
                # We need to upload its content as metadata for the entire
                # folder.
                folder_rec = ensure_folder()
                remote_metadata = folder_rec["meta"]
                if remote_metadata.get("dandiset", {}) == dandiset.metadata:
                    yield skip_file("exists (same)")
                else:
                    remote_metadata["dandiset"] = dandiset.metadata
                    yield {"status": "uploading dandiset metadata"}
                    client.addMetadataToFolder(folder_rec["_id"], remote_metadata)
                    yield {"status": "done"}
                # Interrupt -- no file to upload
                return

            #
            # 2. Ensure having an item
            #
            item_rec = ensure_item()

            #
            # 3. Analyze files possibly already present in the item on the remote
            #
            file_recs = list(client.listFile(item_rec["_id"]))

            # get metadata and if we have all indications that it is
            # probably the same -- we just skip
            stat_fields = [
                # Care only about mtime, ignore ctime which could change
                "uploaded_mtime",
                "uploaded_size",
            ]
            assert sorted(file_metadata_) == stat_fields
            item_metadata = item_rec.get("meta", {})
            item_file_metadata_ = {
                k: item_rec.get("meta", {}).get(k, None) for k in stat_fields
            }
            lgr.debug(
                "Files meta: local file: %s  remote file: %s",
                file_metadata_,
                item_file_metadata_,
            )
            if item_file_metadata_["uploaded_mtime"]:
                local_mtime = ensure_datetime(file_metadata_["uploaded_mtime"])
                remote_mtime = ensure_datetime(
                    item_file_metadata_.get("uploaded_mtime")
                )
                remote_file_status = (
                    "same"
                    if (file_metadata_ == item_file_metadata_)
                    else (
                        "newer"
                        if remote_mtime > local_mtime
                        else ("older" if remote_mtime < local_mtime else "diff")
                    )
                )
            else:
                remote_file_status = "no mtime"

            exists_msg = f"exists ({remote_file_status})"

            if len(file_recs) > 1:
                raise NotImplementedError(
                    f"Item {item_rec} contains multiple files: {file_recs}"
                )
            elif file_recs:  # there is a file already
                if existing == "error":
                    # as promised -- not gentle at all!
                    raise FileExistsError(exists_msg)
                if existing == "skip":
                    yield skip_file(exists_msg)
                    return
                # Logic below only for overwrite and reupload
                if existing == "overwrite":
                    if remote_file_status == "same":
                        yield skip_file(exists_msg)
                        return
                elif existing == "refresh":
                    if not remote_file_status == "older":
                        yield skip_file(exists_msg)
                        return
                elif existing == "force":
                    pass
                else:
                    raise ValueError("existing")

                delete_before_upload = f'/item/{item_rec["_id"]}'
                yield {"message": exists_msg + " - reuploading"}

            #
            # 4. Extract metadata - delayed since it takes time, but it is done
            #    before the actual upload, so we could skip if this fails
            #
            # Extract metadata before actual upload and skip if it fails
            # TODO: allow for non-nwb files to skip this step
            # ad-hoc for dandiset.yaml for now
            if op.basename(path) != dandiset_metadata_file:
                yield {"status": "extracting metadata"}
                try:
                    metadata = get_metadata(path)
                except Exception as exc:
                    if allow_any_path:
                        yield {"status": "failed to extract metadata"}
                        metadata = {}
                    else:
                        yield skip_file("failed to extract metadata: %s" % str(exc))
                        if not file_recs:
                            # remove empty item
                            yield {"status": "deleting empty item"}
                            client.delete(f'/item/{item_rec["_id"]}')
                            yield {"status": "deleted empty item"}
                        return

            #
            # ?. Compute checksums and possible other digests (e.g. for s3, ipfs - TODO)
            #
            yield {"status": "digesting"}
            try:
                # TODO: in theory we could also cache the result, but since it is
                # critical to get correct checksums, safer to just do it all the time.
                # Should typically be faster than upload itself ;-)
                digester = Digester(metadata_digests)
                file_metadata_.update(digester(path))
            except Exception as exc:
                yield skip_file("failed to compute digests: %s" % str(exc))
                return

            #
            # 5. Upload file
            #
            # TODO: we could potentially keep the new item "hidden" until we are
            # done with the upload, and only then remove the old one and replace
            # it with the new one (rename from the "hidden" name).
            if delete_before_upload:
                yield {"status": "deleting old"}
                client.delete(delete_before_upload)
                yield {"status": "old deleted"}
                # create a new item
                item_rec = ensure_item()

            yield {"status": "uploading"}
            # Upload file to an item
            # XXX TODO progress reporting back to pyout is actually tricky
            #     if possible to implement via callback since
            #     callback would need to yield somehow from the context here.
            #     yoh doesn't see how that could be done yet. In the worst
            #     case we would copy uploadFileToItem and _uploadContents
            #     and make them into generators to relay progress instead of
            #     via callback
            # https://stackoverflow.com/questions/9968592/turn-functions-with-a-callback-into-python-generators
            # has some solutions but all IMHO are a bit too complex
            for r in generator_from_callback(
                lambda c: client.uploadFileToItem(
                    item_rec["_id"], path, progressCallback=c
                )
            ):
                uploaded_paths[str(path)]["size"] = r["current"]
                yield {
                    "upload": 100.0
                    * ((r["current"] / r["total"]) if r["total"] else 1.0)
                }

            # Get uploaded file id
            file_id, current = client.isFileCurrent(
                item_rec["_id"], op.basename(path), op.abspath(path)
            )
            if not current:
                raise RuntimeError(
                    "Must not happen since file %s was just uploaded" % path
                )

            #
            # 6. Upload metadata
            #
            metadata_ = {}
            for k, v in metadata.items():
                if v in ("", None):
                    continue  # degenerate, why bother
                # XXX TODO: remove this -- it is only temporary, search should handle
                if isinstance(v, str):
                    metadata_[k] = v.lower()
                elif isinstance(v, datetime):
                    metadata_[k] = ensure_strtime(v)
            # we will add some fields which would help us with deciding to
            # reupload or not
            # .isoformat() would give an ISO 8601 representation but I see in girder
            # already
            #   session_start_time  1971-01-01 12:00:00+00:00
            # decided to go for .isoformat for internal consistency -- let's see
            file_metadata_["uploaded_datetime"] = ensure_strtime(time.time())
            metadata_.update(file_metadata_)
            metadata_["uploaded_size"] = path_stat.st_size
            metadata_["uploaded_mtime"] = ensure_strtime(path_stat.st_mtime)
            metadata_["uploaded_by"] = "dandi %s" % __version__
            # Also store object_id for the file to help identify changes/moves
            try:
                metadata_["uploaded_nwb_object_id"] = get_object_id(str(path))
            except Exception as exc:
                (lgr.debug if allow_any_path else lgr.warning)(
                    "Failed to read object_id: %s", exc
                )

            # #
            # # 7. Also set remote file ctime to match local mtime
            # #    since for type "file", Resource has no "updated" field,
            # #    and this could help us to identify changes being done
            # #    to the remote file -- if metadata["uploaded_mtime"]
            # #    differs
            # yield {"status": "setting remote file timestamp"}
            # try:
            #     client.setResourceTimestamp(
            #         file_id, type="file", created=metadata_["uploaded_mtime"]
            #     )
            # except girder.gcl.HttpError as exc:
            #     if devel_debug:
            #         raise
            #     response = girder.get_HttpError_response(exc)
            #     message = response.get("message", str(exc))
            #     yield {"status": "WARNING", "message": message}

            # 7. Upload metadata
            yield {"status": "uploading metadata"}
            client.addMetadataToItem(item_rec["_id"], metadata_)

            yield {"status": "done"}

        except Exception as exc:
            if devel_debug:
                raise
            # Custom formatting for some exceptions we know how to extract a
            # user-meaningful message from
            message = str(exc)
            if isinstance(exc, girder.gcl.HttpError):
                response = girder.get_HttpError_response(exc)
                if "message" in response:
                    message = response["message"]
            uploaded_paths[str(path)]["errors"].append(message)
            yield {"status": "ERROR", "message": message}
        finally:
            process_paths.remove(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout

    from ..support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        dt = time.time() - t0
        total = sum(v["size"] for v in uploaded_paths.values())
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ("path", "size", "errors", "upload", "status", "message")
    out = pyout.Tabular(style=pyout_style, columns=rec_fields)

    with out:
        for path in paths:
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)
            process_paths.add(path)

            rec = {"path": path}
            path = Path(path)
            try:
                fullpath = path if path.is_absolute() else path.absolute()
                relpath = fullpath.relative_to(dandiset.path)
                rec["path"] = str(relpath)
                if devel_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(v)
                else:
                    rec[rec_fields[1:]] = process_path(path, relpath)
            except ValueError as exc:
                if "does not start with" in str(exc):
                    # if top_path is not the top path for the path
                    # Provide a more concise specific message without path details
                    rec.update(skip_file("must be a child of top path"))
                else:
                    rec.update(skip_file(exc))
            out(rec)
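# Hedged usage sketch (assumption, not from the original source): invoking the
# girder-based upload() above against an already registered and organized
# dandiset.  The dandiset path is a hypothetical placeholder; the remaining
# girder-specific development options are left at their defaults.
def _example_upload_call():  # illustration only
    upload(
        paths=[],  # empty -> consider all files under the dandiset
        existing="refresh",
        validation="require",
        dandiset_path="/data/dandisets/000000",  # hypothetical path
    )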
def download(
    urls,
    output_dir,
    *,
    format="pyout",
    existing="error",
    jobs=1,
    get_metadata=True,
    get_assets=True,
):
    # TODO: unduplicate with upload. For now stole from that one
    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout

    from .support import pyout as pyouts

    # dandi.cli.formatters are used in cmd_ls to provide switchable
    pyout_style = pyouts.get_style(hide_if_missing=False)

    rec_fields = ("path", "size", "done", "done%", "checksum", "status", "message")
    out = pyout.Tabular(style=pyout_style, columns=rec_fields, max_workers=jobs)

    out_helper = PYOUTHelper()
    pyout_style["done"] = pyout_style["size"].copy()
    pyout_style["size"]["aggregate"] = out_helper.agg_size
    pyout_style["done"]["aggregate"] = out_helper.agg_done

    # I thought I was making a beautiful flower but ended up with cacti
    # which never bloom... All because assets are looped through inside
    # download_generator
    # TODO: redo
    kw = dict(assets_it=out_helper.it)
    if jobs > 1 and format == "pyout":
        # It can handle downloads delegated to the generator
        kw["yield_generator_for_fields"] = rec_fields[1:]  # all but path

    gen_ = download_generator(
        urls,
        output_dir,
        existing=existing,
        get_metadata=get_metadata,
        get_assets=get_assets,
        **kw,
    )

    # TODOs:
    #  - redo frontends similarly to how command_ls did it
    #  - have a single loop with analysis of `rec` to check whether any file
    #    has failed to download.  If any did, an exception should probably be
    #    raised.  API discussion for the Python side of the API:
    #
    if format == "debug":
        for rec in gen_:
            print(rec)
            sys.stdout.flush()
    elif format == "pyout":
        with out:
            for rec in gen_:
                out(rec)
    else:
        raise ValueError(format)
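# Hedged usage sketch (assumption): driving download() with the pyout frontend
# and several workers.  The URL and output directory are hypothetical
# placeholders.
def _example_download_call():  # illustration only
    download(
        ["https://dandiarchive.org/dandiset/000000/draft"],  # hypothetical URL
        "downloads",
        format="pyout",
        existing="error",
        jobs=4,
    )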
def _new_upload(
    api_url,
    dandiset,
    paths,
    existing,
    validation,
    dandiset_path,
    allow_any_path,
    upload_dandiset_metadata,
    devel_debug,
):
    from .dandiapi import DandiAPIClient
    from .dandiset import APIDandiset
    from .support.digests import Digester

    client = DandiAPIClient(api_url)
    client.dandi_authenticate()

    dandiset = APIDandiset(dandiset.path)  # "cast" to a new API based dandiset

    ds_identifier = dandiset.identifier
    # this is a path not a girder id
    if not re.match(dandiset_identifier_regex, str(ds_identifier)):
        raise ValueError(
            f"Dandiset identifier {ds_identifier} does not follow expected "
            f"convention {dandiset_identifier_regex!r}. Use "
            f"'dandi register' to get a legit identifier"
        )

    from .metadata import nwb2asset
    from .pynwb_utils import ignore_benign_pynwb_warnings
    from .support.pyout import naturalsize
    from .utils import find_dandi_files, find_files, path_is_subpath
    from .validate import validate_file

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    #
    # Treat paths
    #
    if not paths:
        paths = [dandiset.path]

    # Expand and validate all paths -- they should reside within the dandiset
    paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths)
    paths = list(map(Path, paths))
    npaths = len(paths)
    lgr.info(f"Found {npaths} files to consider")
    for path in paths:
        if not (
            allow_any_path
            or path.name == dandiset_metadata_file
            or path.name.endswith(".nwb")
        ):
            raise NotImplementedError(
                f"ATM only .nwb and dandiset.yaml should be in the paths to upload. "
                f"Got {path}"
            )
        if not path_is_subpath(str(path.absolute()), dandiset.path):
            raise ValueError(f"{path} is not under {dandiset.path}")

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    # https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()

    from collections import defaultdict

    uploaded_paths = defaultdict(lambda: {"size": 0, "errors": []})

    def skip_file(msg):
        return {"status": "skipped", "message": str(msg)}

    # TODO: we might want to always yield a full record so that a missing
    # field does not cause pyout to halt
    def process_path(path, relpath):
        """

        Parameters
        ----------
        path: Path
          Non-pure (OS-specific) Path
        relpath:
          For location on the server.  Will be cast to PurePosixPath

        Yields
        ------
        dict
          Records for pyout
        """
        # Ensure consistent types
        path = Path(path)
        relpath = PurePosixPath(relpath)
        try:
            try:
                path_stat = path.stat()
                yield {"size": path_stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            #
            # Compute checksums and possible other digests (e.g. for s3, ipfs - TODO)
            #
            yield {"status": "digesting"}
            try:
                # TODO: in theory we could also cache the result, but since it is
                # critical to get correct checksums, safer to just do it all the time.
                # Should typically be faster than upload itself ;-)
                digester = Digester(["sha256"])
                sha256_digest = digester(path)["sha256"]
            except Exception as exc:
                yield skip_file("failed to compute digests: %s" % str(exc))
                return

            extant = client.get_asset_bypath(ds_identifier, "draft", relpath)
            if extant is not None and extant["sha256"] == sha256_digest:
                if existing == "error":
                    # as promised -- not gentle at all!
                    raise FileExistsError("file exists")
                if existing == "skip":
                    yield skip_file("file exists")
                    return
                # Logic below only for overwrite and reupload
                if existing == "overwrite":
                    if extant["sha256"] == sha256_digest:
                        yield skip_file("file exists")
                        return
                elif existing == "refresh":
                    pass
                elif existing == "force":
                    pass
                else:
                    raise ValueError("existing")

            #
            # Validate first, so we do not bother the server at all if not kosher
            #
            # TODO: enable back validation of dandiset.yaml
            if path.name != dandiset_metadata_file and validation != "skip":
                yield {"status": "validating"}
                validation_errors = validate_file(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    yield {"status": "validated"}
            else:
                # yielding empty causes pyout to get stuck or crash
                # https://github.com/pyout/pyout/issues/91
                # yield {"errors": '',}
                pass

            #
            # Special handling for dandiset.yaml
            # Yarik hates it but that is life for now. TODO
            #
            if path.name == dandiset_metadata_file:
                # TODO This is a temporary measure to avoid breaking web UI
                # dandiset metadata schema assumptions.  All edits should happen
                # online.
                if upload_dandiset_metadata:
                    yield {"status": "updating metadata"}
                    client.set_dandiset_metadata(
                        dandiset.identifier, metadata=dandiset.metadata
                    )
                    yield {"status": "updated metadata"}
                else:
                    yield skip_file("should be edited online")
                return

            #
            # Extract metadata - delayed since it takes time, but it is done before
            # the actual upload, so we could skip if this fails
            #
            # Extract metadata before actual upload and skip if it fails
            # TODO: allow for non-nwb files to skip this step
            # ad-hoc for dandiset.yaml for now
            yield {"status": "extracting metadata"}
            try:
                asset_metadata = nwb2asset(
                    path, digest=sha256_digest, digest_type="SHA256"
                )
            except Exception as exc:
                if allow_any_path:
                    yield {"status": "failed to extract metadata"}
                    metadata = {
                        "contentSize": os.path.getsize(path),
                        "digest": sha256_digest,
                        "digest_type": "SHA256",
                        # "encodingFormat": # TODO
                    }
                else:
                    yield skip_file("failed to extract metadata: %s" % str(exc))
                    return
            else:
                # We need to convert to a `dict` this way instead of with
                # `.dict()` so that enums will be converted to strings.
                metadata = json.loads(
                    asset_metadata.json(exclude_unset=True, exclude_none=True)
                )

            #
            # Upload file
            #
            yield {"status": "uploading"}
            for r in client.iter_upload(
                ds_identifier, "draft", str(relpath), metadata, str(path)
            ):
                if r["status"] == "uploading":
                    uploaded_paths[str(path)]["size"] = r["current"]
                yield r

            yield {"status": "done"}

        except Exception as exc:
            if devel_debug:
                raise
            # Custom formatting for some exceptions we know how to extract a
            # user-meaningful message from
            message = str(exc)
            uploaded_paths[str(path)]["errors"].append(message)
            yield {"status": "ERROR", "message": message}
        finally:
            process_paths.remove(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout

    from .support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        dt = time.time() - t0
        total = sum(v["size"] for v in uploaded_paths.values())
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ["path", "size", "errors", "upload", "status", "message"]
    out = pyout.Tabular(style=pyout_style, columns=rec_fields)

    with out, client.session():
        for path in paths:
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)

            rec = {"path": str(path)}
            process_paths.add(str(path))
            try:
                relpath = path.absolute().relative_to(dandiset.path)
                rec["path"] = str(relpath)
                if devel_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(str(v), flush=True)
                else:
                    rec[tuple(rec_fields[1:])] = process_path(path, relpath)
            except ValueError as exc:
                if "does not start with" in str(exc):
                    # if top_path is not the top path for the path
                    # Provide a more concise specific message without path details
                    rec.update(skip_file("must be a child of top path"))
                else:
                    rec.update(skip_file(exc))
            out(rec)
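# Hedged usage sketch (assumption): _new_upload() expects an API URL and a
# Dandiset-like object exposing `.path` and `.identifier`.  The endpoint below
# is a hypothetical placeholder; pass a real Dandiset instance when calling.
def _example_new_upload_call(dandiset):  # illustration only
    _new_upload(
        api_url="https://api.dandiarchive.org/api",  # hypothetical endpoint
        dandiset=dandiset,
        paths=[],  # empty -> consider all files under the dandiset
        existing="refresh",
        validation="require",
        dandiset_path=None,
        allow_any_path=False,
        upload_dandiset_metadata=False,
        devel_debug=False,
    )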