def push(
    self,
    files_to_upload: List[str],
    blocking: bool = True,
    multi_threaded: bool = True,
    fps: int = 1,
    as_frames: bool = False,
    files_to_exclude: Optional[List[str]] = None,
    resume: bool = False,
    path: Optional[str] = None,
):
    """Uploads a local dataset (images ONLY) in the datasets directory.

    Parameters
    ----------
    files_to_upload : list[str]
        List of files to upload. It can be a folder.
    blocking : bool
        If False, the dataset is not uploaded and a generator function is returned instead.
    multi_threaded : bool
        Uses multiprocessing to upload the dataset in parallel.
        If blocking is False this has no effect.
    fps : int
        Number of files per second to upload.
    as_frames : bool
        Annotate as video.
    files_to_exclude : list[str]
        List of files to exclude from the file scan (which is done only if files is None).
        This list is never mutated by this method.
    resume : bool
        Flag for signalling the resuming of a push.
    path : str
        Optional path to put the files into.

    Returns
    -------
    generator : function
        Generator for doing the actual uploads. This is None if blocking is True.
    count : int
        The files count.

    Raises
    ------
    NotFound
        If ``files_to_upload`` is None, or if ``resume`` is True but no previous
        upload-responses file exists.
    ValueError
        If the scan + exclusion filters leave nothing to upload.
    """
    # Remote paths need to be rooted at "/"
    if path and path[0] != "/":
        path = f"/{path}"

    # This is where the responses from the upload function are saved/loaded for resume
    self.local_path.parent.mkdir(exist_ok=True)
    responses_path = self.local_path.parent / ".upload_responses.json"

    # Init optional parameters. Work on a COPY of the exclusion list so the
    # caller's list is never mutated by the resume logic below (the original
    # code extended the caller-supplied list in place).
    files_to_exclude = [] if files_to_exclude is None else list(files_to_exclude)
    if files_to_upload is None:
        raise NotFound("Dataset location not found. Check your path.")

    logged_responses = []
    if resume:
        if not responses_path.exists():
            # Bug fix: the previous message ("Dataset location not found. Check
            # your path.") was copy-pasted and misleading here -- what is
            # actually missing is the saved upload-responses file.
            raise NotFound(f"Upload responses not found at {responses_path}. Cannot resume.")
        with responses_path.open() as f:
            logged_responses = json.load(f)
        # Skip every file whose previous upload already succeeded (2xx status)
        files_to_exclude.extend(
            response["file_path"]
            for response in logged_responses
            if response["s3_response_status_code"].startswith("2")
        )

    files_to_upload = find_files(files=files_to_upload, recursive=True, files_to_exclude=files_to_exclude)
    if not files_to_upload:
        raise ValueError("No files to upload, check your path, exclusion filters and resume flag")

    progress, count = add_files_to_dataset(
        client=self.client,
        dataset_id=str(self.dataset_id),
        filenames=files_to_upload,
        fps=fps,
        as_frames=as_frames,
        team=self.team,
        path=path,
    )

    # If blocking is selected, upload the dataset remotely
    if blocking:
        responses = exhaust_generator(progress=progress, count=count, multi_threaded=multi_threaded)
        # Log responses to file so a later push with resume=True can skip them
        if responses:
            responses = [{k: str(v) for k, v in response.items()} for response in responses]
            if resume:
                responses.extend(logged_responses)
            with responses_path.open("w") as f:
                json.dump(responses, f)
        return None, count
    else:
        return progress, count
def upload_annotations(
    client: "Client",
    team: str,
    image_mapping: Path,
    annotations_path: Path,
    class_mapping: Path,
    dataset_id: Optional[int] = None,
    multi_threaded: bool = True,
):
    """Experimental feature to upload annotations from the front end.

    Parameters
    ----------
    client : Client
        Client authenticated to the team where the put request will be made.
    team : str
        Team against which the client will make the requests.
    image_mapping : Path
        Path to the json file which contains the mapping between `original file names`
        and `dataset image id` which are required in the put request to compose the endpoint.
    annotations_path : Path
        Path to the folder which contains all the json files representing the annotations to add.
    class_mapping : Path
        Path to the json file which contains the mapping between `class name` and `class id`
        which is required in the put request to compose the payload. If not provided,
        new classes will be created.
    dataset_id : int
        Dataset ID where to upload the annotations. This is required if class_mapping is None
        or if a class present in the annotations is missing on Darwin.
    multi_threaded : bool
        Uses multiprocessing to upload the dataset in parallel.

    Notes
    -----
    This function is experimental and the json files `image_mapping` and `class_mapping`
    can actually only be retrieved from the backend at the moment.
    """
    # This is where the responses from the upload function will be saved/loaded for resume
    responses_path = image_mapping.parent / "upload_responses.json"
    # CSV log of the individual put requests, written by _upload_annotation
    output_file_path = image_mapping.parent / "log_requests.csv"

    # Read and prepare the image id mappings in a dict format {'original filename': 'image id'}
    # (the Path parameter is rebound to the parsed dict from here on)
    with image_mapping.open() as json_file:
        image_mapping = {
            cm["original_filename"]: cm["id"]
            for cm in json.load(json_file)
        }

    # Read and prepare the class mappings in a dict format {'class name': 'class id'}
    # (likewise rebound from Path to dict)
    if class_mapping is not None:
        with class_mapping.open() as json_file:
            class_mapping = {
                cm["name"]: cm["id"]
                for cm in json.load(json_file)
            }
    else:
        class_mapping = {}

    # Resume
    # NOTE(review): images_id is never populated anywhere in this function, so
    # both "Skip if already present" checks below can never trigger. Presumably
    # resume logic that filled this set from responses_path is missing or was
    # removed -- confirm before relying on resume behaviour here.
    images_id = set()

    # First pass: check that all the classes exist, creating missing ones on
    # Darwin when a dataset_id is available.
    for f in annotations_path.glob("*.json"):
        with f.open() as json_file:
            # Read the annotation json file
            data = json.load(json_file)
            image_dataset_id = image_mapping[data["image"]["original_filename"]]
            # Skip if already present
            if image_dataset_id in images_id:
                continue
            for annotation in data["annotations"]:
                # If the class is missing, create a new class on Darwin and update the mapping
                if not annotation["name"] in class_mapping:
                    if dataset_id is not None:
                        new_class = create_new_class(
                            client=client,
                            team=team,
                            annotation_type_ids=[
                                "3"
                            ],  # TODO maybe in the future allow to use polygons and BB as well
                            cropped_image={
                                "image_id": image_dataset_id,
                                "scale": 0.01,
                                "x": "0",
                                "y": "0"
                            },
                            dataset_id=dataset_id,
                            description="",
                            expected_occurrences=[0, 1],
                            metadata=None,
                            name=annotation["name"],
                        )
                        class_mapping[new_class["name"]] = new_class["id"]
                    else:
                        raise ValueError(
                            "Dataset ID is None and a class is missing on Darwin"
                            " (or in the provided mapping).")

    # Second pass: for each annotation found in the folder, queue a request
    files_to_upload = []
    for f in annotations_path.glob("*.json"):
        with f.open() as json_file:
            # Read the annotation json file
            data = json.load(json_file)
            image_dataset_id = image_mapping[data["image"]["original_filename"]]
            # Skip if already present
            if image_dataset_id in images_id:
                continue
            files_to_upload.append({
                "data": data,
                "image_dataset_id": image_dataset_id
            })

    # Lazily build one partial per queued annotation; exhaust_generator drives them
    generator = (functools.partial(
        _upload_annotation,
        class_mapping=class_mapping,
        client=client,
        team=team,
        data=element["data"],
        image_dataset_id=element["image_dataset_id"],
        output_file_path=output_file_path,
    ) for element in files_to_upload)
    responses = exhaust_generator(progress=generator,
                                  count=len(files_to_upload),
                                  multi_threaded=multi_threaded)

    # Log responses to file
    if responses:
        components_labels = ["payload", "response"]
        # NOTE(review): this comprehension builds a SINGLE dict, so the
        # "payload"/"response" keys are overwritten on every iteration and only
        # the last response survives in the log. A list with one dict per
        # response was probably intended -- verify before relying on this file.
        responses = [{
            component_label: {k: str(v) for k, v in component.items()}
            for response in responses
            for component, component_label in zip(response, components_labels)
        }]
        with responses_path.open("w") as f:
            json.dump(responses, f)
def pull(
    self,
    *,
    release: Optional[Release] = None,
    blocking: bool = True,
    multi_threaded: bool = True,
    only_annotations: bool = False,
    force_replace: bool = False,
    remove_extra: bool = False,
    subset_filter_annotations_function: Optional[Callable] = None,
    subset_folder_name: Optional[str] = None,
    use_folders: bool = False,
    video_frames: Optional[bool] = False,
):
    """Downloads a remote project (images and annotations) in the datasets directory.

    Parameters
    ----------
    release : Release
        The release to pull. Defaults to the latest release (``self.get_release()``).
    blocking : bool
        If False, the dataset is not downloaded and a generator function is returned instead.
    multi_threaded : bool
        Uses multiprocessing to download the dataset in parallel.
        If blocking is False this has no effect.
    only_annotations : bool
        Download only the annotations and no corresponding images.
    force_replace : bool
        Forces the re-download of an existing image.
    remove_extra : bool
        Removes existing images for which there is no corresponding annotation.
    subset_filter_annotations_function : Callable
        This function receives the directory where the annotations are downloaded and can
        perform any operation on them, i.e. filtering them with custom rules or else.
        If it needs to receive other parameters it is advised to use functools.partial().
    subset_folder_name : str
        Name of the folder with the subset of the dataset. If not provided a timestamp is used.
    use_folders : bool
        Recreates folders from the dataset.
    video_frames : bool
        Pulls video frames images instead of video files.

    Returns
    -------
    generator : function
        Generator for doing the actual downloads. This is None if blocking is True.
    count : int
        The files count.

    Raises
    ------
    UnsupportedExportFormat
        If the release is not exported in json format.
    """
    if release is None:
        release = self.get_release()

    # Only json exports can be parsed below
    if release.format != "json":
        raise UnsupportedExportFormat(release.format)

    release_dir = self.local_releases_path / release.name
    release_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_dir = Path(tmp_dir)
        # Download the release from Darwin
        zip_file_path = release.download_zip(tmp_dir / "dataset.zip")
        with zipfile.ZipFile(zip_file_path) as z:
            # Extract annotations
            z.extractall(tmp_dir)
            # If a filtering function is provided, apply it; a subset gets its
            # own (possibly timestamped) folder under the release directory
            if subset_filter_annotations_function is not None:
                subset_filter_annotations_function(tmp_dir)
                if subset_folder_name is None:
                    # NOTE(review): this format contains "/" (nested dirs) and
                    # ":" (invalid on Windows) -- kept for backward
                    # compatibility, but consider a filesystem-safe format.
                    subset_folder_name = datetime.now().strftime("%m/%d/%Y_%H:%M:%S")
            annotations_dir = release_dir / (subset_folder_name or "") / "annotations"
            # Remove existing annotations if necessary
            if annotations_dir.exists():
                try:
                    shutil.rmtree(annotations_dir)
                except PermissionError:
                    print(f"Could not remove dataset in {annotations_dir}. Permission denied.")
            annotations_dir.mkdir(parents=True, exist_ok=False)
            # Move the annotations into the right folder and rename them to have the image
            # original filename as contained in the json
            for annotation_path in tmp_dir.glob("*.json"):
                with annotation_path.open() as file:
                    annotation = json.load(file)
                filename = Path(annotation["image"]["filename"]).stem
                # Bug fix: the destination previously used a literal
                # "(unknown)" placeholder instead of the computed filename, so
                # every annotation overwrote the same file.
                destination_name = annotations_dir / f"{filename}{annotation_path.suffix}"
                shutil.move(str(annotation_path), str(destination_name))

    # Extract the list of classes and create the text files
    make_class_lists(release_dir)

    # Keep the "latest" symlink pointing at the newest release
    if release.latest:
        latest_dir = self.local_releases_path / "latest"
        if latest_dir.is_symlink():
            latest_dir.unlink()
        latest_dir.symlink_to(f"./{release_dir.name}")

    if only_annotations:
        # No images will be downloaded
        return None, 0

    team_config = self.client.config.get_team(self.team)
    api_key = team_config.get("api_key")

    # Create the generator with the download instructions
    progress, count = download_all_images_from_annotations(
        api_key=api_key,
        api_url=self.client.url,
        annotations_path=annotations_dir,
        images_path=self.local_images_path,
        force_replace=force_replace,
        remove_extra=remove_extra,
        use_folders=use_folders,
        video_frames=video_frames,
    )
    if count == 0:
        return None, count

    # If blocking is selected, download the dataset on the file system
    if blocking:
        exhaust_generator(progress=progress(), count=count, multi_threaded=multi_threaded)
        return None, count
    else:
        return progress, count