import os
import pprint
import time
from pathlib import Path
from typing import Iterator, List, Optional
from zipfile import ZipFile

import numpy as np
import requests

# darwin-py import paths; these may differ slightly between library versions.
import darwin.importer as importer
import darwin.importer.formats
from darwin.client import Client
from darwin.config import Config
from darwin.dataset.identifier import DatasetIdentifier
from darwin.exceptions import InvalidLogin, MissingConfig, Unauthenticated
from darwin.item import DatasetItem

# API_KEY, TEAM_SLUG and HEADERS, as well as the mktmpdir, prompt,
# validate_api_key, persist_client_configuration and _error helpers, are
# assumed to be defined elsewhere in this module/package.


def get_dataset_files(dataset_slug: str) -> Iterator[DatasetItem]:
    """ Return the files in a dataset along with their status """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    resp = dataset.fetch_remote_files()
    return resp
def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """ Add labels to a dataset """
    assert label_type in ['polygon', 'tag']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    for label in labels:
        dataset.create_annotation_class(label, label_type)
def populate_dataset_annotations(dataset_slug: str, format_name: str, file_paths: List[str]):
    """ Import annotations from local files into a dataset """
    assert format_name in ['darwin', 'coco', 'pascal_voc']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    format_dict = {k: v for (k, v) in darwin.importer.formats.supported_formats}
    parser = format_dict[format_name]
    importer.import_annotations(dataset, parser, file_paths)
def create_dataset(dataset_slug):
    """ Create a new empty dataset """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.create_dataset(name=identifier.dataset_slug)
    dataset_info = dict(
        name=dataset.name,
        id=dataset.dataset_id,
        slug=dataset.slug,
        remote_path=dataset.remote_path
    )
    return dataset_info
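# Minimal sketch of chaining the helpers above; the slug, class names and
# annotation path are placeholders, not values from this project.
def _example_dataset_setup():
    info = create_dataset('my-dataset')
    add_labels_to_dataset(info['slug'], ['cat', 'dog'], 'polygon')
    populate_dataset_annotations(info['slug'], 'coco', ['annos/instances.json'])
    for item in get_dataset_files(info['slug']):
        print(item.filename, item.status)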
def authenticate(api_key: str, default_team: Optional[bool] = None, datasets_dir: Optional[Path] = None) -> Config:
    """
    Authenticate the API key against the server and create a configuration file for it.

    Parameters
    ----------
    api_key : str
        API key to use for the client login.
    default_team : Optional[bool]
        Flag to make the team the default one. Defaults to None.
    datasets_dir : Optional[Path]
        Dataset directory on the file system. Defaults to None.

    Returns
    -------
    Config
        A configuration object to handle YAML files.
    """
    validate_api_key(api_key)

    try:
        client = Client.from_api_key(api_key=api_key)
        config_path = Path.home() / ".darwin" / "config.yaml"
        config_path.parent.mkdir(exist_ok=True)

        if default_team is None:
            default_team = input(f"Make {client.default_team} the default team? [y/N] ") in ["Y", "y"]
        if datasets_dir is None:
            datasets_dir = Path(prompt("Datasets directory", "~/.darwin/datasets"))

        # Resolve the home folder if the datasets_dir starts with ~ or ~user
        datasets_dir = Path(datasets_dir).expanduser()
        Path(datasets_dir).mkdir(parents=True, exist_ok=True)

        client.set_datasets_dir(datasets_dir)

        default_team_name: Optional[str] = client.default_team if default_team else None
        return persist_client_configuration(client, default_team=default_team_name)
    except InvalidLogin:
        _error("Invalid API key")
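# Hedged usage sketch: passing both optional arguments up front skips the
# interactive prompts inside authenticate(); the directory is a placeholder.
def _example_authenticate():
    return authenticate(API_KEY, default_team=True, datasets_dir=Path('~/.darwin/datasets'))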
def _populate_dataset(dataset_slug, items):
    """ Register items with a dataset via the external_data endpoint, in batches """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # Split the items into at most 100 batches
    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {
            'files': batch
        }
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(
            f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data',
            headers=HEADERS,
            json=payload)
        response.raise_for_status()
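# Sketch only: the item fields expected by the external_data endpoint depend on
# the team's storage configuration; 'key' and 'filename' below are assumptions,
# not confirmed field names.
def _example_register_external_items():
    items = [
        {'key': 'images/0001.jpg', 'filename': '0001.jpg'},  # hypothetical storage key / display name
        {'key': 'images/0002.jpg', 'filename': '0002.jpg'},
    ]
    _populate_dataset('my-dataset', items)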
def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
) -> Client:
    """Fetches a client, potentially offline

    Parameters
    ----------
    team_slug : Optional[str]
        Team slug to load the configuration for. Defaults to None.
    offline : bool
        Flag for using an offline client
    maybe_guest : bool
        Flag to make a guest client, if config is missing
    dataset_identifier : Optional[str]
        Dataset identifier used to infer the team slug when team_slug is not given.

    Returns
    -------
    Client
        The client requested
    """
    if not team_slug and dataset_identifier:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug
    try:
        api_key = os.getenv("DARWIN_API_KEY")
        if api_key:
            client = Client.from_api_key(api_key)
        else:
            config_dir = Path.home() / ".darwin" / "config.yaml"
            client = Client.from_config(config_dir, team_slug=team_slug)
        return client
    except MissingConfig:
        if maybe_guest:
            return Client.from_guest()
        else:
            _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")
def get_annotations(dataset_slug, anno_dest_dir='annos', *, clear_directory=False, verbose=False):
    """ Get all annotations for a dataset

    dataset_slug: slug of the dataset to retrieve annotations for
    anno_dest_dir: directory to store the annotation files
    clear_directory: delete all existing files in the target directory if any exist
        (if False, raise an error when files exist)
    verbose: log API responses
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    filters = {'statuses': 'review,complete'}
    ids = [file.id for file in dataset.fetch_remote_files(filters)]

    # darwin-py doesn't support dataset_item_ids
    # uses also /datasets/{self.dataset_id}/exports
    # dataset.export(annotation_class_ids=annotation_class_ids, name=name, include_url_token=include_url_token)
    export_name = 'export_tmp'
    print(ids)
    payload = dict(
        format='json',
        name=export_name,
        include_authorship=True,
        include_export_token=True,
        dataset_item_ids=ids
    )

    print('Creating export...')
    response_create = requests.post(
        f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports',
        headers=HEADERS,
        json=payload)
    response_create.raise_for_status()
    if verbose:
        pprint.pprint(['create_export', response_create.json()])

    def get_export(timeout=60):
        waiting_for_export = True
        timeout_stop = time.time() + timeout
        while waiting_for_export:
            response_retrieve = requests.get(
                f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports',
                headers=HEADERS)
            if verbose:
                pprint.pprint(['get_export', response_retrieve.json()])
            response_retrieve.raise_for_status()
            exports = list(filter(lambda x: x['name'] == export_name, response_retrieve.json()))
            if len(exports) == 1 and exports[0]['latest']:
                return exports[0]
            else:
                if time.time() > timeout_stop:
                    raise RuntimeError('Timeout whilst waiting for export to complete')
                time.sleep(0.5)
                if verbose:
                    print('trying again...')

    try:
        print('Waiting for export to complete...')
        export = get_export()

        # download export data
        # (this is also available through dataset.annotations as a single dict? maybe deprecated?)
        print('Downloading annotations...')
        with requests.get(export['download_url'], stream=True) as r:
            r.raise_for_status()
            if verbose:
                pprint.pprint(['download_annos', r.status_code])
            with mktmpdir() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, 'export.zip')
                with open(tmp_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

                if os.path.exists(anno_dest_dir):
                    anno_files = os.listdir(anno_dest_dir)
                    if len(anno_files) > 0:
                        if clear_directory:
                            for file in anno_files:
                                os.remove(os.path.join(anno_dest_dir, file))
                        else:
                            raise RuntimeError('Directory already exists and contains files!')
                else:
                    os.makedirs(anno_dest_dir)

                with ZipFile(tmp_file, 'r') as f:
                    f.extractall(anno_dest_dir)
                anno_paths = [os.path.join(anno_dest_dir, x) for x in os.listdir(anno_dest_dir)]
    except Exception as e:
        response_delete = requests.delete(
            f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}',
            headers=HEADERS)
        response_delete.raise_for_status()
        if verbose:
            pprint.pprint(['delete_export', response_delete.status_code])
        raise e

    print('Export completed, cleaning up...')
    response_delete = requests.delete(
        f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}',
        headers=HEADERS)
    response_delete.raise_for_status()

    del export['download_url']
    export['annotation_paths'] = anno_paths
    return export
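# Sketch of pulling reviewed/complete annotations into a local directory using
# the helper above; the slug and destination directory are placeholders.
def _example_download_annotations():
    export = get_annotations('my-dataset', anno_dest_dir='annos', clear_directory=True, verbose=False)
    for path in export['annotation_paths']:
        print(path)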