Example 1
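These snippets come from a larger script and share module-level setup that isn't shown. Below is a minimal sketch of the assumed imports and constants: the API_KEY and TEAM_SLUG values are placeholders, the DatasetItem import path varies across darwin-py versions, and the undefined mktmpdir() helper is assumed to behave like tempfile.TemporaryDirectory. Examples 5 and 7 additionally rely on darwin-py CLI internals (validate_api_key, persist_client_configuration, prompt, _error, Config, and the exceptions InvalidLogin, MissingConfig, Unauthenticated) that are not reproduced here.

import os
import pprint
import tempfile
import time
from pathlib import Path
from typing import Iterator, List, Optional
from zipfile import ZipFile

import numpy as np
import requests

import darwin.importer.formats
from darwin import importer
from darwin.client import Client
from darwin.dataset.identifier import DatasetIdentifier
from darwin.item import DatasetItem  # module path may differ across darwin-py versions

API_KEY = 'REPLACE_ME'  # placeholder: your Darwin API key
TEAM_SLUG = 'my-team'   # placeholder: your team slug
# Raw requests to Darwin's REST API authenticate with an "ApiKey" Authorization header
HEADERS = {'Authorization': f'ApiKey {API_KEY}', 'Content-Type': 'application/json'}

# Assumption: mktmpdir is a context manager yielding a temporary directory path
mktmpdir = tempfile.TemporaryDirectory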
def get_dataset_files(dataset_slug: str) -> Iterator[DatasetItem]:
    """ Return the files in a dataset along with their status """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    resp = dataset.fetch_remote_files()

    return resp
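A possible call site (the slug is hypothetical). fetch_remote_files returns items lazily, and DatasetItem exposes fields such as filename and status:

for item in get_dataset_files('my-team/my-dataset'):
    print(item.filename, item.status)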
Example 2
def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """ Add labels to a dataset """
    assert label_type in ['polygon', 'tag']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    for label in labels:
        dataset.create_annotation_class(label, label_type)
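For example, to register two polygon classes (hypothetical slug and label names):

add_labels_to_dataset('my-team/my-dataset', ['cat', 'dog'], 'polygon')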
Example 3
def populate_dataset_annotations(dataset_slug: str, format_name: str, file_paths: List[str]):
    """ Import annotation files into a dataset using one of the supported formats """
    assert format_name in ['darwin', 'coco', 'pascal_voc']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # supported_formats is a list of (name, parser) pairs
    format_dict = {k: v for (k, v) in darwin.importer.formats.supported_formats}
    parser = format_dict[format_name]

    importer.import_annotations(dataset, parser, file_paths)
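For example, importing a COCO annotation file from a hypothetical path:

populate_dataset_annotations('my-team/my-dataset', 'coco', ['annotations/instances.json'])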
Example 4
def create_dataset(dataset_slug):
    """ Create new empty dataset """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.create_dataset(name=identifier.dataset_slug)

    dataset_info = dict(
        name=dataset.name,
        id=dataset.dataset_id,
        slug=dataset.slug,
        remote_path=dataset.remote_path
    )
    return dataset_info
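Typical use (hypothetical slug); the returned dict summarises the new dataset:

info = create_dataset('my-team/new-dataset')
print(info['id'], info['slug'])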
Example 5
def authenticate(api_key: str,
                 default_team: Optional[bool] = None,
                 datasets_dir: Optional[Path] = None) -> Config:
    """
    Authenticate the API key against the server and create a configuration file for it.

    Parameters
    ----------
    api_key : str
        API key to use for the client login.
    default_team: Optional[bool]
        Flag to make the team the default one. Defaults to None.
    datasets_dir: Optional[Path]
        Dataset directory on the file system. Defaults to None.

    Returns
    -------
    Config
        A configuration object to handle YAML files.
    """
    validate_api_key(api_key)

    try:
        client = Client.from_api_key(api_key=api_key)
        config_path = Path.home() / ".darwin" / "config.yaml"
        config_path.parent.mkdir(exist_ok=True)

        if default_team is None:
            default_team = input(
                f"Make {client.default_team} the default team? [y/N] ") in [
                    "Y", "y"
                ]
        if datasets_dir is None:
            datasets_dir = Path(
                prompt("Datasets directory", "~/.darwin/datasets"))

        # Resolve the home folder if datasets_dir starts with ~ or ~user
        datasets_dir = Path(datasets_dir).expanduser()
        Path(datasets_dir).mkdir(parents=True, exist_ok=True)

        client.set_datasets_dir(datasets_dir)

        default_team_name: Optional[
            str] = client.default_team if default_team else None
        return persist_client_configuration(client,
                                            default_team=default_team_name)

    except InvalidLogin:
        _error("Invalid API key")
Example 6
def _populate_dataset(dataset_slug, items):
    """ Register external data items with a dataset, in batches """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # Split the items into at most 100 roughly equal-sized batches
    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {
            'files': batch
        }
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data', headers=HEADERS,
                                json=payload)

        response.raise_for_status()
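A sketch of a call. The item dicts below are illustrative only: the exact schema expected by the external_data endpoint is defined by Darwin's API, not by this snippet.

items = [
    {'key': 'images/0001.jpg'},  # hypothetical field names; check the API docs
    {'key': 'images/0002.jpg'},
]
_populate_dataset('my-team/my-dataset', items)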
Example 7
def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
) -> Client:
    """Fetches a client, potentially offline

    Parameters
    ----------
    offline : bool
        Flag for using an offline client

    maybe_guest : bool
        Flag to make a guest client, if config is missing
    Returns
    -------
    Client
    The client requested
    """
    if not team_slug and dataset_identifier:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug
    try:
        api_key = os.getenv("DARWIN_API_KEY")
        if api_key:
            client = Client.from_api_key(api_key)
        else:
            config_path = Path.home() / ".darwin" / "config.yaml"
            client = Client.from_config(config_path, team_slug=team_slug)
        return client
    except MissingConfig:
        if maybe_guest:
            return Client.from_guest()
        else:
            _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")
Example 8
def get_annotations(dataset_slug, anno_dest_dir='annos', *, clear_directory=False, verbose=False):
    """ Get all annotations for a dataset

    dataset_slug: slug of the dataset to retrieve annotations for
    anno_dest_dir: directory to store the annotation files
    clear_directory: delete all existing files in the target directory if any exist (if False, raise an error instead)
    verbose: log API responses
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    filters = {'statuses': 'review,complete'}
    ids = [file.id for file in dataset.fetch_remote_files(filters)]

    # darwin-py's dataset.export() doesn't support filtering by dataset_item_ids
    # (it also posts to /datasets/{dataset_id}/exports rather than the team-scoped route),
    # so the export is created with a raw API call below. For reference:
    # dataset.export(annotation_class_ids=annotation_class_ids, name=name, include_url_token=include_url_token)

    export_name = 'export_tmp'
    if verbose:
        print(ids)

    payload = dict(
        format='json',
        name=export_name,
        include_authorship=True,
        include_export_token=True,
        dataset_item_ids=ids
    )

    print('Creating export...')
    response_create = requests.post(f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports',
                                    headers=HEADERS,
                                    json=payload)
    response_create.raise_for_status()
    if verbose:
        pprint.pprint(['create_export', response_create.json()])

    def get_export(timeout=60):
        timeout_stop = time.time() + timeout
        while True:
            response_retrieve = requests.get(f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports', headers=HEADERS)
            response_retrieve.raise_for_status()
            if verbose:
                pprint.pprint(['get_export', response_retrieve.json()])
            exports = [x for x in response_retrieve.json() if x['name'] == export_name]
            if len(exports) == 1 and exports[0]['latest']:
                return exports[0]
            if time.time() > timeout_stop:
                raise RuntimeError('Timeout whilst waiting for export to complete')
            time.sleep(0.5)
            if verbose:
                print('trying again...')

    try:
        print('Waiting for export to complete...')
        export = get_export()

        # download export data
        # (this is also available through dataset.annotations as a single dict? maybe deprecated?)
        print('Downloading annotations...')
        with requests.get(export['download_url'], stream=True) as r:
            r.raise_for_status()
            if verbose:
                pprint.pprint(['download_annos', r.status_code])
            with mktmpdir() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, 'export.zip')
                with open(tmp_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                if os.path.exists(anno_dest_dir):
                    anno_files = os.listdir(anno_dest_dir)
                    if len(anno_files) > 0:
                        if clear_directory:
                            for file in anno_files:
                                os.remove(os.path.join(anno_dest_dir, file))
                        else:
                            raise RuntimeError('Directory already exists and contains files!')
                else:
                    os.makedirs(anno_dest_dir)
                with ZipFile(tmp_file, 'r') as f:
                    f.extractall(anno_dest_dir)
                anno_paths = [os.path.join(anno_dest_dir, x) for x in os.listdir(anno_dest_dir)]
    except Exception:
        response_delete = requests.delete(f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}', headers=HEADERS)
        response_delete.raise_for_status()
        if verbose:
            pprint.pprint(['delete_export', response_delete.status_code])
        raise

    print('Export completed, cleaning up...')
    response_delete = requests.delete(f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}', headers=HEADERS)
    response_delete.raise_for_status()

    del export['download_url']
    export['annotation_paths'] = anno_paths
    return export
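A sketch of a call. Note that dataset_slug is used here as the bare slug interpolated into team-scoped URLs alongside TEAM_SLUG, rather than a 'team/dataset' identifier:

export = get_annotations('my-dataset', anno_dest_dir='annos',
                         clear_directory=True, verbose=True)
print(export['annotation_paths'])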