Example 1
def main(organization_id, dataset_name, max_workers):
    coco_dataset_train = COCOBobxDatasetMod(
        split='train', year='2017', use_crowded=False,
    )
    coco_dataset_val = COCOBobxDatasetMod(
        split='val', year='2017', use_crowded=False,
    )
    categories = coco_dataset_val.categories
    props = build_props(categories)

    datalake_client = DatalakeClient(organization_id=organization_id, credential=credential)
    dataset_client = DatasetClient(organization_id=organization_id, credential=credential)

    description = f'MS-COCO detection created with dataset: {dataset_name}'
    channel = datalake_client.channels.create('', description,
                                              StorageType.DATALAKE.value)
    print(f'channel is created: {channel.channel_id}')

    print('upload train dataset...')
    dataset_id_train = create_dataset(dataset_client, dataset_name + '-train', props=props,
                                      dataset_type='detection', override=True)
    dataset_train = dataset_client.get_dataset(dataset_id_train)
    upload_coco_dataset(coco_dataset_train, channel, dataset_train.dataset_items, max_workers)

    print('upload val dataset...')
    dataset_id_val = create_dataset(dataset_client, dataset_name + '-val', props=props,
                                    dataset_type='detection', override=True)
    dataset_val = dataset_client.get_dataset(dataset_id_val)
    upload_coco_dataset(coco_dataset_val, channel, dataset_val.dataset_items, max_workers)
Example 2
def get_dataset_item_ids(dataset_ids: List[str]) -> List[DatasetItemId]:
    """
    FIXME: DEPRECATED https://github.com/abeja-inc/platform-planning/issues/2171
    Get dataset item ids.
    :param dataset_ids: list of dataset IDs.
    :return: list of DatasetItemId objects.
    """
    client = Client()
    dataset_item_ids = list()
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        for item in dataset.dataset_items.list(prefetch=USE_ON_MEMORY):
            dataset_item_id = DatasetItemId(dataset_id, item.dataset_item_id)
            dataset_item_ids.append(dataset_item_id)
            if USE_ON_MEMORY:
                try:
                    source_data = item.source_data[0]
                    file_content = source_data.get_content(cache=USE_CACHE)
                    file_like_object = io.BytesIO(file_content)
                    img = load_img(file_like_object, color_mode='rgb', target_size=(IMG_ROWS, IMG_COLS))
                    dataset_item_id.data = img
                    label_id = item.attributes['classification'][0]['label_id']  # FIXME: Allow category selection
                    dataset_item_id.label_id = label_id
                except Exception as e:
                    print('Error: Loading dataset_item_id', dataset_item_id.item_id)
                    raise e
        break  # FIXME: Allow multiple datasets.
    return dataset_item_ids
Example 3
def handler(context):
    # set alias specified in console
    dataset_alias = context.datasets
    dataset_id = dataset_alias['train']

    # get dataset via ABEJA Platform api
    dataset_client = DatasetClient()
    dataset = dataset_client.get_dataset(dataset_id)
    dataset_list = list(load_dataset_from_api(dataset))
    num_classes = len(dataset.props['categories'][0]['labels'])
    print('number of classes is {}.'.format(num_classes))

    # create dataloader
    trainloader, validloader = load_split_train_test(dataset_list)

    # specify model architecture (ResNet-50)
    model = models.resnet50(pretrained=True)

    # freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # replace the last fully connected layer with a Linear layer whose output size is the number of classes
    model.fc = nn.Linear(2048, num_classes)
    model = model.to(device)

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)

    # train the model
    train_model(trainloader, validloader, model, optimizer, criterion)
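
The handler above calls train_model, which is not part of the snippet. Below is a minimal sketch of a compatible training loop; the epoch count, logging, and validation metric are assumptions rather than the original implementation, and device is the same global used in the handler.

import torch

def train_model(trainloader, validloader, model, optimizer, criterion, epochs=10):
    """Hypothetical stand-in for the train_model referenced above."""
    for epoch in range(epochs):
        # training pass
        model.train()
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

        # validation pass: plain accuracy over the validation loader
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, targets in validloader:
                inputs, targets = inputs.to(device), targets.to(device)
                preds = model(inputs).argmax(dim=1)
                correct += (preds == targets).sum().item()
                total += targets.size(0)
        print('epoch {}: validation accuracy {:.3f}'.format(epoch + 1, correct / total))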
Example 4
def get_dataset_labels(dataset_ids):
    datasets_client = DatasetsClient()
    labels = []
    for dataset_id in dataset_ids:
        dataset = datasets_client.get_dataset(dataset_id)
        labels = dataset.props['categories'][0]['labels']
        break
    return labels
Example 5
    def __init__(self, dataset_item_ids: List[DatasetItemId], id2index: Dict[str, int], is_train: bool = False):
        self.client = Client()
        self.is_train = is_train
        self.dataset_item_ids = dataset_item_ids
        self.dataset = self.client.get_dataset(self.dataset_item_ids[0].dataset_id)
        self.id2index = id2index
        self.num_classes = len(id2index)

        # FIXME: https://github.com/abeja-inc/platform-planning/issues/2170
        self.dataset_item_count = len(dataset_item_ids)
        self.num_batches_per_epoch = math.ceil(self.dataset_item_count / BATCH_SIZE)
Example 6
    def __init__(self, dataset_id, transform=None):
        super(SegmentationDatasetFromAPI, self).__init__()

        self.transform = transform

        client = Client()
        self.dataset = client.get_dataset(dataset_id)
        self.dataset_list = list(load_dataset_from_api(self.dataset))

        self.max_id = max([
            c['label_id']
            for c in self.dataset.props['props']['attributes'][0]['categories']
        ])

        self.client = APIClient()
Example 7
def load_dataset_from_api(dataset_id,
                          max_num=None,
                          organization_id=None,
                          credential=None):
    client = Client(organization_id, credential)
    dataset = client.get_dataset(dataset_id)

    if max_num is not None:
        dataset_list = dataset.dataset_items.list(prefetch=False)
        ret = []
        for d in dataset_list:
            ret.append(d)
            if len(ret) >= max_num:
                break
        return ret
    else:
        return dataset.dataset_items.list(prefetch=True)
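
A brief usage sketch for the loader above; the dataset ID is a placeholder, and the per-item access mirrors the source_data / attributes structure used in the other examples.

items = load_dataset_from_api('1234567890123', max_num=100)  # hypothetical dataset ID
for item in items:
    content = item.source_data[0].get_content()  # raw file bytes
    label_id = item.attributes['classification'][0]['label_id']
    print(item.dataset_item_id, label_id, len(content))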
Example 8
def load_dataset_from_api(dataset_id,
                          max_num=None,
                          organization_id=None,
                          credential=None):
    client = Client(organization_id, credential)
    dataset = client.get_dataset(dataset_id)

    if max_num is not None:
        ret = []
        for d in tqdm(dataset.dataset_items.list(prefetch=True),
                      total=max_num):
            ret.append(d)
            if len(ret) >= max_num:
                break
        return ret
    else:
        ret = []
        total_items = dataset.total_count
        for item in tqdm(dataset.dataset_items.list(prefetch=True),
                         total=total_items):
            ret.append(item)
        return ret
Example 9
    def __init__(self,
                 root,
                 dataset_id,
                 transform=None,
                 target_transform=None,
                 transforms=None,
                 prefetch=False,
                 use_cache=True,
                 indices=None):

        super(AbejaDataset, self).__init__(root, transforms, transform,
                                           target_transform)

        datasets_client = DatasetsClient()
        self.datalake_client = DatalakeClient()
        dataset = datasets_client.get_dataset(dataset_id)
        self.labels = dataset.props['categories'][0]['labels']
        self.palette = create_palette(self.labels)
        self.use_cache = use_cache

        self.datalake_files = list()
        idx = 0
        for item in dataset.dataset_items.list(prefetch=prefetch):
            if indices is not None and idx not in indices:
                idx += 1
                continue

            if 'segmentation-image' in item.attributes:
                data_uri = item.attributes['segmentation-image']['combined'][
                    'data_uri']
            else:
                # FIXME: DEPRECATED. Type 'segmentation' is invalid on the latest spec.
                data_uri = item.attributes['segmentation']['combined'][
                    'data_uri']
            m = re.search(r'datalake://(.+?)/(.+?)$', data_uri)
            src_data = item.source_data[0]
            self.datalake_files.append(
                DataLakeObj(m.group(1), m.group(2), src_data))
            idx += 1
Example 10
def set_categories(dataset_ids: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Set categories from Datasets.
    :param dataset_ids: list of dataset IDs.
    :return id2index: Map of label_id to training index.
    :return index2label: Map of training index to label.
    """
    client = Client()
    id2index = dict()
    index2label = dict()
    index = 0
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        category_0 = dataset.props['categories'][0]  # FIXME: Allow category selection
        for label in category_0['labels']:
            label_id = label['label_id']
            label_name = label['label']
            if label_id not in id2index:
                id2index[label_id] = index
                index2label[index] = label_name
                index += 1
        break  # FIXME: Allow multiple datasets.
    return id2index, index2label
Example 11
def set_categories(dataset_ids: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Set categories from Datasets.
    :param dataset_ids: list of dataset IDs.
    :return id2index: Map of label_id to training index.
    :return index2label: Map of training index to label.
    """
    client = Client()
    id2index = dict()
    index2label = dict()
    index = 0

    last_dataset = None
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        if len(dataset.props.get('categories', [])) > 1:
            raise NotImplementedError(
                'more than one category not supported yet.')

        # check if all categories are same
        if last_dataset is not None:
            if last_dataset.props['categories'] != dataset.props['categories']:
                raise NotImplementedError(
                    'different categories among datasets not supported yet.')
        last_dataset = dataset

    category_0 = last_dataset.props['categories'][0]  # FIXME: Allow category selection
    for label in category_0['labels']:
        label_id = label['label_id']
        label_name = label['label']
        if label_id not in id2index:
            id2index[label_id] = index
            index2label[index] = label_name
            index += 1
    return id2index, index2label
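
A short usage sketch for set_categories, with a placeholder dataset ID; it simply prints the two maps the function builds.

id2index, index2label = set_categories(['1234567890123'])  # hypothetical dataset ID
num_classes = len(id2index)
print('number of classes:', num_classes)
for label_id, index in id2index.items():
    print('{} -> index {} ({})'.format(label_id, index, index2label[index]))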
Example 12
def upload_dataset(dataset_client: DatasetClient, dataset_id: str,
                   dataset_list: list, max_workers: int = 4):
    """ uploads file info list to dataset using abeja's dataset client """
    dataset = dataset_client.get_dataset(dataset_id)

    def _f(dataset_item):
        source_data = [
            {
                'data_uri': dataset_item.source_data[0].uri,
                'data_type': dataset_item.source_data[0].type
            }
        ]  # TODO: only the first source URL is uploaded
        attributes = dataset_item.attributes
        dataset.dataset_items.create(source_data=source_data, attributes=attributes)

    if max_workers > 1:
        with ThreadPoolExecutor(max_workers) as executor:
            results = list(tqdm(executor.map(_f, dataset_list), total=len(dataset_list)))
        return results
    return [_f(x) for x in tqdm(dataset_list)]
Example 13
def load_dataset_from_api(dataset_id):
    client = Client()
    dataset = client.get_dataset(dataset_id)
    dataset_list = dataset.dataset_items.list(prefetch=True)
    return dataset_list
Example 14
class DataGenerator(Sequence):
    """
    Custom Data Generator for ABEJA Datasets
    FIXME: Allow multiple datasets.
    """

    def __init__(self, dataset_item_ids: List[DatasetItemId], id2index: Dict[str, int], is_train: bool = False):
        self.client = Client()
        self.is_train = is_train
        self.dataset_item_ids = dataset_item_ids
        self.dataset = self.client.get_dataset(self.dataset_item_ids[0].dataset_id)
        self.id2index = id2index
        self.num_classes = len(id2index)

        # FIXME: https://github.com/abeja-inc/platform-planning/issues/2170
        self.dataset_item_count = len(dataset_item_ids)
        self.num_batches_per_epoch = math.ceil(self.dataset_item_count / BATCH_SIZE)

    def __data_load(self, imgs, labels, start_pos: int, from_i: int, to_i: int):
        for i in range(from_i, to_i, 1):
            id_idx = (start_pos + i) % self.dataset_item_count
            dataset_item_id = self.dataset_item_ids[id_idx]
            if USE_ON_MEMORY:
                img = dataset_item_id.data
                label_id = dataset_item_id.label_id
            else:
                try:
                    dataset_id = dataset_item_id.dataset_id
                    item_id = dataset_item_id.item_id
                    if self.dataset.dataset_id != dataset_id:
                        self.dataset = self.client.get_dataset(dataset_id)
                    dataset_item = self.dataset.dataset_items.get(item_id)
                    label_id = dataset_item.attributes['classification'][0]['label_id']  # FIXME: Allow category selection
                    source_data = dataset_item.source_data[0]
                    file_content = source_data.get_content(cache=USE_CACHE)
                    file_like_object = io.BytesIO(file_content)
                    img = load_img(file_like_object, target_size=(IMG_ROWS, IMG_COLS))
                except Exception as e:
                    print('Error: Loading dataset_item_id', dataset_item_id.item_id)
                    raise e
            img = preprocessor.transform(img, is_train=self.is_train, seed=RANDOM_SEED)
            imgs[i, :] = img
            labels[i] = self.id2index[label_id]

    def __getitem__(self, idx):
        start_pos = BATCH_SIZE * idx
        imgs = np.empty((BATCH_SIZE, IMG_ROWS, IMG_COLS, NB_CHANNELS), dtype=np.float32)
        labels = [0]*BATCH_SIZE

        threadlist = list()
        for i in range(NUM_DATA_LOAD_THREAD):
            thread = threading.Thread(
                target=self.__data_load,
                args=(imgs, labels, start_pos, THREAD_INDICES[i], THREAD_INDICES[i+1]))
            threadlist.append(thread)
        for thread in threadlist:
            thread.start()
        for thread in threadlist:
            thread.join()

        labels = keras.utils.to_categorical(labels, num_classes=self.num_classes)
        return imgs, labels

    def __len__(self):
        return self.num_batches_per_epoch
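
A minimal sketch of wiring the generator into a Keras training run, assuming the helpers from the earlier examples (get_dataset_item_ids, set_categories) and the module-level constants (IMG_ROWS, IMG_COLS, NB_CHANNELS); the dataset ID, split ratio, and toy model are placeholders.

import keras

dataset_ids = ['1234567890123']  # hypothetical dataset ID
id2index, index2label = set_categories(dataset_ids)
item_ids = get_dataset_item_ids(dataset_ids)

# illustrative 80/20 train/validation split of the item IDs
split = int(len(item_ids) * 0.8)
train_gen = DataGenerator(item_ids[:split], id2index, is_train=True)
valid_gen = DataGenerator(item_ids[split:], id2index, is_train=False)

# tiny illustrative classifier; any compiled model with a matching input shape works
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(IMG_ROWS, IMG_COLS, NB_CHANNELS)),
    keras.layers.Dense(len(id2index), activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# recent Keras/tf.keras accept a Sequence directly in fit(); older versions use fit_generator
model.fit(train_gen, validation_data=valid_gen, epochs=10)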
Example 15
def get_dataset_size(dataset_id):
    datasets_client = DatasetsClient()
    dataset = datasets_client.get_dataset(dataset_id)
    return dataset.total_count