def main(organization_id, dataset_name, max_workers):
    coco_dataset_train = COCOBobxDatasetMod(
        split='train', year='2017', use_crowded=False,
    )
    coco_dataset_val = COCOBobxDatasetMod(
        split='val', year='2017', use_crowded=False,
    )
    categories = coco_dataset_val.categories
    props = build_props(categories)

    datalake_client = DatalakeClient(organization_id=organization_id, credential=credential)
    dataset_client = DatasetClient(organization_id=organization_id, credential=credential)

    description = f'MS-COCO detection created with dataset: {dataset_name}'
    channel = datalake_client.channels.create('', description, StorageType.DATALAKE.value)
    print(f'channel is created: {channel.channel_id}')

    print('upload train dataset...')
    dataset_id_train = create_dataset(dataset_client, dataset_name + '-train', props=props,
                                      dataset_type='detection', override=True)
    dataset_train = dataset_client.get_dataset(dataset_id_train)
    upload_coco_dataset(coco_dataset_train, channel, dataset_train.dataset_items, max_workers)

    print('upload val dataset...')
    dataset_id_val = create_dataset(dataset_client, dataset_name + '-val', props=props,
                                    dataset_type='detection', override=True)
    dataset_val = dataset_client.get_dataset(dataset_id_val)
    upload_coco_dataset(coco_dataset_val, channel, dataset_val.dataset_items, max_workers)

def get_dataset_item_ids(dataset_ids: List[str]) -> List[DatasetItemId]:
    """
    FIXME: DEPRECATED
    https://github.com/abeja-inc/platform-planning/issues/2171

    Get dataset item ids.
    :param dataset_ids:
    :return:
    """
    client = Client()
    dataset_item_ids = list()
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        for item in dataset.dataset_items.list(prefetch=USE_ON_MEMORY):
            dataset_item_id = DatasetItemId(dataset_id, item.dataset_item_id)
            dataset_item_ids.append(dataset_item_id)
            if USE_ON_MEMORY:
                try:
                    source_data = item.source_data[0]
                    file_content = source_data.get_content(cache=USE_CACHE)
                    file_like_object = io.BytesIO(file_content)
                    img = load_img(file_like_object, color_mode='rgb',
                                   target_size=(IMG_ROWS, IMG_COLS))
                    dataset_item_id.data = img
                    label_id = item.attributes['classification'][0]['label_id']  # FIXME: Allow category selection
                    dataset_item_id.label_id = label_id
                except Exception as e:
                    print('Error: Loading dataset_item_id', dataset_item_id.item_id)
                    raise e
        break  # FIXME: Allow multiple datasets.
    return dataset_item_ids

def handler(context):
    # set alias specified in console
    dataset_alias = context.datasets
    dataset_id = dataset_alias['train']

    # get dataset via ABEJA Platform api
    dataset_client = DatasetClient()
    dataset = dataset_client.get_dataset(dataset_id)
    dataset_list = list(load_dataset_from_api(dataset))
    num_classes = len(dataset.props['categories'][0]['labels'])
    print('number of classes is {}.'.format(num_classes))

    # create dataloader
    trainloader, validloader = load_split_train_test(dataset_list)

    # specify model architecture (ResNet-50)
    model = models.resnet50(pretrained=True)

    # freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # replace the last fully connected layer with a Linear layer whose output size is the number of classes
    model.fc = nn.Linear(2048, num_classes)
    model = model.to(device)

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)

    # train the model
    train_model(trainloader, validloader, model, optimizer, criterion)

def get_dataset_labels(dataset_ids):
    datasets_client = DatasetsClient()
    labels = []
    for dataset_id in dataset_ids:
        dataset = datasets_client.get_dataset(dataset_id)
        labels = dataset.props['categories'][0]['labels']
        break  # only the first dataset's labels are used
    return labels

def __init__(self, dataset_item_ids: List[DatasetItemId], id2index: Dict[str, int],
             is_train: bool = False):
    self.client = Client()
    self.is_train = is_train
    self.dataset_item_ids = dataset_item_ids
    self.dataset = self.client.get_dataset(self.dataset_item_ids[0].dataset_id)
    self.id2index = id2index
    self.num_classes = len(id2index)  # FIXME: https://github.com/abeja-inc/platform-planning/issues/2170
    self.dataset_item_count = len(dataset_item_ids)
    self.num_batches_per_epoch = math.ceil(self.dataset_item_count / BATCH_SIZE)

def __init__(self, dataset_id, transform=None):
    super(SegmentationDatasetFromAPI, self).__init__()
    self.transform = transform
    client = Client()
    self.dataset = client.get_dataset(dataset_id)
    self.dataset_list = list(load_dataset_from_api(self.dataset))
    self.max_id = max([
        c['label_id']
        for c in self.dataset.props['props']['attributes'][0]['categories']
    ])
    self.client = APIClient()

def load_dataset_from_api(dataset_id, max_num=None, organization_id=None, credential=None):
    client = Client(organization_id, credential)
    dataset = client.get_dataset(dataset_id)

    if max_num is not None:
        dataset_list = dataset.dataset_items.list(prefetch=False)
        ret = []
        for d in dataset_list:
            ret.append(d)
            if len(ret) >= max_num:  # stop once max_num items have been collected
                break
        return ret
    else:
        return dataset.dataset_items.list(prefetch=True)

def load_dataset_from_api(dataset_id, max_num=None, organization_id=None, credential=None):
    client = Client(organization_id, credential)
    dataset = client.get_dataset(dataset_id)

    if max_num is not None:
        ret = []
        for d in tqdm(dataset.dataset_items.list(prefetch=True), total=max_num):
            ret.append(d)
            if len(ret) >= max_num:  # stop once max_num items have been collected
                break
        return ret
    else:
        ret = []
        total_items = dataset.total_count
        for item in tqdm(dataset.dataset_items.list(prefetch=True), total=total_items):
            ret.append(item)
        return ret

def __init__(self, root, dataset_id, transform=None, target_transform=None,
             transforms=None, prefetch=False, use_cache=True, indices=None):
    super(AbejaDataset, self).__init__(root, transforms, transform, target_transform)

    datasets_client = DatasetsClient()
    self.datalake_client = DatalakeClient()
    dataset = datasets_client.get_dataset(dataset_id)
    self.labels = dataset.props['categories'][0]['labels']
    self.palette = create_palette(self.labels)
    self.use_cache = use_cache
    self.datalake_files = list()

    idx = 0
    for item in dataset.dataset_items.list(prefetch=prefetch):
        if indices is not None and idx not in indices:
            idx += 1
            continue
        if 'segmentation-image' in item.attributes:
            data_uri = item.attributes['segmentation-image']['combined']['data_uri']
        else:
            # FIXME: DEPRECATED. Type 'segmentation' is invalid on the latest spec.
            data_uri = item.attributes['segmentation']['combined']['data_uri']
        m = re.search(r'datalake://(.+?)/(.+?)$', data_uri)
        src_data = item.source_data[0]
        self.datalake_files.append(DataLakeObj(m.group(1), m.group(2), src_data))
        idx += 1

def set_categories(dataset_ids: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Set categories from Datasets.

    :param dataset_ids: Dataset IDs. list format.
    :return id2index: Map of label_id to training index.
    :return index2label: Map of training index to label.
    """
    client = Client()
    id2index = dict()
    index2label = dict()
    index = 0
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        category_0 = dataset.props['categories'][0]  # FIXME: Allow category selection
        for label in category_0['labels']:
            label_id = label['label_id']
            label_name = label['label']
            if label_id not in id2index:
                id2index[label_id] = index
                index2label[index] = label_name
                index += 1
        break  # FIXME: Allow multiple datasets.
    return id2index, index2label

def set_categories(dataset_ids: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Set categories from Datasets.

    :param dataset_ids: Dataset IDs. list format.
    :return id2index: Map of label_id to training index.
    :return index2label: Map of training index to label.
    """
    client = Client()
    id2index = dict()
    index2label = dict()
    index = 0
    last_dataset = None
    for dataset_id in dataset_ids:
        dataset = client.get_dataset(dataset_id)
        if len(dataset.props.get('categories', [])) > 1:
            raise NotImplementedError('more than one category not supported yet.')
        # check if all categories are same
        if last_dataset is not None:
            if last_dataset.props['categories'] != dataset.props['categories']:
                raise NotImplementedError('different categories among datasets not supported yet.')
        last_dataset = dataset

    category_0 = last_dataset.props['categories'][0]  # FIXME: Allow category selection
    for label in category_0['labels']:
        label_id = label['label_id']
        label_name = label['label']
        if label_id not in id2index:
            id2index[label_id] = index
            index2label[index] = label_name
            index += 1
    return id2index, index2label

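# Illustrative sketch (not part of the original code): the `props` structure that
# set_categories() reads, inferred from the accesses above. The concrete values are
# made-up examples; only the key layout ('categories' -> 'labels' -> 'label_id'/'label')
# is taken from the surrounding code.
#
# example_props = {
#     'categories': [
#         {
#             'labels': [
#                 {'label_id': 1, 'label': 'dog'},
#                 {'label_id': 2, 'label': 'cat'},
#             ]
#         }
#     ]
# }
#
# With a single dataset whose props look like example_props, set_categories() would
# return ({1: 0, 2: 1}, {0: 'dog', 1: 'cat'}).
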
def upload_dataset(dataset_client: DatasetClient, dataset_id: str, dataset_list: list,
                   max_workers: int = 4):
    """
    Uploads a list of dataset items to a dataset using ABEJA's dataset client.
    """
    dataset = dataset_client.get_dataset(dataset_id)

    def _f(dataset_item):
        source_data = [
            {
                'data_uri': dataset_item.source_data[0].uri,
                'data_type': dataset_item.source_data[0].type
            }
        ]  # TODO: only one url to be uploaded
        attributes = dataset_item.attributes
        dataset.dataset_items.create(source_data=source_data, attributes=attributes)

    if max_workers > 1:
        with ThreadPoolExecutor(max_workers) as executor:
            results = list(tqdm(executor.map(_f, dataset_list), total=len(dataset_list)))
        return results
    return [_f(x) for x in tqdm(dataset_list)]

def load_dataset_from_api(dataset_id):
    client = Client()
    dataset = client.get_dataset(dataset_id)
    dataset_list = dataset.dataset_items.list(prefetch=True)
    return dataset_list

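# Illustrative usage sketch (not part of the original code): how the items returned by
# load_dataset_from_api() are consumed elsewhere in this code base. 'YOUR_DATASET_ID'
# is a placeholder; the attribute and source_data accesses mirror the patterns used in
# the functions above.
#
# for item in load_dataset_from_api('YOUR_DATASET_ID'):
#     label_id = item.attributes['classification'][0]['label_id']
#     file_content = item.source_data[0].get_content(cache=True)
#     ...
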
class DataGenerator(Sequence):
    """
    Custom Data Generator for ABEJA Datasets
    FIXME: Allow multiple datasets.
    """

    def __init__(self, dataset_item_ids: List[DatasetItemId], id2index: Dict[str, int],
                 is_train: bool = False):
        self.client = Client()
        self.is_train = is_train
        self.dataset_item_ids = dataset_item_ids
        self.dataset = self.client.get_dataset(self.dataset_item_ids[0].dataset_id)
        self.id2index = id2index
        self.num_classes = len(id2index)  # FIXME: https://github.com/abeja-inc/platform-planning/issues/2170
        self.dataset_item_count = len(dataset_item_ids)
        self.num_batches_per_epoch = math.ceil(self.dataset_item_count / BATCH_SIZE)

    def __data_load(self, imgs, labels, start_pos: int, from_i: int, to_i: int):
        for i in range(from_i, to_i, 1):
            id_idx = (start_pos + i) % self.dataset_item_count
            dataset_item_id = self.dataset_item_ids[id_idx]
            if USE_ON_MEMORY:
                img = dataset_item_id.data
                label_id = dataset_item_id.label_id
            else:
                try:
                    dataset_id = dataset_item_id.dataset_id
                    item_id = dataset_item_id.item_id
                    if self.dataset.dataset_id != dataset_id:
                        self.dataset = self.client.get_dataset(dataset_id)
                    dataset_item = self.dataset.dataset_items.get(item_id)
                    label_id = dataset_item.attributes['classification'][0]['label_id']  # FIXME: Allow category selection
                    source_data = dataset_item.source_data[0]
                    file_content = source_data.get_content(cache=USE_CACHE)
                    file_like_object = io.BytesIO(file_content)
                    img = load_img(file_like_object, target_size=(IMG_ROWS, IMG_COLS))
                except Exception as e:
                    print('Error: Loading dataset_item_id', dataset_item_id.item_id)
                    raise e
            img = preprocessor.transform(img, is_train=self.is_train, seed=RANDOM_SEED)
            imgs[i, :] = img
            labels[i] = self.id2index[label_id]

    def __getitem__(self, idx):
        start_pos = BATCH_SIZE * idx
        imgs = np.empty((BATCH_SIZE, IMG_ROWS, IMG_COLS, NB_CHANNELS), dtype=np.float32)
        labels = [0] * BATCH_SIZE
        threadlist = list()
        for i in range(NUM_DATA_LOAD_THREAD):
            thread = threading.Thread(
                target=self.__data_load,
                args=(imgs, labels, start_pos, THREAD_INDICES[i], THREAD_INDICES[i + 1]))
            threadlist.append(thread)
        for thread in threadlist:
            thread.start()
        for thread in threadlist:
            thread.join()
        labels = keras.utils.to_categorical(labels, num_classes=self.num_classes)
        return imgs, labels

    def __len__(self):
        return self.num_batches_per_epoch

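# Illustrative usage sketch (not part of the original code): one way the helpers above
# could be combined for training. `model` and 'YOUR_DATASET_ID' are placeholders; the
# helper names and signatures are taken from the functions defined in this file.
#
# dataset_ids = ['YOUR_DATASET_ID']
# id2index, index2label = set_categories(dataset_ids)
# dataset_item_ids = get_dataset_item_ids(dataset_ids)
# train_gen = DataGenerator(dataset_item_ids, id2index, is_train=True)
# model.fit(train_gen, epochs=10)
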
def get_dataset_size(dataset_id):
    datasets_client = DatasetsClient()
    dataset = datasets_client.get_dataset(dataset_id)
    return dataset.total_count