Example #1
import torch.nn as nn
import torch.optim as optim
from torchvision import models

# DatasetClient, load_dataset_from_api, load_split_train_test, train_model,
# device and learning_rate are assumed to be defined at module level.
def handler(context):
    # look up the dataset ID from the alias specified in the console
    dataset_alias = context.datasets
    dataset_id = dataset_alias['train']

    # get dataset via ABEJA Platform api
    dataset_client = DatasetClient()
    dataset = dataset_client.get_dataset(dataset_id)
    dataset_list = list(load_dataset_from_api(dataset))
    num_classes = len(dataset.props['categories'][0]['labels'])
    print('number of classes is {}.'.format(num_classes))

    # create dataloader
    trainloader, validloader = load_split_train_test(dataset_list)

    # specify model architecture (ResNet-50)
    model = models.resnet50(pretrained=True)

    # freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # replace the last fully connected layer with a Linear layer that has num_classes output features
    model.fc = nn.Linear(2048, num_classes)
    model = model.to(device)

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)

    # train the model
    train_model(trainloader, validloader, model, optimizer, criterion)
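load_split_train_test is not shown in this example; a minimal sketch of what it might look like, assuming dataset_list holds (image_tensor, label) pairs and that the split ratio and batch size (illustrative defaults here) match the original:

from torch.utils.data import DataLoader, random_split

def load_split_train_test(dataset_list, valid_ratio=0.2, batch_size=32):
    # split the item list into train/validation subsets and wrap each in a loader
    n_valid = int(len(dataset_list) * valid_ratio)
    n_train = len(dataset_list) - n_valid
    train_set, valid_set = random_split(dataset_list, [n_train, n_valid])
    trainloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    validloader = DataLoader(valid_set, batch_size=batch_size)
    return trainloader, validloader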
Example #2
def main(organization_id, dataset_name, max_workers):
    coco_dataset_train = COCOBboxDatasetMod(
        split='train', year='2017', use_crowded=False,
    )
    coco_dataset_val = COCOBboxDatasetMod(
        split='val', year='2017', use_crowded=False,
    )
    categories = coco_dataset_val.categories
    props = build_props(categories)

    datalake_client = DatalakeClient(organization_id=organization_id, credential=credential)
    dataset_client = DatasetClient(organization_id=organization_id, credential=credential)

    description = f'MS-COCO detection created with dataset: {dataset_name}'
    channel = datalake_client.channels.create('', description,
                                              StorageType.DATALAKE.value)
    print(f'channel is created: {channel.channel_id}')

    print('upload train dataset...')
    dataset_id_train = create_dataset(dataset_client, dataset_name + '-train', props=props,
                                      dataset_type='detection', override=True)
    dataset_train = dataset_client.get_dataset(dataset_id_train)
    upload_coco_dataset(coco_dataset_train, channel, dataset_train.dataset_items, max_workers)

    print('upload val dataset...')
    dataset_id_val = create_dataset(dataset_client, dataset_name + '-val', props=props,
                                    dataset_type='detection', override=True)
    dataset_val = dataset_client.get_dataset(dataset_id_val)
    upload_coco_dataset(coco_dataset_val, channel, dataset_val.dataset_items, max_workers)
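build_props is defined outside this snippet; a plausible sketch, inferred from the props layout the other examples read back (categories[0]['labels'] with 'label'/'label_id' keys) and assuming categories is a list of class-name strings:

def build_props(categories):
    # map each COCO category name to a label entry in the dataset props
    labels = [{'label_id': i, 'label': name}
              for i, name in enumerate(categories)]
    return {
        'categories': [{
            'category_id': 0,
            'name': 'detection',  # hypothetical category group name
            'labels': labels,
        }]
    }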
Example #3
def register_dataset_items_from_datalake(organization_id, image_channel_id,
                                         label_channel_id, dataset_name,
                                         img_list_path):
    """
    register datasets from datalake channel
    :param dataset_id: target dataset id
    :param channel_id: target channel
    :param label_metadata_key: metadata key which label value is stored
    :param max_size_for_label: max size of dataset items for each label value
    :return:
    """
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    with open(img_list_path) as f:
        paths = [[fn.split('/')[-1] for fn in line.split()]
                 for line in f]
    img2label = {img: lbl for img, lbl in paths}

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    label_channel = client.get_channel(label_channel_id)

    label_list = label_channel.list_files(prefetch=True)
    label2fileid = {
        label.metadata['filename']: label.file_id
        for label in label_list
    }

    image_channel = client.get_channel(image_channel_id)
    file_iter = image_channel.list_files(limit=1000, prefetch=True)

    dataset_items = []
    for file_info in file_iter:
        imgfile = file_info.metadata['filename']
        labelfile = img2label[imgfile]
        label_id = label2fileid[labelfile]

        annotation = {
            'channel_id': label_channel_id,
            'file_id': label_id,
            'filename': labelfile
        }

        item = create_request_element(image_channel_id, file_info, annotation)
        dataset_items.append(item)

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    # name and type are passed explicitly; props come straight from dataset.json
    dataset = dataset_client.datasets.create(dataset_name, 'segmentation',
                                             dataset_props)
    register_dataset_items(dataset, dataset_items)
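create_request_element is a shared helper that is not shown; a sketch of a likely implementation for the segmentation case, assuming ABEJA's datalake://{channel_id}/{file_id} source-data URI format, with the attribute key and content type as illustrative guesses:

def create_request_element(channel_id, file_info, annotation):
    # pack the datalake file reference and the label annotation into a
    # dataset-item payload
    return {
        'source_data': [{
            'data_type': 'image/jpeg',  # content type assumed; adjust to the actual files
            'data_uri': 'datalake://{}/{}'.format(channel_id, file_info.file_id),
        }],
        'attributes': {'segmentation': annotation},
    }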
Example #4
def register_dataset(organization_id, datalake_name, dataset_name,
                     dataset_json_path, max_workers, attribute_type):
    """
    register datasets from datalake channel
    """
    print('prepare PET dataset')
    pet_dataset_trainval, pet_dataset_test = load_data()
    print(
        f'num of trainval: {len(pet_dataset_trainval)} test: {len(pet_dataset_test)}'
    )

    print('create channel')
    description = 'The Oxford-IIIT Pet Dataset'
    datalake_client = DatalakeClient(organization_id=organization_id,
                                     credential=credential)
    channel = datalake_client.channels.create(datalake_name, description,
                                              StorageType.DATALAKE.value)
    print(f'channel created: {channel.channel_id}')

    print('register datasets')
    with open(dataset_json_path, 'r') as f:
        dataset_format = json.load(f)

    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)

    dataset_trainval = dataset_client.datasets.create(
        dataset_name + '-trainval', dataset_format['type'],
        dataset_format['props'])
    print(f'trainval dataset created: {dataset_trainval.dataset_id}')

    dataset_test = dataset_client.datasets.create(dataset_name + '-test',
                                                  dataset_format['type'],
                                                  dataset_format['props'])
    print(f'test dataset created: {dataset_test.dataset_id}')

    print('start uploading trainval..')
    upload_data(channel,
                dataset_trainval.dataset_items,
                pet_dataset_trainval,
                is_train=True,
                max_workers=max_workers,
                attribute_type=attribute_type)

    print('start uploading test..')
    upload_data(channel,
                dataset_test.dataset_items,
                pet_dataset_test,
                is_train=False,
                max_workers=max_workers,
                attribute_type=attribute_type)

    print('finished!')
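upload_data is not included in the snippet; a minimal sketch of the likely per-sample flow, assuming samples yield (image_path, label) pairs and that channel.upload_file and dataset_items.create are the abeja-sdk calls in use; is_train and max_workers handling is elided:

def upload_data(channel, dataset_items, samples, is_train, max_workers,
                attribute_type):
    # sketch: upload each image to the datalake, then register a dataset
    # item pointing back at the uploaded file (threading elided for brevity)
    for image_path, label in samples:
        datalake_file = channel.upload_file(image_path)
        source_data = [{
            'data_type': 'image/jpeg',
            'data_uri': 'datalake://{}/{}'.format(channel.channel_id,
                                                  datalake_file.file_id),
        }]
        dataset_items.create(source_data=source_data,
                             attributes={attribute_type: label})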
Example #5
def register_dataset_items_from_datalake(organization_id, channel_id,
                                         dataset_name, split, year,
                                         max_workers):
    """
    register datasets from datalake channel

    Args:
        organization_id:
        channel_id:
        dataset_name:
        split:
        year:
        max_workers:

    Returns:

    """

    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    voc_dataset = voc_bbox_dataset.VOCBboxDataset(split=split, year=year)
    nb_data = len(voc_dataset)

    data = {}
    for i in range(nb_data):
        img_id, annotation = voc_dataset.get_annotations(i)
        data[img_id] = annotation

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    def file2id(file_info):
        return file_info.metadata['filename'].split('.')[0]

    file_iter = channel.list_files(limit=1000, prefetch=False)
    dataset_items = []
    for file_info in tqdm(file_iter):
        if file2id(file_info) in data:
            item = create_request_element(channel_id, file_info,
                                          data[file2id(file_info)])
            dataset_items.append(item)

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items, max_workers=max_workers)
    print('uploaded!')
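register_dataset_items appears in several examples without a definition; a minimal sketch, assuming each item is a dict with 'source_data' and 'attributes' keys and that dataset.dataset_items.create is the abeja-sdk registration call:

from concurrent.futures import ThreadPoolExecutor

def register_dataset_items(dataset, items, max_workers=4):
    def _create(item):
        return dataset.dataset_items.create(source_data=item['source_data'],
                                            attributes=item['attributes'])

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # materialize the iterator so worker exceptions propagate here
        list(executor.map(_create, items))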
Example #6
def register_dataset_items_from_datalake(organization_id,
                                         channel_id,
                                         dataset_name,
                                         label_metadata_key):
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    label2id = {
        x['label']: x['label_id']
        for x in dataset_props['props']['categories'][0]['labels']
    }

    def to_annotation(file_info):
        label = file_info.metadata[label_metadata_key]
        label_id = label2id[label]
        return [{label_metadata_key: label,
                 'label_id': label_id,
                 'category_id': 0}]

    file_iter = channel.list_files(limit=1000, prefetch=False)

    dataset_items = []
    for file_info in file_iter:
        item = create_request_element(channel_id, file_info,
                                      data_id=int(file_info.metadata['filename'].split('.')[0]),
                                      annotation=to_annotation(file_info))
        dataset_items.append(item)
        if len(dataset_items) % 1000 == 0:
            print(len(dataset_items))

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items)
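The dataset.json consumed by Examples #5 and #6 is never shown, but its shape can be inferred from how it is read (dataset_props['type'], dataset_props['props'], and the nested label list); a hypothetical classification example, shown as a Python literal with placeholder label names:

dataset_props = {
    'type': 'classification',
    'props': {
        'categories': [{
            'category_id': 0,
            'name': 'flower',  # hypothetical category group name
            'labels': [
                {'label_id': 0, 'label': 'daisy'},
                {'label_id': 1, 'label': 'tulip'},
            ],
        }]
    }
}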