from arc23.data import retrieval as rt
import numpy as np

metadata_path = '/media/guest/Main Storage/HDD Data/CMAopenaccess/data.csv'
out_path = './preprocessed_data.csv'  # output file for the rebalanced metadata

metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
    metadata_path,
    cols=(0, 18, -4),  # -4 is needed so that invalid images are ignored; TODO: add a parameter for which cols to validate?
    class_cols=(18, ))

len_metadata = 31149  # TODO: either the dataset is corrupted/in a different format after this point or the endpoint was down last I tried
metadata = metadata[:len_metadata]

metadata = np.array(metadata)
uniques, counts = np.unique(metadata[:, 1], return_counts=True)
count_dict = {u: c for u, c in zip(uniques, counts)}
max_count = max(counts)
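# max_count is the size of the largest class; below, each row is repeated
# max_count // count_dict[class] times so class frequencies come out roughly equal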

# class indices considered too ambiguous to train on
AMBIGUOUS_CLASS_INDICES = {
    5, 7, 8, 12, 13, 14, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 32, 34,
    36, 37, 38, 39, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 54, 58,
    59, 60, 62, 63, 65,
}

with open(out_path, 'w+', newline='', encoding="utf8") as out_file:
    for metadatum in metadata:
        # omit ambiguous categories
        if class_to_index[0][metadatum[1]] in AMBIGUOUS_CLASS_INDICES:
            continue
        # equalize the frequency of each class
        for _ in range(max_count // count_dict[metadatum[1]]):
            out_file.write(metadatum[0] + ',' + metadatum[1] + '\n')
Example 2
from arc23.data import retrieval as rt
import numpy as np

metadata_path = './preprocessed_data.csv'

COL_TYPE = 1
COL_IMG_WEB = 0

metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
    metadata_path, cols=(COL_IMG_WEB, COL_TYPE), class_cols=(COL_TYPE, ))

metadata = np.array(metadata)

uniques, counts = np.unique(metadata[:, 1], return_counts=True)

print({u: c for u, c in zip(uniques, counts)})
Example 3
def get_metadata(quiet=False):
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_path,
        cols=(COL_ID, COL_PATH),
        delimiter=' ',
    )

    # shuffle at beginning to get random sampling for train, test and validation datasets
    random.shuffle(metadata)

    if not quiet:
        print(class_to_index)
        print(index_to_class)

    return metadata
Example 4
def run():
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_in_path,
        cols=(COL_ID, ),
    )

    with open(metadata_out_path, 'w+', newline='',
              encoding="utf8") as metadata_file:
        for m, metadatum in enumerate(metadata):
            img = pipe(get_from_metadata(), it.random_fit_to((32, 32)))(metadatum)
            filepath = metadatum[0][:-3] + 'png'  # replace the 'jpg' extension with 'png'
            img.save(data_out_dir + filepath, img_format='PNG')
            print('preprocessed ', metadatum[0])
            metadata_file.write(filepath + ' ' + str(m) + '\n')
Example 5
def run():
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_path,
        cols=(COL_ID, COL_TYPE, COL_IMG_WEB),
        class_cols=(COL_TYPE, ))
    len_metadata = 31149  # TODO: either the dataset is corrupted/in a different format after this point or the endpoint was down last I tried
    metadata = metadata[:len_metadata]

    # TODO: abstract into some sort of pipeline?
    # TODO: use DALI
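    # fit each image into 256x256 and save it to data_out_dir as '<id>.png'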
    for metadatum in metadata:
        img = pipe(get_from_metadata(), it.random_fit_to((256, 256)))(metadatum)
        filepath = data_out_dir + metadatum[0] + '.png'
        img.save(filepath, img_format='PNG')
        print('preprocessed ', metadatum[0])
Example 6
def run():

    print('preparing metadata...')

    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_path,
        cols=(COL_IMG_WEB, COL_TYPE),
        class_cols=(COL_TYPE,)
    )
    len_metadata = 31149  # TODO: either the dataset is corrupted/in a different format after this point or the endpoint was down last I tried
    metadata = metadata[:len_metadata]

    # shuffle at beginning to get random sampling for train, test and validation datasets
    random.shuffle(metadata)

    print(class_to_index)
    print(index_to_class)

    # TODO: make this easier to read/abstracted out to do for train, validation, test all at once?
    # TODO: don't restrict it to just those though, or to requiring metadata
    data_split_points = (None, 2048, 1024, 0)
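    # split points run from the end of the dataset backwards; the slices below give
    # train = metadata[2048:], validation = metadata[1024:2048], test = metadata[:1024]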

    train_metadata, validation_metadata, test_metadata = (
        metadata[n:m] for m, n in zip(data_split_points[:-1], data_split_points[1:])
    )

    print(metadata_headers)

    print('initializing loaders...')

    loader, validation_loader, test_loader = (
        make_loader(ldir, m, class_to_index)
        for ldir, m in zip(
            (train_label_out_path, validation_label_out_path, test_label_out_path),
            (train_metadata, validation_metadata, test_metadata),
        )
    )

    loader.build()
    validation_loader.build()
    test_loader.build()

    dataiter = iter(loader)  # used by the commented-out batch visualization demo below
    # demo_batch = next(dataiter)
    # print(demo_batch['labels'][0].cpu().item())
    # demo_img = np.swapaxes(demo_batch['inputs'][0].cpu(), 0, -1)
    # plt.imshow(demo_img / 256.)
    # plt.show()

    is_cuda = cuda.is_available()

    device = torch.device("cuda:0" if is_cuda else "cpu")
    print("using ", device)

    layers = define_layers(num_classes)
    net = ninit.from_iterable(sh.infer_shapes(layers, loader))
    net = net.to(device)

    confusion_matrix_metric = mt.confusion_matrix(index_to_class[0].keys())
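    # metrics: accuracy broken down by category, plus a confusion matrix written to CSV when evaluation ends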
    metrics = [
        mt.accuracy_by_category(index_to_class[0].keys()),
        MetricFuncs(
            on_item=confusion_matrix_metric.on_item,
            on_end=lambda: out.matrix_to_csv(lambda steps_per_epoch: confusion_matrix_metric.on_end, './out/confusion_matrix')(0)()
        )
    ]

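    # adapt_checkpointing uses a dry training step to work out how to apply
    # checkpoint_sequential (activation checkpointing) to the network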
    net = adapt_checkpointing(
        checkpoint_sequential,
        lambda n: dry_run(n, loader, make_trainer, functools.partial(train_step, squeeze_gtruth=True), device=device)(),
        net
    )

    if is_cuda:
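        # on CUDA: profile per-layer memory usage with a dry training step,
        # then apply optimizations that assume a fixed input size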
        profile_cuda_memory_by_layer(
            net,
            dry_run(net, loader, make_trainer, functools.partial(train_step, squeeze_gtruth=True), device=device),
            device=device
        )
        optimize_cuda_for_fixed_input_size()

    # create the trainer only now; using it in the dry runs above would have modified its state
    trainer = make_trainer(net)

    train_state = TrainState()

    # if we have a save file, continue from there
    if os.path.isfile(train_state_path):
        net, trainer, train_state = serialization.load_train_state(train_state_path)(net, trainer, train_state)()

    accuracy = test(net, test_loader, metrics, device, squeeze_gtruth=True)
    print("pre-training accuracy: ", accuracy)

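    # callbacks: each step, log the loss to TensorBoard and print an averaged loss every 16 steps;
    # each epoch start, print weight/output/gradient statistics tables;
    # each epoch end, run validation and save the training state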
    callbacks = {
        "on_step": [
            out.scalar_to_tensorboard(cb.loss(), out.tensorboard_writer()),
            lambda steps_per_epoch: on_interval(
                out.print_with_step(
                    cb.interval_avg_loss(interval=1)
                )(steps_per_epoch),
                16
            )
        ],
        "on_epoch_start": [
            out.print_tables(
                cb.layer_stats(
                    net,
                    dry_run(net, loader, trainer, functools.partial(train_step, squeeze_gtruth=True), device=device),
                    [
                        mh.weight_stats_hook((torch.mean,)),
                        mh.output_stats_hook((torch.var,)),
                        mh.grad_stats_hook((torch.var_mean,)),
                    ]
                ), titles=["WEIGHT STATS", "OUTPUT STATS", "GRADIENT STATS"], headers=["Layer", "Value"]
            ),
        ],
        "on_epoch_end": [
            cb.validate(functools.partial(validate, squeeze_gtruth=True), net, validation_loader, metrics, device),
            lambda steps_per_epoch: serialization.save_train_state(train_state_path)(net, trainer, train_state),
        ]
    }

    train(net, loader, trainer, callbacks, device, train_state, 50, squeeze_gtruth=True)

    accuracy = test(net, test_loader, metrics, device, squeeze_gtruth=True)
    print("post-training accuracy: ", accuracy)
def run():
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_in_path,
        cols=(COL_ID, ),
    )

    with open(metadata_out_path, 'w+', newline='',
              encoding="utf8") as metadata_file:
        # TODO: abstract into some sort of pipeline?
        # TODO: use DALI
        for m, metadatum in enumerate(metadata):
            # TODO: upscale once base implementation works
            img = pipe(get_from_metadata(), it.random_fit_to((32, 32)))(metadatum)
            filepath = metadatum[0][:-3] + 'png'  # replace the 'jpg' extension with 'png'
            img.save(data_out_dir + filepath, img_format='PNG')
            print('preprocessed ', metadatum[0])
            metadata_file.write(filepath + ' ' + str(m) + '\n')