from arc23.data import retrieval as rt
import numpy as np

metadata_path = '/media/guest/Main Storage/HDD Data/CMAopenaccess/data.csv'
out_dir = './preprocessed_data.csv'

# Class indices considered ambiguous; rows whose class maps to one of these
# indices are omitted from the output CSV. (Set membership replaces the
# original 40-term `m == 5 or m == 7 or ...` chain.)
AMBIGUOUS_CLASS_INDICES = {
    5, 7, 8, 12, 13, 14, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 32, 34,
    36, 37, 38, 39, 40, 41, 42, 45, 46, 47, 48, 49, 50, 51, 52, 54, 58,
    59, 60, 62, 63, 65,
}

metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
    metadata_path,
    cols=(0, 18, -4),  # -4 necessary so invalid images are ignored TODO: add parameter for which cols to validate?
    class_cols=(18,))

len_metadata = 31149  # TODO: either the dataset is corrupted/in a different format after this point or the endpoint was down last I tried
metadata = metadata[:len_metadata]
metadata = np.array(metadata)

# Per-class frequencies over the class column; used to equalize class balance below.
uniques, counts = np.unique(metadata[:, 1], return_counts=True)
count_dict = {u: c for u, c in zip(uniques, counts)}
max_count = max(counts)

with open(out_dir, 'w+', newline='', encoding="utf8") as out_file:
    for metadatum in metadata:
        # omit ambiguous categories
        if class_to_index[0][metadatum[1]] in AMBIGUOUS_CLASS_INDICES:
            continue
        # equalize the frequency of each class: repeat each row so every class
        # approaches max_count occurrences (integer division, so approximate)
        for _ in range(max_count // count_dict[metadatum[1]]):
            out_file.write(metadatum[0] + ',' + metadatum[1] + '\n')
from arc23.data import retrieval as rt
import numpy as np

# Report how many rows of each class the preprocessed CSV contains.
metadata_path = './preprocessed_data.csv'
COL_TYPE = 1
COL_IMG_WEB = 0

metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
    metadata_path,
    cols=(COL_IMG_WEB, COL_TYPE),
    class_cols=(COL_TYPE, ))

metadata = np.array(metadata)

# Tally the class column and print the per-class frequency table.
class_labels, class_counts = np.unique(metadata[:, 1], return_counts=True)
print(dict(zip(class_labels, class_counts)))
def get_metadata(quiet=False):
    """Load (id, path) metadata rows, shuffle them, and return the list.

    Shuffling up front gives a random sampling when the caller later splits
    the rows into train/validation/test sets. Unless ``quiet`` is set, the
    class-index mappings reported by the loader are printed for inspection.
    """
    (rows, _row_count, _headers,
     class_to_index, index_to_class, _num_classes) = rt.load_metadata(
        metadata_path,
        cols=(COL_ID, COL_PATH),
        delimiter=' ',
    )
    # Randomize order so downstream dataset splits are unbiased.
    random.shuffle(rows)
    if quiet:
        return rows
    print(class_to_index)
    print(index_to_class)
    return rows
def run():
    """Fetch each image named in the input metadata, fit it to 32x32, save it
    as a PNG under ``data_out_dir``, and append a ``<filename> <index>`` line
    per image to ``metadata_out_path``.

    Fix: the original assigned ``filepath = metadatum[0] + '.png'`` and then
    immediately overwrote it — a dead (and incorrect, double-extension)
    assignment, now removed; this also matches the sibling preprocessor.
    """
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_in_path,
        cols=(COL_ID, ),
    )
    with open(metadata_out_path, 'w+', newline='', encoding="utf8") as metadata_file:
        for m, metadatum in enumerate(metadata):
            img = pipe(get_from_metadata(), it.random_fit_to(
                (32, 32)))(metadatum)
            # Swap the trailing 3-char extension (assumed 'jpg') for 'png'
            # — TODO confirm every id ends in a 3-character extension.
            filepath = metadatum[0][:-3] + 'png'  # removing jpg extension
            img.save(data_out_dir + filepath, img_format='PNG')
            print('preprocessed ', metadatum[0])
            metadata_file.write(filepath + ' ' + str(m) + '\n')
def run():
    """Fit each catalogued image to 256x256 and save it as a PNG named after
    the row id under ``data_out_dir``.

    Only the first 31149 rows are processed; later rows appear unusable
    (see TODO below).
    """
    (rows, _row_count, _headers,
     _class_to_index, _index_to_class, _num_classes) = rt.load_metadata(
        metadata_path,
        cols=(COL_ID, COL_TYPE, COL_IMG_WEB),
        class_cols=(COL_TYPE, ))
    # TODO: either the dataset is corrupted/in a different format after this
    # point or the endpoint was down last I tried
    usable_rows = 31149
    # TODO: abstract into some sort of pipeline?
    # TODO: use DALI
    for row in rows[:usable_rows]:
        image = pipe(get_from_metadata(), it.random_fit_to(
            (256, 256)))(row)
        destination = data_out_dir + row[0] + '.png'
        image.save(destination, img_format='PNG')
        print('preprocessed ', row[0])
def run():
    """Full training driver: load and split metadata, build DALI-style data
    loaders, construct and shape-infer the network, profile/checkpoint it,
    optionally resume from a saved train state, then train for 50 epochs with
    logging/validation callbacks and report pre/post accuracy.
    """
    print('preparing metadata...')
    metadata, len_metadata, metadata_headers, class_to_index, index_to_class, num_classes = rt.load_metadata(
        metadata_path,
        cols=(COL_IMG_WEB, COL_TYPE),
        class_cols=(COL_TYPE,)
    )
    len_metadata = 31149  # TODO: either the dataset is corrupted/in a different format after this point or the endpoint was down last I tried
    metadata = metadata[:len_metadata]
    # shuffle at beginning to get random sampling for train, test and validation datasets
    random.shuffle(metadata)
    print(class_to_index)
    print(index_to_class)
    # TODO: make this easier to read/abstracted out to do for train, validation, test all at once?
    # TODO: don't restrict it to just those though, or to requiring metadata
    # Split boundaries consumed pairwise below: test = [0:1024),
    # validation = [1024:2048), train = [2048:end).
    data_split_points = (None, 2048, 1024, 0)
    train_metadata, validation_metadata, test_metadata = (
        metadata[n:m] for m, n in zip(data_split_points[:-1], data_split_points[1:])
    )
    print(metadata_headers)
    print('initializing loaders...')
    # One loader per (label-output-path, split) pair, in train/validation/test order.
    loader, validation_loader, test_loader = (make_loader(ldir, m, class_to_index) for ldir, m in zip(
        (train_label_out_path, validation_label_out_path, test_label_out_path),
        (train_metadata, validation_metadata, test_metadata)
    ))
    loader.build()
    validation_loader.build()
    test_loader.build()
    dataiter = iter(loader)
    # demo_batch = next(dataiter)
    # print(demo_batch['labels'][0].cpu().item())
    # demo_img = np.swapaxes(demo_batch['inputs'][0].cpu(), 0, -1)
    # plt.imshow(demo_img / 256.)
    # plt.show()
    is_cuda = cuda.is_available()
    device = torch.device("cuda:0" if is_cuda else "cpu")
    print("using ", device)
    # Build the layer spec and infer concrete shapes from a loader batch.
    layers = define_layers(num_classes)
    net = ninit.from_iterable(sh.infer_shapes(layers, loader))
    net = net.to(device)
    confusion_matrix_metric = mt.confusion_matrix(index_to_class[0].keys())
    metrics = [
        mt.accuracy_by_category(index_to_class[0].keys()),
        # Accumulate the confusion matrix per item; dump it to CSV at the end.
        MetricFuncs(
            on_item=confusion_matrix_metric.on_item,
            on_end=lambda: out.matrix_to_csv(lambda steps_per_epoch: confusion_matrix_metric.on_end, './out/confusion_matrix')(0)()
        )
    ]
    # Wrap the net with gradient checkpointing, sized via a dry training run.
    net = adapt_checkpointing(
        checkpoint_sequential,
        lambda n: dry_run(n, loader, make_trainer, functools.partial(train_step, squeeze_gtruth=True), device=device)(),
        net
    )
    if is_cuda:
        profile_cuda_memory_by_layer(
            net,
            dry_run(net, loader, make_trainer, functools.partial(train_step, squeeze_gtruth=True), device=device),
            device=device
        )
        # NOTE(review): assumed to belong inside the CUDA-only branch — the
        # collapsed source is ambiguous here; confirm against the original file.
        optimize_cuda_for_fixed_input_size()
    # the trainer is not used above or it would be modified
    trainer = make_trainer(net)
    train_state = TrainState()
    # if we have a save file, continue from there
    if os.path.isfile(train_state_path):
        net, trainer, train_state = serialization.load_train_state(train_state_path)(net, trainer, train_state)()
    accuracy = test(net, test_loader, metrics, device, squeeze_gtruth=True)
    print("pre-training accuracy: ", accuracy)
    callbacks = {
        # Per-step: stream loss to tensorboard and print an averaged loss every 16 steps.
        "on_step": [
            out.scalar_to_tensorboard(cb.loss(), out.tensorboard_writer()),
            lambda steps_per_epoch: on_interval(
                out.print_with_step(
                    cb.interval_avg_loss(interval=1)
                )(steps_per_epoch),
                16
            )
        ],
        # Per-epoch-start: print weight/output/gradient statistics tables per layer.
        "on_epoch_start": [
            out.print_tables(
                cb.layer_stats(
                    net,
                    dry_run(net, loader, trainer, functools.partial(train_step, squeeze_gtruth=True), device=device),
                    [
                        mh.weight_stats_hook((torch.mean,)),
                        mh.output_stats_hook((torch.var,)),
                        mh.grad_stats_hook((torch.var_mean,)),
                    ]
                ),
                titles=["WEIGHT STATS", "OUTPUT_STATS", "GRADIENT STATS"],
                headers=["Layer", "Value"]
            ),
        ],
        # Per-epoch-end: validate and checkpoint the training state to disk.
        "on_epoch_end": [
            cb.validate(functools.partial(validate, squeeze_gtruth=True), net,
                        validation_loader, metrics, device),
            lambda steps_per_epoch: serialization.save_train_state(train_state_path)(net, trainer, train_state),
        ]
    }
    train(net, loader, trainer, callbacks, device, train_state, 50, squeeze_gtruth=True)
    accuracy = test(net, test_loader, metrics, device, squeeze_gtruth=True)
    print("post-training accuracy: ", accuracy)
def run():
    """Fit every catalogued image to 32x32, store it as a PNG under
    ``data_out_dir``, and emit a ``<filename> <index>`` metadata line per
    image to ``metadata_out_path``.
    """
    (rows, _row_count, _headers,
     _class_to_index, _index_to_class, _num_classes) = rt.load_metadata(
        metadata_in_path,
        cols=(COL_ID, ),
    )
    with open(metadata_out_path, 'w+', newline='', encoding="utf8") as out_file:
        # TODO: abstract into some sort of pipeline?
        # TODO: use DALI
        for index, row in enumerate(rows):
            # TODO: upscale once base implementation works
            image = pipe(get_from_metadata(), it.random_fit_to(
                (32, 32)))(row)
            out_name = row[0][:-3] + 'png'  # removing jpg extension
            image.save(data_out_dir + out_name, img_format='PNG')
            print('preprocessed ', row[0])
            out_file.write(out_name + ' ' + str(index) + '\n')