import data     # repo-local module assumed to provide Dataset
import augment  # repo-local module assumed to provide Augmentation


def prepare_data_for_training(args):
    """Make the dataset ready for training and validation."""
    # Form the train/test splits and write them to disk
    dataset = data.Dataset(args)

    # Get the image classes and the image count in each class
    label_map = dataset.get_class_info()
    class_count = len(list(label_map.values()))

    # Split the data and store it in the log dir
    df_train, df_test = dataset.split_dataset()

    # Perform dataset augmentations
    image_data = augment.Augmentation(args)

    # Get the data generators for the training and test images
    train_data_gen, _ = image_data.map_fn_train(df_train)
    test_data_gen, _ = image_data.map_fn_test(df_test)

    return train_data_gen, test_data_gen, df_train, df_test, class_count
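# Hedged usage sketch (not from the source): one way the returned objects might
# be consumed. `parse_args` and `build_model` are hypothetical placeholders, and
# the fit() call assumes the generators follow the Keras iterator protocol.
cli_args = parse_args()  # hypothetical CLI parsing for this repo
train_gen, test_gen, df_train, df_test, n_classes = prepare_data_for_training(cli_args)
model = build_model(n_classes)  # hypothetical model factory
model.fit(train_gen, validation_data=test_gen, epochs=cli_args.epochs)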
import easydict as ed
import mxnet as mx
import numpy as np

# NOTE: SSD and generate_target are assumed to be defined earlier in this module.

param_training = ed.EasyDict()
param_training.epoch = 25
param_training.lr_schedule = 'poly'
param_training.init_lr = 0.005

param_model = ed.EasyDict()
param_model.rand_n = 30

if __name__ == '__main__':
    # Visualize the matching result of rank matching
    import dataset_utils
    import matplotlib.pyplot as plt

    root_path = r'D:\Documents\Data_Files\Datasets\Pascal\VOC2012'
    dataset = dataset_utils.Dataset(root_path)
    img, label = dataset[4]

    backbone_root_path = 'd:/Documents/Data_Files/Parameters'
    ssd = SSD(backbone_root_path)
    mx_img, mx_label = ssd.get_transform_fn_val()(img, label)
    mx_label = mx_label.expand_dims(axis=0)  # (1, M, 5)
    tensor_preds = ssd(mx_img.expand_dims(axis=0))

    cls_targets, box_targets, pos_neg_samples = generate_target(
        mx.nd.array(ssd.get_anchors()).expand_dims(axis=0),
        tensor_preds[:, :, 21:],
        mx_label[:, :, -4:],
        mx_label[:, :, 0])
    pos_idx = np.where(pos_neg_samples[0].asnumpy() > 0)[0]  # (P,)

    fig = plt.figure()
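    # Hedged continuation (not from the source): a quick sanity check on the
    # matching before drawing. pos_neg_samples is assumed to mark positive
    # anchors > 0, negatives < 0, and ignored anchors == 0.
    matches = pos_neg_samples[0].asnumpy()
    print('positive anchors:', int((matches > 0).sum()),
          'negative anchors:', int((matches < 0).sum()),
          'ignored anchors:', int((matches == 0).sum()))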
param.inputWahlDescriptor = FLAGS.desc_type != 0
param.dim_model = FLAGS.conv_dim

input_dir = dataset_constants.m40_desc_dataset_dict.get(FLAGS.dataset, 'none')
if input_dir == 'none':
    print('Unsupported dataset type %s, exit!' % FLAGS.dataset)
    exit(1)
param.input_dir = input_dir

if param.inputWahlDescriptor:
    descriptor_to_use = 'wahl_long_desc_run_1_'
else:
    descriptor_to_use = 'merged_proposed_kl_bugfix4_desc_run_0_'

dict_labels = dataset_utils.m40_label_to_int_dict
m40_dataset = dataset_utils.Dataset(param.input_dir, dict_labels, descriptor_to_use)

N_classes = len(dict_labels)
classes_train = m40_dataset.train_label.shape[1]
classes_test = m40_dataset.test_label.shape[1]
assert N_classes == classes_train
assert N_classes == classes_test
print('Used classes %d, train %d, test %d' % (N_classes, classes_train, classes_test))

# Check the array dimensions to make sure the data loaded correctly
N_train = m40_dataset.train_feat.shape
N_l_train = m40_dataset.train_label.shape
print('Training dataset on m40, N_train %dx%d, N_label_train %dx%d'
      % (N_train[0], N_train[1], N_l_train[0], N_l_train[1]))
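# Hedged example (assumption, not from the source): given the
# (N_samples, N_classes) label shape asserted above, the labels are presumably
# one-hot, so integer class ids can be recovered with argmax.
import numpy as np
train_class_ids = np.argmax(m40_dataset.train_label, axis=1)  # (N_samples,)
print('First five training class ids:', train_class_ids[:5])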
train_df, val_df = load_data(data_config.data_source)
(vocab_idx, vocab_phone_CZidx, vocab_phone_ENidx,
 vocab_phone_HUidx, vocab_phone_RUidx) = load_vocabularies(data_config.data_source)

# Get useful dimensions from the vocabularies
# (the +1 presumably reserves one extra index, e.g. for padding)
PHONE_CZ_ALPHABET_LEN = len(vocab_phone_CZidx) + 1
PHONE_EN_ALPHABET_LEN = len(vocab_phone_ENidx) + 1
PHONE_HU_ALPHABET_LEN = len(vocab_phone_HUidx) + 1
PHONE_RU_ALPHABET_LEN = len(vocab_phone_RUidx) + 1

print('Generating ids')
custom_unit_dict = {"sent_unit": "chars"}
training_data = dataset_utils.Dataset(train_df, vocab_idx, vocab_phone_CZidx,
                                      vocab_phone_ENidx, vocab_phone_HUidx,
                                      vocab_phone_RUidx)
training_data.generate(custom_unit_dict, has_class=True, add_start_end_tag=False)
validation_data = dataset_utils.Dataset(val_df, vocab_idx, vocab_phone_CZidx,
                                        vocab_phone_ENidx, vocab_phone_HUidx,
                                        vocab_phone_RUidx)
validation_data.generate(custom_unit_dict, has_class=True, add_start_end_tag=False)

# Get useful dimensions from the dataset
MAX_CHAR_IN_SENT_LEN = find_max_len(training_data.ids_sentence, validation_data.ids_sentence)
MAX_PHONE_CZ_LEN = find_max_len(training_data.ids_phones_CZ, validation_data.ids_phones_CZ)
MAX_PHONE_EN_LEN = find_max_len(training_data.ids_phones_EN, validation_data.ids_phones_EN)
MAX_PHONE_HU_LEN = find_max_len(training_data.ids_phones_HU, validation_data.ids_phones_HU)
MAX_PHONE_RU_LEN = find_max_len(training_data.ids_phones_RU, validation_data.ids_phones_RU)

print('Preprocessing data')
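# Hedged sketch of the padding step implied by 'Preprocessing data' (not from
# the source): right-pad every id sequence to the maxima computed above,
# assuming index 0 is the slot reserved by the +1 in the alphabet lengths.
import numpy as np

def pad_ids(seqs, max_len, pad_idx=0):
    """Right-pad (or truncate) each id sequence to max_len."""
    out = np.full((len(seqs), max_len), pad_idx, dtype=np.int32)
    for i, s in enumerate(seqs):
        out[i, :min(len(s), max_len)] = s[:max_len]
    return out

X_sent_train = pad_ids(training_data.ids_sentence, MAX_CHAR_IN_SENT_LEN)
X_phone_cz_train = pad_ids(training_data.ids_phones_CZ, MAX_PHONE_CZ_LEN)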