def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    """Mix clean- and noisy-labeled training keys into aligned aux files.

    Reads the clean/noisy label kv files and train key lists, upsamples the
    clean keys (with replacement) to ``upsample_ratio * len(noisy_keys)``
    samples, shuffles the mixture, and writes four line-aligned lists under
    ``output_dir``: image keys, clean labels, noisy labels, and noise-type
    labels. A label that is unknown for a key is written as "-1".

    Args:
        data_root: directory holding the *_label_kv.txt and *_key_list.txt files.
        output_dir: directory that already contains ntype_train.txt and that
            receives the mixed_train_*.txt outputs.
        upsample_ratio: multiple of the noisy-key count to sample the clean
            keys to; may be fractional.
    """
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, "ntype_train.txt"))))
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    clean_keys = read_list(os.path.join(data_root, "clean_train_key_list.txt"))
    noisy_keys = read_list(os.path.join(data_root, "noisy_train_key_list.txt"))
    # upsampling clean keys to ratio * #noisy_keys
    # BUG FIX: np.random.choice requires an integer sample count; the
    # original passed len(noisy_keys) * upsample_ratio, which is a float
    # whenever the ratio is (its default of 1.0 included) -> TypeError.
    num_clean_samples = int(len(noisy_keys) * upsample_ratio)
    clean_keys = np.random.choice(clean_keys, num_clean_samples)
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append("-1")
        else:
            clean.append("-1")
            noisy.append(noisy_kv[k])
        # noise type is only known for keys present in ntype_train.txt
        ntype.append(ntype_kv.get(k, "-1"))
    # trailing "-1" acts as a placeholder label on the image-list lines
    keys = [k + " -1" for k in keys]
    write_list(keys, os.path.join(output_dir, "mixed_train_images.txt"))
    write_list(clean, os.path.join(output_dir, "mixed_train_label_clean.txt"))
    write_list(noisy, os.path.join(output_dir, "mixed_train_label_noisy.txt"))
    write_list(ntype, os.path.join(output_dir, "mixed_train_label_ntype.txt"))
def make_aux_ntype(data_root, output_dir):
    """Estimate the clean->noisy confusion matrix and write noise-type files.

    Using keys that carry both a clean and a noisy label, computes matrix C
    via compute_matrix_c and saves it as a blobproto. Then, for each split
    (train/val/test), writes a shuffled "key ntype" file where ntype is
    0 when the labels agree, 1 when C[y_tilde][y] is at most the uniform
    threshold 1/(K-1), and 2 otherwise.
    """
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    train_keys = set(read_list(os.path.join(data_root, "clean_train_key_list.txt")))
    val_keys = set(read_list(os.path.join(data_root, "clean_val_key_list.txt")))
    test_keys = set(read_list(os.path.join(data_root, "clean_test_key_list.txt")))
    noisy_keys = set(noisy_kv.keys())

    # compute and save matrix C from samples that have both label kinds
    dual_labeled = (train_keys | val_keys) & noisy_keys
    y_true = np.asarray([int(clean_kv[k]) for k in dual_labeled])
    y_noisy = np.asarray([int(noisy_kv[k]) for k in dual_labeled])
    C = compute_matrix_c(y_true, y_noisy)
    save_to_blobproto(C, os.path.join(output_dir, "matrix_c.binaryproto"))

    # uniform-confusion threshold: 1 / (num_classes - 1)
    threshold = 1.0 / (C.shape[0] - 1)

    def _noise_type(y, y_tilde):
        # 0: labels agree; 1: confusion prob <= uniform; 2: otherwise
        if y == y_tilde:
            return "0"
        return "1" if C[y_tilde][y] <= threshold else "2"

    def _make(keys, token):
        lines = [
            key + " " + _noise_type(int(clean_kv[key]), int(noisy_kv[key]))
            for key in keys
        ]
        np.random.shuffle(lines)
        write_list(lines, os.path.join(output_dir, "ntype_{}.txt".format(token)))

    _make(train_keys & noisy_keys, "train")
    _make(val_keys & noisy_keys, "val")
    _make(test_keys & noisy_keys, "test")
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    """Blend clean and noisy train keys into four aligned label files.

    Clean keys are resampled (with replacement) up to
    ``upsample_ratio * len(noisy_keys)``, concatenated with the noisy keys,
    and shuffled. For every key, the clean label, noisy label, and noise
    type are emitted on the same line index of their respective output
    files; unknown labels are written as '-1'.

    Args:
        data_root: directory with the label kv files and key lists.
        output_dir: directory holding ntype_train.txt; receives the
            mixed_train_*.txt outputs.
        upsample_ratio: clean-key sample count as a multiple of the noisy
            key count (may be fractional).
    """
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, 'ntype_train.txt'))))
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    clean_keys = read_list(os.path.join(data_root, 'clean_train_key_list.txt'))
    noisy_keys = read_list(os.path.join(data_root, 'noisy_train_key_list.txt'))
    # upsampling clean keys to ratio * #noisy_keys
    # BUG FIX: the sample count must be an int; len(noisy_keys) *
    # upsample_ratio is a float (the default ratio is 1.0), which
    # np.random.choice rejects with a TypeError.
    n_samples = int(len(noisy_keys) * upsample_ratio)
    clean_keys = np.random.choice(clean_keys, n_samples)
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append('-1')
        else:
            clean.append('-1')
            noisy.append(noisy_kv[k])
        # noise type exists only for keys listed in ntype_train.txt
        ntype.append(ntype_kv.get(k, '-1'))
    # append a '-1' placeholder label to each image-list line
    keys = [k + ' -1' for k in keys]
    write_list(keys, os.path.join(output_dir, 'mixed_train_images.txt'))
    write_list(clean, os.path.join(output_dir, 'mixed_train_label_clean.txt'))
    write_list(noisy, os.path.join(output_dir, 'mixed_train_label_noisy.txt'))
    write_list(ntype, os.path.join(output_dir, 'mixed_train_label_ntype.txt'))
def main(args): id_offset = 0 merged_train_kv = {} merged_val_kv = {} for dataset_dir, db_dir in zip(args.dataset_dirs, args.db_dirs): train_files, train_labels = read_kv(osp.join(db_dir, 'train.txt')) val_files, val_labels = read_kv(osp.join(db_dir, 'val.txt')) unique_ids = set(map(int, train_labels + val_labels)) id_mapping = {idx: i + id_offset for i, idx in enumerate(unique_ids)} for k, v in zip(train_files, train_labels): merged_train_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)] for k, v in zip(val_files, val_labels): merged_val_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)] id_offset += len(id_mapping) mkdir_if_missing(osp.join(args.output_dir)) train_list = [k + ' ' + str(v) for k, v in merged_train_kv.iteritems()] np.random.shuffle(train_list) write_list(train_list, osp.join(args.output_dir, 'train.txt')) write_kv(merged_val_kv.keys(), map(str, merged_val_kv.values()), osp.join(args.output_dir, 'val.txt')) print "Max ID:", id_offset
def make_aux_clean(data_root, output_dir):
    """Write shuffled "key label" files for the clean train/val/test splits.

    Looks each split's keys up in clean_label_kv.txt and emits one shuffled
    clean_<split>.txt per split under output_dir.
    """
    label_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    for split in ("train", "val", "test"):
        split_keys = read_list(
            os.path.join(data_root, "clean_{}_key_list.txt".format(split)))
        lines = [key + " " + label_kv[key] for key in split_keys]
        np.random.shuffle(lines)
        write_list(lines, os.path.join(output_dir, "clean_{}.txt".format(split)))
def make_aux_ntype(data_root, output_dir):
    """Save the label-confusion matrix C and per-split noise-type files.

    C is estimated from all keys carrying both a clean and a noisy label
    and stored as matrix_c.binaryproto. Each split then gets a shuffled
    ntype_<split>.txt where every line is "key t" with t = 0 if the two
    labels agree, 1 if 1/(K-1) >= C[y_tilde][y], and 2 otherwise.
    """
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    train_keys = set(
        read_list(os.path.join(data_root, 'clean_train_key_list.txt')))
    val_keys = set(read_list(os.path.join(data_root, 'clean_val_key_list.txt')))
    test_keys = set(
        read_list(os.path.join(data_root, 'clean_test_key_list.txt')))
    noisy_keys = set(noisy_kv.keys())

    # compute and save matrix C over keys with both label kinds
    both = (train_keys | val_keys) & noisy_keys
    C = compute_matrix_c(
        np.asarray([int(clean_kv[k]) for k in both]),
        np.asarray([int(noisy_kv[k]) for k in both]))
    save_to_blobproto(C, os.path.join(output_dir, 'matrix_c.binaryproto'))

    alpha = 1.0 / (C.shape[0] - 1)

    def _make(keys, token):
        lines = []
        for key in keys:
            y = int(clean_kv[key])
            y_tilde = int(noisy_kv[key])
            if y == y_tilde:
                tag = ' 0'
            elif alpha >= C[y_tilde][y]:
                tag = ' 1'
            else:
                tag = ' 2'
            lines.append(key + tag)
        np.random.shuffle(lines)
        write_list(lines, os.path.join(output_dir,
                                       'ntype_{}.txt'.format(token)))

    # keep the call order: the shuffle RNG stream is consumed per split
    _make(train_keys & noisy_keys, 'train')
    _make(val_keys & noisy_keys, 'val')
    _make(test_keys & noisy_keys, 'test')
def make_aux_clean(data_root, output_dir):
    """Emit one shuffled "key label" file per clean split (train/val/test).

    Labels come from clean_label_kv.txt; each split's keys come from its
    clean_<split>_key_list.txt and the result lands in clean_<split>.txt.
    """
    label_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))

    def _write_split(name):
        key_file = os.path.join(data_root,
                                'clean_{}_key_list.txt'.format(name))
        entries = [key + ' ' + label_kv[key] for key in read_list(key_file)]
        np.random.shuffle(entries)
        write_list(entries,
                   os.path.join(output_dir, 'clean_{}.txt'.format(name)))

    _write_split('train')
    _write_split('val')
    _write_split('test')