Example #1
0
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    """Mix clean and noisy training samples into one shuffled label set.

    Clean keys are upsampled (with replacement) to ``upsample_ratio`` times
    the number of noisy keys, merged with the noisy keys, shuffled, and
    written out as four parallel files. For each key the label that does not
    apply is written as "-1".

    Args:
        data_root: directory holding the clean/noisy label kv files and the
            train key lists.
        output_dir: directory to write the ``mixed_train_*`` files; must
            already contain ``ntype_train.txt`` (presumably produced by
            make_aux_ntype — confirm pipeline order).
        upsample_ratio: ratio of upsampled-clean count to noisy count.
    """
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, "ntype_train.txt"))))
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    clean_keys = read_list(os.path.join(data_root, "clean_train_key_list.txt"))
    noisy_keys = read_list(os.path.join(data_root, "noisy_train_key_list.txt"))
    # upsampling clean keys to ratio * #noisy_keys
    # BUG FIX: np.random.choice requires an integer sample size; the product
    # with a float ratio (even the default 1.0) raises a TypeError.
    num_clean = int(len(noisy_keys) * upsample_ratio)
    clean_keys = np.random.choice(clean_keys, num_clean)
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append("-1")
        else:
            clean.append("-1")
            noisy.append(noisy_kv[k])
        # noise type is only defined for keys listed in ntype_train.txt
        ntype.append(ntype_kv.get(k, "-1"))
    # trailing " -1" is a placeholder label column in the image-list format
    keys = [k + " -1" for k in keys]
    write_list(keys, os.path.join(output_dir, "mixed_train_images.txt"))
    write_list(clean, os.path.join(output_dir, "mixed_train_label_clean.txt"))
    write_list(noisy, os.path.join(output_dir, "mixed_train_label_noisy.txt"))
    write_list(ntype, os.path.join(output_dir, "mixed_train_label_ntype.txt"))
Example #2
0
def make_aux_ntype(data_root, output_dir):
    """Estimate the label confusion matrix C and assign a noise type to every
    sample that has both a clean and a noisy label.

    Noise types written per key:
        0 - noisy label equals the clean label
        1 - mislabeled, but confusion prob C[y_tilde][y] is at most the
            uniform rate alpha = 1/(num_classes - 1) ("random" noise)
        2 - mislabeled with above-uniform confusion ("confusing" noise)

    Args:
        data_root: directory with the label kv files and clean key lists.
        output_dir: directory for matrix_c.binaryproto and ntype_*.txt.
    """
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    train_keys = set(read_list(os.path.join(data_root, "clean_train_key_list.txt")))
    val_keys = set(read_list(os.path.join(data_root, "clean_val_key_list.txt")))
    test_keys = set(read_list(os.path.join(data_root, "clean_test_key_list.txt")))
    noisy_keys = set(noisy_kv.keys())
    # compute and save matrix C
    # FIX: materialize the set as a list so both label arrays are built from
    # one fixed key order instead of relying on repeated set iteration.
    keys = list((train_keys | val_keys) & noisy_keys)
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    C = compute_matrix_c(clean_labels, noisy_labels)
    save_to_blobproto(C, os.path.join(output_dir, "matrix_c.binaryproto"))

    # make noise type (ntype)
    def _make(keys, token):
        # Fix the iteration order so zip pairs each key with its own labels.
        keys = list(keys)
        clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
        noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
        lines = []
        alpha = 1.0 / (C.shape[0] - 1)
        for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
            if y == y_tilde:
                lines.append(key + " 0")
            elif alpha >= C[y_tilde][y]:
                lines.append(key + " 1")
            else:
                lines.append(key + " 2")
        np.random.shuffle(lines)
        output_file = os.path.join(output_dir, "ntype_{}.txt".format(token))
        write_list(lines, output_file)

    _make(train_keys & noisy_keys, "train")
    _make(val_keys & noisy_keys, "val")
    _make(test_keys & noisy_keys, "test")
Example #3
0
def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    """Mix clean and noisy training samples into one shuffled label set.

    Clean keys are upsampled (with replacement) to ``upsample_ratio`` times
    the number of noisy keys, merged with the noisy keys, shuffled, and
    written out as four parallel files. For each key the label that does not
    apply is written as '-1'.

    Args:
        data_root: directory holding the label kv files and train key lists.
        output_dir: directory to write ``mixed_train_*``; must already
            contain ``ntype_train.txt`` (presumably produced by
            make_aux_ntype — confirm pipeline order).
        upsample_ratio: ratio of upsampled-clean count to noisy count.
    """
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, 'ntype_train.txt'))))
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    clean_keys = read_list(os.path.join(data_root, 'clean_train_key_list.txt'))
    noisy_keys = read_list(os.path.join(data_root, 'noisy_train_key_list.txt'))
    # upsampling clean keys to ratio * #noisy_keys
    # BUG FIX: np.random.choice requires an integer sample size; the product
    # with a float ratio (even the default 1.0) raises a TypeError.
    num_clean = int(len(noisy_keys) * upsample_ratio)
    clean_keys = np.random.choice(clean_keys, num_clean)
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append('-1')
        else:
            clean.append('-1')
            noisy.append(noisy_kv[k])
        # noise type is only defined for keys listed in ntype_train.txt
        ntype.append(ntype_kv.get(k, '-1'))
    # trailing ' -1' is a placeholder label column in the image-list format
    keys = [k + ' -1' for k in keys]
    write_list(keys, os.path.join(output_dir, 'mixed_train_images.txt'))
    write_list(clean, os.path.join(output_dir, 'mixed_train_label_clean.txt'))
    write_list(noisy, os.path.join(output_dir, 'mixed_train_label_noisy.txt'))
    write_list(ntype, os.path.join(output_dir, 'mixed_train_label_ntype.txt'))
Example #4
0
def main(args):
    id_offset = 0
    merged_train_kv = {}
    merged_val_kv = {}
    for dataset_dir, db_dir in zip(args.dataset_dirs, args.db_dirs):
        train_files, train_labels = read_kv(osp.join(db_dir, 'train.txt'))
        val_files, val_labels = read_kv(osp.join(db_dir, 'val.txt'))
        unique_ids = set(map(int, train_labels + val_labels))
        id_mapping = {idx: i + id_offset for i, idx in enumerate(unique_ids)}
        for k, v in zip(train_files, train_labels):
            merged_train_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        for k, v in zip(val_files, val_labels):
            merged_val_kv[osp.join(dataset_dir, k)] = id_mapping[int(v)]
        id_offset += len(id_mapping)
    mkdir_if_missing(osp.join(args.output_dir))
    train_list = [k + ' ' + str(v) for k, v in merged_train_kv.iteritems()]
    np.random.shuffle(train_list)
    write_list(train_list, osp.join(args.output_dir, 'train.txt'))
    write_kv(merged_val_kv.keys(), map(str, merged_val_kv.values()),
             osp.join(args.output_dir, 'val.txt'))
    print "Max ID:", id_offset
Example #5
0
def make_aux_clean(data_root, output_dir):
    """Write shuffled "<key> <label>" files for the clean train/val/test splits."""
    label_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))

    def _make(token):
        # read the split's key list, attach labels, shuffle, and dump
        key_file = os.path.join(data_root, "clean_{}_key_list.txt".format(token))
        out_file = os.path.join(output_dir, "clean_{}.txt".format(token))
        entries = [k + " " + label_kv[k] for k in read_list(key_file)]
        np.random.shuffle(entries)
        write_list(entries, out_file)

    for split in ("train", "val", "test"):
        _make(split)
Example #6
0
def make_aux_ntype(data_root, output_dir):
    """Estimate the label confusion matrix C and assign a noise type to every
    sample that has both a clean and a noisy label.

    Noise types written per key:
        0 - noisy label equals the clean label
        1 - mislabeled, but confusion prob C[y_tilde][y] is at most the
            uniform rate alpha = 1/(num_classes - 1) ("random" noise)
        2 - mislabeled with above-uniform confusion ("confusing" noise)

    Args:
        data_root: directory with the label kv files and clean key lists.
        output_dir: directory for matrix_c.binaryproto and ntype_*.txt.
    """
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    train_keys = set(
        read_list(os.path.join(data_root, 'clean_train_key_list.txt')))
    val_keys = set(read_list(os.path.join(data_root,
                                          'clean_val_key_list.txt')))
    test_keys = set(
        read_list(os.path.join(data_root, 'clean_test_key_list.txt')))
    noisy_keys = set(noisy_kv.keys())
    # compute and save matrix C
    # FIX: materialize the set as a list so both label arrays are built from
    # one fixed key order instead of relying on repeated set iteration.
    keys = list((train_keys | val_keys) & noisy_keys)
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    C = compute_matrix_c(clean_labels, noisy_labels)
    save_to_blobproto(C, os.path.join(output_dir, 'matrix_c.binaryproto'))

    # make noise type (ntype)
    def _make(keys, token):
        # Fix the iteration order so zip pairs each key with its own labels.
        keys = list(keys)
        clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
        noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
        lines = []
        alpha = 1.0 / (C.shape[0] - 1)
        for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
            if y == y_tilde:
                lines.append(key + ' 0')
            elif alpha >= C[y_tilde][y]:
                lines.append(key + ' 1')
            else:
                lines.append(key + ' 2')
        np.random.shuffle(lines)
        output_file = os.path.join(output_dir, 'ntype_{}.txt'.format(token))
        write_list(lines, output_file)

    _make(train_keys & noisy_keys, 'train')
    _make(val_keys & noisy_keys, 'val')
    _make(test_keys & noisy_keys, 'test')
Example #7
0
def make_aux_clean(data_root, output_dir):
    """Write shuffled "<key> <label>" files for the clean train/val/test splits."""
    label_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))

    def _make(token):
        # read the split's key list, attach labels, shuffle, and dump
        key_file = os.path.join(data_root,
                                'clean_{}_key_list.txt'.format(token))
        out_file = os.path.join(output_dir, 'clean_{}.txt'.format(token))
        entries = [k + ' ' + label_kv[k] for k in read_list(key_file)]
        np.random.shuffle(entries)
        write_list(entries, out_file)

    for split in ('train', 'val', 'test'):
        _make(split)