Example #1
def balance_class_count_hdf5(fpath,
                             keys,
                             key_label='label',
                             other_clname=CLNAME_OTHER):
    """ Resample keys in an HDF5 to generate a near balanced dataset.
    Returns a dictionary with resampled features and ground truth
    and indicies from the original label that were sampled.
    Not suitable for very large datasets.
    
    fpath -- path to HDF5 file
    keys -- keys to resample (e.g. features)
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    h_src = h5py.File(fpath, 'r')
    labls = h_src[key_label][:]
    bal = Balancer(np.squeeze(labls))
    class_count = bal.get_class_count(other_clname=other_clname)
    idxs = bal.get_idxs_to_balance_class_count(class_count.values())
    np.random.shuffle(idxs)  # shuffle the sampled indices in place

    dict_balanced = {key_label: labls[idxs]}
    for k in keys:
        dict_balanced[k] = h_src[k][:][idxs]
    h_src.close()
    return dict_balanced, idxs
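
A minimal usage sketch for the helper above. The file name 'train.h5' and the 'feat' key are hypothetical, and balance_class_count_hdf5 together with CLNAME_OTHER are assumed to be importable from the module shown here.

# Hypothetical usage; 'train.h5' and the 'feat' key are made-up names.
data, sampled_idxs = balance_class_count_hdf5('train.h5',
                                              keys=['feat'],
                                              key_label='label',
                                              other_clname=CLNAME_OTHER)
# data['label'] and data['feat'] hold the resampled, near-balanced arrays;
# sampled_idxs are the row indices drawn from the original file.
print(data['feat'].shape)
print(sampled_idxs.shape)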
Example #2
    def test_get_idxs_to_balance_class_count_other_highest(self):

        self.l[10:60, 1] = 0
        self.l[10:30, 1] = 1
        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 20)
        assert_equals(counts[CLNAME_OTHER], 70)
        assert_equals(counts[CLNAME_OTHER], np.max(list(counts.values())),
                      "this test requires class count for %s to be highest!"
                      % CLNAME_OTHER)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (70 - 10), tolerance_order)
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 10, idxs < 30)),
            20 + (70 - 20), tolerance_order)
        assert_equals(np.count_nonzero(idxs >= 30), 70, tolerance_order)
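
The test above (and the similar ones below) relies on a label fixture self.l that is not shown in these snippets. A hedged reconstruction, consistent with every count asserted in the examples (10 instances of class 0, 50 of class 1, 40 rows with no positive label, and self.num_classes == 2), could look like this; the original project's setUp may differ.

# Hypothetical fixture; not the original setUp, only consistent with the asserted counts.
import numpy as np

class TestBalancer(object):

    def setup(self):                  # nose-style per-test setup
        self.num_classes = 2
        # one-hot-style label matrix: column 0 marks class 0, column 1 marks class 1
        self.l = np.zeros((100, self.num_classes), dtype=int)
        self.l[:10, 0] = 1            # rows  0..9  -> class 0 (10 instances)
        self.l[10:60, 1] = 1          # rows 10..59 -> class 1 (50 instances)
        # rows 60..99 have no positive label and are counted as CLNAME_OTHER (40 instances)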
Example #3
def balance_class_count_hdf5(fpath, keys,
                             key_label='label',
                             other_clname=CLNAME_OTHER):
    """ Resample keys in an HDF5 to generate a near balanced dataset.
    Returns a dictionary with resampled features and ground truth
    and indicies from the original label that were sampled.
    Not suitable for very large datasets.
    
    fpath -- path to HDF5 file
    keys -- keys to resample (e.g. features)
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    h_src = h5py.File(fpath, 'r')
    labls = h_src[key_label][:]
    bal = Balancer(np.squeeze(labls))
    class_count = bal.get_class_count(other_clname=other_clname)
    idxs = bal.get_idxs_to_balance_class_count(class_count.values())
    np.random.shuffle(idxs)  # shuffle the sampled indices in place

    dict_balanced = {key_label: labls[idxs]}
    for k in keys:
        dict_balanced[k] = h_src[k][:][idxs]
    h_src.close()
    return dict_balanced, idxs
Example #4
def get_class_count_hdf5(fpath, key_label='label', other_clname=CLNAME_OTHER):
    """ Count per-class instances in HDF5 and return a dictionary of class ids
    and per-class count

    fpath -- path to HDF5 file
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    with h5py.File(fpath, 'r') as h:
        b = Balancer(np.squeeze(h[key_label][:]))
    return b.get_class_count(other_clname=other_clname)
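
A hypothetical call; the file path is made up and the commented result only illustrates the shape of the returned dictionary.

# Hypothetical usage; 'train.h5' is a made-up path.
counts = get_class_count_hdf5('train.h5', key_label='label', other_clname=CLNAME_OTHER)
# e.g. {0: 10, 1: 50, CLNAME_OTHER: 40} -- one entry per class plus the negative bucket
for class_id, n in counts.items():
    print(class_id, n)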
Example #5
def save_balanced_sampled_class_count_hdf5(fpath,
                                           keys,
                                           fpath_dst,
                                           key_label='label',
                                           other_clname=CLNAME_OTHER,
                                           chunks=None,
                                           target_count=None):
    """ Resample keys in an HDF5 to generate a near balanced dataset
    and save into a new HDF5.
    Returns indicies from the original label that were sampled.
    Not suitable for very large datasets.

    Classes with count < target_count will sub-sampled without replacement.
    Classes with count > target_count will get over-sampled.
    Classes with count equal to target_count will be copied.

    fpath -- path to source HDF5 file
    keys -- keys to resample (e.g. features)
    fpath_dst -- path to destination HDF5 file
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    chunks -- forward chunks parameter to use during hdf5 writing
    target_count -- per-class count to target when sampling
    """
    if os.path.abspath(fpath) == os.path.abspath(fpath_dst):
        raise IOError("Cannot read and write to the same file (%s) (%s)" %
                      (fpath, fpath_dst))

    with h5py.File(fpath, 'r') as h_src:
        labls = h_src[key_label][:]
        bal = Balancer(np.squeeze(labls))
        class_count = bal.get_class_count(other_clname=other_clname)
        idxs = bal.get_idxs_to_balance_class_count(class_count.values(),
                                                   target_count)
        np.random.shuffle(idxs)  # shuffle the sampled indices in place
        with h5py.File(fpath_dst, 'w') as h_dst:
            h_dst[key_label] = labls[idxs]
            for k in keys:
                dataset_src = h_src[k]
                shape_new = list(dataset_src.shape)
                shape_new[0] = len(idxs)
                dataset_dst = h_dst.create_dataset(k,
                                                   tuple(shape_new),
                                                   dataset_src.dtype,
                                                   chunks=chunks)
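                # copy row by row: the sampled indices may repeat and are not sorted,
                # which h5py fancy indexing does not accept; this also avoids loading
                # the whole source dataset into memory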
                for idx_dst, idx_src in enumerate(idxs):
                    dataset_dst[idx_dst] = dataset_src[idx_src]
    return idxs
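
A hypothetical end-to-end call; the file names, the 'feat' key, and the target count are illustrative assumptions.

# Hypothetical usage; 'train.h5', 'train_balanced.h5' and 'feat' are made-up names.
sampled_idxs = save_balanced_sampled_class_count_hdf5(
    'train.h5',                     # source HDF5
    keys=['feat'],                  # datasets to resample alongside the labels
    fpath_dst='train_balanced.h5',  # destination HDF5 (must differ from the source)
    key_label='label',
    other_clname=CLNAME_OTHER,
    chunks=None,                    # or e.g. a tuple to chunk the destination datasets
    target_count=1000)              # aim for roughly 1000 instances per class
print(len(sampled_idxs))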
Example #6
def get_class_count_hdf5(fpath,
                         key_label='label',
                         other_clname=CLNAME_OTHER):
    """ Count per-class instances in HDF5 and return a dictionary of class ids
    and per-class count

    fpath -- path to HDF5 file
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    """
    with h5py.File(fpath, 'r') as h:
        b = Balancer(np.squeeze(h[key_label][:]))
    return b.get_class_count(other_clname=other_clname)
Example #7
def save_balanced_sampled_class_count_hdf5(fpath,
                                           keys,
                                           fpath_dst,
                                           key_label='label',
                                           other_clname=CLNAME_OTHER,
                                           chunks=None,
                                           target_count=None
                                           ):
    """ Resample keys in an HDF5 to generate a near balanced dataset
    and save into a new HDF5.
    Returns indicies from the original label that were sampled.
    Not suitable for very large datasets.

    Classes with count < target_count will sub-sampled without replacement.
    Classes with count > target_count will get over-sampled.
    Classes with count equal to target_count will be copied.

    fpath -- path to source HDF5 file
    keys -- keys to resample (e.g. features)
    fpath_dst -- path to destination HDF5 file
    Keyword arguments:
    key_label -- key for ground truth data in HDF5
    other_clname -- name for negative class (None if non-existent)
    chunks -- forward chunks parameter to use during hdf5 writing
    target_count -- per-class count to target when sampling
    """
    if os.path.abspath(fpath) == os.path.abspath(fpath_dst):
        raise IOError("Cannot read and write to the same file (%s) (%s)" %
                      (fpath, fpath_dst))

    with h5py.File(fpath, 'r') as h_src:
        labls = h_src[key_label][:]
        bal = Balancer(np.squeeze(labls))
        class_count = bal.get_class_count(other_clname=other_clname)
        idxs = bal.get_idxs_to_balance_class_count(class_count.values(),
                                                   target_count)
        np.random.shuffle(idxs)  # shuffle the sampled indices in place
        with h5py.File(fpath_dst, 'w') as h_dst:
            h_dst[key_label] = labls[idxs]
            for k in keys:
                dataset_src = h_src[k]
                shape_new = list(dataset_src.shape)
                shape_new[0] = len(idxs)
                dataset_dst = h_dst.create_dataset(k, tuple(shape_new),
                                                   dataset_src.dtype,
                                                   chunks=chunks)
                for idx_dst, idx_src in enumerate(idxs):
                    dataset_dst[idx_dst] = dataset_src[idx_src]
    return idxs
Example #8
    def test_get_idxs_to_balance_class_count_other_not_highest(self):

        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[CLNAME_OTHER], 40)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (50 - 10), tolerance_order)
        assert_equals(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                      50, 1)
        assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40),
                            tolerance_order)
Example #9
    def test_get_idxs_to_balance_class_count_other_not_highest(self):

        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[CLNAME_OTHER], 40)

        for target_count in [500]:  # [10, 20, 500]:
            idxs = bal.sample_idxs_to_target_count(counts.values(),
                                                   target_count)

            assert_equals(idxs.size, (self.num_classes + 1) * target_count)
            assert_equals(np.count_nonzero(idxs < 10), target_count)
            assert_equals(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                          target_count)
            assert_equals(np.count_nonzero(idxs >= 60), target_count)
Example #10
    def test_get_idxs_to_balance_class_count_other_not_highest(self):

        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[CLNAME_OTHER], 40)

        for target_count in [500]:  #[10, 20, 500]:
            idxs = bal.sample_idxs_to_target_count(counts.values(),
                                                   target_count)

            assert_equals(idxs.size, (self.num_classes + 1) * target_count)
            assert_equals(np.count_nonzero(idxs < 10), target_count)
            assert_equals(
                np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                target_count)
            assert_equals(np.count_nonzero(idxs >= 60), target_count)
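
The two tests above exercise Balancer.sample_idxs_to_target_count, which draws exactly target_count indices per class, over-sampling with replacement whenever a class has fewer instances than the target. A hedged sketch of how that might be used outside the tests, assuming the same Balancer API and a hypothetical label file:

# Hedged sketch; 'labels.npy' is a made-up file, Balancer and CLNAME_OTHER come from the module above.
import numpy as np

labels = np.load('labels.npy')
bal = Balancer(np.squeeze(labels))
counts = bal.get_class_count(other_clname=CLNAME_OTHER)
idxs = bal.sample_idxs_to_target_count(counts.values(), 500)  # 500 indices per class
np.random.shuffle(idxs)  # mix the classes before using the indices for training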
Example #11
    def test_get_idxs_to_balance_class_count_other_not_highest(self):

        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[CLNAME_OTHER], 40)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (50 - 10), tolerance_order)
        assert_equals(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                      50, 1)
        assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40),
                            tolerance_order)
Example #12
    def test_get_idxs_to_balance_class_count_no_other(self):

        new_col = np.zeros((len(self.l), 1))
        labls = np.hstack((self.l, new_col))
        labls[60:, -1] = 1
        bal = Balancer(labls)
        counts = bal.get_class_count(other_clname=None)
        assert_not_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[2], 40)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (50 - 10), tolerance_order)
        assert_equals(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                      50, 1)
        assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40),
                            tolerance_order)
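
This test shows the no-'other' variant: every class gets its own label column and other_clname is set to None. A hedged illustration of the same idea outside the test harness (the column layout and counts are assumptions mirroring the fixture):

# Hedged sketch: explicit per-class columns, no "other" bucket (other_clname=None).
import numpy as np

labels = np.zeros((100, 3), dtype=int)  # hypothetical one-hot-style label matrix
labels[:10, 0] = 1                      # class 0: 10 instances
labels[10:60, 1] = 1                    # class 1: 50 instances
labels[60:, 2] = 1                      # class 2: 40 instances
counts = Balancer(labels).get_class_count(other_clname=None)
# expected: {0: 10, 1: 50, 2: 40}, with no CLNAME_OTHER entry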
Example #13
    def test_get_idxs_to_balance_class_count_other_highest(self):

        self.l[10:60, 1] = 0
        self.l[10:30, 1] = 1
        bal = Balancer(np.copy(self.l))
        counts = bal.get_class_count(other_clname=CLNAME_OTHER)
        assert_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 20)
        assert_equals(counts[CLNAME_OTHER], 70)
        assert_equals(counts[CLNAME_OTHER], np.max(list(counts.values())),
                      "this test requires class count for %s to be highest!"
                      % CLNAME_OTHER)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (70 - 10), tolerance_order)
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 10, idxs < 30)),
            20 + (70 - 20), tolerance_order)
        assert_equals(np.count_nonzero(idxs >= 30), 70, tolerance_order)
Example #14
    def test_get_idxs_to_balance_class_count_no_other(self):

        new_col = np.zeros((len(self.l), 1))
        labls = np.hstack((self.l, new_col))
        labls[60:, -1] = 1
        bal = Balancer(labls)
        counts = bal.get_class_count(other_clname=None)
        assert_not_in(CLNAME_OTHER, counts.keys())

        assert_equals(counts[0], 10)
        assert_equals(counts[1], 50)
        assert_equals(counts[2], 40)
        tolerance_order = 1
        idxs = bal.get_idxs_to_balance_class_count(counts.values())
        assert_almost_equal(
            np.count_nonzero(np.logical_and(idxs >= 0, idxs < 10)),
            10 + (50 - 10), tolerance_order)
        assert_equals(np.count_nonzero(np.logical_and(idxs >= 10, idxs < 60)),
                      50, 1)
        assert_almost_equal(np.count_nonzero(idxs >= 60), 40 + (50 - 40),
                            tolerance_order)