Example #1
    def smote_bootstrap_sample(self, X, y, b, k):

        classes = np.unique(y)
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # label of the majority class
        majority_count = count.max()  # size of the majority class

        data = np.empty((0, X.shape[1]))
        target = np.empty((0, ))

        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count, ))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate((target, majority_class * np.ones(
            (majority_count, ))))

        minority_class = count.argmin()
        minority_count = count.min()

        # N_res instances are bootstrapped from the real minority data (b% of
        # the majority count); the remaining N_syn are generated with SMOTE
        N_res = int(majority_count * (b / 100))
        N_syn = majority_count - N_res

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res, ))
        sampled_min_data = class_data[idx, :]
        if N_syn > 0:
            # smote() expects its rate N as a multiple-of-100 percentage,
            # so round the required rate up accordingly
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)

            idx = np.random.choice(synthetic.shape[0], (N_syn, ))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate((target, minority_class * np.ones(
                (new_class_data.shape[0], ))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate((target, minority_class * np.ones(
                (sampled_min_data.shape[0], ))))

        return data, target
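
Every snippet on this page depends on an external smote(T, N, k) helper that is not shown. A minimal sketch of what such a helper might look like, assuming the usual SMOTE convention that N is a multiple-of-100 percentage, that T has more than k rows, and that (N / 100) * len(T) synthetic rows are returned by interpolating each sample toward one of its k nearest neighbors:

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    def smote(T, N=100, k=5):
        """Hypothetical stand-in for the smote() helper the examples call."""
        n_per_sample = N // 100  # synthetic points generated per original row
        # k + 1 neighbors because each point is its own nearest neighbor
        neighbors = NearestNeighbors(n_neighbors=k + 1).fit(T).kneighbors(
            T, return_distance=False)

        synthetic = np.empty((n_per_sample * T.shape[0], T.shape[1]))
        row = 0
        for i in range(T.shape[0]):
            for _ in range(n_per_sample):
                # pick one of the k real neighbors (column 0 is the point itself)
                j = neighbors[i, np.random.randint(1, k + 1)]
                gap = np.random.random()  # interpolation factor in [0, 1)
                synthetic[row] = T[i] + gap * (T[j] - T[i])
                row += 1
        return synthetic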
Example #2
    def smote_bootstrap_sample(self, X, y, b, k):

        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # label of the majority class
        majority_count = count.max()  # size of the majority class

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # N_res instances are bootstrapped from the real minority data (b% of
        # the majority count); the remaining N_syn are generated with SMOTE
        N_res = int(majority_count * (b / 100))
        N_syn = majority_count - N_res

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]
        if N_syn > 0:
            # smote() expects its rate N as a multiple-of-100 percentage,
            # so round the required rate up accordingly
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)

            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target, minority_class * np.ones((sampled_min_data.shape[0],))))  # noqa

        return data, target
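
A quick way to sanity-check either variant: since the method only reads self.k, a SimpleNamespace can stand in for the host object. The setup below is illustrative and assumes smote_bootstrap_sample and a smote() helper (such as the sketch under Example #1) are available at module level:

    import numpy as np
    from types import SimpleNamespace

    X = np.vstack([np.random.rand(100, 4), np.random.rand(10, 4) + 2.0])
    y = np.array([0] * 100 + [1] * 10)  # 100 majority rows, 10 minority rows

    data, target = smote_bootstrap_sample(SimpleNamespace(k=3), X, y, b=50, k=3)
    print(data.shape)                        # (200, 4)
    print(np.bincount(target.astype(int)))  # [100 100] -- perfectly balanced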
Example #3
    def bootstrap_classifiers(self, X, y, K, pos_prob):

        clfs = []
        
        for i in range(K):
            mask = (self.positive_label == y)
            negative_label = y[~mask][0]

            majority_size = np.sum(~mask)
            minority_size = len(mask) - majority_size
            
            # SMOTE rate needed to synthesize at least majority_size
            # minority instances (as a multiple-of-100 percentage)
            N_smote = int(np.ceil(majority_size / minority_size) * 100)

            X_syn = smote(X[mask], N=N_smote, k=self.smote_k)
            y_syn = self.positive_label * np.ones((X_syn.shape[0],))

            # use enough synthetic data to perfectly balance the binary problem
            n_missing = majority_size - minority_size
            idx = np.random.choice(X_syn.shape[0], n_missing)

            # add synthetic data to original data
            X_new = np.concatenate((X, X_syn[idx]))
            y_new = np.concatenate((y, y_syn[idx]))

            # use new mask
            mask = (self.positive_label == y_new)

            # balance the classes

            pos_data, neg_data = X_new[mask], X_new[~mask]

            cX, cy = [], []
            for j in range(X_new.shape[0]):
                if np.random.random() < pos_prob:
                    # np.random.random_integers is deprecated; randint's upper
                    # bound is exclusive, so no "- 1" is needed
                    idx = np.random.randint(0, len(pos_data))
                    cX.append(pos_data[idx])
                    cy.append(self.positive_label)
                else:
                    idx = np.random.randint(0, len(neg_data))
                    cX.append(neg_data[idx])
                    cy.append(negative_label)
            # guarantee that both labels appear at least once in the bag
            if self.positive_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(pos_data))
                cX[idx_1] = pos_data[idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(neg_data))
                cX[idx_1] = neg_data[idx_2]
                cy[idx_1] = negative_label

            clf = sklearn.base.clone(self.base_classifier)
            clfs.append(clf.fit(cX, cy))

        return clfs
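
bootstrap_classifiers returns a plain list of fitted estimators, so combining them is left to the caller. A hedged sketch of hard majority voting over such a pool, assuming non-negative integer labels (the helper name majority_vote is ours, not the source's):

    import numpy as np

    def majority_vote(clfs, X):
        """Hard-voting combiner for the pool built by bootstrap_classifiers()."""
        votes = np.array([clf.predict(X) for clf in clfs])  # shape (K, n_samples)
        # most frequent label in each column
        return np.apply_along_axis(
            lambda col: np.bincount(col.astype(int)).argmax(), 0, votes)

With an odd K, ties between the two labels cannot occur in this binary setting.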
Example #4
    def smote_bootstrap_sample(self, X, y, b, k):
        
        classes = np.unique(y)
        count = np.bincount(y) # number of instances of each class

        majority_class = count.argmax()  # label of the majority class
        majority_count = count.max()  # size of the majority class

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:

            class_data = X[(y == i), :]

            if i == majority_class:  # majority class
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate((target, i * np.ones((majority_count,))))

            else:  # minority classes
                # bootstrap the class data with the defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                idx = np.random.choice(class_data.shape[0],
                                       (int(sample_rate * class_data.shape[0]),))
                sampled_class_data = class_data[idx, :]
                
                # run smote on the bootstrapped data to obtain synthetic samples;
                # ceil keeps N_smote a multiple of 100, and the small epsilon
                # keeps it from collapsing to zero when b == 100
                N_smote = int(np.ceil((majority_count / sampled_class_data.shape[0])
                                      * (1 - b / 100 + 10e-8)) * 100)

                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)
               
                # add synthetic samples to sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate((sampled_class_data, synthetic[idx, :]))
                data = np.concatenate((data, new_class_data))
                target = np.concatenate((target, i * np.ones((new_class_data.shape[0],))))

        return data, target
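
The minority-branch arithmetic is easiest to follow with concrete numbers. A worked example assuming majority_count = 100, a minority class of 20 rows, b = 60, and the smote() convention sketched under Example #1:

    import numpy as np

    majority_count, minority_count, b = 100, 20, 60

    # bootstrap b% of the majority count from the real minority rows ...
    sample_rate = (majority_count / minority_count) * (b / 100)  # 3.0
    n_sampled = int(sample_rate * minority_count)                # 60 rows

    # ... and cover the remaining 40% with SMOTE; the epsilon keeps the
    # rate from collapsing to zero when b == 100
    N_smote = int(np.ceil((majority_count / n_sampled)
                          * (1 - b / 100 + 10e-8)) * 100)        # 100
    n_synthetic = (N_smote // 100) * n_sampled                   # 60 available
    n_missing = majority_count - n_sampled                       # 40 drawn
    # final minority size: 60 bootstrapped + 40 synthetic = 100 = majority_count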
Example #5
    def smote_bootstrap_sample(self, X, y, b, k):

        classes = np.unique(y)
        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # label of the majority class
        majority_count = count.max()  # size of the majority class

        data = np.empty((0, X.shape[1]))
        target = np.empty((0, ))

        for i in classes:

            class_data = X[(y == i), :]

            if i == majority_class:  # majority class
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count, ))
                data = np.concatenate((data, class_data[idx, :]))
                target = np.concatenate((target, i * np.ones(
                    (majority_count, ))))

            else:  # minority classes
                # bootstrap the class data with defined sampling rate
                sample_rate = (majority_count / class_data.shape[0]) * (b / 100)
                idx = np.random.choice(
                    class_data.shape[0],
                    (int(sample_rate * class_data.shape[0]), ))
                sampled_class_data = class_data[idx, :]

                # run smote on the bootstrapped data to obtain synthetic samples;
                # ceil keeps N_smote a multiple of 100, and the small epsilon
                # keeps it from collapsing to zero when b == 100
                N_smote = int(
                    np.ceil((majority_count / sampled_class_data.shape[0]) *
                            (1 - b / 100 + 10e-8)) * 100)
                synthetic = smote(sampled_class_data, N=N_smote, k=self.k)

                # add synthetic samples to sampled class data
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing, ))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))
                data = np.concatenate((data, new_class_data))
                target = np.concatenate((target, i * np.ones(
                    (new_class_data.shape[0], ))))

        return data, target
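
Unlike Examples #1 and #2, this variant loops over all the classes, so it also handles multiclass problems. A quick illustrative check with three classes, using the same SimpleNamespace stand-in as before and assuming the method and a smote() helper are in scope:

    import numpy as np
    from types import SimpleNamespace

    X = np.random.rand(140, 4)
    y = np.array([0] * 100 + [1] * 30 + [2] * 10)

    data, target = smote_bootstrap_sample(SimpleNamespace(k=3), X, y, b=50, k=3)
    print(np.bincount(target.astype(int)))  # [100 100 100] -- all classes balanced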
Example #6
    def bootstrap_classifiers(self, X, y, K, pos_prob):

        clfs = []

        for i in range(K):
            mask = (self.positive_label == y)
            negative_label = y[~mask][0]

            majority_size = np.sum(~mask)
            minority_size = len(mask) - majority_size

            # SMOTE rate needed to synthesize at least majority_size
            # minority instances (as a multiple-of-100 percentage)
            N_smote = int(np.ceil(majority_size / minority_size) * 100)

            X_syn = smote(X[mask], N=N_smote, k=self.smote_k)
            y_syn = self.positive_label * np.ones((X_syn.shape[0], ))

            # use enough synthetic data to perfectly balance the binary problem
            n_missing = majority_size - minority_size
            idx = np.random.choice(X_syn.shape[0], n_missing)

            # add synthetic data to original data
            X_new = np.concatenate((X, X_syn[idx]))
            y_new = np.concatenate((y, y_syn[idx]))

            # use new mask
            mask = (self.positive_label == y_new)

            # balance the classes

            pos_data, neg_data = X_new[mask], X_new[~mask]

            cX, cy = [], []
            for j in range(X_new.shape[0]):
                if np.random.random() < pos_prob:
                    # np.random.random_integers is deprecated; randint's upper
                    # bound is exclusive, so no "- 1" is needed
                    idx = np.random.randint(0, len(pos_data))
                    cX.append(pos_data[idx])
                    cy.append(self.positive_label)
                else:
                    idx = np.random.randint(0, len(neg_data))
                    cX.append(neg_data[idx])
                    cy.append(negative_label)
            # guarantee that both labels appear at least once in the bag
            if self.positive_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(pos_data))
                cX[idx_1] = pos_data[idx_2]
                cy[idx_1] = self.positive_label
            elif negative_label not in cy:
                idx_1 = np.random.randint(0, len(cX))
                idx_2 = np.random.randint(0, len(neg_data))
                cX[idx_1] = neg_data[idx_2]
                cy[idx_1] = negative_label

            clf = sklearn.base.clone(self.base_classifier)
            clfs.append(clf.fit(cX, cy))

        return clfs
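
Because each classifier in the pool sees a differently resampled bag, averaging predicted probabilities (soft voting) is a natural alternative to the hard vote sketched after Example #3. A minimal sketch, assuming every base classifier exposes predict_proba (the helper name soft_vote is ours, not the source's):

    import numpy as np

    def soft_vote(clfs, X):
        """Average class-membership probabilities across the bootstrapped pool."""
        probs = np.mean([clf.predict_proba(X) for clf in clfs], axis=0)
        # classes_ is the same for every clone, since each bag is forced to
        # contain both labels
        return clfs[0].classes_[probs.argmax(axis=1)]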