def smote_bootstrap_sample(self, X, y, b, k):
    """Build a SMOTE-augmented bootstrap sample for a binary problem.

    The majority class is bootstrapped at a 100% sampling rate.  The
    minority class is filled up to the majority size with a mix of
    resampled originals and SMOTE-generated synthetic points, with the
    split between the two controlled by ``b``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of non-negative integer labels (usable with np.bincount).
    b : number
        Percentage (0-100); ``b`` percent of the majority count is drawn
        by resampling the minority class, the remainder is synthetic.
    k : unused here — the SMOTE neighbour count is taken from ``self.k``.
        NOTE(review): kept for interface compatibility; confirm whether
        callers expect ``k`` to be honoured.

    Returns
    -------
    (data, target)
        Resampled feature matrix and matching label vector.
    """
    count = np.bincount(y)  # number of instances of each class
    majority_class = count.argmax()
    majority_count = count.max()

    data = np.empty((0, X.shape[1]))
    target = np.empty((0, ))

    # Majority class: ordinary bootstrap of majority_count points.
    class_data = X[(y == majority_class), :]
    idx = np.random.choice(majority_count, (majority_count, ))
    data = np.concatenate((data, class_data[idx, :]))
    target = np.concatenate((target,
                             majority_class * np.ones((majority_count, ))))

    minority_class = count.argmin()
    minority_count = count.min()

    # Split majority_count between synthetic and resampled points.  After
    # the swap, b% of the total is resampled and the rest is synthetic.
    N_syn = int((majority_count) * (b / 100))
    N_res = majority_count - N_syn
    N_syn, N_res = N_res, N_syn

    class_data = X[(y == minority_class), :]
    idx = np.random.choice(class_data.shape[0], (N_res, ))
    sampled_min_data = class_data[idx, :]

    if N_syn > 0:
        # SMOTE's N parameter must be a positive multiple of 100.
        N_smote = np.ceil(N_syn / minority_count) * 100
        N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
        synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)
        idx = np.random.choice(synthetic.shape[0], (N_syn, ))
        new_class_data = np.concatenate((sampled_min_data, synthetic[idx, :]))
        data = np.concatenate((data, new_class_data))
        target = np.concatenate((target, minority_class * np.ones(
            (new_class_data.shape[0], ))))
    else:
        data = np.concatenate((data, sampled_min_data))
        target = np.concatenate((target, minority_class * np.ones(
            (sampled_min_data.shape[0], ))))

    return data, target
def smote_bootstrap_sample(self, X, y, b, k):
    """Draw a class-balanced bootstrap sample, using SMOTE for the minority.

    The majority class is bootstrapped 1:1.  The minority class reaches
    the same size through a blend of resampled originals (``b`` percent
    of the majority count) and SMOTE synthetics (the remainder).  The
    SMOTE neighbour count comes from ``self.k``.
    """
    class_counts = np.bincount(y)
    maj_label, maj_n = class_counts.argmax(), class_counts.max()
    min_label, min_n = class_counts.argmin(), class_counts.min()

    # Majority class: plain bootstrap of maj_n rows.
    maj_rows = X[y == maj_label, :]
    pick = np.random.choice(maj_n, (maj_n, ))
    data = np.concatenate((np.empty((0, X.shape[1])), maj_rows[pick, :]))
    target = np.concatenate((np.empty((0, )),
                             maj_label * np.ones((maj_n, ))))

    # b% of maj_n is the resampled share; the remainder is synthetic.
    n_resampled = int(maj_n * (b / 100))
    n_synthetic = maj_n - n_resampled

    min_rows = X[y == min_label, :]
    pick = np.random.choice(min_rows.shape[0], (n_resampled, ))
    resampled = min_rows[pick, :]

    if n_synthetic > 0:
        # SMOTE expects its rate N as a positive multiple of 100.
        rate = np.ceil(n_synthetic / min_n) * 100
        rate = 100 if rate < 100 else int(rate - rate % 100)
        synth = smote(X[y == min_label], N=int(rate), k=self.k)
        pick = np.random.choice(synth.shape[0], (n_synthetic, ))
        minority_block = np.concatenate((resampled, synth[pick, :]))
    else:
        minority_block = resampled

    data = np.concatenate((data, minority_block))
    target = np.concatenate(
        (target, min_label * np.ones((minority_block.shape[0], ))))
    return data, target
def bootstrap_classifiers(self, X, y, K, pos_prob):
    """Train ``K`` classifiers on SMOTE-balanced bootstrap samples.

    Each round first balances the binary problem by adding synthetic
    minority points, then draws a bootstrap in which each draw is a
    positive instance with probability ``pos_prob``.  Both labels are
    guaranteed to appear in every sample before fitting.

    Parameters
    ----------
    X, y : training data; ``self.positive_label`` marks the minority class.
    K : int, number of classifiers to build.
    pos_prob : float, per-draw probability of sampling a positive instance.

    Returns
    -------
    list of fitted clones of ``self.base_classifier``.
    """
    clfs = []
    for i in range(K):
        mask = (self.positive_label == y)
        negative_label = y[~mask][0]
        majority_size = np.sum(~mask)
        minority_size = len(mask) - majority_size

        # SMOTE rate: smallest multiple of 100 covering the class gap.
        N_smote = int(np.ceil(majority_size / minority_size) * 100)
        X_syn = smote(X[mask], N=N_smote, k=self.smote_k)
        y_syn = self.positive_label * np.ones((X_syn.shape[0], ))

        # Use just enough synthetic data to perfectly balance the classes.
        n_missing = majority_size - minority_size
        idx = np.random.choice(X_syn.shape[0], n_missing)
        X_new = np.concatenate((X, X_syn[idx, ]))
        y_new = np.concatenate((y, y_syn[idx, ]))
        mask = (self.positive_label == y_new)

        # Bootstrap with class priors (pos_prob, 1 - pos_prob).
        # np.random.randint(0, n) replaces the deprecated (and later
        # removed) inclusive np.random.random_integers(0, n - 1).
        cX, cy = [], []
        for j in range(X_new.shape[0]):
            if np.random.random() < pos_prob:
                idx = np.random.randint(0, len(X_new[mask]))
                cX.append(X_new[mask][idx])
                cy.append(self.positive_label)
            else:
                idx = np.random.randint(0, len(X_new[~mask]))
                cX.append(X_new[~mask][idx])
                cy.append(negative_label)

        # Guarantee that both labels are present in the sample.
        if self.positive_label not in cy:
            idx_1 = np.random.randint(0, len(cX))
            idx_2 = np.random.randint(0, len(X_new[mask]))
            cX[idx_1] = X_new[mask][idx_2]
            cy[idx_1] = self.positive_label
        elif negative_label not in cy:
            idx_1 = np.random.randint(0, len(cX))
            idx_2 = np.random.randint(0, len(X_new[~mask]))
            cX[idx_1] = X_new[~mask][idx_2]
            cy[idx_1] = negative_label

        clf = sklearn.base.clone(self.base_classifier)
        clfs.append(clf.fit(cX, cy))
    return clfs
def smote_bootstrap_sample(self, X, y, b, k):
    """Bootstrap every class up to the majority size, padding minorities
    with SMOTE synthetics.

    The majority class is bootstrapped 1:1.  Each minority class is
    bootstrapped at a rate derived from ``b`` and then completed with
    synthetic points until it matches the majority count.  SMOTE's
    neighbour count comes from ``self.k``.
    """
    labels = np.unique(y)
    counts = np.bincount(y)
    maj_label = counts.argmax()
    maj_n = counts.max()

    data = np.empty((0, X.shape[1]))
    target = np.empty((0, ))

    for label in labels:
        rows = X[y == label, :]
        if label == maj_label:
            # Majority: plain bootstrap at a 100% sampling rate.
            pick = np.random.choice(maj_n, (maj_n, ))
            block = rows[pick, :]
        else:
            # Minority: bootstrap at the b-controlled sampling rate...
            rate = (maj_n / rows.shape[0]) * (b / 100)
            pick = np.random.choice(rows.shape[0],
                                    (int(rate * rows.shape[0]), ))
            sampled = rows[pick, :]
            # ...then pad with SMOTE synthetics up to maj_n.  N must be
            # a multiple of 100; the epsilon keeps it from reaching zero.
            n_smote = int(np.ceil((maj_n / sampled.shape[0]) *
                                  (1 - b / 100 + 10e-8)) * 100)
            synth = smote(sampled, N=n_smote, k=self.k)
            shortfall = maj_n - sampled.shape[0]
            pick = np.random.choice(synth.shape[0], (shortfall, ))
            block = np.concatenate((sampled, synth[pick, :]))
        data = np.concatenate((data, block))
        target = np.concatenate((target,
                                 label * np.ones((block.shape[0], ))))
    return data, target
def smote_bootstrap_sample(self, X, y, b, k):
    """Return a bootstrap sample in which every class reaches the
    majority-class size, with SMOTE synthesizing the shortfall.

    Minority classes are first bootstrapped at a rate proportional to
    ``b`` and then completed with synthetic samples; ``self.k`` supplies
    the SMOTE neighbour count.
    """
    counts = np.bincount(y)
    big_label = counts.argmax()
    big_n = counts.max()

    chunks, label_blocks = [], []
    for cls in np.unique(y):
        members = X[y == cls, :]
        if cls == big_label:
            # Regular bootstrap (100% sampling rate) for the majority.
            chosen = np.random.choice(big_n, (big_n, ))
            chunk = members[chosen, :]
        else:
            # Bootstrap the minority at the b-derived sampling rate.
            frac = (big_n / members.shape[0]) * (b / 100)
            chosen = np.random.choice(members.shape[0],
                                      (int(frac * members.shape[0]), ))
            seed = members[chosen, :]
            # SMOTE rate rounded up to a multiple of 100; the tiny
            # offset guards against a zero rate.
            smote_rate = int(np.ceil((big_n / seed.shape[0]) *
                                     (1 - b / 100 + 10e-8)) * 100)
            extra = smote(seed, N=smote_rate, k=self.k)
            deficit = big_n - seed.shape[0]
            chosen = np.random.choice(extra.shape[0], (deficit, ))
            chunk = np.concatenate((seed, extra[chosen, :]))
        chunks.append(chunk)
        label_blocks.append(cls * np.ones((chunk.shape[0], )))

    # The float empty prefixes reproduce the original dtype promotion.
    data = np.concatenate([np.empty((0, X.shape[1]))] + chunks)
    target = np.concatenate([np.empty((0, ))] + label_blocks)
    return data, target
def bootstrap_classifiers(self, X, y, K, pos_prob):
    """Fit ``K`` base-classifier clones on SMOTE-balanced bootstraps.

    Per round: synthesize enough minority points to balance the binary
    problem, then bootstrap-sample with a positive-draw probability of
    ``pos_prob``, forcing at least one instance of each label into the
    sample before fitting.

    Parameters
    ----------
    X, y : training data; ``self.positive_label`` marks the minority class.
    K : int, number of classifiers to build.
    pos_prob : float, per-draw probability of sampling a positive instance.

    Returns
    -------
    list of fitted clones of ``self.base_classifier``.
    """
    clfs = []
    for i in range(K):
        mask = (self.positive_label == y)
        negative_label = y[~mask][0]
        majority_size = np.sum(~mask)
        minority_size = len(mask) - majority_size

        # SMOTE rate: smallest multiple of 100 that covers the gap.
        N_smote = int(np.ceil(majority_size / minority_size) * 100)
        X_syn = smote(X[mask], N=N_smote, k=self.smote_k)
        y_syn = self.positive_label * np.ones((X_syn.shape[0], ))

        # Add just enough synthetic data to perfectly balance the classes.
        n_missing = majority_size - minority_size
        idx = np.random.choice(X_syn.shape[0], n_missing)
        X_new = np.concatenate((X, X_syn[idx, ]))
        y_new = np.concatenate((y, y_syn[idx, ]))
        mask = (self.positive_label == y_new)

        # Bootstrap with class priors (pos_prob, 1 - pos_prob).  The
        # deprecated/removed np.random.random_integers(0, n - 1) is
        # replaced by np.random.randint(0, n) (exclusive high bound).
        cX, cy = [], []
        for j in range(X_new.shape[0]):
            if np.random.random() < pos_prob:
                idx = np.random.randint(0, len(X_new[mask]))
                cX.append(X_new[mask][idx])
                cy.append(self.positive_label)
            else:
                idx = np.random.randint(0, len(X_new[~mask]))
                cX.append(X_new[~mask][idx])
                cy.append(negative_label)

        # Make sure both labels are represented in the sample.
        if self.positive_label not in cy:
            idx_1 = np.random.randint(0, len(cX))
            idx_2 = np.random.randint(0, len(X_new[mask]))
            cX[idx_1] = X_new[mask][idx_2]
            cy[idx_1] = self.positive_label
        elif negative_label not in cy:
            idx_1 = np.random.randint(0, len(cX))
            idx_2 = np.random.randint(0, len(X_new[~mask]))
            cX[idx_1] = X_new[~mask][idx_2]
            cy[idx_1] = negative_label

        clf = sklearn.base.clone(self.base_classifier)
        clfs.append(clf.fit(cX, cy))
    return clfs