import numpy as np
import pandas as pd
import smogn


def smote_oversampling_regression(loop_features, loop_targets, new_feature_names):
    print("+++ SMOTE Oversampling Regression")
    # Combine features and targets into one DataFrame, since smogn expects the
    # target to be a named column of the input frame.
    loop_features_np = np.array(loop_features)
    loop_targets_np = np.array(loop_targets)
    loop_targets_np = np.reshape(loop_targets_np, (np.shape(loop_targets_np)[0], 1))
    combined_data = np.append(loop_features_np, loop_targets_np, axis=1)
    combined_data_list = combined_data.tolist()
    dataframe = pd.DataFrame(combined_data_list)
    column_names = new_feature_names.copy()
    column_names.append("target")
    dataframe.columns = column_names
    # smogn.smoter returns a new DataFrame; the result must be captured,
    # otherwise the oversampling has no effect.
    dataframe = smogn.smoter(data=dataframe, y="target")
    # Split the resampled frame back into feature and target lists.
    upsampled_data = np.array(dataframe.values.tolist())
    new_loop_targets = upsampled_data[:, -1].tolist()
    new_loop_features = np.delete(upsampled_data, -1, axis=1).tolist()
    return new_loop_features, new_loop_targets
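# Hypothetical usage sketch with synthetic data (feature names and values below
# are made up for illustration; smogn may still raise a ValueError on inputs
# that are very small or contain no rare target values).
rng = np.random.default_rng(0)
features = rng.normal(size=(200, 3)).tolist()
targets = np.exp(rng.normal(size=200)).tolist()  # skewed target distribution
new_features, new_targets = smote_oversampling_regression(
    features, targets, ["feat_a", "feat_b", "feat_c"])
print(len(new_features), "resampled rows")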
import argparse

import numpy as np
import pandas as pd
import smogn


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-it', '--input_train', help="input file for train",
                        default='../dataset/train.npy')
    parser.add_argument('-ot', '--output_train', help="output file for train weights",
                        default='../dataset/train_weights.npy')
    args = parser.parse_args()

    src = np.load(args.input_train, allow_pickle=True)
    src_df = pd.DataFrame(data=src)
    src_df = src_df.add_prefix('col')

    # TODO: ERROR with custom phi relevance
    smogn_df = smogn.smoter(src_df, 'col74', k=9, rel_coef=0.5, rel_thres=0.02)
    # The parser only defines --output_train; args.output_eval does not exist
    # and would raise AttributeError.
    np.save(args.output_train, smogn_df.to_numpy(), allow_pickle=True)
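# Possible approach to the "custom phi" TODO above: smogn supports a manual
# relevance function via rel_method='manual' and rel_ctrl_pts_rg (the next
# snippet uses it on real data). Everything below is a sketch; the path,
# column name, and control points are placeholders to be tuned to the data.
import numpy as np
import pandas as pd
import smogn

src_df = pd.DataFrame(np.load('../dataset/train.npy', allow_pickle=True)).add_prefix('col')
smogn_df = smogn.smoter(
    data=src_df,
    y='col74',
    k=9,
    rel_thres=0.02,
    rel_method='manual',
    # Each control point is [target value, relevance, 0].
    rel_ctrl_pts_rg=[[0.0, 0, 0],
                     [1.0, 1, 0]],
)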
import os

import smogn


def smogn_on_train_splitted():
    smogn_params = dict(
        # main arguments
        data=dataset['ori']['train_splitted'],
        y='smap_windspd',        # string ('header name')
        k=7,                     # positive integer (k < n)
        pert=0.02,               # real number (0 < R < 1)
        samp_method='extreme',   # string ('balance' or 'extreme')
        drop_na_col=True,        # boolean (True or False)
        drop_na_row=True,        # boolean (True or False)
        replace=False,           # boolean (True or False)
        # phi relevance arguments
        rel_thres=0.9,           # real number (0 < R < 1)
        rel_method='manual',     # string ('auto' or 'manual')
        # rel_xtrm_type='high',  # unused (rel_method='manual')
        # rel_coef=3.525,        # unused (rel_method='manual')
        rel_ctrl_pts_rg=[[5, 0, 0],
                         [20, 0, 0],
                         [35, 0, 0],
                         [50, 1, 0]])
    save_dir = ('/Users/lujingze/Programming/SWFusion/regression/'
                'tc/dataset/smogn_final/smogn_on_train_splitted/')
    train_splitted_smogn = smogn.smoter(**smogn_params)
    os.makedirs(save_dir, exist_ok=True)
    train_splitted_smogn.to_pickle(f'{save_dir}train_splitted_smogn.pkl')
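# Quick distribution check of the resampled target (a sketch; assumes the
# pickle written by smogn_on_train_splitted() above already exists at save_dir).
import pandas as pd

save_dir = ('/Users/lujingze/Programming/SWFusion/regression/'
            'tc/dataset/smogn_final/smogn_on_train_splitted/')
train_splitted_smogn = pd.read_pickle(f'{save_dir}train_splitted_smogn.pkl')
print(train_splitted_smogn['smap_windspd'].describe())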
import pickle

import numpy as np
import pandas as pd
import smogn
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Relies on module-level settings and helpers defined elsewhere in the project:
# debug, apply_PCA, apply_CORR, do_PCA, do_overSampling, test_model,
# N_COMPONENTS, CLEAN_DATA_DIR, svm_trivial, handle_imbalanced_Data_try1.


def main(data, drug_ids):
    for drug_id in data:
        print('drug_id: ', drug_id)
        if debug == 1:
            print(data[drug_id].shape)
        df = data[drug_id]
        df = df.drop(columns=['Cell_id'])

        # ===============
        # Categorize the labels
        # binInterval = np.arange(0, 1.1, 0.1)
        # df['AUC'] = pd.cut(df['AUC'], bins=binInterval, labels=binInterval[1:])
        # y = df['AUC'].values
        # ==================================
        # Split the data
        y = df['AUC'].to_numpy(dtype='float32')
        X = df.drop(columns=['AUC']).to_numpy(dtype='float32')
        # Note: fitting the scaler on all of X before the split leaks test-set
        # statistics into training; fitting on X_train_orig only would avoid this.
        X = StandardScaler().fit_transform(X)
        X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
            X, y, test_size=0.2, random_state=0)
        if debug == 1:
            print("Step1: Split: ", X_train_orig.shape, X_test_orig.shape)
        # ====================================
        if apply_PCA == 1:
            # ===========
            # perform PCA
            PCAS = {}
            for n in N_COMPONENTS:
                if n not in PCAS:
                    PCAS[n] = {}
                if debug == 1:
                    print('PCA_', n)
                if do_PCA == 1:
                    pca_v = PCA(n_components=n)
                    principalComponents = pca_v.fit(X_train_orig)
                else:
                    with open(CLEAN_DATA_DIR + 'data_pca_' + str(drug_id) + '_' + str(n) + '.pkl', 'rb') as f:
                        principalComponents = pickle.load(f)
                X_train = principalComponents.transform(X_train_orig)
                X_test = principalComponents.transform(X_test_orig)
                if debug == 1:
                    print('new shape: ', X_train.shape)

                # ====================================
                # Over Sampling
                if do_overSampling == 1:
                    dfr = pd.DataFrame(X_train)
                    dfr['AUC'] = pd.Series(y_train_orig)
                    # oversample = RandomOverSampler(sampling_strategy='minority')
                    # oversample = SMOTE(sampling_strategy='minority')
                    print(len(dfr[dfr['AUC'] > 0.5]))
                    print(len(dfr[dfr['AUC'] <= 0.5]))
                    upsampled_data = smogn.smoter(data=dfr, y='AUC')
                    # fit and apply the transform
                    # X, y = oversample.fit_resample(X, y)
                    print(len(upsampled_data[upsampled_data['AUC'] > 0.5]))
                    print(len(upsampled_data[upsampled_data['AUC'] <= 0.5]))
                    # break
                    y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
                    X_train = upsampled_data.drop(columns=['AUC']).to_numpy(dtype='float32')
                elif do_overSampling == 2:
                    # try2
                    dfr = pd.DataFrame(X_train)
                    dfr['AUC'] = pd.Series(y_train_orig)
                    upsampled_data = handle_imbalanced_Data_try1(dfr)
                    y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
                    X_train = upsampled_data.drop(columns=['AUC']).to_numpy(dtype='float32')
                else:
                    # no oversampling: keep the original training targets
                    y_train = y_train_orig
                # ==================
                y_test = y_test_orig
                PCAS[n]['X_train'] = X_train
                PCAS[n]['y_train'] = y_train
                PCAS[n]['X_test'] = X_test
                PCAS[n]['y_test'] = y_test
                if test_model == 1:
                    svm_trivial(X_train, X_test, y_train, y_test)
            # save PCA
            with open(CLEAN_DATA_DIR + 'data_pca_' + str(drug_id) + '.pkl', 'wb') as f:
                pickle.dump(PCAS, f)
        elif apply_CORR == 1:
            # Keep only features whose linear correlation with the target is strong.
            s = X_train_orig.shape[1]
            corr = []
            for i in range(s):
                slope, intercept, r_value, p_value, std_err = stats.linregress(
                    X_train_orig[:, i], y_train_orig)
                corr.append(r_value)
            print('drug_id:' + str(drug_id) + ', minVal: ' + str(min(corr)) + ', maxVal: ' + str(max(corr)))
            corr = np.array(corr)
            l1 = corr[corr > 0.4]
            l2 = corr[corr < -0.4]
            combined = np.concatenate((l1, l2), axis=0)
            inx = np.argwhere((corr > 0.4) | (corr < -0.4))
            X_train = X_train_orig[:, inx.T.tolist()[0]]
            y_train = y_train_orig
            X_test = X_test_orig[:, inx.T.tolist()[0]]
            y_test = y_test_orig
            print('> 0.4: ', len(l1), ', < -0.4', len(l2), ', total: ', len(inx))
            # save correlation values and the filtered splits
            with open(CLEAN_DATA_DIR + 'data_corr_val_' + str(drug_id) + '.pkl', 'wb') as f:
                pickle.dump(corr, f)
            corrF = {0: {}}
            corrF[0]['X_train'] = X_train
            corrF[0]['y_train'] = y_train
            corrF[0]['X_test'] = X_test
            corrF[0]['y_test'] = y_test
            with open(CLEAN_DATA_DIR + 'data_corr_' + str(drug_id) + '.pkl', 'wb') as f:
                pickle.dump(corrF, f)
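# 'svm_trivial' is a project helper defined elsewhere; a minimal SVR baseline
# with the same call signature might look like the following (an assumption for
# illustration, not the project's actual implementation).
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score


def svm_trivial(X_train, X_test, y_train, y_test):
    model = SVR(kernel='rbf', C=1.0)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('MSE:', mean_squared_error(y_test, pred), 'R2:', r2_score(y_test, pred))
    return model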
import pickle

import pandas as pd
import smogn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Relies on module-level settings and helpers defined elsewhere in the project:
# debug, do_PCA, do_overSampling, n_components, apply_PCA2, svm_trivial.


def main(data, drug_ids):
    # if debug == 1:
    #     print(drug_ids)
    for drug_id in data:
        print('drug_id: ', drug_id)
        if debug == 1:
            print(data[drug_id].shape)
        df = data[drug_id]
        y = df['AUC'].to_numpy(dtype='float32')
        X = df.drop(columns=['AUC']).to_numpy(dtype='float32')
        # Note: the scaler is fit on all of X before the split, which leaks
        # test-set statistics into training.
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=0)
        # test2(df)
        # svm(df)
        # PCA_fn(df)

        # ====================================
        # perform PCA
        if do_PCA == 1:
            if debug == 1:
                print('X_train shape: ', X_train.shape)
            pca = apply_PCA2(X_train, y_train, n_components)
            # save the fitted PCA for this drug id
            with open('data_pca_' + str(drug_id) + '_' + str(n_components[0]) + '.pkl', 'wb') as f:
                pickle.dump(pca[0], f)
            # X = pca[0]
            X_train = pca[0].transform(X_train)
            X_test = pca[0].transform(X_test)
            if debug == 1:
                print('n_components: ', n_components[0], ', new shape: ', X_train.shape)
        else:
            # read the previously saved PCA
            with open('data_pca_' + str(drug_id) + '_' + str(n_components[0]) + '.pkl', 'rb') as f:
                d = pickle.load(f)
            X_train = d.transform(X_train)
            X_test = d.transform(X_test)

        # ====================================
        # Over Sampling
        if do_overSampling == 1:
            dfr = pd.DataFrame(X_train)
            dfr['AUC'] = pd.Series(y_train)
            # oversample = RandomOverSampler(sampling_strategy='minority')
            # oversample = SMOTE(sampling_strategy='minority')
            print(len(dfr[dfr['AUC'] > 0.5]))
            print(len(dfr[dfr['AUC'] <= 0.5]))
            upsampled_data = smogn.smoter(data=dfr, y='AUC')
            # fit and apply the transform
            # X, y = oversample.fit_resample(X, y)
            print(len(upsampled_data[upsampled_data['AUC'] > 0.5]))
            print(len(upsampled_data[upsampled_data['AUC'] <= 0.5]))
            # break
            y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
            X_train = upsampled_data.drop(columns=['AUC']).to_numpy(dtype='float32')

        # ==================
        # train and evaluate
        if debug == 1:
            print("Split: ", X_train.shape, X_test.shape)
        # lr(X_train, X_test, y_train, y_test)
        svm_trivial(X_train, X_test, y_train, y_test)
        # svm(X_train, X_test, y_train, y_test)
        break  # TODO: remove this break for delivery; it stops after the first drug
import pandas as pd
import smogn
# 'pir' (Gaussian-noise resampler) is an external module imported elsewhere in the project.


def get_augmented_data(data: pd.DataFrame, target_column: str, change_point_index: int,
                       output_scale: float, da: str = 'scaled', max_samples: int = None,
                       append: str = 'no', o_perc: float = 1.1, u_perc: float = 0.8,
                       thr: float = 0.2, under_samp: bool = False, rel_coef: float = 1.5,
                       rel_thr: float = 0.5, focus: str = 'high'):
    """
    Get augmented data.

    :param data: base dataset
    :param target_column: target column
    :param change_point_index: index of the change point
    :param output_scale: calculated output scaling factor
    :param da: data augmentation method to use
    :param max_samples: maximum samples to consider for data augmentation
    :param append: specify whether to append original and scaled dataset for da or not
    :param o_perc: oversampling percentage for GN
    :param u_perc: undersampling percentage for GN
    :param thr: threshold for GN
    :param under_samp: specify whether to undersample for SMOGN
    :param rel_coef: relevance coefficient for SMOGN
    :param rel_thr: relevance threshold for SMOGN
    :param focus: focus for SMOGN
    :return: augmented dataset
    """
    # Keep only the samples up to (and including) the change point, optionally
    # truncated to the most recent max_samples rows.
    samples = data.copy()[:change_point_index + 1].reset_index(drop=True)
    samples = samples.iloc[-max_samples:] if (
        max_samples is not None and samples.shape[0] > max_samples) else samples
    samples_scaled = samples.copy()
    samples_scaled[target_column] *= output_scale
    if da == 'scaled':
        augmented_data = samples_scaled
    else:
        if append == 'before':
            # DataFrame.append was removed in pandas 2.0; concat is equivalent here.
            samples = pd.concat([samples, samples_scaled.reset_index(drop=True)]) \
                .sample(frac=1).reset_index(drop=True)
        else:
            samples = samples_scaled
        if da == 'smogn':
            augmented_data = smogn.smoter(data=samples, y=target_column,
                                          under_samp=under_samp, samp_method='extreme',
                                          rel_xtrm_type=focus, rel_coef=rel_coef,
                                          rel_thres=rel_thr)
        elif da == 'gn':
            sampler = pir.GaussianNoise(df=samples, rel_func='default',
                                        o_percentage=o_perc, y_col=target_column,
                                        u_percentage=u_perc, random_state=42,
                                        threshold=thr)
            augmented_data = sampler.get()
    return augmented_data
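# Hypothetical call with synthetic data (column names, change point, and scale
# are made up; smogn may raise a ValueError if the chosen relevance settings
# find no rare values in the target).
import numpy as np

rng = np.random.default_rng(42)
toy = pd.DataFrame({'load': rng.lognormal(mean=4.0, sigma=0.5, size=300),
                    'temp': rng.normal(20.0, 5.0, size=300)})
augmented = get_augmented_data(toy, target_column='load', change_point_index=249,
                               output_scale=1.2, da='smogn')
print(augmented.shape)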
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer, StandardScaler
from smogn import smoter

# TopCoder and util_stratified_split_regression are project utilities defined elsewhere.


def build_learning_dataset(tc: TopCoder):
    """ Build the learning dataset for prediction of
        - avg_score
        - number_of_registration
        - sub_reg_ratio

        I assume that these targets are regressionally imbalanced, thus we should
        resample them before learning. The thresholds are set as follows:
        - avg_score: 90
        - number_of_registration: 30
        - sub_reg_ratio: 0.25

        :param tc: TopCoder data object. Both the with- and without-docvec variants
                   are built, and X data are standardized and normalized before storage.
    """
    # manually set data resampling thresholds
    target_resamp_info = {
        'avg_score': {'threshold': 90, 'extreme': 'low', 'upper_bound': 100},
        'number_of_registration': {'threshold': 30, 'extreme': 'high', 'lower_bound': 0},
        'sub_reg_ratio': {'threshold': 0.25, 'extreme': 'high', 'upper_bound': 1},
    }
    test_size = 954  # len(feature_df) * 0.2 ~= 953.8, use 20% of the data for testing
    storage_path = os.path.join(os.curdir, 'result', 'boosting_learn', 'learning_data')

    # get the raw data from the TopCoder data object
    cha_info = tc.get_filtered_challenge_info()
    feature_df = tc\
        .get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)\
        .join(cha_info.reindex(['total_prize'], axis=1))
    docvec_df = pd.read_json(os.path.join(os.curdir, 'data', 'new_docvec.json'), orient='index')
    target_df = cha_info.reindex(list(target_resamp_info.keys()), axis=1)

    if not (target_df.index == feature_df.index).all():
        raise ValueError('Check index of target_df and feature_df, it\'s not equal.')

    for col, info in target_resamp_info.items():
        print(f'Building dataset for {col}')
        target_sr = target_df[col]
        test_index = util_stratified_split_regression(target_sr, info['threshold'], info['extreme'], test_size)

        X_train_raw = feature_df.loc[~feature_df.index.isin(test_index)].sort_index()
        X_test_raw = feature_df.loc[feature_df.index.isin(test_index)].sort_index()
        y_train_raw = target_sr[~target_sr.index.isin(test_index)].sort_index()
        y_test_raw = target_sr[target_sr.index.isin(test_index)].sort_index()

        if not ((X_train_raw.index == y_train_raw.index).all() and (X_test_raw.index == y_test_raw.index).all()):
            raise ValueError('Check X, y test index, they are not equal.')

        for dv in True, False:
            print(f'Resampling with dv={dv}...')
            test_data_fn = os.path.join(storage_path, f'{col}_test_dv{int(dv)}.json')
            train_data_original_fn = os.path.join(storage_path, f'{col}_train_original_dv{int(dv)}.json')
            train_data_resample_fn = os.path.join(storage_path, f'{col}_train_resample_dv{int(dv)}.json')

            X_train, X_test, y_train, y_test = (X_train_raw.copy(), X_test_raw.copy(),
                                                y_train_raw.copy(), y_test_raw.copy())
            if dv:
                X_train = X_train.join(docvec_df)
                X_test = X_test.join(docvec_df)

            # From now on it's pure numpy till storage ;-)
            X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
            y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

            scaler = StandardScaler().fit(X_train)
            normalizer = Normalizer().fit(X_train)
            X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
            X_train, X_test = normalizer.transform(X_train), normalizer.transform(X_test)

            print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
            print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

            test_data = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)
            test_data_df = pd.DataFrame(test_data)
            test_data_df.columns = [*[f'x{i}' for i in range(X_test.shape[1])], 'y']
            test_data_df.to_json(test_data_fn, orient='index')
            print(f'Test data DataFrame shape: {test_data_df.shape}')

            train_data_original = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
            train_data_original_df = pd.DataFrame(train_data_original)
            train_data_original_df.columns = [*[f'x{i}' for i in range(X_test.shape[1])], 'y']
            train_data_original_df.to_json(train_data_original_fn, orient='index')
            print(f'Training data original shape: {train_data_original_df.shape}')

            # SMOGN occasionally raises a ValueError on a given run, so retry
            # until it succeeds, then filter resampled targets back into bounds.
            attempt = 0
            while True:
                print(f'Attempt #{attempt}...')
                attempt += 1
                try:
                    # just use the default setting for SMOGN
                    train_data_resample_df = smoter(
                        data=train_data_original_df,
                        y='y',
                        samp_method='extreme',
                        rel_xtrm_type=info['extreme']).reset_index(drop=True)
                except ValueError as e:
                    print(f'Encounter error: "{e}", rerun the SMOGN...')
                    continue
                else:
                    print(f'Training data resample shape: {train_data_resample_df.shape} - before boundary filtering')
                    if 'upper_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] <= info['upper_bound']]
                    if 'lower_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] >= info['lower_bound']]
                    train_data_resample_df.to_json(train_data_resample_fn, orient='index')
                    print(f'Training data resample shape: {train_data_resample_df.shape} - after boundary filtering')
                    print('Data stored\n\n')
                    break
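# Sketch of reading one stored split back for model fitting. The path and file
# name pattern follow build_learning_dataset() above; picking the avg_score
# target with document vectors (dv=1) is just one possible choice.
import os
import pandas as pd

storage_path = os.path.join(os.curdir, 'result', 'boosting_learn', 'learning_data')
train_df = pd.read_json(os.path.join(storage_path, 'avg_score_train_resample_dv1.json'),
                        orient='index')
X_res = train_df.drop(columns=['y']).to_numpy()
y_res = train_df['y'].to_numpy()
print(X_res.shape, y_res.shape)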
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import smogn

## http://jse.amstat.org/v19n3/decock.pdf
housing = pd.read_csv(
    'https://raw.githubusercontent.com/nickkunz/smogn/master/data/housing.csv')

housing_smogn = smogn.smoter(
    data=housing,    ## pandas dataframe
    y='SalePrice'    ## string ('header name')
)

seaborn.kdeplot(housing['SalePrice'], label="Original")
seaborn.kdeplot(housing_smogn['SalePrice'], label="Modified")
plt.legend()
plt.show()
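# Optional numeric comparison of the two target distributions, in addition to
# the KDE plot above (a quick sanity check, not part of the original demo).
print(housing['SalePrice'].describe())
print(housing_smogn['SalePrice'].describe())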
                alpha_train.cpu()).detach().numpy().tolist()
            ]))
            trainRealLoger2.writelines('\n'.join([
                str(x) for x in torch.squeeze(
                    t1_train.cpu()).detach().numpy().tolist()
            ]))
            trainRealLoger3.writelines('\n'.join([
                str(x) for x in torch.squeeze(
                    t2_train.cpu()).detach().numpy().tolist()
            ]))
            break


if __name__ == '__main__':
    rawOrigin = read_data()
    # Shuffle the rows and move the 'alpha' target to the last column.
    rawOrigin = rawOrigin.sample(frac=1).reset_index(drop=True)
    raw = rawOrigin.drop(columns=['alpha'])
    raw['alpha'] = rawOrigin['alpha']

    train_size = int(raw.shape[0] * 0.7)
    train = raw[:train_size]
    test = raw[train_size:]

    # Repeatedly apply SMOGN to grow the augmented training set.
    train_smogn = raw[:train_size]
    for i in range(10):
        print('Augmentation round {}'.format(i))
        train_smogn = smogn.smoter(data=train_smogn, y='alpha', k=9,
                                   samp_method='extreme')
        # mtl_helper(train, test, str(i))
        svr_helper(train, test, train_smogn)
        # rfr_helper(train, test, train_smogn)
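# 'read_data', 'svr_helper', 'mtl_helper', and 'rfr_helper' are project helpers
# defined elsewhere; a minimal stand-in for svr_helper, trained on the
# SMOGN-augmented split, might look like this (an assumption for illustration,
# not the project's actual implementation).
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


def svr_helper(train, test, train_smogn):
    X_train = train_smogn.drop(columns=['alpha']).to_numpy()
    y_train = train_smogn['alpha'].to_numpy()
    X_test = test.drop(columns=['alpha']).to_numpy()
    y_test = test['alpha'].to_numpy()
    model = SVR().fit(X_train, y_train)
    print('test MSE:', mean_squared_error(y_test, model.predict(X_test)))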