Code Example #1
import numpy as np
import pandas as pd
import smogn


def smote_oversampling_regression(loop_features, loop_targets,
                                  new_feature_names):
    print("+++ SMOTE Oversampling Regression")

    loop_features_np = np.array(loop_features)
    loop_targets_np = np.array(loop_targets)
    loop_targets_np = np.reshape(loop_targets_np,
                                 (np.shape(loop_targets_np)[0], 1))
    combined_data = np.append(loop_features_np, loop_targets_np, axis=1)
    combined_data_list = combined_data.tolist()
    dataframe = pd.DataFrame(combined_data_list)

    column_names = new_feature_names.copy()
    column_names.append("target")
    dataframe.columns = column_names

    # smoter returns the resampled frame; capture it instead of discarding the result
    dataframe = smogn.smoter(data=dataframe, y="target")

    upsampled_data = np.array(dataframe.values.tolist())
    new_loop_targets = upsampled_data[:, -1].tolist()
    new_loop_features = np.delete(upsampled_data, -1, axis=1).tolist()

    return new_loop_features, new_loop_targets
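
A minimal call sketch for the helper above. The toy arrays and column names below are illustrative assumptions (not from the original project); SMOGN's k-nearest-neighbour step needs a reasonable number of rows and some skew in the target to find rare cases.

import numpy as np

# hypothetical toy inputs; in practice these come from a real tabular dataset
rng = np.random.default_rng(0)
features = rng.normal(size=(200, 3)).tolist()
targets = rng.gamma(shape=2.0, scale=2.0, size=200).tolist()
names = ['f0', 'f1', 'f2']

X_res, y_res = smote_oversampling_regression(features, targets, names)
print(len(X_res), len(y_res))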
Code Example #2
import argparse

import numpy as np
import pandas as pd
import smogn


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-it',
                        '--input_train',
                        help="input file for train",
                        default='../dataset/train.npy')
    parser.add_argument('-ot',
                        '--output_train',
                        help="output file for train weights",
                        default='../dataset/train_weights.npy')
    args = parser.parse_args()

    src = np.load(args.input_train, allow_pickle=True)
    src_df = pd.DataFrame(data=src)
    src_df = src_df.add_prefix('col')

    #TODO ERROR custom phi
    smogn_df = smogn.smoter(src_df, 'col74', k=9, rel_coef=0.5, rel_thres=0.02)
    np.save(args.output_train, smogn_df.to_numpy(), allow_pickle=True)
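
The excerpt ends without an entry point; if the script is meant to run standalone, the usual guard (not part of the original excerpt) would be added at module level:

if __name__ == '__main__':
    main()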
Code Example #3
File: test.py  Project: Neo-101/R2S
def smogn_on_train_splitted():
    smogn_params = dict(
        # main arguments
        data=dataset['ori']['train_splitted'],
        y='smap_windspd',  # string ('header name')
        k=7,  # positive integer (k < n)
        pert=0.02,  # real number (0 < R < 1)
        samp_method='extreme',  # string ('balance' or 'extreme')
        drop_na_col=True,  # boolean (True or False)
        drop_na_row=True,  # boolean (True or False)
        replace=False,  # boolean (True or False)

        # phi relevance arguments
        rel_thres=0.9,  # real number (0 < R < 1)
        rel_method='manual',  # string ('auto' or 'manual')
        # rel_xtrm_type='high',  # unused (rel_method='manual')
        # rel_coef=3.525,         # unused (rel_method='manual')
        rel_ctrl_pts_rg=[[5, 0, 0], [20, 0, 0], [35, 0, 0], [50, 1, 0]])

    save_dir = ('/Users/lujingze/Programming/SWFusion/regression/'
                'tc/dataset/smogn_final/smogn_on_train_splitted/')
    train_splitted_smogn = smogn.smoter(**smogn_params)
    os.makedirs(save_dir, exist_ok=True)
    train_splitted_smogn.to_pickle(f'{save_dir}train_splitted_smogn.pkl')
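
For reference, a self-contained sketch of the manual relevance interface used above, run against the public housing dataset that also appears in Code Example #8. The k value, threshold, and control points here are illustrative assumptions rather than values from the R2S project; each row of rel_ctrl_pts_rg is [y value, relevance (0 or 1), m].

import pandas as pd
import smogn

housing = pd.read_csv(
    'https://raw.githubusercontent.com/nickkunz/smogn/master/data/housing.csv')

## mark unusually high sale prices as the rare, relevant region
housing_manual = smogn.smoter(
    data=housing,
    y='SalePrice',
    k=9,
    samp_method='extreme',
    rel_thres=0.80,
    rel_method='manual',
    rel_ctrl_pts_rg=[
        [5000, 0, 0],
        [150000, 0, 0],
        [550000, 1, 0]])
print(housing_manual.shape)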
Code Example #4
def main(data, drug_ids):
	for drug_id in data:
		print('drug_id: ', drug_id)
		if debug == 1:
			print(data[drug_id].shape)

		
		df = data[drug_id]
		df = df.drop(columns=['Cell_id'])


		# ===============
		# Categorize the labels
		# binInterval = np.arange(0,1.1,0.1)
		# df['AUC'] = pd.cut(df['AUC'], bins = binInterval, labels=binInterval[1:])
		# y = df['AUC'].values

		# ==================================
		# Split the data

		y = df['AUC'].to_numpy(dtype ='float32')
		X = df.drop(columns=['AUC']).to_numpy(dtype ='float32')
		X = StandardScaler().fit_transform(X)
		X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.2, random_state=0)
		
		if debug == 1:
			print("Step1: Split: ",X_train_orig.shape, X_test_orig.shape)
		

		# ====================================
		if apply_PCA == 1:
			# ===========
			# perform PCA
			PCAS = {}
			for n in N_COMPONENTS:
				
				if n not in PCAS:
					PCAS[n] = {}

				if debug == 1:
					print('PCA_',n)

				if do_PCA == 1:
					
					pca_v = PCA(n_components=n)
					principalComponents = pca_v.fit(X_train_orig)
					
				else:
					
					with open(CLEAN_DATA_DIR+'data_pca_'+str(drug_id)+'_'+str(n)+'.pkl', 'rb') as f:
						principalComponents = pickle.load( f)

					
				X_train = principalComponents.transform(X_train_orig)
				X_test = principalComponents.transform(X_test_orig)
				
				if debug == 1:
						print('new shape: ', X_train.shape)

				# ====================================
				# Over Sampling
				if do_overSampling == 1:
					dfr = pd.DataFrame(X_train)
					dfr['AUC'] = pd.Series(y_train_orig)

					# oversample = RandomOverSampler(sampling_strategy='minority')
					# oversample = SMOTE(sampling_strategy='minority')
					print(len(dfr[dfr['AUC'] > 0.5]))
					print(len(dfr[dfr['AUC'] <= 0.5]))

					upsampled_data = smogn.smoter(data=dfr, y='AUC')
					# fit and apply the transform
					# X, y = oversample.fit_resample(X, y)
					print(len(upsampled_data[upsampled_data['AUC'] > 0.5]))
					print(len(upsampled_data[upsampled_data['AUC'] <= 0.5]))
					# break
					y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
					X_train = upsampled_data.drop(columns=['AUC']).to_numpy(dtype='float32')
				
				elif do_overSampling == 2: #try2
					dfr = pd.DataFrame(X_train)
					dfr['AUC'] = pd.Series(y_train_orig)
					upsampled_data = handle_imbalanced_Data_try1(dfr)
					y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
					X_train = upsampled_data.drop(columns=['AUC']).to_numpy(dtype='float32')

				else:
					# no oversampling requested: keep the original training targets
					y_train = y_train_orig

				# ==================
				y_test = y_test_orig
				PCAS[n]['X_train'] = X_train
				PCAS[n]['y_train'] = y_train
				PCAS[n]['X_test'] = X_test
				PCAS[n]['y_test'] = y_test

				if test_model == 1:
					svm_trivial(X_train, X_test, y_train, y_test)

			# save PCA
			with open(CLEAN_DATA_DIR+'data_pca_'+str(drug_id)+'.pkl', 'wb') as f:
				pickle.dump(PCAS, f)

		elif apply_CORR == 1:
			s = X_train_orig.shape[1]
			corr = []
			for i in range(s):
				slope, intercept, r_value, p_value, std_err = stats.linregress(X_train_orig[:,i], y_train_orig)
				corr.append(r_value)

			print('drug_id:'+str(drug_id)+', minVal: '+str(min(corr)) +', maxVal: '+str(max(corr)))
			
			corr = np.array(corr)
			l1 = corr[corr > 0.4]
			l2 = corr[corr < -0.4]
			combined = np.concatenate((l1,l2), axis=0)

			inx = np.argwhere( (corr > 0.4) | (corr < -0.4))
			X_train = X_train_orig[:, inx.T.tolist()[0]]
			y_train = y_train_orig

			X_test = X_test_orig[:, inx.T.tolist()[0]]
			y_test = y_test_orig

			print('> 0.4: ', len(l1), ', < -0.4', len(l2), ', total: ', len(inx))

			# save correlation values
			with open(CLEAN_DATA_DIR+'data_corr_val_'+str(drug_id)+'.pkl', 'wb') as f:
				pickle.dump(corr, f)

			corrF = {0:{}}
			corrF[0]['X_train'] = X_train
			corrF[0]['y_train'] = y_train
			corrF[0]['X_test'] = X_test
			corrF[0]['y_test'] = y_test

			with open(CLEAN_DATA_DIR+'data_corr_'+str(drug_id)+'.pkl', 'wb') as f:
				pickle.dump(corrF, f)
Code Example #5
def main(data, drug_ids):
    # if debug == 1:
    # 	print(drug_ids)

    for drug_id in data:
        print('drug_id: ', drug_id)
        if debug == 1:
            print(data[drug_id].shape)

        df = data[drug_id]
        y = df['AUC'].to_numpy(dtype='float32')
        X = df.drop(columns=['AUC']).to_numpy(dtype='float32')
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        # test2(df)
        # svm(df)
        #PCA_fn(df)

        # ====================================
        # perform PCA
        if do_PCA == 1:

            if debug == 1:
                print('X_train shape: ', X_train.shape)

            pca = apply_PCA2(X_train, y_train, n_components)

            # save the fitted PCA model
            with open(
                    'data_pca_' + str(drug_id) + '_' + str(n_components[0]) +
                    '.pkl', 'wb') as f:
                pickle.dump(pca[0], f)

            # X = pca[0]
            X_train = pca[0].transform(X_train)
            X_test = pca[0].transform(X_test)

            if debug == 1:
                print('n_components: ', n_components[0], ', new shape: ',
                      X_train.shape)

        else:
            # read saved data

            with open(
                    'data_pca_' + str(drug_id) + '_' + str(n_components[0]) +
                    '.pkl', 'rb') as f:
                d = pickle.load(f)

            X_train = d.transform(X_train)
            X_test = d.transform(X_test)

        # ====================================
        # Over Sampling
        if do_overSampling == 1:
            dfr = pd.DataFrame(X_train)
            dfr['AUC'] = pd.Series(y_train)

            # oversample = RandomOverSampler(sampling_strategy='minority')
            # oversample = SMOTE(sampling_strategy='minority')
            print(len(dfr[dfr['AUC'] > 0.5]))
            print(len(dfr[dfr['AUC'] <= 0.5]))

            upsampled_data = smogn.smoter(data=dfr, y='AUC')
            # fit and apply the transform
            # X, y = oversample.fit_resample(X, y)
            print(len(upsampled_data[upsampled_data['AUC'] > 0.5]))
            print(len(upsampled_data[upsampled_data['AUC'] <= 0.5]))
            # break
            y_train = upsampled_data['AUC'].to_numpy(dtype='float32')
            X_train = upsampled_data.drop(columns=['AUC']).to_numpy(
                dtype='float32')

        # ==================
        # train and evaluate on the prepared split

        if debug == 1:
            print("Split: ", X_train.shape, X_test.shape)

        # lr(X_train, X_test, y_train, y_test)
        svm_trivial(X_train, X_test, y_train, y_test)
        # svm(X_train, X_test, y_train, y_test)

        break  # TODO: comment this line out for delivery (currently only the first drug_id is processed)
Code Example #6
File: TrainHelper.py  Project: grimmlab/evars-gpr
def get_augmented_data(data: pd.DataFrame,
                       target_column: str,
                       change_point_index: int,
                       output_scale: float,
                       da: str = 'scaled',
                       max_samples: int = None,
                       append: str = 'no',
                       o_perc: float = 1.1,
                       u_perc: float = 0.8,
                       thr: float = 0.2,
                       under_samp: bool = False,
                       rel_coef: float = 1.5,
                       rel_thr: float = 0.5,
                       focus: str = 'high'):
    """
    get augmented data
    :param data: base dataset
    :param target_column: target column
    :param change_point_index: index of the change point
    :param output_scale: calculated output scaling factor
    :param da: data augmentation method to use
    :param max_samples: maximum samples to consider for data augmentation
    :param append: specify whether to append original and scaled dataset for da or not
    :param o_perc: oversampling percentage for GN
    :param u_perc: undersampling percentage for GN
    :param thr: threshold for GN
    :param under_samp: specify whether to undersample for SMOGN
    :param rel_coef: relevance coefficient for SMOGN
    :param rel_thr: relevance threshold for SMOGN
    :param focus: focus for SMOGN
    :return: augmented dataset
    """
    samples = data.copy()[:change_point_index + 1].reset_index(drop=True)
    samples = samples.iloc[-max_samples:] if (
        max_samples is not None
        and samples.shape[0] > max_samples) else samples
    samples_scaled = samples.copy()
    samples_scaled[target_column] *= output_scale
    if da == 'scaled':
        augmented_data = samples_scaled
    else:
        if append == 'before':
            # DataFrame.append was removed in pandas 2.x; pd.concat is the equivalent
            samples = pd.concat(
                [samples, samples_scaled.reset_index(drop=True)]
            ).sample(frac=1).reset_index(drop=True)
        else:
            samples = samples_scaled
        if da == 'smogn':
            augmented_data = smogn.smoter(data=samples,
                                          y=target_column,
                                          under_samp=under_samp,
                                          samp_method='extreme',
                                          rel_xtrm_type=focus,
                                          rel_coef=rel_coef,
                                          rel_thres=rel_thr)
        elif da == 'gn':
            sampler = pir.GaussianNoise(df=samples,
                                        rel_func='default',
                                        o_percentage=o_perc,
                                        y_col=target_column,
                                        u_percentage=u_perc,
                                        random_state=42,
                                        threshold=thr)
            augmented_data = sampler.get()
    return augmented_data
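
A hedged usage sketch of the helper above, exercising only the 'smogn' branch. The synthetic frame, column names, change-point index, and scaling factor are made up for illustration and may need tuning before SMOGN accepts them (it requires enough rows and some rare target values).

import numpy as np
import pandas as pd

# purely illustrative data with an assumed change point at index 150
rng = np.random.default_rng(42)
demo = pd.DataFrame({
    'feature': rng.normal(size=200),
    'y': rng.gamma(shape=2.0, scale=2.0, size=200),
})

augmented = get_augmented_data(data=demo, target_column='y',
                               change_point_index=150, output_scale=1.3,
                               da='smogn', max_samples=100)
print(augmented.shape)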
Code Example #7
def build_learning_dataset(tc: TopCoder):
    """ Build learning dataset for prediction of 
        - avg_score
        - number_of_registration
        - sub_reg_ratio

        These targets are assumed to be imbalanced in the regression sense, so each is resampled before learning.
        The thresholds are set as follows:
        - avg_score: 90
        - number_of_registration: 30
        - sub_reg_ratio: 0.25

        :param tc: TopCoder data object providing the challenge info and meta-data features.
    """
    # manually set data resampling threshold
    target_resamp_info = {
        'avg_score': {
            'threshold': 90,
            'extreme': 'low',
            'upper_bound': 100
        },
        'number_of_registration': {
            'threshold': 30,
            'extreme': 'high',
            'lower_bound': 0
        },
        'sub_reg_ratio': {
            'threshold': 0.25,
            'extreme': 'high',
            'upper_bound': 1
        },
    }
    test_size = 954  # len(feature_df) * 0.2 ~= 953.8, use 20% of the data for testing
    storage_path = os.path.join(os.curdir, 'result', 'boosting_learn',
                                'learning_data')

    # get the raw data from TopCoder data object
    cha_info = tc.get_filtered_challenge_info()
    feature_df = tc\
        .get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)\
        .join(cha_info.reindex(['total_prize'], axis=1))
    docvec_df = pd.read_json(os.path.join(os.curdir, 'data',
                                          'new_docvec.json'),
                             orient='index')
    target_df = cha_info.reindex(list(target_resamp_info.keys()), axis=1)
    if not (target_df.index == feature_df.index).all():
        raise ValueError(
            'Check index of target_df and feature_df, it\'s not equal.')

    for col, info in target_resamp_info.items():
        print(f'Building dataset for {col}')
        target_sr = target_df[col]
        test_index = util_stratified_split_regression(target_sr,
                                                      info['threshold'],
                                                      info['extreme'],
                                                      test_size)

        X_train_raw = feature_df.loc[~feature_df.index.isin(test_index
                                                            )].sort_index()
        X_test_raw = feature_df.loc[feature_df.index.isin(
            test_index)].sort_index()
        y_train_raw = target_sr[~target_sr.index.isin(test_index)].sort_index()
        y_test_raw = target_sr[target_sr.index.isin(test_index)].sort_index()
        if not ((X_train_raw.index == y_train_raw.index).all() and
                (X_test_raw.index == y_test_raw.index).all()):
            raise ValueError('Check X, y test index, they are not equal.')

        for dv in True, False:
            print(f'Resampling with dv={dv}...')
            test_data_fn = os.path.join(storage_path,
                                        f'{col}_test_dv{int(dv)}.json')
            train_data_original_fn = os.path.join(
                storage_path, f'{col}_train_original_dv{int(dv)}.json')
            train_data_resample_fn = os.path.join(
                storage_path, f'{col}_train_resample_dv{int(dv)}.json')
            X_train, X_test, y_train, y_test = X_train_raw.copy(
            ), X_test_raw.copy(), y_train_raw.copy(), y_test_raw.copy()

            if dv:
                X_train = X_train.join(docvec_df)
                X_test = X_test.join(docvec_df)

            # From now on it's pure numpy till storage ;-)
            X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
            y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

            scaler = StandardScaler().fit(X_train)
            normalizer = Normalizer().fit(X_train)

            X_train, X_test = scaler.transform(X_train), scaler.transform(
                X_test)
            X_train, X_test = normalizer.transform(
                X_train), normalizer.transform(X_test)

            print(
                f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}'
            )
            print(
                f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

            test_data = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)
            test_data_df = pd.DataFrame(test_data)
            test_data_df.columns = [
                *[f'x{i}' for i in range(X_test.shape[1])], 'y'
            ]
            test_data_df.to_json(test_data_fn, orient='index')
            print(f'Test data DataFrame shape: {test_data_df.shape}')

            train_data_original = np.concatenate(
                (X_train, y_train.reshape(-1, 1)), axis=1)
            train_data_original_df = pd.DataFrame(train_data_original)
            train_data_original_df.columns = [
                *[f'x{i}' for i in range(X_test.shape[1])], 'y'
            ]
            train_data_original_df.to_json(train_data_original_fn,
                                           orient='index')
            print(
                f'Training data original shape: {train_data_original_df.shape}'
            )

            attempt = 0
            while True:
                print(f'Attempt #{attempt}...')
                try:
                    train_data_resample_df = smoter(
                        data=train_data_original_df,
                        y='y',
                        samp_method='extreme',
                        rel_xtrm_type=info['extreme']).reset_index(
                            drop=True
                        )  # just use the default setting for SMOGN
                except ValueError as e:
                    print(f'Encountered error: "{e}", rerunning SMOGN...')
                    attempt += 1
                    continue
                else:
                    print(
                        f'Training data resample shape: {train_data_resample_df.shape} - before boundary filtering'
                    )
                    if 'upper_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] <= info['upper_bound']]

                    if 'lower_bound' in info:
                        train_data_resample_df = train_data_resample_df.loc[
                            train_data_resample_df['y'] >= info['lower_bound']]

                    train_data_resample_df.to_json(train_data_resample_fn,
                                                   orient='index')
                    print(
                        f'Training data resample shape: {train_data_resample_df.shape} - after boundary filtering'
                    )
                    print('Data stored\n\n')
                    break
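
The docstring above treats avg_score as rare at the low end and the other two targets as rare at the high end; in SMOGN that distinction is carried by rel_xtrm_type. A hedged, self-contained sketch (the synthetic frame is only a placeholder for train_data_original_df, and depending on the random draw SMOGN may report that no rare cases were found):

import numpy as np
import pandas as pd
from smogn import smoter

# placeholder frame; the real one is train_data_original_df built above
rng = np.random.default_rng(1)
demo = pd.DataFrame(rng.normal(size=(300, 3)), columns=['x0', 'x1', 'y'])
demo['y'] = rng.beta(8, 2, size=300)  # most mass near 1, rare low values

# rel_xtrm_type='low' treats unusually small y values as the rare region
# (the avg_score case); 'high' targets unusually large values instead
resampled = smoter(data=demo, y='y', samp_method='extreme', rel_xtrm_type='low')
print(resampled.shape)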
Code Example #8
File: learn_smogn.py  Project: philgun/coolstuff
import pandas as pd
import smogn

housing = pd.read_csv(
    ## http://jse.amstat.org/v19n3/decock.pdf
    'https://raw.githubusercontent.com/nickkunz/smogn/master/data/housing.csv')

housing_smogn = smogn.smoter(
    data=housing,  ## pandas dataframe
    y='SalePrice'  ## string ('header name')
)

import seaborn
import matplotlib.pyplot as plt
seaborn.kdeplot(housing['SalePrice'], label="Original")
seaborn.kdeplot(housing_smogn['SalePrice'], label="Modified")
plt.legend()
plt.show()
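
As a small optional follow-up (not in the original script), the shift toward the rare region can also be checked numerically:

## compare the target distribution before and after SMOGN
print(housing['SalePrice'].describe())
print(housing_smogn['SalePrice'].describe())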
Code Example #9
                        alpha_train.cpu()).detach().numpy().tolist()
                ]))
                trainRealLoger2.writelines('\n'.join([
                    str(x) for x in torch.squeeze(
                        t1_train.cpu()).detach().numpy().tolist()
                ]))
                trainRealLoger3.writelines('\n'.join([
                    str(x) for x in torch.squeeze(
                        t2_train.cpu()).detach().numpy().tolist()
                ]))
                break


if __name__ == '__main__':
    rawOrigin = read_data()
    rawOrigin = rawOrigin.sample(frac=1).reset_index(drop=True)
    raw = rawOrigin.drop(columns=['alpha'])
    raw['alpha'] = rawOrigin['alpha']
    train_size = int(raw.shape[0] * 0.7)
    train = raw[:train_size]
    test = raw[train_size:]
    train_smogn = raw[:train_size]
    for i in range(10):
        print('Augmentation round {}'.format(i))
        train_smogn = smogn.smoter(data=train_smogn,
                                   y='alpha',
                                   k=9,
                                   samp_method='extreme')
        # mtl_helper(train, test, str(i))
        svr_helper(train, test, train_smogn)
        # rfr_helper(train, test, train_smogn)