コード例 #1
0
        # Repeat the experiment once per run; the number of runs is given by
        # `repetitions` (not a hard-coded count).
        for run in range(repetitions):
            logger.log_message(f'Starting run {run}')

            # Load the normalized train/test partitions via pd.read_hdf on every
            # run. `train_data` is rebound to the sampled frame further down, so
            # each run starts again from the stored, unsampled training data.
            train_data = pd.read_hdf(data_path, 'partB_train_normalized')
            test_data = pd.read_hdf(data_path, 'partB_test_normalized')
            logger.log_message('Data imbalance levels before sampling')
            logger.log_message(
                get_binary_imbalance_ratio(train_data['exclusion']))

            # Split the training rows on the binary 'exclusion' label
            # (1 = positive, 0 = negative), then recombine the two groups
            # through apply_ros_rus to build the sampled training set.
            # NOTE(review): apply_ros_rus is defined elsewhere; presumably it
            # applies random over-/under-sampling at the given rates - confirm.
            # NOTE(review): `rus_rate` is fed from `minority_ratio` here; verify
            # that this is the intended argument.
            pos_train, neg_train = split_on_binary_attribute(
                train_data, attribute='exclusion', pos_label=1, neg_label=0)
            train_data = apply_ros_rus(pos_train,
                                       neg_train,
                                       ros_rate=ros_rate,
                                       rus_rate=minority_ratio)
            # Drop the references to the per-class frames; they are not used
            # again in the visible part of the loop.
            del pos_train
            del neg_train

            logger.log_message('Minority class ratio after sampling: ')
            logger.log_message(
                get_binary_imbalance_ratio(train_data['exclusion']))

            # Separate features from labels: the 'exclusion' column is the
            # target, every other column is a feature. Only the training data
            # was sampled above; the test data is used as loaded.
            train_y = train_data['exclusion']
            train_x = train_data.drop(columns=['exclusion'])
            test_y = test_data['exclusion']
            test_x = test_data.drop(columns=['exclusion'])

            # create subset of features
コード例 #2
0
	# -------------------------------------------------- #
	# Read the training frame stored under `train_key` and log its class
	# balance and row count before any sampling is applied.
	train_data = pd.read_hdf(data_path, key=train_key)
	logger.log_message('Data imbalance levels before sampling')
	logger.log_message(get_imbalance_description(train_data['class']))
	logger.log_message('Size of train data = ' + str(len(train_data)))


	# LOAD NORMALIZED TEST DATA
	# -------------------------------------------------- #
	# The test frame is read from the same file under `test_key`; no sampling
	# is applied to it in the visible code.
	test_data = pd.read_hdf(data_path, key=test_key)


	# APPLY SAMPLING TO THE TRAINING DATA
	# --------------------------------------------------
	# Split the training rows on the binary 'class' label (1 = positive,
	# 0 = negative) and recombine them through apply_ros_rus.
	# NOTE(review): apply_ros_rus is defined elsewhere; presumably it applies
	# random over-/under-sampling at `ros_rate` / `rus_rate` - confirm.
	pos_train, neg_train = split_on_binary_attribute(train_data, attribute='class', pos_label=1, neg_label=0)
	train_data = apply_ros_rus(pos_train, neg_train, ros_rate=ros_rate, rus_rate=rus_rate)
	# Drop the references to the per-class frames; they are not used again in
	# the visible code.
	del pos_train
	del neg_train

	# SEPARATE FEATURES/LABELS
	# --------------------------------------------------
	# The 'class' column is the target; every other column is a feature.
	train_y = train_data['class']
	train_x = train_data.drop(columns=['class'])

	test_y = test_data['class']
	test_x = test_data.drop(columns=['class'])

	# Log the label distribution of the sampled training set and of the
	# test set.
	logger.log_message('Training data imbalance levels after sampling')
	logger.log_message(get_imbalance_description(train_y))
	logger.log_message('Test data imbalance levels')
	logger.log_message(get_imbalance_description(test_y))