def preprocess_sachs(folder_path, save_path, num_envs, normalization='standard'):
    """Clusters the Sachs data into environments and saves each one as CSV.

    The raw samples are clustered with k-means, normalized with
    `utils.preprocess_labeled_data`, re-balanced with `utils.over_sampling`
    and regrouped by label with `utils.classify_x`. Every environment holding
    more than one sample is written to
    `sachs_env_<label + 1>_<num_envs>.csv` inside `save_path`.

    Args:
      folder_path: Path the raw Sachs data is read from.
      save_path: Directory the per-environment CSV files are written to. It is
        created when missing.
      num_envs: The number of k-means clusters (environments).
      normalization: Normalization option forwarded to
        `utils.preprocess_labeled_data`.

    Returns:
      The result of `utils.distribute_to_envs` on the regrouped data;
      presumably shaped [num_envs, samples per env, num features] -- confirm
      against `utils.distribute_to_envs`.
    """
    np.set_printoptions(precision=3)
    raw_x = read_sachs_all(folder_path)

    # fit_predict() fits the model itself; a preceding fit() would only train
    # a second, discarded model.
    cluster_labels = KMeans(
        n_clusters=num_envs, max_iter=1000).fit_predict(raw_x)

    x_norm, y_norm = utils.preprocess_labeled_data(
        raw_x, cluster_labels, normalization)
    x_res, y_res = utils.over_sampling(x_norm, y_norm)
    x_envs = utils.classify_x(x_res, y_res)  # dict: label -> samples

    os.makedirs(save_path, exist_ok=True)
    for env_label, env_x in x_envs.items():
        # Environments with a single sample (or none) are not saved.
        if env_x.shape[0] > 1:
            # os.path.join keeps the file inside save_path even when the
            # caller passes the directory without a trailing separator.
            out_file = os.path.join(
                save_path, f'sachs_env_{env_label+1}_{num_envs}.csv')
            np.savetxt(out_file, env_x, delimiter=',')

    return utils.distribute_to_envs(x_envs)
def preprocess_BH(save_path, cluster):
    """Clusters the BH training data into environments and saves them.

    The data from `utils.load_BH` is split 90/10 into train and test. The
    train split is clustered with k-means, re-balanced with
    `utils.over_sampling` and regrouped by label. Each environment is
    standardized on its own statistics and written to
    `standard_BH_env_<i>_<cluster>.csv`. The whole train split and the test
    split are standardized with the train split's mean/std and written to
    `standard_BH_train.csv` and `standard_BH_test.csv`.

    Args:
      save_path: Directory the preprocessed CSV files are written to. It is
        created when missing.
      cluster: Number of k-means clusters (environments).

    Returns:
      A tuple `(envs, test_file)`: `envs` is the per-environment standardized
      data stacked with `np.stack` (this requires every environment to have
      the same number of samples -- presumably guaranteed by
      `utils.over_sampling`), and `test_file` is the path of the saved
      standardized test CSV, not the test data itself.
    """
    np.set_printoptions(precision=3)
    x_raw = utils.load_BH()

    # The labels are placeholders: train_test_split is used only to split x.
    x, x_test, _, _ = train_test_split(
        x_raw, np.ones(len(x_raw)), test_size=0.1, random_state=42)

    # fit_predict() fits the model itself; a preceding fit() would only train
    # a second, discarded model.
    labeled_x = KMeans(n_clusters=cluster, max_iter=1000).fit_predict(x)

    os.makedirs(save_path, exist_ok=True)
    x_upsampled, y_upsampled = utils.over_sampling(x, labeled_x)
    x_envs = utils.classify_x(x_upsampled, y_upsampled)  # dict: label -> samples

    standard_scaler = preprocessing.StandardScaler()

    # Each environment is standardized independently and saved on its own.
    x_train = []
    for env_number, env_x in enumerate(x_envs.values(), start=1):
        standard_env = standard_scaler.fit_transform(env_x)
        env_file = os.path.join(
            save_path, f'standard_BH_env_{env_number}_{cluster}.csv')
        np.savetxt(env_file, standard_env, fmt='%.3f', delimiter=',')
        x_train.append(standard_env)

    # Train and test are standardized with the train split's mean and std.
    standard_scaler.fit(x)
    train_file = os.path.join(save_path, 'standard_BH_train.csv')
    np.savetxt(
        train_file, standard_scaler.transform(x), fmt='%.3f', delimiter=',')

    test_file = os.path.join(save_path, 'standard_BH_test.csv')
    np.savetxt(
        test_file, standard_scaler.transform(x_test), fmt='%.3f',
        delimiter=',')

    return np.stack(x_train), test_file
def cluster_Insurance(x, num_cluster):
    """Clusters the data into environments and re-balances them.

    Args:
      x: The data to be clustered; must be accepted by `KMeans.fit_predict`.
      num_cluster: The number of k-means clusters.

    Returns:
      The result of `utils.classify_x` on the over-sampled data: the samples
      grouped by cluster label (a dict, per the comments elsewhere in this
      file).
    """
    # fit_predict() fits the model itself; a preceding fit() would only train
    # a second, discarded model.
    cluster_labels = KMeans(
        n_clusters=num_cluster, max_iter=1000).fit_predict(x)

    x_upsampled, y_upsampled = utils.over_sampling(x, cluster_labels)
    return utils.classify_x(x_upsampled, y_upsampled)
def _sachs_file_index(file):
    """Returns the leading integer of a Sachs file name like '3. name.xls'."""
    # Parsing the basename keeps this independent of the directory layout and
    # of spaces elsewhere in the path.
    return int(os.path.basename(file).split('.', 1)[0])


def _balance_and_distribute(x_parts, y_parts):
    """Over-samples the labeled samples and distributes them to environments."""
    x_res, y_res = utils.over_sampling(np.vstack(x_parts), np.hstack(y_parts))
    return utils.distribute_to_envs(utils.classify_x(x_res, y_res))


def _read_sachs_one_env_per_file(folder_path, normalization):
    """Builds one environment per `.xls` file found under `folder_path`."""
    x_parts, y_parts = [], []
    # Sorted so that the label given to each file is reproducible.
    for label, file in enumerate(sorted(glob.glob(f'{folder_path}*.xls'))):
        samples = utils.preprocess(pd.read_excel(file).to_numpy(), normalization)
        x_parts.append(samples)
        y_parts.append(np.ones(samples.shape[0]) * label)
    if not x_parts:
        raise ValueError(f'No .xls files found for pattern {folder_path}*.xls')
    return _balance_and_distribute(x_parts, y_parts)


def _read_sachs_grouped(folder_path, num_envs, normalization, label_of):
    """Merges the `.xls` files into `num_envs` groups chosen by `label_of`."""
    grouped = [[] for _ in range(num_envs)]
    for file in sorted(glob.glob(f'{folder_path}*.xls')):
        label = label_of(_sachs_file_index(file))
        grouped[label].append(pd.read_excel(file).to_numpy())

    x_parts, y_parts = [], []
    for label, arrays in enumerate(grouped):
        if not arrays:
            raise ValueError(
                f'No Sachs file maps to environment {label} '
                f'(pattern {folder_path}*.xls)')
        env_x = np.concatenate(arrays, axis=0)
        env_y = np.ones(env_x.shape[0]) * label
        # Each environment is normalized on its own before merging.
        env_x, env_y = utils.preprocess_labeled_data(env_x, env_y, normalization)
        x_parts.append(env_x)
        y_parts.append(env_y)
    return _balance_and_distribute(x_parts, y_parts)


def _load_or_build_clustered_envs(folder_path, num_envs, normalization,
                                  renormalize):
    """Loads cached k-means environments, creating them when none exist."""
    cache_dir = './data/cluster/'
    # Mirrors the `sachs_env_<i>_<num_envs>.csv` names written by
    # preprocess_sachs. glob is required here: os.path.exists() does not
    # expand wildcards, so it can never detect the cached files.
    cached_files = sorted(glob.glob(f'{cache_dir}sachs_env_*_{num_envs}.csv'))
    if not cached_files:
        return preprocess_sachs(folder_path, cache_dir, num_envs, normalization)

    envs = []
    for file in cached_files:
        samples = np.loadtxt(file, delimiter=',')
        if renormalize:
            samples = utils.preprocess(samples, normalization)
        envs.append(samples)
    return np.stack(envs)


def read_sachs_to_envs(folder_path, num_envs, normalization):
    """Loads Sachs data and returns it divided into different environments.

    Supported layouts:
      * 14: one environment per `.xls` file (the number of environments is
        the number of files found).
      * 2: files with index <= 9 form environment 0, the others environment 1.
      * 7: a file with index `k` goes to environment `k % 7`.
      * 3, 6: k-means environments, loaded from `./data/cluster/` when cached
        there and otherwise created with `preprocess_sachs`.

    Args:
      folder_path: Prefix of the raw Sachs `.xls` files; files are looked up
        with the pattern `<folder_path>*.xls`, so a directory must end with a
        path separator. File names must start with their index followed by a
        dot, e.g. `3. name.xls`.
      num_envs: The number of envs to read the data into; one of 2, 3, 6, 7
        or 14.
      normalization: Normalization type forwarded to the `utils`
        preprocessing helpers.

    Returns:
      A tensor with shape
      [num_envs, number of sample in each envs, num of features].

    Raises:
      ValueError: If `num_envs` is not supported, or no input file is found
        for an environment.
    """
    supported = (2, 3, 6, 7, 14)
    if num_envs not in supported:
        raise ValueError(
            f'Unsupported num_envs={num_envs}; expected one of {supported}')

    if num_envs == 14:
        return _read_sachs_one_env_per_file(folder_path, normalization)
    if num_envs == 2:
        return _read_sachs_grouped(
            folder_path, num_envs, normalization,
            lambda file_index: 0 if file_index <= 9 else 1)
    if num_envs == 7:
        return _read_sachs_grouped(
            folder_path, num_envs, normalization,
            lambda file_index: file_index % num_envs)

    # NOTE(review): cached 6-env data is normalized again after loading while
    # cached 3-env data is not; kept as found -- confirm which is intended.
    return _load_or_build_clustered_envs(
        folder_path, num_envs, normalization, renormalize=(num_envs == 6))