def _read_earthquakes():
    """Load the earthquakes dataset from its .arff train/test files.

    Shapes: train data 322x512, test data 139x512; the trailing column of
    each row is the integer class label (322 / 139 labels respectively).
    """
    working_dir = os.path.join(dir_path, "earthquakes")
    splits = {}
    for fname in os.listdir(working_dir):
        # The directory also holds non-data files (e.g. txt or md), so keep
        # only the .arff ones and route them by the TRAIN/TEST tag.
        if ".arff" not in fname:
            continue
        if "TEST" in fname:
            splits["test"] = _read_arff(fname, working_dir)
        if "TRAIN" in fname:
            splits["train"] = _read_arff(fname, working_dir)
    # Separate features (all but last column) from the label (last column).
    train_data = np.asarray([list(row)[:-1] for row in splits["train"]])
    test_data = np.asarray([list(row)[:-1] for row in splits["test"]])
    train_labels = np.asarray([int(row[-1]) for row in splits["train"]])
    test_labels = np.asarray([int(row[-1]) for row in splits["test"]])
    return Data(train_data, train_labels, test_data, test_labels)
def _read_wafer():
    """Load the wafer dataset from its .arff train/test files.

    Shapes: train data 6164x152, test data 1000x152; the trailing column of
    each row is the raw label. Raw labels are remapped via (1 - y) / 2.
    """
    working_dir = os.path.join(dir_path, "wafer")
    splits = {}
    for fname in os.listdir(working_dir):
        # Skip the non-arff files (e.g. txt or md) that live alongside the data.
        if ".arff" not in fname:
            continue
        if "TEST" in fname:
            splits["test"] = _read_arff(fname, working_dir)
        if "TRAIN" in fname:
            splits["train"] = _read_arff(fname, working_dir)
    # Features are every column but the last; the last column is the label.
    train_data = np.asarray([list(row)[:-1] for row in splits["train"]])
    test_data = np.asarray([list(row)[:-1] for row in splits["test"]])
    # Remap raw labels with (1 - y) / 2 (presumably {1,-1} -> {0,1} — TODO confirm).
    train_labels = (1 - np.asarray([int(row[-1]) for row in splits["train"]])) / 2
    test_labels = (1 - np.asarray([int(row[-1]) for row in splits["test"]])) / 2
    return Data(train_data, train_labels, test_data, test_labels)
def smote_dataset(dataset: Data):
    """Return a new Data whose training split is rebalanced with SMOTE.

    The test split is passed through untouched; random_state is fixed at 42
    so resampling is reproducible.
    """
    features = dataset.train.data
    labels = dataset.train.labels
    resampled_x, resampled_y = SMOTE(random_state=42).fit_resample(features, labels)
    return Data(resampled_x, resampled_y, dataset.test.data, dataset.test.labels)
def _read_cmu_wafer():
    """Load the CMU wafer dataset from pre-built pickle files.

    Pickles are used instead of re-parsing the raw files for speed; all
    arrays are cast to int.
    """
    data = Data(None, None, None, None)
    for split, data_file, labels_file in (
        ("train", "train_data.pkl", "train_labels.pkl"),
        ("test", "test_data.pkl", "test_labels.pkl"),
    ):
        part = getattr(data, split)
        part.data = pkl_loader.pkl2np(data_file).astype(int)
        part.labels = pkl_loader.pkl2np(labels_file).astype(int)
    return data
def _read_secom_preprocessed(process_type: str = None):
    """Load the preprocessed UCI SECOM dataset.

    Args:
        process_type: Which preprocessed feature set to load — None for the
            kNN-imputed features, or one of "pca", "ica", "chisq". For any
            other value only the label files are loaded (data stays None),
            matching the original fall-through behavior.

    Returns:
        Data with float feature matrices and flattened int label vectors.
    """
    working_dir = os.path.join(dir_path, "uci-secom-preprocessed")
    # Map process_type to the tag embedded in the preprocessed file names.
    # Fix: the original compared with `== None` (PEP 8: use `is None`); the
    # dict keyed on None gives the same dispatch without that comparison.
    tags = {None: "knnImpute", "pca": "pca", "ica": "ica", "chisq": "chisq"}
    tag = tags.get(process_type)
    data = Data(None, None, None, None)
    for filename in os.listdir(working_dir):
        # Labels are shared across all preprocessing variants.
        if filename == "train.labels.csv":
            data.train.labels = np.asarray(
                _read_csv(filename, working_dir)).astype(int).flatten()
        elif filename == "test.labels.csv":
            data.test.labels = np.asarray(
                _read_csv(filename, working_dir)).astype(int).flatten()
        elif tag is not None and filename == "train_" + tag + ".csv":
            data.train.data = np.asarray(
                _read_csv(filename, working_dir)).astype(float)
        elif tag is not None and filename == "test_" + tag + ".csv":
            data.test.data = np.asarray(
                _read_csv(filename, working_dir)).astype(float)
    return data
def main():
    """Smoke-test smote_dataset on a synthetic imbalanced problem.

    Builds a 1000-sample, 20-feature binary dataset with a 10/90 class
    split, resamples it, and prints the resulting train shapes.
    """
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=1000, random_state=10)
    resampled = smote_dataset(Data(X, y, None, None))
    print(resampled.train.data.shape)
    print(resampled.train.labels.shape)
def _read_secom():
    """Load the raw (unsplit) UCI SECOM dataset and split it 1199 / 368.

    The CSV header row and the leading time column are dropped. Labels are
    remapped with (y + 1) / 2 (presumably {-1,1} -> {0,1} — TODO confirm).
    """
    working_dir = os.path.join(dir_path, "uci-secom")
    rows = []
    for filename in os.listdir(working_dir):
        rows.extend(_read_csv(filename, working_dir))
    rows = np.asarray(rows)
    # Row 0 holds column names; column 0 is a timestamp; the last column is
    # the label. Strip all three before the type casts.
    features = rows[1:, 1:-1].astype(float)
    labels = (rows[1:, -1].astype(int) + 1) / 2
    # First 1199 samples train, the remaining 368 test.
    return Data(features[:1199], labels[:1199], features[1199:], labels[1199:])