Example #1
def _read_earthquakes():
    # train_data 322 * 512
    # test_data 139 * 512
    # train_labels 322
    # test_labels 139

    working_dir = os.path.join(dir_path, "earthquakes")
    earthquakes_data = {}
    for filename in os.listdir(working_dir):
        if ".arff" in filename:
            # the directory also holds files in other formats (e.g. txt or md), so filter
            if "TEST" in filename:
                # add test data
                earthquakes_data["test"] = _read_arff(filename, working_dir)
            if "TRAIN" in filename:
                # add train data
                earthquakes_data["train"] = _read_arff(filename, working_dir)

    # all columns except the last are features; the last column is the integer class label
    train_data = np.asarray(
        list(map(lambda x: list(x)[:-1], earthquakes_data["train"])))
    test_data = np.asarray(
        list(map(lambda x: list(x)[:-1], earthquakes_data["test"])))
    train_labels = np.asarray(
        list(map(lambda x: int(x[-1]), earthquakes_data["train"])))
    test_labels = np.asarray(
        list(map(lambda x: int(x[-1]), earthquakes_data["test"])))

    return Data(train_data, train_labels, test_data, test_labels)
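These loaders lean on a few names that this excerpt does not define (dir_path, the Data container, _read_arff). Their real definitions are not shown here, so the following is only a rough sketch of what they might look like, assuming an ARFF reader built on SciPy:

import os
import numpy as np
from scipy.io import arff

dir_path = os.path.dirname(os.path.abspath(__file__))  # assumed module-level constant


class _Split:
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels


class Data:
    # assumed container with mutable train/test splits, matching how it is used above
    def __init__(self, train_data, train_labels, test_data, test_labels):
        self.train = _Split(train_data, train_labels)
        self.test = _Split(test_data, test_labels)


def _read_arff(filename, working_dir):
    # assumed helper: load an ARFF file and return its rows
    records, _meta = arff.loadarff(os.path.join(working_dir, filename))
    return records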
Example #2
def _read_wafer():
    # train data 6164 * 152
    # test data 1000 * 152
    # train labels 6164
    # test labels 1000

    working_dir = os.path.join(dir_path, "wafer")
    wafer_data = {}
    for filename in os.listdir(working_dir):
        if ".arff" in filename:
            # the directory also holds files in other formats (e.g. txt or md), so filter
            if "TEST" in filename:
                # add test data
                wafer_data["test"] = _read_arff(filename, working_dir)
            if "TRAIN" in filename:
                # add train data
                wafer_data["train"] = _read_arff(filename, working_dir)

    train_data = np.asarray(
        list(map(lambda x: list(x)[:-1], wafer_data["train"])))
    test_data = np.asarray(
        list(map(lambda x: list(x)[:-1], wafer_data["test"])))
    # the ARFF stores the class as -1/+1; (1 - y) / 2 maps +1 to 0 and -1 to 1
    train_labels = (1 - np.asarray(
        list(map(lambda x: int(x[-1]), wafer_data["train"])))) / 2
    test_labels = (1 - np.asarray(
        list(map(lambda x: int(x[-1]), wafer_data["test"])))) / 2

    return Data(train_data, train_labels, test_data, test_labels)
Example #3
def smote_dataset(dataset: Data):
    # unpack
    X = dataset.train.data
    y = dataset.train.labels
    # oversample the minority class of the training split only
    sm = SMOTE(random_state=42)
    X_res, Y_res = sm.fit_resample(X, y)
    return Data(X_res, Y_res, dataset.test.data, dataset.test.labels)
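SMOTE and fit_resample here follow the imbalanced-learn API (presumably imblearn.over_sampling.SMOTE); the resampled arrays replace the training split while the test split is passed through untouched. Example #6 below shows a self-contained call.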
Example #4
def _read_cmu_wafer():
    # load faster using pickle
    data = Data(None, None, None, None)
    data.train.data = pkl_loader.pkl2np("train_data.pkl").astype(int)
    data.train.labels = pkl_loader.pkl2np("train_labels.pkl").astype(int)
    data.test.data = pkl_loader.pkl2np("test_data.pkl").astype(int)
    data.test.labels = pkl_loader.pkl2np("test_labels.pkl").astype(int)
    return data
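pkl_loader is likewise defined outside this excerpt. Assuming pkl2np just unpickles a file from a known directory and wraps the result in a NumPy array, a plausible sketch (the cmu-wafer directory name is a guess) would be:

import os
import pickle
import numpy as np

_pkl_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cmu-wafer")  # assumed location


def pkl2np(filename):
    # unpickle the file and return its contents as a NumPy array
    with open(os.path.join(_pkl_dir, filename), "rb") as f:
        return np.asarray(pickle.load(f))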
Example #5
def _read_secom_preprocessed(process_type: str = None):
    working_dir = os.path.join(dir_path, "uci-secom-preprocessed")

    data = Data(None, None, None, None)

    for filename in os.listdir(working_dir):
        # common : labels
        if filename == "train.labels.csv":
            data.train.labels = np.asarray(_read_csv(
                filename, working_dir)).astype(int).flatten()
        if filename == "test.labels.csv":
            data.test.labels = np.asarray(_read_csv(
                filename, working_dir)).astype(int).flatten()

        if process_type is None:
            if filename == "train_knnImpute.csv":
                data.train.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)
            if filename == "test_knnImpute.csv":
                data.test.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)

        if process_type == "pca":
            if filename == "train_pca.csv":
                data.train.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)
            if filename == "test_pca.csv":
                data.test.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)

        if process_type == "ica":
            if filename == "train_ica.csv":
                data.train.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)
            if filename == "test_ica.csv":
                data.test.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)

        if process_type == "chisq":
            if filename == "train_chisq.csv":
                data.train.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)
            if filename == "test_chisq.csv":
                data.test.data = np.asarray(_read_csv(
                    filename, working_dir)).astype(float)

    return data
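A hypothetical call site for this loader, choosing one preprocessing variant per call:

data_knn = _read_secom_preprocessed()        # kNN-imputed features (the default)
data_pca = _read_secom_preprocessed("pca")   # PCA-transformed features
print(data_pca.train.data.shape, data_pca.train.labels.shape)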
Example #6
def main():
    X, y = make_classification(n_classes=2,
                               class_sep=2,
                               weights=[0.1, 0.9],
                               n_informative=3,
                               n_redundant=1,
                               flip_y=0,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_samples=1000,
                               random_state=10)

    p = smote_dataset(Data(X, y, None, None))
    X_ = p.train.data
    y_ = p.train.labels
    print(X_.shape)
    print(y_.shape)
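With weights=[0.1, 0.9] and n_samples=1000, the minority class starts with roughly 100 samples; SMOTE oversamples it to match the majority class, so the printed shapes should come out to about (1800, 20) and (1800,).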
Example #7
def _read_secom():
    # the CSV is not pre-split: 1568 rows including the header,
    # i.e. 1567 samples -> 1199 train + 368 test
    working_dir = os.path.join(dir_path, "uci-secom")

    uci_secom_data = []
    for filename in os.listdir(working_dir):
        uci_secom_data.extend(_read_csv(filename, working_dir))
    uci_secom_data = np.asarray(uci_secom_data)
    # the first row holds the column names, so it is skipped,
    # and the first column (a timestamp) is dropped

    train_data = uci_secom_data[1:1200, 1:-1].astype(float)
    test_data = uci_secom_data[1200:, 1:-1].astype(float)
    # labels in the CSV are -1/+1; (y + 1) / 2 remaps them to 0/1
    train_labels = (uci_secom_data[1:1200, -1].astype(int) + 1) / 2
    test_labels = (uci_secom_data[1200:, -1].astype(int) + 1) / 2

    return Data(train_data, train_labels, test_data, test_labels)
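The (y + 1) / 2 arithmetic above simply remaps the CSV's -1/+1 labels to 0/1; a tiny standalone check:

import numpy as np

raw = np.array([-1, 1, -1])
print((raw + 1) / 2)  # [0. 1. 0.]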