Ejemplo n.º 1
0
    def fit(self, x, y, iteration=100):
        """Train one AmlmnbBase learner per hidden-variable group and
        aggregate their per-iteration convergence.

        Args:
            x: training samples (list of attribute lists).
            y: class labels.
            iteration: number of EM/training iterations per base learner.

        Side effects: sets self.base_learner and self.convergence, where
        self.convergence[t] = log2(sum of base convergences at t), or
        -inf when that sum is zero.
        """
        # One base learner per attribute group in the structure.
        self.base_learner = [
            AmlmnbBase([list(range(len(group)))],
                       [self.structure.h_states[idx]],
                       delta=self.delta,
                       alpha=self.alpha,
                       mode_h=self.mode_h)
            for idx, group in enumerate(self.structure.h)
        ]

        # Train each learner on only the columns belonging to its group.
        for idx, group in enumerate(self.structure.h):
            group_cols = [[sample[col] for col in group] for sample in x]
            self.base_learner[idx].fit(group_cols, y, iteration)

        # Aggregate convergence across learners, iteration by iteration.
        self.convergence = []
        for t in range(iteration):
            total = sum(bl.convergence[t] for bl in self.base_learner)
            # log2 of zero is undefined; report -inf instead.
            self.convergence.append(math.log2(total) if total != 0 else -np.inf)
Ejemplo n.º 2
0
 def cv_parallel(self, cv_iteration):
     """Run repeated k-fold cross-validation in parallel worker processes.

     Parameters
     ----------
     cv_iteration : int
         Number of repeated cross-validation runs per dataset.

     Returns
     -------
     tuple of manager.list
         (precision, recall, f1, accuracy, mcc, auc), each of length
         cv_iteration * number_of_folds, filled in by worker_thread.

     Bug fixes vs. previous version:
     - Removed the ``if __name__ == '__main__':`` guard: inside a method
       ``__name__`` is the defining module's name, so the guard was false
       on import and the final ``return`` raised NameError. The
       multiprocessing entry guard belongs in the launching script.
     - The fold count ``k`` was shadowed by the inner feature-index loop,
       so worker_thread received a feature index instead of the number of
       folds. The feature list is now built without shadowing.
     - ``children`` is rebuilt per dataset instead of accumulating across
       datasets.
     """
     k = self.config["number_of_folds"]
     manager = Manager()
     # Shared, process-safe result slots; one per (iteration, fold).
     precision = manager.list([0] * cv_iteration * k)
     recall = manager.list([0] * cv_iteration * k)
     f1 = manager.list([0] * cv_iteration * k)
     accuracy = manager.list([0] * cv_iteration * k)
     mcc = manager.list([0] * cv_iteration * k)
     auc = manager.list([0] * cv_iteration * k)
     self.model_holder.append("MLMNB")
     self.dataset = DataPreprocessing.binerize_class(self.dataset)
     for model_name, clf in zip(self.model_holder, self.classifiers):
         for ds_cat, ds_val in self.dataset.items():
             for i in range(len(ds_val)):
                 # All feature indices of this dataset (class column excluded).
                 children = list(range(np.size(ds_val[i], 1) - 1))
                 model = AmlmnbBase(h=[children],
                                    h_states=[10],
                                    delta=0.01,
                                    alpha=0.00001)
                 # NOTE(review): appending while zip() iterates
                 # self.classifiers — kept as-is, but worth confirming
                 # the intended pairing of models and names.
                 self.classifiers.append(model)
                 _dataset = np.array(ds_val[i])
                 if ds_cat in ("CM1", "JM1", "KC1", "KC2", "KC3"):
                     # NASA datasets need their unused metric columns removed.
                     _dataset = self.delete_unused_NASA_metrics(
                         _dataset, ds_cat)
                 else:
                     # Discretize the continuous columns, keep the rest.
                     reduced_tr = np.delete(_dataset,
                                            [9, 11, 13, 14, 17, 19],
                                            axis=1)
                     discretized_tr = disc.fit_transform(
                         _dataset[:, [9, 11, 13, 14, 17, 19]])
                     _dataset = np.concatenate(
                         (discretized_tr, reduced_tr), axis=1)
                 X = _dataset[:, 0:-1]
                 y = _dataset[:, -1]
                 threads = [
                     Process(target=self.worker_thread,
                             args=(clf, X, y, k, jj, precision,
                                   recall, f1, accuracy, mcc, auc))
                     for jj in range(cv_iteration)
                 ]
                 for proc in threads:
                     proc.start()
                 for proc in threads:
                     proc.join()
     return precision, recall, f1, accuracy, mcc, auc
Ejemplo n.º 3
0
    def find_prob(self,
                  x_train,
                  y_train,
                  x_test,
                  h_states,
                  iteration=100,
                  delta=1e-10,
                  alpha=0):
        """Train a grid of base learners — one row per hidden group, one
        column per candidate hidden-state count — and collect their test
        predictions.

        Returns:
            prob: prob[i][j] is an np.array of predictions from the j-th
                candidate learner of group i on x_test.
            c: class attribute taken from the first trained learner.
            pc: class prior of the first trained learner.
        """
        base_learner = []
        for group in self.structure.h:
            candidates = []
            for states in h_states:
                candidates.append(
                    AmlmnbBase([list(range(len(group)))],
                               [states],
                               delta,
                               alpha,
                               mode_h='individual'))
                # A single-attribute group only needs one candidate.
                if len(group) == 1:
                    break
            base_learner.append(candidates)

        # Fit every candidate of each group on that group's columns.
        for grp_idx, group in enumerate(self.structure.h):
            train_cols = [[sample[col] for col in group]
                          for sample in x_train]
            for candidate in base_learner[grp_idx]:
                candidate.fit(train_cols, y_train, iteration)

        c = base_learner[0][0].compact_attribute[-1]
        pc = base_learner[0][0].pc

        # Predict with every candidate of each group on the test columns.
        prob = []
        for grp_idx, group in enumerate(self.structure.h):
            test_cols = [[sample[col] for col in group]
                         for sample in x_test]
            prob.append([
                np.array(candidate.predict(test_cols))
                for candidate in base_learner[grp_idx]
            ])

        return prob, c, pc
Ejemplo n.º 4
0
def main():
    """Entry point: load the ant dataset, cross-validate an AMLMNB learner
    in parallel, then print and persist accuracy/AUC/timing results."""
    global learner, acc
    ds_name = "ant"
    data = pd.read_csv('ant0.csv')
    path = "ant.csv"
    data = data.values

    m = np.size(data, 0)

    # Feature rows as plain lists plus the class column; kept for the
    # commented-out structure-search experiments below.
    x = [list(row) for row in data[:, :-1]]
    y = list(data[:, -1])

    t1 = time.time()

    repeat_cross = 10
    eps = sys.float_info.epsilon
    learner = AmlmnbBase(delta=0.01, alpha=0.000001)

    # molecular-biology = {sp= 2, alpha=0.0009, delta=0.001}
    # lymphography = {sp = 20, alpha=0.0009, delta=0.001}

    sp = 2
    # learner.auto_struct(x, y, m, sp, pairwise=True)
    # learner.make_structure(x, y, num_cluster=1, h_states=2)
    k_fold = 10
    iteration = 100
    acc, AUC, cll = parallel(data, repeat_cross, k_fold, iteration, learner)
    # acc,L_h = learner.search_len_h(x, y, iteration, k_fold, [2], delta=0.00002, alpha=0.01)

    t2 = time.time()
    print("accuracy = ", sum(acc) / len(acc))
    print("AUC = ", sum(AUC) / len(AUC))
    # print("CLL = ", sum(cll) / len(cll))
    print("time:", t2 - t1)
    save_result_in_csv(ds_name, acc, cll, repeat_cross, k_fold, iteration, AUC, path)
Ejemplo n.º 5
0
    def cross_validation(self):
        """Within-project cross-validation over every dataset and classifier.

        The "MLMNB" model is re-instantiated for each hidden-state count
        ii in 1..9 on a mutual-information-selected half of the features;
        all other classifiers are used as provided. Per-fold performance
        rows accumulate in temp_result and are written to CSV after each
        model finishes.

        Bug fix vs. previous version: ``children`` is rebuilt for each
        dataset. It previously kept growing across every dataset and every
        ii value, so ``h=[children]`` received stale and duplicated feature
        indices for everything after the first dataset.
        """
        temp_result = [self.cv_header]
        self.model_holder.append("MLMNB")
        learner = AmlmnbBase(h=[],
                             h_states=[1],
                             delta=0.001,
                             alpha=0.00001,
                             mode_h='individual11')
        self.classifiers.append(learner)
        # Swap the new MLMNB learner to the front so it lines up with the
        # "MLMNB" name appended to model_holder.
        temp = self.classifiers[0]
        self.classifiers[0] = learner
        self.classifiers[-1] = temp

        self.dataset = DataPreprocessing.binerize_class(self.dataset)
        for model_name, clf in zip(self.model_holder, self.classifiers):
            if model_name == "MLMNB":
                for ii in range(1, 10):
                    for ds_cat, ds_val in self.dataset.items():
                        for i in range(len(ds_val)):
                            _dataset = np.array(ds_val[i])
                            c = _dataset[:, -1]
                            # Keep the best half of the features by mutual
                            # information with the class, then re-attach the
                            # class column.
                            _dataset = SelectKBest(
                                score_func=mutual_info_classif,
                                k=round(np.size(_dataset, 1) /
                                        2)).fit_transform(
                                            _dataset[:, 0:-1], _dataset[:, -1])
                            _dataset = np.concatenate(
                                (_dataset, c.reshape(-1, 1)), axis=1)

                            # Columns whose sum is not an int are treated as
                            # continuous (used by the commented-out
                            # discretization step below).
                            array_types = [
                                isinstance(sum(_dataset[:, feature]), int)
                                for feature in range(np.size(_dataset, 1) - 1)
                            ]
                            conti_features = [
                                idx for idx, is_int in enumerate(array_types)
                                if not is_int
                            ]
                            # All feature indices of THIS dataset only.
                            children = list(range(np.size(_dataset, 1) - 1))
                            if ii == 1:
                                # ii == 1 means no hidden variable.
                                learner = AmlmnbBase(h=[],
                                                     h_states=[ii],
                                                     delta=0.001,
                                                     alpha=0.00001)
                            else:
                                learner = AmlmnbBase(h=[children],
                                                     h_states=[ii],
                                                     delta=0.001,
                                                     alpha=0.00001)
                            clf = learner
                            # if ds_cat == "CM1" or ds_cat == "JM1" or ds_cat == "KC1" or ds_cat == "KC2" or ds_cat == "KC3":
                            #     _dataset = self.delete_unused_NASA_metrics(_dataset, ds_cat, conti_features)
                            # else:
                            #     reduced_tr = np.delete(_dataset, conti_features, axis=1)
                            #     discretized_tr = disc.fit_transform(_dataset[:, conti_features])
                            #     _dataset = np.concatenate((discretized_tr, reduced_tr), axis=1)

                            for key_iter in range(self.config['iterations']):
                                X = _dataset[:, 0:-1]
                                y = _dataset[:, -1]
                                k = 0
                                for train_idx, test_idx in self.validator.split(
                                        X, y):
                                    print('CLASSIFIER:', model_name + str(ii),
                                          "DATASET",
                                          self.dataset_names[ds_cat][i],
                                          'ITERATION:', key_iter, 'CV_FOLD:',
                                          k)
                                    X_train, X_test = X[train_idx], X[test_idx]
                                    # y_test = actual class label of test data
                                    # y_train = actual class label of train data
                                    y_train, y_test = y[train_idx], y[test_idx]
                                    clf.fit(X_train, y_train)
                                    score = clf.predict(X_test)
                                    perf_holder = self.perf_obj.compute_measures(
                                        y_test, score)
                                    cross_val_pack = [
                                        str(self.dataset_names[ds_cat][i]),
                                        key_iter, k, model_name + str(ii),
                                        *perf_holder
                                    ]

                                    k = k + 1

                                    temp_result.append(cross_val_pack)
            else:
                for ds_cat, ds_val in self.dataset.items():
                    for i in range(len(ds_val)):
                        _dataset = np.array(ds_val[i])
                        for key_iter in range(self.config['iterations']):
                            X = _dataset[:, 0:-1]
                            y = _dataset[:, -1]
                            k = 0
                            for train_idx, test_idx in self.validator.split(
                                    X, y):
                                print('CLASSIFIER:', model_name, "DATASET",
                                      self.dataset_names[ds_cat][i],
                                      'ITERATION:', key_iter, 'CV_FOLD:', k)
                                X_train, X_test = X[train_idx], X[test_idx]
                                # y_test = actual class label of test data
                                # y_train = actual class label of train data
                                y_train, y_test = y[train_idx], y[test_idx]

                                clf.fit(X_train, y_train)
                                score = clf.predict(X_test)
                                perf_holder = self.perf_obj.compute_measures(
                                    y_test, score)
                                cross_val_pack = [
                                    str(self.dataset_names[ds_cat][i]),
                                    key_iter, k, model_name, *perf_holder
                                ]

                                k = k + 1

                                temp_result.append(cross_val_pack)

            dh_obj.write_csv(
                temp_result,
                self.config['file_level_WPDP_cross_validation_results_des'])
Ejemplo n.º 6
0
    def cpdp(self):
        """Cross-project defect prediction over all dataset pairs.

        For the "MLMNB" model, the training set is reduced to its best half
        of features (mutual information), the test set is aligned to the
        same columns via column-sum matching, continuous features are
        discretized, and a fresh AmlmnbBase learner is built per pair.
        Per-iteration results for every model/pair are appended to CSV.

        Bug fix vs. previous version: ``children`` is rebuilt for every
        pair. It previously accumulated feature indices across pairs, so
        every pair after the first trained with stale/duplicate indices.
        """
        learner = AmlmnbBase(h=[],
                             h_states=[1],
                             delta=0.001,
                             alpha=0.00001,
                             mode_h='individual11')
        au = vae.AutoEncoder(self.nb_epoch, self.batch_size, self.encoding_dim,
                             self.learning_rate)
        self.classifiers.append(learner)
        self.classifiers.append(au)
        # Move the MLMNB learner to the front; the displaced classifier
        # goes second-from-last (the autoencoder stays last).
        temp = self.classifiers[0]
        self.classifiers[0] = learner
        self.classifiers[-2] = temp

        data_pairs = load_CPDP_datasets()
        data_pairs = DataPreprocessing.binerizeCPDP(data_pairs)
        temp_result = [self.header3]
        for model_name, clf in zip(self.model_holder, self.classifiers):
            for pair, data in enumerate(data_pairs):
                tr = np.asarray(data[0])
                ts = np.asarray(data[1])

                if model_name == "MLMNB":
                    # tca_obj = TCA(dim=tr.shape[1] - 1, kerneltype='linear', kernelparam=0.1, mu=1)
                    # tr[:, 0:-1], ts[:, 0:-1] = tca_obj.fit_transform(tr[:, 0:-1], ts[:, 0:-1])

                    # Remember each training column's sum so the columns
                    # selected by SelectKBest can be located in the test set.
                    ss = [[], []]
                    for i in range(np.size(tr, 1) - 1):
                        ss[0].append(sum(tr[:, i]))
                        ss[1].append(i)
                    c = tr[:, -1]
                    tr = SelectKBest(score_func=mutual_info_classif,
                                     k=round(np.size(tr, 1) /
                                             2)).fit_transform(
                                                 tr[:, 0:-1], tr[:, -1])

                    tr = np.concatenate((tr, c.reshape(-1, 1)), axis=1)
                    # Match surviving train columns back to test columns by
                    # comparing column sums.
                    idxx = []
                    for i in range(len(ss[0])):
                        for j in range(np.size(tr, 1) - 1):
                            if ss[0][i] == sum(tr[:, j]):
                                idxx.append(ss[1][i])
                    cts = ts[:, -1]
                    ts = ts[:, idxx]
                    ts = np.concatenate((ts, cts.reshape(-1, 1)), axis=1)

                    # Continuous features: columns flagged by
                    # remove_leading_zero applied to the column sums.
                    array_types = [
                        sum(tr[:, feature])
                        for feature in range(np.size(tr, 1) - 1)
                    ]
                    array_types = self.remove_leading_zero(array_types)
                    conti_features = []
                    for idx, e in enumerate(array_types):
                        if e == True:
                            conti_features.append(idx)
                    # All feature indices of THIS pair's training set only.
                    children = list(range(np.size(tr, 1) - 1))
                    learner = AmlmnbBase(h=[children],
                                         h_states=[5],
                                         delta=0.01,
                                         alpha=0.001,
                                         mode_h='individual')
                    clf = learner

                    # Discretize continuous columns in train and test.
                    reduced_tr = np.delete(tr, conti_features, axis=1)
                    discretized_tr = disc.fit_transform(tr[:, conti_features])
                    tr = np.concatenate((discretized_tr, reduced_tr), axis=1)

                    reduced_ts = np.delete(ts, conti_features, axis=1)
                    discretized_ts = disc.fit_transform(ts[:, conti_features])
                    ts = np.concatenate((discretized_ts, reduced_ts), axis=1)

                X_train = tr[:, 0:-1]
                y_train = tr[:, -1]
                X_test = ts[:, 0:-1]
                y_test = ts[:, -1]

                for iterations in range(self.config['iterations']):
                    print("MODEL:", model_name, "PAIR:", pair, "ITERATION:",
                          iterations)
                    if model_name == "DNN":
                        df_train = np.concatenate(
                            (X_train, y_train.reshape(-1, 1)), axis=1)
                        df_test = np.concatenate(
                            (X_test, y_test.reshape(-1, 1)), axis=1)

                        # The autoencoder trains on rows labelled 1 only.
                        df_train_1 = df_train[df_train[:, -1] == 1]
                        df_train_1_x = np.delete(df_train_1, -1, axis=1)

                        b.clear_session()
                        clf.fit(df_train_1_x)
                        # NOTE(review): predict receives df_test WITH the
                        # label column — confirm that is intended.
                        perf_holder = clf.predict(df_test)
                    else:
                        clf.fit(X_train, y_train)
                        random.seed(100)
                        y_pred = clf.predict(X_test)

                        y_pred = np.array(y_pred)
                        perf_holder = self.perf_obj.compute_measures(
                            y_test, y_pred)

                    release_pack = [model_name, pair, iterations, *perf_holder]

                    temp_result.append(release_pack)

                    dh_obj.write_csv(
                        temp_result, self.
                        config['file_level_different_release_results_whole'])
        return self
Ejemplo n.º 7
0
    def wpdp(self):
        """Within-project defect prediction: train on release i, test on a
        later release j of the same project, for every classifier.

        For "MLMNB", the training release is reduced to its best half of
        features (mutual information), the test release is aligned to the
        same columns via column-sum matching, and a fresh learner is built
        per release pair. Per-iteration results are appended to CSV.

        Bug fix vs. previous version: removed the dead ``children`` list —
        it grew across release pairs but was never used; the learner is
        built from ``conti_features``.
        """
        learner = AmlmnbBase(h=[],
                             h_states=[1],
                             delta=0.001,
                             alpha=0.00001,
                             mode_h='individual11')
        au = vae.AutoEncoder(self.nb_epoch, self.batch_size, self.encoding_dim,
                             self.learning_rate)
        self.classifiers.append(learner)
        self.classifiers.append(au)
        # Move the MLMNB learner to the front; the displaced classifier
        # goes second-from-last (the autoencoder stays last).
        temp = self.classifiers[0]
        self.classifiers[0] = learner
        self.classifiers[-2] = temp

        # MNB_CLF = AmlmnbBase(h=[], h_states=[1], delta=0.001, alpha=0.00001, mode_h='individual')
        # self.model_holder[1] = "MNB"
        # self.classifiers[1] = MNB_CLF

        self.dataset = DataPreprocessing.binerize_class(self.dataset)
        temp_result = [self.header2]
        for model_name, clf in zip(self.model_holder, self.classifiers):
            for ds_cat, ds_val in self.dataset.items():
                for i in range(len(ds_val)):
                    for j in range(i + 1, len(ds_val)):
                        tr = np.array(ds_val[i])
                        ts = np.array(ds_val[j])

                        if model_name == "MLMNB":
                            # Remember each training column's sum so the
                            # selected columns can be matched in the test set.
                            ss = [[], []]
                            for counter in range(np.size(tr, 1) - 1):
                                ss[0].append(sum(tr[:, counter]))
                                ss[1].append(counter)
                            c = tr[:, -1]
                            tr = SelectKBest(
                                score_func=mutual_info_classif,
                                k=round(np.size(tr, 1) / 2)).fit_transform(
                                    tr[:, 0:-1], tr[:, -1])

                            tr = np.concatenate((tr, c.reshape(-1, 1)), axis=1)
                            # Align the test release to the surviving
                            # training columns by matching column sums.
                            idxx = []
                            for counter1 in range(len(ss[0])):
                                for counter2 in range(np.size(tr, 1) - 1):
                                    if ss[0][counter1] == sum(tr[:, counter2]):
                                        idxx.append(ss[1][counter1])
                            cts = ts[:, -1]
                            ts = ts[:, idxx]
                            ts = np.concatenate((ts, cts.reshape(-1, 1)),
                                                axis=1)

                            # Continuous features: columns flagged by
                            # remove_leading_zero on the column sums.
                            array_types = [
                                sum(tr[:, feature])
                                for feature in range(np.size(tr, 1) - 1)
                            ]
                            array_types = self.remove_leading_zero(array_types)
                            conti_features = []
                            for idx, e in enumerate(array_types):
                                if e == True:
                                    conti_features.append(idx)
                            learner = AmlmnbBase(h=[conti_features],
                                                 h_states=[5],
                                                 delta=0.01,
                                                 alpha=0.001,
                                                 mode_h='individual')
                            clf = learner
                            # if ds_cat == "CM1" or ds_cat == "JM1" or ds_cat == "KC1" or ds_cat == "KC2" or ds_cat == "KC3":
                            #     tr = self.delete_unused_NASA_metrics(tr, ds_cat, conti_features)
                            #     ts = self.delete_unused_NASA_metrics(ts, ds_cat, conti_features)
                            # else:
                            # _dataset[:, 0:-1] = disc.fit_transform(_dataset[:, 0:-1])
                            # reduced_tr = np.delete(tr, conti_features, axis=1)
                            # discretized_tr = disc.fit_transform(tr[:, conti_features])
                            # tr = np.concatenate((discretized_tr, reduced_tr), axis=1)
                            #
                            # reduced_ts = np.delete(ts, conti_features, axis=1)
                            # discretized_ts = disc.fit_transform(ts[:, conti_features])
                            # ts = np.concatenate((discretized_ts, reduced_ts), axis=1)

                        X_train = tr[:, 0:-1]
                        y_train = tr[:, -1]
                        X_test = ts[:, 0:-1]
                        y_test = ts[:, -1]

                        for iterations in range(self.config['iterations']):
                            if model_name == "DNN":
                                df_train = np.concatenate(
                                    (X_train, y_train.reshape(-1, 1)), axis=1)
                                df_test = np.concatenate(
                                    (X_test, y_test.reshape(-1, 1)), axis=1)

                                # The autoencoder trains on rows labelled 1.
                                df_train_1 = df_train[df_train[:, -1] == 1]
                                df_train_1_x = np.delete(df_train_1,
                                                         -1,
                                                         axis=1)

                                b.clear_session()
                                clf.fit(df_train_1_x)
                                # NOTE(review): predict receives df_test WITH
                                # its label column, and y_pred below is stale
                                # from a previous model in this branch —
                                # confirm intended behavior.
                                perf_holder = clf.predict(df_test)
                            else:
                                clf.fit(X_train, y_train)
                                random.seed(100)
                                y_pred = clf.predict(X_test)

                                y_pred = np.array(y_pred)
                                perf_holder = self.perf_obj.compute_measures(
                                    y_test, y_pred)

                            print(self.dataset_names[ds_cat][i])

                            release_pack = [
                                model_name, ds_cat,
                                self.dataset_names[ds_cat][i],
                                self.dataset_names[ds_cat][j], iterations,
                                *perf_holder
                            ]

                            # NOTE(review): `a` and `addr` are computed but
                            # never written anywhere — likely remnants of a
                            # removed per-file CSV export; kept for now.
                            a = pd.concat([
                                self.dataset_file_names[ds_cat][j],
                                pd.DataFrame(y_test.reshape((-1, 1))).reindex(
                                    self.dataset_file_names[ds_cat][j].index),
                                pd.DataFrame(y_pred.reshape((-1, 1))).reindex(
                                    self.dataset_file_names[ds_cat][j].index)
                            ],
                                          axis=1,
                                          ignore_index=True)

                            a.columns = self.header1
                            temp_result.append(release_pack)

                            addr = self.temp_addr + model_name + "_" + "Iteration {}".format(
                                iterations
                            ) + "_" + self.dataset_names[ds_cat][j]

            dh_obj.write_csv(
                temp_result,
                self.config['file_level_different_release_results_whole'])