Example #1
0
def ReprodIndividualsFromRF(list_indiv, max_id, options):
    """Merge a population's classifiers into one forest, collapse it to a
    single equivalent decision tree, and wrap that tree in a new individual.

    Parameters
    ----------
    list_indiv : iterable of individuals, each exposing a fitted ``clf``.
    max_id : int
        Highest individual id used so far; the offspring gets ``max_id + 1``.
    options : dict
        Must provide 'max_depth', 'alpha', 'on_data', 'X' and 'y'.

    Returns
    -------
    The newly created individual wrapping the merged tree.
    """
    individuals = list(list_indiv)

    # Graft the individuals' fitted trees onto a forest shell; the class
    # metadata is taken from the first tree.
    forest = RandomForestClassifier(n_estimators=len(individuals))
    forest.estimators_ = [indiv.clf for indiv in individuals]
    forest.n_classes_ = forest.estimators_[0].n_classes_
    forest.classes_ = forest.estimators_[0].classes_

    # Collapse the forest into one equivalent decision tree.
    merged_tree = eqtree_rec_rf(forest,
                                0,
                                max_depth=options['max_depth'],
                                smallest_tree=False)

    return genetic.individual(merged_tree,
                              max_id + 1,
                              type_rf=False,
                              alpha=options['alpha'],
                              evaluate_on_data=options['on_data'],
                              X=options['X'],
                              y=options['y'])
Example #2
0
def deserialize_random_forest(model_dict):
    """Rebuild a fitted RandomForestClassifier from its dict serialization.

    ``model_dict`` carries the constructor params, the serialized trees and
    the fitted attributes produced by the matching serializer.
    """
    model = RandomForestClassifier(**model_dict['params'])

    # Restore the fitted trees and learned class labels.
    model.estimators_ = np.array(
        [deserialize_decision_tree(tree_dict)
         for tree_dict in model_dict['estimators_']]
    )
    model.classes_ = np.array(model_dict['classes_'])

    # Copy the remaining scalar attributes back one-to-one.
    for attr in ('n_features_', 'n_outputs_', 'max_depth',
                 'min_samples_split', 'min_samples_leaf',
                 'min_weight_fraction_leaf', 'max_features',
                 'max_leaf_nodes', 'min_impurity_decrease',
                 'min_impurity_split'):
        setattr(model, attr, model_dict[attr])

    # Out-of-bag attributes only exist when oob_score was enabled.
    for attr in ('oob_score_', 'oob_decision_function_'):
        if attr in model_dict:
            setattr(model, attr, model_dict[attr])

    # n_classes_ is serialized as a list for multi-output models.
    n_classes = model_dict['n_classes_']
    if isinstance(n_classes, list):
        model.n_classes_ = np.array(n_classes)
    else:
        model.n_classes_ = n_classes

    return model
Example #3
0
def build_classifier(trees):
    """Wrap one or more fitted sklearn ``Tree`` objects in a classifier.

    A single tree yields a DecisionTreeClassifier; several trees yield a
    RandomForestClassifier whose estimators wrap those trees.
    """
    def wrap_tree(tree):
        # Graft a raw Tree object onto a DecisionTreeClassifier shell,
        # copying the metadata predict() needs.
        estimator = DecisionTreeClassifier(random_state=0)
        estimator.n_features_ = tree.n_features
        estimator.n_outputs_ = tree.n_outputs
        estimator.n_classes_ = tree.n_classes[0]
        estimator.classes_ = np.array(list(range(estimator.n_classes_)))
        estimator.tree_ = tree
        return estimator

    if len(trees) <= 1:
        return wrap_tree(trees[0])

    forest = RandomForestClassifier(random_state=0, n_estimators=len(trees))
    forest.estimators_ = [wrap_tree(tree) for tree in trees]
    forest.n_features_ = trees[0].n_features
    forest.n_outputs_ = trees[0].n_outputs
    forest.n_classes_ = trees[0].n_classes[0]
    forest.classes_ = np.array(list(range(forest.n_classes_)))
    return forest
# Feature importances array (the higher the value, the more important the
# feature).
## forest.feature_importances_
print("Feature Importances")
print(forest.feature_importances_)

# Training-set score obtained with the out-of-bag estimate.
## forest.oob_score_
print("Oob Score = " + str(forest.oob_score_))

# Decision function computed with the out-of-bag estimate on the training
# set.  If n_estimators is small, a data point may never have been left out
# during bootstrapping; in that case oob_decision_function_ may contain NaN.
## forest.oob_decision_function_
print('Oob Decision Function')
print(forest.oob_decision_function_)

"""
 @Method
"""

# Mean accuracy on the given data and labels.  In multi-label
# classification this is the subset accuracy, a harsh metric: every label
# of every sample must be predicted correctly.
fscore = forest.score(X_train, y_train)
print('Score')
print(fscore)

# BUG FIX: the original code assigned the popped estimator back to
# forest.estimators_, replacing the whole list with a single tree and
# breaking the len() call below.  list.pop() already mutates in place, so
# no assignment is needed to drop the last estimator.
forest.estimators_.pop()
print(len(forest.estimators_))
Example #5
0
    # Baseline random forest, trained on the full training set and saved.
    RF = RandomForestClassifier(n_estimators=rfSize)
    RF.fit(train_x, train_y)
    RF_path = model_path + '/RF.m'
    joblib.dump(RF, RF_path)

    # BRAF: an auxiliary forest trained on the critical subset
    # (training_c_x / training_c_y), saved separately.
    # NOTE(review): rf1 and rf2 used below are not defined in this
    # fragment -- presumably created earlier in the enclosing function;
    # confirm rf1 is not a typo for RF above.
    rf3 = RandomForestClassifier(n_estimators=rf2_size)
    rf3.fit(training_c_x, training_c_y)
    rf3_path = model_path + '/rf3.m'
    joblib.dump(rf3, rf3_path)

    # Merged BRAF model: concatenate rf1's and rf3's estimator lists and
    # copy rf1's class metadata so the combined forest can predict.
    RF1 = RandomForestClassifier(n_estimators=rfSize)
    Gobaltree = rf1.estimators_ + rf3.estimators_
    RF1.estimators_ = Gobaltree
    RF1.classes_ = rf1.classes_
    RF1.n_classes_ = rf1.n_classes_
    RF1.n_outputs_ = rf1.n_outputs_
    RF1_path = model_path + '/braf.m'
    joblib.dump(RF1, RF1_path)

    # DBRF: same merge but with rf2's trees; class metadata taken from rf2.
    RF2 = RandomForestClassifier(n_estimators=rfSize)
    mod_Gobaltree = rf1.estimators_ + rf2.estimators_
    RF2.estimators_ = mod_Gobaltree
    RF2.classes_ = rf2.classes_
    RF2.n_classes_ = rf2.n_classes_
    RF2.n_outputs_ = rf2.n_outputs_
    RF2_path = model_path + '/borderlindbscan.m'
    joblib.dump(RF2, RF2_path)
    def app_flow(self):
        # This method contains a state machine for the slave and master instance
        #
        # Overview: each client reads its local data, shares its sample
        # count with the master, trains local random forests sized
        # proportionally to its share of the total samples, and the master
        # merges every client's trees into one global forest per split,
        # which all clients then evaluate and write out.
        # Communication model (as used below): a client serializes into
        # self.data_outgoing and sets self.status_available; received
        # payloads appear in self.data_incoming.

        # === States ===
        state_initializing = 1
        state_read_input = 2
        state_share_samples = 3
        state_gather_1 = 4
        state_wait_1 = 5
        state_train_local = 6
        state_gather_2 = 7
        state_wait_2 = 8
        state_global_ready = 9
        state_finishing = 10

        # Initial state
        state = state_initializing
        self.progress = 'initializing...'

        while True:

            if state == state_initializing:
                if self.id is not None:  # Test if setup has happened already
                    state = state_read_input

            # COMMON PART

            if state == state_read_input:
                # Load the training (and optional test) CSVs, either one
                # pair per sub-directory ('directory' mode) or a single
                # pair ('file' mode), then report the local sample count.
                print('Reading input...')
                base_dir = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))

                def read_input_train(ins, path):
                    # NOTE(review): mixes self.* and ins.* — equivalent
                    # here because the only caller passes ins=self, but
                    # worth unifying.
                    d = pd.read_csv(path, sep=self.sep)
                    data_X = d.drop(self.label, axis=1)
                    data_y = d[self.label]

                    if ins.split_test is not None:
                        # NOTE(review): this re-reads input_train into
                        # ins.data but then splits data_X/data_y from the
                        # first read — confirm the extra read is intended.
                        ins.data = pd.read_csv(os.path.join(base_dir, ins.input_train), sep=ins.sep)
                        data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_X, data_y, test_size=ins.split_test)
                        ins.data_X_train.append(data_X_train)
                        ins.data_y_train.append(data_y_train)
                        ins.data_X_test.append(data_X_test)
                        ins.data_y_test.append(data_y_test)
                    else:
                        ins.data_X_train.append(data_X)
                        ins.data_y_train.append(data_y)

                def read_input_test(ins, path):
                    # Append one held-out test split read from `path`.
                    d = pd.read_csv(path, sep=ins.sep)
                    data_X = d.drop(ins.label, axis=1)
                    data_y = d[ins.label]
                    ins.data_X_test.append(data_X)
                    ins.data_y_test.append(data_y)

                if self.split_mode == 'directory':
                    for split_name in os.listdir(base_dir):
                        read_input_train(self, os.path.join(base_dir, split_name, self.input_train))
                        if self.input_test is not None:
                            read_input_test(self, os.path.join(base_dir, split_name, self.input_test))
                elif self.split_mode == 'file':
                    read_input_train(self, os.path.join(base_dir, self.input_train))
                    if self.input_test is not None:
                        read_input_test(self, os.path.join(base_dir, self.input_test))

                # Average sample count across splits (integer division).
                split_samples = [i.shape[0] for i in self.data_y_train]
                self.my_samples = sum(split_samples) // len(split_samples)

                print(f'Read input. Have {split_samples} samples.')

                # Master counts itself as a received payload; slaves send
                # their count and wait.
                if self.master:
                    self.data_incoming.append(pickle.dumps({
                        'samples': self.my_samples
                    }))
                    state = state_gather_1
                else:
                    self.data_outgoing = pickle.dumps({
                        'samples': self.my_samples
                    })
                    self.status_available = True
                    state = state_wait_1

            if state == state_train_local:
                # Train one forest per split, sized by this client's share
                # of the global sample count.
                print('Calculate local values...')

                rfs = []
                for i in range(len(self.data_X_train)):
                    global_rf = None
                    trees = int(self.estimators_total * self.my_samples / self.total_samples)
                    if self.mode == 'classification':
                        global_rf = RandomForestClassifier(n_estimators=trees, random_state=self.random_state)
                    elif self.mode == 'regression':
                        global_rf = RandomForestRegressor(n_estimators=trees, random_state=self.random_state)
                    global_rf.fit(self.data_X_train[i], self.data_y_train[i])
                    rfs.append({
                        'rf': global_rf,
                    })

                print(f'Trained random forests')

                if self.master:
                    self.data_incoming.append(pickle.dumps(rfs))
                    state = state_gather_2
                else:
                    self.data_outgoing = pickle.dumps(rfs)
                    self.status_available = True
                    state = state_wait_2

            if state == state_global_ready:
                # Evaluate the merged forests on the local test splits and
                # write predictions (plus probabilities for classification)
                # and true labels to the output mount.
                print(f'Forest done')

                results_pred = []
                results_proba = []
                results_test = []
                for i in range(len(self.data_X_train)):
                    results_pred.append(self.rfs[i].predict(self.data_X_test[i]))
                    if self.mode == 'classification':
                        results_proba.append(self.rfs[i].predict_proba(self.data_X_test[i]))
                    results_test.append(self.data_y_test[i])

                def write_output(path, data):
                    # Persist one result table as CSV using the configured
                    # separator.
                    df = pd.DataFrame(data=data)
                    df.to_csv(path, index=False, sep=self.sep)

                print(f'Writing output')
                base_dir_in = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))
                base_dir_out = os.path.normpath(os.path.join(f'/mnt/output/', self.split_dir))
                if self.split_mode == 'directory':
                    # NOTE(review): assumes os.listdir order here matches
                    # the read order above — confirm this holds.
                    for i, split_name in enumerate(os.listdir(base_dir_in)):
                        write_output(os.path.join(base_dir_out, split_name, self.output_pred), {'pred': results_pred[i][:]})
                        if self.mode == 'classification':
                            write_output(os.path.join(base_dir_out, split_name, self.output_proba), {'prob_0': results_proba[i][:, 0], 'prob_1': results_proba[i][:, 1]})
                        write_output(os.path.join(base_dir_out, split_name, self.output_test), {'y_true': results_test[i]})
                elif self.split_mode == 'file':
                    write_output(os.path.join(base_dir_out, self.output_pred), {'pred': results_pred[0][:]})
                    if self.mode == 'classification':
                        write_output(os.path.join(base_dir_out, self.output_proba), {'prob_0': results_proba[0][:, 0], 'prob_1': results_proba[0][:, 1]})
                    write_output(os.path.join(base_dir_out, self.output_test), {'y_true': results_test[0]})

                if self.master:
                    self.data_incoming.append('DONE')
                    state = state_finishing
                else:
                    self.data_outgoing = 'DONE'
                    self.status_available = True
                    break

            # GLOBAL PART

            if state == state_gather_1:
                # Master only: wait for every client's sample count, sum
                # them, and broadcast the total.
                if len(self.data_incoming) == len(self.clients):

                    client_data = []
                    for local_rfs in self.data_incoming:
                        client_data.append(pickle.loads(local_rfs))

                    self.data_incoming = []

                    total_samples = sum([cd['samples'] for cd in client_data])

                    self.total_samples = total_samples

                    self.data_outgoing = pickle.dumps(total_samples)
                    self.status_available = True
                    state = state_train_local

                else:
                    print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

            if state == state_gather_2:
                # Master only: wait for every client's local forests, then
                # per split concatenate all estimator lists into one global
                # forest and broadcast it.
                if len(self.data_incoming) == len(self.clients):

                    client_data = []
                    for local_rfs in self.data_incoming:
                        client_data.append(pickle.loads(local_rfs))

                    self.data_incoming = []

                    data_outgoing = []

                    for i in range(len(self.data_X_train)):
                        global_rf = None

                        # total_samples = 0
                        # for d in client_data:
                        #     total_samples += d[i]['samples']

                        for d in client_data:
                            drf = d[i]['rf']

                            # perc = d[i]['samples'] / total_samples
                            # trees = int(perc * self.estimators_total)

                            # First client's forest becomes the base model;
                            # subsequent clients' trees are appended to it.
                            if global_rf is None:
                                global_rf = drf
                                global_rf.estimators_ = drf.estimators_
                                # global_rf.estimators_ = random.sample(drf.estimators_, trees)
                                global_rf.n_estimators = drf.n_estimators
                            else:
                                global_rf.estimators_ += drf.estimators_
                                # global_rf.estimators_ += random.sample(drf.estimators_, trees)
                                global_rf.n_estimators += drf.n_estimators

                        data_outgoing.append(global_rf)

                    self.rfs = data_outgoing

                    self.data_outgoing = pickle.dumps(data_outgoing)
                    self.status_available = True
                    state = state_global_ready

                else:
                    print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

            if state == state_finishing:
                # Master only: leave the loop once every client reported DONE.
                if len(self.data_incoming) == len(self.clients):
                    self.status_finished = True
                    break

            # LOCAL PART

            if state == state_wait_1:
                # Slave only: wait for the master's global sample total.
                if len(self.data_incoming) > 0:
                    self.total_samples = pickle.loads(self.data_incoming[0])
                    self.data_incoming = []

                    state = state_train_local

            if state == state_wait_2:
                # Slave only: wait for the master's merged global forests.
                if len(self.data_incoming) > 0:
                    self.rfs = pickle.loads(self.data_incoming[0])
                    self.data_incoming = []

                    state = state_global_ready

            # Poll once per second.
            time.sleep(1)
        local_dict.append(w)
    # Sort the collected words/weights for deterministic ordering.
    local_dict = sorted(local_dict)

    ############################# Part 3: Linearity ##########################

    # random forest 1
    rf1 = RandomForestClassifier(random_state=10)
    rf1.fit(train_vectors, y_train)

    # random forest 2
    rf2 = RandomForestClassifier(random_state=15)
    rf2.fit(train_vectors, y_train)

    # random forest 3: never fitted itself; its estimators are the union of
    # rf1's and rf2's trees.
    # NOTE(review): only n_classes_ is copied — classes_ and n_outputs_ are
    # not set, which sklearn's predict path normally needs; confirm rf3 is
    # used in a way that tolerates this.
    rf3 = RandomForestClassifier(random_state=22)
    rf3.estimators_ = rf1.estimators_ + rf2.estimators_
    rf3.n_classes_ = rf1.n_classes_

    # model 1
    def model_rf1(data):
        # Vectorize raw text and return a (n, 2) array built from rf1's
        # positive-class probability.
        # NOTE(review): both columns get p[:, 1]; column 0 would normally
        # be p[:, 0] — confirm the duplication is intentional.
        n_data = len(data)
        res = np.zeros((n_data, 2))
        tfidf = vectorizer.transform(data)
        p = rf1.predict_proba(tfidf)
        res[:, 0] = p[:, 1]
        res[:, 1] = p[:, 1]
        return res

    # model 2
    def model_rf2(data):
        n_data = len(data)