Example #1
def rf(self):
    # Assumes a project-local `rfc` module whose RandomForest takes a CSV path;
    # train() returns the fitted model and `clf.predict` holds the prediction data.
    clf = rfc.RandomForest("train.csv")
    tree = clf.train()
    print(tree)
    if self.p_flag == "p":
        print(clf.predict)
        predictions = tree.predict(clf.predict)
        self.binning(clf.predict_original, predictions)
Example #2
def check_feature_rate():
    import math
    import numpy as np  # np, and the project's `read` module, are assumed available
    import randomForest as rf
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False

    data_path = 'DorCirurgiaCategNA.csv'
    class_questionnaire = 'Q92510'
    class_name = 'Q92510_snDorPos'
    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)  #skip_class_questionnaire=False)

    X = data[:, 0:-1]
    y = np.array(data[:, -1])

    ntrees = 5001
    replace = False
    mtry = math.sqrt
    max_depth = None
    missing_branch = True
    seed = np.random.randint(0, 10000)
    clf1 = rf.RandomForest(ntrees=ntrees,
                           oob_error=True,
                           random_state=seed,
                           mtry=mtry,
                           missing_branch=missing_branch,
                           prob_answer=False,
                           max_depth=max_depth,
                           replace=replace,
                           balance=True)
    clf1.fit(X, y)
    attributes_used = {}
    for tree in clf1.forest:
        for attribute in tree.feature_indices:
            if attribute not in attributes_used:
                attributes_used[attribute] = 1
            else:
                attributes_used[attribute] += 1

    if len(attributes_used) != X.shape[1]:
        print(len(attributes_used))
        print(X.shape[1])
        print('not equal!!! %r' %
              (1 - len(attributes_used) / X.shape[1]))
    print({original_attributes[a]: b for a, b in attributes_used.items()})
    print(1 - clf1.oob_error_)
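
# A minimal alternative sketch for the usage-counting loop in check_feature_rate,
# assuming each tree exposes `feature_indices` as above: collections.Counter
# replaces the manual if/else bookkeeping (count_attributes_used is a
# hypothetical helper, not part of the randomForest module).
from collections import Counter

def count_attributes_used(forest):
    counts = Counter()
    for tree in forest:
        counts.update(tree.feature_indices)
    return dict(counts)
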
Example #3
def select_params(X, y):
    times = 10
    final_scores = []
    parameters = []
    ntrees = 10  #[700,600,500,400,300,200,100]#,50,25]
    mtry = [math.sqrt, None, math.log2]
    max_depth = [2, 3, 4, None]
    missing_branch = [True]  #[True,False]
    replace = [False]  #[True,False]

    for md in max_depth:
        for mb in missing_branch:
            for mt in mtry:
                for r in replace:
                    parameters.append({
                        'max_depth': md,
                        'missing_branch': mb,
                        'mtry': mt,
                        'replace': r
                    })
    cont = 0
    print(len(parameters))
    for params in parameters:
        cont += 1
        print('Choice %r of parameters' % cont)

        seed = np.random.randint(0, 100000)

        #for seed in np.random.choice(range(1000),times):

        clf = rf.RandomForest(random_state=seed,
                              ntrees=ntrees,
                              oob_error=True,
                              mtry=params['mtry'],
                              missing_branch=params['missing_branch'],
                              max_depth=params['max_depth'],
                              replace=params['replace'],
                              balance=True)
        clf.fit(X, y)
        final_scores.append(clf.oob_error_)

    min_score = min(final_scores)
    std = np.std(final_scores)
    print('Best set of parameters:')
    indexes = np.where(np.array(final_scores) == min_score)[0]
    for index in indexes:
        print(parameters[index])

    print('Best 1.s.e. set of parameters:')
    index = (np.abs(np.array(final_scores) - (min_score + std))).argmin()
    print('%r: %r' % (parameters[index], final_scores[index]))

    print('10 best parameters:')
    for a, b in zip(
            np.array(parameters)[np.array(final_scores).argsort()[:10]],
            np.array(sorted(final_scores)[:10])):
        print('%r : %r ' % (a, b))
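
# The four nested loops in select_params enumerate a parameter grid; an
# equivalent sketch with itertools.product (build_grid is a hypothetical
# helper, producing the same dicts in the same order of variation):
import itertools

def build_grid(max_depth, missing_branch, mtry, replace):
    return [{'max_depth': md, 'missing_branch': mb, 'mtry': mt, 'replace': r}
            for md, mb, mt, r in itertools.product(max_depth, missing_branch,
                                                   mtry, replace)]
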
Example #4
def plot_randomforest_accuracy_threshold(X, y, original_attributes,
                                         variable_importances, ntrees, replace,
                                         mtry, max_depth, missing_branch):
    thrs = []
    accuracy = []
    nfeatures = len(variable_importances)
    features = [a[0] for a in variable_importances]
    thresholds = [a[1] for a in variable_importances]
    seed = np.random.randint(0, 10000)
    clf = rf.RandomForest(ntrees=ntrees,
                          oob_error=True,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          random_state=seed)
    clf.fit(X, y)
    thrs.append(thresholds[-1])
    #nf.append(nfeatures)
    accuracy.append(1 - clf.oob_error_)
    for i in range(1, nfeatures):
        print('eliminating feature %r...' % original_attributes[features[-i]])
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X[:, features[:-i]], y)
        #print(features[:-i])
        #nf.append(nfeatures-i)
        thrs.append(thresholds[-i - 1])
        accuracy.append(1 - clf.oob_error_)

    plt.plot(thrs, accuracy, 'bo')
    plt.xlabel('threshold')
    plt.ylabel('accuracy')
    plt.show()
Example #5
def plot_randomforest_seed(X, y, attributes):
    missing_branch = []
    missing_c45 = []
    seeds = []
    for seed in range(0, 1000, 10):
        print('seed: %r' % seed)

        clf = rf.RandomForest(ntrees=300,
                              oob_error=True,
                              mtry=math.sqrt,
                              missing_branch=True,
                              prob_answer=False,
                              max_depth=4,
                              replace=False,
                              random_state=seed)
        clf.fit(X, y)

        missing_branch.append(1 - clf.oob_error_)
        print(1 - clf.oob_error_)

        clf2 = rf.RandomForest(ntrees=300,
                               oob_error=True,
                               mtry=math.sqrt,
                               missing_branch=False,
                               prob_answer=False,
                               max_depth=4,
                               replace=False,
                               random_state=seed)
        clf2.fit(X, y)

        missing_c45.append(1 - clf2.oob_error_)
        print(1 - clf2.oob_error_)

        seeds.append(seed)

    plt.plot(missing_c45, missing_branch, 'x', color='blue')
    plt.show()
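
# To make the seed scatter above easier to read, one can overlay a y = x
# reference line: points above it are seeds where missing_branch=True wins.
# A sketch (plot_paired_accuracy is a hypothetical helper):
import matplotlib.pyplot as plt

def plot_paired_accuracy(acc_c45, acc_mb):
    lo, hi = min(acc_c45 + acc_mb), max(acc_c45 + acc_mb)
    plt.plot(acc_c45, acc_mb, 'x', color='blue')
    plt.plot([lo, hi], [lo, hi], '--', color='gray')  # y = x reference
    plt.xlabel('accuracy (missing_branch=False)')
    plt.ylabel('accuracy (missing_branch=True)')
    plt.show()
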
Example #6
def plot_randomforest_accuracy_nfeatures(X, y, original_attributes, features,
                                         ntrees, replace, mtry, max_depth,
                                         missing_branch):
    nf = []
    accuracy = []
    nfeatures = len(features)
    seed = np.random.randint(0, 10000)
    clf = rf.RandomForest(ntrees=ntrees,
                          oob_error=True,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          random_state=seed)
    clf.fit(X, y)
    nf.append(0)
    accuracy.append(1 - clf.oob_error_)
    for i in range(1, nfeatures):
        print('eliminating feature %r...' % original_attributes[features[-i]])
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X[:, features[:-i]], y)

        nf.append(i)
        accuracy.append(1 - clf.oob_error_)

    plt.plot(nf, accuracy, 'bo')
    plt.xlabel('number of features not being considered')
    plt.ylabel('accuracy')
    plt.show()
Example #7
def plot_randomforest_accuracy(X,
                               y,
                               attributes,
                               ntrees,
                               replace,
                               mtry,
                               max_depth,
                               missing_branch,
                               ntimes,
                               title=None):
    missing_branch_dict = {}
    missing_c45 = []
    seeds = []
    for i in range(ntimes):
        #for seed in range(0,1000,50):
        seed = np.random.randint(100000)
        #print('seed: %r' % seed)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X, y)

        acc = round(1 - clf.oob_error_, 2)
        if acc not in missing_branch_dict:
            missing_branch_dict[acc] = 1
        else:
            missing_branch_dict[acc] += 1

    k = sorted(missing_branch_dict.items(), key=lambda x: x[0])

    plt.bar(range(len([i[0] for i in k])), [i[1] for i in k])
    pos = np.arange(len(k))
    width = 1.0  # gives histogram aspect to the bar diagram
    ax = plt.gca()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels([round(i[0], 2) for i in k])
    ax.set_yticks(range(0, 50, 5))

    plt.xlabel('accuracy with missing branch = ' + str(missing_branch))

    if title is not None:
        plt.title(title)
    plt.ylabel('frequency')

    plt.show()
Example #8
# Assumes `data` was loaded beforehand and that numpy (np), random,
# randomForest and decisionTree are imported at module level.
test_X = data['test_data']
train_y = (data['training_labels'].T)[:, 0]
train_X = data['training_data']
x_y = list(zip(train_X, train_y))
random.shuffle(x_y)
train_X = np.array([e[0] for e in x_y])
train_y = np.ravel([e[1] for e in x_y])
validation_X = train_X[:2000, :]
validation_y = train_y[:2000]
train_X = train_X[2000:, :]
train_y = train_y[2000:]
print(train_X.shape)

# random forest

rf = randomForest.RandomForest(10, 10, train_X.shape[0], train_X.shape[1])
rf.train(train_X, train_y)
res = rf.predict(validation_X)

score = 0
for i in range(len(res)):
    if res[i] == validation_y[i]:
        score += 1
score /= len(res)
print(score)
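
# Equivalent vectorized accuracy (a sketch; assumes res is array-like):
# score = np.mean(np.array(res) == np.array(validation_y))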

# decision tree

tree = decisionTree.DecisionTree(10, train_X.shape[1])
tree.train(train_X, train_y)
res = tree.predict(validation_X)
Example #9
def feature_selection_threshold(X,
                                y,
                                ntrees,
                                replace,
                                mtry,
                                max_depth,
                                missing_branch,
                                balance,
                                cutoff,
                                ntimes=25,
                                title=None,
                                missing_rate=False,
                                vitype='err',
                                vimissing=True,
                                backwards=False,
                                save_models=False,
                                random_subspace=False):

    # get average feature importances for each feature
    vis = average_varimp(X,
                         y,
                         ntrees,
                         replace,
                         mtry,
                         max_depth,
                         missing_branch,
                         balance=balance,
                         missing_rate=missing_rate,
                         ntimes=ntimes,
                         select=False,
                         mean=False,
                         vitype=vitype,
                         vimissing=vimissing,
                         printvi=False,
                         random_subspace=random_subspace)
    # if backwards is True, the feature selection starts with all features and
    # eliminates the least important ones in each step
    if (backwards is True):
        reverse = False
        comp_threshold = lambda x, y: x <= y
        get_slice = lambda x, index: x[index:]
        stop_index = -1
        chosen_model = -1
    # if it's False, it starts with only the most important feature and adds
    # the next most important one in each step
    else:
        reverse = True
        comp_threshold = lambda x, y: x > y
        get_slice = lambda x, index: x[0:index]
        stop_index = 0
        chosen_model = 0
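    # Example: with four features ordered by importance and stop_index = 2,
    # backwards=True sorts ascending and keeps ordered_features[2:] (dropping
    # the two least important), while backwards=False sorts descending and
    # keeps ordered_features[0:2] (the two most important).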

    ordered_features = [
        a[0] for a in sorted(vis, key=lambda x: np.mean(x[1]), reverse=reverse)
    ]
    thresholds = [np.mean(vis[a][1]) for a in ordered_features]
    threshold_values = sorted([round(a, 10) for a in set(thresholds)],
                              reverse=reverse)

    stop_indexes = []
    scores = []
    i = 0
    nn = 0
    # for each threshold value (feature importance value), create a forest
    # using only: (a) features whose importance value is <= the threshold,
    # if backwards is True (starting from the least important), or
    # (b) features whose importance value is > the threshold, if backwards is
    # False (starting from the most important one)
    for threshold in threshold_values:

        s_index = stop_index + 1
        while s_index < len(thresholds):
            if (comp_threshold(threshold, thresholds[s_index])):
                break
            else:
                s_index += 1
        stop_index = s_index

        features = get_slice(ordered_features, stop_index)
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              random_state=seed,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              balance=balance,
                              cutoff=cutoff,
                              random_subspace=random_subspace)

        clf.fit(X[X.columns[features]], y)
        clf.threshold = threshold
        if ('participant code' not in clf.X.columns):
            clf.X['participant code'] = X['participant code']
        if ('Q44071_snCplexoAt' not in clf.X.columns):
            clf.X['Q44071_snCplexoAt'] = X['Q44071_snCplexoAt']
        scores.append(1 - clf.oob_error_)

        if (save_models is True):
            with open('prognostic_model_' + title + str(nn) + '.pickle',
                      'wb') as handle:
                pickle.dump(clf, handle)
            nn += 1
        stop_indexes.append(stop_index)

    stdm = sem(scores)  # standard error of the mean (scipy.stats.sem)
    indexes = np.where(
        np.array(scores) == scores[((np.abs(
            np.array([a for a in scores if a != max(scores)]) -
            (max(scores) - stdm))).argmin())])[0]
    # the forest whose score is closest to the max score minus the standard
    # error is chosen as the suggested model; chosen_model picks the first or
    # last such candidate depending on the selection direction
    index = indexes[chosen_model]

    clf = rf.RandomForest(ntrees=ntrees,
                          oob_error=True,
                          random_state=seed,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          balance=balance,
                          cutoff=cutoff,
                          random_subspace=random_subspace)

    #clf.attributes = attributes[get_slice(ordered_features,stop_indexes[index])]
    clf.fit(X[X.columns[get_slice(ordered_features, stop_indexes[index])]], y)

    #importance_values = [[round(np.mean(aa),10) for aa in a[1]] for a in vis if round(np.mean(a[1]),10) >= threshold_values[index]]
    #features =  attributes[[a[0] for a in vis if round(np.mean(a[1]),10) >= threshold_values[index]]]

    #plot.boxplot(importance_values,features,title)
    if (save_models is True):

        plot.plot_feature_importance_vs_accuracy(threshold_values,
                                                 scores,
                                                 xlabel='threshold',
                                                 title=title,
                                                 special=index)

    return clf
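
# A simplified sketch of the one-standard-error choice made in
# feature_selection_threshold above (the original also filters out the maximal
# scores before taking the argmin); one_se_index is a hypothetical helper:
import numpy as np
from scipy.stats import sem

def one_se_index(scores):
    target = max(scores) - sem(scores)
    return int(np.abs(np.array(scores) - target).argmin())
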
Example #10
def average_varimp(X,
                   y,
                   ntrees,
                   replace,
                   mtry,
                   max_depth,
                   missing_branch,
                   balance,
                   vitype='err',
                   vimissing=True,
                   ntimes=25,
                   select=True,
                   printvi=False,
                   plotvi=False,
                   cutpoint=0.0,
                   mean=False,
                   title=None,
                   missing_rate=False,
                   random_subspace=False):
    vi = {a: [] for a in range(X.shape[1])}
    # build at most `ntimes` forests, each with a fresh random seed
    for i in range(min(ntimes, X.shape[0])):
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              random_state=seed,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              balance=balance,
                              random_subspace=random_subspace)
        clf.fit(X, y)
        varimps = clf.variable_importance(vitype=vitype, vimissing=vimissing)
        for var in varimps.keys():
            if missing_rate:
                vi[var].append(varimps[var] *
                               utils.notNanProportion(X[X.columns[var]]))
            else:
                vi[var].append(varimps[var])

    vimean = {var: np.mean(vals) for var, vals in vi.items()}

    if (printvi):
        vis = sorted(vimean.items(), key=lambda x: x[1], reverse=True)
        for v, i in vis:
            print('feature: %r importance: %r' % (X.columns[v], i))

    if (plotvi):
        print(cutpoint)
        importance_values = []
        features = []
        vis = sorted(vi.items(), key=lambda x: x[0])
        for v, i in vis:
            if (vimean[v] >= cutpoint):
                importance_values.append(i)
                features.append(X.columns[v])
        import plot
        plot.boxplot(importance_values, features, title)

    if (select):
        vis = sorted(vimean.items(), key=lambda x: x[1], reverse=True)
        return sorted([var[0] for var in vis if vimean[var[0]] >= cutpoint])
    if (mean):
        return sorted(vimean.items(), key=lambda x: x[1], reverse=True)
        #return [var[0] for var in vis]

    return sorted(vi.items(), key=lambda x: x[0])
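
# Example call (a sketch with illustrative values; X is a pandas DataFrame of
# attributes and y the class vector, as elsewhere in this module):
# vis = average_varimp(X, y, ntrees=500, replace=False, mtry=math.sqrt,
#                      max_depth=None, missing_branch=True, balance=True,
#                      select=False, mean=False)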
Example #11
print('Testing Decision Tree score method...')

assert (m.score([['RAIN', 63, 50, 'T'], ['SUNNY', 66, 90, 'F'],
                 ['SUNNY', 50, 50, 'T'], ['OVERCAST', 70, 50, 'F']],
                ['PLAY', 'PLAY', 'PLAY', 'PLAY']) == accuracy_score(
                    m.predict([['RAIN', 63, 50, 'T'], ['SUNNY', 66, 90, 'F'],
                               ['SUNNY', 50, 50, 'T'],
                               ['OVERCAST', 70, 50, 'F']]),
                    ['PLAY', 'PLAY', 'PLAY', 'PLAY']))

print('Testing Random Forest...')

clf = rf.RandomForest(ntrees=8,
                      mtry=math.sqrt,
                      oob_error=True,
                      random_state=9,
                      missing_branch=False,
                      prob_answer=False,
                      max_depth=3,
                      replace=False)
clf.fit(X, y)
fcs = clf.feature_contribution()
#clf.forest[-1].to_dot(original_attributes,out='out.dot')

clf.forest[-1].to_pdf(original_attributes, out='out2.pdf')

print("Testing Random Forest with missing data...")
# data = read.readData(data_path = '../Dados/Test_with_nan2.csv', class_name='Class',
#     dummy=dummy,transform_numeric=transform,use_text = use_text,missing_input='none')

X = data[data.columns[:-1]].values
y = data['Class'].values