Example no. 1
    def run(self):
        forest = RandomForest()
        boost = GradientBoost()
        train_x, train_y, test_x, test_y, feature_names = self.load(
            "./data/parker_sleeps.csv")
        forest.train(train_x, train_y, feature_names)
        f_accuracy, forest_predictions = forest.predict(test_x, test_y)

        print("                 Results")
        print("--------- Random Forest Technique --------- \n\n")
        print("Feature number\tFeatures\tPrediction\tActual")

        for i in range(len(test_x)):
            print("{0}\t{1}\t{2}\t{3}".format(i, test_x.iloc[i].values,
                                              forest_predictions[i],
                                              test_y.iloc[i].values[0]))

        print("\nRandom Forest MSE: {:.4f}".format(f_accuracy))

        boost.train(train_x, train_y, feature_names)
        b_accuracy, gb_predictions = boost.predict(test_x, test_y)

        print("\n\n--------- Gradient Boost Technique --------- \n\n")
        print("Feature number\tFeatures\tPrediction\tActual")

        for i in range(len(test_x)):
            print("{0}\t{1}\t{2:.2f}\t{3}".format(i, test_x.iloc[i].values,
                                                  gb_predictions[i],
                                                  test_y.iloc[i].values[0]))

        print("\nGradient Boost MSE: {:.4f}".format(b_accuracy))
Example no. 2
class Stacking:

    def fit(self, X, y):
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

        newX = np.array([y_rf, y_nb, y_knn]).transpose()

        model = DecisionTree(max_depth=np.inf,
                             stump_class=DecisionStumpErrorRate)
        self.model = model

        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()

        return self.model.predict(x_test)
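A minimal usage sketch for the Stacking class above (hypothetical: it assumes the project's RandomForest, NaiveBayes, KNN, and DecisionTree implementations are importable, and that X_train, y_train, and X_test are NumPy arrays):

model = Stacking()
model.fit(X_train, y_train)     # fits RF, NB, and KNN, then the decision-tree meta-model on their stacked predictions
y_pred = model.predict(X_test)  # base-model predictions are stacked column-wise and fed to the meta-model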
Example no. 3
def main(argv):
    if len(argv) != 3:
        sys.exit(f"Usage python3 {argv[0]} <testing_file> <tree_json_dir>")

    rna_ids, _, test_x = parse_data.read_data(argv[1],
                                              skip_header=False,
                                              delimiter=",")

    json_files = glob.glob(f"{argv[2]}/*.json")
    forest = RandomForest()
    weights_filename = f"{argv[2]}/tree_weights.json"
    json_files.remove(weights_filename)
    with open(weights_filename, "r") as weights_file:
        weights = json.load(weights_file)
    for filename in json_files:
        with open(filename, "r") as tree_file:
            tree = DecisionTree.from_json(tree_file.read())
            forest.add_tree(tree)
        forest.weights.append(weights[filename])

    for i, x in enumerate(test_x):
        prediction, confidence = forest.predict_with_confidence(x)

        if prediction == 0.0:
            confidence = 1 - confidence

        print(f"{rna_ids[i]},{confidence}")
Example no. 4
def main():
    """
    N.B. Last DataFrame Column contains labels
    """
    logger = logging.getLogger(__name__)
    logger.debug('read data')

    dframe_train = pd.read_excel(os.path.join(input_filepath, "train_data.xlsx"), index_col=0)
    logger.debug('train model')

    # Create a single tree
    d_t = DecisionTree(metrics='entropy')  # max_depth=8
    # trained_dt = dt.build_tree(dframe, header)
    # prediction = classify(small_train.values[0][:-1], t0)

    # Create a random forest of trees d_t
    r_f = RandomForest(decision_tree_type=d_t, n_trees=20)
    r_f = r_f.build_forest(dframe_train, n_selected_features="best", sample_ratio=.8)

    # Get model accuracy on validation data
    logger.debug('get model accuracy')
    dframe_val = pd.read_excel(os.path.join(input_filepath, "validate_data.xlsx"), index_col=0)
    predictions_validation = r_f.get_model_accuracy(dframe_val.columns.values.tolist(), dframe_val)

    # logger.debug('single prediction')
    # rf.classify_forest(dframe_val.columns.values.tolist(), dframe_val.values[0], forest)

    logger.debug('save model')
    save_model(output_filepath, "model_00.npy", r_f)
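Example no. 5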
def run_random_forest(data, target_column):
    st.sidebar.title('Choose parameters for Random Forest')
    ts = st.sidebar.slider('Training size', min_value=0.0, max_value=1.0, step=0.01, value=0.7)
    n_estimators = st.sidebar.number_input('n_estimators', min_value=1, max_value=1000, step=1)
    n_features = st.sidebar.number_input('n_features', min_value=1, max_value=len(data.columns)-1, step=1, value=len(data.columns)-1)
    bootstrap_size = st.sidebar.number_input('bootstrap_size', min_value=1, max_value=int(len(data)*ts), step=1, value=int(len(data)*ts))
    if st.sidebar.checkbox('Specify Depth'):
        max_depth = st.sidebar.number_input('max_depth', min_value=1, max_value=int(len(data)*ts), step=1)
    else:
        max_depth = None
    run_status = st.sidebar.button('Run Algorithm')
    if run_status:
        with st.spinner('Running...'):
            x_train, x_test, y_train, y_test = train_test_split(data.drop([target_column], axis=1),
                                                                data[target_column],
                                                                test_size=1 - ts)
            clf = RandomForest(n_estimators=n_estimators,
                               n_features=n_features,
                               max_depth=max_depth,
                               bootstrap_size=bootstrap_size)
            clf.fit(x_train, y_train)
            """
            ## :dart: Accuracy
            """
            st.subheader(accuracy_score(y_test, clf.predict(x_test)))
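Example no. 6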
    def validate(self, folds, num_trees):

        attributes = folds[0][0]
        ok = 0
        error = 0

        for fold in folds:
            del fold[0]

        for i in range(len(folds)):
            folds_copy = copy.deepcopy(folds)
            test = copy.deepcopy(folds_copy[i])
            del folds_copy[i]
            training = sum(folds_copy, [])
            training.insert(0, attributes)

            forest = RandomForest(copy.deepcopy(training))
            trees = forest.get_forest(num_trees)

            for instance in test:
                real_class = instance[-1]
                del instance[-1]
                prediction = forest.predict(trees, instance)

                if prediction == real_class:
                    ok += 1
                else:
                    error += 1

        print("Total: " + str(ok + error))
        print("Correct: " + str(ok))
        print("Wrong: " + str(error))
        print("Percentage: " + str(ok / (ok + error)))
Example no. 7
def main(argv):
    """ entry point to the program """

    if len(argv) != 3:
        sys.exit(f"Usage python3 {argv[0]} <testing_file> <tree_json_dir>")

    _, test_y, test_x = parse_data.read_data(argv[1],
                                             skip_header=False,
                                             delimiter=",")

    json_files = glob.glob(f"{argv[2]}/*.json")
    forest = RandomForest()
    for filename in json_files:
        with open(filename, "r") as tree_file:
            tree = DecisionTree.from_json(tree_file.read())
            forest.add_tree(tree)

    total_right = 0
    for i, point in enumerate(test_x):
        predicted = forest.predict(point)
        if test_y[i] == predicted:
            total_right += 1

    accuracy = total_right / len(test_y)
    print(f"Accuracy: {accuracy}")
Example no. 8
def main():
    predictorRF = RandomForest()
    predictorSVM = SVM()
    predictorLogisticRegression = Logistic_Regression()
    print('========Random Forest============')
    predictorRF.run()
    print('========SVM============')
    predictorSVM.run()
    print('========Logistic Regression============')
    predictorLogisticRegression.run()
Example no. 9
def main():
    columns, x_train, y_train, x_test, y_test = preprocessing()
    random_forest_ID3 = RandomForest(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                                                  Criterion.ID3, np.vstack((x_train, x_test)), 10)
    decision_tree_ID3 = DecisionTreeClassifier(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                                             Criterion.ID3)

    random_forest_GINI = RandomForest(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                                                  Criterion.GINI, np.vstack((x_train, x_test)), 10)
    decision_tree_GINI = DecisionTreeClassifier(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                                             Criterion.GINI)
    decision_tree_ID3.set_attribute_values(np.vstack((x_train, x_test)))
    decision_tree_GINI.set_attribute_values(np.vstack((x_train, x_test)))
    validation = Validation(x_train, y_train, x_test, y_test)

    print('K-fold validation:\n\n')
    print('ID3 criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('GINI criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Final model: Random Forest\n')
    print('Final results:\n')
    final_measure = validation.final_measure(random_forest_ID3)
    print(f'Mean accuracy: {np.array(final_measure[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(final_measure[Measure.SPEC]).mean()}\n')

    print('\n\nAn example decision tree trained on all the data is available at out/resultat.txt')
    # Print a decision tree trained on all the data for visualization,
    # even though it is not the best model
    x_data = np.vstack((x_train, x_test))
    y_data = np.hstack((y_train, y_test))
    decision_tree_ID3.fit(x_data, y_data)
    write_to_file(decision_tree_ID3)
Example no. 10
def main(cl_args=sys.argv[1:]):
    """Main wrapper to run the classification app."""
    args = parse_command_line_args(cl_args=cl_args)
    datafile = os.path.realpath(args["data_path"])
    testfile = os.path.realpath(args["test_path"])
    output_root = os.path.realpath(args["output_path"])
    random_seed = args["random_seed"]
    info_level = "INFO"
    if args["verbose"]:
        info_level = "DEBUG"
    # Configure the logger.
    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
                        level=info_level)
    num_trees = args["num_trees"]
    if not os.path.exists(output_root):
        os.makedirs(output_root)

    # annotated training data
    data = fileio(datafile)
    # annotated test data
    testdata = fileio(testfile)

    # Invoking unit test for decision tree
    if args["run_tests"]:
        logger.info("Invoking test case for decision trees")
        suite = unittest.TestLoader().loadTestsFromTestCase(TestDecisionTree)
        unittest.TextTestRunner().run(suite)
        sys.exit()

    # Creating a set of test points from test data
    test_points = []
    for line in testdata:
        test_points.append(line[:-1])

    # Single decision tree for entire credit approval dataset
    tree_credit = DecisionTree()
    logger.info('Commencing single Decision Tree for credit approval data')
    start_time = time.time()
    tree_credit.build_tree(data)
    end_time = time.time()
    logger.info('time_lapsed: {:0.4f}'.format(end_time - start_time))
    tree_credit.drawtree(jpeg=os.path.join(output_root, 'singletree.png'))

    # Random forest of decision trees for the credit approval dataset
    logger.info('Commencing Random Forest for credit approval data')
    forest = RandomForest(num_trees)
    start_time = time.time()
    forest.build_forest(data, output_path=output_root, seed=random_seed)
    end_time = time.time()
    logger.info('Time to build forest: {:0.4f}'.format(end_time - start_time))

    evaluate(test_points, forest, testdata)
Example no. 11
def run_randomforest(train_examples, train_labels, attributes, test_examples,
                     test_labels, n_trees):
    amin, bim = get_data(train_examples, test_labels)
    rforest = RandomForest(entropy, 2, n_trees, len(attributes))
    if bim > 0:
        rforest.train_dataset(train_examples, attributes, train_labels)
    else:
        clabel = None
    error = 0
    if amin == 0:
        preds, error = rforest.test_dataset(test_examples, test_labels)

    return error
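Example no. 12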
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    print("X_train.shape:", X_train.shape)
    print("y_train.shape:", y_train.shape)

    n_features = X_train.shape[1]
    clf = RandomForest(n_estimators=100, max_features=int(np.sqrt(n_features)))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Our RF Accuracy:", accuracy)

    clf = ensemble.RandomForestClassifier(n_estimators=100,
                                          max_features=int(
                                              np.sqrt(n_features)))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("sklearn RF Accuracy:", accuracy)
Example no. 13
    def test_random_forest(self):
        iris = load_iris()

        X = iris['data']
        y = iris['target']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

        model = RandomForest(num_trees=5, max_depth=5)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        score = accuracy_score(y_test, preds)
        self.assertGreater(score, 0.9)
Example no. 14
def cross_validation(dataset,
                     attributes,
                     percentage_train,
                     folds,
                     ntrees,
                     nattributes=-1,
                     depth_limit=None):
    """
    :param dataset: the full dataset
    :param attributes: the attributes
    :param percentage_train: float, fraction of instances that goes to the train partition
    :param folds: int, number of holdouts to execute
    :param ntrees: int, number of trees for each ensemble
    :param nattributes: int, number of sampled attributes (if -1, use sqrt(total))
    :param depth_limit: int or None, maximum depth allowed for each tree
    :return: (mean, stdev) of accuracy, precision, and recall over the folds
    """
    if nattributes == -1:
        nattributes = int(math.sqrt(len(dataset.columns)))

    accuracies = []
    precisions = []
    recalls = []
    for fold in range(1, folds + 1):
        train_dataset, test_dataset = holdout(dataset, percentage_train)
        rf = RandomForest(train_dataset, attributes, ntrees, nattributes,
                          depth_limit)
        accuracy, precision, recall = test_RF(rf, test_dataset)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
    return (mean(accuracies), stdev(accuracies), mean(precisions),
            stdev(precisions), mean(recalls), stdev(recalls))
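A hypothetical invocation of cross_validation above (holdout, RandomForest, and test_RF are assumed to come from the same project; dataset is a pandas DataFrame whose columns include the attributes):

# 10 holdout repetitions, an 80/20 split, 50 trees per forest,
# and the number of sampled attributes left at its sqrt(total) default
acc_m, acc_sd, prec_m, prec_sd, rec_m, rec_sd = cross_validation(
    dataset, attributes, percentage_train=0.8, folds=10, ntrees=50)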
Example no. 15
    def __init__(self, train_size=0.7):
        self.predictors = [
            MyLogisticRegression(scale_data=True),
            RandomForest(6, 3, 200),
            SVC()]
        self.blender = MyKNN(type='classifier', use_weights=False)  # DecisionTree(2, 1)
        self.train_size = train_size
Example no. 16
def evaluate_performance(trials=100):
    filename = '../data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = data[:, 0]
    n, d = X.shape

    accuracy_list = np.zeros(trials)
    precision_list = np.zeros(trials)
    recall_list = np.zeros(trials)

    for trial in range(trials):
        print('Trial #', trial)
        # Random shuffle
        idx = np.arange(n)
        np.random.seed(int(time() / 150))
        np.random.shuffle(idx)
        X = X[idx]
        Y = y[idx]

        # Split Train and Test samples
        train = int(0.8 * n)
        Xtrain = X[:train, :]
        Xtest = X[train:, :]
        Ytrain = Y[:train]
        Ytest = Y[train:]

        clf = RandomForest(n_trees=30,
                           max_depth=100,
                           ratio_per_tree=0.7,
                           ratio_features=0.3)
        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)

        accuracy_list[trial] = accuracy_score(Ytest, pred)
        precision_list[trial] = precision(Ytest, pred)
        recall_list[trial] = recall(Ytest, pred)

    stats = np.zeros((3, 3))
    stats[0, 0] = np.mean(accuracy_list)
    stats[0, 1] = np.std(accuracy_list)
    stats[1, 0] = np.mean(precision_list)
    stats[1, 1] = np.std(precision_list)
    stats[2, 0] = np.mean(recall_list)
    stats[2, 1] = np.std(recall_list)
    return stats
Example no. 17
    def run(self) -> None:
        """Creates a random forest model, trains it, and saves it."""

        model = RandomForest(self.window_args, self.forest_args)

        save_model(self.name, model)

        self.signals.float_result.emit(model.accuracy)
Example no. 18
    def fit(self, X, y):
        # instantiate the input models
        rf = RandomForest(num_trees=15)
        knn = KNN(k=3)
        nb = NaiveBayes(num_classes=2)

        # Random Forest fit and predict
        rf.create_splits(X)
        rf.fit(X, y)
        rf_pred = rf.predict(X)

        # K-Nearest Neighbors fit and predict
        knn.fit(X, y)
        knn_pred = knn.predict(X)

        # Naive Bayes fit and predict
        nb.fit(X, y)
        nb_pred = nb.predict(X)

        # use predictions from input models as inputs for meta-classifiers
        meta_input = np.column_stack((rf_pred, knn_pred, nb_pred))

        # use Decision Tree as meta-classifier
        dt = DecisionTree(max_depth=np.inf)
        dt.fit(meta_input, y)

        self.rf = rf
        self.knn = knn
        self.nb = nb
        self.meta_classifier = dt
Example no. 19
    def run_forest(n_trees=400, n_features_per_tree=70, n_rows_power=0.5):
        start_time = time.time()
        forest = RandomForest(trainset, labels, n_trees, n_features_per_tree,
                              n_rows_power)
        forest.train()
        score = []
        for test, (i, bm) in zip(testset, benchmarkset):
            result = forest.classify(test)
            result = result.most_common(1)[0][0]
            print('{:.1f}s -- No.{} should be: <{}> cal: {}'.format(
                time.time() - start_time, i, bm, result))
            if result == bm:
                score.append('+')
            else:
                score.append('-')
            with open('result_by_self_train[28000].csv', 'a') as f:
                f.write('{},{},<{}>\n'.format(i, result, bm))
        # print('classify done {}, {:.1%}'.format(Counter(score), Counter(score)['+'] / len(score)))

        return Counter(score)['+'] / len(score)
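Example no. 20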
    def fit_emotion(self, emotion_number, X, y):
        binary_y = binarize_y(y, emotion_number)
        tree = Tree()

        if self.random_forest:
            tree = RandomForest(num_of_trees=self.num_of_trees)

        tree.fit(self.predictors, X, binary_y)

        return tree
Example no. 21
def init_model(model_type, delta, area_width):
    if model_type == 'LR':
        return LogisticRegression(delta, area_width)
    elif model_type == 'DT':
        return DecisionTree(delta)
    elif model_type == 'RF':
        return RandomForest(delta)
    else:
        raise SolverException('Invalid model type: ' + Fore.MAGENTA +
                              model_type + Fore.RESET)
Example no. 24
    def run(self):

        logger.info('Select tickers')
        tickers = self.choose_ticker.random_tickers(self.number_of_tickers)
        logger.info(f'The selected tickers are {tickers}')
        for ticker in tickers:
            # for ticker in ['MCHP']:
            logger.info(f'Starting with ticker {ticker}')
            data = self.data_loader.load(ticker)
            self.graph_maker.plot_adjusted_prices(ticker, data)
            data = self.technical_analysis.calculate(data)
            logger.info('Technical analysis graphs')
            train, test, validation, train_graph, test_graph, validation_graph = self.data_loader.transform(
                data, number_of_past_points=self.number_of_past_points)
            self.graph_maker.plot_train_test_val(
                ticker, train_graph, test_graph, validation_graph)

            for position, model_name in enumerate(
                    [element.get('model') for element in self.models_and_parameters]):

                if model_name == 'feed_forward_neural_net':
                    model = FeedForwardNN(
                        dimension_of_first_layer=self.number_of_past_points * train[0][0].shape[1],
                        ticker=ticker,
                        overfitting_threshold=self.overfitting_threshold,
                    )
                elif model_name == 'random_forest':
                    model = RandomForest(
                        ticker=ticker, overfitting_threshold=self.overfitting_threshold)
                elif model_name == 'xgboost':
                    model = XGBoost(
                        ticker=ticker,
                        overfitting_threshold=self.overfitting_threshold)
                elif model_name == 'Arima':
                    model = Arima(
                        ticker=ticker,
                        overfitting_threshold=self.overfitting_threshold)
                elif model_name == 'regressors':
                    model = Regressors(
                        ticker=ticker,
                        overfitting_threshold=self.overfitting_threshold)
                best_parameters, mse, trend_ratio, prediction, true_values, there_is_a_best_prediction = model.run(
                    train=train[::-1], val=validation[::-1], test=test[::-1], model_parameters=self.models_and_parameters[position])
                logger.info(
                    f'The best scenario for a {model_name} is {best_parameters}, mse: {mse},'
                    f' ratio of trend {trend_ratio*100}')
                if there_is_a_best_prediction:
                    self.graph_maker.plot_test_results(true_values, prediction, ticker, mse, model_name)
                else:
                    logger.info(
                        f'No model could be fitted for {model_name} due to the '
                        f'overfitting threshold of {self.overfitting_threshold}')
Example no. 25
def test_random_forest():
    goal_attr = 'play'
    attr = 'wind'
    attr_universe = ['strong', 'weak']

    attr_2 = 'weather'
    attr_2_universe = ['sunny', 'cloudy', 'rainy']

    attr_3 = 'temperature'
    attr_3_universe = ['cold', 'norm', 'hot']

    attr_4 = 'humidity'
    attr_4_universe = ['norm', 'high']

    df = {
        goal_attr: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
        attr: ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak'],
        attr_2: ['sunny', 'sunny', 'cloudy', 'rainy', 'rainy', 'rainy', 'cloudy', 'sunny', 'sunny', 'rainy', 'sunny', 'cloudy', 'cloudy'],
        attr_3: ['hot', 'hot', 'hot', 'norm', 'cold', 'cold', 'cold', 'norm', 'cold', 'norm', 'norm', 'norm', 'hot'],
        attr_4: ['high', 'high', 'high', 'high', 'norm', 'norm',
                 'norm', 'high', 'norm', 'norm', 'norm', 'high', 'norm']

    }

    df = pd.DataFrame(df)
    forest = RandomForest(shanon_gain)
    forest.train(10, 1, 4, df, {goal_attr: [0, 1]}, {
                 attr: attr_universe, attr_2: attr_2_universe, attr_3: attr_3_universe, attr_4: attr_4_universe})

    case = {
        attr: 'strong',
        attr_2: 'rainy',
        attr_3: 'norm',
        attr_4: 'high'
    }

    result = forest.predict(case)
    expected = 0

    assert result == expected
Example no. 26
class TestModelMethods(unittest.TestCase):
    ''' Test the methods of the random forest model wrapper. '''

    def setUp(self):
        ''' Test runner setup.
        '''

        self.rf = RandomForest()
        self.rf.model_file = 'testModel.pth'

    def testDataVariables(self):
        ''' Test whether feature and target variables are defined in the model.
        '''

        self.assertGreater(len(self.rf.features), 0)
        self.assertGreater(len(self.rf.target), 0)

    def testDataLoading(self):
        ''' Test the data loading.
        '''

        for dataset in ['train', 'val', 'test', 'full']:
            self.assertGreater(len(self.rf.data[dataset]), 0)
            self.assertGreater(len(self.rf.samples[dataset]), 0)
            self.assertGreater(len(self.rf.labels[dataset]), 0)

    def testSaveExistingModel(self):
        ''' Test the save method for an existing (untrained) model.
        '''

        self.rf.model = RandomForestRegressor()
        self.assertTrue(self.rf._saveModel())
    
    def testSaveNonExistingModel(self):
        ''' Test the save method for a non-existing model.
        '''

        self.rf.model = None
        self.assertFalse(self.rf._saveModel())

    def testLoadExistingModel(self):
        ''' Test the load method for a previously saved model.
        '''

        self.rf._saveModel()
        self.assertTrue(self.rf._loadModel())

    def testLoadNonExistingModel(self):
        ''' Test the load method for a non-existing model path.
        '''

        self.rf.model_file = 'non-existing/path.x'
        self.assertFalse(self.rf._loadModel())
Example no. 27
def testRF():
    train_x, train_y, dev_x, dev_y, test_x, test_y = util.loadTreeData()
    RF = RandomForest(min_leaf_size=50, max_depth=10, num_trees=10)
    RF.fit(train_x, train_y)
    preds = RF.predict(train_x)
    train_rf_rmse = util.findRMSE(preds, train_y)
    preds = RF.predict(test_x)
    test_rf_rmse = util.findRMSE(preds, test_y)
    print('RF RMSE: \t', test_rf_rmse)
    return train_rf_rmse, test_rf_rmse
Example no. 28
def main():
    parser = argparse.ArgumentParser(description='Random Forest parser')
    parser.add_argument('--opt', help='test-benchmark or test-dataset.', required=True)
    parser.add_argument('--dataset', help='The dataset filename.', default='', required=False)
    parser.add_argument('--target_attribute', help='Target attribute to be predicted.', default='', required=False)
    parser.add_argument('--n_trees', help='The number of trees. The default is 5.', default=5, type=int, required=False)
    parser.add_argument('--n_attributes', help='The number of attributes. The default is the square root of the total attributes.', default=-1, type=int, required=False)
    parser.add_argument('--k_folds', help='The number of folds for cross validation. The default is 5', default=5, type=int, required=False)
    parser.add_argument('--r', help='The number of repetitions for repeated cross validation. The default is 1', default=1, type=int, required=False)
    args = parser.parse_args()

    if args.opt == 'test-benchmark':
        test_benchmark_categorical()
        test_benchmark_numerical()

    if args.opt == 'test-dataset':
        if args.dataset == '' or not os.path.isfile(DATA_PATH + args.dataset):
            print('Dataset not found.')
            return

        try:
            with open(DATA_PATH + args.dataset[:-3] + 'json', 'r') as filetypes:
                types = json.load(filetypes)
        except (FileNotFoundError, json.JSONDecodeError):
            print('Dataset types not found, automatic types will be used.')
            types = {}

        data = pd.read_csv(
            DATA_PATH + args.dataset,
            delimiter='\t' if args.dataset[-3:] == 'tsv' else ',',
            dtype=types
        )

        if args.target_attribute not in data.columns:
            print("Target attribute doesn't exist on dataset.")
            return

        n_trees = args.n_trees
        n_random_attributes = args.n_attributes
        if n_random_attributes == -1:
            n_random_attributes = int((len(data.columns) - 1) ** 0.5)  # square root of the attribute count

        cv = CrossValidator(
            RandomForest(n_trees, args.target_attribute, n_random_attributes)
        )
        cv.cross_validate(data, args.k_folds, args.r)
        print('\nGlobal accuracy: %.3f (%.3f)' % (cv.accuracy, cv.accuracy_std))
Example no. 29
    def construct_RF(self):
        full_dataset = self.df
        difficult_area = self.create_difficult_area()
        critical_dataset = self.df.iloc[difficult_area]

        # Split the self.s trees between the two datasets: a (1 - p) share is
        # trained on the full data and a p share on the difficult region.
        rf1 = RandomForest(full_dataset, 5, self.s * (1 - self.p))
        trees_1 = rf1.construct_trees()
        rf2 = RandomForest(critical_dataset, 5, self.s * self.p)
        trees_2 = rf2.construct_trees()
        trees = trees_1 + trees_2
        return trees
Example no. 30
def main():
    X = list()
    y = list()
    XX = list()  # Contains data features and data labels
    numerical_cols = {0, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20}  # indices of numeric attributes (columns)

    # Loading data set
    print("reading input data")
    with open("data.csv") as f:
        next(f, None)

        for line in csv.reader(f, delimiter=","):
            xline = []
            for i in range(len(line)):
                if i in numerical_cols:
                    xline.append(ast.literal_eval(line[i]))
                else:
                    xline.append(line[i])

            X.append(xline[:-1])
            y.append(xline[-1])
            XX.append(xline[:])

    # VERY IMPORTANT: Minimum forest_size should be 10
    forest_size = 10
    
    # Initializing a random forest.
    randomForest = RandomForest(forest_size)

    # Creating the bootstrapping datasets
    print("creating the bootstrap datasets")
    randomForest.bootstrapping(XX)

    # Building trees in the forest
    print("fitting the forest")
    randomForest.fitting()

    # Calculating an unbiased error estimation of the random forest
    # based on out-of-bag (OOB) error estimate.
    y_predicted = randomForest.voting(X)

    # Comparing predicted and true labels
    results = [prediction == truth for prediction, truth in zip(y_predicted, y)]

    # Accuracy
    accuracy = float(results.count(True)) / float(len(results))

    print("accuracy: %.4f" % accuracy)
    print("OOB estimate: %.4f" % (1-accuracy))
Example no. 31
def train_model(model_name, X_train, Y_train, X_val, Y_val):
    """
    Trains a supervised classifier using the training data provided, and scores
    it using the validation dataset.

    Param:
        - model_name: a string containing a model type
        - Train data:
            - X_train
            - Y_train
        - Validation data:
            - X_val
            - Y_val

    Return:
        - model: a supervised classifier, to be used for testing
    """

    if model_name == 'svm':
        model = SVM()
    elif model_name == "random_forest":
        max_depth = 2
        model = RandomForest(max_depth)
    elif model_name == "neural_network":
        max_depth = 2
        out_size = 2
        hidden_size = 30
        in_size = X_train.shape[1]
        model = NeuralNetwork(in_size, hidden_size, out_size)
    elif model_name == "knn":
        n_neighbors = 50
        model = KNearestNeighbors(n_neighbors)
    else:
        raise ValueError("Model not yet implemented: " + model_name)

    print("Training " + model_name + "...")

    train_score = model.train(X_train, Y_train)
    valid_score = model.score(X_val, Y_val)

    print("Training Accuracy: %s" % train_score)
    print("Validation Accuracy: %s" % valid_score)

    return model
Example no. 32
booster.store_classification_result(out_file)
print('Extreme Gradient Boosting Completed. Labels saved. Moving Forward')

############## PART 3 #################
# 1. Xgboost - Gradient Boosting Machine
print('Beginning Extreme Gradient Boosting...')
out_file = './data_final/part_3/xgboost.csv'
'''Same Code as above'''
booster.store_classification_result(out_file)
print('Extreme Gradient Boosting Completed. Labels saved. Moving Forward')

# 2. Random Forest - Entropy Index
print('Beginning Random Forest Classification (ENTROPY BASED) ...')
out_file = './data_final/part_3/rf_entropy.csv'
Kfold = 10
forest = RandomForest(Kfold, dataDir, 'entropy')
train_acc = forest.get_train_accuracy()
print("Train Accuracy Random Forest (Entropy): %f\n" % train_acc)
forest.store_classification_result(out_file)
print('Random Forest Classification Completed. Labels saved. Moving Forward.')


# 3. Random Forest - Gini Index
print('Beginning Random Forest Classification (GINI BASED) ...')
out_file = './data_final/part_3/rf_gini.csv'
Kfold = 10
forest = RandomForest(Kfold, dataDir, 'gini')
train_acc = forest.get_train_accuracy()
print("Train Accuracy Random Forest (Gini): %f\n" % train_acc)
forest.store_classification_result(out_file)
print('Random Forest Classification Completed. Labels saved. All done.')
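Example no. 33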
def apply_random_forest(filename):
    rf = RandomForest(filename, estimators=500)

    #rf.split_data()
    rf.start_data()
    rf.train_random_forest()
    rf.test_random_forest()

    rf.switch_folds()
    rf.train_random_forest()
    rf.test_random_forest()
Example no. 34
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Rescale label for Adaboost to {-1, 1}
rescaled_y_train = 2*y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2*y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf=8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print("Training:")
print("\tAdaboost")
adaboost.fit(X_train, rescaled_y_train)
print("\tNaive Bayes")
naive_bayes.fit(X_train, y_train)
print("\tLogistic Regression")
logistic_regression.fit(X_train, y_train)
print("\tMultilayer Perceptron")
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
print("\tPerceptron")
Example no. 35
def main():
    ##### Part 1: Vanilla Logistic Regression ##### Private Score : 50%
    # USE SHOBHIT's CODE
    ##### Part 2: XGBoost
    '''
    Kfold = 5
    dataDir = "data"
    booster = Xgboost(Kfold, dataDir)
    train_acc = booster.get_train_accuracy()
    print("Train Accuracy: %f\n" % train_acc)
    booster.store_classification_result()
    '''
    ##### Part 3: Ensembler
    print('Beginning Ensembling....')
    train_file = 'data/train.csv'
    test_file = 'data/test.csv'
    dataDir = 'data'

    # 1. Logistic Regressor - balanced - Sparse Greedy Features
    print('Beginning Logistic Regression ...')
    stupid_train_out_file = './data/results/lr_unbalanced_sparse_train.csv'
    out_file = './data/ensemble_results/lr_unbalanced_sparse.csv'

    clf = LogRes(train_file, test_file, stupid_train_out_file, out_file)
    clf.feature_selection(64, [3])
    print('Logistic Regression Completed. Labels saved. Moving Forward.')

    # 2. Random Forest - Entropy Index
    print('Beginning Random Forest Classification (ENTROPY BASED) ...')
    out_file = './data/ensemble_results/rf_entropy.csv'
    Kfold = 10
    forest = RandomForest(Kfold, dataDir, 'entropy')
    train_acc = forest.get_train_accuracy()
    print("Train Accuracy Random Forest (Entropy): %f\n" % train_acc)
    forest.store_classification_result(out_file)
    print('Random Forest Classification Completed. Labels saved. Moving Forward.')

    '''
    # 3. Random Forest - Gini Index
    print('Beginning Random Forest Classification (GINI BASED) ...')
    out_file = './data/ensemble_results/rf_gini.csv'
    Kfold = 10
    forest = RandomForest(Kfold, dataDir, 'gini')
    train_acc = forest.get_train_accuracy()
    print("Train Accuracy Random Forest (Gini): %f\n" % train_acc)
    forest.store_classification_result(out_file)
    print('Random Forest Classification Completed. Labels saved. Moving Forward.')

    # 4. Extra Trees - Entropy Index
    print('Beginning Extremely Randomized RF i.e. Extra Trees (ENTROPY BASED)...')
    out_file = './data/ensemble_results/extra_forest_entropy.csv'
    Kfold = 10
    trees = ExtraTrees(Kfold, dataDir, 'entropy')
    train_acc = trees.get_train_accuracy()
    print("Train Accuracy Extra Trees (Entropy): %f\n" % train_acc)
    trees.store_classification_result(out_file)
    print('Extra Trees (ENTROPY) Completed. Labels saved. Moving Forward.')

    # 5. Extra Trees - Gini Index
    print('Beginning Extremely Randomized RF i.e. Extra Trees (GINI BASED)...')
    out_file = './data/ensemble_results/extra_forest_gini.csv'
    Kfold = 10
    trees = ExtraTrees(Kfold, dataDir, 'gini')
    train_acc = trees.get_train_accuracy()
    print("Train Accuracy Extra Trees (Gini): %f\n" % train_acc)
    trees.store_classification_result(out_file)
    print('Extra Trees (GINI) Completed. Labels saved. Moving Forward.')
    '''
    
    # 6. Xgboost - Gradient Boosting Machine
    print('Beginning Extreme Gradient Boosting...')
    out_file = './data/ensemble_results/xgboost.csv'
    Kfold = 7
    booster = Xgboost(Kfold, dataDir)
    train_acc = booster.get_train_accuracy()
    print("Train Accuracy: %f\n" % train_acc)
    booster.store_classification_result(out_file)
    print('Extreme Gradient Boosting Completed. Labels saved')

    print('All models trained and test labels saved')
    print('All done... Exiting...')
    print('Exited')
Example no. 36

# Load data
cur_states, actions, rewards, next_states, users, action_index, user_index, valid_feats = load_data(feat_path, target_action, num_users=num_users, num_users_ratio=num_users_ratio, demography=demography)
print(cur_states.shape, next_states.shape, actions.shape, rewards.shape, users.shape, len(action_index), len(user_index), sum(valid_feats))
# dim: cur_states,next_states = num_instances x num_features 
# dim: actions,rewards = num_instances

num_feats = cur_states.shape[1]
num_actions = len(action_index)
action_list = [0 for x in range(num_actions)]
for a, i in action_index.items():
    action_list[i] = a


s0 = np.zeros((1, num_feats))
approximator = RandomForest(num_estimators=num_estimators, num_actions=num_actions)
approximator.train(cur_states, actions, rewards)
for iteration in range(num_iters):
    print("---------------------------------------------------------------\nIteration", iteration)
    start_time = time.time()
    
    qs = approximator.predict(next_states)  # dim: num_actions x num_instances
    max_qs = np.amax(qs, axis=0)  # dim: num_instances
    max_qs[actions==action_index[target_action]] = 0
    approximator.train(cur_states, actions, rewards + discount * max_qs)
    
    if debug_action_cnt:
        max_as = np.argmax(qs, axis=0)
        action_cnt = defaultdict(int)
        for a in max_as: action_cnt[action_list[a]] += 1
        print("action count: ", action_cnt)
Example no. 37
"""
Copyright Brian Dolhansky 2014
[email protected]
"""

from random_forest import RandomForest
from sklearn.datasets import fetch_mldata
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print("Loading data...")
mnist = fetch_mldata('MNIST original', data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(mnist.data,
                                                                    mnist.target)
train_target = integral_to_indicator(train_target)
test_target_integral = integral_to_indicator(test_target)
print("Done!")

np.seterr(all='ignore')

print("Training random forest...")
rf = RandomForest(21, 10, 10, boot_percent=0.3, feat_percent=0.1, debug=False)
rf.train(train_data, train_target)
print("Done training!")

print("Testing...")
yhat = rf.test(test_data, test_target_integral)
err = (np.sum(yhat != test_target[:, None]).astype(float)) / test_target.shape[0]
print("Error rate: {0}".format(err))

print("Done!")