Example #1
def classifier_list():
    clfs = {}
    ### Forests
    clfs['grf'] = RandomForestClassifier(n_jobs=4, criterion='gini')
    clfs['erf'] = RandomForestClassifier(n_jobs=4, criterion='entropy')
    clfs['etr'] = ExtraTreesClassifier()
    ### Boosting
    # clfs['gbc']     = GradientBoostingClassifier()
    # clfs['ada']     = AdaBoostClassifier()
    # clfs['bag']     = BaggingClassifier()
    ### SVM
    clfs['lsvm'] = LinearSVC()
    # clfs['qsvm']    = SVC(probability=True, kernel='poly', degree=2)  # Slow
    # clfs['psvm']    = SVC(probability=True, kernel='poly', degree=3)  # Slow
    # clfs['ssvm']    = SVC(probability=True, kernel='sigmoid')         # Slow
    # clfs['rsvm']    = SVC(probability=True, kernel='rbf')             # Slow
    ### Naive Bayes
    # clfs['gnb']     = GaussianNB()      # Worst
    clfs['bnb'] = BernoulliNB()  # Good
    clfs['mnb'] = MultinomialNB()  # Best
    # ### Decision Tree (CART)
    clfs['gdt'] = DecisionTreeClassifier(criterion='gini')
    clfs['edt'] = DecisionTreeClassifier(criterion='entropy')
    clfs['egt'] = ExtraTreeClassifier(criterion='gini')
    clfs['eet'] = ExtraTreeClassifier(criterion='entropy')
    return clfs
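A minimal usage sketch (not part of the original source): iterating over the returned dictionary and scoring each model. X_train, y_train, X_test and y_test are hypothetical arrays, and the imports simply mirror the estimators referenced above.

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

for name, clf in classifier_list().items():
    clf.fit(X_train, y_train)                # X_train, y_train: hypothetical training data
    print(name, clf.score(X_test, y_test))   # mean accuracy on hypothetical held-out data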
Example #2
    def __init__(self, data, target):
        self.data = data
        if len(target.shape) == 2:
            # Convert 2-dim target array into 1-dim target array
            self.target = target.reshape(target.shape[0])
        else:
            self.target = target

        self.training_data = None
        self.training_target = None
        self.test_data = None
        self.test_target = None

        # Construct the Tier-1 (base) classifiers
        self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
        self.Tier1_classifier2 = MultinomialNB()
        self.Tier1_classifier3 = LinearSVC(penalty="l2")
        self.Tier1_classifier4 = ExtraTreeClassifier()
        # self.Tier1_classifier5 = SGDClassifier(max_iter=1000, tol=1e-3)

        # Construct Tier-2 (meta) classifier
        # self.meta_classifier = LogisticRegression(solver="lbfgs")
        # self.meta_classifier = MultinomialNB()
        # self.meta_classifier = LinearSVC(penalty = "l2")
        self.meta_classifier = ExtraTreeClassifier()
Example #3
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    # NOTE: fit_transform is re-fitted per column and again on the test frame, so the
    # integer codes are only consistent if train and test contain the same categories.
    for datum in data:
        newData[datum] = le.fit_transform(data[datum])
    for testItem in test:
        newTest[testItem] = le.fit_transform(test[testItem])
    tree1 = DecisionTreeClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity,
                                max_depth=mdepth,
                                random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)
    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))
    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
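The helpers evaluate(), voting() and the validation_genres labels above are module-level objects from the original project and are not shown here. One plausible, purely hypothetical shape for the voting helper is a weighted majority vote across the per-model prediction arrays:

import numpy as np
from collections import Counter

def voting(predictions, weights):
    # predictions: list of equal-length label arrays; weights: one weight per model
    combined = []
    for votes in zip(*predictions):
        tally = Counter()
        for label, weight in zip(votes, weights):
            tally[label] += weight
        combined.append(tally.most_common(1)[0][0])  # label with the highest total weight
    return np.array(combined)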
Example #4
def evaluate_optimal_classifier(features, classes):
    # Obtain the classifier for the current experiment
    et = ExtraTreeClassifier(criterion='entropy', max_leaf_nodes=1200,
                             min_samples_leaf=1, max_depth=7, random_state=5)
    classifier = AdaBoostClassifier(base_estimator=et, n_estimators=500,
                                    algorithm='SAMME', learning_rate=0.1,
                                    random_state=5)

    # Split the data set into training and test set
    train_X, test_X, train_Y, test_Y = train_test_split(features, classes, test_size=0.2)

    # Fit the training data to the model
    classifier.fit(train_X, train_Y)

    # Predict the classes of the test set
    predicted_classes = classifier.predict(test_X)

    # Compute the confusion matrix
    matrix = confusion_matrix(test_Y, predicted_classes)
    print("Confusion matrix:")
    print(matrix)

    # Compute the weighted F1-Score
    f1_measure = f1_score(test_Y, predicted_classes, average="weighted")
    print("Weighted F1-Score: {:0.3f}".format(f1_measure))

    # Compute a classification report
    report = classification_report(test_Y, predicted_classes)
    print("Classification report:")
    print(report)
Example #5
def init_classifiers(seed):
    return {
        'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
        'BaggingClassifier': BaggingClassifier(random_state=seed),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=seed),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=seed),
        'RandomForestClassifier': RandomForestClassifier(random_state=seed),
        'XGBClassifier': xgb.XGBClassifier(),
        'LogisticRegression': LogisticRegression(random_state=seed),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(random_state=seed),
        'RidgeClassifier': RidgeClassifier(random_state=seed),
        'RidgeClassifierCV': RidgeClassifierCV(),
        'SGDClassifier': SGDClassifier(random_state=seed),
        # 'KNeighborsClassifier': KNeighborsClassifier(),
        # 'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'MLPClassifier': MLPClassifier(random_state=seed),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=seed),
        'ExtraTreeClassifier': ExtraTreeClassifier(random_state=seed)
    }
Example #6
def main():
    np.random.seed(20)

    def scorer(est, x, y):
        y_hat = est.predict(x)
        return classification.accuracy_score(y, y_hat)

    #x, y = make_classification(n_samples=1000, n_classes=4, n_informative=10)
    x, y = fetch_kddcup99(return_X_y=True)
    x = np.array(x[:, 4:], dtype=np.float32)
    y = preprocessing.LabelEncoder().fit_transform(y)
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    myclf = Mree(split_method=greedy_classification)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy classification result", score, np.mean(score))
    myclf = Mree(split_method=greedy_classification_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy p@k classification result", score, np.mean(score))

    myclf = Mree(split_method=random_classify_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine random p@k classification result", score, np.mean(score))

    clf = DecisionTreeClassifier(max_depth=10,
                                 max_features=20,
                                 min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn greedy classification result", score, np.mean(score))
    clf = ExtraTreeClassifier(max_depth=10,
                              max_features=20,
                              min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("sklearn random classification result", score, np.mean(score))
Example #7
    def serialize_class(self):
        """
        Convert to hdf5
        """
        clf = SVC(C=3.0, kernel='poly', degree=5)
        clf = SVR()
        clf = LinearSVC(loss='hinge', tol=0.001, C=2.0)
        clf = LinearRegression(fit_intercept=True, n_jobs=2)
        clf = GaussianNB()
        clf = SGDClassifier(loss='hinge',
                            learning_rate='optimal',
                            alpha=0.0001)
        clf = KNeighborsClassifier(n_neighbors=6,
                                   weights='uniform',
                                   algorithm='ball_tree',
                                   leaf_size=32)
        #clf = RadiusNeighborsClassifier()
        clf = GradientBoostingClassifier(n_estimators=100)
        clf = ExtraTreeClassifier()
        clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
        clf = DecisionTreeRegressor()
        # clf = ExtraTreeRegressor()
        #clf = GradientBoostingClassifier(n_estimators=10)
        clf = AdaBoostClassifier(n_estimators=2)
        #clf = AdaBoostRegressor()
        #clf = BaggingClassifier()
        #clf = BaggingRegressor()
        #clf = ExtraTreesClassifier(n_estimators=1)
        #clf = ExtraTreesRegressor()
        #clf = RandomForestClassifier()
        classifier, X_test, y_test, X = self.train_model(clf)
        print("Serializing...")
        self.save_model(classifier)

        return X_test, y_test, classifier
Example #8
def GET_ALLKINDS_MODELS(prediction_type=None):
    if prediction_type == "C":
        return {
            "LR": LogisticRegression(),
            "LDA": LinearDiscriminantAnalysis(),
            "GNB": GaussianNB(),
            "KNC": KNeighborsClassifier(),
            "SVC": SVC(),
            "ETC": ExtraTreeClassifier(),
            "DTC": DecisionTreeClassifier(),
            "ETC_Ensemble": ExtraTreesClassifier(),
            "RFC_Ensemble": RandomForestClassifier(),
            "ABC_Ensemble": AdaBoostClassifier(),
            "GBC_Ensemble": GradientBoostingClassifier()
        }
    elif prediction_type == "R":
        return {
            "LR": LinearRegression(),
            "RIDGE": Ridge(),
            "LASSO": Lasso(),
            "EN": ElasticNet(),
            "KNR": KNeighborsRegressor(),
            "SVR": SVR(),
            "ETR": ExtraTreeRegressor(),
            "DTR": DecisionTreeRegressor(),
            "ETR_Ensemble": ExtraTreesRegressor(),
            "RFR_Ensemble": RandomForestRegressor(),
            "ABR_Ensemble": AdaBoostRegressor(),
            "GBR_Ensemble": GradientBoostingRegressor()
        }
    else:
        raise ValueError("prediction_type must be 'C' (classification) or 'R' (regression)")
Example #9
def define_models(models=dict()):
	# linear models
	models['logistic'] = LogisticRegression()#models['logistic'] = key and LogisticRegression = value
	alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
	# try different alpha values for the ridge classifier
	for a in alpha:
		models['ridge-'+str(a)] = RidgeClassifier(alpha=a)
	models['sgd'] = SGDClassifier(max_iter=1000, tol=1e-3)
	models['pa'] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
	# non-linear models
	n_neighbors = range(1, 21)
	# try different numbers of neighbours (k) for KNN
	for k in n_neighbors:
		models['knn-'+str(k)] = KNeighborsClassifier(n_neighbors=k)
	models['cart'] = DecisionTreeClassifier()
	models['extra'] = ExtraTreeClassifier()
	models['svml'] = SVC(kernel='linear')
	models['svmp'] = SVC(kernel='poly')
	c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
	for c in c_values:
		models['svmr'+str(c)] = SVC(C=c)
	models['bayes'] = GaussianNB()
	# ensemble models
	n_trees = 100
	models['ada'] = AdaBoostClassifier(n_estimators=n_trees)
	models['bag'] = BaggingClassifier(n_estimators=n_trees)
	models['rf'] = RandomForestClassifier(n_estimators=n_trees)
	models['et'] = ExtraTreesClassifier(n_estimators=n_trees)
	models['gbm'] = GradientBoostingClassifier(n_estimators=n_trees)
	print('Defined %d models' % len(models))
	return models
Example #10
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """
    Applies decision tree algorithm on the dataset, by tuning various parameters

    Args:
        dataframe: The input trainData, testData and class label on which the decision tree algorithm has to be applied

    """
    # fit a CART model to the data
    etc = ExtraTreeClassifier(class_weight=None,
                              criterion='gini',
                              max_depth=None,
                              max_features='auto',
                              max_leaf_nodes=None,
                              min_samples_leaf=1,
                              min_samples_split=2,
                              min_weight_fraction_leaf=0.0,
                              random_state=None,
                              splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
Example #11
def get_hyperparameters_model():
    criterion = ['gini', 'entropy']
    splitter = ['best', 'random']
    max_depth = [20, 100]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    max_features = ['auto', 'sqrt', 'log2', None]
    class_weight = ['balanced', None]

    param_dist = {
        'cls__criterion': criterion,
        'cls__splitter': splitter,
        'cls__max_depth': max_depth,
        'cls__min_samples_split': min_samples_split,
        'cls__min_samples_leaf': min_samples_leaf,
        'cls__max_features': max_features,
        'cls__class_weight': class_weight
    }

    clf = ExtraTreeClassifier()

    model = {
        'extra_tree_classifier': {
            'model': clf,
            'param_distributions': param_dist
        }
    }
    return model
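The 'cls__' prefixes in param_dist imply that the estimator is used as a pipeline step named 'cls'. A minimal sketch of that assumed setup with RandomizedSearchCV (X and y are hypothetical training arrays; a real pipeline would normally put preprocessing steps before 'cls'):

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

spec = get_hyperparameters_model()['extra_tree_classifier']
pipe = Pipeline([('cls', spec['model'])])  # single-step pipeline so the 'cls__' prefixes resolve
search = RandomizedSearchCV(pipe, spec['param_distributions'], n_iter=20, cv=5, random_state=0)
search.fit(X, y)  # X, y: hypothetical features and labels
print(search.best_params_)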
Example #12
 def Extra_Tree(self, X_train, y_train, X_test, y_test):
     
     max_depth = [5, 10, 25, 50, 75, 100]
     min_samples_leaf = [1, 2, 4, 8, 10]
     min_samples_split = [2, 4, 6, 8, 10]
     max_features = ["auto", "sqrt", "log2", None]
     criterion = ["gini", "entropy"]
     splitter = ["best", "random"]
     
     hyperparameter = {"max_depth": max_depth,
                       "min_samples_leaf": min_samples_leaf,
                       "min_samples_split": min_samples_split,
                       "max_features": max_features,
                       "criterion": criterion,
                       "splitter": splitter}
     
     n_folds = 10
     my_cv = TimeSeriesSplit(n_splits = n_folds).split(X_train)
     et = ExtraTreeClassifier(random_state = 42)
     rsearch_cv = RandomizedSearchCV(estimator = et, param_distributions = hyperparameter, n_iter = 50,
                                     scoring = "f1_macro", n_jobs = -1, cv = my_cv, random_state = 42)
     rsearch_cv.fit(X_train, y_train)
     et_best = rsearch_cv.best_estimator_
     et_best.fit(X_train, y_train)
     y_pred = et_best.predict(X_test)
     test_accuracy = accuracy_score(y_test, y_pred, normalize=True) * 100
     precision = np.round(metrics.precision_score(y_test, y_pred, average="macro"), 4)
     recall = np.round(metrics.recall_score(y_test, y_pred, average="macro"), 4)
     f1 = np.round(metrics.f1_score(y_test, y_pred, average="macro"), 4)
     
     return et_best, test_accuracy, precision, recall, f1
Example #13
    def setUpClass(cls):
        np.random.seed(seed=1234)

        cls.sklearn_model = ExtraTreeClassifier()
        cls.classifier = ScikitlearnExtraTreeClassifier(
            model=cls.sklearn_model)
        cls.classifier.fit(x=x_train, y=y_train)
Example #14
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)
    text.insert(
        tk.END, "\n\nExtra Tree Classifier report \n" +
        classification_report(Y_test, Y_pred), "bold")
    text.insert(
        tk.END,
        "*****roc_auc_score: %0.3f*****\n" % roc_auc_score(Y_test, Y_pred),
        "bold")
    text.insert(
        tk.END, "Extra Tree Classifier confusion matrix \n" +
        str(confusion_matrix(Y_test, Y_pred)), "bold")
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score = " + str(score))
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
Example #15
    def start(self):
        """ 01. Initialise the data paths and transformation functions.  """
        self.data_dir = '../data/raw_data'
        self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
        self.agg_primitives = [
            'mean', 'max', 'min', 'std', 'count', 'percent_true', 'last',
            'time_since_last', 'mode'
        ]
        self.ignore_cols = [
            'num_contacts', 'num_referrals', 'num_successful_referrals'
        ]
        self.feature_windows = [10, 30, 60, 90]  #[10,20,30]
        self.max_feature_depth = 2

        # list of estimators to use
        self.estimators = [
            ('cbc', CatBoostClassifier()), ('lgbmc', LGBMClassifier()),
            ('gbc',
             GradientBoostingClassifier(validation_fraction=0.15,
                                        n_iter_no_change=50)),
            ('et', ExtraTreeClassifier()), ('abc', AdaBoostClassifier()),
            ('rfc', RandomForestClassifier()), ('bc', BaggingClassifier()),
            ('etc', ExtraTreesClassifier()), ('gnb', GaussianNB()),
            ('mlpc', MLPClassifier()), ('gpc', GaussianProcessClassifier()),
            ('dtc', DecisionTreeClassifier()),
            ('qda', QuadraticDiscriminantAnalysis()),
            ('lr', LogisticRegression()), ('knn3', KNeighborsClassifier(3)),
            ('knn6', KNeighborsClassifier(6)),
            ('knn12', KNeighborsClassifier(12)), ('nc', NearestCentroid()),
            ('rnc', RadiusNeighborsClassifier()), ('lp', LabelPropagation()),
            ('pac', PassiveAggressiveClassifier()), ('rc', RidgeClassifier()),
            ('sgdc', SGDClassifier()), ('svg', SVC()),
            ('ngbc', NGBClassifier(Dist=Bernoulli))
        ]
        self.next(self.load_raw_data)
Example #16
    def __init__(self):
        # Algorithm name
        self._name = 'extratree'
        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/classifier/resource/classifier_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Separate the features from the label (target) column
        self._x = data.drop("quality", axis=1)
        self._y = data["quality"]

        # Split into training and test data
        self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
            self._x, self._y, test_size=0.2, shuffle=True, random_state=42)
        # Declare the model
        self._model = ExtraTreeClassifier()

        # Train the model
        self._model.fit(self._x_train, self._y_train)
Example #17
def clf_scan(xtrain, ytrain, xtest=None, ytest=None, cv=5):
    """
    Function to perform k-fold cross validation on some standard classifiers. Note, it may take a long time for
        some of the classifiers to converge on un-scaled data. Use un-scaled data with caution.
    :return: results: Library with classifier names and scores
    :param xtrain: Matrix of features from the training set
    :param ytrain: Class labels from the training set.
    :param cv: # of folds to use during k-folds cross validation of each model.
    :param xtest: Matrix of features from the testing set
    :param ytest: Class labels from the testing set
    :return: results: Library with classifier names and scores
    """
    clfs = {
        'LogisticRegression': LogisticRegression(),
        'MLPClassifier': MLPClassifier(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'SGD Classifier': SGDClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=50),
        'GradientBoostClassifier': GradientBoostingClassifier(),
        'SVC(rbf)': SVC(kernel='rbf', probability=True),
        'KNearestNeighbors': KNeighborsClassifier(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'RandomForestClassifier': RandomForestClassifier(n_estimators=50)
    }

    results = {}
    print('\n====== > Evaluation cross validation scores')
    for name, clf in clfs.items():
        print('==> Current estimator:\n%s\n' % clf)
        scores = cross_val_score(clf, xtrain, ytrain, cv=cv)
        results[name] = scores
    # for name, scores in results.items():
    for name in clfs.keys():
        print("%25s :: Accuracy: %0.3f%% (+/0 %0.3f%%)" % (name, 100 * results[name].mean(),
                                                           100 * results[name].std() * 2))

    if (xtest is not None) and (ytest is not None):
        test_results = {}
        cohen_kappa_results = {}
        print('=========================================================')
        print('Performing model fits on training/testing data.')
        for name, clf in clfs.items():
            print('Processing %30s' % name)
            try:
                clf.fit(xtrain, ytrain)
                test_score = clf.score(xtest, ytest)
                test_results[name] = test_score

                y_pred = clf.predict(xtest)
                kappa = cohen_kappa_score(ytest, y_pred)
                cohen_kappa_results[name] = kappa
            except Exception as e:
                print('Error encountered calculating score on test data for %s. It may not have a '
                      'built-in .score method!' % name)
                print('Exception: ', e)
        for name in clfs.keys():
            print("%25s :: Accuracy:        %0.3f%%\n"
                  "%25s :: Cohen's Kappa:   %0.3f" % (name, 100 * test_results[name],
                                                      " ", cohen_kappa_results[name]))
    return results
Example #18
def define_models(models=dict()):
    # linear models
    models["logistic"] = LogisticRegression()
    alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for value in alpha:
        models["ridge-" + str(value)] = RidgeClassifier(alpha=value)
    models["sgd"] = SGDClassifier(max_iter=1000, tol=1e-3)
    models["pa"] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models["knn-" + str(k)] = KNeighborsClassifier(n_neighbors=k)
    models["cart"] = DecisionTreeClassifier()
    models["extra"] = ExtraTreeClassifier()
    models["svml"] = SVC(kernel="linear")
    models["svmp"] = SVC(kernel="poly")
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        models["svmr" + str(c)] = SVC(C=c)
    models["bayes"] = GaussianNB()
    # ensemble models
    n_trees = 100
    models["ada"] = AdaBoostClassifier(n_estimators=n_trees)
    models["bag"] = BaggingClassifier(n_estimators=n_trees)
    models["rf"] = RandomForestClassifier(n_estimators=n_trees)
    models["et"] = ExtraTreesClassifier(n_estimators=n_trees)
    models["gbm"] = GradientBoostingClassifier(n_estimators=n_trees)
    print("Defined %d models" % len(models))
    return models
Example #19
 def setUp(self):
     super().setUp()
     self.model = ExtraTreeClassifier()
     iris = load_iris()
     X = iris.data.astype(np.float32)
     y = iris.target.astype(np.int32)
     self.model.fit(X, y)
Example #20
def get_experiment_5():
    et = ExtraTreeClassifier(criterion='entropy', max_leaf_nodes=1200,
                             min_samples_leaf=1, random_state=5)
    classifier = AdaBoostClassifier(base_estimator=et, n_estimators=500,
                                    algorithm='SAMME', learning_rate=0.1,
                                    random_state=5)
    param_grid = {
        'base_estimator__max_depth': [5,7]
    }
    return (classifier, param_grid)
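A usage sketch (assumed, not shown in the original): the (classifier, param_grid) pair plugs straight into GridSearchCV, where the 'base_estimator__' prefix reaches the inner ExtraTreeClassifier (the base_estimator name assumes an older scikit-learn; it was renamed to estimator in 1.2). X and y are hypothetical arrays.

from sklearn.model_selection import GridSearchCV

classifier, param_grid = get_experiment_5()
search = GridSearchCV(classifier, param_grid, cv=3, scoring='f1_weighted')
search.fit(X, y)  # X, y: hypothetical features and labels
print(search.best_params_, search.best_score_)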
Example #21
def train_different_clf(data):

    X = data[features].values  # DataFrame.as_matrix was removed from pandas; use .values instead
    y = data[label].values
    ### split the data
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    print('NB')
    model = GaussianNB()
    validate_model(model, X, y, features_train, labels_train, features_test,
                   labels_test)

    print('DTC')
    model = DecisionTreeClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test,
                   labels_test)

    print('ETC')
    model = ExtraTreeClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test,
                   labels_test)

    print('K Neighbors')
    model = KNeighborsClassifier()
    validate_model(model, X, y, features_train, labels_train, features_test,
                   labels_test)
Example #22
def test_grid_search():
    from collections import OrderedDict
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

    grid = {
        'base_estimator': [
            DecisionTreeClassifier(max_depth=3),
            DecisionTreeClassifier(max_depth=4),
            ExtraTreeClassifier(max_depth=4)
        ],
        'learning_rate': [0.01, 0.1, 0.5, 1.],
        'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125],
        'algorithm': ['SAMME', 'SAMME.R']
    }
    grid = OrderedDict(grid)

    trainX, trainY = generate_sample(2000, 10, distance=0.5)
    grid_cv = GridOptimalSearchCV(AdaBoostClassifier(),
                                  grid,
                                  n_evaluations=10,
                                  refit=True,
                                  log_name='test')
    grid_cv.fit(trainX, trainY)
    grid_cv.predict_proba(trainX)
    grid_cv.predict(trainX)
    grid_cv.print_param_stats([0.1, 0.3, 0.5, 0.7])
Example #23
    def setUpClass(cls):
        master_seed(seed=1234)
        super().setUpClass()

        cls.sklearn_model = ExtraTreeClassifier()
        cls.classifier = ScikitlearnExtraTreeClassifier(model=cls.sklearn_model)
        cls.classifier.fit(x=cls.x_train_iris, y=cls.y_train_iris)
Example #24
def variables_relevantes_arbol(X, Y, alpha=None):

    if len(X) == 0:
        logger.info("No variable information was provided")
        return []

    features = list(X.columns)

    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Computed the minimum acceptance threshold for importance: {0}'.
            format(alpha))

    importance = []  # ensure a defined value even if fitting fails below
    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)

        importance = model.feature_importances_

        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])

    except Exception as e:
        logger.info(
            'Error in the tree-based method; no relevant variables were determined: {0}'
            .format(e))
        relevant_features = []

    return importance, relevant_features
Example #25
    def __init__(self,
                 n_estimators=10,
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 max_features="auto",
                 max_leaf_nodes=None,
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        super(ExtraTreesClassifier, self).__init__(
            base_estimator=ExtraTreeClassifier(),
            n_estimators=n_estimators,
            estimator_params=("criterion", "max_depth", "min_samples_split",
                              "min_samples_leaf", "max_features",
                              "max_leaf_nodes", "random_state"),
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
Example #26
def do_Extra_Tree(data, target):
    reviews_classifier_Extra_Tree = Pipeline([('tfidf', TfidfVectorizer()),
                                              ('clf', ExtraTreeClassifier())])
    parameters_Extra_Tree = {'tfidf__ngram_range': [(1, 1), (1, 2)]}

    gs_Extra_Tree = GridSearchCV(reviews_classifier_Extra_Tree,
                                 parameters_Extra_Tree,
                                 n_jobs=-1,
                                 verbose=1)
    gs_Extra_Tree = gs_Extra_Tree.fit(data, target)

    # grid_scores_ was removed from scikit-learn; iterate over cv_results_ instead
    # (per-split scores are available under cv_results_['split<i>_test_score'])
    for params, mean_score in zip(gs_Extra_Tree.cv_results_['params'],
                                  gs_Extra_Tree.cv_results_['mean_test_score']):
        print(params)
        print(mean_score)
        print('=======================')

    with open('result.txt', 'a') as result_file:
        print("Best score: %f" % gs_Extra_Tree.best_score_, file=result_file)
        print("Best parameters: %r" % gs_Extra_Tree.best_params_, file=result_file)
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime, file=result_file)

    return (gs_Extra_Tree.best_score_, gs_Extra_Tree.best_params_)
Example #27
def ExtraTreeClassifier(n_jobs, class_weight):
    # Factory wrapper that shadows the sklearn class name. `n_jobs` is accepted for
    # interface consistency but unused (ExtraTreeClassifier has no n_jobs parameter),
    # and `random_state` is assumed to be defined at module level. The deprecated
    # `min_impurity_split` argument is dropped here because newer scikit-learn
    # releases no longer accept it.
    from sklearn.tree import ExtraTreeClassifier
    clf = ExtraTreeClassifier(criterion='gini', splitter='random', max_depth=None,
                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                max_features='auto', random_state=random_state, max_leaf_nodes=None,
                min_impurity_decrease=0.0, class_weight=class_weight)
    return clf
Example #28
def do_Adaboost(data, target):
    reviews_classifier_AdaBoost = Pipeline([('tfidf', TfidfVectorizer()),
                                            ('clf', AdaBoostClassifier())])
    parameters_AdaBoost = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__base_estimator': (ExtraTreeClassifier(), ),
        'clf__algorithm': ('SAMME.R', )
    }

    gs_AdaBoost = GridSearchCV(reviews_classifier_AdaBoost,
                               parameters_AdaBoost,
                               n_jobs=-1,
                               verbose=1)
    gs_AdaBoost = gs_AdaBoost.fit(data, target)

    # grid_scores_ was removed from scikit-learn; iterate over cv_results_ instead
    # (per-split scores are available under cv_results_['split<i>_test_score'])
    for params, mean_score in zip(gs_AdaBoost.cv_results_['params'],
                                  gs_AdaBoost.cv_results_['mean_test_score']):
        print(params)
        print(mean_score)
        print('=======================')

    with open('result.txt', 'a') as result_file:
        print("Best score: %f" % gs_AdaBoost.best_score_, file=result_file)
        print("Best parameters: %r" % gs_AdaBoost.best_params_, file=result_file)
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime, file=result_file)

    return (gs_AdaBoost.best_score_, gs_AdaBoost.best_params_)
Example #29
def variables_relevantes_arbol(X, Y, alpha=None):

    if len(X) == 0:
        logger.info("No variable information was provided")
        return []

    features = list(X.columns)

    if alpha is None:
        alpha = 1.0 / len(features)
        logger.info(
            'Acceptance threshold for variable importance was calculated: {0}'.
            format(alpha))

    importance = []  # ensure a defined value even if fitting fails below
    try:
        model = ExtraTreeClassifier()
        model.fit(X, Y)

        importance = model.feature_importances_

        relevant_features = []
        for i in range(len(features)):
            if importance[i] > alpha:
                relevant_features.append(features[i])

    except Exception as e:
        logger.info(
            'Error with the tree-based method; no relevant variables were determined: {0}'
            .format(e))
        relevant_features = []

    return importance, relevant_features
Example #30
def wrapper(tr_data, tr_ans, ts_data, ts_ans):

    classifiers = [
        SVC(),
        RandomForestClassifier(),
        LogisticRegression(),
        Perceptron(),
        ExtraTreeClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier()
    ]
    split_tr_data = split_features(tr_data)
    split_ts_data = split_features(ts_data)
    result_score = 0
    result_clf = None  # best classifier found so far
    result_tr_data = np.zeros((tr_data.shape))
    result_ts_data = np.zeros((ts_data.shape))
    for tr, ts in zip(split_tr_data, split_ts_data):
        for s_tr, s_ts in zip(subset(tr), subset(ts)):
            for clf in classifiers:
                clf.fit(s_tr, tr_ans)
                pred_y = clf.predict(s_ts)
                temp_score = accuracy_score(ts_ans, pred_y)
                if result_score < temp_score:
                    result_score = temp_score
                    result_clf = clf
                    result_tr_data = s_tr
                    result_ts_data = s_ts

    print(result_score, result_clf)

    return result_tr_data, result_ts_data