Example #1
def set_selection_method(config):
    """
    Given the configuration settings, this function instantiates the configured
    feature selection method initialized with the preset parameters.
    
    TODO: implement the same method using reflection (load the class dynamically
    at runtime)
    
    @param config: the configuration file object loaded using yaml.load()
    @return: an object that implements the TransformerMixin interface (with fit(),
    fit_transform() and transform() methods).
    """
    transformer = None

    selection_cfg = config.get("feature_selection", None)
    if selection_cfg:
        method_name = selection_cfg.get("method", None)

        # checks for RandomizedLasso
        if method_name == "RandomizedLasso":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                RandomizedLasso(alpha=p.get("alpha", "aic"),
                                scaling=p.get("scaling", .5),
                                sample_fraction=p.get('sample_fraction', .75),
                                n_resampling=p.get('n_resampling', 200),
                                selection_threshold=p.get('selection_threshold', .25),
                                fit_intercept=p.get('fit_intercept', True),
                                # TODO: set verbosity according to global level
                                verbose=True,
                                normalize=p.get('normalize', True),
                                max_iter=p.get('max_iter', 500),
                                n_jobs=p.get('n_jobs', 1))
            else:
                transformer = RandomizedLasso()

        # checks for ExtraTreesClassifier
        elif method_name == "ExtraTreesClassifier":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                ExtraTreesClassifier(n_estimators=p.get('n_estimators', 10),
                                     max_depth=p.get('max_depth', None),
                                     min_samples_split=p.get('min_samples_split', 1),
                                     min_samples_leaf=p.get('min_samples_leaf', 1),
                                     min_density=p.get('min_density', 1),
                                     max_features=p.get('max_features', 'auto'),
                                     bootstrap=p.get('bootstrap', False),
                                     compute_importances=p.get('compute_importances', True),
                                     n_jobs=p.get('n_jobs', 1),
                                     random_state=p.get('random_state', None),
                                     # TODO: set verbosity according to global level
                                     verbose=True)
            else:
                transformer = ExtraTreesClassifier()

    return transformer
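The TODO above asks for a reflection-based variant. A minimal sketch of that idea, assuming the config carries a fully qualified class path under a hypothetical "class" key (not part of the original config schema):

import importlib

def set_selection_method_reflective(config):
    """Instantiate the configured transformer by loading its class at runtime."""
    selection_cfg = config.get("feature_selection") or {}
    class_path = selection_cfg.get("class")  # hypothetical key, e.g. "sklearn.ensemble.ExtraTreesClassifier"
    if not class_path:
        return None
    module_name, class_name = class_path.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), class_name)
    # Pass the configured parameters straight through as keyword arguments.
    return cls(**(selection_cfg.get("parameters") or {}))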
Example #2
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []

    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    names = [name for name, _ in algorithms]
    return algorithms, names
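The comment mentions cross-validation, but models() only builds the list. A sketch of the evaluation loop this function is presumably fed into (X, y, and the fold count are assumptions):

from sklearn.model_selection import cross_val_score

algorithms, names = models()
for name, algo in algorithms:
    scores = cross_val_score(algo, X, y, cv=10)  # X and y are assumed to exist
    print('%s: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))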
Example #3
def run_decision_tree_probabilistic_classification(train, train_labels, validate, validate_labels):
    # transform counts to TFIDF features
    tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train = tfidf.fit_transform(train).toarray()
    validate = tfidf.transform(validate).toarray()

    # encode labels (the validation labels too, so log_loss sees the same classes)
    label_encode = preprocessing.LabelEncoder()
    train_labels = label_encode.fit_transform(train_labels)
    validate_labels = label_encode.transform(validate_labels)

    decisionTree = ExtraTreesClassifier(n_jobs=4, n_estimators=1000, max_features=20, min_samples_split=3,
                                        bootstrap=False, verbose=3, random_state=23)
    decisionTree.fit(train, train_labels)
    predicted_labels = decisionTree.predict_proba(validate)
    print "Extra Trees Classifier LogLoss"
    print str(metrics.log_loss(validate_labels, predicted_labels))
Example #4
	def _init_estimator(self, k):
		est_args = self.est_args.copy()
		est_name = '{}/{}'.format(self.name, k)
		# TODO: consider whether to add a random_state. In practice the random_state of
		# each estimator can be set via est_configs by the user, so we would not need to
		# set it here. More importantly, if an estimator has no random_state parameter,
		# this assignment would raise an error.
		if est_args.get('random_state', None) is None:
			est_args['random_state'] = copy.deepcopy(self.seed)
		else:
			print("RED ALERT...(SKKFoldWrapper)")
			est_args['random_state'] = est_args['random_state'] + k ** 2

		estimator = ExtraTreesClassifier(**est_args)
		print("ESTIMATOR: ExtraTreesClassifier")

		return estimator
Example #5
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
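The returned res is a list of [name, mean, std] rows; a small convenience for inspecting it as a table (assumes pandas is available and the data frames from the example exist):

import pandas as pd

res = defaultModels(df_xmat, df_ymat_cat)  # df_xmat, df_ymat_cat assumed to exist
print(pd.DataFrame(res, columns=['classifier', 'cv_mean', 'cv_std']))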
Example #6
    def setUpClass(cls):
        cls._prep_data(cls)

        n_iter = 2
        cls.srfc_grid = RandomizedSearchCV(StreamingEXTC(n_jobs=2, verbose=1),
                                           param_distributions=SRFCGRID,
                                           scoring='roc_auc',
                                           n_iter=n_iter * 10,
                                           verbose=2,
                                           n_jobs=3,
                                           cv=4)

        cls.rfc_grid = RandomizedSearchCV(ExtraTreesClassifier(n_jobs=2),
                                          param_distributions=RFCGRID,
                                          scoring='roc_auc',
                                          n_iter=n_iter,
                                          verbose=2,
                                          n_jobs=3,
                                          cv=4)
Example #7
def classify(X,y,cv):
    #clf = DecisionTreeClassifier()
    #clf = RandomForestClassifier()
    #clf = AdaBoostClassifier()
    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, score.mean()))
    clf = clf.fit(X,y)
    #print 'Feature Importances'
    #print clf.feature_importances_
    #X = clf.transform(X,threshold=.3)
    
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))
    fp=0
    tp=0
    fn=0
    tn=0
    for a in range(len(y)):
        if y[a]==preds[a]:
            if preds[a]==0:
                tn+=1
            elif preds[a]==1:
                tp+=1
        elif preds[a]==1:fp+=1
        elif preds[a]==0:fn+=1
    
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp)/(tp+fp))
    print('recall (tp)/(tp+fn):', float(tp)/(tp+fn))
    print('false positive rate (fp)/(fp+tn):', float(fp)/(fp+tn))
    print('false discovery rate (fp)/(fp+tp):', float(fp)/(fp+tp))
    print('prediction accuracy: %s%s\n' % (100*float(tp+tn)/(tp+tn+fp+fn), '%'))
    return clf
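The manual counting above can be cross-checked against sklearn's own metrics; a short sketch of the equivalent calls (binary labels assumed, as in the loop above):

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

tn, fp, fn, tp = confusion_matrix(y, preds).ravel()  # same counts as the manual loop
print('precision:', precision_score(y, preds))
print('recall:', recall_score(y, preds))
print('accuracy:', accuracy_score(y, preds))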
Example #8
# NOTE: this snippet begins mid-call; given the y_pred_lgb variable below, the
# estimator is presumably lightgbm's LGBMClassifier (a reconstruction, not the
# original line).
# from lightgbm import LGBMClassifier   # assumed import
start = time.time()
reg = LGBMClassifier(n_estimators=200,
                     min_child_weight=10,
                     subsample=0.7,
                     colsample_bytree=0.7,
                     reg_alpha=0,
                     reg_lambda=0.5)
reg.fit(X_train, y_train)
end = time.time()
y_pred_lgb = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_lgb))
print(end - start)

start = time.time()
reg = ExtraTreesClassifier(n_estimators=100,
                           max_depth=7,
                           min_samples_leaf=10,
                           n_jobs=8,
                           random_state=4)
reg.fit(X_train, y_train)
end = time.time()
y_pred_et = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_et))
print(end - start)

start = time.time()
reg = KNeighborsClassifier(n_neighbors=4, algorithm='kd_tree')
reg.fit(X_train, y_train)
end = time.time()
y_pred_knn = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_knn))
print(end - start)
#print ",dfeatures[features][:-1]\n",dfeatures[features][:-1]
pd.set_option('display.max_columns', None)
print "ALL columns of dfeatures[features]"
print dfeatures[features].head(1)

# create a test and training set
x_train, x_test, y_train, y_test = train_test_split(
    dfeatures[features],
    dfeatures.author_num.values,
    test_size=0.4,
    random_state=123)
x, y = dfeatures[features], dfeatures.author_num.values

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

scores = cross_val_score(etclf, x, y)
print(scores.mean())

# Print confusion matrix (y_true first, predictions second)
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))
# print authors
"""
# # PREVIOUS RESULT 0.671469386087

############# RESULT WITH ALL FEATURES ############
/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
0.148101533384
"""
Example #10
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# In[29]:

preprocessor = make_pipeline(SelectKBest(f_classif, k=10),
                             PolynomialFeatures(2))

AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(),
                       GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree
}

# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
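The imports above suggest an evaluation step that is cut off here; a minimal sketch of how dict_of_models might be scored (X_train, X_test, y_train, y_test are assumptions):

for name, model in dict_of_models.items():
    model.fit(X_train, y_train)  # X_train/y_train are assumed to exist
    y_pred = model.predict(X_test)
    print(name, 'f1:', f1_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))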
Example #11
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200,
                                              max_features=4),
    max_features=0.5,
    n_jobs=2,
    verbose=1)
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=200,
                                                          max_features=4,
                                                          learning_rate=0.3,
                                                          verbose=0)
random_forest_classifier = RandomForestClassifier(n_estimators=2)
logistic_regression = LogisticRegression(C=0.5)
ridge_classifier = RidgeClassifier(alpha=0.1, solver='svd')
bayes = MultinomialNB()
sgd = SGDClassifier()
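Any of the decompositions above can feed a classifier through a Pipeline; a minimal sketch reusing the names defined above (the pairing itself is an illustration, not from the source):

from sklearn.pipeline import make_pipeline

# Illustrative pairing only: reduce with PCA, then classify with extra trees.
model = make_pipeline(pca, extra_trees_classifier)
# model.fit(X_train, y_train); model.predict(X_test)  # data assumed to exist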
Example #12
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter

Predictii = [[] for _ in range(len(Y_Test))]

Accs = []

normlist = []
if Norm == "N1":
    normlist.append("N1")
Example #13
    Normalizer()
]

# %%

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(X, y),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')
#%%
count = 0
dict_test = {}
dict_all = {}
for i in range(len(scaler)):
    scaler_i = scaler[i]
    for j in range(len(classifier_test)):
        count += 1
Example #14
 def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
     '''
     Initialize the pattern object with the corresponding model file name and the best classifier (SVM or extra trees) and pre-processing method identified beforehand.
     '''
     self.patternEnum = patternEnum
     self.pattern = Pattern(patternEnum)
     modelFile = str(
         patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
     self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
     self.preProcessMethod = "NONE"
     if (patternEnum == PatternEnum.EVENTUALLY):
         self.maxRandState = 196558
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.ALWAYS):
         self.maxRandState = 124255
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.FOLLOWS):
         self.maxRandState = 196588
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.PRECEDES):
         self.maxRandState = 187708
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.NEVER):
         self.maxRandState = 182526
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
             oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.STEADY_STATE):
         self.maxRandState = 119746
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.UNTIL):
         self.maxRandState = 114007
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.INFINITELY_OFTEN):
         self.maxRandState = 150000
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.NEXT):
         self.maxRandState = 173977
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.RELEASE):
         self.maxRandState = 105454
         # random seed to shuffle data for training
         self.preProcessMethod = "SCALE"
         self.clf = \
         SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.WEAK_UNTIL):
         self.maxRandState = 163090
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
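The preProcessMethod strings above ('NONE', 'NORMALIZE', 'SCALE') presumably select a pre-processing step elsewhere in the class; a hedged sketch of one way such a mapping could look (the mapping is an assumption, not the original code):

import numpy as np
from sklearn import preprocessing

# Assumed mapping from the preProcessMethod strings to sklearn calls.
PREPROCESS = {
    "NONE": lambda X: X,
    "NORMALIZE": preprocessing.normalize,
    "SCALE": preprocessing.scale,
}

X_prepared = PREPROCESS["NORMALIZE"](np.random.rand(5, 3))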
Example #15
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Apply some feature engineering
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy arrays (fit on the training set, transform-only on the test set)
x_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.transform(X_test)
y_test = np.array(y_test.iloc[:, 0])
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
model = ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.6, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))
Example #16
    if with_proba:
        adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")


build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
            "DecisionTreeAudit")
build_audit(
    BaggingClassifier(DecisionTreeClassifier(random_state=13,
                                             min_samples_leaf=5),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "DecisionTreeEnsembleAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(
    GradientBoostingClassifier(random_state=13, loss="exponential", init=None),
    "GradientBoostingAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(LogisticRegressionCV(), "LogisticRegressionAudit")
build_audit(
    BaggingClassifier(LogisticRegression(),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "LogisticRegressionEnsembleAudit")
build_audit(GaussianNB(), "NaiveBayesAudit")
build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=5),
            "RandomForestAudit")
Example #17
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
    MLPClassifier(),
    NuSVC(probability=True),
    QuadraticDiscriminantAnalysis(),
    RandomForestClassifier(),
    SGDClassifier(loss='log'),
    SVC(probability=True),
Example #18
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log',
                          penalty='l2',
                          shuffle=True,
                          n_iter=5,
                          n_jobs=-1,
                          random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel,
                    class_weight="balanced",
                    cache_size=8096,
                    random_state=17,
                    probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = AdaBoostClassifier(base_estimator=dt,
                                   n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced",
                                       n_jobs=-1,
                                       n_estimators=400,
                                       random_state=17,
                                       max_features='auto',
                                       max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17,
                                           n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17,
                                     n_estimators=400,
                                     n_jobs=-1,
                                     class_weight='balanced',
                                     max_depth=15,
                                     max_features='auto',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='auto',
                                    class_weight='balanced',
                                    random_state=39)
        model = BaggingClassifier(base_estimator=dt,
                                  n_estimators=400,
                                  random_state=17,
                                  n_jobs=-1,
                                  max_features=0.8,
                                  max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10,
                                            class_weight='balanced',
                                            n_jobs=-1,
                                            random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1,
                           n_iter=10,
                           penalty='l2',
                           class_weight='balanced',
                           alpha=0.25)
    return model
Example #19
state_filter = ['successful','failed'] #,'canceled', 'live']


# NOTE:  Adjust Trainingset / Testset division ratio:
divratio = 0.3


# Normalization (L1 & L2):
# NOTE:  Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'#'l1'


# model_selection is used for manually enabling the individual models.
# NOTE:  Setting the boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ),
    'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ),
    'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ),
    'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False) ),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ),
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn', random_state=None, l1_ratios=None) ),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn') ),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ),
    'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), #
    'Nu_SVM': (True, NuSVC(gamma='auto') ),
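The snippet is truncated before the dict closes, but the enabling flags suggest a driver loop like the following (the train/test splits are assumptions):

for name, (enabled, model) in model_selection.items():
    if not enabled:
        continue
    model.fit(X_train, y_train)  # X_train/y_train are assumed to exist
    print(name, 'accuracy:', model.score(X_test, y_test))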
Example #20
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()

K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    CLASSIFIER_MAP = {
    "DECISION_TREE": DECISION_TREE,
    "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
    "NAIVE_BAYS": NAIVE_BAYS,
    "K_N_N": K_N_N,
Example #21
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sys
import os
from tools import *

data = load_obj(sys.argv[1])
y = data["labels"]
X = data["features"]

clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1)

scores = cross_val_score(clf, X, y, cv=5)
save_obj(scores, "extra_trees_scores.pkl")
Example #22
from uci_comparison import compare_estimators
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from rr_forest import RRForestClassifier
from rr_extra_forest import RRExtraTreesClassifier

estimators = {
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'RndRotForest': RRForestClassifier(n_estimators=20),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RndRotETrees': RRExtraTreesClassifier(n_estimators=20),
}

# optionally, pass a list of UCI dataset identifiers as the datasets parameter, e.g. datasets=['iris', 'diabetes']
# optionally, pass a dict of scoring functions as the metric parameter, e.g. metrics={'F1-score': f1_score}
compare_estimators(estimators)
Example #23
train_data = pd.read_csv('resources/train.csv')
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']

XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100

models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]

model_title = [
    'DecisionTree', 'Bagging', 'RandomForest', 'ExtraTrees', 'AdaBoost'
]

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))

for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))

    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
Example #24
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            print("Exception occurred:", name)
    return metrix,test_accuracy,names