Example #1
import pickle
import sys

import numpy
from sklearn.ensemble import ExtraTreesClassifier


def main():
    # read command-line parameters
    training_set_features = sys.argv[1]
    training_set_classes = training_set_features.replace('features', 'classes')
    forest_file = sys.argv[2]

    # load the training features (numpy arrays must be opened in binary mode)
    with open(training_set_features, 'rb') as f:
        training_feature_vector = numpy.load(f)
        if training_feature_vector.ndim == 1:
            training_feature_vector = numpy.expand_dims(training_feature_vector, -1)
    with open(training_set_classes, 'rb') as f:
        training_class_vector = numpy.load(f)

    # prepare and train the decision forest
    # ('compute_importances' was removed from scikit-learn; importances are
    # available as forest.feature_importances_ after fitting)
    forest = ExtraTreesClassifier(n_estimators=200,
                                  criterion='entropy',
                                  max_features=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  max_depth=500,
                                  bootstrap=True,
                                  oob_score=False,
                                  random_state=0,
                                  n_jobs=-1)  # 'n_jobs' was a script-level setting in the original
    forest.fit(training_feature_vector, training_class_vector)

    # save the decision forest
    with open(forest_file, 'wb') as f:
        pickle.dump(forest, f)
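The saved forest can be unpickled and applied the same way. A minimal sketch of the prediction side, assuming the same .npy layout as above; the file names here are hypothetical:

import pickle

import numpy

with open('forest.pkl', 'rb') as f:          # hypothetical path
    forest = pickle.load(f)

with open('testing_features.npy', 'rb') as f:  # hypothetical path
    feature_vector = numpy.load(f)
if feature_vector.ndim == 1:
    feature_vector = numpy.expand_dims(feature_vector, -1)

print(forest.predict(feature_vector))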
Example #2
def set_selection_method(config):
    """
    Given the configuration settings, this function instantiates the configured
    feature selection method initialized with the preset parameters.
    
    TODO: implement the same method using reflection (load the class dinamically
    at runtime)
    
    @param config: the configuration file object loaded using yaml.load()
    @return: an object that implements the TransformerMixin class (with fit(),
    fit_transform() and transform() methods).
    """
    transformer = None

    selection_cfg = config.get("feature_selection", None)
    if selection_cfg:
        method_name = selection_cfg.get("method", None)

        # checks for RandomizedLasso (note: removed from scikit-learn in 0.21)
        if method_name == "RandomizedLasso":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                RandomizedLasso(alpha=p.get("alpha", "aic"),
                                scaling=p.get("scaling", .5),
                                sample_fraction=p.get('sample_fraction', .75),
                                n_resampling=p.get('n_resampling', 200),
                                selection_threshold=p.get('selection_threshold', .25),
                                fit_intercept=p.get('fit_intercept', True),
                                # TODO: set verbosity according to global level
                                verbose=True,
                                normalize=p.get('normalize', True),
                                max_iter=p.get('max_iter', 500),
                                n_jobs=p.get('n_jobs', 1))
            else:
                transformer = RandomizedLasso()

        # checks for ExtraTreesClassifier
        elif method_name == "ExtraTreesClassifier":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                ExtraTreesClassifier(n_estimators=p.get('n_estimators', 10),
                                     max_depth=p.get('max_depth', None),
                                     min_samples_split=p.get('min_samples_split', 2),
                                     min_samples_leaf=p.get('min_samples_leaf', 1),
                                     # 'min_density' and 'compute_importances' were removed
                                     # from scikit-learn and are no longer passed through
                                     max_features=p.get('max_features', 'sqrt'),
                                     bootstrap=p.get('bootstrap', False),
                                     n_jobs=p.get('n_jobs', 1),
                                     random_state=p.get('random_state', None),
                                     # TODO: set verbosity according to global level
                                     verbose=True)
            else:
                transformer = ExtraTreesClassifier()

    return transformer
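A minimal usage sketch for set_selection_method, assuming a YAML file shaped like the keys read above ('config.yaml' and its contents are hypothetical). Since current scikit-learn ensembles no longer expose transform(), the fitted estimator is wrapped in SelectFromModel to get the reduced matrix the docstring describes:

import yaml
from sklearn.feature_selection import SelectFromModel

with open('config.yaml') as f:   # hypothetical file containing e.g.:
    config = yaml.safe_load(f)   # feature_selection:
                                 #   method: ExtraTreesClassifier
                                 #   parameters: {n_estimators: 100}

transformer = set_selection_method(config)
if transformer is not None:
    selector = SelectFromModel(transformer.fit(X, y), prefit=True)  # X, y assumed loaded
    X_reduced = selector.transform(X)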
Example #3
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier


def select_features(X, y, X_test, n_features=100):
    '''
    Select the top n_features ranked by impurity-based importance.
    '''
    forest = ExtraTreesClassifier(n_estimators=100,random_state=571)
    forest.fit(X,y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    X = X[:,indices[0:n_features]]
    X_test = X_test[:,indices[0:n_features]]
    return X,X_test
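A quick smoke test of select_features on synthetic data, to make the expected shapes concrete (all sizes here are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 500)                 # 200 samples, 500 candidate features
y = rng.randint(0, 2, size=200)
X_test = rng.rand(50, 500)

X_top, X_test_top = select_features(X, y, X_test, n_features=100)
print(X_top.shape, X_test_top.shape)   # (200, 100) (50, 100)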
Example #4
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #5
def run_decision_tree_probabilistic_classification(train, train_labels, validate, validate_labels):
    # transform counts to TFIDF features
    tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train = tfidf.fit_transform(train).toarray()
    validate = tfidf.transform(validate).toarray()

    # encode labels
    label_encode = preprocessing.LabelEncoder()
    train_labels = label_encode.fit_transform(train_labels)

    decisionTree = ExtraTreesClassifier(n_jobs=4, n_estimators=1000, max_features=20, min_samples_split=3,
                                        bootstrap=False, verbose=3, random_state=23)
    decisionTree.fit(train, train_labels)
    predicted_labels = decisionTree.predict_proba(validate)
    print "Extra Trees Classifier LogLoss"
    print str(metrics.log_loss(validate_labels, predicted_labels))
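log_loss infers column order by sorting the labels it sees, which happens to match LabelEncoder's alphabetical ordering. A sketch (not part of the original snippet) that makes the correspondence explicit and guards against a class missing from the validation fold:

# predict_proba columns follow decisionTree.classes_ (the encoded 0..n-1 labels)
print(metrics.log_loss(label_encode.transform(validate_labels),
                       predicted_labels,
                       labels=decisionTree.classes_))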
Example #6
 def __init__(
     self,
     sc=None,
     partitions="auto",
     n_estimators=100,
     criterion="gini",
     max_depth=None,
     min_samples_split=2,
     min_samples_leaf=1,
     min_weight_fraction_leaf=0.0,
     max_features="auto",
     max_leaf_nodes=None,
     min_impurity_decrease=0.0,
     min_impurity_split=None,
     bootstrap=False,
     oob_score=False,
     n_jobs=None,
     random_state=None,
     verbose=0,
     warm_start=False,
     class_weight=None,
 ):
     ExtraTreesClassifier.__init__(
         self,
         n_estimators=n_estimators,
         criterion=criterion,
         max_depth=max_depth,
         min_samples_split=min_samples_split,
         min_samples_leaf=min_samples_leaf,
         min_weight_fraction_leaf=min_weight_fraction_leaf,
         max_features=max_features,
         max_leaf_nodes=max_leaf_nodes,
         min_impurity_decrease=min_impurity_decrease,
         min_impurity_split=min_impurity_split,
         bootstrap=bootstrap,
         oob_score=oob_score,
         n_jobs=n_jobs,
         random_state=random_state,
         verbose=verbose,
         warm_start=warm_start,
         class_weight=class_weight,
     )
     self.sc = sc
     self.partitions = partitions
Example #7
class ExtraTreesClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
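A minimal usage sketch of the wrapper, assuming SKLModel is bound to sklearn.ensemble.ExtraTreesClassifier and a scikit-learn version matching the wrapped signature (some hyperparameters above, such as min_impurity_split, no longer exist in current releases):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = ExtraTreesClassifierImpl(n_estimators=50, random_state=0).fit(X, y)
print(clf.predict(X[:3]))
print(clf.predict_proba(X[:3]))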
Example #8
def build_sample(regressor, name):
	# feature selection
	clf = ExtraTreesClassifier()
	clf = clf.fit(sample_X, sample_y)
	print(clf.feature_importances_)
	model = SelectFromModel(clf, prefit=True)
	X_new = model.transform(sample_X)
	# X_new is a plain ndarray, so it has a .shape but no .columns
	print(sample_X.shape, X_new.shape)
	# repeat the CV procedure 10 times to get more precise results
	n = 10  
	# for each iteration, randomly hold out 10% of the data as CV set
	for i in range(n):
		X_train, X_cv, y_train, y_cv = model_selection.train_test_split(
		      sample_X[:, features], sample_y, test_size=.10, random_state=i*SEED)
		# train...
		regressor = regressor.fit(X_train, y_train)
		# save model
		#store_pkl(regressor, name + ".pkl")
		# predict on train
		preds = regressor.predict(X_cv)
		# print 
		#print preds
		# create DataFrame
		#preds = DataFrame(preds, columns = ["prime_tot_ttc_preds"])
		#print preds
		#print y_cv
		# mape
		mape_r = mape(y_cv, preds)
		# print
		print "MAPE of (fold %d/%d) of %s is : %f" % (i+1 , n, name, mape_r)
	# predict on test
	predict_res = regressor.predict(sample_t[:, features])
	preds_on_test = DataFrame(list(zip(sample_id, predict_res)), columns = ["ID", "CODIS"])
	preds_on_test['ID'].astype(int)
	# save predictions
	store_csv(preds_on_test, name + ".csv")
	return predict_res
Example #9
def classify(X, y, cv):
    #clf = DecisionTreeClassifier()
    #clf = RandomForestClassifier()
    #clf = AdaBoostClassifier()
    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score)/score.shape[0]))
    clf = clf.fit(X,y)
    #print 'Feature Importances'
    #print clf.feature_importances_
    #X = clf.transform(X,threshold=.3)
    
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))
    fp=0
    tp=0
    fn=0
    tn=0
    for a in range(len(y)):
        if y[a]==preds[a]:
            if preds[a]==0:
                tn+=1
            elif preds[a]==1:
                tp+=1
        elif preds[a]==1:fp+=1
        elif preds[a]==0:fn+=1
    
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp)/(tp+fp))
    print('recall (tp)/(tp+fn):', float(tp)/(tp+fn))
    print('false positive rate (fp)/(fp+tn):', float(fp)/(fp+tn))
    print('false discovery rate (fp)/(fp+tp):', float(fp)/(fp+tp))
    print('prediction accuracy: %s%s\n' % (100*float(tp+tn)/(tp+tn+fp+fn), '%'))
    return clf
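The hand-rolled confusion counts above can be cross-checked against scikit-learn's own metrics. A sketch, placed inside classify after preds is computed, assuming the same binary 0/1 labels:

from sklearn.metrics import confusion_matrix, precision_score, recall_score

tn, fp, fn, tp = confusion_matrix(y, preds, labels=[0, 1]).ravel()
print('precision:', precision_score(y, preds))
print('recall:', recall_score(y, preds))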
Example #10
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []

    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
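One way the returned pair might be consumed: cross-validate each algorithm in turn. A sketch; X, y and the import are assumed to exist in the calling script:

from sklearn.model_selection import cross_val_score

algorithms, names = models()
for name, algo in algorithms:
    scores = cross_val_score(algo, X, y, cv=5)
    print('%s: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))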
Example #11
	def _init_estimator(self, k):
		est_args = self.est_args.copy()
		est_name = '{}/{}'.format(self.name, k)
		# TODO: consider whether to set a random_state here at all. The random_state of each
		# estimator can be set by users via est_configs in the main program, so it need not
		# be set here; more importantly, if an estimator has no random_state parameter,
		# this assignment can cause problems.
		if est_args.get('random_state', None) is None:
			est_args['random_state'] = copy.deepcopy(self.seed)
		else:
			print("RED ALERT...(SKKFoldWrapper)")
			est_args['random_state'] = est_args['random_state'] + k ** 2

		estimator = ExtraTreesClassifier(**est_args)
		print("ESTIMATOR: ExtraTreesClassifier")

		return estimator
Example #12
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
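The manual StratifiedKFold loop above is essentially cross_val_score with the same splitter. A condensed equivalent for a single classifier (a sketch; the imports used in the example are assumed):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(ExtraTreesClassifier(), df_xmat.values, df_ymat_cat,
                         cv=StratifiedKFold(n_splits=10))
print(scores.mean(), scores.std())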
Example #13
    def setUpClass(cls):
        cls._prep_data(cls)

        n_iter = 2
        cls.srfc_grid = RandomizedSearchCV(StreamingEXTC(n_jobs=2, verbose=1),
                                           param_distributions=SRFCGRID,
                                           scoring='roc_auc',
                                           n_iter=n_iter * 10,
                                           verbose=2,
                                           n_jobs=3,
                                           cv=4)

        cls.rfc_grid = RandomizedSearchCV(ExtraTreesClassifier(n_jobs=2),
                                          param_distributions=RFCGRID,
                                          scoring='roc_auc',
                                          n_iter=n_iter,
                                          verbose=2,
                                          n_jobs=3,
                                          cv=4)
Example #14
 def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
     self._hyperparams = {
         'n_estimators': n_estimators,
         'criterion': criterion,
         'max_depth': max_depth,
         'min_samples_split': min_samples_split,
         'min_samples_leaf': min_samples_leaf,
         'min_weight_fraction_leaf': min_weight_fraction_leaf,
         'max_features': max_features,
         'max_leaf_nodes': max_leaf_nodes,
         'min_impurity_decrease': min_impurity_decrease,
         'min_impurity_split': min_impurity_split,
         'bootstrap': bootstrap,
         'oob_score': oob_score,
         'n_jobs': n_jobs,
         'random_state': random_state,
         'verbose': verbose,
         'warm_start': warm_start,
         'class_weight': class_weight}
     self._wrapped_model = SKLModel(**self._hyperparams)
#print ",dfeatures[features][:-1]\n",dfeatures[features][:-1]
pd.set_option('display.max_columns', None)
print "ALL columns of dfeatures[features]"
print dfeatures[features].head(1)

# create a test and training set
x_train, x_test, y_train, y_test = train_test_split(
    dfeatures[features],
    dfeatures.author_num.values,
    test_size=0.4,
    random_state=123)
x, y = dfeatures[features], dfeatures.author_num.values

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

scores = cross_val_score(etclf, x, y)
print(scores.mean())

# Print Confusion Matrix
print(metrics.confusion_matrix(etclf.predict(x_test), y_test))
# print authors
"""
# # PREVIOUS RESULT 0.671469386087

############# RESULT WITH ALL FEATURES ############
/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
0.148101533384
"""
Example #16
    if with_proba:
        adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")


build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
            "DecisionTreeAudit")
build_audit(
    BaggingClassifier(DecisionTreeClassifier(random_state=13,
                                             min_samples_leaf=5),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "DecisionTreeEnsembleAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(
    GradientBoostingClassifier(random_state=13, loss="exponential", init=None),
    "GradientBoostingAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(LogisticRegressionCV(), "LogisticRegressionAudit")
build_audit(
    BaggingClassifier(LogisticRegression(),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "LogisticRegressionEnsembleAudit")
build_audit(GaussianNB(), "NaiveBayesAudit")
build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=5),
            "RandomForestAudit")
Example #17
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
import warnings
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
    MLPClassifier(),
    NuSVC(probability=True),
    QuadraticDiscriminantAnalysis(),
    RandomForestClassifier(),
    SGDClassifier(loss='log_loss'),  # 'log' was renamed 'log_loss' in scikit-learn
    SVC(probability=True),
]
Example #18
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# In[29]:

preprocessor = make_pipeline(SelectKBest(f_classif, k=10),
                             PolynomialFeatures(2))

AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(),
                       GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree
}
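A minimal evaluation loop over dict_of_models (a sketch; X_train, X_test, y_train, y_test are assumed to exist in the notebook, and the task is assumed binary so the default f1_score averaging applies):

from sklearn.metrics import f1_score

for name, model in dict_of_models.items():
    model.fit(X_train, y_train)
    print(name, f1_score(y_test, model.predict(X_test)))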

# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
Example #19
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing

authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
features = list(authorship.columns)
features
features.remove("Author")
features.remove("Author_num")

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
authorship["random"] = [random.random() for i in range(841)]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123
)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)
Example #20
class BestClassifiers(object):
    '''
    SVM models for different patterns. Loads a model and enables prediction on new data.
    '''
    def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
        '''
        Initialize the pattern's object with the corresponding model file name and the
        best SVM classifier and pre-processing method identified previously.
        '''
        self.patternEnum = patternEnum
        self.pattern = Pattern(patternEnum)
        modelFile = str(
            patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
        self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
        self.preProcessMethod = "NONE"
        if (patternEnum == PatternEnum.EVENTUALLY):
            self.maxRandState = 196558
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.ALWAYS):
            self.maxRandState = 124255
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.FOLLOWS):
            self.maxRandState = 196588
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.PRECEDES):
            self.maxRandState = 187708
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.NEVER):
            self.maxRandState = 182526
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                max_depth=None, max_features=None, max_leaf_nodes=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
                oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.STEADY_STATE):
            self.maxRandState = 119746
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.UNTIL):
            self.maxRandState = 114007
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.INFINITELY_OFTEN):
            self.maxRandState = 150000
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features='log2', max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
        elif (patternEnum == PatternEnum.NEXT):
            self.maxRandState = 173977
            # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
            SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.RELEASE):
            self.maxRandState = 105454
            # random seed to shuffle data for training
            self.preProcessMethod = "SCALE"
            self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
              max_iter=-1, probability=False, random_state=None, shrinking=True,
              tol=0.001, verbose=False)
        elif (patternEnum == PatternEnum.WEAK_UNTIL):
            self.maxRandState = 163090
            # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
               max_depth=None, max_features=None, max_leaf_nodes=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)

    def getModel(self):
        "Get the classifier, and the unique class labels (the class names)"
        try:
            #             print("self.modelFile",self.modelFile)
            clf, preprocessor = pickle.load(open(self.modelFile, "rb"))
            #             print("Classifier found. It is loading.")
            return clf, preprocessor
        except (OSError,
                IOError):  # Model does not exist, first train then save it
            #             print("Classifier not found. New classifier is training.")
            X, preprocessor = processData(self.pattern.feature,
                                          self.preProcessMethod)
            # shuffle data
            shuffled_X, shuffled_y = shuffle(X,
                                             self.pattern.y,
                                             random_state=self.maxRandState)
            self.clf.fit(shuffled_X, shuffled_y)
            # save the model
            pickle.dump((self.clf, preprocessor), open(self.modelFile, "wb"))
            return self.clf, preprocessor
        except Exception as e:
            print(e)

    def predict(self, properties):
        clf, preprocessor = self.getModel()
        if preprocessor:
            properties = preprocessor.transform(
                properties
            )  # apply the same pre-processing used for the training data
        targetMC = clf.predict(properties)
        return targetMC
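Sketch of end-to-end use; the PatternEnum values, processData() and the feature layout come from the surrounding project, so this is illustrative only:

model = BestClassifiers(PatternEnum.ALWAYS)
# 'properties' must be shaped like the training features before prediction
labels = model.predict(properties)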
Example #21
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Apply Some Featuring
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy object
x_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.fit_transform(X_test)
y_test = np.array(y_test.iloc[:, 0])
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
model = ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.6, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0,
           warm_start=False)  # min_impurity_split was removed from scikit-learn

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))
Example #22
state_filter = ['successful','failed'] #,'canceled', 'live']


# NOTE:  Adjust Trainingset / Testset division ratio:
divratio = 0.3


# Normalization (L1 & L2):
# NOTE:  Change 'normtype' value to 'l1' / 'l2' to change normalization type:
normtype = 'l2'  # or 'l1'


# model_selection is used for manually enabling the individual models.
# NOTE:  Setting the boolean value enables/disables a model.
model_selection = {
    'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ),  # removed/renamed defaults updated for current scikit-learn
    'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ),
    'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ),
    'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None) ),
    'GradientBoosting': (True, GradientBoostingClassifier(loss='log_loss', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ),
    'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ),
    'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ),
    'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ),
    'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv=5, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=None, l1_ratios=None) ),
    'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ),
    'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ),
    'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv=5) ),
    'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ),
    'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), #
    'Nu_SVM': (True, NuSVC(gamma='auto') ),
}
Example #23
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sys
import os
from tools import *

data = load_obj(sys.argv[1])
y = data["labels"]
X = data["features"]

clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1)

scores = cross_val_score(clf, X, y, cv=5)
save_obj(scores, "extra_trees_scores.pkl")
Example #24
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter

Predictii = [[] for _ in range(len(Y_Test))]

Accs = []

normlist = []
if Norm == "N1":
    normlist.append("N1")
Example #25
from uci_comparison import compare_estimators
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from rr_forest import RRForestClassifier
from rr_extra_forest import RRExtraTreesClassifier

estimators = {
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'RndRotForest': RRForestClassifier(n_estimators=20),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RndRotETrees': RRExtraTreesClassifier(n_estimators=20),
}

# optionally, pass a list of UCI dataset identifiers as the datasets parameter, e.g. datasets=['iris', 'diabetes']
# optionally, pass a dict of scoring functions as the metric parameter, e.g. metrics={'F1-score': f1_score}
compare_estimators(estimators)
Example #26
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()

K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    CLASSIFIER_MAP = {
    "DECISION_TREE": DECISION_TREE,
    "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
    "NAIVE_BAYS": NAIVE_BAYS,
    "K_N_N": K_N_N,
Example #27
# also tested this:
# svm.SVC(kernel='linear', C=1.0), GaussianNB()
# doesn't improve and takes long

#running crossvalidation score on all classifiers
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    print "%s \n Accuracy: %0.2f (+/- %0.2f)\n" % (clf, score.mean(), score.std() / 2)

#now let's go to OOS test
testX = test[['Sex01','Fare','SibSp','Parch','Pclass']]
medianFare = testX.Fare.median()
testX.Fare = testX.Fare.fillna(medianFare)

#print results to CSV files for Kaggle submission
clf = ExtraTreesClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('ETClf.csv',index=False)

clf = RandomForestClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('RFClf.csv',index=False)

clf = DecisionTreeClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId','Survived']].to_csv('DTClf.csv',index=False)

Example #28
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression model the default
    model = SGDClassifier(loss='log_loss',  # 'log' was renamed; 'n_iter' is now 'max_iter'
                          penalty='l2',
                          shuffle=True,
                          max_iter=5,
                          n_jobs=-1,
                          random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel,
                    class_weight="balanced",
                    cache_size=8096,
                    random_state=17,
                    probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='sqrt',  # 'auto' was removed from scikit-learn
                                    class_weight='balanced',
                                    random_state=39)
        model = AdaBoostClassifier(base_estimator=dt,
                                   n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced",
                                       n_jobs=-1,
                                       n_estimators=400,
                                       random_state=17,
                                       max_features='sqrt',
                                       max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17,
                                           n_estimators=400,
                                           max_features='sqrt')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17,
                                     n_estimators=400,
                                     n_jobs=-1,
                                     class_weight='balanced',
                                     max_depth=15,
                                     max_features='sqrt',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15,
                                    criterion='gini',
                                    max_features='sqrt',
                                    class_weight='balanced',
                                    random_state=39)
        model = BaggingClassifier(base_estimator=dt,
                                  n_estimators=400,
                                  random_state=17,
                                  n_jobs=-1,
                                  max_features=0.8,
                                  max_samples=0.8,
                                  bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(max_iter=10,  # 'n_iter' was renamed 'max_iter'
                                            class_weight='balanced',
                                            n_jobs=-1,
                                            random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1,
                           max_iter=10,  # 'n_iter' was renamed 'max_iter'
                           penalty='l2',
                           class_weight='balanced',
                           alpha=0.25)
    return model
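An illustrative call; the classifier-name constants (e.g. EXTRA_TREES) and the args object are defined elsewhere in the project, and the data variables are assumed loaded:

model = __get_classifier_model(EXTRA_TREES, args)
model.fit(X_train, y_train)
probabilities = model.predict_proba(X_test)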
Example #29
start = time.time()
reg = LGBMClassifier(n_estimators=200,  # LightGBM model inferred from 'y_pred_lgb' below
                     min_child_weight=10,
                     subsample=0.7,
                     colsample_bytree=0.7,
                     reg_alpha=0,
                     reg_lambda=0.5)
reg.fit(X_train, y_train)
end = time.time()
y_pred_lgb = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_lgb))
print(end - start)

start = time.time()
reg = ExtraTreesClassifier(n_estimators=100,
                           max_depth=7,
                           min_samples_leaf=10,
                           n_jobs=8,
                           random_state=4)
reg.fit(X_train, y_train)
end = time.time()
y_pred_et = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_et))
print(end - start)

start = time.time()
reg = KNeighborsClassifier(n_neighbors=4, algorithm='kd_tree')
reg.fit(X_train, y_train)
end = time.time()
y_pred_knn = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_knn))
print(end - start)
Example #30
    Normalizer()
]

# %%

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(X, y),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')
#%%
count = 0
dict_test = {}
dict_all = {}
for i in range(len(scaler)):
    scaler_i = scaler[i]
    for j in range(len(classifier_test)):
        count += 1
Example #31
 def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
     '''
     Initialize the pattern's object with the corresponding model file name and the
     best SVM classifier and pre-processing method identified previously.
     '''
     self.patternEnum = patternEnum
     self.pattern = Pattern(patternEnum)
     modelFile = str(
         patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
     self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
     self.preProcessMethod = "NONE"
     if (patternEnum == PatternEnum.EVENTUALLY):
         self.maxRandState = 196558
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
             decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
             max_iter=-1, probability=False, random_state=None, shrinking=True,
             tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.ALWAYS):
         self.maxRandState = 124255
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.FOLLOWS):
         self.maxRandState = 196588
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.PRECEDES):
         self.maxRandState = 187708
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.NEVER):
         self.maxRandState = 182526
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
             oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.STEADY_STATE):
         self.maxRandState = 119746
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.UNTIL):
         self.maxRandState = 114007
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.INFINITELY_OFTEN):
         self.maxRandState = 150000
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
     elif (patternEnum == PatternEnum.NEXT):
         self.maxRandState = 173977
         # random seed to shuffle data for training
         self.preProcessMethod = "NORMALIZE"
         self.clf = \
         SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.RELEASE):
         self.maxRandState = 105454
         # random seed to shuffle data for training
         self.preProcessMethod = "SCALE"
         self.clf = \
         SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False)
     elif (patternEnum == PatternEnum.WEAK_UNTIL):
         self.maxRandState = 163090
         # random seed to shuffle data for training
         self.preProcessMethod = "NONE"
         self.clf = \
         ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Example #32
def main():
    ### Import data sets
    l_train = pd.read_csv('lemon_training.csv')
    l_test = pd.read_csv('lemon_test.csv')


    ### Clean/prepare data sets
    l_train = l_train.dropna(axis=1)
    l_test = l_test.dropna(axis=1)

    features = list(l_train.describe().columns)
    features.remove('RefId')
    features.remove('IsBadBuy')


    ### Create test and training sets
    train_features = l_train[features].values
    train_class = l_train.IsBadBuy.values
    OSS_features = l_test[features].values

    # Seed PRNG
    np.random.seed(1234)
    X_train, X_test, y_train, y_test = \
        model_selection.train_test_split(train_features, train_class, test_size=.3)  # sklearn.cross_validation was replaced by sklearn.model_selection


    ### Build model
    # model = naive_bayes.GaussianNB().fit(X_train, y_train)
    model = ExtraTreesClassifier(max_depth=8).fit(X_train, y_train)
    model.score(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)


    ### Stats
    print('training:\n', metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train))
    print('test:\n', metrics.confusion_matrix(y_test, y_pred_test))
    print(metrics.classification_report(y_test, y_pred_test))
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, y_pred_train, pos_label=1)
    fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test, pos_label=1)
    print('train MA: ', model.score(X_train, y_train))
    print('test MA: ', model.score(X_test, y_test))
    print('train AUC: ', metrics.auc(fpr_train, tpr_train))
    print('test AUC: ', metrics.auc(fpr_test, tpr_test))



    # Cross Validation
    AUCs = []
    # for i in xrange(10):
    #     X_train, X_test, y_train, y_test = \
    #     cross_validation.train_test_split(train_features, train_class, test_size=.3)
    #     y_pred_test = model.fit(X_train, y_train).predict(X_test)
    #     fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test, pos_label=1)
    #     AUCs.append(metrics.auc(fpr_test, tpr_test))
        
    # print 'AUC cross val: ', AUCs


    ### Output predictions for OSS data
    OSS_features = l_test[features].values
    y_pred_OSS = model.predict(OSS_features)
    submission = pd.DataFrame({ 'RefId' : l_test.RefId, 'prediction' : y_pred_OSS })
Example #33
    if y[a] == preds[a]:
      if preds[a] == 0:
        tn += 1
      elif preds[a] == 1:
        tp += 1
    elif preds[a] == 1:
      fp += 1
    elif preds[a] == 0:
      fn += 1

  print('correct positives:', tp)
  print('correct negatives:', tn)
  print('false positives:', fp)
  print('false negatives:', fn)

  extra_trees = ExtraTreesClassifier()
  extra_score = cross_val_score(extra_trees, X, y, cv=i)
  print('\nextra trees %s-fold cross validation accuracy: %s' % (i, sum(extra_score)/extra_score.shape[0]))

  extra_fit = extra_trees.fit(X, y)
  print('Feature Importances %s' % (extra_fit.feature_importances_))
  for f in extra_fit.feature_importances_:
    print('{}: {}'.format(next(features), f))

  # classifier.transform() was removed from scikit-learn; SelectFromModel replaces it
  selector = SelectFromModel(extra_fit, threshold=min(extra_fit.feature_importances_), prefit=True)
  X_for_preds = selector.transform(X)
  preds = extra_fit.predict(X_for_preds)
  print('predictions counter %s' % (Counter(preds)))
  fp = 0
  tp = 0
  fn = 0
  tn = 0
Example #34
import random
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing
authorship = read_csv('http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv')
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship['Author_num'] = le.transform(authorship['Author'])

#What are some of the stop words we're looking at?
features = list(authorship.columns)
features
features.remove('Author')
features.remove('Author_num')

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
authorship['random'] = [random.random() for i in range(841)]
x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)
print(metrics.classification_report(etclf.predict(x_test), y_test))
Example #35
def etree_classify(X,Y):
	clf = ExtraTreesClassifier(n_estimators=500, max_depth=10, criterion='gini',min_samples_split=2, \
			min_samples_leaf=1, max_features=None, bootstrap=False, oob_score=False, n_jobs=-1)
	clf.fit(X,Y)
	return clf
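After etree_classify returns, the impurity-based importances can be ranked directly (a sketch; X, Y assumed loaded):

clf = etree_classify(X, Y)
order = clf.feature_importances_.argsort()[::-1]
for rank, idx in enumerate(order[:10], start=1):
    print(rank, idx, clf.feature_importances_[idx])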
Example #36
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = PCA(n_components=500, svd_solver='randomized')  # RandomizedPCA was removed from scikit-learn

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200,
                                              max_features=4),
    max_features=0.5,
    n_jobs=2,
    verbose=1)
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=200,
                                                          max_features=4,
                                                          learning_rate=0.3,
                                                          verbose=0)
random_forest_classifier = RandomForestClassifier(n_estimators=2)
logistic_regression = LogisticRegression(C=0.5)
ridge_classifier = RidgeClassifier(alpha=0.1, solver='svd')
bayes = MultinomialNB()
sgd = SGDClassifier()
Example #37
train_data = pd.read_csv('resources/train.csv')
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']

XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100

models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]

model_title = [
    'DecisionTree', 'Bagging', 'RandomForest', 'ExtraTrees', 'AdaBoost'
]

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))

for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))

    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
Example #38
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception as exc:
            print("Exception occurred for:", name, exc)
    return metrix,test_accuracy,names