def main():
    # catch parameters
    training_set_features = sys.argv[1]
    training_set_classes = training_set_features.replace('features', 'classes')
    forest_file = sys.argv[2]

    # load training features (.npy files are binary, so open them in 'rb' mode)
    with open(training_set_features, 'rb') as f:
        training_feature_vector = numpy.load(f)
    if 1 == training_feature_vector.ndim:
        training_feature_vector = numpy.expand_dims(training_feature_vector, -1)
    with open(training_set_classes, 'rb') as f:
        training_class_vector = numpy.load(f)

    # prepare and train the decision forest
    # note: n_jobs is assumed to be a module-level constant, and
    # compute_importances only exists in old scikit-learn releases
    # (feature_importances_ is always available in newer ones)
    forest = ExtraTreesClassifier(n_estimators=200,
                                  criterion='entropy',
                                  max_features=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  max_depth=500,
                                  bootstrap=True,
                                  oob_score=False,
                                  random_state=0,
                                  n_jobs=n_jobs,
                                  compute_importances=True)
    forest.fit(training_feature_vector, training_class_vector)

    # save the decision forest
    with open(forest_file, 'wb') as f:
        pickle.dump(forest, f)
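# A hedged invocation sketch for the script above; the script and data file
# names are hypothetical, but the features -> classes path convention is the
# one the code relies on:
#   $ python train_forest.py lemon_features.npy forest.pkl
print('lemon_features.npy'.replace('features', 'classes'))  # lemon_classes.npy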
def set_selection_method(config):
    """
    Given the configuration settings, this function instantiates the configured
    feature selection method, initialized with the preset parameters.

    TODO: implement the same method using reflection (load the class
    dynamically at runtime)

    @param config: the configuration file object loaded using yaml.load()
    @return: an object that implements the TransformerMixin interface
        (with fit(), fit_transform() and transform() methods).
    """
    transformer = None
    selection_cfg = config.get("feature_selection", None)
    if selection_cfg:
        method_name = selection_cfg.get("method", None)

        # checks for RandomizedLasso
        if method_name == "RandomizedLasso":
            p = selection_cfg.get("parameters", None)
            if p:
                transformer = \
                    RandomizedLasso(alpha=p.get("alpha", "aic"),
                                    scaling=p.get("scaling", .5),
                                    sample_fraction=p.get('sample_fraction', .75),
                                    n_resampling=p.get('n_resampling', 200),
                                    selection_threshold=p.get('selection_threshold', .25),
                                    fit_intercept=p.get('fit_intercept', True),
                                    # TODO: set verbosity according to global level
                                    verbose=True,
                                    normalize=p.get('normalize', True),
                                    max_iter=p.get('max_iter', 500),
                                    n_jobs=p.get('n_jobs', 1))
            else:
                transformer = RandomizedLasso()

        # checks for ExtraTreesClassifier
        elif method_name == "ExtraTreesClassifier":
            p = selection_cfg.get("parameters", None)
            if p:
                # note: min_density and compute_importances only exist in
                # older scikit-learn releases
                transformer = \
                    ExtraTreesClassifier(n_estimators=p.get('n_estimators', 10),
                                         max_depth=p.get('max_depth', None),
                                         min_samples_split=p.get('min_samples_split', 1),
                                         min_samples_leaf=p.get('min_samples_leaf', 1),
                                         min_density=p.get('min_density', 1),
                                         max_features=p.get('max_features', 'auto'),
                                         bootstrap=p.get('bootstrap', False),
                                         compute_importances=p.get('compute_importances', True),
                                         n_jobs=p.get('n_jobs', 1),
                                         random_state=p.get('random_state', None),
                                         # TODO: set verbosity according to global level
                                         verbose=True)
            else:
                transformer = ExtraTreesClassifier()

    return transformer
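# A minimal sketch of the configuration set_selection_method() expects,
# assuming the YAML layout implied by the config.get(...) lookups above;
# the parameter values are illustrative, not the project's defaults, and the
# constructor defaults in the function target an older scikit-learn release.
import yaml

config_text = """
feature_selection:
  method: ExtraTreesClassifier
  parameters:
    n_estimators: 50
    max_features: auto
    n_jobs: 2
"""
config = yaml.safe_load(config_text)
transformer = set_selection_method(config)  # configured ExtraTreesClassifier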
def select_features(X, y, X_test, n_features=100):
    '''
    select the top n_features
    '''
    forest = ExtraTreesClassifier(n_estimators=100, random_state=571)
    forest.fit(X, y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]  # feature indices, most important first
    X = X[:, indices[0:n_features]]
    X_test = X_test[:, indices[0:n_features]]
    return X, X_test
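# A hedged usage sketch for select_features() with synthetic data; the shapes
# are illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 500)        # 200 training samples, 500 features
y = rng.randint(0, 2, 200)    # binary labels
X_test = rng.rand(50, 500)

X_top, X_test_top = select_features(X, y, X_test, n_features=100)
print(X_top.shape, X_test_top.shape)  # (200, 100) (50, 100)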
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def run_decision_tree_probabilistic_classification(train, train_labels, validate, validate_labels):
    # transform counts to TFIDF features
    tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train = tfidf.fit_transform(train).toarray()
    validate = tfidf.transform(validate).toarray()

    # encode labels (validate_labels is assumed to use the same label set,
    # so that the predict_proba columns line up for log_loss)
    label_encode = preprocessing.LabelEncoder()
    train_labels = label_encode.fit_transform(train_labels)

    # despite the name, this is an extra-trees ensemble, not a single decision tree
    decisionTree = ExtraTreesClassifier(n_jobs=4, n_estimators=1000, max_features=20,
                                        min_samples_split=3, bootstrap=False,
                                        verbose=3, random_state=23)
    decisionTree.fit(train, train_labels)
    predicted_labels = decisionTree.predict_proba(validate)
    print("Extra Trees Classifier LogLoss")
    print(str(metrics.log_loss(validate_labels, predicted_labels)))
def __init__(
    self,
    sc=None,
    partitions="auto",
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=False,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
):
    ExtraTreesClassifier.__init__(
        self,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
        class_weight=class_weight,
    )
    self.sc = sc
    self.partitions = partitions
class ExtraTreesClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=False, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
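# A hedged usage sketch for the wrapper above; it assumes SKLModel is an alias
# for scikit-learn's ExtraTreesClassifier (as the hyperparameter names
# suggest) and uses synthetic data.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier as SKLModel  # assumed alias

X = np.random.RandomState(0).rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)

impl = ExtraTreesClassifierImpl(n_estimators=25)
impl.fit(X, y)
print(impl.predict(X[:3]))
print(impl.predict_proba(X[:3]).shape)  # (3, 2)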
def build_sample(regressor, name):
    # sample_X, sample_y, sample_t, sample_id, features and SEED are assumed
    # to be defined at module level; the bare .shape/.columns expressions in
    # the original were notebook-style inspection (note X_new is a numpy
    # array, so it has a .shape but no .columns attribute)
    print(sample_X.shape)

    # feature selection
    clf = ExtraTreesClassifier()
    clf = clf.fit(sample_X, sample_y)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(sample_X)
    print(X_new.shape)

    # repeat the CV procedure 10 times to get more precise results
    n = 10
    # for each iteration, randomly hold out 10% of the data as CV set
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            sample_X[:, features], sample_y, test_size=.10, random_state=i * SEED)
        # train...
        regressor = regressor.fit(X_train, y_train)
        # save model
        # store_pkl(regressor, name + ".pkl")
        # predict on the held-out fold
        preds = regressor.predict(X_cv)
        # preds = DataFrame(preds, columns=["prime_tot_ttc_preds"])
        # print preds
        # print y_cv
        # mape
        mape_r = mape(y_cv, preds)
        print("MAPE of (fold %d/%d) of %s is : %f" % (i + 1, n, name, mape_r))

    # predict on test
    predict_res = regressor.predict(sample_t[:, features])
    preds_on_test = DataFrame(list(zip(sample_id, predict_res)),
                              columns=["ID", "CODIS"])
    # note: astype() returns a copy; this line has no effect unless reassigned
    preds_on_test['ID'].astype(int)
    # save predictions
    store_csv(preds_on_test, name + ".csv")
    return predict_res
def classify(X, y, cv):
    # clf = DecisionTreeClassifier()
    # clf = RandomForestClassifier()
    # clf = AdaBoostClassifier()
    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)
    # print 'Feature Importances'
    # print clf.feature_importances_
    # X = clf.transform(X, threshold=.3)
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(clf.predict(X)))

    # tally the confusion-matrix cells by hand
    fp = 0
    tp = 0
    fn = 0
    tn = 0
    for a in range(len(y)):
        if y[a] == preds[a]:
            if preds[a] == 0:
                tn += 1
            elif preds[a] == 1:
                tp += 1
        elif preds[a] == 1:
            fp += 1
        elif preds[a] == 0:
            fn += 1
    print('correct positives:', tp)
    print('correct negatives:', tn)
    print('false positives:', fp)
    print('false negatives:', fn)
    print('precision:', float(tp) / (tp + fp))
    print('recall (tp)/(tp+fn):', float(tp) / (tp + fn))
    print('false positive rate (fp)/(fp+tn):', float(fp) / (fp + tn))
    print('false positive rate2 (fp)/(fp+tp):', float(fp) / (fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * float(tp + tn) / (tp + tn + fp + fn), '%'))
    return clf
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []
    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
def _init_estimator(self, k):
    est_args = self.est_args.copy()
    est_name = '{}/{}'.format(self.name, k)
    # TODO: consider whether to add a random_state here. The random_state of
    # each estimator can be set via est_configs in the main program by users,
    # so we need not set it here. More importantly, if some estimator has no
    # random_state parameter, this assignment can cause problems.
    if est_args.get('random_state', None) is None:
        est_args['random_state'] = copy.deepcopy(self.seed)
    else:
        print("RED ALERT...(SKKFoldWrapper)")
        est_args['random_state'] = est_args['random_state'] + k ** 2
    estimator = ExtraTreesClassifier(**est_args)
    print("ESTIMATOR: ExtraTreesClassifier")
    return estimator
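# A small sketch of the seeding rule above, outside the wrapper class;
# est_args here is a hypothetical stand-in for an est_configs entry.
from sklearn.ensemble import ExtraTreesClassifier

est_args = {'n_estimators': 50, 'random_state': 7}
for k in range(3):  # k indexes the fold
    args = est_args.copy()
    args['random_state'] = args['random_state'] + k ** 2  # same offset rule
    est = ExtraTreesClassifier(**args)
    print(k, est.random_state)  # 7, 8, 11: distinct seeds per fold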
def defaultModels(df_xmat, df_ymat_cat):
    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]
    cv = StratifiedKFold(n_splits=10)
    res = []
    for clf in classifiers:
        print('processing...' + str(clf)[:10])
        metrics_cv = []
        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):
            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]
            clf.fit(X_train, y_train)
            metrics_cv.append(clf.score(X_test, y_test))
        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])
    return res
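# A hedged usage sketch for defaultModels() on a small built-in dataset; it
# assumes the classifier imports at the top of the function's module.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df_xmat = pd.DataFrame(iris.data)
df_ymat_cat = list(iris.target)

for name, mean_acc, std_acc in defaultModels(df_xmat, df_ymat_cat):
    print(name, round(mean_acc, 3), '+/-', round(std_acc, 3))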
def setUpClass(cls):
    cls._prep_data(cls)
    n_iter = 2
    cls.srfc_grid = RandomizedSearchCV(StreamingEXTC(n_jobs=2, verbose=1),
                                       param_distributions=SRFCGRID,
                                       scoring='roc_auc',
                                       n_iter=n_iter * 10,
                                       verbose=2,
                                       n_jobs=3,
                                       cv=4)
    cls.rfc_grid = RandomizedSearchCV(ExtraTreesClassifier(n_jobs=2),
                                      param_distributions=RFCGRID,
                                      scoring='roc_auc',
                                      n_iter=n_iter,
                                      verbose=2,
                                      n_jobs=3,
                                      cv=4)
def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
             min_samples_split=2, min_samples_leaf=1,
             min_weight_fraction_leaf=0.0, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, bootstrap=False, oob_score=False,
             n_jobs=None, random_state=None, verbose=0, warm_start=False,
             class_weight='balanced'):
    self._hyperparams = {
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'min_impurity_split': min_impurity_split,
        'bootstrap': bootstrap,
        'oob_score': oob_score,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'verbose': verbose,
        'warm_start': warm_start,
        'class_weight': class_weight}
    self._wrapped_model = SKLModel(**self._hyperparams)
# print ",dfeatures[features][:-1]\n", dfeatures[features][:-1]
pd.set_option('display.max_columns', None)
print("ALL columns of dfeatures[features]")
print(dfeatures[features].head(1))

# create a test and training set
x_train, x_test, y_train, y_test = train_test_split(
    dfeatures[features], dfeatures.author_num.values,
    test_size=0.4, random_state=123)
x, y = dfeatures[features], dfeatures.author_num.values

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)
scores = cross_val_score(etclf, x, y)
print(scores.mean())

# Print Confusion Matrix
print(metrics.confusion_matrix(etclf.predict(x_test), y_test))
# print authors

"""
# PREVIOUS RESULT 0.671469386087

############# RESULT WITH ALL FEATURES ############
/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401:
Warning: The least populated class in y has only 1 members, which is too few.
The minimum number of labels for any class cannot be less than n_folds=3.
% (min_labels, self.n_folds)), Warning)
0.148101533384
"""
if with_proba == True:
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                               columns=["probability_0", "probability_1"])
    adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
store_csv(adjusted, name + ".csv")

build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
            "DecisionTreeAudit")
build_audit(
    BaggingClassifier(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
                      random_state=13, n_estimators=3, max_features=0.5),
    "DecisionTreeEnsembleAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(
    GradientBoostingClassifier(random_state=13, loss="exponential", init=None),
    "GradientBoostingAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(LogisticRegressionCV(), "LogisticRegressionAudit")
build_audit(
    BaggingClassifier(LogisticRegression(), random_state=13,
                      n_estimators=3, max_features=0.5),
    "LogisticRegressionEnsembleAudit")
build_audit(GaussianNB(), "NaiveBayesAudit")
build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=5),
            "RandomForestAudit")
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import train_test_split
import shutil
import warnings
from statistics import mean

warnings.filterwarnings('ignore')

classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
    MLPClassifier(),
    NuSVC(probability=True),
    QuadraticDiscriminantAnalysis(),
    RandomForestClassifier(),
    SGDClassifier(loss='log'),
    SVC(probability=True),
# sklearn.ensemble.gradient_boosting was the old private module path;
# the public import is from sklearn.ensemble
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# In[29]:

preprocessor = make_pipeline(SelectKBest(f_classif, k=10), PolynomialFeatures(2))

AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(), GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree
}

# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
import random
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier  # old path: sklearn.ensemble.forest
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
features = list(authorship.columns)
features
features.remove("Author")
features.remove("Author_num")

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
authorship["random"] = [random.random() for i in range(841)]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values,
    test_size=0.4, random_state=123
)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)
class BestClassifiers(object):
    '''
    SVM models of different patterns.
    It loads a model and enables prediction with new data.
    '''

    def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
        '''
        Initialize the pattern's object with the corresponding model file name
        and the best SVM classifier and pre-processing method identified before.
        '''
        self.patternEnum = patternEnum
        self.pattern = Pattern(patternEnum)
        modelFile = str(patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
        self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
        self.preProcessMethod = "NONE"
        if patternEnum == PatternEnum.EVENTUALLY:
            self.maxRandState = 196558  # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
                SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape=None, degree=3, gamma=0.1,
                    kernel='rbf', max_iter=-1, probability=False,
                    random_state=None, shrinking=True, tol=0.001, verbose=False)
        elif patternEnum == PatternEnum.ALWAYS:
            self.maxRandState = 124255  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=50, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)
        elif patternEnum == PatternEnum.FOLLOWS:
            self.maxRandState = 196588  # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
                SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape=None, degree=3, gamma=1.0,
                    kernel='rbf', max_iter=-1, probability=False,
                    random_state=None, shrinking=True, tol=0.001, verbose=False)
        elif patternEnum == PatternEnum.PRECEDES:
            self.maxRandState = 187708  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)
        elif patternEnum == PatternEnum.NEVER:
            self.maxRandState = 182526  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=200, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)
        elif patternEnum == PatternEnum.STEADY_STATE:
            self.maxRandState = 119746  # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
                SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape=None, degree=3, gamma=0.0001,
                    kernel='rbf', max_iter=-1, probability=False,
                    random_state=None, shrinking=True, tol=0.001, verbose=False)
        elif patternEnum == PatternEnum.UNTIL:
            self.maxRandState = 114007  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=30, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)
        elif patternEnum == PatternEnum.INFINITELY_OFTEN:
            self.maxRandState = 150000  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features='log2', max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=10, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)
        elif patternEnum == PatternEnum.NEXT:
            self.maxRandState = 173977  # random seed to shuffle data for training
            self.preProcessMethod = "NORMALIZE"
            self.clf = \
                SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape=None, degree=3, gamma=1.0,
                    kernel='rbf', max_iter=-1, probability=False,
                    random_state=None, shrinking=True, tol=0.001, verbose=False)
        elif patternEnum == PatternEnum.RELEASE:
            self.maxRandState = 105454  # random seed to shuffle data for training
            self.preProcessMethod = "SCALE"
            self.clf = \
                SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape=None, degree=3, gamma=0.0001,
                    kernel='rbf', max_iter=-1, probability=False,
                    random_state=None, shrinking=True, tol=0.001, verbose=False)
        elif patternEnum == PatternEnum.WEAK_UNTIL:
            self.maxRandState = 163090  # random seed to shuffle data for training
            self.preProcessMethod = "NONE"
            self.clf = \
                ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=30, n_jobs=1, oob_score=False,
                                     random_state=0, verbose=0, warm_start=False)

    def getModel(self):
        "Get the classifier, and the unique class labels (the class names)"
        try:
            # print("self.modelFile", self.modelFile)
            clf, preprocessor = pickle.load(open(self.modelFile, "rb"))
            # print("Classifier found. It is loading.")
            return clf, preprocessor
        except (OSError, IOError):
            # Model does not exist: first train, then save it
            # print("Classifier not found. New classifier is training.")
            X, preprocessor = processData(self.pattern.feature, self.preProcessMethod)
            # shuffle data
            shuffled_X, shuffled_y = shuffle(X, self.pattern.y,
                                             random_state=self.maxRandState)
            self.clf.fit(shuffled_X, shuffled_y)
            # save the model
            pickle.dump((self.clf, preprocessor), open(self.modelFile, "wb"))
            return self.clf, preprocessor
        except Exception as e:
            print(e)

    def predict(self, properties):
        clf, preprocessor = self.getModel()
        if preprocessor:
            # apply the pre-processing method used for the training data
            properties = preprocessor.transform(properties)
        targetMC = clf.predict(properties)
        return targetMC
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Apply some featuring
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy objects (fit on the training set only, then reuse the
# fitted transformer on the test set)
x_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.transform(X_test)
y_test = np.array(y_test.iloc[:, 0])   # .ix was removed from pandas; use .iloc
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
# (min_impurity_split is an old scikit-learn parameter name)
model = ExtraTreesClassifier(bootstrap=False, class_weight=None,
                             criterion='entropy', max_depth=None,
                             max_features=0.6, max_leaf_nodes=None,
                             min_impurity_split=1e-07, min_samples_leaf=1,
                             min_samples_split=4, min_weight_fraction_leaf=0.0,
                             n_estimators=100, n_jobs=1, oob_score=False,
                             random_state=None, verbose=0, warm_start=False)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))
state_filter = ['successful', 'failed']  # ,'canceled', 'live']

# NOTE: Adjust training set / test set division ratio:
divratio = 0.3

# Normalization (L1 & L2):
# NOTE: Change 'normtype' value to 'l1' / 'l2' to change the normalization type:
normtype = 'l2'  # 'l1'

# model_selection is used for manually enabling the individual models.
# NOTE: Setting the boolean value enables/disables the model.
model_selection = {
    'ExtraTrees': (True,
                   ExtraTreesClassifier(n_estimators='warn', criterion='gini',
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features='auto', max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None, bootstrap=False,
                                        oob_score=False, n_jobs=None,
                                        random_state=None, verbose=0,
                                        warm_start=False, class_weight=None)),
    'RandomForest': (True,
                     RandomForestClassifier(max_depth=5, n_estimators=10,
                                            max_features=1)),
    'AdaBoost': (True,
                 AdaBoostClassifier(base_estimator=None, n_estimators=50,
                                    learning_rate=1.0, algorithm='SAMME.R',
                                    random_state=None)),
    'DecisionTree': (True,
                     DecisionTreeClassifier(criterion='gini', splitter='best',
                                            max_depth=5, min_samples_split=2,
                                            min_samples_leaf=1,
                                            min_weight_fraction_leaf=0.0,
                                            max_features=None, random_state=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            class_weight=None, presort=False)),
    'GradientBoosting': (True,
                         GradientBoostingClassifier(loss='deviance',
                                                    learning_rate=0.1,
                                                    n_estimators=100,
                                                    subsample=1.0,
                                                    criterion='friedman_mse',
                                                    min_samples_split=2,
                                                    min_samples_leaf=1,
                                                    min_weight_fraction_leaf=0.0,
                                                    max_depth=3,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    init=None, random_state=None,
                                                    max_features=None, verbose=0,
                                                    max_leaf_nodes=None,
                                                    warm_start=False,
                                                    presort='auto',
                                                    validation_fraction=0.1,
                                                    n_iter_no_change=None,
                                                    tol=0.0001)),
    'BernoulliNB': (True,
                    BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True,
                                class_prior=None)),
    'BaggingClassifier': (True,
                          BaggingClassifier(base_estimator=None, n_estimators=10,
                                            max_samples=1.0, max_features=1.0,
                                            bootstrap=True,
                                            bootstrap_features=False,
                                            oob_score=False, warm_start=False,
                                            n_jobs=None, random_state=None,
                                            verbose=0)),
    'NearestNeighbors': (True,
                         KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                              algorithm='auto', leaf_size=30,
                                              p=2, metric='minkowski',
                                              metric_params=None,
                                              n_jobs=None)),  # (n_neighbors=4) ),
    'LogisticRegressionCV': (True,
                             LogisticRegressionCV(Cs=10, fit_intercept=True,
                                                  cv='warn', dual=False,
                                                  penalty='l2', scoring=None,
                                                  solver='lbfgs', tol=0.0001,
                                                  max_iter=100, class_weight=None,
                                                  n_jobs=None, verbose=0,
                                                  refit=True,
                                                  intercept_scaling=1.0,
                                                  multi_class='warn',
                                                  random_state=None,
                                                  l1_ratios=None)),
    'LDA': (True,
            LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None,
                                       n_components=None, store_covariance=False,
                                       tol=0.0001)),
    'LogisticRegression': (True,
                           LogisticRegression(penalty='l2', dual=False,
                                              tol=0.0001, C=1.0,
                                              fit_intercept=True,
                                              intercept_scaling=1,
                                              class_weight=None,
                                              random_state=None, solver='warn',
                                              max_iter=100, multi_class='warn',
                                              verbose=0, warm_start=False,
                                              n_jobs=None, l1_ratio=None)),
    'CalibratedClassifierCV': (True,
                               CalibratedClassifierCV(base_estimator=None,
                                                      method='sigmoid',
                                                      cv='warn')),
    'LinearSVC': (True,
                  LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                            tol=0.0001, C=1.0, multi_class='ovr',
                            fit_intercept=True, intercept_scaling=1,
                            class_weight=None, verbose=0, random_state=None,
                            max_iter=1000)),
    'LinearSVM': (True, SVC(kernel='linear', C=0.025)),  # (C=0.01, penalty='l1', dual=False) ),
    'RBF_SVM': (True, SVC(gamma='auto')),  # gamma=2, C=1) ),
    # 'Nu_SVM': (True, NuSVC(gamma='auto')),
from sklearn.ensemble import ExtraTreesClassifier  # old path: sklearn.ensemble.forest
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sys
import os

from tools import *

data = load_obj(sys.argv[1])
y = data["labels"]
X = data["features"]

clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1)
scores = cross_val_score(clf, X, y, cv=5)
save_obj(scores, "extra_trees_scores.pkl")
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))     # single extra tree
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))   # extra-trees ensemble
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter

Predictii = [[] for _ in range(len(Y_Test))]
Accs = []
normlist = []
if Norm == "N1":
    normlist.append("N1")
from uci_comparison import compare_estimators
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier  # old path: sklearn.ensemble.forest
from rr_forest import RRForestClassifier
from rr_extra_forest import RRExtraTreesClassifier

estimators = {
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'RndRotForest': RRForestClassifier(n_estimators=20),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RndRotETrees': RRExtraTreesClassifier(n_estimators=20),
}

# optionally, pass a list of UCI dataset identifiers as the datasets parameter,
# e.g. datasets=['iris', 'diabetes']
# optionally, pass a dict of scoring functions as the metrics parameter,
# e.g. metrics={'F1-score': f1_score}
compare_estimators(estimators)
from sklearn.svm import SVR  # old path: sklearn.svm.classes
from sklearn.tree import DecisionTreeClassifier

DECISION_TREE = DecisionTreeClassifier()
LOGISTIC_REGRESSION = LogisticRegression()
NAIVE_BAYS = GaussianNB()
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)

# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()


def getClassifierMap():
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
        "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
        "NAIVE_BAYS": NAIVE_BAYS,
        "K_N_N": K_N_N,
# also tested this:
# svm.SVC(kernel='linear', C=1.0), GaussianNB()
# doesn't improve and takes long

# running cross-validation score on all classifiers
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    print("%s \n Accuracy: %0.2f (+/- %0.2f)\n" % (clf, score.mean(), score.std() / 2))

# now let's go to the OOS test
testX = test[['Sex01', 'Fare', 'SibSp', 'Parch', 'Pclass']]
medianFare = testX.Fare.median()
testX.Fare = testX.Fare.fillna(medianFare)

# print results to CSV files for Kaggle submission
clf = ExtraTreesClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId', 'Survived']].to_csv('ETClf.csv', index=False)

clf = RandomForestClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId', 'Survived']].to_csv('RFClf.csv', index=False)

clf = DecisionTreeClassifier()
clf.fit(X, y)
test['Survived'] = pd.Series(clf.predict(testX))
test[['PassengerId', 'Survived']].to_csv('DTClf.csv', index=False)
def __get_classifier_model(classifier, args):
    """
    Convenience function for obtaining a classification model

    Args:
        classifier(str): A string indicating the name of the classifier
        args: An arguments object

    Returns:
        A classification model based on the given classifier string
    """
    # Make SGD Logistic Regression the default model
    # (n_iter was renamed max_iter in newer scikit-learn releases)
    model = SGDClassifier(loss='log', penalty='l2', shuffle=True, n_iter=5,
                          n_jobs=-1, random_state=179)
    if classifier == SVM:
        model = SVC(kernel=args.kernel, class_weight="balanced",
                    cache_size=8096, random_state=17, probability=True)
    elif classifier == ADA_BOOST:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini',
                                    max_features='auto', class_weight='balanced',
                                    random_state=39)
        model = AdaBoostClassifier(base_estimator=dt, n_estimators=400,
                                   random_state=17)
    elif classifier == RF:
        # Configure the classifier to use all available CPU cores
        model = RandomForestClassifier(class_weight="balanced", n_jobs=-1,
                                       n_estimators=400, random_state=17,
                                       max_features='auto', max_depth=15,
                                       criterion='gini')
    elif classifier == GRADIENT_BOOST:
        model = GradientBoostingClassifier(random_state=17, n_estimators=400,
                                           max_features='auto')
    elif classifier == EXTRA_TREES:
        model = ExtraTreesClassifier(random_state=17, n_estimators=400,
                                     n_jobs=-1, class_weight='balanced',
                                     max_depth=15, max_features='auto',
                                     criterion='gini')
    elif classifier == BAGGING:
        dt = DecisionTreeClassifier(max_depth=15, criterion='gini',
                                    max_features='auto', class_weight='balanced',
                                    random_state=39)
        model = BaggingClassifier(base_estimator=dt, n_estimators=400,
                                  random_state=17, n_jobs=-1, max_features=0.8,
                                  max_samples=0.8, bootstrap=False)
    elif classifier == PASSIVE_AGGRESSIVE:
        model = PassiveAggressiveClassifier(n_iter=10, class_weight='balanced',
                                            n_jobs=-1, random_state=41)
    elif classifier == PERCEPTRON:
        model = Perceptron(n_jobs=-1, n_iter=10, penalty='l2',
                           class_weight='balanced', alpha=0.25)
    return model
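# A hedged usage sketch; the name constants (SVM, RF, EXTRA_TREES, ...) and
# the classifier imports live in the function's module, so this assumes they
# are importable from it (the module name here is hypothetical).
from types import SimpleNamespace
from classifier_factory import __get_classifier_model, EXTRA_TREES  # hypothetical module

args = SimpleNamespace(kernel='rbf')  # only the SVM branch reads args.kernel
model = __get_classifier_model(EXTRA_TREES, args)
print(type(model).__name__)  # ExtraTreesClassifier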
# (the call configuring this boosted-tree model begins above this excerpt)
                n_estimators=200, min_child_weight=10, subsample=0.7,
                colsample_bytree=0.7, reg_alpha=0, reg_lambda=0.5)
reg.fit(X_train, y_train)
end = time.time()
y_pred_lgb = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_lgb))
print(end - start)

start = time.time()
reg = ExtraTreesClassifier(n_estimators=100, max_depth=7, min_samples_leaf=10,
                           n_jobs=8, random_state=4)
reg.fit(X_train, y_train)
end = time.time()
y_pred_et = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_et))
print(end - start)

start = time.time()
reg = KNeighborsClassifier(n_neighbors=4, algorithm='kd_tree')
reg.fit(X_train, y_train)
end = time.time()
y_pred_knn = reg.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_knn))
print(end - start)
    # (the scaler list begins above this excerpt)
    Normalizer()
]

# %%
# ================= Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(X, y),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')

# %%
count = 0
dict_test = {}
dict_all = {}
for i in range(len(scaler)):
    scaler_i = scaler[i]
    for j in range(len(classifier_test)):
        count += 1
def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
    '''
    Initialize the pattern's object with the corresponding model file name
    and the best SVM classifier and pre-processing method identified before.
    '''
    self.patternEnum = patternEnum
    self.pattern = Pattern(patternEnum)
    modelFile = str(patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
    self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
    self.preProcessMethod = "NONE"
    if patternEnum == PatternEnum.EVENTUALLY:
        self.maxRandState = 196558  # random seed to shuffle data for training
        self.preProcessMethod = "NORMALIZE"
        self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=0.1,
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
    elif patternEnum == PatternEnum.ALWAYS:
        self.maxRandState = 124255  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features=None, max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=50, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
    elif patternEnum == PatternEnum.FOLLOWS:
        self.maxRandState = 196588  # random seed to shuffle data for training
        self.preProcessMethod = "NORMALIZE"
        self.clf = \
            SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=1.0,
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
    elif patternEnum == PatternEnum.PRECEDES:
        self.maxRandState = 187708  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features=None, max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=100, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
    elif patternEnum == PatternEnum.NEVER:
        self.maxRandState = 182526  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features=None, max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=200, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
    elif patternEnum == PatternEnum.STEADY_STATE:
        self.maxRandState = 119746  # random seed to shuffle data for training
        self.preProcessMethod = "NORMALIZE"
        self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=0.0001,
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
    elif patternEnum == PatternEnum.UNTIL:
        self.maxRandState = 114007  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features=None, max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=30, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
    elif patternEnum == PatternEnum.INFINITELY_OFTEN:
        self.maxRandState = 150000  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features='log2', max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=10, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
    elif patternEnum == PatternEnum.NEXT:
        self.maxRandState = 173977  # random seed to shuffle data for training
        self.preProcessMethod = "NORMALIZE"
        self.clf = \
            SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=1.0,
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
    elif patternEnum == PatternEnum.RELEASE:
        self.maxRandState = 105454  # random seed to shuffle data for training
        self.preProcessMethod = "SCALE"
        self.clf = \
            SVC(C=10000000.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape=None, degree=3, gamma=0.0001,
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
    elif patternEnum == PatternEnum.WEAK_UNTIL:
        self.maxRandState = 163090  # random seed to shuffle data for training
        self.preProcessMethod = "NONE"
        self.clf = \
            ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                 criterion='gini', max_depth=None,
                                 max_features=None, max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=30, n_jobs=1, oob_score=False,
                                 random_state=0, verbose=0, warm_start=False)
def main():
    ### Import data sets
    l_train = pd.read_csv('lemon_training.csv')
    l_test = pd.read_csv('lemon_test.csv')

    ### Clean/prepare data sets
    l_train = l_train.dropna(axis=1)
    l_test = l_test.dropna(axis=1)
    features = list(l_train.describe().columns)
    features.remove('RefId')
    features.remove('IsBadBuy')

    ### Create test and training sets
    train_features = l_train[features].values
    train_class = l_train.IsBadBuy.values
    OSS_features = l_test[features].values

    # Seed PRNG
    np.random.seed(1234)
    X_train, X_test, y_train, y_test = \
        cross_validation.train_test_split(train_features, train_class, test_size=.3)

    ### Build model
    # model = naive_bayes.GaussianNB().fit(X_train, y_train)
    model = ExtraTreesClassifier(max_depth=8).fit(X_train, y_train)
    model.score(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    ### Stats
    print('training:\n', metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train))
    print('test:\n', metrics.confusion_matrix(y_test, y_pred_test))
    print(metrics.classification_report(y_test, y_pred_test))
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, y_pred_train, pos_label=1)
    fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test, pos_label=1)
    print('train MA: ', model.score(X_train, y_train))
    print('test MA: ', model.score(X_test, y_test))
    print('train AUC: ', metrics.auc(fpr_train, tpr_train))
    print('test AUC: ', metrics.auc(fpr_test, tpr_test))

    # Cross Validation
    AUCs = []
    # for i in xrange(10):
    #     X_train, X_test, y_train, y_test = \
    #         cross_validation.train_test_split(train_features, train_class, test_size=.3)
    #     y_pred_test = model.fit(X_train, y_train).predict(X_test)
    #     fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test, pos_label=1)
    #     AUCs.append(metrics.auc(fpr_test, tpr_test))
    # print 'AUC cross val: ', AUCs

    ### Output predictions for OSS data
    OSS_features = l_test[features].values
    y_pred_OSS = model.predict(OSS_features)
    submission = pd.DataFrame({
        'RefId': l_test.RefId,
        'prediction': y_pred_OSS
    })
    # (inside a loop over range(len(y)), truncated above this excerpt)
    if y[a] == preds[a]:
        if preds[a] == 0:
            tn += 1
        elif preds[a] == 1:
            tp += 1
    elif preds[a] == 1:
        fp += 1
    elif preds[a] == 0:
        fn += 1
print('correct positives:', tp)
print('correct negatives:', tn)
print('false positives:', fp)
print('false negatives:', fn)

extra_trees = ExtraTreesClassifier()
extra_score = cross_val_score(extra_trees, X, y, cv=i)
print('\nextra trees %s-fold cross validation accuracy: %s' % (i, sum(extra_score) / extra_score.shape[0]))
extra_fit = extra_trees.fit(X, y)
print('Feature Importances %s' % (extra_fit.feature_importances_))
for f in extra_fit.feature_importances_:
    print('{}: {}'.format(next(features), f))

# note: estimator.transform() was removed from scikit-learn; newer code would
# use SelectFromModel(extra_fit, threshold=..., prefit=True) instead
X_for_preds = extra_fit.transform(X, threshold=min(extra_fit.feature_importances_))
preds = extra_fit.predict(X_for_preds)
print('predictions counter %s' % (Counter(extra_fit.predict(X_for_preds))))

fp = 0
tp = 0
fn = 0
tn = 0
import random
from pandas import read_csv
from sklearn.model_selection import train_test_split  # old path: sklearn.cross_validation
from sklearn.ensemble import ExtraTreesClassifier     # old path: sklearn.ensemble.forest
from sklearn import metrics
from sklearn import preprocessing

authorship = read_csv('http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv')
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship['Author_num'] = le.transform(authorship['Author'])

# What are some of the stop words we're looking at?
features = list(authorship.columns)
features
features.remove('Author')
features.remove('Author_num')

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
authorship['random'] = [random.random() for i in range(841)]
x_train, x_test, y_train, y_test = train_test_split(authorship[features],
                                                    authorship.Author_num.values,
                                                    test_size=0.4,
                                                    random_state=123)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)
print(metrics.classification_report(etclf.predict(x_test), y_test))
def etree_classify(X, Y):
    clf = ExtraTreesClassifier(n_estimators=500, max_depth=10, criterion='gini',
                               min_samples_split=2, min_samples_leaf=1,
                               max_features=None, bootstrap=False,
                               oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
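# A hedged usage sketch for etree_classify() with synthetic data; the shapes
# and labels are illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(300, 20)
Y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

clf = etree_classify(X, Y)
print(clf.score(X, Y))              # training accuracy
print(clf.feature_importances_[:5])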
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
# note: RandomizedPCA was removed from scikit-learn; newer code uses
# PCA(svd_solver='randomized') instead
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200, max_features=4),
    max_features=0.5, n_jobs=2, verbose=1)
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=200,
                                                          max_features=4,
                                                          learning_rate=0.3,
                                                          verbose=0)
random_forest_classifier = RandomForestClassifier(n_estimators=2)
logistic_regression = LogisticRegression(C=0.5)
ridge_classifier = RidgeClassifier(alpha=0.1, solver='svd')
bayes = MultinomialNB()
sgd = SGDClassifier()
train_data = pd.read_csv('resources/train.csv')
train_data = train_data.dropna()
train_data = preprocess_data(train_data)

X = train_data[['is_1', 'is_2', 'is_3', 'Fare', 'is_male', 'is_female']]
Y = train_data['Survived']
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2)

n_estimators = 100
models = [
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(n_estimators=n_estimators),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(n_estimators=n_estimators)
]
model_title = [
    'DecisionTree',
    'Bagging',
    'RandomForest',
    'ExtraTrees',
    'AdaBoost'
]

surv_preds, surv_probs, scores, fprs, tprs, thres = ([] for i in range(6))
for i, model in enumerate(models):
    print('Fitting {0}'.format(model_title[i]))
    clf = model.fit(XTrain, YTrain)
    surv_preds.append(model.predict(XTest))
    surv_probs.append(model.predict_proba(XTest))
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []

    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma='auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0), random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator=RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator=RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
                  ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))

    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            print("Exception Occurred :", name)
    return metrix, test_accuracy, names