Example No. 1
def get_model(rc="r", model="LIN"):

    if rc == "r":
        ## REGRESSORS
        if model == "LIN":
            rgrsr = linear_model.LinearRegression()
        elif model == "LASSO":
            rgrsr = linear_model.LassoLars(alpha=0.0)
        elif model == "RF":
            rgrsr = ensemble.RandomForestRegressor(n_estimators=120,
                                                   bootstrap=True,
                                                   max_depth=None,
                                                   oob_score=True)
        elif model == "RFX":
            rgrsr = ensemble.ExtraTreesRegressor(n_estimators=120,
                                                 bootstrap=True,
                                                 max_depth=None,
                                                 oob_score=True,
                                                 max_features=None)
        elif model == "RFXX":
            rgrsr = ensemble.ExtraTreesRegressor(n_estimators=12,
                                                 bootstrap=True,
                                                 max_depth=None,
                                                 max_features=None)
        elif model == "SV":
            rgrsr = svm.SVR(epsilon=0.0, tol=0.1)
        else:
            raise RuntimeError("Invalid model [%s]" % (model, ))

    elif rc == "c":
        ## CLASSIFIERS
        if model == "LIN":
            rgrsr = LinearDiscriminantAnalysis()
        elif model == "RF":
            rgrsr = ensemble.RandomForestClassifier(n_estimators=120,
                                                    bootstrap=True,
                                                    max_depth=None,
                                                    oob_score=True)
        elif model == "RFX":
            rgrsr = ensemble.ExtraTreesClassifier(n_estimators=120,
                                                  bootstrap=True,
                                                  max_depth=None,
                                                  oob_score=True,
                                                  max_features=None)
        elif model == "RFXX":
            rgrsr = ensemble.ExtraTreesClassifier(n_estimators=12,
                                                  bootstrap=True,
                                                  max_depth=None,
                                                  max_features=None)
        elif model == "SV":
            rgrsr = svm.SVC()
        else:
            raise RuntimeError("Invalid model [%s]" % (model, ))
    else:
        raise RuntimeError("Invalid rc [%s]" % (rc, ))

    return rgrsr
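
For orientation, here is a minimal usage sketch. The imports are assumptions about what the original module pulls in at its top level (the snippet itself does not show them), and the training data is synthetic:

import numpy as np
from sklearn import ensemble, linear_model, svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_train = np.random.rand(100, 5)              # toy features, just for illustration
y_train = np.random.randint(0, 2, size=100)   # toy binary labels

clf = get_model(rc="c", model="RFX")          # ExtraTreesClassifier with 120 trees
clf.fit(X_train, y_train)
print(clf.oob_score_)                         # available because oob_score=True and bootstrap=True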
Example No. 2
    def _init_model(self, parms=None):
        """Set ML model"""

        from sklearn import ensemble
        if self.args.extreme:
            if parms is not None:
                return 'ExtremeRF', ensemble.ExtraTreesClassifier(**parms)
            return 'ExtremeRF', ensemble.ExtraTreesClassifier()
        else:
            if parms is not None:
                return 'RF', ensemble.RandomForestClassifier(**parms)
            return 'RF', ensemble.RandomForestClassifier()
Example No. 3
def get_alg(alg, mdl_config):
    if alg == 'skrf':
        clf = ensemble.RandomForestClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.35),
            max_depth=mdl_config.get('max_depth', 15),
            n_jobs=-1,
        )
    elif alg == 'skrfp':
        clf = ensemble.RandomForestClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.35),
            max_depth=mdl_config.get('max_depth', 15),
            criterion='entropy',
            n_jobs=-1,
        )
    elif alg == 'sket':
        clf = ensemble.ExtraTreesClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.5),
            max_depth=mdl_config.get('max_depth', 15),
            n_jobs=-1,
        )
    elif alg == 'sketp':
        clf = ensemble.ExtraTreesClassifier(
            n_estimators=mdl_config.get('n_estimators', 500),
            max_features=mdl_config.get('max_features', 0.5),
            max_depth=mdl_config.get('max_depth', 11),
            criterion='entropy',
            n_jobs=-1,
        )
    elif alg == 'skgbc':
        clf = ensemble.GradientBoostingClassifier(
            n_estimators=mdl_config.get('n_estimators', 30),
            max_depth=mdl_config.get('max_depth', 5))
    elif alg == 'xgb':
        # https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
        clf = xgb.XGBClassifier(
            n_estimators=mdl_config.get('n_estimators', 30),
            max_depth=mdl_config.get('max_depth', 7),
            learning_rate=mdl_config.get('learning_rate', 0.1),
            objective="multi:softprob",
            silent=True)
    elif alg == 'knn':
        clf = KNeighborsClassifier(n_neighbors=25,
                                   weights='distance',
                                   metric='manhattan',
                                   n_jobs=-1)
    elif alg == 'sklr':
        clf = linear_model.LogisticRegression(multi_class='multinomial',
                                              solver='lbfgs')
    return clf
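
A hedged usage sketch: mdl_config is a plain dict, and any key it omits falls back to the defaults hard-coded in get_alg above. Only the branch actually taken needs its library importable; the import below is an assumption about the original module header:

from sklearn import ensemble   # assumed; xgb, KNeighborsClassifier and linear_model only matter for their branches

mdl_config = {'n_estimators': 300, 'max_depth': 12}
clf = get_alg('sket', mdl_config)
# -> ExtraTreesClassifier(n_estimators=300, max_features=0.5, max_depth=12, n_jobs=-1)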
Example No. 4
def make_etc_model(train_df, synergy_score, n_estimators, max_features):
    from sklearn import ensemble
    model = ensemble.ExtraTreesClassifier(n_estimators=n_estimators,
                                          max_features=max_features,
                                          n_jobs=-1)
    model = fit(model, train_df, synergy_score)
    return model
Example No. 5
    def find_best_features_extratree(self, participants, X, calibrations, y,
                                     **kwargs):
        from sklearn import ensemble, preprocessing
        default_fields =\
            utils.all_body_fields() + utils.all_body_orientation_fields()
        fields = kwargs.get('fields', default_fields)
        load_features = kwargs.get('load_features', False)

        if load_features:
            X = self.load_all_features(participants,
                                       X,
                                       calibrations,
                                       y,
                                       include=fields)

        X = X[self.add_features]
        columns = copy.deepcopy(X.columns)
        with pd.option_context('mode.use_inf_as_na', True):
            X = X.fillna(X.mean())

        labelEncoder = preprocessing.LabelEncoder()
        y['target'] = y[utils.target_fields()]\
            .apply(lambda xs: str(tuple(xs)), axis=1)
        y = labelEncoder.fit_transform(y['target'])

        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        model = ensemble.ExtraTreesClassifier()
        model.fit(X, y)
        feat_importances = pd.Series(model.feature_importances_, index=columns)
        return feat_importances.sort_values()
Example No. 6
def random_forest(training_feature,training_target,test_feature,test_target):
	values=range(10,210,10)
	parameters=[{'n_estimators':values}]
	'''
	dt=grid_search.GridSearchCV(ensemble.RandomForestClassifier(), parameters, cv=5,scoring="accuracy",n_jobs=6)
	dt.fit(training_feature,training_target)
	print dt.score(training_feature, training_target)
	print dt.score(test_feature, test_target)
	#print dt.best_estimator_
	#model=dt.best_estimator_
	#importances=model.feature_importances_
	#for item in importances:
	#	print item
			
	
	values_small=range(1,3)
	parameters=[{'n_estimators':values,'learning_rate':values_small}]
	dt=grid_search.GridSearchCV(ensemble.AdaBoostClassifier(),parameters,cv=5,scoring="accuracy",n_jobs=6)
	dt.fit(training_feature,training_target)
	print dt.score(training_feature, training_target)
	print dt.score(test_feature, test_target)
	print dt.best_estimator_
	
	'''
	parameters=[{'n_estimators':values}]
	dt=grid_search.GridSearchCV(ensemble.ExtraTreesClassifier(),parameters,cv=5,scoring="accuracy",n_jobs=6)
	dt.fit(training_feature,training_target)
	#print dt.score(training_feature, training_target)
	#print dt.score(test_feature, test_target)
	#print dt.best_estimator_
	model=dt.best_estimator_
	importances=model.feature_importances_
	for item in importances:
		print(item)
Example No. 7
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbour
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
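
One common way to consume this dictionary is to loop over it and cross-validate every estimator on the same data. A minimal sketch with toy data, assuming the sklearn submodules referenced above (ensemble, svm, tree, etc.) are already imported at module level:

import numpy as np
from sklearn.model_selection import cross_val_score

X = np.random.rand(100, 4)               # toy features
y = np.random.randint(0, 2, size=100)    # toy binary labels
for name, clf in get_algorithms().items():
    scores = cross_val_score(clf, X, y, cv=5)
    print(name, round(scores.mean(), 3))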
Example No. 8
def runET(train_X,
          train_y,
          test_X,
          test_y=None,
          validation=1,
          n_est_val=50,
          depth_val=None,
          split_val=2,
          leaf_val=1,
          feat_val='auto',
          jobs_val=4,
          random_state_val=0):
    clf = ensemble.ExtraTreesClassifier(n_estimators=n_est_val,
                                        max_depth=depth_val,
                                        min_samples_split=split_val,
                                        min_samples_leaf=leaf_val,
                                        max_features=feat_val,
                                        criterion='entropy',
                                        n_jobs=jobs_val,
                                        random_state=random_state_val)
    clf.fit(train_X, train_y)
    pred_train_y = clf.predict_proba(train_X)[:, 1]
    pred_test_y = clf.predict_proba(test_X)[:, 1]

    if validation:
        train_loss = log_loss(train_y, pred_train_y)
        loss = log_loss(test_y, pred_test_y)
        print("Train, Test loss : ", train_loss, loss)
        return pred_test_y, loss
    else:
        return pred_test_y
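
A small usage sketch with toy binary-classification data; log_loss is assumed to be imported from sklearn.metrics in the original module:

import numpy as np
from sklearn import ensemble
from sklearn.metrics import log_loss

X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)
# last 50 rows held out; validation=1 (the default) makes runET print and return the log loss
preds, loss = runET(X[:150], y[:150], X[150:], y[150:], n_est_val=20, jobs_val=1)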
Example No. 9
def learn(feat_set, y, method='LogReg'):
    """
    Training of different classifiers
    """

    if method == 'mbK-means':
        clasf = cluster.MiniBatchKMeans(n_clusters=16,
                                        init='k-means++',
                                        max_iter=300,
                                        random_state=None).fit(feat_set)
    elif method == 'LogReg':
        clasf = linear_model.LogisticRegression(penalty='l1',
                                                class_weight='balanced',
                                                solver='saga',
                                                multi_class='multinomial',
                                                warm_start=False,
                                                max_iter=100,
                                                n_jobs=-1).fit(
                                                    feat_set, y.ravel())
    elif method == 'ParForest':
        clasf = ensemble.ExtraTreesClassifier(n_estimators=1000,
                                              max_features=128,
                                              n_jobs=-1,
                                              random_state=0).fit(
                                                  feat_set, y.ravel())

    return clasf
Example No. 10
def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Takes a .csv file containing class signatures - produced by extract_features_to_csv - and uses it to train
    and pickle a scikit-learn model.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures file
    model_out
        The location to save the pickled model to.
    sig_datatype
        The datatype to read the csv as. Defaults to int32.

    Notes
    -----
    At present, the model is an ExtraTreesClassifier arrived at by tpot:
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                 min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    """
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                     min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    features, labels = load_signatures(sig_csv_path, sig_datatype)
    model.fit(features, labels)
    joblib.dump(model, model_out)
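
Loading the pickled model back later is symmetric. A brief sketch, assuming joblib here is the same module the function used for dump and that the path is whatever was passed as model_out (the file name below is illustrative):

import numpy as np
import joblib

model = joblib.load("extra_trees_signatures.pkl")   # illustrative path; use your model_out
row = np.zeros((1, model.n_features_in_))           # n_features_in_ is set on fitted estimators in recent scikit-learn
print(model.predict(row))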
Example No. 11
def ExtraTrees(X_train, X_test, y_train, y_test):
    for i in range(len(X_train)):
        rf = ensemble.ExtraTreesClassifier()
        rf.fit(X_train[i], y_train[i])
        X_train[i] = rf.score(X_train[i], y_train[i])
        X_test[i] = rf.score(X_test[i], y_test[i])
    return X_train, X_test
Example No. 12
    def run(self):
        self.output().makedirs()
        X, y, cols = self.load_data('train')
        weights = dict(enumerate(core.weights))
        cls = ensemble.ExtraTreesClassifier(
            n_estimators=self.n_trees, n_jobs=-1, verbose=10,
            bootstrap=True, min_samples_leaf=10,
            oob_score=False, class_weight=weights)
        cls.fit(X, y)
        importances = pandas.Series(
            cls.feature_importances_,
            index=cols)

        importance_frame = importances.groupby([ix.split('.')[0] for ix in importances.index])
        importance_aggs = importance_frame.agg(['mean', 'max', 'min', 'sum'])
        report_data = str(importance_aggs.sort_values('sum'))
        print(report_data)

        X, y, _ = self.load_data('valid')
        preds = cls.predict_proba(X)[:, 1]
        weights = core.weights[y]
        loss = metrics.log_loss(y, preds, sample_weight=weights)
        print(colors.green | str(loss))

        X, y, _ = self.load_data('merge')
        merge_pred = cls.predict_proba(X)[:, 1]
        pandas.Series(merge_pred).to_csv('cache/XTC_%s/merge_predictions.csv' % self.base_name)

        X, y, _ = self.load_data('test')
        pred = cls.predict_proba(X)[:, 1]
        pandas.Series(pred).to_csv('cache/XTC_%s/predictions.csv' % self.base_name)

        with self.output().open('w') as f:
            f.write(report_data)
Example No. 13
def train_time(name, train_samples, attribute_candidates, label_attribute):
    X_train = train_samples[attribute_candidates].values
    y_train = train_samples[label_attribute].values

    t = time.process_time()
    clf_sklearn = tree.DecisionTreeClassifier()
    clf_sklearn.fit(X_train, y_train)
    dt_train_time = time.process_time() - t

    t = time.process_time()
    clf_sklearn_rf = ensemble.RandomForestClassifier()
    clf_sklearn_rf.fit(X_train, y_train)
    rf_train_time = time.process_time() - t

    t = time.process_time()
    etd_sklearn = ensemble.ExtraTreesClassifier(n_estimators=100,
                                                criterion='gini',
                                                min_samples_leaf=2,
                                                max_features='sqrt')
    etd_sklearn.fit(X_train, y_train)
    etd_train_time = time.process_time() - t

    print(f'{name},decision_tree,{int(dt_train_time * 1000)}')
    print(f'{name},random_forest,{int(rf_train_time * 1000)}')
    print(f'{name},extremely_randomized_trees,{int(etd_train_time * 1000)}')
Example No. 14
def forget(name, train_samples, attribute_candidates, label_attribute):

    train_samples = train_samples.sample(frac=1).reset_index(drop=True)

    repetitions = int(len(train_samples) / 1000)

    for _ in range(0, repetitions):
        train_samples = train_samples.iloc[1:]

        X_train = train_samples[attribute_candidates].values
        y_train = train_samples[label_attribute].values

        t = time.process_time()
        clf_sklearn = tree.DecisionTreeClassifier()
        clf_sklearn.fit(X_train, y_train)
        dt_train_time = time.process_time() - t

        t = time.process_time()
        clf_sklearn_rf = ensemble.RandomForestClassifier()
        clf_sklearn_rf.fit(X_train, y_train)
        rf_train_time = time.process_time() - t

        t = time.process_time()
        etd_sklearn = ensemble.ExtraTreesClassifier(n_estimators=100,
                                                    criterion='gini',
                                                    min_samples_leaf=2,
                                                    max_features='sqrt')
        etd_sklearn.fit(X_train, y_train)
        etd_train_time = time.process_time() - t

        print(f'{name},decision_tree,{int(dt_train_time * 1000000)}')
        print(f'{name},random_forest,{int(rf_train_time * 1000000)}')
        print(
            f'{name},extremely_randomized_trees,{int(etd_train_time * 1000000)}'
        )
Example No. 15
def learning(dataset, blocking = True):
    classifier = ensemble.ExtraTreesClassifier(criterion='entropy')

    plot_x = []
    plot_y_testing = []
    plot_y_mean = []

    train_sizes = np.linspace(0.1, 0.9, 10)

    train_size_abs, train_scores, test_scores = model_selection.learning_curve(classifier, dataset.training_features, dataset.training_labels,
        cv=10, train_sizes=train_sizes)
    
    train_losses = [1 - np.array(a).mean() for a in train_scores]
    test_losses = [1 - np.array(a).mean() for a in test_scores]
    
    plt.figure()
    plt.grid()
    plt.xlabel('Training Set Size')
    plt.ylabel('Loss')
    plt.title(TITLE)
    plt.plot(train_size_abs, train_losses)
    plt.plot(train_size_abs, test_losses)
    plt.legend(['Training', 'Testing'])
    if blocking:
        plt.show()
Example No. 16
def chaos_feature_importance(x_all, y_feat, shadow_selector, feat_dic={}, **chaos_args):

    chaos_feat_iter = chaos_args.get('chaos_feat_iter', 10)
    chaos_n_estimators = chaos_args.get('chaos_n_estimators', 10)
    chaos_nb_features = chaos_args.get('chaos_nb_features', 30)
    chaos_gen_iter = chaos_args.get('chaos_gen_iter', 20)
    chaos_dummy_max = chaos_args.get('chaos_dummy_max', 20)

    ori_numcols = x_all.columns
    sel_numcols = []
    for j in range(chaos_feat_iter):
        x_all = x_all[list(set(ori_numcols) | set(sel_numcols))]
        clf = ensemble.ExtraTreesClassifier(n_estimators=chaos_n_estimators, n_jobs=-1)
        numcols = get_num_cols(x_all)
        catcols = [c for c in x_all.columns if x_all[c].dtype.name == 'object']
        chaos_gen(x_all, numcols, catcols, gen_iter=chaos_gen_iter, dummy_max=chaos_dummy_max)
        numcol2 = get_num_cols(x_all)
        x_feat = x_all[shadow_selector][numcol2]
        clf.fit(x_feat.replace(np.inf, 0).replace(-np.inf, 0).fillna(-1), y_feat)
        for f, v in get_clf_feat(clf, x_all, nb_feat=chaos_nb_features):
            sel_numcols.append(f)
            if f in feat_dic:
                feat_dic[f] += v
            else:
                feat_dic[f] = v
Example No. 17
    def extreme_randomize_applied(Train,
                                  New,
                                  treshold,
                                  bootstrap=False,
                                  n_estimators: int = 200,
                                  max_depth: int = 50,
                                  oob_score: bool = False,
                                  class_weight='balanced_subsample',
                                  sampling=None,
                                  label='FRAUDE'):

        yTrain = Train[[label]]
        xTrain = Train
        del xTrain[label]

        if sampling == None:
            pass
        elif sampling == 'ALLKNN':
            xTrain, yTrain = under_sampling(xTrain, yTrain)
        else:
            xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

        min_sample_leaf = round((len(xTrain.index)) * 0.005)
        min_sample_split = min_sample_leaf * 10
        max_features = round(len(xTrain.columns) / 3)

        fileModel = ensemble.ExtraTreesClassifier(
            criterion='entropy',
            bootstrap=bootstrap,
            min_samples_leaf=min_sample_leaf,
            min_samples_split=min_sample_split,
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            oob_score=oob_score,
            random_state=531,
            verbose=1,
            class_weight=class_weight,
            n_jobs=1)

        fileModel.fit(
            xTrain.drop(['id_siniestro'], axis=1).values, yTrain.values)
        y_hat_New = fileModel.predict_proba(
            New.drop('id_siniestro', axis=1).values)
        y_hat_New = np.delete(y_hat_New, 0, axis=1)
        df_proba = pd.DataFrame(y_hat_New,
                                columns=['probabilidad'],
                                index=New.index)
        df_proba = pd.concat([New['id_siniestro'], df_proba], axis=1)

        # y_random = (y_hat_New <= treshold).astype(int)
        y_hat_New = (y_hat_New > treshold).astype(int)
        print(y_hat_New)
        df_proba_threshold = pd.DataFrame(y_hat_New,
                                          columns=['Treshold'],
                                          index=New.index)
        print(df_proba_threshold)
        df_proba_conc = pd.concat([df_proba, df_proba_threshold], axis=1)
        print(df_proba_conc)
        return df_proba_conc
Example No. 18
def ReduceClassesFeature(
        class_id1, class_id2, data_X_1, data_X_2,
        importance_treshold):  #two 2D numpy array and corresponding class id
    X = np.vstack((data_X_1, data_X_2))
    y = np.asarray([class_id1] * len(data_X_1) + [class_id2] * len(data_X_2))
    ET_classfier = ensemble.ExtraTreesClassifier()
    ET_classfier.fit(X, y)
    importance = ET_classfier.feature_importances_
    feature_select_mask = (importance > importance_treshold)
    X = []
    y = []
    data_X_1 = data_X_1[:, feature_select_mask]
    data_X_2 = data_X_2[:, feature_select_mask]
    remainder_ratio = 1
    if class_id1 < 2 and class_id2 > 1:  #class_id1 = 0,1 class_id2 = 2,3,4
        #very imbalance data
        #sample the big set
        rand_idx = np.random.permutation(len(data_X_1))
        data_X_1 = data_X_1[rand_idx[0:remainder_ratio * len(data_X_2)], :]
    elif class_id2 < 2 and class_id1 > 1:  #class_id2 = 0,1 class_id1 = 2,3,4 (not happend in the one vs one models)
        rand_idx = np.random.permutation(len(data_X_2))
        data_X_2 = data_X_2[rand_idx[0:remainder_ratio * len(data_X_1)], :]
    elif class_id1 == 0 and class_id2 == 1:
        rand_idx = np.random.permutation(len(data_X_1))
        data_X_2 = data_X_2[rand_idx[0:remainder_ratio * len(data_X_1)], :]

    print('reduct shape', np.shape(data_X_1))
    print('mask', feature_select_mask)
    return [data_X_1, data_X_2, ET_classfier, feature_select_mask]
Example No. 19
    def setUp(self):
        self.modellist = [
            lgb.LGBMRegressor(n_estimators=1),
            lgb.LGBMClassifier(n_estimators=1),
            xgb.XGBRegressor(n_estimators=1),
            xgb.XGBClassifier(n_estimators=1),
            svm.SVR(kernel='linear'),
            svm.SVC(kernel='linear'),
            cb.CatBoostRegressor(n_estimators=1),
            cb.CatBoostClassifier(n_estimators=1),
            ske.GradientBoostingRegressor(n_estimators=1),
            ske.GradientBoostingClassifier(n_estimators=1),
            ske.ExtraTreesRegressor(n_estimators=1),
            ske.ExtraTreesClassifier(n_estimators=1),
            ske.RandomForestRegressor(n_estimators=1),
            ske.RandomForestClassifier(n_estimators=1),
            skl.LogisticRegression(),
            skl.LinearRegression()
        ]

        df = pd.DataFrame(range(0, 21), columns=['id'])
        df['y'] = df['id'].apply(lambda x: 1 if x < 10 else 0)
        df['x1'] = np.random.randint(1, 123, df.shape[0])
        df['x2'] = np.random.randint(1, 3, df.shape[0])
        df = df.set_index('id')
        self.x_df = df[['x1', 'x2']]
        self.y_df = df['y'].to_frame()
Example No. 20
def model_training(complete_tag_count, prediction, predicted_class, nonpredicted_class, language):
    tag_total = np.array(complete_tag_count)
    clf = ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=1, random_state=0, criterion='entropy',
                                        n_jobs=9)
    """if prediction == 'age':
        cv = cross_validation.StratifiedKFold(predicted_final, 10)
        results = cross_validation.cross_val_predict(clf, tag_total, nonpredicted_final, cv=cv)
        final_tags = []
        for i in range(len(tag_total)):
            user = tag_total[i]
            user_gender = results[i]
            if user_gender == 'M' or user_gender == 'MALE':
                g = 0
            elif user_gender == 'F' or user_gender == 'FEMALE':
                g = 1
            user = np.append(user, g)
            final_tags.append(user)
    else:
        final_tags = tag_total"""
    final_tags = tag_total

    clf.fit(final_tags, predicted_class)

    with open('models/'+language+'-'+prediction+'-base.p', 'wb') as clffile:
        pickle.dump(clf, clffile)
Example No. 21
def extra_trees(x_train, y_train, x_test, cleaned_features):
    clf = ensemble.ExtraTreesClassifier(random_state=seed)
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    feature_df = pd.DataFrame(columns=cleaned_features)
    feature_df.loc['feature_importances'] = clf.feature_importances_
    return predicted, feature_df, clf
Example No. 22
def select_features(x,y,data):
    '''
    Use a tree classifier to select the most relevent features from data.csv
    70%-30% train-test split for purposes of cross validation.
    '''
    feature_select = ske.ExtraTreesClassifier().fit(x,y)
    model = SelectFromModel(feature_select, prefit=True)
    x_new = model.transform(x)
    nb_features = x_new.shape[1]
    x_train, x_test, y_train, y_test = model_selection.train_test_split(x_new, y, test_size=0.3)
    features = []
    # print('{} features were selected as being important:'.format(nb_features))
    indices = numpy.argsort(feature_select.feature_importances_)[::-1][:nb_features]
    # column width: longest selected feature name plus padding
    col_width = max(len(data.columns[2+indices[f]]) for f in range(nb_features)) + 5

    imp_features = []
    for f in range(nb_features):
        number = f+1
        feature_name = ''.join(data.columns[2+indices[f]].ljust(col_width))
        feature_importance = feature_select.feature_importances_[indices[f]]
        # print('{}.\t{} {}'.format(number, feature_name, (feature_importance * 100)))
        imp_features.append([str(feature_name).strip(), feature_importance * 100])

    for f in sorted(numpy.argsort(feature_select.feature_importances_)[::-1][:nb_features]):
        features.append(data.columns[2+f])
    # pp(imp_features)
    # return x_train, x_test, y_train, y_test, features
    return imp_features
Example No. 23
def rank_features_etc(X, Y, columns):
    # supervised ranking
    model = ensemble.ExtraTreesClassifier(n_estimators=100,
                                          max_depth=10,
                                          random_state=0)
    model.fit(X, Y)

    #list feature importance
    importances = model.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    #indices = indices[:25] #limit to 25

    # Print the feature ranking
    #print("Feature ranking:")

    #for f in range(X.shape[1] - 1):
    #    print(f, 'index', indices[f], X.columns[indices[f]], importances[indices[f]])

    # Plot the feature importances of the forest
    plt.figure(figsize=(15, 5))
    plt.title("ETC Feature Importances")
    plt.bar(range(X.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(X.shape[1]),
               np.array(columns)[indices],
               rotation='vertical')
    plt.xlim([-1, X.shape[1] + 1])
    plt.show()

    return indices
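
As a quick illustration of how the returned ordering can be used, a toy call follows; the column names are made up, and ensemble, np and plt are assumed to be imported at module level as the function requires:

import numpy as np
import pandas as pd

cols = ['f1', 'f2', 'f3']
X = pd.DataFrame(np.random.rand(60, 3), columns=cols)
Y = np.random.randint(0, 2, size=60)
order = rank_features_etc(X, Y, cols)    # indices of features, most important first
print(np.array(cols)[order])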
Example No. 24
    def model_init(self, reDefine_clf={}, used_clf=[]):
        clfs = {
            'xgb':
            xgboost.XGBClassifier(n_estimators=100, max_depth=3),
            'lgb':
            lightgbm.LGBMClassifier(n_estimators=100, max_depth=3),
            'gbdt':
            ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=3),
            'rf':
            ensemble.RandomForestClassifier(n_estimators=200, max_depth=6),
            'logit':
            linear_model.LogisticRegression(),
            'svc':
            svm.SVC(probability=True),
            'adbt':
            ensemble.AdaBoostClassifier(
                base_estimator=tree.DecisionTreeClassifier(max_depth=3)),
            'bagg':
            ensemble.BaggingClassifier(n_estimators=100),
            'ext':
            ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=3),
            #   'perceptron':linear_model.Perceptron(),
            'dt':
            tree.DecisionTreeClassifier(),
            'knn':
            neighbors.KNeighborsClassifier(),
            'network':
            neural_network.MLPClassifier()
        }
        if reDefine_clf:
            clfs.update(reDefine_clf)
        if used_clf:
            return dict(
                (key, value) for key, value in clfs.items() if key in used_clf)
        else:
            return clfs
Example No. 25
def et1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = list()
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    ss = model_selection.ShuffleSplit(n_splits=num_splits)
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                               random_state=seed,
                                               n_estimators=2500,
                                               n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:,1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: '%(seed, n+1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:,1])

    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits * num_seeds
    v[cname] /= num_seeds
Example No. 26
def runET(train_X,
          train_y,
          test_X,
          test_y=None,
          test_X2=None,
          depth=10,
          leaf=5,
          feat=0.3):
    model = ensemble.ExtraTreesClassifier(n_estimators=300,
                                          max_depth=depth,
                                          min_samples_split=10,
                                          min_samples_leaf=leaf,
                                          max_features=feat,
                                          n_jobs=6,
                                          random_state=0)
    model.fit(train_X, train_y)
    train_preds = model.predict_proba(train_X)[:, 1]
    test_preds = model.predict_proba(test_X)[:, 1]
    test_preds2 = model.predict_proba(test_X2)[:, 1]
    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2
Example No. 27
def feature_importance(trainX, trainY, testX, testY, columns):
    """
    Calculates the feature importance on the training set for a given set of variables.
    It prints this importance and plots it.
    """

    ## Feature selection
    clf = ensemble.ExtraTreesClassifier(random_state=1729,
                                        n_estimators=250,
                                        n_jobs=-1)
    selector = clf.fit(trainX, trainY)
    importances = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(trainX.shape[1]):
        print("%d. %s (%f)" %
              (f + 1, columns[indices[f]], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(trainX.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(trainX.shape[1]), indices)
    plt.xlim([-1, trainX.shape[1]])
    plt.show()
Example No. 28
def ensemble_models():
	# Here let's implement some ensemble methods to potentially improve accuracy
	# And get a better idea of the inherent structure of the data

	models = np.empty([4, 2], dtype = object)
	# Boosting ensembles
	models[0] = ['Gradient Boosted Machine', ensemble.GradientBoostingClassifier(random_state = 1)]
	models[1] = ['AdaBoost Classifier', ensemble.AdaBoostClassifier(random_state = 1)]

	# Bagging Ensembles
	# Even though the decision tree didn't do so well, a random forest might
	n_trees = 100
	models[2] = ['Random Forest', ensemble.RandomForestClassifier(n_estimators = n_trees, max_features = 3, random_state = 1)]
	models[3] = ['Extra Trees Classifier', ensemble.ExtraTreesClassifier(n_estimators = n_trees, max_features = 3, random_state = 1)]

	# Fit & evaluate models
	for name, model in models:
		# Different model metrics
		for scoring in ('accuracy', 'roc_auc'):
			cross_validation(name, model, X, Y, scoring)

		# Fit model and make predictions
		fitted_model = model.fit(X_train, Y_train)
		Y_pred = fitted_model.predict(X_test)
		
		# Classification report & Confusion Matrix
		classification_report(name, Y_test, Y_pred)
		confusion_matrix(name, Y_test, Y_pred)		

	"""
Example No. 29
def get_curve_per_model(model):
    _models = [model.trained_classifier]

    runtime = []
    for k in [25, 50, 100, 200]:
        if model.algorithm == 'RF':
            base_model = ensemble.RandomForestClassifier(n_jobs=4, verbose=1)
        elif model.algorithm == 'ET':
            base_model = ensemble.ExtraTreesClassifier(n_jobs=4, verbose=1)
        for i in model.param_ranges.keys():
            setattr(base_model, i, model.trained_classifier.get_params()[i])
        setattr(base_model, 'n_estimators', k)
        print(base_model)
        start = timeit.default_timer()
        base_model.fit(model.X_train, model.Y_train, model.W_train)
        elapsed = timeit.default_timer()
        runtime.append(elapsed - start)
        _models.append(base_model)

    curves = []
    for m in _models:
        print('Constructing AMS curve for models with n_estimators (this could take several minutes)')
        train_score = m.predict_proba(model.X_train)[:, 1]
        test_score = m.predict_proba(model.X_test)[:, 1]
        curve, thresh = ams_vs_cutoff(model.X_test, model.Y_test, model.W_test,
                                      train_score, test_score)
        curves.append(curve)
    return curves, thresh, runtime
Example No. 30
def cross_validate_model(X_train, Y_train):
    """
    Here we perform cross validation of models to choose the best one.
    """
    # Divide the training and testing data
    train, test, y_actual, y_predict = train_test_split(X_train,
                                                        Y_train,
                                                        test_size=0.5,
                                                        random_state=41)
    train_n, test_n, y_actual_n, y_predict_n = train_test_split(X_train,
                                                                Y_train,
                                                                test_size=0.5,
                                                                random_state=0)

    # Add one hot encoder
    rf = ensemble.RandomForestClassifier(n_estimators=50, max_depth=5)
    rf_enc = OneHotEncoder()
    rf_lm = sklinear.LogisticRegression()
    rf.fit(train, y_actual)
    rf_enc.fit(rf.apply(train))
    rf_lm.fit(rf_enc.transform(rf.apply(test)), y_predict)
    y_predict_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(test_n)))
    mse_rf_lm = metrics.mean_squared_error(y_predict_n, y_predict_rf_lm[:, 1])
    print('MSE RandomForestClassifier followed by LogisticRegression is %f' %
          (mse_rf_lm))

    # List the classification methods to use.
    clf_quaddis = discriminant_analysis.QuadraticDiscriminantAnalysis()
    clf_logreg = sklinear.LogisticRegression(penalty='l1')
    clf_random_forest = ensemble.RandomForestClassifier(n_estimators=50,
                                                        max_depth=10)
    clf_adaboost = ensemble.AdaBoostClassifier(n_estimators=50)
    clf_mlpc = neural_network.MLPClassifier()
    clf_extra_tree = ensemble.ExtraTreesClassifier(n_estimators=50,
                                                   bootstrap=True)

    # Add the above methods in an array
    # More ameable for looping
    methods = [
        clf_quaddis, clf_logreg, clf_random_forest, clf_adaboost, clf_mlpc,
        clf_extra_tree
    ]
    methods_label = [
        'clf_quaddis', 'clf_logreg', 'clf_random_forest', 'clf_adaboost',
        'clf_mlpc', 'clf_extra_tree'
    ]

    method_mse = np.zeros((len(methods), 1))
    # Fit and predict for each method
    for i in range(len(methods)):
        methods[i].fit(train, y_actual)
        method_predict = methods[i].predict_proba(test)
        method_mse[i] = metrics.mean_squared_error(y_predict,
                                                   method_predict[:, 1])
        print('MSE for %s while cross validation : %f' %
              (methods_label[i], method_mse[i]))

    # We return the method which has the minimum mse
    return np.argmin(method_mse)