Esempi in Python per SelectFromModel.predict, esempi in Python per sklearn.feature_selection.SelectFromModel.predict

Esempio n. 1

0

Mostra file

def score_model(dataset):
    """Fit a random forest on ``dataset``, print its score and write a CSV.

    The training matrix, test matrix and targets come from
    ``recover_train_test_target(dataset)``. Predictions for the test matrix
    are written to ``data/solution.csv`` alongside the ``PassengerId`` column
    read from ``data/test.csv``. The function returns nothing.
    """
    train, test, targets = recover_train_test_target(dataset)

    # Preliminary forest: it is handed to the exploration helper and to
    # SelectFromModel below.
    randomForestClassifier = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    randomForestClassifier = randomForestClassifier.fit(train, targets)

    DataExploration.show_variable_relation_with_survival(train, randomForestClassifier)

    # NOTE(review): train_reduced / test_reduced are computed but never read
    # again -- the final model below is fitted on the full `train` matrix.
    # Confirm whether the reduced matrices were meant to be used instead.
    model = SelectFromModel(randomForestClassifier, prefit=True)
    train_reduced = model.transform(train)
    test_reduced = model.transform(test)

    # Fixed hyper-parameters for the final forest.
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    # `model` is rebound here: from now on it is the final classifier, not the selector.
    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

    # Python 2 print statement; the score is computed on the training data.
    print 'Score: ', compute_score(model, train, targets, scoring='accuracy')

    # Build the two-column submission file (PassengerId, Survived).
    output = model.predict(test).astype(int)
    df_output = pd.DataFrame()
    aux = pd.read_csv('data/test.csv')
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('data/solution.csv', index=False)

Esempio n. 2

0

Mostra file

File: Models.py Progetto: khemanta/IBM-Advanced-Data-Science-Capstone

class randForestClassifier(AbstractModelClass):
    """Random-forest feature selector exposed through the project model interface.

    The wrapped object is a ``SelectFromModel`` built around a
    ``RandomForestClassifier`` with ``n_estimators`` trees. ``fit`` trains the
    selector (and therefore the inner forest); the remaining methods expose
    the selection mask, the selected column names, the feature importances
    and class predictions of the fitted forest.
    """

    def __init__(self, n_estimators):
        """Store the number of trees and build the (unfitted) selector."""
        self.n_estimators = n_estimators
        self.create_model()

    def create_model(self):
        """Create the SelectFromModel wrapper around a fresh random forest."""
        self.classifier = SelectFromModel(
            RandomForestClassifier(n_estimators=self.n_estimators))

    def fit(self, data, labels):
        """Fit the selector, which also fits the inner forest, on ``data``/``labels``."""
        self.classifier.fit(data, labels)

    def classifier_prediction(self, x_test):
        """Return class predictions of the fitted forest for ``x_test``.

        ``SelectFromModel`` is a transformer and has no ``predict`` method, so
        the call is delegated to the fitted inner estimator. That estimator was
        trained on the full feature set, so ``x_test`` must contain all of the
        columns used in ``fit``.
        """
        return self.classifier.estimator_.predict(x_test)

    def get_support(self):
        """Return the selector's mask of selected features."""
        return self.classifier.get_support()

    def get_SelectedFeatures(self, feature_data):
        """Return the columns of ``feature_data`` kept by the selector."""
        return feature_data.columns[self.get_support()]

    def get_FeatureImportance(self):
        """Return ``(indices, importances)``.

        ``importances`` is the fitted forest's ``feature_importances_`` in the
        original column order; ``indices`` sorts it from most to least
        important.
        """
        feature_ranked = self.classifier.estimator_.feature_importances_
        ranked_feature_indices = np.argsort(feature_ranked)[::-1]
        return ranked_feature_indices, feature_ranked

Esempio n. 3

0

Mostra file

def randomForest(train, test, targets, run_gs=False):
    """Fit a random forest on ``train``/``targets`` and write test predictions.

    Args:
        train: training feature matrix.
        test: test feature matrix to predict on.
        targets: training labels.
        run_gs: when true, hyper-parameters are chosen by a grid search and
            the fitted search object is used for prediction; otherwise a
            forest with fixed hyper-parameters is used. Defaults to ``False``,
            which is what the function previously hard-coded.

    Predictions are written to ``../predition.csv`` together with the
    ``PassengerId`` column read from ``../input/test.csv``. Nothing is returned.
    """
    # The earlier version also fitted a 50-tree forest and wrapped it in
    # SelectFromModel here, but the selector was overwritten before any use,
    # so that extra fit has been removed.
    if run_gs:
        parameter_grid = {
            'max_depth': [4, 6, 8],
            'n_estimators': [50, 10],
            'max_features': ['sqrt', 'auto', 'log2'],
            'min_samples_split': [1, 3, 10],
            'min_samples_leaf': [1, 3, 10],
            'bootstrap': [True, False],
        }
        forest = RandomForestClassifier()
        # NOTE(review): this is the legacy StratifiedKFold signature (labels
        # passed to the constructor, `n_folds`); confirm which scikit-learn
        # version the file targets before enabling run_gs. The same applies to
        # the 'auto' and min_samples_split=1 grid entries.
        cross_validation = StratifiedKFold(targets, n_folds=5)

        grid_search = GridSearchCV(forest,
                                   scoring='accuracy',
                                   param_grid=parameter_grid,
                                   cv=cross_validation)

        grid_search.fit(train, targets)
        model = grid_search

        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))
    else:
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }

        model = RandomForestClassifier(**parameters)
        model.fit(train, targets)

    # Build the two-column submission file (PassengerId, Survived).
    output = model.predict(test).astype(int)
    df_output = pd.DataFrame()
    aux = pd.read_csv('../input/test.csv')
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../predition.csv',
                                                  index=False)

Esempio n. 4

0

Mostra file

File: 202006260158.py Progetto: keumdohoon/STUDY

        y_test_pred = []
        y_pred = []
        for col in select_models:
            test_preds = []
            preds = []
            for model in models[col]:
                test_preds.append(model.predict(x_test))
                preds.append(model.predict(x_pred))
            test_pred = np.mean(test_preds, axis=0)
            pred = np.mean(preds, axis=0)

            y_test_pred.append(test_pred)
            y_pred.append(pred)
        selection_model.fit(select_x_train, y_train[:, i])

        y_pred = selection_model.predict(select_x_test)
        r2 = r2_score(y_test[:, i], y_pred)
        mae = MAE(y_test[:, i], y_pred)
        print(selection_model.best_params_)
        if mae <= best_mae:
            print("예아~")
            best_mae = mae
            best_model = selection_model
            best_y_pred = selection_model.predict(select_x_pred)
            best_y_test_pred = y_pred
        print("Thresh=%.3f, n=%d, MAE: %.5f R2: %.2f%%" %
              (thresh, select_x_train.shape[1], mae, r2 * 100))
    final_y_pred.append(best_y_pred)
    final_y_test_pred.append(best_y_test_pred)

y_test_pred = []

Esempio n. 5

0

Mostra file

plt.show()

# In[ ]:

# Time the training of the XGBoost classifier.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed intervals (it reports wall-clock time).
start = time.perf_counter()
model = XGBClassifier(booster='gbtree',
                      max_depth=5,
                      eval_metric='auc',
                      learning_rate=0.7,
                      min_child_weight=0.9,
                      verbose_eval=True)
model.fit(DataFrame(X_train, dtype='float'), DataFrame(y_train))
end = time.perf_counter()
# Message text: "model training time is".
print('训练模型的时间为' + str(end - start))

# Time prediction plus metric reporting on the test set.
start = time.perf_counter()
y_pred = model.predict(DataFrame(X_test, dtype='float'))
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, predictions))
end = time.perf_counter()
# Message text: "time needed to predict 163 results is".
print('预测163个结果，所需时间为' + str(end - start))

# In[ ]:

print(model)

# In[ ]:

xgb.to_graphviz(model, num_trees=10)

Esempio n. 6

0

Mostra file

File: predictor.py Progetto: smoot618/clarktech-ncaab-predictor

class Predictor:
    """Random-forest regressor for ``points_difference`` with feature filtering.

    Construction runs the whole preparation pipeline: load the data, split it
    into train/test parts, fit a 50-tree forest and use ``SelectFromModel``
    (threshold 0.01) to reduce the training matrix to the retained columns,
    whose names are kept in ``_filtered_features``.

    ``simplify`` must be called before ``predict``: until then ``_model`` holds
    the feature selector, and only ``simplify`` replaces it with a regressor
    fitted on the reduced training data.
    """

    def __init__(self):
        self._regressor = None
        self._model = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None
        # Filled in by _train_model() with the names of the retained columns.
        self._filtered_features = None

        data = self._read_data()
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    def simplify(self, test_data):
        """Restrict ``test_data`` to the retained features and refit the model.

        The returned frame contains exactly the columns listed in
        ``_filtered_features``, in that order. As a side effect the held-out
        ``_X_test`` is reindexed the same way and ``_model`` is replaced by a
        ``RandomForestRegressor`` fitted on the reduced training data. The
        model is refitted on every call.
        """
        keep = test_data.columns.isin(self._filtered_features)
        test_data = test_data.loc[:, keep]
        test_data = test_data.reindex(self._filtered_features, axis=1)
        self._X_test = self._X_test.reindex(self._filtered_features, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        """Return the model's predictions for ``test_data`` cast to ``output_datatype``."""
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self):
        """Load the dataset and return ``differential_vector`` applied to it.

        If the file named by ``DATASET_NAME`` exists it is unpickled and used
        directly. Otherwise every pickle matching ``matches/*/*`` is loaded,
        concatenated, de-duplicated, passed through ``filter_stats``, stripped
        of rows containing NaN, and given a ``points_difference`` column
        (home points minus away points).
        """
        if path.exists(DATASET_NAME):
            data = pd.read_pickle(DATASET_NAME)
            return differential_vector(data)
        frames = [pd.read_pickle(match) for match in glob('matches/*/*')]
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = data.dropna()
        # NOTE(review): these fills run after dropna(), so no NaN can be left
        # in the columns at this point -- confirm whether they were meant to
        # run before the rows are dropped.
        # Assignment replaces the chained `fillna(..., inplace=True)`, which
        # newer pandas versions warn about.
        for column in ('home_free_throw_percentage',
                       'away_free_throw_percentage'):
            data[column] = data[column].fillna(0)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    def _create_features(self, data):
        """Split ``data`` into train/test features and ``points_difference`` targets."""
        # `axis` must be passed by keyword: pandas 2 rejects the positional form.
        X = data.drop('points_difference', axis=1)
        y = data['points_difference']
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        """Fit the forest whose importances drive the feature selection."""
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        """Select features with the fitted forest and reduce the training matrix."""
        # Keep a handle on the frame: transform() output is used for training,
        # while the column labels are still needed to name the kept features.
        train = self._X_train
        self._model = SelectFromModel(self._regressor,
                                      prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]

Esempio n. 7

0

Mostra file

File: modelling.py Progetto: tiffytiffy/DS_showcase


# Tree-based feature selection
# C5 - Random Forest
from sklearn.feature_selection import SelectFromModel
# NOTE(review): SelectFromModel is imported here but not used in the visible
# code; the forest below is trained directly on x_train[selected_feat].
# 1 - train a 20-tree random forest on the pre-selected feature columns
rf_sel = RandomForestClassifier(n_estimators = 20).fit(x_train[selected_feat], y_train)

# 2 - save the fitted model to disk
joblib.dump(rf_sel, 'q2c_rf_fea_sel.pkl')

# 3 - load the model back from disk
rf_sel = joblib.load('q2c_rf_fea_sel.pkl')

# 4 - create predictions and convert them into a one-column dataframe
df_pred = pd.DataFrame(rf_sel.predict(x_test[selected_feat]),columns = ['prediction'])

# 5 - merge the predictions back onto the test dataset by index (left join)
final_df = pd.merge(adult_test,df_pred,how = 'left',left_index = True, right_index = True)

# 6 - export the merged data as CSV
final_df.to_csv('q2c_rf_fea_sel.csv', index=False)



#### Q2d - ensemble method
## voting

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

Esempio n. 8

0

Mostra file

File: titanic.py Progetto: MartinYan623/Kaggle-Titanic

# Fit every algorithm on the full training data and collect the predicted
# probability of the positive class for the test set.
for alg in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(train, targets)
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(test.astype(float))[:,1]
    full_predictions.append(predictions)
# Weighted average of the first three prediction vectors; the second one counts double.
predictions = (full_predictions[0] + full_predictions[1]*2 + full_predictions[2]) / 4
models = [logreg_cv, rf, gboost]
# Report a cross-validated accuracy for each candidate model on the reduced training set.
for model in models:
    print('Cross-validation of : {0}'.format(model.__class__))
    score = compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy')
    print ('CV score = {0}'.format(score))
    print('****')
rf.fit(train, targets)
# The string literal below holds disabled submission code; it is evaluated as
# an expression statement and has no effect.
"""
predictions = model.predict(test[predictors])
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
predictions = predictions.astype(int)
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
print(submission)
submission.to_csv('/Users/martin_yan/Desktop/submission3.csv', index=False)
"""
# Features used
# 1. Linear regression
# alg = LinearRegression()
# 2. Logistic regression
# alg= LogisticRegression(random_state=1)

Esempio n. 9

0

Mostra file

def predictWithFeatureSelectionNNConst(X, y, topN, size, learning_rate,
                                       n_iter):
    """Select the ``topN`` most important features and train/evaluate an MLP.

    Args:
        X: feature frame; its ``columns`` are used to name the features.
        y: target labels.
        topN: number of features kept by the selector.
        size: ``test_size`` passed to ``train_test_split``.
        learning_rate: initial learning rate of the MLP.
        n_iter: maximum number of MLP training iterations.

    Returns:
        ``[train_accuracy_percent, test_accuracy_percent, learning_rate]``,
        with both accuracies rounded to two decimals.

    Side effects: prints the importances and evaluation reports, saves the
    fitted MLP to ``dataset/mlp_class.jbl`` and shows the loss-curve plot.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=size,
                                                        random_state=0,
                                                        shuffle=True)
    clf = ExtraTreesClassifier(n_estimators=100)
    clf = clf.fit(X_train, y_train)

    print('Feature Importances')
    print(clf.feature_importances_)
    for feature in zip(X.columns, clf.feature_importances_):
        print(feature)

    # threshold=-inf disables the importance cut-off, so exactly the topN
    # features with the highest importance are retained.
    selector = SelectFromModel(clf,
                               prefit=True,
                               threshold=-np.inf,
                               max_features=topN)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    selected_indices = selector.get_support(indices=True)
    print(selected_indices)
    XFeatures = [X.columns[index] for index in selected_indices]

    print('Selected Features')
    print(XFeatures)

    # Fit the scaler on the training data only and reuse its statistics for
    # the test data; refitting on the test set would scale the two sets
    # inconsistently and leak test information.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    model = MLPClassifier(hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=learning_rate,
                          max_iter=n_iter,
                          shuffle=True,
                          random_state=None,
                          verbose=True,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10)

    model.fit(X_train, y_train)
    joblib.dump(model, 'dataset/mlp_class.jbl')
    stat = list()
    # Evaluate on training data
    print('\n-- Training data --')
    predictions = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(metrics.classification_report(y_train, predictions))
    print('Confusion Matrix:')
    print(metrics.confusion_matrix(y_train, predictions))
    print('')
    stat.append(round(accuracy * 100.0, 2))
    # Evaluate on test data
    print('\n---- Test data ----')
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(metrics.classification_report(y_test, predictions))
    print('Confusion Matrix:')
    print(metrics.confusion_matrix(y_test, predictions))
    stat.append(round(accuracy * 100.0, 2))
    stat.append(learning_rate)

    # Plot the training loss per epoch.
    plt.plot(model.loss_curve_)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.title('Model loss for relu activation and solver adam')
    plt.show()
    return stat

Esempio n. 10

0

Mostra file

# First-level models whose out-of-fold probabilities become the stacked features.
modeler = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]

S_X_train, S_X_test = stacking(modeler,
                               X_train,
                               y_train,
                               X_test,
                               regression=False,
                               metric=metrics.log_loss,
                               needs_proba=True,
                               stratified=True,
                               shuffle=True,
                               random_state=42,
                               verbose=2)

# %%
# Second-level model. The L1 penalty needs a solver that supports it, so
# 'liblinear' is named explicitly: it was the default in older scikit-learn
# releases, while the current default solver rejects penalty='l1'.
model = LogisticRegression(penalty='l1',
                           C=1,
                           solver='liblinear',
                           random_state=42)

model = model.fit(S_X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = pd.Series(model.predict(S_X_test))
y_pred_proba = model.predict_proba(S_X_test)[:, 1]

# The first value is classification accuracy (it was previously mislabelled
# as "R Square").
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("kappa:", metrics.cohen_kappa_score(y_test, y_pred))

# %%

Esempio n. 11

0

Mostra file

            if type_col[c]==1:
                print "Catagorical var %s selected \n "%ki
                p+=1
            break
        c+=1
                
print "attributes cat--",p
print list_already_taken



# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_new_df, Y, test_size=0.3, random_state=0)

# Show the shapes of the four splits (Python 2 print statements).
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape


# Fit an ordinary least-squares model on the training split.
regr = linear_model.LinearRegression()
model = regr.fit(X_train, y_train)

# Predict the held-out targets.
Y_res = model.predict(X_test)

# Report the mean squared error and the R^2 score on the held-out split.
print("Mean squared error: %.2f"% mean_squared_error(y_test, Y_res))
print('Variance score: %.2f' % r2_score(y_test, Y_res))


print "END"

Esempio n. 12

0

Mostra file

File: 5_neural_network.py Progetto: adiba-khan/Thesis-dimensionality-reduction

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y_b[train_index], y_b[test_index]

        start = time.time()

        lasso = Lasso(alpha=alpha[idx]).fit(x_train, y_train)
        model = SelectFromModel(lasso, prefit=True)
        x_train = model.transform(x_train)
        x_test = model.transform(x_test)

        #create NN model
        model = build_model()
        model.fit(x_train, y_train, epochs=10, batch_size=100, verbose=1)

        prediction = model.predict(x_test)

        end = time.time()
        save_data[f"{classifier_condition}_fold_{z+1}_n={alpha[idx]}"] = (model_evaluation(f"{classifier_condition}, {alpha[idx]}", f"fold_{z+1}", x_test, y_test, prediction, model, end-start, n_classes))

        z+=1

    range = ((idx*10) + (idx + 1))
    save_data[f"Average {classifier_condition}, n = {alpha[idx]}"] = save_data.iloc[:,range:].mean(axis=1)

save_data.to_csv(f"{classifier_condition}_new.csv")

#-------------------------------------------
# RUN CLASSIFIER WITH ISOMAP IMPLEMENTATION
#-------------------------------------------
'''ISOMAP is so slow that the value of n_components is manually adjusted;

Esempio n. 13

0

Mostra file

File: titanic_v2.py Progetto: HufsaT/datascience_sentdextutorials

# to find best model, we try 3 diff ones
# Candidate classifiers (six are instantiated here).
svml = SVC()
gboost = GradientBoostingClassifier()
rf = RandomForestClassifier(n_estimators=100)
logreg = LogisticRegressionCV()
gaus = GaussianNB()
knear = KNeighborsClassifier()


models = [logreg, svml, rf, gboost,knear,gaus]

# Disabled cross-validation comparison of the candidates:
#for model in models:
#    print("Cross-validating: {0}".format(model.__class__))
#    score = compute_score(clf=model,x=train_x_reduced, y=train_y)
#    print("Accuracy of model: {0}".format(score))
#    print("*************")
    
# Final model: a fresh gradient-boosting classifier fitted on train_x / train_y
# (not one of the instances in `models`).
model = GradientBoostingClassifier()
model.fit(train_x,train_y)
output = model.predict(test_x).astype(int) # so we don't get floats

# Build the submission: PassengerId from the test CSV plus the predictions.
passIDs = pd.read_csv("test_titanic.csv")
results = pd.DataFrame()
results["PassengerId"] = passIDs["PassengerId"]
results["Survived"] = output
print(results.shape)
results.to_csv("titanicsubmission.csv",index=False)

Esempio n. 14

0

Mostra file

File: titanic.py Progetto: uday97/titanic-survival-odds

# Hold out 20% of the reduced training data; the fixed random_state makes the
# split reproducible.
X_train,X_test,y_train,y_test=train_test_split(train_reduced,targets,test_size=0.2,random_state=0)

# Fixed hyper-parameters for the random forest.
parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    
model = RandomForestClassifier(**parameters)
model.fit(X_train, y_train)

# Two further classifiers are fitted on the same split; only `model` is used
# for scoring and prediction in the lines below.
model2=GradientBoostingClassifier()
model2.fit(X_train,y_train)

model3=SVC()
model3.fit(X_train, y_train)
# NOTE(review): the return value is discarded here -- confirm whether it was
# meant to be printed or stored.
compute_score(model, X_test, y_test, scoring='accuracy')

# Predict the competition test set and write the submission file.
y=model.predict(test_reduced)
testset=pd.read_csv('test.csv')
my_submission = pd.DataFrame({'PassengerId': testset.PassengerId, 'Survived': y})
my_submission.to_csv('pred2.csv', index=False)

#To test different hyperparameter combinations of RandomForestClassifier
'''run_gs = True

if run_gs:
    parameter_grid = {
                 'max_depth' : [4, 6, 8],
                 'n_estimators': [50, 10],
                 'max_features': ['sqrt', 'auto', 'log2'],
                 'min_samples_split': [1.0, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],

Esempio n. 15

0

Mostra file

File: reg.py.py Progetto: itsnavneetk/2-level-regressor

# First level: classify every test row; the label decides which regressor is
# applied to that row below. Alternative classifiers are left disabled.
#ans = bdt_discrete.predict(final_model_test_data);
ans = clf_2.predict(final_model_test_data)

#ans = clf_3.predict(final_model_test_data);
#ans = clf_4.predict(final_model_test_data);

# Second level: route each row to `model` (label 0) or `model_2` (any other
# label) and collect the regression output in `r`.
for mn in range(len(ans)):
    if (ans[mn] == 0):
        classify = 0
    else:
        classify = 1
    # Label 0: predict the single row with the first regressor.
    if (classify == 0):
        q = np.matrix([final_model_test_data[mn]])
        predictions = model.predict(q)
        r.append(predictions)
    # Any other label: predict the single row with the second regressor.
    elif classify == 1:
        q = np.matrix([final_model_test_data[mn]])
        predictions = model_2.predict(q)
        r.append(predictions)

# NOTE: this binds a second name to the same list, so the loop below
# overwrites the entries of `r` in place.
predicted_results = r

# Convert each prediction to an integer: fractional parts strictly above 0.5
# go up via ceil, everything else is truncated by int().
for t in range(len(r)):
    if (r[t] - int(r[t])) > 0.5:
        # `* 10 / 10` leaves the value unchanged before ceil is applied.
        predicted_results[t] = math.ceil(r[t] * 10 / 10)
    else:
        predicted_results[t] = int(r[t])

Esempio n. 16

0

Mostra file

File: Model_Lukas.py Progetto: lukaskln/AML_Project

                          min_child_samples=6,
                          min_child_weight=0,
                          subsample=0.8,
                          colsample_bytree=0.7,
                          reg_alpha=0,
                          importance_type="split")

# Manual cross-validation: fit on each training fold, report train/test R2
# and collect the test R2 of every fold in `R2`.
for train_ix, test_ix in cv.split(X_train):

    X_cvtrain, X_cvtest = X_train.iloc[train_ix, :], X_train.iloc[test_ix, :]
    y_cvtrain, y_cvtest = y_train["y"].iloc[train_ix], y_train["y"].iloc[
        test_ix]

    model.fit(X_cvtrain, y_cvtrain)

    predtrain = model.predict(X_cvtrain)
    pred = model.predict(X_cvtest)

    # Compute the fold's test score once and reuse it for printing and storing.
    fold_r2 = r2_score(y_cvtest, pred)

    print("\nTrain R2:")
    print(np.round(r2_score(y_cvtrain, predtrain), 2))
    print("\nTest R2:")
    print(np.round(fold_r2, 2))
    print("\n________________________")

    R2.append(np.round(fold_r2, 4))

# Average over however many folds were collected, instead of assuming five.
print("\nAverage R2:", round(np.mean(R2), 2))
print("Std:", round(np.std(R2), 4))

# Predict Test Data

Esempio n. 17

0

Mostra file

# Plot the 20 largest feature importances as a horizontal bar chart.
imp_feat.nlargest(20).plot(kind='barh')
ind = np.argsort(imp)
# Bare expression: only displays a value in an interactive/notebook session.
rf.feature_importances_

# Stacking model
# Hold out 30% of the classification data.
xcl_train, xcl_test, ycl_train, ycl_test = train_test_split(x_cl,
                                                            y_cl,
                                                            test_size=0.3)
#Support vector classifier
#svm=SVC(C=5, probability=True,gamma='auto')
#svm.fit(xcl_train,ycl_train)
#
#
# NOTE(review): the logistic regression is fitted and scored on the full
# x_cl / y_cl rather than on the train/test split created above -- confirm
# this is intended. The two metric results are not stored or printed.
lr = LogisticRegressionCV(cv=10)
lr.fit(x_cl, y_cl)
metrics.accuracy_score(y_cl, lr.predict(x_cl))
metrics.roc_auc_score(y_cl, lr.predict(x_cl))
#nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
# Gradient boosting is fitted on the training split and scored on the test split.
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(xcl_train, ycl_train)
metrics.accuracy_score(ycl_test, gb.predict(xcl_test))
#rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

#Predicting Monthly Revenue
# NOTE: this binds another name to `temp` (no copy is made), so the column
# added below is also added to `temp`.
reg_df = temp
cl_df['Churn']
# Revenue loss is the monthly revenue multiplied by the Churn column.
reg_df['Revenue_loss'] = cl_df.Churn * reg_df['MonthlyRevenue']
reg_df['Revenue_loss']
reg_df = reg_df.drop(['MonthlyRevenue'], axis=1)
#Casting categorical types
for n, v in reg_df.iteritems():

Esempio n. 18

0

Mostra file

# Grid search over rf_param_grid for a class-weight-balanced random forest,
# scored by ROC AUC; refit=True refits the best configuration so the search
# object can be used for prediction below.
random_forest = GridSearchCV(RandomForestClassifier(class_weight="balanced",
                                                    random_state=123),
                             rf_param_grid,
                             cv=kfold,
                             n_jobs=-1,
                             refit=True,
                             scoring="roc_auc")

random_forest.fit(X_train_rf_selected, y_train)

print(
    f'Best score: {random_forest.best_score_} with param: {random_forest.best_params_}'
)

# Keep only the test columns that are also listed in rf_selected_features.
X_test_rf_selected = X_test[X_test.columns.intersection(rf_selected_features)]
y_rf_predictions = random_forest.predict(X_test_rf_selected)

# Confusion matrix of the test predictions, drawn as an annotated heatmap.
conf_matrix = metrics.confusion_matrix(y_test, y_rf_predictions)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, fmt='g', cmap='coolwarm_r')
plt.title('Random Forests')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#plt.savefig('RF_CM.png', quality=95)
plt.show()

print(f"Accuracy: {metrics.accuracy_score(y_test, y_rf_predictions)}")
print(classification_report(y_test, y_rf_predictions))

# ROC curve inputs and AUC, computed from the second column of predict_proba.
y_pred_prob_rf = random_forest.predict_proba(X_test_rf_selected)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob_rf)
auc = metrics.roc_auc_score(y_test, y_pred_prob_rf)

Esempio n. 19

0

Mostra file

def main():
    """Train or reload a random-forest regressor for the ``Rate`` column and report it.

    Reads ``selfie_dataset.txt`` (space separated, no header), standardises the
    features, and then follows one of three paths controlled by the local
    flags ``isTrained`` and ``retrain``:

    * not trained: fit a new forest and save it to ``regressor.pkl``;
    * trained, no retrain: load ``regressor.pkl`` and use it as is;
    * trained, retrain: load ``regressor.pkl``, keep only the features whose
      importance reaches ``min_importance`` and fit a new forest on them.

    Prints the mean absolute error, a MAPE-based accuracy, the explained
    variance and the feature importances sorted from highest to lowest.
    """
    data = pd.read_csv('selfie_dataset.txt',
                       sep=" ",
                       header=None,
                       names=["Nome", "Rate", "partial_faces", "is_female", "baby", "child", "teenager", "youth", "middle_age", "senior", "white", "black", "asian", "oval_face", "round_face",
                              "heart_face", "smiling", "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses", "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
                              "blond_hair", "brown_hair", "red_hair", "curly_hair", "straight_hair", "braid_hair", "showing_cellphone", "using_earphone", "using_mirror", "wearing_hat",
                              "braces", "harsh_lighting", "dim_lighting"])

    # Target is the rating; the name column is dropped from the features.
    labels = np.array(data['Rate'])
    features = data.drop("Rate", axis=1).drop("Nome", axis=1)

    feature_list = list(features.columns)
    features = np.array(features)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    # Scale with statistics learned from the training split only.
    sc = StandardScaler()
    train_features = sc.fit_transform(train_features)
    test_features = sc.transform(test_features)

    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True

    if isTrained:
        if retrain:
            crf = joblib.load("regressor.pkl")

            selector = SelectFromModel(crf, threshold=min_importance)
            selector.fit(train_features, train_labels)

            train_features = selector.transform(train_features)
            test_features = selector.transform(test_features)
            # Keep the names aligned with the retained columns; otherwise the
            # importances printed below would be paired with the wrong names.
            feature_list = [name for name, keep
                            in zip(feature_list, selector.get_support())
                            if keep]

            print('The shape of our important_train_features is:', train_features.shape)
            print('The shape of our important_test_features is:', test_features.shape)

            rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
            rf.fit(train_features, train_labels)
        else:
            rf = joblib.load("regressor.pkl")
    else:
        rf = RandomForestRegressor(n_estimators=n_estimators, oob_score=True, random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'regressor.pkl')

    # Every path ends by printing the model and predicting the test split.
    print(rf)
    print("\n\n")
    predictions = rf.predict(test_features)

    print('Mean Absolute Error:', mean_absolute_error(test_labels, predictions))
    # NOTE(review): this divides by the true labels, so a label of 0 yields
    # inf/nan -- confirm that Rate can never be zero.
    mape = np.mean(np.abs((test_labels - predictions) / test_labels)) * 100
    accuracy = 100 - mape
    print('Accuracy:', round(accuracy, 2), '%')

    print('Variance Score: ', explained_variance_score(test_labels, predictions))

    print("\n\n")

    print("Importances: ")
    importances = list(rf.feature_importances_)
    feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]
    feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))

    print()

Esempio n. 20

0

Mostra file

#print(test_reduced.shape)

# Fixed hyper-parameters for the random forest.
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 3,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 6
}

model = RandomForestClassifier(**parameters)
model.fit(training_df_processed, survived_column)

# The score is computed on the same data the model was fitted on.
print(
    compute_score(model,
                  training_df_processed,
                  survived_column,
                  scoring='accuracy'))

output = model.predict(test_df_processed).astype(int)

# Build the submission: PassengerId from the raw test CSV plus the predictions.
df_output = pandas.DataFrame()
aux = pandas.read_csv('D:/ML work/Titanic Data/test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId',
           'Survived']].to_csv('D:/ML work/Titanic Data/output.csv',
                               index=False)

Esempio n. 21

0

Mostra file

# Fixed hyper-parameters for the random forest.
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 4,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 5
}

model = RandomForestClassifier(**parameters)
model.fit(X_train_reduced, Y_train)

# In[133]:

output = model.predict(test1_reduced).astype(int)
# Training-set score as a percentage rounded to two decimals.
model1 = round(model.score(X_train_reduced, Y_train) * 100, 2)
model1

# #### Applying Random Forest Classifier. One can play with the parameters (hyperparameter tuning to increase the score). I achieved 0.803 with less feature engineering. However, as I increased the number of dummies for age, it came down to 78.9.

# In[131]:

# The prediction is recomputed here (same call as in the cell above) before
# the submission file is written.
output = model.predict(test1_reduced).astype(int)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": output
})
submission.to_csv("titanic51_submission.csv", index=False)

# I'll update as I improve. Your guidance is appreciated. Also thanks a lot for all the tutorials where I learned a lot.

Esempio n. 22

0

Mostra file

File: rf.manual_randomizedCV.py Progetto: Anukriti12/SwarmProject

## Evaluation (No tuning) ------------> (1)
# Baseline: evaluate the already fitted rf_classifier on the test split.
prediction = rf_classifier.predict(X_test)
print('Confusion Matrix\n', confusion_matrix(y_test, prediction))
print('Accuracy Score: ', accuracy_score(y_test, prediction))
print('Classification Report:\n', classification_report(y_test, prediction))

### Manual Hyperparameter Tuning
# Forest with hand-picked hyper-parameters, fitted in the same statement.
model = RandomForestClassifier(n_estimators=300,
                               criterion='entropy',
                               max_features='sqrt',
                               min_samples_leaf=10,
                               random_state=100).fit(X_train, y_train)

## Evaluation (Manual tuning) ------------> (2)
predictions = model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

### Randomized Search Cv
# Candidate values for the randomized search.
# 10 evenly spaced tree counts from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# NOTE(review): 'auto' may not be accepted by recent scikit-learn releases --
# confirm against the version this script targets.
max_features = ['auto', 'sqrt', 'log2']
# 10 evenly spaced depths from 10 to 1000.
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
min_samples_split = [2, 5, 10, 14]
min_samples_leaf = [1, 2, 4, 6, 8]
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,

Esempio n. 23

0

Mostra file

File: script1001.py Progetto: darkblue-b/kaggleScape

else:
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6
    }

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

# In[138]:

# Predict the test set with the model fitted in the preceding cell.
Y_pred = model.predict(test).astype(int)

# ## Model, predict and solve

# In[35]:

# Features are everything except the target; the test features drop the id column.
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
# Bare expression: only displays the three shapes in a notebook session.
X_train.shape, Y_train.shape, X_test.shape

# In[36]:

# Logistic Regression

logreg = LogisticRegression()

Esempio n. 24

0

Mostra file

File: titanic-machine-learning-from-disaster.py Progetto: nischalshrestha/automatic_wat_discovery

                           verbose=1)
# Run the grid search on the reduced training set and keep the best
# hyper-parameters.
grid_search = grid_search.fit(train_reduced, targets)
params = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# ###### Now we use the best parameters as hyperparameters to train our final machine learning "model"

# In[ ]:

model = RandomForestClassifier(**params)

model.fit(train_reduced, targets)
# The score is computed on the same data the model was fitted on.
print(compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy'))

# ### So we obtain a classification accuracy of 82+% with our machine learning model. We will submit our predictions to check how well we did on the test data set. We will use the Random Forest predictions, y_pred.

# In[ ]:

y_pred = model.predict(test_reduced).astype(int)

# In[ ]:

# Build the submission: PassengerId from the raw test CSV plus the predictions.
test_data = pd.read_csv('../input/test.csv')
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": y_pred
})
submission.to_csv('titanic.csv', index=False)

Esempio n. 25

0

Mostra file

            coef0=0.0,
            decision_function_shape=None,
            degree=3,
            gamma=1e-05,
            kernel='rbf',
            max_iter=-1,
            probability=True,
            random_state=None,
            shrinking=True,
            tol=0.001,
            verbose=False)  #max iter -> eğitebildiğin kadar eğit dedik -1 ile.

# Train the classifier configured above (Python 2 print statement; the
# message means "training finished").
model.fit(X_train, y_train)
print "Eğitim bitti"
# The two bare string literals below are notes left by the author and have no
# effect at runtime. First: training is slow and parameters such as the cache
# size affect the duration. Second: how to load a previously created pickle
# file and predict with it.
'''eğitim uzun sürüyor cache size vb parametreler sürede etkili'''
''' onceden yarattıgın pickle dosyasını cagırmak için;

from sklearn.externals import joblib

model = joblib.load('yeni_model.pkl')

model.predict(X_test[150,:])
'''

# NOTE(review): `best_model` is not defined in the visible code, while the
# estimator fitted above is named `model` -- confirm which object is meant to
# be saved.
from sklearn.externals import joblib
joblib.dump(
    best_model, 'malware_model.pkl'
)  # converting to a pickle saves space and makes reloading easy

import pickle

Esempio n. 26

0

Mostra file

                          name,
                          classes=sorted(list(set(ground_truth))),
                          normalize=True,
                          title='Normalized confusion matrix')

# In[7]:

#----- Feature importance ranking -----#

for name, model in [('RandomForest_recursive', RandomForestClassifier())]:
    print('Performing recursive feature elimination             : ', name)

    # Fit the cross-validated recursive feature eliminator (20 features
    # removed per step, 10 folds) and predict back on the training data.
    selector = RFECV(model, step=20, cv=10).fit(data_train, ground_truth)
    y_pred = selector.predict(data_train)

    # Sorted distinct labels, shared by the confusion matrix and its plot.
    class_labels = sorted(set(ground_truth))
    cm = confusion_matrix(ground_truth, y_pred, labels=class_labels)

    #----- plot, print and save train results -----#
    plot_recall(name, cm, train_directory)
    plot_confusion_matrix(cm, train_directory, name,
                          classes=class_labels,
                          normalize=True,
                          title='Normalized confusion matrix')

    #----- save the model to disk -----#
    filename = os.path.join(train_directory, name) + '_model.sav'

Esempio n. 27

0

Mostra file

File: main.py Progetto: gunanksood/Titanic-ML-

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

# Predict integer labels for the test set, pair them with the PassengerId
# column read back from the raw test file, and write the submission CSV.
output = model.predict(test).astype(int)
aux = pd.read_csv('test.csv')
df_output = pd.DataFrame({
    'PassengerId': aux['PassengerId'],
    'Survived': output,
})
df_output[['PassengerId', 'Survived']].to_csv('gridsearch_rf.csv', index=False)

# Fit every candidate estimator in place and remember it.
trained_models = []
for model in models:
    model.fit(train, targets)
    trained_models.append(model)

# Column 1 of predict_proba for each fitted estimator, in the same order.
predictions = [model.predict_proba(test)[:, 1] for model in trained_models]

Esempio n. 28

0

Mostra file

File: predictor.py Progetto: peytondodd/basketball-predictor

class Predictor:
    """Random-forest predictor trained on pickled match data.

    Constructing an instance runs the whole pipeline: every pickled frame
    matching ``<data_directory>/*/*`` is loaded and cleaned, split into
    train/test sets, a preliminary ``RandomForestRegressor`` is fitted, and a
    prefit ``SelectFromModel`` (threshold 0.01) is stored as ``self._model``
    and used to reduce the training features.  :meth:`simplify` later
    replaces ``self._model`` with a forest fitted on the reduced features.
    """

    def __init__(self, data_directory='matches'):
        """Load the data below ``data_directory`` and train the selector.

        Args:
            data_directory: Root directory searched with the glob pattern
                ``<data_directory>/*/*`` for pickled frames.
        """
        self._regressor = None
        self._model = None
        # Set by _train_model(); initialised here so the attribute always
        # exists on a constructed instance.
        self._filtered_features = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None

        data = self._read_data(data_directory)
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    @property
    def accuracy(self):
        """Print and return the accuracy, in percent, on the held-out split.

        Returns:
            ``accuracy_score`` of the int-cast predictions for
            ``self._X_test`` against ``self._y_test``, multiplied by 100 and
            rounded to two decimals.
        """
        # NOTE(review): _y_test holds two columns (home/away points); confirm
        # accuracy_score accepts that target layout in the sklearn version
        # in use.
        predicted = self.predict(self._X_test, int)
        accuracy = round(accuracy_score(self._y_test, predicted) * 100.0, 2)
        # Single-argument parenthesised form prints identically under
        # Python 2 and Python 3.
        print('Accuracy: %s%%' % accuracy)
        # Previously the value was only printed, so the property was always
        # None for callers.
        return accuracy

    def print_tree(self):
        """Export each tree in ``self._model.estimators_`` to a .dot file.

        Files are named ``tree_1.dot``, ``tree_2.dot``, ... in iteration
        order and written to the current working directory.
        """
        # NOTE(review): right after __init__ self._model is the
        # SelectFromModel wrapper; this presumably needs simplify() to have
        # installed the forest first -- confirm.
        for i, tree_in_forest in enumerate(self._model.estimators_, 1):
            tree.export_graphviz(
                tree_in_forest,
                out_file='tree_%s.dot' % str(i),
                feature_names=self._filtered_features,
                class_names=['away_win', 'home_win'],
                filled=True,
                rounded=True,
                special_characters=True)

    def simplify(self, test_data):
        """Reduce inputs to the selected features and fit the final forest.

        ``test_data`` and ``self._X_test`` are re-indexed to the columns in
        ``self._filtered_features`` (in that order); ``self._model`` is then
        replaced by a ``RandomForestRegressor`` fitted on ``self._X_train``.

        Args:
            test_data: DataFrame to restrict to the selected feature columns.

        Returns:
            The re-indexed ``test_data``.
        """
        test_data = test_data.loc[:,
                                  test_data.columns.isin(self.
                                                         _filtered_features)]
        test_data = test_data.reindex(self._filtered_features, axis=1)
        self._X_test = self._X_test.reindex(self._filtered_features, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        """Return ``self._model`` predictions cast to ``output_datatype``.

        Args:
            test_data: Input passed unchanged to ``self._model.predict``.
            output_datatype: dtype handed to ``astype`` on the prediction.
        """
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self, data_directory):
        """Load, de-duplicate and clean every pickled match frame.

        Args:
            data_directory: Root directory; files matching
                ``<data_directory>/*/*`` are read with ``pd.read_pickle``.

        Returns:
            The result of ``differential_vector`` applied to the cleaned
            frame, which carries an added ``points_difference`` column.

        Raises:
            ValueError: If no file matches the glob pattern.
        """
        # NOTE(security): unpickling can execute arbitrary code; only point
        # this at trusted files.
        frames = [pd.read_pickle(match)
                  for match in glob('%s/*/*' % data_directory)]
        if not frames:
            # pd.concat([]) raises ValueError as well; this message names
            # the directory that produced nothing.
            raise ValueError(
                'No match files found under %r' % data_directory)
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = data.dropna()
        # NOTE(review): dropna() above already removed every row containing
        # a NaN, so these fills cannot change anything; confirm whether they
        # were meant to run before dropna().  Plain assignment is used
        # instead of chained ``fillna(inplace=True)``, which newer pandas
        # deprecates.
        for column in ('home_free_throw_percentage',
                       'away_free_throw_percentage'):
            data[column] = data[column].fillna(0)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    def _create_features(self, data):
        """Split ``data`` into train/test features and targets.

        Features are every column except ``home_points`` and
        ``away_points``; the target is the two-column array of those points.
        """
        # ``axis`` is passed by keyword: the positional form ``drop(name, 1)``
        # was removed in pandas 2.0.
        X = data.drop(['away_points', 'home_points'], axis=1)
        y = data[['home_points', 'away_points']].values
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        """Fit the preliminary forest used for feature selection."""
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        """Wrap the fitted regressor in a selector and reduce ``_X_train``.

        Stores the prefit ``SelectFromModel`` as ``self._model``, replaces
        ``self._X_train`` with its transformed (reduced) version, and records
        the surviving column names in ``self._filtered_features``.
        """
        # Keep a handle on the original frame: transform() output replaces
        # self._X_train, but the column labels are still needed below.
        train = self._X_train
        self._model = SelectFromModel(self._regressor,
                                      prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]

Esempio n. 29

0

Mostra file

# Compute the correlation coefficients between variables #
# For each feature name, append [name, index] to `selection`, where the index
# comes from masking that feature's single-column slice of `corr` with
# |value| > 0.8 and value != 1 (strong entries, excluding exact 1s).
# NOTE(review): presumably `corr` is a feature-by-feature correlation matrix
# and the `!= 1` test is meant to drop the self-correlation -- confirm.
for i in range(len(feature)):
    selection.append([
        feature[i],
        corr.loc[:,
                 [feature[i]]][(np.abs(corr.loc[:, [feature[i]]].values) > 0.8)
                               & (corr.loc[:, [feature[i]]].values != 1)].index
    ])

## Hyper-parameter tuning uses the sklearn API ##

### Prediction ####
# Load the tab-separated test file.
test = pd.read_csv("./data/crawler_test.txt", sep='\t')
# Bare expression: only produces output in an interactive/notebook session.
test.head()
# Build the xgboost input from the selected feature columns, cast to float.
dtest = xgb.DMatrix(data=test.loc[:, feature].astype('float'))
preds = model.predict(dtest)
# Place the predictions (column "pred") next to the identifying columns.
dfff = pd.concat([
    test.loc[:, ["clientid", "updatetime", "label", "score"]],
    pd.DataFrame(preds, columns=["pred"])
],
                 axis=1)
# Bare expression: only produces output in an interactive/notebook session.
dfff
# Write the result without the index and without a header row.
dfff.to_csv("crawler_0708prediction.csv", index=False, header=False)

### Check that the model file gives consistent results online and offline ##
df_test = pd.DataFrame({
    "allianceid": ["na"],
    "avginterval2minutes": [799.4],
    "avginterval5minutes": [813.13336],
    "clientid": ["09031172210287250176"],
    "clientip2minutes": [1],

Esempio n. 30

0

Mostra file

File: task2_nn_vote_2.py Progetto: hangjiaz/AML_project

    model.compile(optimizer=optim,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # model.fit(x_ktrain, y_ktrain, batch_size=100, epochs=100, verbose=1)

    # y_kpred = np.argmax(model.predict(x_ktest), axis=1)

    # score = balanced_accuracy_score(y_ktest, y_kpred)
    # print(score)

    ############### model training
    y_clean = keras.utils.to_categorical(y_clean, 3)
    model.fit(x_clean, y_clean, batch_size=50, epochs=100, verbose=1)

    y_pred_mat[:, k] = np.argmax(model.predict(x_test_selected), axis=1)

# Majority vote: for every row of y_pred_mat keep the label that occurs most
# often among that row's entries.
y_pred = np.zeros(y_testid.shape[0])
for j, votes in enumerate(y_pred_mat):
    y_pred[j] = Counter(votes).most_common(1)[0][0]

print('vote_mat:', y_pred_mat[:5])
print('vote_result:', y_pred[:5])

# Write the output file: a header row, then one "id,prediction" row per
# test id.
line_fmt = "{},{}\n"
with open('output.csv', 'w') as f:
    f.write(line_fmt.format("id", "y"))
    for i in range(len(y_testid)):
        f.write(line_fmt.format(y_testid[i], y_pred[i]))