def train_dummy_regressors(features, target):
    for strat in ['mean', 'median']:
        dr = DummyRegressor(strategy=strat)
        dr.fit(features, y=target.flatten())

        dummy_score = 100 * dr.score(features, target)
        print('{:.1f} % score for a dummy regressor using the {} strategy'.format(
            dummy_score,
            dr.get_params()['strategy']))

# Plot feature importances (feat_imps is assumed to be defined elsewhere)
feat_imps.plot(x='feature', y='importance', kind='barh')

# LINEAR REGRESSION

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=44)

# Create a dummy regressor
dummy_mean = DummyRegressor(strategy='mean')

# "Train" dummy regressor
dummy_mean.fit(X_train, y_train)

# Get R-squared score
dummy_mean.score(X_test, y_test)  # -0.11 R2 using mean (R2 can be negative on held-out data)

# vanilla linear regression - R2 is -4!
regr = LinearRegression()
model = regr.fit(X_train, y_train)

model.score(X_test, y_test)

# LASSO

# Standardize features (fit the scaler on the training set only)
scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)

# Create lasso regression with an alpha value and fit it on the standardized features
lasso = Lasso(alpha=0.1)
lasso.fit(X_std, y_train)
X_train_10 = train_10.drop('GAME_TOTAL', axis=1).to_numpy()
y_train_10 = train_10['GAME_TOTAL'].to_numpy()
X_test_10 = test_10.drop('GAME_TOTAL', axis=1).to_numpy()
y_test_10 = test_10['GAME_TOTAL'].to_numpy()
Test_Vegas = test_10['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_10['TOTAL_CLOSE'].to_numpy()

#Vegas BASELINE = 17.565434708173875
mean_squared_error(y_test_10, Test_Vegas, squared=False)

#DUMMY REGRESSOR:

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_10, y_train_10)
#returns -0.0011412
#second run with new data =  -0.00201585
dummy_regr.score(X_test_10, y_test_10)
#returns 21.1452
#second run = 21.1599
mean_squared_error(y_test_10, dummy_regr.predict(X_test_10), squared=False)

#OLS
regressor = sm.OLS(y_train_10, X_train_10)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_10)
#this returns a RMSE of 19.29939303517463
#second run gives 17.708329120934696, which is close to Vegas without any tuning...
mean_squared_error(y_test_10, preds, squared=False)
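#NOTE: sm.OLS does not add an intercept by default, and R**2 is then computed
#uncentered, which inflates the 0.99 values above. A minimal sketch (an
#assumption, not from the original) of refitting with an intercept:
X_train_c = sm.add_constant(X_train_10)
X_test_c = sm.add_constant(X_test_10)
ols_c = sm.OLS(y_train_10, X_train_c).fit()
print(ols_c.rsquared)
print(mean_squared_error(y_test_10, ols_c.predict(X_test_c), squared=False))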
# Example 4
X_train_s = train_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_train_s = train_season['GAME_TOTAL'].to_numpy()
X_test_s = test_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()
Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()

#Vegas BASELINE = 17.650007402704748 
mean_squared_error(np.append(y_train_s,y_test_s), np.append(Train_Vegas,Test_Vegas), squared = False)

#DUMMY REGRESSOR:

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)
#-0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)
#27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s), squared = False)

#OLS
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_s)
#18.5802074596655
mean_squared_error(y_test_s, preds, squared = False)

#RANDOM FOREST
rf = RandomForestRegressor(oob_score=True)
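#A hedged sketch (not in the original, which cuts off here) of finishing the
#random-forest example on the season split defined above:
rf.fit(X_train_s, y_train_s)
print(rf.oob_score_)  # out-of-bag R**2 estimate
print(mean_squared_error(y_test_s, rf.predict(X_test_s), squared=False))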
# Example 5
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# # Baseline Model

# In[36]:

from sklearn.dummy import DummyRegressor

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_regr.predict(X_train)
baseline = dummy_regr.score(X_train, y_train)
print("Baseline R^2: %f" % baseline)

# # Multiple Linear Regression

# In[37]:

from sklearn import linear_model

ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
print("Coefficients: %s" % ols.coef_)
print("Intercept: %f" % ols.intercept_)
y_test_prediction = ols.predict(X_test)
ols.score(X_train, y_train)

# In[40]:
# Example 6
def test_regressor_score_with_None(y, y_test):
    # DummyRegressor ignores X entirely, so fitting and scoring with X=None works
    reg = DummyRegressor()
    reg.fit(None, y)
    assert_equal(reg.score(None, y_test), 1.0)
# Example 7
    def train_rf(self, features, labels):
        """Train a random-forest regressor with k-fold CV, plus a second-stage
        classifier that flags samples whose predictions track the truth."""

        print('Training random forest ...')

        self.model = RandomForestRegressor(n_estimators=100,
                                           max_features='sqrt',
                                           max_depth=int(np.ceil(len(features[0]) / 5)),
                                           min_samples_leaf=3,
                                           n_jobs=-1)

        self.model2 = RandomForestClassifier(
            n_estimators=100,
            max_features='sqrt',
            max_depth=int(np.ceil(len(features[0]) / 5)),
            min_samples_leaf=3,
            n_jobs=-1
        )

        self.lr0 = linear_model.TheilSenRegressor()
        self.lr1 = linear_model.TheilSenRegressor()

        reg_dummy = DummyRegressor()
        clf_dummy = DummyClassifier()

        kfold = KFold(n_splits=self.kfold, shuffle=True)
        kfold2 = KFold(n_splits=self.kfold, shuffle=True)

        features, labels = shuffle(features, labels)

        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set(style='whitegrid', context='paper')

        for ifold, (train, test) in enumerate(kfold.split(labels)):
            self.model.fit(features[train], labels[train])
            score_train = self.model.score(features[train], labels[train])
            score_test = self.model.score(features[test], labels[test])
            reg_dummy.fit(features[train], labels[train])
            score_dummy = reg_dummy.score(features[test], labels[test])
            print('Fold %d: %.4f / %.4f (%.4f)' % (ifold, score_test, score_train, score_dummy))

            labels_t = labels.transpose()
            y_pred = self.model.predict(features)
            y_pred_t = y_pred.transpose()
            # self.lr0.fit(labels_t[0][train].reshape(-1, 1), y_pred_t[0][train])
            self.lr1.fit(labels_t[1][train].reshape(-1, 1), y_pred_t[1][train])
            y_lr = self.lr1.predict(labels_t[1][test].reshape(-1,1))
            dy = np.abs(y_pred_t[1][test] - y_lr) < 0.2
            print('\t%d / %d' % (np.sum(dy), np.sum(1 - dy)))
            for jfold, (train2, test2) in enumerate(kfold2.split(dy)):
                self.model2.fit(features[test[train2]], dy[train2])
                y_pred2 = self.model2.predict(features[test[test2]])
                score_train2 = precision_score(dy[train2], self.model2.predict(features[test[train2]]), average='binary')
                score_test2 = precision_score(dy[test2], y_pred2, average='binary')
                clf_dummy.fit(features[test[train2]], dy[train2])
                score_dummy = precision_score(dy[test2], clf_dummy.predict(features[test[test2]]), average='binary')
                print('\tFold %d: %.4f / %.4f (%.4f)' % (jfold, score_test2, score_train2, score_dummy))

                score_final_train = self.model.score(features[test[train2]], labels[test[train2]])
                score_final_test = self.model.score(features[test[test2[y_pred2]]], labels[test[test2[y_pred2]]])
                print('\tFinal: %.4f / %.4f' % (score_final_test, score_final_train))

            fig, axs = plt.subplots(2,2)
            train_truth = labels[train].transpose()
            train_pred = self.model.predict(features[train]).transpose()
            test_truth = labels[test].transpose()
            test_pred = y_pred[test].transpose()
            sns.scatterplot(x=train_truth[0], y=train_pred[0], ax=axs[0,0])
            sns.scatterplot(x=train_truth[1], y=train_pred[1], ax=axs[0,1])
            sns.scatterplot(x=test_truth[0][test2[y_pred2]], y=test_pred[0][test2[y_pred2]], ax=axs[1, 0])
            sns.scatterplot(x=test_truth[1][test2[y_pred2]], y=test_pred[1][test2[y_pred2]], ax=axs[1,1])
            plt.draw()

        plt.show()

        return
from sklearn.linear_model import LinearRegression

# Load the data (note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2)
boston = load_boston()
features, target = boston.data, boston.target

# Split the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0)

# Create a DummyRegressor object
dummy = DummyRegressor(strategy='mean')

# Train the regressor
dummy.fit(features_train, target_train)

# Compute the R-squared score
print(dummy.score(features_test, target_test))

ols = LinearRegression()
ols.fit(features_train, target_train)

# Compute the R-squared score
print(ols.score(features_test, target_test))

# Create a DummyRegressor that predicts 20 for every sample
clf = DummyRegressor(strategy='constant', constant=20)
clf.fit(features_train, target_train)

# Compute the model's score
print(clf.score(features_test, target_test))
# Example 9
svm_linear_c1 = SVC(kernel='linear', C=1.0)
svm_linear_c1.fit(X_train, y_train)
svm_linear_c1.score(X_test, y_test)

#Lesson 6.2 - Evaluating Classification and Regression Models
#Dummy models
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.datasets import load_iris, load_boston
X, y = load_iris(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
dc.fit(X, y)
dc.score(X, y)
X, y = load_boston(return_X_y=True)
dr = DummyRegressor(strategy='mean')
dr.fit(X, y)
dr.score(X, y)

#Confusion matrix, recall and precision
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, classification_report
X, y = load_breast_cancer(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
dc.fit(X, y)
confusion_matrix(y, dc.predict(X))
print(classification_report(y, dc.predict(X)))

#Cross-validation
from sklearn.model_selection import cross_val_score
X, y = load_iris(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
cross_val_score(dc, X, y, cv=5)
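#cross_val_score returns one score per fold; a small sketch (not in the
#original) of summarizing them:
scores = cross_val_score(dc, X, y, cv=5)
print('%.3f +/- %.3f' % (scores.mean(), scores.std()))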
#Create a dummy regressor that always predicts the mean value of the target
# Create a dummy regressor
dummy_mean = DummyRegressor(strategy='mean')

# "Train" dummy regressor
dummy_mean.fit(X, y)

# In[ ]:

dummy_mean.predict(X)

# In[ ]:

# Get R-squared score
dummy_mean.score(X, y)

# ### Making a model

# In[ ]:

X_train.head()

# In[ ]:

# Rename the stray index column and drop it
X_train.rename({"unnamed: 0": "a"}, axis="columns", inplace=True)
X_train.drop(["a"], axis=1, inplace=True)

# In[ ]:

X_test.rename({"unnamed: 0": "a"}, axis="columns", inplace=True)
X_test.drop(["a"], axis=1, inplace=True)
# Example 11
def metrics(test_data, predictions):
  print('Mean Absolute Error: ', mean_absolute_error(test_data, predictions))
  print('Mean Squared Error: ', mean_squared_error(test_data, predictions))
  print('Root MSE: ', rmse(test_data, predictions))
  print('Mean Absolute Percentage Error', np.mean(np.abs((test_data - predictions) / test_data)) * 100)
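
# rmse is not defined in this snippet; a minimal helper under the usual
# definition (the square root of the mean squared error):
def rmse(test_data, predictions):
  return np.sqrt(mean_squared_error(test_data, predictions))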

"""Now I'll run the data through a dummy regressor to see how well an 'empty' model performs."""

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=35)

dummy_mean = DummyRegressor(strategy='mean')

dummy_mean.fit(X_train, y_train)

predictions = dummy_mean.predict(X_test)

dummy_score = dummy_mean.score(X_test, y_test)
print('Dummy Score: ', dummy_score)
metrics(y_test, predictions)

dummy_median = DummyRegressor(strategy='median')

dummy_median.fit(X_train, y_train)

predictions = dummy_median.predict(X_test)

dummy_score = dummy_median.score(X_test, y_test)
print('Dummy Score: ', dummy_score)
metrics(y_test, predictions)

"""So the R^2 values to beat are -0.005 and -0.162
# Example 12
    model.add(Dense(1))

    model.compile(
        optimizer=Adam(),
        loss="mse",
        #metrics=['accuracy']
    )

    return model


model = KerasRegressor(build_fn, epochs=10)

N = int(len(X) * 0.7)
#X_train, X_test, y_train, y_test = train_test_split(X, Y)
X_train, X_test, y_train, y_test = X[:N], X[N:], Y[:N], Y[N:]
estimator = make_pipeline(
    StandardScalerNDim(),
    make_pipeline(FlattenNDim(), LinearSVR()),
    #model
)
estimator.fit(X_train, y_train)
print("Model score:", estimator.score(X_test, y_test))

# DummyRegressor ignores its features, so y itself can stand in as X here
dummy = DummyRegressor()
dummy.fit(y_train[:, np.newaxis], y_train)
print("Dummy score:", dummy.score(y_test[:, np.newaxis], y_test))

y_pred = estimator.predict(X_test)
print("R2 score: %s" % r2_score(y_test, y_pred))
plot_data(X_test, y_test, y_pred)
# Example 13
    return df


X, Y = rgf.drop('revenue', axis=1), rgf['revenue']
X = regression_engineering(X)
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, train_size=0.75,
    test_size=0.25)  #randomly separating training and test set
reg = GradientBoostingRegressor()
reg.fit(train_X, train_Y)  #Train regressor model
print('Regressor Score: ', reg.score(test_X, test_Y))

#Compare with dummy regressor!!
dummy = DummyRegressor()
dummy.fit(train_X, train_Y)
print('Dummy Regressor Score: ', dummy.score(test_X, test_Y))

sns.set_style('whitegrid')
plt.figure(figsize=(12, 14))
sns.barplot(x=reg.feature_importances_, y=X.columns)
plt.savefig('regressor.png')

#Classification: Predicting Movie Success
cls = movies_df[movies_df['return'].notnull()]

cls = cls.drop(['revenue'], axis=1)
cls['return'] = cls['return'].apply(
    lambda x: 1 if x >= 1 else 0)  #create binary output for classification
cls['return'].value_counts()  #balanced classes

cls['belongs_to_collection'] = cls['belongs_to_collection'].fillna('').apply(
# Example 14
def main():
    """

    :return:
    """
    try:
        if sys.argv[1] == "-S" or "--single" or "-s":
            training_data = load(
                f"../data/train/{SINGLE_TRAIN_FILE_NAME}")
            trial_data = load(f"../data/{SINGLE_TRIAL_FILE_NAME}")
        elif sys.argv[1] == "-M" or "--multi" or "-m":
            training_data = load(
                f"../data/train/{MULTI_TRAIN_FILE_NAME}")
            trial_data = load(f"../data/{MULTI_TRIAL_FILE_NAME}")
    except IndexError:
        exit(
            "Please specify which type of trial information you would"
            " like to use (-S for single trial, -M for multi trial"
            " information)!")

    training_data = training_data.dropna()
    trial_data = trial_data.dropna()

    print("Extracting training features...")
    X_train, y_train = extract_features(training_data,
                                        use_sentence=True,
                                        use_word_embeddings=False,
                                        use_token=True,
                                        use_readability_measures=False), \
                       training_data[['complexity']]
    print("Extracting trial features...")
    X_trial, y_trial = extract_features(trial_data,
                                        use_sentence=True,
                                        use_word_embeddings=False,
                                        use_token=True,
                                        use_readability_measures=False), \
                       trial_data[['complexity']]

    tokens = X_trial[['token', "sentence"]]
    X_train.drop(["complexity", "id", "token", "sentence"],
                 axis=1, inplace=True)
    X_trial.drop(["complexity", "id", "token", "sentence"],
                 axis=1, inplace=True)
    print("Finished feature processing!\n")

    regressor = DummyRegressor(strategy="median")
    regressor.fit(X_train, y_train)

    y_guess = regressor.predict(X_trial)

    regressor.score(X_train, y_train)

    print(f"Mean squared error: {mean_squared_error(y_trial, y_guess)}")
    print(f"R^2 score: {r2_score(y_trial, y_guess)}")
    print(f"Explained variance score:"
          f" {explained_variance_score(y_trial, y_guess)}")
    print(f"Max error: {max_error(y_trial, y_guess)}")
    print(f"Mean absolute error:"
          f" {mean_absolute_error(y_trial, y_guess)}")

    results = y_trial.merge(pd.DataFrame(y_guess), left_index=True,
                            right_index=True)
    results = results.merge(tokens, left_index=True, right_index=True)
    results.columns = ["Actual", "Predicted", "Token", "Sentence",]
    print(results[['Actual', "Predicted", "Token"]])

    fig = results.plot(kind='bar', rot=0,
                       title="Actual and predicted complexity scores"
                             " by dummy (single token)",
                       xlabel="Sample ID", ylabel="Complexity score",
                       grid=False, figsize=(20, 9)
                       ).get_figure()
    fig.savefig("dummy_results.png")
# Example 15
def benchmark(X_train, X_test, y_train, y_test):
    print("********************DummyRegressor Model******************")
    model = DummyRegressor()
    model.fit(X_train, y_train)
    print('{}'.format(model.score(X_test, y_test)))
    return model
# Example 16
# Load libraries
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
boston = load_boston()

# Create features
X, y = boston.data, boston.target

# Make test and training split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create a dummy regressor
dummy_mean = DummyRegressor(strategy='mean')

# "Train" dummy regressor
dummy_mean.fit(X_train, y_train)

# Create a dummy regressor that always predicts a constant value
dummy_constant = DummyRegressor(strategy='constant', constant=20)

# "Train" dummy regressor
dummy_constant.fit(X_train, y_train)

# Get R-squared score
dummy_constant.score(X_test, y_test)  
# Example 17
Dummy = DummyRegressor()
# Instantiate the DummyRegressor algorithm

Dummy.fit(X_train,y_train)
# Fit the DummyRegressor algorithm to my data.

Dummy_y_pred = Dummy.predict(X_test)
# Predict the y values using the dummy model.

print('DummyRegressor Mean Absolute Error:', metrics.mean_absolute_error(y_test,Dummy_y_pred))
print('DummyRegressor Mean Squared Error:', metrics.mean_squared_error(y_test,Dummy_y_pred))
print('DummyRegressor Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,Dummy_y_pred)))
print('DummyRegressor Accuracy:', metrics.r2_score(y_test,Dummy_y_pred))  # "accuracy" here is actually R-squared
Dummy_df = pd.DataFrame({'Actual':y_test, 'Predicted':Dummy_y_pred})
print(Dummy_df)
print("R-squared for Train: %.2f" %Dummy.score(X_train,y_train))
print("R-squared for Test: %.2f" %Dummy.score(X_test,y_test))

# ------------------------------------------- Output
# With random_state=0

# DummyRegressor Mean Absolute Error: 603.3633884521669
# DummyRegressor Mean Squared Error: 880156.477329049
# DummyRegressor Root Mean Squared Error: 938.1665509540665
# DummyRegressor Accuracy: -0.00020030937269321925
# R-squared for Train: 0.00
# R-squared for Test: -0.00

# With random_state=43
# DummyRegressor Mean Absolute Error: 606.3160356253877
# DummyRegressor Mean Squared Error: 1342621.3396696597
fig = plt.figure()
ax = fig.add_subplot(111)
red = ax.scatter(Xtrain, ytrain, color='red', marker='+')
knn_plot = ax.plot(Xtest, knn.predict(Xtest), color='green')
kridge_plot = ax.plot(Xtest, kridge.predict(Xtest), color='blue')
base = ax.plot(Xtest, dummy.predict(Xtest), color='orange', linestyle='--')
ax.set_ylabel("output Y", fontsize=20)
ax.set_xlabel("input X", fontsize=20)
fig.legend(["kNN", "KernelRidge", "baseline", "train"],
           scatterpoints=1,
           loc='right',
           ncol=2,
           fontsize=15)
ax.set_title(
    "kNN & KernelRidge Predictions", fontsize=20)


# Compute each model's score (R^2) on the training data
knn_accuracy = knn.score(Xtrain, ytrain)
kridge_accuracy = kridge.score(Xtrain, ytrain)
baseline_accuracy = dummy.score(Xtrain, ytrain)


# Print outputs
print("base model accuracy score: ", baseline_accuracy,
      " - knn model accuracy score: ", knn_accuracy,
      " - kridge accuracy: ", kridge_accuracy)

plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# average SalePrice in train and test
print('mean SalePrice in train : {0:.3f}'.format(np.mean(y_train)))
print('mean SalePrice in test : {0:.3f}'.format(np.mean(y_test)))

from sklearn.dummy import DummyRegressor

# baseline model
model_dummy = DummyRegressor(strategy='mean')
model_dummy.fit(X_train, y_train)
print('score for baseline model: {0:.2f}'.format(
    model_dummy.score(X_test, y_test)))


def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]  # note: removed from LinearRegression in scikit-learn 1.2
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2, 3, 4, 5],
                'selection': ['random', 'cyclic']
# Example 20
ml_desafio02['Machine-Learning-RD'] = ridge_predicoes
ml_desafio02['Diferença-ML-RD'] = ((y_teste - ml_desafio02['Machine-Learning-RD'])**2)**.5
plt.figure(figsize=(14,8))
plt.title('Comparison of the SVM x Ridge machine learning differences in generating a grade')
sns.histplot(data = ml_desafio02, x = 'Diferença-ML-SVM', kde = True, stat='density', color='blue',alpha=1,fill=True,multiple='stack')
aula04_d2 = sns.histplot(data = ml_desafio02, x = 'Diferença-ML-RD', kde = True, stat='density', color='green',alpha=.5,fill=True,multiple='stack')
aula04_d2.set_xlabel('Difference value')
aula04_d2.set_ylabel('Difference count')
aula04_d2.legend(labels=('SVM','Ridge'),title=('Machine Learning'))

"""Desafio 03"""

# quantile/constant are ignored unless the matching strategy is set, so make the constant explicit
modelo_dummyv2 = DummyRegressor(strategy='constant', constant=9)
modelo_dummyv2.fit(x_treino,y_treino)
dummy_predicoesv2 = modelo_dummyv2.predict(x_teste)
print(modelo_dummyv2.score(X=x_treino,y=y_treino))
print(mean_squared_error(y_teste,ridge_predicoes)**.5)
print(mean_squared_error(y_teste,predicoes_matematica)**.5)
print(mean_squared_error(y_teste,dummy_predicoes)**.5)
print(mean_squared_error(y_teste,dummy_predicoesv2)**.5)

"""Desafio 04"""

from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
validadores = [explained_variance_score,max_error,mean_absolute_error,mean_squared_error,median_absolute_error,r2_score]
validadores_nome = ['explained_variance_score','max_error','mean_absolute_error','mean_squared_error','median_absolute_error','r2_score']
predicoes = [ridge_predicoes,predicoes_matematica,dummy_predicoes]
predicoes_nome = [' ridge_predicoes ',' predicoes_matematica ',' dummy_predicoes ']
for posy, y in enumerate(predicoes):
  print(f'{predicoes_nome[posy]:-^50}')
  for posx, x in enumerate(validadores):
    print(f'{validadores_nome[posx]}: {x(y_teste, y)}')
bottom_99_dataset.iloc[0, :57]

# **DummyRegressor with mean strategy as a baseline**

# In[24]:

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

X = bottom_99_dataset[['year']]
y = bottom_99_dataset['total_net_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train)
model.score(X_test, y_test)

# In[25]:

from sklearn.preprocessing import LabelEncoder

le_state = LabelEncoder()
le_city = LabelEncoder()
factor_columns = ['state_company', 'city']
model_dataset = bottom_99_dataset.dropna(subset=factor_columns)
model_dataset['state_company'] = le_state.fit_transform(
    model_dataset['state_company'])
model_dataset['city'] = le_city.fit_transform(model_dataset['city'])

model_columns = [
    'cnpj', 'issue_date_day', 'issue_date_month', 'issue_date_year'
# Example 22
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert reg.score(None, y_test) == 1.0
# Example 23
R_ridge_train_score = ridge.score(X_train, y_train)
R_ridge_test_score = ridge.score(X_test, y_test)
y_pred_ridge = ridge.predict(X_test)
# regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=param_grid, n_jobs=n_jobs)
# regressor.fit(X_train, y_train)
for k, (train, test) in enumerate(cv.split(X, y)):
    pdb.set_trace()
    estimator.fit(X[train], y[train])
    y_pred = estimator.predict(X_test)

    # # best_est= regressor.best_estimator_

    # # print "Best Estimator Parameters"
    # # print"---------------------------"
    # # print "n_estimators: %d" %best_est.n_estimators
    # # print "max_depth: %d" %best_est.max_depth
    # # print "Learning Rate: %.1f" %best_est.learning_rate
    # # print "min_samples_leaf: %d" %best_est.min_samples_leaf
    # # print "max_features: %.1f" %best_est.max_features
    # # print "Train R-squared: %.2f" %best_est.score(X_train,y_train)
    print "Feature Importances"
    print estimator.feature_importances_
    print "R-squared for Train: %.2f" % estimator.score(X[train], y[train])
    print "R-squared for Test: %.2f" % estimator.score(X[test], y[test])

dummy = DummyRegressor()
dummy.fit(X_train, y_train)
R_dummy = dummy.score(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)

pdb.set_trace()
import pandas as pd
from sklearn.dummy import DummyRegressor

# Loading in the data
canucks = pd.read_csv('data/canucks_subbed.csv')

# Define X and y
X = canucks.loc[:, ['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Create a model
model = DummyRegressor(strategy="mean")

# Fit your data
model.fit(X, y)

# Predict the labels of X
model.predict(X)

# The model's R^2 score (score returns R-squared, not accuracy, for regressors)
accuracy = round(model.score(X, y), 2)

accuracy
# For cv, k-fold cross-validation is by far the most common approach, but there are
# others, such as leave-one-out cross-validation, where the number of folds equals the number of samples.
# The scoring parameter specifies the criterion for measuring model performance; other sections of this chapter discuss it.
# n_jobs=-1 uses all available CPU cores for the computation.
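
# A small sketch (not in the original) showing the cv / scoring / n_jobs
# parameters described above, using the Boston data loaded in 11.2 below:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
scores = cross_val_score(DummyRegressor(strategy='mean'), features, target,
                         cv=5, scoring='r2', n_jobs=-1)
print(scores.mean())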

# 11.2 Creating a baseline regression model
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

boston = load_boston()
features, target = boston.data, boston.target
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0)

dummy = DummyRegressor(strategy='mean')     # Create a DummyRegressor object, a simple baseline regression model
dummy.fit(features_train, target_train)     # Train the regressor
dummy.score(features_test, target_test)     # Compute the R-squared score (score returns R-squared)

from sklearn.linear_model import LinearRegression       # Train your own model and compare it against the baseline
ols = LinearRegression()
ols.fit(features_train, target_train)
ols.score(features_test, target_test)

# DummyRegressor lets us create a very simple model to use as a baseline against the real model.
# This is often used to simulate an existing naive prediction system in a product or pipeline.
# The available strategies include the mean or the median of the training set. If strategy is set
# to 'constant' (together with the constant parameter), the model always predicts that constant.

clf = DummyRegressor(strategy="constant",constant=20)
clf.fit(features_train, target_train)
clf.score(features_test, target_test)   # The closer R-squared is to 1, the better the features explain the target vector
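
# For completeness, a hedged sketch of the remaining built-in strategy,
# 'quantile', which always predicts a fixed quantile of the training target:
dummy_q = DummyRegressor(strategy='quantile', quantile=0.9)
dummy_q.fit(features_train, target_train)
dummy_q.score(features_test, target_test)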

# 11.3 Creating a baseline classification model
from sklearn.datasets import load_iris
# Example 26
# Load libraries.
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Load the data.
boston = load_boston()

# Create features.
features, target = boston.data, boston.target

# Split into training and test sets.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, random_state=0)

# Create a dummy regressor.
dummy = DummyRegressor(strategy='mean')

# "Wytrenowanie" sztucznego regresora.
dummy.fit(features_train, target_train)

# Get the R-squared score.
dummy.score(features_test, target_test)
# In[5]:


# Calculate and print RMSE training set error of the dummy model
from sklearn.metrics import mean_squared_error
dummy_r_training_rmse = np.sqrt(mean_squared_error(y_train_r, dummy_r.predict(X_train_r)))
print('dummy RMSE: {:.3f}'.format(dummy_r_training_rmse))


# In[6]:


# Calculate and print the R2 training set score of the dummy model
# hint: can use models 'score' function
dummy_r_training_r2 = dummy_r.score(X_train_r, y_train_r)
print('dummy R2: {:.3f}'.format(dummy_r_training_r2))


# In[7]:


# Calculate and print the mean 5-fold cross-validation R2 score of the dummy model
from sklearn.model_selection import cross_val_score
dummy_r_cv = cross_val_score(dummy_r, X_train_r, y_train_r, cv=5)
print('dummy mean cv R2: {:.3f}'.format(np.mean(dummy_r_cv)))


# ### Measure performance of Linear Regression

# In[8]: