# Scatter of Radio spend vs Sales, with a previously fitted line overlaid.
data.plot(kind = 'scatter',x='Radio',y= 'Sales')

# NOTE(review): X_new/preds are not defined in this excerpt — presumably the
# predictions of an earlier single-feature fit (likely TV). Overlaying that
# line on a Radio scatter may be a copy-paste slip; confirm against the caller.
plt.plot(X_new,preds,c='Red',linewidth=4)



# Scatter of Newspaper spend vs Sales, with the same fitted line overlaid.
data.plot(kind = 'scatter',x='Newspaper',y= 'Sales')

plt.plot(X_new,preds,c='Green',linewidth=1)



import statsmodels.formula.api as smf
# Fit a simple linear regression of Sales on TV via the R-style formula API.
lr = smf.ols(formula = 'Sales ~ TV', data=data).fit()
# 95% confidence intervals for the intercept and the TV coefficient
lr.conf_int()



# FINDING THE PROBABILITY VALUES (p-values for each coefficient)
lr.pvalues

# FINDING THE R-SQUARED VALUE (fraction of Sales variance explained)
lr.rsquared


# MULTILINEAR REGRESSION — assemble the design matrix X and response y
feature_cols = ['TV','Radio','Newspaper']
X = data[feature_cols]
y = data.Sales
# Ejemplo n.º 2  (example-separator artifact from the source page)
# 0
# Scatter of Irrigation_pumping vs Observed, with a fitted line overlaid.
data.plot(kind='scatter', x='Irrigation_pumping', y='Observed')
# NOTE(review): X_new_irrig / preds_irrig are defined outside this excerpt —
# presumably predictions from a single-feature fit on Irrigation_pumping; verify.
plt.plot(X_new_irrig, preds_irrig, c='red', linewidth=2)

# %%
# Fit a multiple linear regression of Observed on six predictors in one call
# (formula API builds the design matrix, including the intercept, from `data`).
lm = smf.ols(
    formula=
    'Observed ~ Evapotranspiration + month + Precipitation + Segment_id + Irrigation_pumping + year',
    data=data).fit()

# print the coefficients
display(lm.params)
# BUG FIX: the label said "rquared" — corrected to name the statistic properly.
print(f"R-squared: {lm.rsquared}")
# print the confidence intervals for the model coefficients
display(lm.conf_int())

# %%
# Visualize each candidate predictor against the response with a seaborn
# regression plot, one figure per predictor.
for predictor in ('Evapotranspiration', 'Precipitation'):
    plt.figure()
    ax = sns.regplot(x=predictor, y='Observed', data=data)

# %%
# Single-predictor OLS fit; inspect the coefficient p-values.
lm = smf.ols(formula='Observed ~ Evapotranspiration', data=data).fit()
# print the p-values for the model coefficients
lm.pvalues

# %%
# Print the R-squared value for the model using statsmodels
# Ejemplo n.º 3  (example-separator artifact from the source page)
# 0
                                    )

# Model construction using matrices, as in scikit-learn
# ==============================================================================
# A column of 1s must be added to the predictor matrix for the model intercept
X_train = sm.add_constant(X_train, prepend=True)
modelo = sm.OLS(endog=y_train, exog=X_train,)
modelo = modelo.fit()
print(modelo.summary())


# Confidence intervals for the model coefficients (alpha=0.05 -> 95% CIs)
# ==============================================================================
print('')
print('Intervalos de confianza para los coeficientes del modelo')
print(modelo.conf_int(alpha=0.05))
print('')


# Predictions:

# Once the model is trained, predictions can be obtained for
# new data. statsmodels models allow computing the
# predictions in two ways:

# .predict(): returns only the predicted values.

# .get_prediction().summary_frame(): returns, in addition to
# the predictions, the associated confidence intervals.

predicciones = modelo.get_prediction(exog = X_train).summary_frame(alpha=0.05)
# then, plot the least squares line
# NOTE(review): X_new / preds are not defined in this excerpt and are unrelated
# to `predicciones` above — presumably left over from an earlier simple
# regression; confirm this plot uses the intended data.
plt.plot(X_new, preds, c='red', linewidth=2)

# ## Confidence in the Model
#
# **Question:** Is linear regression a high bias/low variance model, or a low bias/high variance model?
#
# **Answer:** It's a High bias/low variance model. Under repeated sampling, the line will stay roughly in the same place (low variance), but the average of those models won't do a great job capturing the true relationship (high bias). Note that low variance is a useful characteristic when you don't have a lot of training data.
#
# A closely related concept is **confidence intervals**. Statsmodels calculates 95% confidence intervals for your model coefficients, which are interpreted as follows: If the population from which this sample was drawn was **sampled 100 times**, approximately **95 of those confidence intervals** would contain the "true" coefficient.

# In[12]:

import statsmodels.formula.api as smf
# Refit the simple Sales ~ TV model and show 95% CIs for its two coefficients.
lm = smf.ols(formula='Sales ~ TV', data=data).fit()
lm.conf_int()

# Keep in mind that you only have a **single sample of data**, and not the **entire population of data**. The "true" coefficient is either within this interval or it isn't, but there's no way to actually know.
# You estimate the coefficient with the data you have and indicate uncertainty about the estimate by giving a range that the coefficient probably falls within
#
# Note that using 95% confidence intervals is just a convention. You can create 90% confidence intervals (which will be more narrow), 99% confidence intervals (which will be wider), or whatever intervals you like.

# ## Hypothesis Testing and p-values
#
# Closely related to confidence intervals is **hypothesis testing**. Generally speaking, you start with a **null hypothesis** and an **alternative hypothesis** (that is opposite the null). Then, you check whether the data supports **rejecting the null hypothesis** or **failing to reject the null hypothesis**.
#
# (Note that "failing to reject" the null is not the same as "accepting" the null hypothesis. The alternative hypothesis may indeed be true, except that you just don't have enough data to show that.)
#
# As it relates to model coefficients, here is the conventional hypothesis test:
# - **null hypothesis:** There is no relationship between TV ads and Sales (and thus $\beta_1$ equals zero)
# - **alternative hypothesis:** There is a relationship between TV ads and Sales (and thus $\beta_1$ is not equal to zero)
# Ejemplo n.º 5  (example-separator artifact from the source page)
# 0
# Now I'm going to try a different model - the decision tree regressor (from sklearn)
from sklearn.tree import DecisionTreeRegressor

# random_state pins split tie-breaking so the fit is reproducible
treereg = DecisionTreeRegressor(random_state=1)
treereg.fit(X_train, y_train)

# print metrics (specific to the problem, not the model)
treereg.score(X_test, y_test)  # R^2 on the held-out set
predstr = treereg.predict(X_test)
metrics.mean_squared_error(y_test, predstr)
np.sqrt(metrics.mean_squared_error(y_test, predstr))  # test RMSE

## cross validation
# BUG FIX: sklearn's scorer is named "neg_mean_squared_error" (the bare
# "mean_squared_error" name was deprecated and removed). It returns negated
# MSE values — which is exactly why the sign is flipped before the sqrt below.
scorestr = cross_val_score(treereg, X, y, cv=10, scoring="neg_mean_squared_error").mean()
np.sqrt(-scorestr)  # cross-validated RMSE

# Now, let's run a linear regression in statsmodels so I can look at the p-values for all my features
import statsmodels.formula.api as smf

# create a fitted model in one line (OLS) — one dummy/indicator term per
# post type and topic category
lm = smf.ols(
    formula="SharesperUser ~ SA + link + video + status + photo + AboutAmerica + Art + Asean + Campususa + CivilSociety + Culture + Democracy + Development + Diversity + DrugEnforcement + Economy + Education + EnglishLearning + Entrepreneurship + Environment + FreeExpression + GoodGovernance + Grants + Health + HumanRights + MissionAffairs + Policy + Preservation + ScienceTech + Sports + StudyinUSA + Trade + TraveltoUSA + Visas + Women + Yali + Yseali",
    data=data,
).fit()

# print the coefficients and other metrics
lm.params      # fitted coefficients
lm.rsquared    # R-squared
lm.conf_int()  # 95% confidence intervals
lm.pvalues     # coefficient p-values