Example 1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Fitting the Linear model

ML_Regression = LinearRegression()
ML_Regression.fit(X_train, Y_train)

# Predicting the results

Y_pred = ML_Regression.predict(X_test)

# Building the optimal model with backward elimination
X = np.append(arr=np.ones((50, 1)), values=X, axis=1)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()

OLS_Regression.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()

X_opt = X[:, [0, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()

X_opt = X[:, [0, 3, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()
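# The four elimination rounds above repeat the same fit-inspect-drop cycle. The loop
# below is only a sketch of automating it, assuming X already carries the leading
# column of ones and Y is the target; the 0.05 threshold is an assumption, and the
# constant column is treated like any other predictor.
import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, sl=0.05):
    """Drop the predictor with the highest p-value until all are below sl."""
    X_opt = X.copy()
    while True:
        results = sm.OLS(endog=y, exog=X_opt).fit()
        worst = int(np.argmax(results.pvalues))
        if results.pvalues[worst] <= sl:
            return X_opt, results
        X_opt = np.delete(X_opt, worst, axis=1)

# e.g. X_opt, OLS_Regression = backward_elimination(X, Y)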
Example 2
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

print(y_pred)

#backward elimination
import statsmodels.api as sm
X = np.append(arr=np.ones((14, 1)).astype(int),
              values=sonveriler.iloc[:, :-1],
              axis=1)
X_l = sonveriler.iloc[:, [0, 1, 2, 3, 4, 5]].values
r_ols = sm.OLS(endog=sonveriler.iloc[:, -1:], exog=X_l)
r = r_ols.fit()
print(r.summary())

sonveriler = sonveriler.iloc[:, 1:]

import statsmodels.api as sm
X = np.append(arr=np.ones((14, 1)).astype(int),
              values=sonveriler.iloc[:, :-1],
              axis=1)
X_l = sonveriler.iloc[:, [0, 1, 2, 3, 4]].values
r_ols = sm.OLS(endog=sonveriler.iloc[:, -1:], exog=X_l)
r = r_ols.fit()
print(r.summary())

x_train = x_train.iloc[:, 1:]
# Splitting the data set into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#  Fitting Multiple Linear Regression to the training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the test results
y_pred = regressor.predict(X_test)

# Building the optimal model using backward elimination method
# X = np.append(arr=X, values=np.ones((50,1)).astype(int), axis=1)
X = np.append(arr=np.ones((50,1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
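# Note: statsmodels ships a helper for the intercept column, so the np.append trick
# above can be written more directly; a small sketch, assuming X is the raw feature
# matrix and y the target.
import statsmodels.api as sm

X_const = sm.add_constant(X)  # prepends a column of ones
regressor_OLS = sm.OLS(endog=y, exog=X_const).fit()
print(regressor_OLS.summary())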

Example 4
r2 = LinearRegression()
r2.fit(x_train, y_train)

y_prediction2 = r2.predict(x_test)

X = np.append(arr=np.ones((22, 1)).astype(int), values=veri, axis=1)
X_list = veri.iloc[:, [0, 1, 2, 3, 4, 5]].values

r = sm.OLS(endog=boy, exog=X_list).fit()

print(r.summary())

X = np.append(arr=np.ones((22, 1)).astype(int), values=veri, axis=1)
X_list = veri.iloc[:, [0, 1, 2, 3, 5]].values

r = sm.OLS(endog=boy, exog=X_list).fit()

print(r.summary())
oneHotDev = pd.concat([oneHotDev, dev[featuresNum], dev.iloc[:,-1]], axis=1)
oneHotTest = pd.concat([oneHotTest, test[featuresNum]], axis=1)

# reduce features using statsmodels
oneHotTrainSM = pd.concat([pd.DataFrame(np.ones(len(oneHotTrain)), columns=['bias']), oneHotTrain], axis=1)
oneHotDevSM = pd.concat([pd.DataFrame(np.ones(len(oneHotDev)), columns=['bias']), oneHotDev], axis=1)
oneHotTestSM = pd.concat([pd.DataFrame(np.ones(len(oneHotTest)), columns=['bias']), oneHotTest], axis=1)

# initialize variables
pVal = 0.15
count = 0

# reduce features by dropping the predictor with the highest p-value
while count < len(oneHotTrainSM.columns)-1:
    count += 1
    regOLS = sm.OLS(endog=oneHotTrainSM.iloc[:,-1], exog=oneHotTrainSM.iloc[:,:-1]).fit()
    if regOLS.pvalues.max() > pVal:
        worst = regOLS.pvalues.idxmax()
        oneHotTrainSM = oneHotTrainSM.drop([worst], axis=1)
        oneHotDevSM = oneHotDevSM.drop([worst], axis=1)
        oneHotTestSM = oneHotTestSM.drop([worst], axis=1)
    else:
        break

if 'bias' in oneHotTrainSM.columns.values:
    oneHotTrainSM = oneHotTrainSM.drop(['bias'], axis=1)
    oneHotDevSM = oneHotDevSM.drop(['bias'], axis=1)
    oneHotTestSM = oneHotTestSM.drop(['bias'], axis=1)

# reassign training, dev and test data
oneHotTrain = oneHotTrainSM
oneHotDev = oneHotDevSM
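# A quick sanity check of the reduced feature set: fit on the surviving training
# columns and score on the dev split. Sketch only, assuming the last column of
# oneHotTrain / oneHotDev is still the target.
from sklearn.linear_model import LinearRegression

check = LinearRegression()
check.fit(oneHotTrain.iloc[:, :-1], oneHotTrain.iloc[:, -1])
print(check.score(oneHotDev.iloc[:, :-1], oneHotDev.iloc[:, -1]))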
Example 6
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import statsmodels.api as sm

data = pd.read_csv("C:/Users/GEU/Downloads/data2.csv")
X = data.iloc[:, :-1].values
Y = data.iloc[:, 4].values
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
# categorical_features was removed from OneHotEncoder in newer scikit-learn;
# ColumnTransformer now handles the categorical column
ct = ColumnTransformer([("cat", OneHotEncoder(), [0])],
                       remainder="passthrough", sparse_threshold=0)
X = ct.fit_transform(X)

X = np.append(np.ones([30, 1]).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5, 6]]
print(X_opt)
reg_ols = sm.OLS(endog=Y, exog=X_opt).fit()
print(reg_ols.summary())
X_train, X_test, Y_train, Y_test = train_test_split(X_opt,
                                                    Y,
                                                    test_size=1 / 3,
                                                    random_state=0)
m = LinearRegression()
m.fit(X_train, Y_train)
print(m.score(X_test, Y_test))
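# Besides individual p-values, the adjusted R-squared reported by the fitted OLS
# results helps judge whether dropping a column actually helped. Sketch only;
# column index 2 below is an illustrative choice, not taken from the data.
full = sm.OLS(endog=Y, exog=X_opt).fit()
reduced = sm.OLS(endog=Y, exog=np.delete(X_opt, 2, axis=1)).fit()
print(full.rsquared_adj, reduced.rsquared_adj)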
"""ADDING b0"""
data_length = len(df_train)
df_train['b0'] = [1] * data_length

"""BACKWARD ELIMINATION"""
max_p_value = 1
non_significant_column = None
eliminator = None
num = 0
sm_result = None
while max_p_value > 0.05:
    if non_significant_column is not None:
        del df_train[non_significant_column]
        del df_test[non_significant_column]
    sm_result = sm.OLS(endog=df_train["SalePrice"], exog=df_train.loc[:, df_train.columns != "SalePrice"]).fit()
    p_values = sm_result.pvalues
    max_p_value = p_values.max()
    non_significant_column = p_values.idxmax()

# remove b0
del df_train['b0']
""" LOGISTIC REGRESSION """
regressor = LogisticRegression(random_state=0)
regressor.fit(df_train.loc[:, df_train.columns != "SalePrice"], df_train["SalePrice"])
prediction = regressor.predict(df_test.loc[:, df_test.columns != "SalePrice"])
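# The same p-value loop appears again further down for the "Survived" data. A
# reusable helper along these lines would avoid the duplication; this is only a
# sketch, assuming a pandas DataFrame that still contains the target column.
import statsmodels.api as sm

def backward_eliminate(df, target, sl=0.05):
    """Return df with predictors whose p-value exceeds sl removed."""
    df = df.copy()
    df['b0'] = 1  # intercept column
    while True:
        exog = df.loc[:, df.columns != target]
        result = sm.OLS(endog=df[target], exog=exog).fit()
        worst = result.pvalues.idxmax()
        if result.pvalues[worst] <= sl:
            break
        df = df.drop(columns=[worst])
    return df.drop(columns=['b0'], errors='ignore')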

# #################
# SUBMIT ANSWER
# #################
regression.fit(x_train2,y_train2)

boy_pred = regression.predict(x_test2)

#print(y_test2)
#print(boy_pred)



#-----------------backwardElimination----------------------------


X = np.append(arr=np.ones((22, 1)).astype(int), values=newData, axis=1)  # a column of ones (int) is created and prepended to the newData array
#print(X)
X_l = newData.iloc[:, [0, 1, 2, 3, 4, 5]].values  # taken as an array so it can be manipulated later
print(type(X_l))  # the point here: the multiple linear regression model is y = b0 + b1x1 + b2x2 + b3x3 + ...
                  # we already have the independent variables for this equation, but not the constant b0; to include it,
                  # we add a column of ones. The column consists of ones because the constant term's x0 is 1.

boyArray = boyDFrame.iloc[:, 0:1].values
result_OLS = sm.OLS(endog=boyArray, exog=X_l)  # produces the OLS report (covariance, variance, p-values, etc.) of the other variables with respect to boy;
r = result_OLS.fit()                           # fit() has to be called for that to happen.

print(r.summary())  # summary of the computed values; from here we eliminate the variable with the largest p-value.

X_l2 = newData.iloc[:, [1, 2, 3, 4, 5]].values
result_OLS2 = sm.OLS(endog=boyArray, exog=X_l2)  # here the variable at index 0 was eliminated; this continues the same way, usually until every p-value is below 0.05

r2 = result_OLS2.fit()

print(r2.summary())
Example 9
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# building the optimal model using Backward Elimination

import statsmodels.api as sm
X = np.append(np.ones((50, 1)).astype(int), X, 1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())
# remove the predictor with the highest p-value above 5% (index 2)
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

# remove the predictor with the highest p-value above 5% (index 1)
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

# remove the predictor with the highest p-value above 5% (index 4)
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())
Example 10
def _fit_ols(y, x, **kwargs):
    """ Perform the basic ols regression."""
    # mixed effects code is obtained here:
    # http://stackoverflow.com/a/22439820/1167475
    return [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns]
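# _fit_ols only constructs the models; a hypothetical usage, assuming y is a
# DataFrame with one response per column and x a shared design matrix, fits each
# model and collects the coefficients.
import pandas as pd

models = _fit_ols(y, x)
fits = [model.fit() for model in models]
coefs = pd.DataFrame({col: fit.params for col, fit in zip(y.columns, fits)})
print(coefs)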
Example 11
def ols_summary(ones_length, x_dataset, y_dataset, columns):
    # prepend the intercept column, then select the requested predictor columns
    B0 = np.append(arr=np.ones((ones_length, 1)).astype(int), values=x_dataset, axis=1)
    ols = sm.OLS(endog=y_dataset, exog=B0[:, columns]).fit()
    return ols.summary()
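# Hypothetical usage, assuming a 50-row feature matrix X and target y; column 0
# of the appended matrix is the constant term.
print(ols_summary(50, X, y, [0, 1, 2, 3, 4, 5]))
print(ols_summary(50, X, y, [0, 3, 5]))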



#-----------------------------------------------------------
##### 6- Build Optimal model using Backward Elimination #####
#-----------------------------------------------------------

import statsmodels.api as sm

# add a column of 1s at the beginning of the matrix
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)

# X_opt will have matrix of variables that have high impact on the profit
X_opt = X[:, [0, 1, 2, 3, 4, 5]] #copy entire X
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit() # fit the model with all possible predictors 
regressor_OLS.summary() # view the summary and P-value

# remove the independent variable with the highest p-value (P>|t|) above the significance level SL = 0.05 (5%)
X_opt = X[:, [0, 1, 3, 4, 5]] #copy entire X
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit() # fit the model with all possible predictors 
regressor_OLS.summary() # view the summary and P-value

# remove the independent variable with the highest p-value (P>|t|) above the significance level SL = 0.05 (5%)
X_opt = X[:, [0, 3, 4, 5]] #copy entire X
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit() # fit the model with all possible predictors 
regressor_OLS.summary() # view the summary and P-value

# remove the independent variable with the highest p-value (P>|t|) above the significance level SL = 0.05 (5%)
X_opt = X[:, [0, 3, 5]] #copy entire X
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit() # fit the model with all possible predictors 
"""ADDING b0"""
data_length = len(X_train)
X_train['b0'] = [1] * data_length

"""BACKWARD ELIMINATION"""
max_p_value = 1
non_significant_column = None
eliminator = None
num = 0
sm_result = None
while max_p_value > 0.05:
    if non_significant_column is not None:
        del X_train[non_significant_column]
        del X_test[non_significant_column]
    sm_result = sm.OLS(endog=y_train, exog=X_train.loc[:, X_train.columns != "Survived"]).fit()
    p_values = sm_result.pvalues
    max_p_value = p_values.max()
    non_significant_column = p_values.idxmax()

# remove b0
del X_train['b0']

""" KNN """
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train.loc[:, X_train.columns != "Survived"], y_train)
prediction = classifier.predict(X_test.loc[:, X_test.columns != "Survived"])

plt.plot(y_test, color='red')

# USE BACKWARD ELIMINATION METHOD
import statsmodels.formula.api as sm

# append X to an array of ones
X1 = np.append(arr=np.ones((len(dataset.index), 1)).astype(int),
               values=X,
               axis=1)

# step 1: significance level is 0.05

# step 2: Fit the full model with all possible predictors/variables
X1_optimal = X1[:, [0, 1, 2, 3, 4, 5]]
# use the ordinary least squares regressor
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()

# steps 3, 4, 5: Find the variable with the highest p-value, remove it from the model, and fit again
# x2 has the highest p-value, index 2
X1_optimal = X1[:, [0, 1, 3, 4, 5]]
# use the ordinary least squares regressor
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()

# x1 has the highest p-value, index 1
X1_optimal = X1[:, [0, 3, 4, 5]]
# use the ordinary least squares regressor
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()
Example 15
if deviance < dist.chi2.ppf(0.95, degree_of_freedom):
    print('test_regression_c est rejete pour best_regression_b')

# Finance
# a)
r_royal = helper.yf_log_yield_extractor('./data/royalbank_monthly.csv',
                                        number_datapoint=30)
r_royal.to_excel('./output/log_yield_royalbank.xlsx')

# b)
plt.scatter([x for x in range(30)], r_royal['Yield']**2)
plt.title('Yield squared vs Time')

# c)
plt.scatter([x for x in range(30)], r_royal['Yield'])
plt.title('Yield vs Time')

mu = np.mean(r_royal['Yield'])

y = (r_royal['Yield'][:29] - mu)**2
x = (r_royal['Yield'][1:30] - mu)**2
x = sti.add_constant(x)

regression = stt.OLS(list(y), np.array(x))
results = regression.fit()
results.summary()

h31 = results.predict((1, y[0]))[0]
h32 = results.predict((1, h31))[0]
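# The two manual one-step predictions above can be rolled forward if more horizons
# are needed; sketch only, assuming `results` is the fitted OLS from this block.
h = results.predict((1, y[0]))[0]
forecasts = [h]
for _ in range(4):
    h = results.predict((1, h))[0]
    forecasts.append(h)
print(forecasts)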
Example 16
x[:, 3] = labelencoder_X.fit_transform(x[:, 3])
# categorical_features was removed from OneHotEncoder in newer scikit-learn;
# ColumnTransformer now handles the categorical column
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("cat", OneHotEncoder(), [3])],
                       remainder="passthrough", sparse_threshold=0)
x = ct.fit_transform(x)

#Avoiding the dummy variable trap
x = x[:, 1:]

#Splitting training and test data sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=0) #split 20% test/80% train


#Fitting Multiple Linear Regression to training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() 
regressor.fit(x_train, y_train) 

#Predicting the Test set results
y_pred = regressor.predict(x_test)


#backward elimination
import statsmodels.api as sm
x = np.append(arr=np.ones((50,1)).astype(int),values=x, axis=1)
x_opt = x[:,[0,1,2,3,4,5]]
regressor_OLS=sm.OLS(endog=y, exog=x_opt).fit() #fit with all possible predictors
regressor_OLS.summary()
x_opt = x[:,[0,1,3,4,5]]
regressor_OLS=sm.OLS(endog=y, exog=x_opt).fit() #fit with all possible predictors
regressor_OLS.summary()
Example 17
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_new, Y_new, test_size=0.2, random_state=0)

#fitting the multiple linear regression model

regressor.fit(X_train_new, Y_train_new)

#predicting test set result

Y_pred_new = regressor.predict(X_test_new)

# backward elimination regression

import statsmodels.api as sm
X_new = np.append(arr=np.ones((50, 1)).astype(int), values=X_new, axis=1)
X_optimal = X_new[:, [0, 1, 2, 3, 4, 5]]
regressorfromols = sm.OLS(
    endog=Y_new,
    exog=X_optimal).fit()  # parameters are dependent and independent variable
regressorfromols.summary()  #show the summary including p-values
X_optimal = X_optimal[:, [0, 2, 3, 4]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()
X_optimal = X_optimal[:, [0, 1, 3]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()
X_optimal = X_optimal[:, [0, 1]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()
# Now we split the data into training and test sets for prediction

training_x, test_x, training_y, test_y = train_test_split(real_x, real_y, test_size=0.2, random_state=0)  # 20% of the data goes to the test set, the rest to training; random_state=0 makes the split reproducible

# now we fit the multiple linear regression on the training data
MLR = LinearRegression()
MLR.fit(training_x, training_y)  # training data

# now we make the prediction
pred_y = MLR.predict(test_x)
pred_y  # predicted values
# now you can compare the predicted values with the actual values
test_y  # actual values
MLR.coef_  # the coefficient values
MLR.intercept_  # the intercept value

# now to calculate the value through the formula
# y = b0 + b1x1 + b2x2 + ... + bnxn   (a column of ones is added so that b0 gets its own x0 = 1 term)

real_x = np.append(arr=np.ones((50, 1)).astype(int), values=real_x, axis=1)  # prepend a column of ones

x_opt = real_x[:, [0, 1, 2, 3, 4, 5]]
reg_OLS = sm.OLS(endog=real_y, exog=x_opt).fit()
reg_OLS.summary()

# if a p-value is greater than 0.05, remove that column index from x_opt
x_opt = real_x[:, [0, 1, 3, 4, 5]]  # index 2 removed; its p-value was above 0.05
reg_OLS = sm.OLS(endog=real_y, exog=x_opt).fit()
reg_OLS.summary()
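# Hypothetical check of the formula above (y = b0 + b1*x1 + ... + bn*xn): rebuild
# the predictions from the fitted intercept and coefficients, assuming test_x is a
# NumPy array of the original (un-augmented) features and real_y is one-dimensional.
manual_pred = MLR.intercept_ + test_x @ MLR.coef_
print(np.allclose(manual_pred, pred_y))  # True up to floating-point error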

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

import statsmodels.api as sm
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)
x_opt = x[:, [0, 1, 2, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 1, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()
Example 20
from sklearn.linear_model import LinearRegression

linerModel = LinearRegression()
linerModel.fit(X_train, y_train)
predictVal = linerModel.predict(X_test)
#print("============Predicted Value==============")
#print(predictVal)

# Build the optimal model using backward elimination
import statsmodels.api as sm
# The model y = b0 + b1x1 + b2x2 + ... + bnxn is rewritten as y = b0x0 + b1x1 + b2x2 + ... + bnxn with x0 = 1
#X = np.append(X, np.ones((48,1)).astype(int), axis=1)  # this would append the ones column in the last position
# We need the column of ones in the first position, which is why we prepend it
X = np.append(np.ones((48, 1)).astype(int), X, axis=1)
xOpt = X[:, [0, 1, 2, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
# Check the summary; if a p-value is above 0.05, eliminate the predictor with the highest p-value
print(regressorOls.summary())
# Eliminate the predictor with p = 0.959
xOpt = X[:, [0, 1, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())
# Now remove x2 because its p-value is 0.897
xOpt = X[:, [0, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())
# Backward elimination process
xOpt = X[:, [0, 3, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())
# Backward elimination process
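# Once the elimination settles on columns [0, 3, 5], the surviving predictors can be
# fed back into scikit-learn. Sketch only, assuming X and y from above; the constant
# column 0 is dropped because LinearRegression fits its own intercept.
from sklearn.model_selection import train_test_split

X_sel = X[:, [3, 5]]
X_tr, X_te, y_tr, y_te = train_test_split(X_sel, y, test_size=0.2, random_state=0)
lm = LinearRegression().fit(X_tr, y_tr)
print(lm.score(X_te, y_te))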
Example 21
index = aff.index(max_prob)

print("percentage of total women actually had an affair is : ", affair[1])
print("Prediction of women : ", index)

#optimal Model
import statsmodels.api as sm

features = np.append(arr=np.ones((6366, 1)).astype(int),
                     values=features,
                     axis=1)

features_opt = features[:, [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 11, 12, 13, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()
Example 22
# ====> training and test set divide
startup_data_train, startup_data_test, profit_train, profit_test = train_test_split(startup_data, profit, test_size=0.2)


# ====> Linear regression model 
regressor = LinearRegression()
regressor.fit(startup_data_train, profit_train)

# ====> predict the test results
profit_predict = regressor.predict(startup_data_test)

# ====> Building the optimal model using backward elimination
startup_data = np.append(arr = np.ones((50,1)).astype(int), values = startup_data, axis = 1)

startup_data_opt = startup_data[:, [0, 1, 2, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 1, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()
Example 23
              formula, density * u.grams / u.milliliter))
         for cas, (formula,
                   density) in data[["formula", "density"]].iterrows()))

data["gaff_corrected"] = data.gaff + data.polcorr
data["opls_corrected"] = data.opls + data.polcorr

figure()

plt.plot([0.01, 1], [0.01, 1], 'k')  # Guide
title("Inverse Static Dielectric (Virtual Chemistry; GAFF)")
xlabel("Predicted")
ylabel("Experiment")

x, y = data["gaff"], data["expt"]
ols_model = sm.OLS(y, x)
ols_results = ols_model.fit()
r2 = ols_results.rsquared
#plot(x, y, 'o', label="GAFF (R^2 = %.3f)" % r2)
plot(x**-1, y**-1, 'o', label="GAFF")

xlim((0.01, 1))
ylim((0.01, 1))
plt.gca().set_aspect('equal', adjustable='box')
plt.draw()
savefig("./manuscript/figures/dielectric_virtual_chemistry_gaff_nocorr.pdf",
        bbox_inches="tight")

x, y = data["gaff_corrected"], data["expt"]
ols_model = sm.OLS(y, x)
ols_results = ols_model.fit()
X1 = np.append(arr=X1, values=np.ones((rows_count, 1)).astype(float), axis=1)
X1 = X1[:, [5, 0, 1, 2, 3, 4]]
# 2nd way of appending, where the b0 column of ones ends up at index 0 (use either the 1st or the 2nd way)
X1 = np.append(arr=np.ones((rows_count, 1)).astype(int), values=X1, axis=1)
"""
for i in X1:
     X2 = X1[:,[5]] /2
"""

# 9-2) Now we are going to start backward elimination
import statsmodels.api as sm
# make a variable holding the collection of independent predictors
X1_optimized = X1[:, [0, 1, 2, 3, 4, 5]]  # delete indices step by step to keep only the significant predictors
regressor_OLS = sm.OLS(endog=Y1, exog=X1_optimized).fit()  # fit the full model with all predictors
regressor_OLS.summary()  # check the summary and remove the predictor with the highest p-value

X1_optimized = X1[:, [0, 3, 4, 5]]  # delete indices step by step to keep only the significant predictors
regressor_OLS = sm.OLS(endog=Y1, exog=X1_optimized).fit()  # fit the model with the remaining predictors
regressor_OLS.summary()

X1_optimized = X1[:, [3, 4, 5]]
    else:
        X_no[k, :] = X[i, :].reshape(1, colX)
        temp = temp - 1
        i = i + 1
        k = k + 1

#removing extra rows containing zeros
X_yes = X_yes[:-(rowX - j), :]
X_no = X_no[:-(rowX - k), :]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_yes = sc_X.fit_transform(X_yes)
X_no = sc_X.transform(X_no)

#building the optimal model using backward elimination
import statsmodels.api as sm
# appending a column of ones (the x0 term) to the X matrix
X = np.append(arr=np.ones((len(X_yes), 1)).astype(int), values=X_yes, axis=1)
# significance level SL = 0.05
X_opt = X_yes[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
regressor_OLS = sm.OLS(endog=np.ones((j, 1)), exog=X_opt).fit()
regressor_OLS.summary()

# Unfortunately, the p-values are almost all the same, which means a linear model does not fit this data well.
# Consider non-linear models such as kernel SVM, random forest, or XGBoost;
# kernel SVM and XGBoost are the suggested ones.
Example 26
#   opt_features = features[:, [0, 1, 2, 3, 4, 5]]      # Step1 - All-in
#   ols_regressor = sm.OLS(endog=output, exog=opt_features).fit()       # Step2 - Fit the model
#   ols_regressor.summary()     # Step3 - Consider the predictor with the highest P-value
#
#   opt_features = features[:, [0, 1, 3, 4, 5]]      # Step4 - Remove the predictor
#   ols_regressor = sm.OLS(endog=output, exog=opt_features).fit()    # Step2
#   ols_regressor.summary()     # Step3
#
#   opt_features = features[:, [0, 3, 4, 5]]        # Step4
#   ols_regressor = sm.OLS(endog=output, exog=opt_features).fit()    # Step2
#   ols_regressor.summary()     # Step3
#
#
# Way-1: Using threshold values ----------------------------------------------
opt_features = features[:, [0, 3, 5]]           # Step4
ols_regressor = sm.OLS(endog=target, exog=opt_features).fit()   # Step2
ols_regressor.summary()     # Step3
opt_features = features[:, [3]]                 # Step4
# Remove not only feature 5 but also feature 0, since it is the constant column we added.

# ***** Checking the Results with opt_features *****
opt_training_features = training_features[:, [2]]
opt_testing_features = testing_features[:, [2]]
opt_regressor = LinearRegression()
opt_regressor.fit(opt_training_features, training_target)
opt_predicted_target = opt_regressor.predict(opt_testing_features)
opt_error = abs(testing_target - opt_predicted_target)

# ***** Visualising Results (Optimised) *****
#   - Visualising the Training set results
plt.subplot(121)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Multiple linear regression
from sklearn.linear_model import LinearRegression
regresilinear = LinearRegression()
regresilinear.fit(x_train, y_train)

# Predictions from the fitted linear regression
y_pred = regresilinear.predict(x_train)

import statsmodels.api as sm
x = np.append(arr=np.ones((50, 1)), values=x, axis=1)
x_opt = x[:, [0, 1, 2, 3, 4, 5]]
# Stage 1 ------------------------------------------
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
# Stage 2 ------------------------------------------
x_opt = x[:, [0, 1, 3, 4, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
# Stage 3 ------------------------------------------
x_opt = x[:, [0, 3, 4, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
# Stage 4 ------------------------------------------
x_opt = x[:, [0, 3, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
# Stage 5 ------------------------------------------
x_opt = x[:, [0, 3]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
                       columns=['kilo', 'yas', 'cinsiyet'])

veri = pd.concat([Ilkpart, Sonpart], axis=1)

x_train, x_test, y_train, y_test = train_test_split(veri,
                                                    Boy,
                                                    test_size=0.33,
                                                    random_state=0)

regression.fit(x_train, y_train)

y_preg = regression.predict(x_test)

import statsmodels.api as sm

X = np.append(arr=np.ones((22, 1)).astype(int), values=veri, axis=1)
X_l = veri.iloc[:, [0, 1, 2, 3, 4, 5]].values
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())

X_l = veri.iloc[:, [0, 1, 2, 3, 5]].values
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())

X_l = veri.iloc[:, [0, 1, 2, 3]].values
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# try to predict the results on the test dataset and compare with the actual values
y_pred = regressor.predict(X_test)

# Building the optimal model (for performance, keep only the statistically significant predictors)
import statsmodels.api as sm
X = X[:, 1:-1]
'''
prepend a column of ones to the independent variables, for the coefficient of x0
'''
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
X_optimal_features = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_optimal_features).fit()
regressor_OLS.summary()
'''
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.013e+04   6884.820      7.281      0.000    3.62e+04     6.4e+04
x1           198.7888   3371.007      0.059      0.953   -6595.030    6992.607
x2           -41.8870   3256.039     -0.013      0.990   -6604.003    6520.229
x3             0.8060      0.046     17.369      0.000       0.712       0.900
x4            -0.0270      0.052     -0.517      0.608      -0.132       0.078
x5             0.0270      0.017      1.574      0.123      -0.008       0.062
==============================================================================
'''
# x2 has the highest p-value (> 0.05), so we remove it
X_optimal_features = X[:, [0, 1, 3, 4, 5]]
Example 30
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(s3.iloc[:,1:2],s3.iloc[:,-1:],test_size=0.33,random_state=0)



from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(x_train,y_train)

y_pred=regressor.predict(x_test)
print(y_pred)

import statsmodels.api as sm
X=np.append(arr=np.ones((14,1)).astype(int),values=s3.iloc[:,:-1],axis=1)
X_l=s3.iloc[:,1].values
r_ols=sm.OLS(endog=s3.iloc[:,-1:],exog=X_l)
r=r_ols.fit()
print(r.summary())

'''X_l=s3.iloc[:,[0,1,2,3,5]].values
r_ols=sm.OLS(endog=boy,exog=X_l)
r=r_ols.fit()
print(r.summary())
'''
y_pred=regressor.predict(x_test)