Example #1
# Assumes these imports from the original module:
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# import matplotlib.pyplot as plt
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        # StandardScaler expects a 2-D array, so select the column as a one-column frame
        X = StandardScaler().fit_transform(X_train[[col]])
        lr = LinearRegression()
        lr.fit(X, y_train)
        coef.append([col, lr.coef_[0]])
    # sort features by coefficient, largest first
    coef = sorted(coef, key=lambda x: x[1], reverse=True)
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    # drop the intercept term, if present, then relabel doubles/triples
    if 'Intercept' in labs:
        idx = labs.index('Intercept')
        labs.pop(idx)
        nos.pop(idx)
    labs = ['2B' if lab == 'doubles' else '3B' if lab == 'triples' else lab
            for lab in labs]
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print(labs)
Example #2
def _reduce_X(self, X, i):
    """Remove the linear component of column i from every other column."""
    X_new = np.zeros(X.shape)
    lr = LinearRegression()
    for j in range(X_new.shape[1]):
        # regress column j on column i, then subtract the fitted component
        lr.fit(X=X[:, i].reshape(-1, 1), y=X[:, j])
        X_new[:, j] = X[:, j] - lr.coef_[0] * X[:, i]
    return np.delete(X_new, i, axis=1)
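A quick standalone check of the same partialling-out idea (toy data and the names rng, x0, x1 are assumed for illustration): after the subtraction, the remaining column is uncorrelated with column i.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x0 = rng.normal(size=200)
x1 = 2.0 * x0 + rng.normal(scale=0.1, size=200)

# regress x1 on x0, then subtract the fitted linear component
lr = LinearRegression().fit(x0.reshape(-1, 1), x1)
residual = x1 - lr.coef_[0] * x0
print(round(np.corrcoef(residual, x0)[0, 1], 6))  # ~0: x0's linear effect removed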
Example #3
def train_regressor(options, embed_map, wordvecs, worddict):
    """
    Return regressor to map word2vec to RNN word space
    """
    # Gather all words from word2vec that appear in wordvecs
    d = defaultdict(lambda : 0)
    for w in embed_map.vocab.keys():
        d[w] = 1
    shared = OrderedDict()
    count = 0
    for w in list(worddict.keys())[:options['n_words']-2]:  # dict views are not sliceable in Python 3
        if d[w] > 0:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    w2v = numpy.zeros((len(shared), 300), dtype='float32')
    sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
    for w in shared.keys():
        w2v[shared[w]] = embed_map[w]
        sg[shared[w]] = wordvecs[w]

    clf = LinearRegression()
    clf.fit(w2v, sg)
    return clf
Example #4
def linearRegressionExample(X, Y):
    # fit_intercept controls whether an intercept term is estimated
    est = LinearRegression(fit_intercept=False)
    # fit the data
    est.fit(X, Y)
    # return the coefficients
    return est.coef_
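A small check (toy data, assumed for illustration) of what fit_intercept=False does: the fitted line is forced through the origin, so the slope absorbs any offset in the data.

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1.0], [2.0], [3.0]])
Y = np.array([3.0, 5.0, 7.0])  # generated by y = 2x + 1

with_int = LinearRegression(fit_intercept=True).fit(X, Y)
no_int = LinearRegression(fit_intercept=False).fit(X, Y)
print(with_int.coef_, with_int.intercept_)  # ~[2.0] 1.0
print(no_int.coef_, no_int.intercept_)      # ~[2.43] 0.0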
Example #5
def normalize_money_with_date():
    with open('train_test.pickle', 'rb') as f:  # pickle files must be opened in binary mode
        train_set,test_set = pickle.load(f)
    
    money = float(np.max([movie['total_money'] for movie in train_set]))
    year_money = np.array([[movie['date'].year,float(movie['total_money'])/money] for movie in train_set],float)
    
    year_mean = np.zeros([5,2])
    for y in range(5):
        money = year_money[year_money[:,0] == 2011+y,1]
        plt.scatter(y*np.ones(np.shape(money)),money)
        mean = np.mean(money)
        year_mean[y,:] = np.array([1+y,mean],float)
    
    regressor = LinearRegression()
    regressor.fit(year_mean[:,0:1],year_mean[:,1])
    a,b = regressor.coef_, regressor.intercept_
    with open('coef.pickle', 'rb') as f:
        coef = pickle.load(f)
        coef['normalize_year'] = {'a':a,'b':b,'base':2010}
    with open('coef.pickle', 'wb') as f:
        pickle.dump(coef,f)
    
    print(a, b, regressor.score(year_mean[:,0:1], year_mean[:,1]))
    plt.plot(year_mean[:,1])
    plt.savefig('year_money.png')
Example #6
def calc_task_two_one():
    warnings.warn("deprecated", DeprecationWarning)
    model = LinearRegression()
    X = np.array(df[x_list].values)
    y = df['Price'].values
    model.fit(X, y)
    return model, X, y
Example #7
    def RunLinearRegressionScikit(q):  # nested inside a method in the original, hence the use of self below
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the responses 
      # file.
      Log.Info("Loading dataset", self.verbose)
      if len(self.dataset) == 2:
        X = np.genfromtxt(self.dataset[0], delimiter=',')
        y = np.genfromtxt(self.dataset[1], delimiter=',')
      else:
        X = np.genfromtxt(self.dataset, delimiter=',')
        y = X[:, (X.shape[1] - 1)]
        X = X[:,:-1]

      try:
        with totalTimer:
          # Perform linear regression.
          model = SLinearRegression(n_jobs=-1)  # n_jobs is a constructor argument, not a fit() argument
          model.fit(X, y)
          b = model.coef_
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #8
# train_x is not defined in this excerpt; the definition below mirrors the test
# slice and is an assumed reconstruction
train_x = data[["X","Y"]][:-2].values.reshape(-1,2)
train_y = data["Expected_output"][:-2].values.reshape(-1,1)

test_x = data[["X","Y"]][-2:].values.reshape(-1,2)
test_y = data["Expected_output"][-2:].values.reshape(-1,1)

model = LinearRegression()
model.fit(train_x,train_y)

coeff = model.coef_          # shape (1, 2): one coefficient per feature
intercept = model.intercept_
# fitted values over the training rows, using both features and the intercept
points = [intercept[0] + coeff[0][0]*row[0] + coeff[0][1]*row[1] for row in train_x]
plt.plot(points,"ro")
predict_y = model.predict(test_x)
plt.plot(test_y,predict_y,"b*")  # predicted vs. actual for the two held-out rows

print(predict_y)
plt.show()
Example #9
df.drop('type', axis=1, inplace=True)

df.drop(df[df['bedroom_num'] < df['bathroom_num'] - 1].index, inplace=True)

mean_price = df['price'].mean()
df.drop(df[df['price'] < mean_price - 75000].index, inplace=True)
df.drop(df[df['price'] > mean_price + 75000].index, inplace=True)
df['price'].describe()

X = df.drop('price', axis=1)
y = df['price']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error

#print(model.score(X_test,y_test))
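The metrics imported above are never used in this excerpt; a minimal sketch of applying them to the predictions:

print("R^2:", r2_score(y_test, predictions))
print("RMSE:", mean_squared_error(y_test, predictions) ** 0.5)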

with open('mumbai.pickle', 'wb') as f:
    pickle.dump(model, f)
X = list(X.columns)
X = '@'.join(X)
with open('locations2.txt', 'w') as file:
    file.write(X)
Example #10
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

previous_days = np.array([1, 7, 11, 21, 25, 28])
next_days = np.array([7, 11, 21, 25, 28, 31])
previous_days = previous_days.reshape(-1, 1)
# reshape turns the 1-D array into a 2-D column (sklearn expects 2-D features)
# scatter plot of the examples
# plt.scatter(previous_days,next_days )
# plt.xlabel("previous_days")
# plt.ylabel("next_days ")
# plt.show()
# print(previous_days)
ml_model = LinearRegression()
# supervised learning
ml_model.fit(previous_days, next_days)

ml_model.intercept_  # b in the formula (the intercept)
ml_model.coef_       # a in the formula (the slope)
# y_pred = ml_model.intercept_ + ml_model.coef_*X
# plt.scatter(previous_days,next_days , color='red')
# plt.xlabel('previous_days')
# plt.ylabel('next_days ')
# plt.plot(previous_days, ml_model.predict(previous_days))
# plt.show()
y_pred = ml_model.predict([[1]])
print(y_pred)
# print(help(ml_model.predict))
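The commented formula above is exactly what predict() computes; a quick check against the fitted parameters:

manual = ml_model.intercept_ + ml_model.coef_[0] * 1
print(manual, ml_model.predict([[1]])[0])  # the two values match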
Example #11
def main():
    train_X, train_y, valid_X, valid_y, train_w, val_w = get_data()

    print("********* Linear Regression *********")
    linear_regression = LinearRegression(fit_intercept=False)
    linear_regression.fit(train_X, train_y)
    train_y_predictions = linear_regression.predict(train_X)
    valid_y_predictions = linear_regression.predict(valid_X)

    # The betas
    linear_reg_beta = linear_regression.coef_
    print("True betas:{}\nEstimated linear regression betas:{}\n".format(
        _beta, linear_reg_beta))

    print(
        "Training loss with true beta:{:.3f}\nValidation loss with true beta:{:.3f}"
        .format(compute_loss_square_with_true_betas(train_X, train_y, _beta),
                compute_loss_square_with_true_betas(valid_X, valid_y, _beta)))
    print("Training square loss:{:.3f}\nValidation square loss:{:.3f}".format(
        compute_square_loss(train_y, train_y_predictions),
        compute_square_loss(valid_y, valid_y_predictions)))

    # The mean squared error
    print("Linear regression mean squared error: {:.2f}, {:.2f}\n".format(
        mean_squared_error(train_y, train_y_predictions),
        mean_squared_error(valid_y, valid_y_predictions)))

    print("********* Linear Regression Error Analysis *********")
    X = np.vstack([train_X, valid_X])
    print("Rank:{:d}".format(np.linalg.matrix_rank(X)))
    U, S, V = np.linalg.svd(X, full_matrices=False)
    X_SVD = U @ np.diag(S) @ V
    print("Is X close to X_SVD?", np.isclose(X, X_SVD).all())
    print("Singular values:{}".format(S))
    w = np.hstack([train_w, val_w])
    Inv_S = np.linalg.inv(np.diag(S))
    print("True betas:{}".format(_beta))
    print("Computed betas:{}".format(linear_reg_beta))
    beta_OLS_beta_true = (U @ Inv_S @ V).T @ w
    print("Difference:{}".format(beta_OLS_beta_true))
    mean_training_error = 1 + 5 / 70
    variance_training_error = (70 - 5) / (70.**2)
    variance_training_error *= 2
    print("Mean of the average training error:{}\nVariance:{}".format(
        mean_training_error, variance_training_error))

    valid_x_cov = np.cov(valid_X)
    U = U[:20, :5]
    test_mean_square_error = U.T @ valid_x_cov @ U
    test_mean_square_error = test_mean_square_error @ Inv_S
    test_mean_square_error = np.sum(test_mean_square_error, axis=1)
    print("Test mean square error:{}\n".format(test_mean_square_error))

    print("********* Ridge Regression *********")
    ridge_regression = Ridge(alpha=0.5, fit_intercept=False)
    ridge_regression.fit(train_X, train_y)
    train_y_predictions = ridge_regression.predict(train_X)
    valid_y_predictions = ridge_regression.predict(valid_X)

    # The coefficients
    ridge_beta = ridge_regression.coef_
    print("True beta:{}\nRidge betas:{}\n".format(_beta, ridge_beta))

    print(
        "Training loss with true beta:{:.3f}\nValidation loss with true beta:{:.3f}"
        .format(compute_loss_square_with_true_betas(train_X, train_y, _beta),
                compute_loss_square_with_true_betas(valid_X, valid_y, _beta)))

    print("Training square loss:{:.3f}\nValidation square loss:{:.3f}".format(
        compute_square_loss(train_y, train_y_predictions),
        compute_square_loss(valid_y, valid_y_predictions)))

    # The mean squared error
    print('Mean squared error: {:.2f}, {:.2f}'.format(
        mean_squared_error(train_y, train_y_predictions),
        mean_squared_error(valid_y, valid_y_predictions)))

    U, S, V = np.linalg.svd(train_X, full_matrices=False)
    X_SVD = U @ np.diag(S) @ V
    print("Is X close to X_SVD?", np.isclose(train_X, X_SVD).all())
    print("Singular values:{}".format(S))
Example #12
# concatenate the data frames that are to be split
dataFrameExceptHumidity = pd.concat([outlook, temperature], axis=1)
dataFrameExceptHumidity = pd.concat([dataFrameExceptHumidity, windy], axis=1)
dataFrameExceptHumidity = pd.concat([dataFrameExceptHumidity, play], axis=1)
""" DATA PREDICTION """
#split data as test and train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dataFrameExceptHumidity,
                                                    humidity,
                                                    test_size=0.33,
                                                    random_state=0)

#create multiple regression and predict
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_prediction = regressor.predict(x_test)

# backward elimination
import statsmodels.api as sm  # sm.OLS lives in statsmodels.api, not statsmodels.formula.api
X = np.append(arr=np.ones((14, 1)).astype(int),
              values=dataFrameExceptHumidity,
              axis=1)  # prepend a column of ones as the OLS intercept term
X_l = dataFrameExceptHumidity.iloc[:, [0, 1, 2, 3, 4, 5]].values
r_ols = sm.OLS(endog=humidity, exog=X_l).fit()
print(r_ols.summary())

#make elimination according to p-values of r_ols
X_l = dataFrameExceptHumidity.iloc[:, [0, 1, 2, 3, 5]].values
r_ols = sm.OLS(endog=humidity, exog=X_l).fit()
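The two OLS fits above drop one column by hand; a small helper (illustrative only, not part of the original) automates the repeat-until-done loop of backward elimination:

import numpy as np
import statsmodels.api as sm

def backward_eliminate(X, y, sl=0.05):
    # repeatedly drop the column whose p-value is highest and above the threshold
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        model = sm.OLS(endog=y, exog=X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= sl:
            break
        cols.pop(worst)
    return cols, sm.OLS(endog=y, exog=X[:, cols]).fit()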
Example #13
import matplotlib.pyplot as plt
import pandas as pd

# import the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:,:1].values
Y = dataset.iloc[:,1].values

#splitting into train and test data
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in later releases
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=1/3,random_state=0)

#perform / fit SLR for train dataset
from sklearn.linear_model import LinearRegression
Regressor = LinearRegression()
Regressor.fit(X_train,Y_train)

#predict the test set results and compare
y_pred = Regressor.predict(X_test)

# plot the training data and the fitted line
plt.scatter(X_train,Y_train,color='red')
plt.plot(X_train,Regressor.predict(X_train),color='blue')
plt.title('Salary vs years of Experience')
plt.xlabel('Years of experience')
plt.ylabel('salary')
plt.show()

# plot the test data against the line fitted on the training set
plt.scatter(X_test,Y_test,color='red')
plt.plot(X_train,Regressor.predict(X_train),color='blue')
Example #14
    else:
        if tid in transcript_counts:
            all_y.append(transcript_counts[tid])
            all_x.append([float(x) for x in data[1:]])
            trans_ids.append(tid)
f.close()

all_x = np.array(all_x)
all_y = np.log2(all_y)

print "normalizing data"
#my_normalization = preprocessing.StandardScaler().fit(all_x)
#all_x = my_normalization.transform(all_x)
print "fitting linear regression"
clf = LinearRegression(fit_intercept=True)
clf = clf.fit(all_x, all_y)
print "making predictions"
predictions = clf.predict(all_x)

count_mean = np.mean(all_y)
residuals = all_y - predictions
corrected_counts = residuals + count_mean

out = open(args.output, 'w')
for i in range(len(all_y)):
    out.write("%s\t%d\t%d\t%f\t%f\n" %
              (taxid_transcript[trans_ids[i]], trans_ids[i],
               transcript_counts[trans_ids[i]], rpkms[trans_ids[i]], 2**
               corrected_counts[i]))
out.close()
Example #15
grouped_test2 = df_gptest[['drive-wheels', 'price']].groupby(['drive-wheels'])
# print(grouped_test2.head(2))
# print(df_gptest)

grouped_test2.get_group('4wd')['price']
# ANOVA
f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('fwd')['price'],
    grouped_test2.get_group('rwd')['price'],
    grouped_test2.get_group('4wd')['price'])

# print("ANOVA results: F=", f_val, ", P =", p_val)

f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('fwd')['price'],
    grouped_test2.get_group('rwd')['price'])

# print("ANOVA results: F=", f_val, ", P =", p_val)

f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('4wd')['price'],
    grouped_test2.get_group('rwd')['price'])

# print("ANOVA results: F=", f_val, ", P =", p_val)

X = df[['highway-mpg']]
Y = df['price']
lm = LinearRegression()  # assumed: lm is created earlier in the original notebook
lm.fit(X, Y)
Yhat = lm.predict(X)
print(Yhat[0:5])
Example #16
##########################################################
# 	Linear Regression 
##########################################################



def rmsle(y_pred, y_test):
    assert len(y_test) == len(y_pred)
    assert (y_pred < 0).sum() == 0
    return np.sqrt(np.mean((np.log(1 + y_pred) - np.log(1 + y_test))**2))
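A tiny smoke test of rmsle on assumed toy arrays: a perfect prediction gives zero.

import numpy as np
print(rmsle(np.array([1.0, 2.0]), np.array([1.0, 2.0])))  # 0.0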


# try linear regression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
baseline_error = rmsle(y_pred, y_test)  # the function's signature is (y_pred, y_test)
print(baseline_error)



##########################################################
# 	Ridge Regression 
##########################################################


n_alphas = 100
alphas = np.logspace(-5, 5, n_alphas)

coefs = list()
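The excerpt ends before the ridge path is computed; a minimal sketch of the usual pattern, reusing X_train/y_train from above, fits Ridge at each alpha and records the coefficients in coefs:

from sklearn.linear_model import Ridge

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    coefs.append(ridge.coef_)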
Example #17
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import ElasticNet, LinearRegression
data, target = load_svmlight_file('E2006.train')
lr = LinearRegression(fit_intercept=True)

from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in later releases
kf = KFold(n_splits=10)
err = 0
for train, test in kf.split(data):
    lr.fit(data[train], target[train])
    p = lr.predict(data[test])
    e = p - target[test]
    err += np.dot(e, e)

rmse_10cv = np.sqrt(err / len(target))

lr.fit(data, target)
p = lr.predict(data)
e = p - target
total_error = np.dot(e, e)
rmse_train = np.sqrt(total_error / len(p))

print('RMSE on training: {}'.format(rmse_train))
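The cross-validated RMSE computed above is never printed in the excerpt; for completeness:

print('RMSE on 10-fold CV: {}'.format(rmse_10cv))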
Example #18
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
        'zero': 0,
        0: 0
    }
    return word_dict[word]


# fit in experience column
X['experience'] = X['experience'].apply(lambda x: convert_to_int(x))

y = dataset.iloc[:, -1]

# splitting into training and test sets:
# since the dataset is very small, we train the model on all available data

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

#fitting model with training data
regressor.fit(X, y)

# saving model to disk
pickle.dump(regressor, open('model.pkl', 'wb'))

#loading model to prepare the result
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[2, 9, 6]]))
Example #19
# Calculate 251 day average closing price and standard deviation
# 251 because approx. 251 trading days per year according to
# https://tradingsim.com/blog/trading-days-in-a-year/
data['close_year_mean'] = data.Close.rolling(251).mean().shift(1)
data['close_year_std'] = data.Close.rolling(251).std().shift(1)

# Calculate volume 251 day avg & std
data['vol_year_mean'] = data.Volume.rolling(251).mean().shift(1)
data['vol_year_std'] = data.Volume.rolling(251).std().shift(1)

# Drop null values
# First 251 rows where there wasn't enough data to calculate year_mean and year_std
data = data.dropna(axis=0)

# Use 2013-01-01 as date to begin testing
test_date = dt.datetime(year=2013, month=1, day=1)
train = data[data['Date'] < test_date]
test = data[data['Date'] >= test_date]

# Only use new columns as features
features = [i for i in data.columns if i.endswith('mean') or i.endswith('std')]
target = 'Close'

lr = LinearRegression()
lr.fit(train[features], train[target])
predictions = lr.predict(test[features])
rmse = mean_squared_error(predictions, test[target])**(1 / 2)

print('Root Mean Squared Error: ', rmse)
# Output: Root Mean Squared Error:  22.24912756194984
Example #20
# scale the features (the Adj Close-derived values used to generate the model)
X = preprocessing.scale(X)
X_lately = X[-forcast_out:]  # the most recent rows, to be forecast
X = X[:-forcast_out]
# used for the labels
y = np.array(dfreg['Label'])
y = y[:-forcast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X)

# Linear Regression
clfreg = LinearRegression(n_jobs=-1)  # -1 means uses all processors
clfreg.fit(X_train, y_train)

# Quadratic regression (degree-2 polynomial features with Ridge)
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Polynomial regression of degree 3 (cubic)
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

#KNN Regression

clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

#confidence scores
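The excerpt stops at the confidence-score comment; a sketch of the usual next step, scoring each fitted model on the held-out split:

for name, clf in [('linear', clfreg), ('poly2', clfpoly2),
                  ('poly3', clfpoly3), ('knn', clfknn)]:
    print(name, clf.score(X_test, y_test))  # R^2 on the test split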
Example #21
# In[10]:

feature_cols = [
    "Monthly Income", "Transaction Time", "Gender_Female", "Gender_Male",
    "City_Tier 1", "City_Tier 2", "City_Tier 3", "Record"
]

# In[11]:

X = df_new[feature_cols]
Y = df_new["Total Spend"]

# In[12]:

lm = LinearRegression()
lm.fit(X, Y)

# In[13]:

print(lm.intercept_)
print(lm.coef_)

# In[14]:

list(zip(feature_cols, lm.coef_))

# In[15]:

lm.score(X, Y)

# The model can be written as:
Example #22
gs_a.support = gsFeatureSupport
gs_a.selector = gsFeatureSelector
gs_a.fit(gsX,Y_train)#Train the model

#%%
# Linear Regression model and feature selection
linearRegression = LinearRegression()
linearFeatureSelector = RFECV(linearRegression, cv = 5).fit(X_train,Y_train)
LinearX = linearFeatureSelector.transform(X_train)
linearFeatureSupport = linearFeatureSelector.support_
# store selector in Linear Regression Model
linearRegression.support = linearFeatureSupport
linearRegression.selector = linearFeatureSelector

# train Linear Regression Model
linearRegression.fit(LinearX,Y_train)

# %%

'''
Optionally, a previously saved model can be loaded instead of the one trained above
'''
#with open('\Model_RandomForest.joblib', 'rb') as gs_a:

#gs_a = joblib.load('./Model_RandomForest.joblib')


"""
Verification
"""
verifi_data = pd.read_csv("../Data/Verification_data.csv")
Example #23
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# import data
data_set = pd.read_csv('Position_Salaries.csv')
x = data_set['Level']
y = data_set['Salary']

# split data to train & test sets
x_train, x_test, y_train, y_test = train_test_split(x, y)

# generate the polynomial features from x: x^0, x^1, x^2, ...
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train.values.reshape(-1, 1))  # Series -> 2-D array

# train the model using the polynomial features
model = LinearRegression()
model.fit(x_train_poly, y_train)

# generate the poly features from the test set, then predict
x_test_poly = poly.transform(x_test.values.reshape(-1, 1))
y_pred = model.predict(x_test_poly)

#plot the results
plt.plot(x_train, y_train, 'ro', label='training_data')
plt.plot(x_test, y_test, 'bo', label='testing_data')
plt.plot(x_test, y_pred, 'go', label='predicted_data')
plt.legend()
plt.show()
Example #24
# split our data set into the following parts
np.random.seed(1)
train, validate, test = np.split(df_clean.sample(frac=1), [int(.6*len(df_clean)), int(.8*len(df_clean))])
train_x= train.drop(['imdb_score'], axis=1)
train_y=train['imdb_score']
 
test_x= test.drop(['imdb_score'], axis=1)
test_y=test['imdb_score']

###################################################################################################################
#linear regression
###################################################################################################################
# train our algorithm
regressor = LinearRegression()  
results=regressor.fit(train_x, train_y) #training the algorithm


X2 = sm.add_constant(train_x)
est = sm.OLS(train_y, X2)
est2 = est.fit()
print(est2.summary())

#test our algorithm
pred = results.predict(test_x)

#compare actual vs predicted values
df_output = pd.DataFrame({'Actual': test_y, 'Predicted': pred})
df_output

# Calculate mean absolute percentage error (MAPE)
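The excerpt ends at the MAPE comment; a minimal sketch of that calculation on the frames above:

mape = (abs((test_y - pred) / test_y)).mean() * 100
print('MAPE: {:.2f}%'.format(mape))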
Example #25
def main(regressor="random_forest"):
    """
    The main method
    """
    # Fetch data from internet
    data = fetch_and_load_data()

    # Process median_income into categories
    data["income_cat"] = np.ceil(data["median_income"] / 1.5)
    data["income_cat"].where(data["income_cat"] < 5, 5.0, inplace=True)

    # Split data into training and testing sets
    train_data, test_data = split_train_test_stratified(data, "income_cat")

    # Extract labels and housing data
    housing_labels = train_data["median_house_value"].copy()
    housing = train_data.drop("median_house_value", axis=1)

    # split housing into categorical and numerical data
    # cat_attributes = ["ocean_proximity", "income_cat"]
    cat_attributes = ["ocean_proximity"]
    num_attributes = ['longitude', 'latitude', 'housing_median_age',
                      'total_rooms', 'total_bedrooms', 'population',
                      'households', 'median_income']
 
    # Set up pipeline to prepare data with.
    full_pipeline = setup_pipeline(num_attributes, cat_attributes)

    # Prepare the data
    housing_prepared = full_pipeline.fit_transform(housing)

    print()

    # Select the appropriate regressor
    if regressor == "linear":
        reg_model = LinearRegression()
        reg_name = "Linear Regressor"
    elif regressor == "random_forest":
        reg_model = RandomForestRegressor()
        reg_name = "Random Forest Regressor"
    elif regressor == "decision_tree":
        reg_model = DecisionTreeRegressor()
        reg_name = "Decision Tree Regressor"
    elif regressor == "svr":
        reg_model = SVR(kernel="linear", gamma='auto')
        reg_name = "Support Vector Machine"
    else:
        error_mes = "Regressor '{regressor}' not recognised."
        raise ValueError(error_mes.format(regressor=regressor))

    # Train regression model
    reg_model.fit(housing_prepared, housing_labels)
    display_model_performance(reg_model,
                              housing_prepared,
                              housing_labels,
                              reg_name)

    if regressor == "random_forest":
        # Fine tune Random Forest
        param_grid = [
                {'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6, 8]},
                {'bootstrap': [False], 'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6]}]
        final_model = fine_tune_model(RandomForestRegressor(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
        # Get the best model weights
        print()
        print("Attribute weights:")
        feature_importances = final_model.feature_importances_
        print_attribute_importances(feature_importances, num_attributes, full_pipeline)
    elif regressor == "linear":
        final_model = reg_model
        print("Coefficients used by linear model:")
        coeffs = final_model.coef_
        print_attribute_importances(coeffs, num_attributes, full_pipeline)
    elif regressor == "decision_tree":
        # Fine tune Decision Tree
        param_grid = [{'criterion': ["mse", "friedman_mse", "mae"]}]
        final_model = fine_tune_model(DecisionTreeRegressor(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
    elif regressor == "svr":
        param_grid = [
                {'kernel': ["linear"], "C": [10000, 100000]},
                {'kernel': ["rbf"], "C": [10000, 100000],
                    "gamma": [0.045, 0.05, 0.055]}]
        final_model = fine_tune_model(SVR(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
    else:
        final_model = reg_model

    print()

    # Evaluate on test set
    X_test = test_data.drop("median_house_value", axis=1)
    y_test = test_data["median_house_value"].copy()

    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print("Final Standard Error:", final_rmse)
Example #26
forecast_out = 30  # predict 30 rows ahead
df['Prediction'] = df[['Adj Close']].shift(-forecast_out)
df.shape

X = np.array(df.drop(['Prediction'], axis=1))
X = preprocessing.scale(X)
X.shape

X_forecast = X[-forecast_out:] # set X_forecast equal to last 30
X = X[:-forecast_out] # remove last 30 from X
X.shape

y = np.array(df['Prediction'])
y = y[:-forecast_out]
y.shape

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2)

# Training
clf = LinearRegression()
clf.fit(X_train,y_train)
# Testing
confidence = clf.score(X_test, y_test)
print("confidence: ", confidence)

forecast_prediction = clf.predict(X_forecast)
print(forecast_prediction)


Example #27
import pandas as pd
import numpy as np
## Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values
## Encoding categorical data: one-hot encode the 'State' column (index 3) only.
## The original LabelEncoder + OneHotEncoder-on-all-of-X pattern would also encode
## the numeric columns; ColumnTransformer is the current way to target one column.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('state', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
## Avoiding Dummy Variable Trap
X = X[:, 1:]
## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)
# print(X_train, '\n', X_test, '\n', Y_train, '\n', Y_test)

# Step2: Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Step3: Predicting the Test set results
Y_pred = regressor.predict(X_test)
print('Y_test = ', Y_test, '\n', 'Y_pred = ', Y_pred)
Example #28
# Import the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values  # independent (predictor) variables; 1:2 keeps X a matrix, not a vector
y = dataset.iloc[:, 2].values

# No need to split into training and test sets because there is very little data
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
"""

# Fit a plain linear regression for comparison

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Fit the polynomial regression on the whole dataset

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)  # fit only builds the model; transform applies the changes
lin_reg_2 = LinearRegression()  # the same estimator, but now a polynomial LINEAR regression
lin_reg_2.fit(X_poly, y)

""" Muchas librerias necesitan una columa de unos a la izquierda"""
"""Primero se modela las variables, y luego regresion lineal"""

# Visualization of the linear model results

plt.scatter(X, y, color = "red")
Example #29
x = np.arange(0, 100)
y = np.arange(0, 100)
print(x)
print(y)
lr = LinearRegression()

#lr.fit(x,y)
# you should see an error...

x.ndim
y.ndim

x.shape
y.shape

x = x.reshape(-1, 1)
x.shape

x.ndim

lr.fit(x, y)
# now it works!

plt.scatter(x, y, color='red')

plt.plot(x, lr.predict(x), color='blue')
plt.title('Linear Regression Demo')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Example #30
print(seenMovie)
print(metadata)
print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)
seenMovie = seenMovie.astype('int')
# split train and test set
X_train, X_test, y_train, y_test = train_test_split(metadata,
                                                    seenMovie,
                                                    test_size=0.3,
                                                    random_state=1,
                                                    shuffle=True,
                                                    stratify=seenMovie)

# build model 2 nnls regression model
reg_nnls = LinearRegression(positive=True)
y_pred_nnls = reg_nnls.fit(X_train, y_train).predict(X_test)
r2_score_nnls = r2_score(y_test, y_pred_nnls)
print("NNLS R2 score", r2_score_nnls)
logLossVal_nnls = log_loss(y_test,
                           y_pred_nnls,
                           eps=1e-15,
                           normalize=True,
                           sample_weight=None,
                           labels=None)

scaled_test = minmax_scale(y_test, feature_range=(0, 1))
scaled_pred = minmax_scale(y_pred_nnls, feature_range=(0, 1))
mse_2 = calculateMeanSquareError(scaled_test, scaled_pred)
# m2_recall = recall_score(y_test, y_pred_nnls, average='binary')
# m2_precision = precision_score(y_test, y_pred_nnls, average='binary')
print("LogLoss Model 2: ", logLossVal_nnls)
Example #31
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../../../data/Position_Salaries.csv")
X = df.iloc[:, 1:2].values
Y = df.iloc[:, 2:].values

from sklearn.preprocessing import PolynomialFeatures
poly_feature = PolynomialFeatures(degree=2)
X_poly = poly_feature.fit_transform(X)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_poly, Y)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(X, Y, color='r')
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
ax.plot(X_grid, lin_reg.predict(poly_feature.fit_transform(X_grid)))
ax.set_title('level-salary curve')
ax.set_xlabel('level')
ax.set_ylabel('salary')
plt.show()
Example #32
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature scaling of train and test data:
# not needed, as sklearn.linear_model takes care of it itself
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train) # fit and transform
#X_test = sc_X.transform(X_test) # already data is fit so only transform here
# no need to do feature scaling for y as there are only two values (yes/no)


# Fitting Simple Linear Regression to Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# To retrieve the intercept
r_intercept = regressor.intercept_
# For retrieving the slope (coefficient of x)
r_coef = regressor.coef_


# Predicting the test set result.
y_pred = regressor.predict(X_test)

# Visualization of results

# First plot predictions for training set and compare with ground truth
plt.scatter(X_train, y_train, color = 'red') # Ground truth values for train
plt.plot(X_train, regressor.predict(X_train), color = 'blue') # Predicted values for train
plt.title("Best Prices vs List Prices - Training")
Example #33
import pandas
from sklearn.linear_model import LinearRegression
data = pandas.read_csv('iphone_price.csv')
model = LinearRegression()
model.fit(data[['version']], data[['price']])
print(model.predict([[20]]))
print(model.predict([[25]]))
print(model.predict([[30]]))
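The three predictions above just extend a straight line; the same numbers can be reproduced from the fitted parameters (a sanity check, not part of the original):

a, b = model.coef_[0][0], model.intercept_[0]
for v in (20, 25, 30):
    print(a * v + b)  # matches model.predict([[v]])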
Example #34
    best_lasso_mse = None

    # Split data into training and test parts (85 x 7 X_train and 85 x 1 y_train; 18 x 7 X_test and 18 x 1 y_test)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=18,
                                                        train_size=85)

    # X_train and y_train further split on each iteration for 5-fold validation
    for train_index, test_index in kf.split(X_train):
        X_train1, X_cv = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train1, y_cv = y_train.iloc[train_index], y_train.iloc[test_index]

        # Train the linear model and save if it is best model based on score
        lr_model = LinearRegression()
        lr_model.fit(X_train1, y_train1)
        mse1 = mean_squared_error(y_cv, lr_model.predict(X_cv))
        if linear_cv_mse == [] or mse1 < min(linear_cv_mse):
            best_lm_mse = mse1
            lr_model_best = lr_model
        linear_cv_mse.append(mse1)

        # Train the ridge model and save if it is best model
        rg_model = Ridge(alpha=20)
        rg_model.fit(X_train1, y_train1)
        mse2 = mean_squared_error(y_cv, rg_model.predict(X_cv))
        if ridge_cv_mse == [] or mse2 < min(ridge_cv_mse):
            best_rg_mse = mse2
            rg_model_best = rg_model
        ridge_cv_mse.append(mse2)
Example #35
                                                    random_state=42)

#Feature Scaling the data                               # You might want to make this optional
from sklearn.preprocessing import StandardScaler
fs_X = StandardScaler()
fs_y = StandardScaler()
X_train = fs_X.fit_transform(X_train)
X_test = fs_X.transform(X_test)
y_train = fs_y.fit_transform(np.array(y_train).reshape(-1, 1))
y_test = fs_y.transform(np.array(y_test).reshape(-1, 1))

# Linear Regression
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)

predictions_lin = lm.predict(X_test)

# Polynomial regression (degree 2 here)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(
    degree=2)  # You might want to try with different values of degree
X_poly = poly.fit_transform(X_train)  # fit_transform already fits the expansion; no separate fit needed
lm2 = LinearRegression()
lm2.fit(X_poly, y_train)

predictions_poly = lm2.predict(poly.fit_transform(X_test))

# Support Vector Regression
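The excerpt ends at the SVR heading; a minimal continuation under the same scaled variables (an illustrative sketch, not the original code):

from sklearn.svm import SVR

svr = SVR(kernel='rbf')
svr.fit(X_train, y_train.ravel())
predictions_svr = svr.predict(X_test)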
Example #36
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
#We will split 10 to test, 20 to train
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=0)

#No need for feature scaling

#Fitting Simple Lin Regression model to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()  #We are fine with default parameters
regressor.fit(
    X_train,
    y_train)  #machine is the regressor, made it learn on the training set

#Machine can now based on its learning experience predict the new salary
#Regressor learned the correlations between experience and salary

#Predicting the test results - create a vector  of predictions
y_pred = regressor.predict(
    X_test)  #vector of predictions of dependant variable

#The predictions are pretty damn close

#Visualizing the results with matplotlib
plt.scatter(X_train, y_train, color='red')  #plots the real values
plt.plot(X_train, regressor.predict(X_train),
         color='blue')  # shows the comparison between X_train and the predictions
Example #37
pos = input('Enter your Position Level (1-10)')
act_sal = input('Enter your Salary')
error = 0.1

dataset = pd.read_csv('Position_Salaries.csv')
idm = dataset.iloc[:,1:2].values
dm = dataset.iloc[:,2].values

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 5)
idm_poly = poly_reg.fit_transform(idm)
poly_reg.fit(idm_poly,dm)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(idm_poly,dm)

idm_grid = np.arange(min(idm),max(idm),0.01)
idm_grid = idm_grid.reshape(len(idm_grid),1)
mpt.scatter(idm,dm,color='red')
mpt.plot(idm_grid,lin_reg.predict(poly_reg.fit_transform(idm_grid)),color='blue')
mpt.title('Truth or Bluff - Polynomial Regression')
mpt.xlabel('Position Level')
mpt.ylabel('Salary')
mpt.show()

prdt_sal = lin_reg.predict(poly_reg.transform([[float(pos)]]))  # input() returns a string; cast it and pass a 2-D array
err = (prdt_sal - float(act_sal)) / prdt_sal
if(err<=error):
    print('Truth')
else: