Example #1
import numpy as np
import matplotlib.pyplot as plt


def run_regression(preprocess_dict, flags_dict, X, Y, model_name_in):
    if preprocess_dict["ModelName"] == "Linear":
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X, Y)
    
    elif( preprocess_dict["ModelName"] == "Poly" ):
        from sklearn.preprocessing import PolynomialFeatures
        regressor = PolynomialFeatures( degree = 4 )
        X_poly = regressor.fit_transform( X )
        regressor.fit(X_poly, Y)
        lin_reg_2 = LinearRegression()
        lin_reg_2.fit( X_poly, Y )
    
    elif preprocess_dict['ModelName'] == 'SVM':
        from sklearn.svm import SVR
        regressor = SVR(kernel='rbf')
        regressor.fit(X, Y)
        
    elif preprocess_dict['ModelName'] == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state=preprocess_dict['RandState'])
        regressor.fit(X, Y)
    
    elif preprocess_dict['ModelName'] == 'RandomForest':
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators=20, random_state=preprocess_dict['RandState'])
        regressor.fit(X, Y)
    
    # Plot graphs
    if( flags_dict["show_graph"] and ( preprocess_dict["ModelName"] == "Linear" or preprocess_dict['ModelName'] == 'SVM' ) ):
        plt.scatter(X, Y, color = 'red')
        plt.plot( X, regressor.predict(X), color = '#FDDA87' )
        plt.plot( X, regressor.predict( X ), color = '#295BFF' )
        plt.title('Salary vs Experience (Test set)')
        plt.xlabel('Years of Experience')
        plt.ylabel('Salary')
        plt.show()
    
    elif( flags_dict["show_graph"] and preprocess_dict["ModelName"] == "Poly" ):
        # Visualising the Polynomial Regression results
        plt.scatter(X, Y, color = '#295BFF')
        plt.plot(X, lin_reg_2.predict( regressor.fit_transform(X)), color = '#FDDA87' )
        plt.title('Truth or Bluff (Polynomial Regression)')
        plt.xlabel('Position level')
        plt.ylabel('Salary')
        plt.show()
    
    elif( flags_dict["show_graph"] and ( preprocess_dict['ModelName'] == 'DecisionTree' or 
        preprocess_dict['ModelName'] == 'RandomForest' ) ):
        # Visualising the SVR results (for higher resolution and smoother curve)
        X_grid = np.arange(min(X), max(X), 0.01) # choice of 0.01 instead of 0.1 step because the data is feature scaled
        X_grid = X_grid.reshape((len(X_grid), 1))
        plt.scatter(X, Y, color = '#295BFF')
        plt.plot(X_grid, regressor.predict(X_grid), color = '#FDDA87')
        plt.title('Truth or Bluff (SVR)')
        plt.xlabel('Position level')
        plt.ylabel('Salary')
        plt.show()
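
# A minimal, hypothetical driver for run_regression(); the CSV and column
# choices are placeholders (Position_Salaries.csv appears in later examples),
# and the dict keys mirror the ones the function reads.
import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values  # keep X two-dimensional for scikit-learn
Y = dataset.iloc[:, 2].values
preprocess_dict = {"ModelName": "RandomForest", "RandState": 0}
flags_dict = {"show_graph": True}
run_regression(preprocess_dict, flags_dict, X, Y, "RandomForest")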
Example #2
# SGDRegressor

# Scaling features and target with StandardScaler (assumes X_train/X_test and
# y_train/y_test already exist; the 1-D targets are reshaped to a column for
# the scaler, then flattened back for the model):
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

regressor = SGDRegressor(loss='squared_error')  # 'squared_loss' was renamed in newer scikit-learn
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print('Cross validation r-squared scores:', scores)
print('Average cross validation r-squared score:', np.mean(scores))
regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared score:', regressor.score(X_test, y_test))


# Selecting the best features

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=33)

df.columns

feature_names = list(df.columns[2:])

feature_names.remove('PT08.S4(NO2)')

import matplotlib.pyplot as plt
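
# The feature-selection part of this snippet stops short; a minimal sketch of
# univariate selection with SelectKBest, assuming the df / feature_names /
# X_train / y_train objects built above hold the air-quality data.
from sklearn.feature_selection import SelectKBest, f_regression

selector = SelectKBest(f_regression, k=5)  # keep the 5 highest-scoring features
X_train_best = selector.fit_transform(X_train, y_train)
X_test_best = selector.transform(X_test)  # never re-fit on the test split
print('Selected:', [f for f, keep in zip(feature_names, selector.get_support()) if keep])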
Example #3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

data = np.genfromtxt("job.csv", delimiter=",")
xData = data[1:, 1]
yData = data[1:, 2]
plt.scatter(xData, yData)
plt.show()
# Linear fit
xData = np.atleast_2d(xData).T
yData = np.atleast_2d(yData).T
model = LinearRegression()
model.fit(xData, yData)
plt.plot(xData, yData, "b.")
plt.plot(xData, model.predict(xData), "r")
plt.show()

poly_features = PolynomialFeatures(degree=10)  # choose the polynomial degree
xPoly = poly_features.fit_transform(xData)  # expand the feature into polynomial terms
lin_reg = LinearRegression()
lin_reg.fit(xPoly, yData)

plt.plot(xData, yData, "b.")
xTest = np.atleast_2d(np.linspace(1, 10, 100)).T  # dense sampling for a smooth curve
plt.plot(xTest, lin_reg.predict(poly_features.transform(xTest)), c="r")
plt.show()
print(lin_reg.coef_)
print(lin_reg.intercept_)
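
# Degree 10 on a handful of points will essentially interpolate the training
# data; the training-set R-squared below should come out near 1.0, which is a
# warning sign of overfitting rather than evidence of a good model.
print("degree-10 training R^2:", lin_reg.score(xPoly, yData))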
Example #4
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
# evaluate() is assumed to be defined elsewhere in the project.


def ChooseRunRegression(REG_TYPE, PARAMETER, X, y, X_train, y_train, X_test,
                        y_test):

    if REG_TYPE == 'LINEAR':
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)

        #Predicting using test set
        y_pred = regressor.predict(X_test)

        #Predicting using cross validation (KFold method)
        y_pred_kf = cross_val_predict(regressor, X, y, cv=10)

        evaluate(y_pred, y_pred_kf, y, y_test)

    elif REG_TYPE == 'POLYNOMIAL':

        if PARAMETER == 'Auto':
            regressor = PolynomialFeatures(degree=2)
        else:
            regressor = PolynomialFeatures(degree=int(PARAMETER[0]))

        X_poly_k = regressor.fit_transform(X)
        X_poly = regressor.transform(X_train)

        lin_reg_pl = LinearRegression()

        lin_reg_pl.fit(X_poly, y_train)
        y_pred = lin_reg_pl.predict(regressor.transform(X_test))

        #Predicting using cross validation (KFold method)
        y_pred_kf = cross_val_predict(lin_reg_pl, X_poly_k, y, cv=10)

        evaluate(y_pred, y_pred_kf, y, y_test)

    elif REG_TYPE == 'DECISION_TREE':

        if PARAMETER == 'Auto':
            regressor = DecisionTreeRegressor(random_state=0)
        else:
            regressor = DecisionTreeRegressor(
                max_depth=int(PARAMETER[0]),
                min_samples_split=float(PARAMETER[1]),
                min_samples_leaf=float(PARAMETER[2]),
                max_features=int(PARAMETER[3]),
                random_state=0)

        regressor.fit(X_train, y_train)

        #Predicting using test set
        y_pred = regressor.predict(X_test)

        #Predicting using cross validation (KFold method)
        y_pred_kf = cross_val_predict(regressor, X, y, cv=10)

        evaluate(y_pred, y_pred_kf, y, y_test)

    elif REG_TYPE == 'RANDOM_FOREST':

        if PARAMETER == 'Auto':
            regressor = RandomForestRegressor(random_state=0)
        else:
            regressor = RandomForestRegressor(
                max_depth=int(PARAMETER[0]),
                min_samples_split=float(PARAMETER[1]),
                min_samples_leaf=float(PARAMETER[2]),
                max_features=int(PARAMETER[3]),
                n_estimators=int(PARAMETER[4]),
                random_state=0)

        regressor.fit(X_train, y_train)

        #Predicting using test set
        y_pred = regressor.predict(X_test)

        #Predicting using cross validation (KFold method)
        y_pred_kf = cross_val_predict(regressor, X, y, cv=10)

        evaluate(y_pred, y_pred_kf, y, y_test)

    else:
        print('Choose an available regressor!')
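
# Hypothetical calls; the split and the PARAMETER list are placeholders whose
# layout follows the int()/float() casts inside the function.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
ChooseRunRegression('RANDOM_FOREST', 'Auto', X, y, X_train, y_train, X_test, y_test)
# PARAMETER order for the forest branch: max_depth, min_samples_split,
# min_samples_leaf, max_features, n_estimators
ChooseRunRegression('RANDOM_FOREST', ['8', '0.01', '0.005', '4', '100'],
                    X, y, X_train, y_train, X_test, y_test)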
Example #5
import numpy as np
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; kept as in the original
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
regressor = SGDRegressor(loss='squared_error')  # 'squared_loss' in older releases
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print('Cross validation r-squared scores:', scores)
print('Average cross validation r-squared score:', np.mean(scores))
regressor.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared score:', regressor.score(X_test, y_test))


################# Updated poly 1 #################
"""
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.preprocessing import PolynomialFeatures

>>> X_train = [[6], [8], [10], [14],   [18]]
>>> y_train = [[7], [9], [13], [17.5], [18]]
>>> X_test = [[6], [8], [11], [16]]
>>> y_test = [[8], [12], [15], [18]]
"""
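
# The quoted doctest stops after the data; a sketch of where it is presumably
# heading (a quadratic fit on the same five training points; degree=2 is an
# assumption, not part of the original).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
quadratic = PolynomialFeatures(degree=2)
model = LinearRegression().fit(quadratic.fit_transform(X_train), y_train)
xx = np.linspace(0, 26, 100).reshape(-1, 1)
plt.scatter([x[0] for x in X_train], [y[0] for y in y_train])
plt.plot(xx, model.predict(quadratic.transform(xx)))
plt.show()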
Example #6
# %%
# Assumes X, X2 and the colorize mapping are defined earlier in the notebook.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances

D = pairwise_distances(X)
D.shape

# %%
plt.imshow(D, zorder=2, cmap="Blues", interpolation="nearest")
plt.colorbar()

# %%
D2 = pairwise_distances(X2)
np.allclose(D, D2)

# %%
model = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis("equal")

# %%


def random_projection(X, dimension=3, rseed=42):
    """Rotate X into `dimension` dimensions with a random orthogonal matrix,
    preserving all pairwise distances."""
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    C = rng.randn(dimension, dimension)
    e, V = np.linalg.eigh(np.dot(C, C.T))  # V is orthogonal
    return np.dot(X, V[:X.shape[1]])


X3 = random_projection(X, 3)
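
# The rotation above is orthogonal, so it should leave pairwise distances
# unchanged; a one-line check against the D computed earlier in this example.
D3 = pairwise_distances(X3)
print(np.allclose(D, D3))  # expected: True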
Example #7
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Regression Model to the dataset
# (each block below is an alternative; only the final `regressor` assignment,
# the random forest, is the one fitted at the bottom)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

from sklearn.preprocessing import PolynomialFeatures
regressor = PolynomialFeatures(degree = 4)
X_poly = regressor.fit_transform(X)

from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100)


regressor.fit(X,y)

# Predicting a new result (predict expects a 2-D array, one row per sample)
y_pred = regressor.predict([[10]])
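
# The PolynomialFeatures branch above is only a transformer; it needs its own
# LinearRegression on the expanded features to predict. A sketch reusing X, y
# and X_poly from above, assuming X holds a single feature column.
poly = PolynomialFeatures(degree=4)
lin_reg_poly = LinearRegression().fit(poly.fit_transform(X), y)
y_pred_poly = lin_reg_poly.predict(poly.transform([[10]]))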
Example #8
def train(X, Y):
    regr = LinearRegression()
    regr.fit(X, Y)  # LinearRegression has no fit_transform
    return regr

# load & scale the data (scalers are fit on the training split only)
import numpy as np
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; kept as in the original
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

data = load_boston()
X_train, X_test, y_train, y_test = \
    train_test_split(data.data, data.target)
X_scaler, y_scaler = StandardScaler(), StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
X_test = X_scaler.transform(X_test)  # transform, not fit_transform, on the test split
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

# model building
model = SGDRegressor(loss='squared_error')  # 'squared_loss' in older releases
scores = cross_val_score(model, X_train, y_train, cv=5)
print('CV r-squared: ', scores)
print('Avg CV r-squared: ', np.mean(scores))
model.fit(X_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared: ', model.score(X_test, y_test))
# CV r-squared:  [ 0.75474391  0.61151436  0.69517402  0.71785126  0.41502689]
# Avg CV r-squared:  0.638862088204
# Test set r-squared:  0.822425952544

### FEATURE EXTRACTION & PREPROCESSING

## Feature Extraction

# Example 1: vectorize cities
#   DictVectorizer

from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()
instances = [{
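
# The instances list above is cut off; a self-contained sketch with made-up
# city records shows the one-hot encoding DictVectorizer produces.
enc = DictVectorizer()
cities = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]
print(enc.fit_transform(cities).toarray())  # one binary column per distinct city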
Example #10
        min_rmse = poly_rmse
        min_deg = deg

# Plot and present results
print('Best degree {} with RMSE {}'.format(min_deg, min_rmse))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(degrees, rmses)
ax.set_yscale('log')
ax.set_xlabel('Degree')
ax.set_ylabel('RMSE')

# Polynomial Regression Starts With Degree 2

polynomial_reg = PolynomialFeatures(degree=2)
X_polynomial = polynomial_reg.fit_transform(X)
lin_reg = LinearRegression()
lin_reg.fit(X_polynomial, y)

# Cross-validate on the polynomial features the model is actually trained on
CRS = cross_val_score(lin_reg, X_polynomial, y, cv=10)
y_pred = lin_reg.predict(polynomial_reg.transform(X_test))
# Results
print('-- Polynomial Regression Results --')
print('Best degree was {} with RMSE {}'.format(min_deg, min_rmse))
MAE = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error of the Data:', MAE)
MSE = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error of the Data:', MSE)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error of the Data:', RMSE)
CVA = np.mean(CRS)
print('Cross-validation average r-squared:', CVA)
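
# The top of the degree-search loop is missing from this example; a
# hypothetical reconstruction of the kind of loop that produces the
# min_rmse / min_deg / degrees / rmses names used above (the split names
# and degree range are assumptions).
degrees, rmses = range(1, 11), []
min_rmse, min_deg = float('inf'), None
for deg in degrees:
    poly = PolynomialFeatures(degree=deg)
    model = LinearRegression().fit(poly.fit_transform(X_train), y_train)
    poly_rmse = np.sqrt(metrics.mean_squared_error(
        y_test, model.predict(poly.transform(X_test))))
    rmses.append(poly_rmse)
    if poly_rmse < min_rmse:
        min_rmse, min_deg = poly_rmse, deg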
Example #11
# (assumes `dataset` and `x` were loaded earlier; the same load is repeated below)
y = dataset.iloc[:, 2].values

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(x, y)

y_predd = regressor.predict(x)

plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.show()


import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(x, y)
'''Hierarchical Clustering'''
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
# var1 is assumed to be defined earlier in the script
dendrogram = sch.dendrogram(sch.linkage(var1, method='ward'))
cluster = AgglomerativeClustering(n_clusters=5,
                                  affinity='euclidean',
                                  linkage='complete')
temp = cluster.fit_predict(var1)
print(temp)
'''K-Means Clustering'''
from sklearn.cluster import KMeans
cluster = KMeans(n_clusters=5)
temp = cluster.fit_predict(var1)
print(temp)
'''PCA'''
from sklearn.decomposition import PCA
model = PCA(n_components=3)
temp = model.fit_transform(var1)
print(temp)
ratio = model.explained_variance_ratio_
print(ratio)

from sklearn.svm import SVC
linearsvm = SVC(kernel='linear', random_state=0)
nonlinearsvm = SVC(kernel='rbf', random_state=0)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

from sklearn.tree import DecisionTreeClassifier
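
# None of the classifiers above are actually trained in this snippet; a quick
# hypothetical comparison on scikit-learn's built-in iris data shows how they
# would be used.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

iris = load_iris()
for name, clf in [('linear SVM', linearsvm), ('rbf SVM', nonlinearsvm),
                  ('kNN', knn), ('naive Bayes', nb),
                  ('decision tree', DecisionTreeClassifier(random_state=0))]:
    print(name, cross_val_score(clf, iris.data, iris.target, cv=5).mean())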
Example #14
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.loc[:, 'Level'].values
y = dataset.loc[:, 'Salary'].values
X = X.reshape(len(X), 1)
# y = y.reshape(len(X), 1)


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

lin_reg = LinearRegression()
lin_reg.fit(X,y)

poly_reg = PolynomialFeatures(degree=1)  # degree=1 reproduces plain linear regression; raise it (e.g. 4) for a curve
X_poly = poly_reg.fit_transform(X)

# Integrate
lin_reg_2  = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Linear Regression
# lin_reg.predict([[6.5]])
plt.scatter(X, y, color='g')
plt.plot(X, lin_reg.predict(X), color='red')
plt.title('Linear Regression')
plt.xlabel('EXP')
plt.ylabel('Salary')
plt.show()

# Polynomial Regression
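
# A sketch of the polynomial counterpart to the linear plot above, reusing
# poly_reg and lin_reg_2; colours and labels mirror the linear block.
plt.scatter(X, y, color='g')
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color='red')
plt.title('Polynomial Regression')
plt.xlabel('EXP')
plt.ylabel('Salary')
plt.show()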