Example #1
def learning_curve(classifier, X, y, cv, sample_sizes,
    degree=1, pickle_path=None, verbose=True):
    """ Learning curve
    """

    learning_curves = []
    for i, (train_index, test_index) in enumerate(cv):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

        if degree > 1:
            poly = PolynomialFeatures(degree=degree, interaction_only=False, include_bias=True)
            X_train = poly.fit_transform(X_train)
            X_test = poly.transform(X_test)

        lc = []
        for sample in sample_sizes:
            classifier.fit(X_train[:sample], y_train[:sample])

            # apply classifier on test set
            y_pred = classifier.predict(X_test)
            confusion = metrics.confusion_matrix(y_test, y_pred)
            lc.append(balanced_accuracy_expected(confusion))

        learning_curves.append(lc)
        if verbose: print(i, end=' ')
    
    # pickle learning curve
    if pickle_path:
        with open(pickle_path, 'wb') as f:
            pickle.dump(learning_curves, f, protocol=4)
    if verbose: print()
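The snippet assumes metrics, pickle, and PolynomialFeatures are already imported, and relies on a helper balanced_accuracy_expected that is not shown. A minimal sketch of what that helper might look like (hypothetical, assuming balanced accuracy is the mean per-class recall of a confusion matrix):

import numpy as np

def balanced_accuracy_expected(confusion):
    # Mean per-class recall: diagonal counts divided by row sums
    confusion = np.asarray(confusion, dtype=float)
    recalls = np.diag(confusion) / confusion.sum(axis=1)
    return recalls.mean()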
Example #2
def get_polynomial_features(df, interaction_sign=' x ', **kwargs):
    """
    Gets polynomial features for the given data frame using the given sklearn.PolynomialFeatures arguments
    :param df: DataFrame to create new features from
    :param interaction_sign: separator used when labeling interaction terms
    :param kwargs: Arguments for PolynomialFeatures
    :return: DataFrame with labeled polynomial feature values
    """
    pf = PolynomialFeatures(**kwargs)
    feats = _get_polynomial_features(df.columns.tolist(), pf.fit(df), interaction_sign=interaction_sign)
    return pd.DataFrame(pf.transform(df), columns=feats)
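The private helper _get_polynomial_features is not shown. A plausible sketch that derives the column labels from the fitted transformer's powers_ attribute (hypothetical; the original may differ):

def _get_polynomial_features(columns, fitted_poly, interaction_sign=' x '):
    # One label per output column, built from the exponent matrix powers_
    names = []
    for powers in fitted_poly.powers_:
        terms = ['{}^{}'.format(col, p) if p > 1 else col
                 for col, p in zip(columns, powers) if p != 0]
        names.append(interaction_sign.join(terms) if terms else '1')
    return names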
Example #3
class PolyFeatures(object):
    def __init__(self, degree):
        self.degree = degree
        self.poly = PolynomialFeatures(self.degree)

    def fit_transform(self, X):
        return self.poly.fit_transform(X[:, :])

    def transform(self, x):
        return self.poly.transform(x[:, :])
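A short usage sketch for the wrapper above (illustrative data, names chosen here):

import numpy as np

X_train = np.arange(6).reshape(3, 2)
X_new = np.arange(4).reshape(2, 2)
pf = PolyFeatures(degree=2)
X_train_poly = pf.fit_transform(X_train)  # fit the expansion on the training data
X_new_poly = pf.transform(X_new)          # reuse the fitted expansion on new data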
def polynomialRegression():
  import numpy as np
  import matplotlib.pyplot as plt 
  from sklearn.linear_model import LinearRegression
  from sklearn.preprocessing import PolynomialFeatures

  X_train = [[6],[8],[10],[14],[18]]
  y_train = [[7],[9],[13],[17.5],[18]]
  X_test = [[6],[8],[11],[16]]
  y_test = [[8],[12],[15],[18]]

  regressor = LinearRegression()
  regressor.fit(X_train,y_train)

  xx = np.linspace(0,26,100)
  yy = regressor.predict(xx.reshape(xx.shape[0],1))
  plt.plot(xx,yy)

  quadratic_featurizer = PolynomialFeatures(degree=2)
  X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
  X_test_quadratic = quadratic_featurizer.transform(X_test)

  regressor_quadratic = LinearRegression()
  regressor_quadratic.fit(X_train_quadratic,y_train)

  xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0],1))

  plt.plot(xx,regressor_quadratic.predict(xx_quadratic),c='r',linestyle='--')
  plt.title("pizza on diameter")
  plt.xlabel("pizza in inch")
  plt.ylabel("px in usd")
  plt.axis([0,25,0,25])
  plt.grid(True)
  plt.scatter(X_train,y_train)
  plt.show()

  print(X_train)
  print(X_train_quadratic)
  print(X_test)
  print(X_test_quadratic)
  print("Simple regression r-squared", regressor.score(X_test, y_test))
  print("Quadratic regression r-squared", regressor_quadratic.score(X_test_quadratic, y_test))
def test_polynomialfeatures_vs_sklearn():
    # Compare msmbuilder.preprocessing.PolynomialFeatures
    # with sklearn.preprocessing.PolynomialFeatures

    polynomialfeaturesr = PolynomialFeaturesR()
    polynomialfeaturesr.fit(np.concatenate(trajs))

    polynomialfeatures = PolynomialFeatures()
    polynomialfeatures.fit(trajs)

    y_ref1 = polynomialfeaturesr.transform(trajs[0])
    y1 = polynomialfeatures.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
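The test relies on module-level fixtures (trajs and the two featurizer classes) that are not shown. A minimal stand-in for trajs (hypothetical; msmbuilder featurizers take a list of per-trajectory arrays):

import numpy as np

# Hypothetical fixture: two trajectories of 100 frames with 3 features each
trajs = [np.random.RandomState(seed).random_sample((100, 3)) for seed in (0, 1)]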
Example #6
File: tpot.py Project: vsolano/tpot
    def _polynomial_features(self, input_df):
        """Uses Scikit-learn's PolynomialFeatures to construct new degree-2 polynomial features from the existing feature set

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to construct new features from

        Returns
        -------
        modified_df: pandas.DataFrame {n_samples, n_constructed_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the constructed features

        """

        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        if len(training_features.columns.values) == 0:
            return input_df.copy()
        elif len(training_features.columns.values) > 700:
            # Too many features to produce - skip this operator
            return input_df.copy()

        # The feature constructor must be fit on only the training data
        poly = PolynomialFeatures(degree=2, include_bias=False)
        poly.fit(training_features.values.astype(np.float64))
        constructed_features = poly.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64))

        modified_df = pd.DataFrame(data=constructed_features)
        modified_df['class'] = input_df['class'].values
        modified_df['group'] = input_df['group'].values
        modified_df['guess'] = input_df['guess'].values

        new_col_names = {}
        for column in modified_df.columns.values:
            if type(column) != str:
                new_col_names[column] = str(column).zfill(10)
        modified_df.rename(columns=new_col_names, inplace=True)

        return modified_df.copy()
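The 700-column guard above reflects how fast a degree-2 expansion grows: n input features become n + n(n+1)/2 output columns with include_bias=False. A quick illustrative check (sizes chosen arbitrarily):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.random.rand(5, 700)
poly = PolynomialFeatures(degree=2, include_bias=False)
print(poly.fit_transform(X).shape)  # (5, 246050): 700 linear + 700*701/2 quadratic terms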
Example #7
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Training the Polynomial Regression model on the Training set
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

# Predicting the Test set results
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Evaluating the Model Performance
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(r2)
# Print the table header

print('           Simple Regression   |     RIDGE Regression   |     LASSO Regression ')
print(' ')
print(' Degree   Err IN     Err OUT   |   Err IN     Err OUT   |   Err IN     Err OUT')
print(' ------   -------    --------  |   -------    --------  |   -------    --------')

# Train and run the models and fill in the table

for degree in range(1,5):
       
    # Transform the original attributes into polynomial attributes

    pf = PolynomialFeatures(degree)
    X_train_poly = pf.fit_transform(X_train_trans)
    X_test_poly  = pf.transform(X_test_trans)

    #print(X_train_poly.shape)

    
    # Train the polynomial regressors
    
    lr = LinearRegression()
    lr = lr.fit(X_train_poly, y_train)

    # Train a polynomial regressor with Ridge regularization
    # alpha = 90
    
    lr_ridge = Ridge ( alpha = 90 , max_iter=1000000 )  # 4.E+1 Boston ; 
    lr_ridge = lr_ridge.fit ( X_train_poly , y_train )
    
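The table header above also has a LASSO column, but the snippet is truncated before that step. A hedged sketch of the analogous training call inside the loop (the alpha value is illustrative, not from the original):

    # Train a polynomial regressor with LASSO regularization
    from sklearn.linear_model import Lasso

    lr_lasso = Lasso(alpha=0.1, max_iter=1000000)
    lr_lasso = lr_lasso.fit(X_train_poly, y_train)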
Example #9
yy = regression.predict(xx)

import matplotlib.pyplot as plt
plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')

print('The R-squared value of Linear Regression performing on the training data is',
      regression.score(X_train, y_train))

from sklearn.preprocessing import PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)

regression_poly2 = LinearRegression()
regression_poly2.fit(X_train_poly2, y_train)
xx_poly2 = poly2.transform(xx)
yy_poly2 = regression_poly2.predict(xx_poly2)

plt2, = plt.plot(xx, yy_poly2, label='Degree=2')

print('The R-squared value of Polynomial Regressor (degree=2) performing on the training data is',
      regression_poly2.score(X_train_poly2, y_train))

poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)
regression_poly4 = LinearRegression()
regression_poly4.fit(X_train_poly4, y_train)

xx_poly4 = poly4.transform(xx)
yy_poly4 = regression_poly4.predict(xx_poly4)
Example #10
        data_set.loc[np.round(data_set['p_i'], 7) == np.round(p, 7)],
        ignore_index=False)
y_test = target.loc[X_test.index]

#%% Train, test one-step ahead mode - Linear regression
Ridge_model = Ridge(alpha=1E-8, fit_intercept=0)
X_tr = X_train.reindex(
    ['p_f^(n-2)', 'p^(n-2)', 'p_f^(n-1)', 'p^(n-1)', 'p_f^(n)', 'p^(n)'],
    axis=1)
X_te = X_test.reindex(
    ['p_f^(n-2)', 'p^(n-2)', 'p_f^(n-1)', 'p^(n-1)', 'p_f^(n)', 'p^(n)'],
    axis=1)
poly = PolynomialFeatures(2, include_bias=False).fit(X_tr)

## poly features
X_tr_poly = pd.DataFrame(poly.transform(X_tr),
                         columns=poly.get_feature_names(X_tr.columns))
X_te_poly = pd.DataFrame(poly.transform(X_te),
                         columns=poly.get_feature_names(X_te.columns))

## scalers
sc_x = StandardScaler()
X_trp_scaled = sc_x.fit_transform(X_tr_poly)
X_tep_scaled = sc_x.transform(X_te_poly)
sc_y = StandardScaler()
y_tr_scaled = sc_y.fit_transform(y_train.to_numpy().reshape((-1, 1)))
y_te_scaled = sc_y.transform(y_test.to_numpy().reshape((-1, 1)))
scaler = {'sc_x': sc_x, 'sc_y': sc_y}

## fitting
Ridge_model.fit(X_trp_scaled, y_tr_scaled)
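Because the target was scaled as well, test-set predictions have to be mapped back through sc_y before being reported in the original units (a minimal sketch using the objects defined above):

y_te_pred_scaled = Ridge_model.predict(X_tep_scaled)
y_te_pred = sc_y.inverse_transform(y_te_pred_scaled.reshape(-1, 1))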
Example #11
Y = datas[names[4]]
X = X.astype(float)
Y = Y.astype(float)

# Split the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# 1. Expand the features with a polynomial transform
# degree: the order of the polynomial expansion, i.e. the highest-degree term after the transform
poly = PolynomialFeatures(degree=3)
# fit_transform: first fit the transformer on the given data to learn the transformation, then apply it to X
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

# 2. Fit a linear regression
# fit_intercept: whether to fit the model's intercept term; defaults to True (fit it), False means do not fit it
algo = LinearRegression(fit_intercept=True)

# 7. Train the algorithm model
algo.fit(X_train, Y_train)

# 7.1 Inspect the trained model parameters
print("Weights (theta) for each feature of the linear regression: {}".format(algo.coef_))
print("Intercept of the linear regression: {}".format(algo.intercept_))

# 8. Evaluate the model
y_hat = algo.predict(X_test)
print("Model score on the training set (R2 for regression): {}".format(algo.score(X_train, Y_train)))
Example #12
t_start = time.time()
"""=====================================================================================================================
0 读取原特征
"""
print("0 读取原特征")
features_path = '../feature_file/data_w_tfidf(lda+lsa)+doc2vec.pkl'
f = open(features_path, 'rb')
x_train, y_train, x_test = pickle.load(f)
f.close()
"""=====================================================================================================================
1 使用多项式方法构造出更多的特征
"""
print("1 使用多项式方法构造出更多的特征")
poly = PolynomialFeatures(degree=2, interaction_only=True,
                          include_bias=False)  #degree控制多项式最高次数
x_train_new = poly.fit_transform(x_train)
x_test_new = poly.transform(x_test)
"""=====================================================================================================================
2 将构造好的特征保存至本地
"""
print("2 将构造好的特征保存至本地")
data = (x_train_new, y_train, x_test_new)
features_constr_path = features_path.split('/')[-1] + '_constr.pkl'
f_data = open(features_constr_path, 'wb')
pickle.dump(data, f_data)
f_data.close()

t_end = time.time()
print("构造特征完成,共耗时:{}min".format((t_end - t_start) / 60))
Example #13
print ("y1= {0} + {1} x".format(lr_model.intercept_[0], lr_model.coef_[0][0]))
xx = np.linspace(0, 26, 100)
yy = lr_model.predict(xx.reshape(xx.shape[0], 1))
lr_score = lr_model.score(X_test, y_test)

print ("Linear regression (order 1) model score is: {0}".format(lr_score))
plt.plot(xx, yy)
plt.plot(X_test, y_test, "o")
plt.title("Linear regression (order 1) result")
plt.show()


poly = PolynomialFeatures(degree=5)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

lr_5_model = LinearRegression()
lr_5_model.fit(X_train_poly, y_train)

print ("y2= {0} + {1} x + {2} x*x + {3} x*x*x + {4} x*x*x*x +{5} x*x*x*x*x".
       format(lr_5_model.intercept_[0], lr_5_model.coef_[0][0], lr_5_model.coef_[0][1], lr_5_model.coef_[0][2],
              lr_5_model.coef_[0][3], lr_5_model.coef_[0][4]))

xx_poly = poly.transform(xx.reshape(xx.shape[0], 1))
yy_poly = lr_5_model.predict(xx_poly)

print ("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))

plt.plot(xx, yy_poly)
plt.plot(X_test, y_test, "o")
Example #14

runplt()
plt.plot(y_test, 'k-')

## Build a linear regression and plot with the trained model
regressor = LinearRegression()
regressor.fit(X_train, y_train)
yy = regressor.predict(X_test)
#df_all['LR1'] = pd.Series()
#df_all['LR1'][count+1:count+count+1] = yy
plt.plot(yy, 'y-')

quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(X_test)
plt.plot(regressor_quadratic.predict(xx_quadratic), 'r-')

cubic_featurizer = PolynomialFeatures(degree=3)
X_train_cubic = cubic_featurizer.fit_transform(X_train)
X_test_cubic = cubic_featurizer.transform(X_test)
regressor_cubic = LinearRegression()
regressor_cubic.fit(X_train_cubic, y_train)
xx_cubic = cubic_featurizer.transform(X_test)
plt.plot(regressor_cubic.predict(xx_cubic), 'g')


seventh_featurizer = PolynomialFeatures(degree=7)
x_train=[[6],[8],[10],[14],[18]]
y_train=[[7],[9],[13],[17.5],[18]]

x_test = [[6],[8],[11],[16]]
y_test = [[8],[12],[15],[18]]

regressor = LinearRegression()
regressor.fit(x_train,y_train)
xx = np.linspace(0,26,100)
yy=regressor.predict(xx.reshape(xx.shape[0],1))
plt.plot(xx,yy)


quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)

regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic,y_train)

xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0],1))


print(x_train)
print(x_train_quadratic)
print(x_test)
print(x_test_quadratic)


print('Simple linear regression r-squared',regressor.score(x_test,y_test))
print('Quadratic regression r-squared',regressor_quadratic.score(x_test_quadratic,y_test))
Example #16
for iteration in range(iter):
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients
print("BatchGD Thetas:", theta)

# A linear model doesn't seem appropriate. Try Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False) #quadratic
X_poly = poly_features.fit_transform(X) # Square each feature and add as feature
print('Feature (before):', X[0], ', features (after):', X_poly[0])
lin_reg = linear_model.LinearRegression()
lin_reg.fit(X_poly, y)
print('lin_reg Thetas:', lin_reg.intercept_, lin_reg.coef_)
# Plot our new quadratic model
X_new=np.linspace(0, 31, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)
plt.plot(X_new, y_new, "m-", linewidth=2, label="PolynomialSGD")

# Now make predictions with each model and compare
pred = [[15]]
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty="l2", random_state=42, max_iter = 50, tol=1e-3) #sgd with l2 = Ridge
sgd_reg.fit(X, y)#.ravel())
y_new = sgd_reg.predict(X_new)
plt.plot(X_new, y_new, "b-", linewidth=2, label="RidgeSGD")
print("sgd:", sgd_reg.predict(pred))



from sklearn.linear_model import Ridge #try making alpha very big or very small and compare with Ridge equation
Example #17
features = train.columns[:-1]
test_features = test.columns[1:]
for ind, (train_index, test_index) in enumerate(folds):

    print()
    print('Fold:', ind)

    train_train = train.iloc[train_index]
    train_test = train.iloc[test_index]

    print('train shape:', train_train.shape)
    print('test shape:', train_test.shape)

    poly = PolynomialFeatures(include_bias=False)
    train_train_poly = poly.fit_transform(train_train[features])
    train_test_poly = poly.transform(train_test[features])

    pca = PCA(n_components=2)
    x_train_train_poly = pca.fit_transform(train_train_poly)
    x_train_test_poly = pca.transform(train_test_poly)

    train_train.insert(1, 'pca_poly1', x_train_train_poly[:, 0])
    train_train.insert(1, 'pca_poly2', x_train_train_poly[:, 1])

    train_test.insert(1, 'pca_poly1', x_train_test_poly[:, 0])
    train_test.insert(1, 'pca_poly2', x_train_test_poly[:, 1])

    features = train.columns[:-1]
    dtrain_train = xgb.DMatrix(train_train[features],
                               train_train.target.values,
                               silent=True)
Example #18
fig2 = plt.figure()
ax2 = fig2.add_subplot(1,1,1) 
x = np.linspace(0,100,21).reshape((21,1))
y = np.array(columns_pre_sum).reshape((21,1))/100000
ax2.plot(x,y,'ko-',label = "line chart")

      
#create cubic equation  
featurizer = PolynomialFeatures(degree = 3 )
x_featurizer = featurizer.fit_transform(x) 
regressor_featurizer = linear_model.LinearRegression()
regressor_featurizer.fit(x_featurizer,y)

#Fitting curve
xx = np.linspace(0,100,1000)
xx_featurizer = featurizer.transform(xx.reshape((1000,1)))
yy_featurizer = regressor_featurizer.predict(xx_featurizer)
ax2.plot(xx,yy_featurizer,"g-",label = "Curve fitting")
ax2.legend(loc = 1)
ax2.set_xlabel("The ratio of self-driving cars (%)")
ax2.set_ylabel("Daily traffic flow in country of KING$(1e5)$")
fig2.savefig(("fitting.png"),dpi = 200)        

R1_featurizer = regressor_featurizer.score(x_featurizer,y)
print "R^2: %f " % R1_featurizer
print "Polynomial coefficients: %s" % regressor_featurizer.coef_
print "Polynomials-intercept: %s" % regressor_featurizer.intercept_       
  
#calculate the ratio of unmanned vehicles when the vehicle is the largest
yy = list(yy_featurizer.reshape((1,1000))[0])
xx_yy_fea_dict = dict(zip(yy,list(xx)))
Example #19
def test():
    df_all = base.getOneStockData('000002')
    df_all['volume_diff'] = df_all.volume.pct_change()
    for index, row in df_all[df_all.volume_diff < 0].iterrows():
        df_all.loc[index, 'volume_diff'] = row['volume_diff'] / (1 + row['volume_diff'])
    df_all['close_diff'] = df_all.close.pct_change() * 100
    df_all = df_all.dropna(subset=['volume_diff', 'close_diff'])
    df_all = df_all[df_all.close_diff < 11]
    df_all = df_all[df_all.close_diff > -11]
    df_all = df_all[abs(df_all.volume_diff) > 1]
    dfx = df_all[['volume_diff']]
    dfy = df_all[['close_diff']]
    # X_train = [[6], [8], [10], [14], [18]]
    # y_train = [[7], [9], [13], [17.5], [18]]
    # X_test = [[6], [8], [11], [16]]
    # y_test = [[8], [12], [15], [18]]
    print(df_all.shape)
    count = dfx.shape[0] // 2 - 3
    X_train = dfx[:count]
    y_train = dfy[1:count + 1]
    X_test = dfx[count:count + count]
    y_test = dfy[count + 1:count + count + 1]

    runplt(X_train, y_train, X_test, y_test)
    plt.plot(X_train, y_train, 'k.')

    # Build a linear regression and plot with the trained model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    yy = regressor.predict(X_test)
    # df_all['LR1'] = pd.Series()
    # df_all['LR1'][count+1:count+count+1] = yy
    plt.plot(y_train, yy, 'y-')

    quadratic_featurizer = PolynomialFeatures(degree=2)
    X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
    X_test_quadratic = quadratic_featurizer.transform(X_test)
    regressor_quadratic = LinearRegression()
    regressor_quadratic.fit(X_train_quadratic, y_train)
    xx_quadratic = quadratic_featurizer.transform(X_test)
    plt.plot(X_test, regressor_quadratic.predict(xx_quadratic), 'r-')

    cubic_featurizer = PolynomialFeatures(degree=3)
    X_train_cubic = cubic_featurizer.fit_transform(X_train)
    X_test_cubic = cubic_featurizer.transform(X_test)
    regressor_cubic = LinearRegression()
    regressor_cubic.fit(X_train_cubic, y_train)
    xx_cubic = cubic_featurizer.transform(X_test)
    plt.plot(X_test, regressor_cubic.predict(xx_cubic), 'g')

    seventh_featurizer = PolynomialFeatures(degree=7)
    X_train_seventh = seventh_featurizer.fit_transform(X_train)
    X_test_seventh = seventh_featurizer.transform(X_test)
    regressor_seventh = LinearRegression()
    regressor_seventh.fit(X_train_seventh, y_train)
    xx_seventh = seventh_featurizer.transform(X_test)
    plt.plot(X_test, regressor_seventh.predict(xx_seventh), 'b')

    plt.plot(X_test, y_test, 'm+')

    plt.show()
    # print(X_train_cubic)
    # print(X_test_cubic)
    # print(X_train_seventh)
    # print(X_test_seventh)
    print('1 r-squared (linear)', regressor.score(X_test, y_test))
    print('2 r-squared', regressor_quadratic.score(X_test_quadratic, y_test))
    print('3 r-squared', regressor_cubic.score(X_test_cubic, y_test))
    print('7 r-squared', regressor_seventh.score(X_test_seventh, y_test))
Example #20
def model_it(mode, training_fraction, polynomial_degree, interactions_only):
    base_set = integrated_data[integrated_data.type == mode].sort_values(by=['date', 'hits_hour'],ascending=True)
    base_set['row_num'] = pd.Series(range(0, base_set.shape[0]), index=base_set.index)

    base_set['cumsum_7_tv_dur'] = pd.Series(base_set['tv_duration_secs'].rolling(window=168, center=False).sum())
    base_set['cumsum_7_radio_dur'] = pd.Series(base_set['radio_duration_secs'].rolling(window=168, center=False).sum())

    base_set['cumsum_14_tv_dur'] = pd.Series(base_set['tv_duration_secs'].rolling(window=2*168, center=False).sum())
    base_set['cumsum_14_radio_dur'] = pd.Series(base_set['radio_duration_secs'].rolling(window=2*168, center=False).sum())

    name_list = list(base_set.columns.values.tolist())

    lag_names = ['sessions_lag', 'registrations_lag', 'PL_QualStart_lag', 'PL_Submit_lag', 'SLR_QualStart_lag',
                 'SLR_Submit_lag']

    lags = list(range(1, 7)) + [24, 48, 72, 96, 120, 144, 168]

    names = [t + "_" + str(l) for t in lag_names for l in lags]

    name_list += names

    lag_vars = ['sessions', 'registrations', 'PL_QualStart', 'PL_Submit', 'SLR_QualStart', 'SLR_Submit']

    new_cols = pd.DataFrame()

    for var in lag_vars:
        for lag in lags:
            new_data = base_set[var].shift(lag)
            new_cols = pd.concat([new_cols, new_data], axis=1, )

    enhanced_set = pd.concat([base_set, new_cols], axis=1)

    enhanced_set.columns = name_list

    enhanced_set['tv_dur_cumsum'] = enhanced_set['tv_duration_secs'].cumsum()
    enhanced_set['radio_dur_cumsum'] = enhanced_set['radio_duration_secs'].cumsum()

    # Create some sine wave bounded from 0 to 1 for hourly predictions since we know traffic is at a low point at 12 AM
    Sines1 = [((np.sin(((int(x) + 0) / 12.0) * np.pi - np.pi / 2) + 1) / 2) for x in enhanced_set['hits_hour']]
    Sines2 = [((np.sin(((int(x) + 3) / 12.0) * np.pi - np.pi / 2) + 1) / 2) for x in enhanced_set['hits_hour']]
    Sines3 = [((np.sin(((int(x) + 6) / 12.0) * np.pi - np.pi / 2) + 1) / 2) for x in enhanced_set['hits_hour']]
    Sines4 = [((np.sin(((int(x) + 9) / 12.0) * np.pi - np.pi / 2) + 1) / 2) for x in enhanced_set['hits_hour']]
    enhanced_set['sines1'] = pd.Series(data=Sines1, index=enhanced_set.index)
    enhanced_set['sines2'] = pd.Series(data=Sines2, index=enhanced_set.index)
    enhanced_set['sines3'] = pd.Series(data=Sines3, index=enhanced_set.index)
    enhanced_set['sines4'] = pd.Series(data=Sines4, index=enhanced_set.index)

    model_data = enhanced_set.dropna()
    training_limit = int(math.ceil(model_data.shape[0] * training_fraction))


    if SinusoidModel:
        model_cols = names + ['sines1', 'sines2', 'sines3', 'sines4',
                              'tv_dur_cumsum', 'radio_dur_cumsum',
                              'cumsum_7_tv_dur', 'cumsum_14_tv_dur',
                              'cumsum_7_radio_dur', 'cumsum_14_radio_dur'] + shows
    else:
        model_cols = names + shows

    explanatory_vars = model_data[model_cols]

    explained_vars = model_data[['sessions',
                                 'registrations',
                                 'PL_QualStart',
                                 'PL_Submit',
                                 'SLR_QualStart',
                                 'SLR_Submit']]

    training_xvar = explanatory_vars[0:training_limit]
    training_yvar = explained_vars[0:training_limit]

    poly = PolynomialFeatures(degree=polynomial_degree, interaction_only=interactions_only, include_bias=False)

    transformed_xvar = poly.fit_transform(training_xvar)
    target_feature_names = ['x'.join(['{}^{}'.format(pair[0], pair[1]) for pair in tuple if pair[1] != 0]) for tuple in
                            [zip(training_xvar.columns, p) for p in poly.powers_]]

    transformed_xvar = pd.DataFrame(transformed_xvar, columns=target_feature_names)

    transformed_explanatory = poly.transform(explanatory_vars)
    target_feature_names = ['x'.join(['{}^{}'.format(pair[0], pair[1]) for pair in tuple if pair[1] != 0]) for tuple in
                            [zip(explanatory_vars.columns, p) for p in poly.powers_]]
    transformed_explanatory = pd.DataFrame(transformed_explanatory, columns=target_feature_names)

    print "data formed, training model using {0} observations and {1} features".format(training_xvar.shape[0],
                                                                                               training_xvar.shape[1])

    ############################
    #
    # Training
    #
    ############################

    loop_components = ['type', 'hour']

    session_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'PL_QualStart|PL_Submit|SLR_QualStart|SLR_Submit|registrations', string=col)]
    session_df = transformed_xvar[session_vars]
    linreg_ARIMA_sessions = fit_formula(x_vars=session_df, y=training_yvar['sessions'])

    registration_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'sessions|PL_Submit|SLR_QualStart|SLR_Submit|PL_QualStart', string=col)]
    registration_df = transformed_xvar[registration_vars]
    linreg_ARIMA_registrations = fit_formula(x_vars=registration_df, y=training_yvar['registrations'])

    plqs_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'sessions|PL_Submit|SLR_QualStart|SLR_Submit|registrations', string=col)]
    plqs_df = transformed_xvar[plqs_vars]
    linreg_ARIMA_PL_QS = fit_formula(x_vars=plqs_df, y=training_yvar['PL_QualStart'])

    plsub_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'sessions|PL_QualStart|SLR_QualStart|SLR_Submit|registrations', string=col)]
    plsub_df = transformed_xvar[plsub_vars]
    linreg_ARIMA_PL_Submit = fit_formula(x_vars=plsub_df, y=training_yvar['PL_Submit'])

    slrqs_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'sessions|PL_Submit|PL_QualStart|SLR_Submit|registrations', string=col)]
    slrqs_df = transformed_xvar[slrqs_vars]
    linreg_ARIMA_SLR_QS = fit_formula(x_vars=slrqs_df, y=training_yvar['SLR_QualStart'])

    slrsub_vars = [col for col in transformed_xvar.columns.values if
                  not re.search(pattern=r'sessions|PL_Submit|SLR_QualStart|PL_QualStart|registrations', string=col)]
    slrsub_df = transformed_xvar[slrsub_vars]
    linreg_ARIMA_SLR_Submit = fit_formula(x_vars=slrsub_df, y=training_yvar['SLR_Submit'])

    print "{0} traffic ARIMA models trained".format(mode)

    session_df = transformed_explanatory[session_vars]
    model_data.loc[:, 'predicted_sessions'] = pd.Series(data=linreg_ARIMA_sessions.predict(session_df),
                                                        index=model_data.index)
    plqs_df = transformed_explanatory[plqs_vars]
    model_data.loc[:, 'predicted_PL_QualStart'] = pd.Series(data=linreg_ARIMA_PL_QS.predict(plqs_df),
                                                            index=model_data.index)
    plsub_df = transformed_explanatory[plsub_vars]
    model_data.loc[:, 'predicted_PL_Submit'] = pd.Series(data=linreg_ARIMA_PL_Submit.predict(plsub_df),
                                                         index=model_data.index)
    slrqs_df = transformed_explanatory[slrqs_vars]
    model_data.loc[:, 'predicted_SLR_QualStart'] = pd.Series(data=linreg_ARIMA_SLR_QS.predict(slrqs_df),
                                                             index=model_data.index)
    slrsub_df = transformed_explanatory[slrsub_vars]
    model_data.loc[:, 'predicted_SLR_Submit'] = pd.Series(data=linreg_ARIMA_SLR_Submit.predict(slrsub_df),
                                                          index=model_data.index)
    registration_df = transformed_explanatory[registration_vars]
    model_data.loc[:, 'predicted_registrations'] = pd.Series(data=linreg_ARIMA_registrations.predict(registration_df),
                                                             index=model_data.index)



    # Model statistics
    model_data.loc[:, 'error_ARIMA'] = model_data['predicted_sessions'] - model_data['sessions']

    print("{0} Data metrics".format(mode))
    print(metrics.r2_score(y_true=model_data['sessions'], y_pred=model_data['predicted_sessions']))
    print(metrics.r2_score(y_true=model_data['registrations'], y_pred=model_data['predicted_registrations']))
    print(metrics.r2_score(y_true=model_data['PL_QualStart'], y_pred=model_data['predicted_PL_QualStart']))
    print(metrics.r2_score(y_true=model_data['PL_Submit'], y_pred=model_data['predicted_PL_Submit']))
    print(metrics.r2_score(y_true=model_data['SLR_QualStart'], y_pred=model_data['predicted_SLR_QualStart']))
    print(metrics.r2_score(y_true=model_data['SLR_Submit'], y_pred=model_data['predicted_SLR_Submit']))

    # model_data.to_csv("direct_traffic_predictions_new.csv")
    #out_df = pd.concat(objs=[out_df, model_data], axis=0)

    return [model_data,
            (linreg_ARIMA_PL_QS, plqs_df),
            (linreg_ARIMA_PL_Submit, plsub_df),
            (linreg_ARIMA_registrations, registration_df),
            (linreg_ARIMA_sessions, session_df),
            (linreg_ARIMA_SLR_QS, slrqs_df),
            (linreg_ARIMA_SLR_Submit, slrsub_df)
            ]
Example #21
def feature_engineering(df_train, df_test):
    df_d_train = feature_engineering_step1(df_train)
    df_d_test = feature_engineering_step1(df_test)

    df_d_train_HasAge = df_d_train[df_d_train['HasAge']==1]
    df_d_test_HasAge = df_d_test[df_d_test['HasAge']==1]

    df_d_HasAge = pd.concat([df_d_train_HasAge, df_d_test_HasAge])

    #df_d_HasAge = df_d_train_HasAge

    features_age=['Sex_', 'Sex_female','Sex_male', 'Title_Age_s', 'Cabin_s', 'Embarked__C','Embarked__Q','Embarked__S','SibSp_','Parch_','Fare_','Pclass']

    X_train = df_d_HasAge[features_age]
    y_train = df_d_HasAge['Age_']

    pca = PCA(n_components=50)
    poly = PolynomialFeatures(degree=6)
    lr = LinearRegression(n_jobs=-1)

    X_train_poly = poly.fit_transform(X_train)
    X_train_poly = pca.fit_transform(X_train_poly)

    lr.fit(X_train_poly, y_train)

    # Predict for all
    X_predict_train_poly = poly.transform(df_d_train[features_age])
    X_predict_train_poly = pca.transform(X_predict_train_poly)
    df_d_train['Age_P'] = lr.predict(X_predict_train_poly)
    df_d_train['Age_P'] = df_d_train['Age_P'].apply(lambda x: 0 if x<0 else x).apply(lambda x: 80 if x>80 else x)


    X_predict_test_poly = poly.transform(df_d_test[features_age])
    X_predict_test_poly = pca.transform(X_predict_test_poly)
    df_d_test['Age_P'] = lr.predict(X_predict_test_poly)
    df_d_test['Age_P']=df_d_test['Age_P'].apply(lambda x: 0 if x<0 else x).apply(lambda x: 80 if x>80 else x)


    # Fill in Age_ as Age_P
    df_d_train.loc[df_d_train['HasAge']==0, ('Age_')]= df_d_train[df_d_train['HasAge']==0]['Age_P']
    df_d_test.loc[df_d_test['HasAge']==0, ('Age_')]= df_d_test[df_d_test['HasAge']==0]['Age_P']

    del df_d_train['Age_P']
    del df_d_test['Age_P']

    df_d_train['IsChild'] = df_d_train['Age_'].map(lambda x: 1 if x < 16 else 0)
    df_d_test['IsChild'] = df_d_test['Age_'].map(lambda x: 1 if x < 16 else 0)

    df_d_train['Fare_b'] = np.digitize(df_d_train['Fare_'], [0,10,20,30,40])
    df_d_test['Fare_b'] = np.digitize(df_d_test['Fare_'], [0,10,20,30,40])

    df_d_train['Age_b'] = np.digitize(df_d_train['Age_'], [0,5,10,15,20,25,28,30,35,40,45,50,55,60,65,70])
    df_d_test['Age_b'] = np.digitize(df_d_test['Age_'], [0,5,10,15,20,25,28,30,35,40,45,50,55,60,65,70])

    df_d_train['AgeCat']=df_d_train['Age_']
    df_d_train.loc[ (df_d_train.Age_<=14) ,'AgeCat'] = 'child'
    df_d_train.loc[ (df_d_train.Age_>60),'AgeCat'] = 'aged'
    df_d_train.loc[ (df_d_train.Age_>14) & (df_d_train.Age_ <=30) ,'AgeCat'] = 'adult'
    df_d_train.loc[ (df_d_train.Age_>30) & (df_d_train.Age_ <=60) ,'AgeCat'] = 'senior'

    df_d_test['AgeCat']=df_d_test['Age_']
    df_d_test.loc[ (df_d_test.Age_<=14) ,'AgeCat'] = 'child'
    df_d_test.loc[ (df_d_test.Age_>60),'AgeCat'] = 'aged'
    df_d_test.loc[ (df_d_test.Age_>14) & (df_d_test.Age_ <=30) ,'AgeCat'] = 'adult'
    df_d_test.loc[ (df_d_test.Age_>30) & (df_d_test.Age_ <=60) ,'AgeCat'] = 'senior'

    return pd.get_dummies(df_d_train), pd.get_dummies(df_d_test)
Example #22
# Part 4 - Cross validation - Approach 2

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

train_valid_shuffled = pd.read_csv('data/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv('data/wk3_kc_house_test_data.csv', dtype=dtype_dict)
l2_set = np.logspace(3, 9, num=13)

poly15 = PolynomialFeatures(degree= 15)
X_train = poly15.fit_transform(train_valid_shuffled['sqft_living'].values.reshape(-1, 1))
X_test = poly15.transform(test['sqft_living'].values.reshape(-1, 1))
y_train = train_valid_shuffled['price']

for i, l2 in enumerate(l2_set):
    model = linear_model.Ridge(alpha = l2, normalize = True)
    scores = cross_val_score(model, X_train, y_train, cv=10)
    print("Using L2 of ", l2, "| Mean score: ", scores.mean())
#  -0.000600028584951


# Training on test set:

model = linear_model.Ridge(alpha =3.16227766e+03, normalize = True)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
sum((y_pred - test['price']) ** 2) # 284682323929148
Example #23
from sklearn.preprocessing import PolynomialFeatures
X_train = [[6], [8], [10], [14], [18]]
X_test = [[6],  [8],   [11], [16]]
featurizer = PolynomialFeatures(degree=2)
X_train = featurizer.fit_transform(X_train)
X_test = featurizer.transform(X_test)
print(X_train)
print(X_test)
data = input()
m, n = data.split(" ")
m = int(m)
n = int(n)
X=[]
Y=[]
for i in range(0,n):
    data2 = input().split(" ")
    data = [float(x) for x in data2]
    X.append(data[:m])
    Y.append(data[m])
# Now I have to do linear regression
import numpy as np
from sklearn import linear_model

poly = PolynomialFeatures(degree=3)
X = np.matrix(X)
X = poly.fit_transform(X)
clf = linear_model.LinearRegression()
clf.fit(X, Y)

'''
Xt=np.matrix.transpose(X)
Fin=(linalg.inv(Xt.dot(X)).dot(Xt)).dot(Y)
#print Fin
'''
n = int(input())
for i in range(0, n):
    data2 = input().split(" ")
    data = np.matrix([[float(x) for x in data2]])
    data = poly.transform(data)
    print(clf.predict(data)[0])
plt.close('all')
plt.figure(1)
plt.scatter(x[:,0], x[:,1], c=y)

x, y = make_moons()
plt.figure(2)
plt.scatter(x[:,0], x[:,1], c=y)

# plt.show()

from sklearn.preprocessing import PolynomialFeatures
# Data Preprocessing routines
x = np.asmatrix([[1,2],[2,4]])
poly = PolynomialFeatures(degree = 2)
poly.fit(x)
x_poly = poly.transform(x)
print "Original x variable shape", x.shape
print x
print
print "Transformed x variables", x_poly.shape
print x_poly

# alternatively
x_poly = poly.fit_transform(x)

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

data = load_iris()
x = data['data']
y = data['target']
Example #26
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Use Scikit-learn's PolynomialFeatures to construct new features from the existing feature set
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0 and len(training_features.columns.values) <= 700:
    # The feature constructor must be fit on only the training data
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(training_features.values.astype(np.float64))
    constructed_features = poly.transform(result1.drop('class', axis=1).values.astype(np.float64))
    # Keep the class labels before overwriting result1 with the constructed features
    class_labels = result1['class'].values
    result1 = pd.DataFrame(data=constructed_features)
    result1['class'] = class_labels
else:
    result1 = result1.copy()

result2 = result1.copy()
# Perform classification with an Ada Boost classifier
adab2 = AdaBoostClassifier(learning_rate=0.15, n_estimators=500, random_state=42)
adab2.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)

result2['adab2-classification'] = adab2.predict(result2.drop('class', axis=1).values)
# drop ids and get labels
labels = train.target.values
train = train.drop("id", axis = 1)
train = train.drop("target", axis = 1)
test = test.drop("id", axis = 1)

# transform counts to TFIDF features
tfidf = feature_extraction.text.TfidfTransformer()
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# generate polynomial features
poly = PolynomialFeatures()
train = poly.fit_transform(train)
test = poly.transform(test)
#train = np.hstack((train, poly_train))
#test = np.hstack((test, poly_test))

# encode labels
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# set up datasets for cross eval
x_train, x_test, y_train, y_test = train_test_split(train, labels)

# train a DBN classifier
clf = DBN([train.shape[1], 8000, 9], learn_rates = 0.3,
            learn_rate_decays = 0.9, epochs = 50, verbose = 1) # l2_costs = 0.0001,

clf.fit(x_train, y_train)
Example #28
class Features:
    categories = [
        "Pclass",
        "Embarked"
    ]

    def __init__(self):
        self._train = None
        self._test = None
        self.scaler = StandardScaler()
        self._labels = {}  # to encode categorical variables, we use LabelEncoder to turn columns into integers,
        self._raw_features = {}
        self.enc = OneHotEncoder(sparse=False)  # then OneHotEncoder to turn integers into binary arrays.
        self.poly = PolynomialFeatures(2)
        # For example "Embarked C" --> 1 --> [0, 1, 0].
        self._is_scaled = False
        self._is_encoded = False
        self._means = {}

    def category_labels(self):
        labels = []
        for category in self.categories:
            for j in self.labelencoder(category).classes_:
                labels.append("{:s} {}".format(category, j))
        return labels

    def feature_labels(self):
        return ["gender",
                "age",
                "siblings and spouses",
                "parents and children",
                "fare"] + self.category_labels()

    @property
    def feature_funcs(self):
        return [self.gender_func,
                self.float_col("Age"),
                self.float_col("SibSp"),
                self.float_col("Parch"),
                self.float_col("Fare"),
                self.category_cols,
                self.poly_age_class
                ]

    def _encode(self):
        if not self._is_encoded:
            self._is_encoded = True

            for cat in self.categories:
                self.train[cat] = self.labelencoder(cat).transform(self.train[cat].values)

            self.enc.fit(self.train[self.categories].values)

    def labelencoder(self, col):
        if col not in self._labels:
            self._labels[col] = LabelEncoder().fit(self.train[col].values)
        return self._labels[col]

    def label_col(self, row, col):
        return self.labelencoder(col).transform(row[col])

    def mean_col(self, col):
        if col not in self._means:
            self._means[col] = numpy.mean([float(j[col]) for (idx, j) in self.train.iterrows() if j[col]])
        return self._means[col]

    def float_col(self, col):
        def func(row):
            try:
                return [float(row[col])]
            except ValueError:
                return [self.mean_col(col)]

        return func

    def poly_age_class(self, row):
        '''
        poly_age_class takes in the age and class of the passenger and creates
        a list of degree-2 polynomial features
        '''
        klass = self.label_col(row, 'Pclass')
        age = row['Age']
        # fit_transform: self.poly is never fitted elsewhere, and transform expects a 2-D array
        return [x for x in self.poly.fit_transform([[klass, age]])[0]]

    def category_cols(self, row):
        self._encode()
        try:
            val = [self.label_col(row, cat) for cat in self.categories]
            return self.enc.transform([val]).tolist()[0]
        except ValueError as e:
            print('\n\n*** ERROR: caught value error', e, '***\n\n')
            print('row:\n', row)
            sys.exit(1)
Example #29
plt.scatter(features, labels, color='red')
plt.plot(features, lin_reg_1.predict(features), color='blue')
plt.title('Linear Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures

poly_object = PolynomialFeatures(degree=5)
features_poly = poly_object.fit_transform(features)

lin_reg_2 = LinearRegression()
lin_reg_2.fit(features_poly, labels)

print("Predicting result with Polynomial Regression")
print(lin_reg_2.predict(poly_object.transform([[1981]])))  # transform expects a 2-D array

# Visualising the Polynomial Regression results
plt.scatter(features, labels, color='red')
plt.plot(features,
         lin_reg_2.predict(poly_object.transform(features)),
         color='blue')
plt.title('Polynomial Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()
"""
https://towardsdatascience.com/polynomial-regression-bbe8b9d97491
"""
Example #30
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

x = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1))
y = np.array([15, 11, 2, 8, 25, 32])

transformer = PolynomialFeatures(degree=2, include_bias=False)

transformer.fit(x)
x_ = transformer.transform(x)

# x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)

print(x_)

model = LinearRegression().fit(x_, y)

r_sq = model.score(x_, y)
intercept, coefficients = model.intercept_, model.coef_

y_pred = model.predict(x_)
print(y_pred)
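The snippet computes r_sq, intercept, and coefficients but never displays them; a short follow-up using the names defined above:

print('coefficient of determination:', r_sq)
print('intercept:', intercept)
print('coefficients:', coefficients)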
Example #31
 
clf = LinearRegression()
clf.fit(x,y)
pre = clf.predict([[12]])[0]
print('Predicted price for a 12-inch diameter: $%.2f' % pre)
x2 = [[0],[12],[15],[25]]
y2 = clf.predict(x2)
 
import matplotlib.pyplot as plt
import numpy as np
 
plt.figure()
plt.axis([0,25,0,25])
plt.scatter(x,y,marker="s",s=20)
plt.plot(x2,y2,"g-")
 
# Import the polynomial regression model
from sklearn.preprocessing import PolynomialFeatures
xx = np.linspace(0, 25, 100)  # evenly spaced points from 0 to 25
quadratic_featurizer = PolynomialFeatures(degree=2)  # instantiate a quadratic polynomial featurizer
x_train_quadratic = quadratic_featurizer.fit_transform(x)  # transform the samples x with the quadratic polynomial
X_test_quadratic = quadratic_featurizer.transform(x2)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))  # apply the fitted polynomial featurizer to a grid of points, producing a matrix
 
plt.plot(xx, regressor_quadratic.predict(xx_quadratic),
         label="$y = ax^2 + bx + c$",linewidth=2,color="r")
plt.legend()
plt.show()
Example #32
def prep4WnD(data, label = None):
    data[['방송월', '방송시간(시간)', '방송시간(분)']] = data[['방송월', '방송시간(시간)', '방송시간(분)']].astype(int)

    data =  data.merge(scale_timeS,  on = ['방송월', '방송일'], how = 'left').fillna(0)

    data =  data.merge(scale_timeY,  on = ['방송월', '방송일', '방송시간(시간)'], how = 'left').fillna(0)

    data =  data.merge(scale_timeR,  on = ['방송시간(시간)'], how = 'left').fillna(0)

    data = data.merge(volume_v3, on = '방송시간(시간)', how = 'left')

    data = data.merge(volume_v4, on = ['방송월', '방송일', '방송시간(시간)'], how = 'left')

    data = data.fillna(0)

    data = data.merge(rate_v1[['방송월', '방송일', '일별평균시청률']], on = ['방송월', '방송일'], how = 'left')
    data['일별시간별최대시청률'] = None
    data['일별시간별평균시청률'] = None
    data['일별시간별중간시청률'] = None

    for m, d, h in tqdm(data[['방송월', '방송일', '방송시간(시간)']].drop_duplicates().values):
        max_r = rate_v3.loc[(rate_v3['방송월'] == m) & (rate_v3['방송일'] == d), h].values[0]
        min_r = rate_v4.loc[(rate_v4['방송월'] == m) & (rate_v4['방송일'] == d), h].values[0]
        med_r = rate_v5.loc[(rate_v5['방송월'] == m) & (rate_v5['방송일'] == d), h].values[0]

        data.loc[(data['방송월'] == m) & (data['방송일'] == d) & (data['방송시간(시간)'] == h), ['일별시간별최대시청률', '일별시간별평균시청률', '일별시간별중간시청률']] = [max_r, min_r, med_r]

    data['시간별월별최대시청률'] = None
    data['시간별월별평균시청률'] = None
    data['시간별월별중간시청률'] = None

    for m,h in tqdm(data[['방송월', '방송시간(시간)']].drop_duplicates().values):
        max_r = rate_v6.loc[(rate_v6['방송시간(시간)'] == h), m].values[0]
        min_r = rate_v7.loc[(rate_v7['방송시간(시간)'] == h), m].values[0]
        med_r = rate_v8.loc[(rate_v8['방송시간(시간)'] == h), m].values[0]

        data.loc[(data['방송월'] == m) & (data['방송시간(시간)'] == h), ['시간별월별최대시청률', '시간별월별평균시청률', '시간별월별중간시청률']] = [max_r, min_r, med_r]

    data = data.merge(volume_v1, on = ['방송월', '방송시간(시간)'], how = 'left')

    data = data.merge(volume_v2, on = ['방송시간(시간)'], how = 'left')

    X = data[COLUMNS]
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        X[c] = le.fit_transform(X[c])

    if args.dataset == 'train':
        
        label = pd.get_dummies(label).values
        
        from sklearn.model_selection import train_test_split
        
        x_train, x_valid, y_train, y_valid = train_test_split(X, label, test_size = 0.2, random_state = 42)

        x_train_category = np.array(x_train[CATEGORICAL_COLUMNS])
        x_valid_category = np.array(x_valid[CATEGORICAL_COLUMNS])
        x_train_continue = np.array(x_train[CONTINUOUS_COLUMNS], dtype = 'float64')
        x_valid_continue = np.array(x_valid[CONTINUOUS_COLUMNS], dtype = 'float64')

        scaler = MinMaxScaler()
        x_train_continue = scaler.fit_transform(x_train_continue)
        x_valid_continue = scaler.transform(x_valid_continue)

        poly = PolynomialFeatures(degree=2, interaction_only=True)
        x_train_category_poly = poly.fit_transform(x_train_category)
        x_valid_category_poly = poly.transform(x_valid_category)
        
        joblib.dump(scaler, os.path.join('..', 'data', '04_임시데이터', 'scaler4rec.pkl'))
        
        data4train = (x_train_continue, x_train_category, x_train_category_poly, y_train)
        data4valid = (x_valid_continue, x_valid_category, x_valid_category_poly, y_valid)
        return X, data4train, data4valid
    
    elif args.dataset == 'test':
        
        X_category = np.array(X[CATEGORICAL_COLUMNS])
        X_continue = np.array(X[CONTINUOUS_COLUMNS], dtype = 'float64')
        
        scaler = joblib.load(os.path.join('..', 'data', '04_임시데이터', 'scaler4rec.pkl'))
        
        X_continue = scaler.transform(X_continue)  # use the saved scaler; do not refit on test data
        
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        X_category_poly = poly.fit_transform(X_category)
        
        data4test = (X_continue, X_category, X_category_poly)
        
        return X, data4test
Example #33
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# import the mglearn module
import sys
sys.path.append("../")
import mglearn

X, y = mglearn.datasets.make_wave(n_samples=100)
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

# include polynomials up to x ** 10:
# the default "include_bias=True" adds a feature that's constantly 1
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)

X_poly = poly.transform(X)
print("X_poly.shape: {}".format(X_poly.shape))
print("Entries of X:\n{}".format(X[:5]))
print("Entries of X_poly:\n{}".format(X_poly[:5]))
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

reg = LinearRegression().fit(X_poly, y)

line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
plt.show()
Example #34
poly_features = poly_features.drop(columns=['TARGET'])

# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures

# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree=3)

poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)
poly_transformer.get_feature_names(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:15]

poly_features = pd.DataFrame(poly_features,
                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2',
                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
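The comment above ends before the display step. A plausible continuation (hypothetical, assuming the sorted poly_corrs Series built above):

print(poly_corrs.head(10))   # most negative correlations with TARGET
print(poly_corrs.tail(10))   # most positive correlations with TARGET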
Example #35
            print(f'PolynomialRegressor save not found. Training...')
            regr = LinearRegression()
            regr.fit(X_poly_train, y_train)
            pickle.dump(regr, open(PATH, 'wb'))
            #### fit hardcoded model #####
            # degree=1  | MSE is 0.124031
            # degree=2  | MSE is 0.118514
            # degree=3  | MSE is 0.134984  |  5000 random features

            # X_poly_train = np.array(X_poly_train)
            # size = X_poly_train.shape[1]
            # idx = np.random.random_integers(0, size, 5000)
            # X_poly_train = X_poly_train[:, idx]
            # print(f'X_poly_train {X_poly_train.shape}')

        y_pred = regr.predict(poly_reg.transform(X_test)) # [:, idx]
        mse = mean_squared_error(y_test, y_pred)
        print(f'MSE for PolynomialRegressor is {mse:.12f}\n')
    
        def visualization(start, end, X_test=X_test, y_test=y_test):
            y_pred = regr.predict(poly_reg.transform(X_test[start:end])) 
            y_test = np.array(y_test)
            # print(y_test[start:end])
            print(y_test[start:end, 0])
            plt.figure(figsize=(10, 10))
            plt.plot(y_pred[start:end,0], y_pred[start:end,1], color='r', label='x,y of y_pred')
            plt.plot(y_test[start:end,0], y_test[start:end,1], color='k', label='x,y of X_test')
            plt.show()

        # print(f'y_test\n{np.array(y_test)[-100:]}')
        #### Visualization ####
Example #36

X = np.array([race["馬番"], race["斤量"], race["単勝"], race["人気"], race["前着順"], race["前馬番"], race["前人気"]]).T
Y = np.array(race["着順"])
Z = np.array(race["単勝オッズ"])
import sklearn.model_selection

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

result = 0

today = pd.read_csv('kikka.csv', index_col=0)
mm = preprocessing.MinMaxScaler()  # create a scaler instance
today_seiki = mm.fit_transform(today)


X_train, X_test, Y_train, Y_test, Z_train, Z_test = sklearn.model_selection.train_test_split(X, Y, Z)
poly = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
today_poly = poly.transform(today_seiki)

ridge = Ridge().fit(X_train_poly, Y_train)
print(f"X_train_poly.shape : {X_train_poly.shape}")
print(f'Score with polynomial features  : {ridge.score(X_test_poly, Y_test):.3f}')

print('-'*50)

print(ridge.predict(today_poly))
Example #37
def homework():

    df = pd.read_csv(utils.PATH.COURSE_FILE(2, 'data.csv'))
    print(df.shape)
    #print(df.head())
    #print(df.info())

    X = df.drop('Grant.Status', axis=1)
    y = df['Grant.Status']

    numeric_cols = ['RFCD.Percentage.1', 'RFCD.Percentage.2', 'RFCD.Percentage.3',
                    'RFCD.Percentage.4', 'RFCD.Percentage.5',
                    'SEO.Percentage.1', 'SEO.Percentage.2', 'SEO.Percentage.3',
                    'SEO.Percentage.4', 'SEO.Percentage.5',
                    'Year.of.Birth.1', 'Number.of.Successful.Grant.1', 'Number.of.Unsuccessful.Grant.1']
    categorical_cols = list(set(X.columns.values.tolist()) - set(numeric_cols))

    X_real_zeros = X[numeric_cols].fillna(0.0)
    X_real_means = X[numeric_cols].fillna(X.mean())
    X_cat        = X[categorical_cols].fillna('NA').applymap(str)

    encoder = DictVectorizer(sparse=False)

    X_cat_oh = encoder.fit_transform(X_cat.T.to_dict().values())
    print(X_cat_oh.shape)

    X_train_real_zeros, X_test_real_zeros, \
    y_train, y_test = train_test_split(X_real_zeros, y, test_size=0.3, random_state=0)

    X_train_real_means, X_test_real_means, \
    y_train, y_test = train_test_split(X_real_means, y, test_size=0.3, random_state=0)

    X_train_cat_oh, X_test_cat_oh, \
    y_train, y_test = train_test_split(X_cat_oh, y, test_size=0.3, random_state=0)

    X_train_zeros = np.hstack([X_train_real_zeros.values, X_train_cat_oh])
    X_test_zeros  = np.hstack([X_test_real_zeros.values, X_test_cat_oh])
    X_train_means = np.hstack([X_train_real_means.values, X_train_cat_oh])
    X_test_means  = np.hstack([X_test_real_means.values, X_test_cat_oh])

    def task_1():
        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_zeros, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_zeros)[:, 1]
        auc_1 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on zeroes:', auc_1)
        #plot_scores(grid)

        grid.fit(X_train_means, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_means)[:, 1]
        auc_2 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on means:', auc_2)
        #plot_scores(grid)

        write_answer_1(auc_2, auc_1)
    #task_1()


    ##### Scaling #####

    scaler = StandardScaler()
    X_train_real_scaled = scaler.fit_transform(X_train_real_zeros)
    X_test_real_scaled  = scaler.transform(X_test_real_zeros)

    #data_numeric = pd.DataFrame(X_train_real_scaled, columns=numeric_cols)
    #list_cols = ['Number.of.Successful.Grant.1', 'SEO.Percentage.2', 'Year.of.Birth.1']
    #scatter_matrix(data_numeric[list_cols], alpha=0.5, figsize=(10, 10))
    #plt.show()

    X_train_scaled = np.hstack([X_train_real_scaled, X_train_cat_oh])
    X_test_scaled  = np.hstack([X_test_real_scaled, X_test_cat_oh])

    def task_2():
        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled zeroes:', auc)
        #plot_scores(grid)

        write_answer_2(auc)

        return
    #task_2()

    def example():
        np.random.seed(0)

        param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        cv = 3
        """Сэмплируем данные из первой гауссианы"""
        data_0 = np.random.multivariate_normal([0,0], [[0.5,0],[0,0.5]], size=40)
        """И из второй"""
        data_1 = np.random.multivariate_normal([0,1], [[0.5,0],[0,0.5]], size=40)
        """На обучение берём 20 объектов из первого класса и 10 из второго"""
        example_data_train = np.vstack([data_0[:20,:], data_1[:10,:]])
        example_labels_train = np.concatenate([np.zeros((20)), np.ones((10))])
        """На тест - 20 из первого и 30 из второго"""
        example_data_test = np.vstack([data_0[20:,:], data_1[10:,:]])
        example_labels_test = np.concatenate([np.zeros((20)), np.ones((30))])
        """Задаём координатную сетку, на которой будем вычислять область классификации"""
        xx, yy = np.meshgrid(np.arange(-3, 3, 0.02), np.arange(-3, 3, 0.02))
        """Обучаем регрессию без балансировки по классам"""
        optimizer = GridSearchCV(LogisticRegression(), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train, example_labels_train)
        """Строим предсказания регрессии для сетки"""
        Z = optimizer.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        """Считаем AUC"""
        auc_wo_class_weights = roc_auc_score(example_labels_test, optimizer.predict_proba(example_data_test)[:,1])
        plt.title('Without class weights')
        plt.show()
        print('AUC: %f'%auc_wo_class_weights)
        """Для второй регрессии в LogisticRegression передаём параметр class_weight='balanced'"""
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train, example_labels_train)
        Z = optimizer.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        auc_w_class_weights = roc_auc_score(example_labels_test, optimizer.predict_proba(example_data_test)[:,1])
        plt.title('With class weights')
        plt.show()
        print('AUC: %f'%auc_w_class_weights)
    #example()

    def task_3():
        alg = LogisticRegression(class_weight='balanced')
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc_1 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled zeroes:', auc_1)
        #plot_scores(grid)

        ## Balanced

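        # Oversample the minority class: randomly duplicate positive-class rows
        # until both classes contribute the same number of training examples.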
        np.random.seed(0)
        n0 = sum(y_train==0)
        n1 = sum(y_train==1)
        print(n0, n1)

        y_less = np.nonzero(y_train)[0]
        indices_to_add = y_less[np.random.randint(0, len(y_less), n0 - n1)]
        X_train_to_add = X_train_scaled[indices_to_add, :]
        y_train_to_add = y_train.values[indices_to_add]
        X_train_balanced = np.vstack([X_train_scaled, X_train_to_add])
        y_train_balanced = np.hstack([y_train, y_train_to_add])

        alg = LogisticRegression()
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_balanced, y_train_balanced)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc_2 = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled balanced zeroes:', auc_2)

        write_answer_3(auc_1, auc_2)

        return
    #task_3()


    X_train_real_zeros, X_test_real_zeros, \
    y_train, y_test = train_test_split(X_real_zeros, y, test_size=0.3, random_state=0, stratify=y)

    X_train_cat_oh, X_test_cat_oh, \
    y_train, y_test = train_test_split(X_cat_oh, y, test_size=0.3, random_state=0, stratify=y)

    scaler = StandardScaler()
    X_train_real_scaled = scaler.fit_transform(X_train_real_zeros)
    X_test_real_scaled  = scaler.transform(X_test_real_zeros)

    X_train_scaled = np.hstack([X_train_real_scaled, X_train_cat_oh])
    X_test_scaled  = np.hstack([X_test_real_scaled, X_test_cat_oh])

    def task_4():
        alg = LogisticRegression(class_weight='balanced')
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled y-stratified zeroes:', auc)

        write_answer_4(auc)

        return
    #task_4()

    def example_2():
        param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        cv = 3

        """Инициализируем класс, который выполняет преобразование"""
        transform = PolynomialFeatures(2)
        """Сэмплируем данные из первой гауссианы"""
        data_0 = np.random.multivariate_normal([0,0], [[0.5,0],[0,0.5]], size=40)
        """И из второй"""
        data_1 = np.random.multivariate_normal([0,1], [[0.5,0],[0,0.5]], size=40)
        """На обучение берём 20 объектов из первого класса и 10 из второго"""
        example_data_train = np.vstack([data_0[:20,:], data_1[:10,:]])
        example_labels_train = np.concatenate([np.zeros((20)), np.ones((10))])
        """На тест - 20 из первого и 30 из второго"""
        example_data_test = np.vstack([data_0[20:,:], data_1[10:,:]])
        """Обучаем преобразование на обучающей выборке, применяем его к тестовой"""
        example_data_train_poly = transform.fit_transform(example_data_train)
        example_data_test_poly = transform.transform(example_data_test)
        example_labels_test = np.concatenate([np.zeros((20)), np.ones((30))])
        """Обращаем внимание на параметр fit_intercept=False"""
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train_poly, example_labels_train)
        """Задаём координатную сетку, на которой будем вычислять область классификации"""
        xx, yy = np.meshgrid(np.arange(-3, 3, 0.02), np.arange(-3, 3, 0.02))
        Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        plt.title('With class weights')
        plt.show()


        transform = PolynomialFeatures(15)
        example_data_train_poly = transform.fit_transform(example_data_train)
        example_data_test_poly = transform.transform(example_data_test)
        optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1)
        optimizer.fit(example_data_train_poly, example_labels_train)
        Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
        plt.scatter(data_0[:,0], data_0[:,1], color='red')
        plt.scatter(data_1[:,0], data_1[:,1], color='blue')
        plt.title('Corrected class weights')
        plt.show()
        return
    #example_2()


    poly = PolynomialFeatures(2)
    X_train_real_zeros_poly = poly.fit_transform(X_train_real_zeros)
    X_test_real_zeros_poly  = poly.transform(X_test_real_zeros)

    scaler = StandardScaler()
    X_train_real_poly_scaled = scaler.fit_transform(X_train_real_zeros_poly)
    X_test_real_poly_scaled  = scaler.transform(X_test_real_zeros_poly)

    X_train_poly_scaled = np.hstack([X_train_real_poly_scaled, X_train_cat_oh])
    X_test_poly_scaled  = np.hstack([X_test_real_poly_scaled, X_test_cat_oh])

    def task_5():
        alg = LogisticRegression(class_weight='balanced', fit_intercept=False)
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_poly_scaled, y_train)
        print(grid.best_params_, grid.best_score_)

        y_pred = grid.predict_proba(X_test_poly_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred)
        print('ROC AUC on scaled y-stratified zeroes:', auc)

        write_answer_5(auc)
        return
    #task_5()

    def task_6():
        alg = LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear')  # liblinear supports the L1 penalty
        params_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
        grid = GridSearchCV(alg, params_grid, cv=3, n_jobs=-1)
        grid.fit(X_train_scaled, y_train)
        print(grid.best_params_, grid.best_score_)

        alg = grid.best_estimator_
        zero_ids = np.where(alg.coef_[0,:X_train_real_scaled.shape[1]] == 0)[0]

        write_answer_6(zero_ids)
        return
    task_6()

    return
Esempio n. 38
0
linreg1 = LinearRegression().fit(polytrain1 , y_train)

x_pred1 = poly1.transform(x_pred)
y_pred1 = linreg1.predict(x_pred1)
final_array = np.array(y_pred1, ndmin=2)

'''

# 6 9
x_pred = np.linspace(0, 10, 100).reshape(-1, 1)
final_array = np.empty(shape=(4, 100))
for index, value in enumerate([1, 3, 6, 9]):
    poly = PolynomialFeatures(degree=value)
    poly_train = poly.fit_transform(X_train.reshape(-1, 1))
    linreg = LinearRegression().fit(poly_train, y_train)
    x_pred_poly = poly.transform(x_pred)
    y_pred = linreg.predict(x_pred_poly)
    final_array[index] = y_pred

plt.figure()
plt.scatter(X_train, y_train)
plt.scatter(X_test, y_test)
plt.plot(x_pred, final_array[0], lw=3)
plt.plot(x_pred, final_array[1], lw=3)
plt.plot(x_pred, final_array[2], lw=3)
plt.plot(x_pred, final_array[3], lw=3)
plt.legend(['train', 'test', 'degree 1', 'degree 3', 'degree 6', 'degree 9'])

# ANSWER 2

from sklearn.metrics import r2_score
Esempio n. 39
0
def runplt():
    plt.figure()
    plt.title('Pizza price vs. diameter', fontproperties=font)
    plt.xlabel('Diameter (inches)', fontproperties=font)
    plt.ylabel('Price (USD)', fontproperties=font)
    plt.axis([0, 25, 0, 25])
    plt.grid(True)
    return plt


plt = runplt()
plt.plot(X_train, y_train, 'k.')

quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), 'r-')

seventh_featurizer = PolynomialFeatures(degree=7)
X_train_seventh = seventh_featurizer.fit_transform(X_train)
X_test_seventh = seventh_featurizer.transform(X_test)
regressor_seventh = LinearRegression()
regressor_seventh.fit(X_train_seventh, y_train)
xx_seventh = seventh_featurizer.transform(xx.reshape(xx.shape[0], 1))
plt.plot(xx, regressor_seventh.predict(xx_seventh))
plt.show()
print('Quadratic regression r-squared', regressor_quadratic.score(X_test_quadratic, y_test))
print('Seventh-degree regression r-squared', regressor_seventh.score(X_test_seventh, y_test))
Esempio n. 40
0
Xtest = scaler.transform(xtest.reshape(-1, 1))

degs = np.arange(1, 21, 1)
ndegs = np.max(degs)
mse_train = np.empty(ndegs)
mse_test = np.empty(ndegs)
ytest_pred_stored = np.empty(ndegs, dtype=np.ndarray)
ytrain_pred_stored = np.empty(ndegs, dtype=np.ndarray)
for deg in degs:
    model = LinearRegression()
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    Xtrain_poly = poly_features.fit_transform(Xtrain)
    model.fit(Xtrain_poly, ytrain)
    ytrain_pred = model.predict(Xtrain_poly)
    ytrain_pred_stored[deg - 1] = ytrain_pred
    Xtest_poly = poly_features.transform(Xtest)
    ytest_pred = model.predict(Xtest_poly)
    mse_train[deg - 1] = mse(ytrain_pred, ytrain)
    mse_test[deg - 1] = mse(ytest_pred, ytest)
    ytest_pred_stored[deg - 1] = ytest_pred

# Plot MSE vs degree
fig, ax = plt.subplots()
mask = degs <= 15
ax.plot(degs[mask], mse_test[mask], color='r', marker='x', label='test')
ax.plot(degs[mask], mse_train[mask], color='b', marker='s', label='train')
ax.legend(loc='upper right', shadow=True)
plt.xlabel('degree')
plt.ylabel('mse')
pml.savefig('polyfitVsDegree.pdf')
plt.show()
Esempio n. 41
0
# Feature Scaling
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

# Model 2
lm_reg = Ridge(alpha=1)

# Feature Transform
poly = PolynomialFeatures(degree=2) 

X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

# Model 3
lm_poly = LinearRegression()


############### 5.Choose Model ###############

lm.fit(X_train, y_train)
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')

lm_reg.fit(X_train_scaled, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')

lm_poly.fit(X_train_poly, y_train)
Esempio n. 42
0
# polynomial model (degree=2)
poly2 = PolynomialFeatures(degree=2)
X_poly2 = poly2.fit_transform(X)
poly2_fit = LinearRegression(fit_intercept=False)

# polynomial model (degree=10)
poly10 = PolynomialFeatures(degree=10)
X_poly10 = poly10.fit_transform(X)
poly10_fit = LinearRegression(fit_intercept=False)

# option 1: one loop for everything (faster)

# for plotting purposes
x_linspace = np.linspace(np.min(X), np.max(X), 100)
X_linspace = x_linspace.reshape(-1, 1)
X_linspace_poly2 = poly2.transform(X_linspace)
X_linspace_poly10 = poly10.transform(X_linspace)

scores_linreg_fit = []
scores_poly2_fit = []
scores_poly10_fit = []
kf = KFold(n_splits=5)
for train, test in kf.split(X):
    X_test, X_train = X[test], X[train]
    y_test, y_train = y[test], y[train]

    # subset training data
    X_lin_train = X[train]
    X_poly2_train = X_poly2[train]
    X_poly10_train = X_poly10[train]
Esempio n. 43
0
y1 = np.array(temp_start_China).reshape(114, 1)
clf.fit(x, y)
clf1.fit(x1, y1)  # first-degree (linear) fit

year_set_China = year_set = list(range(2014, 2036))  # years to forecast
predict_temp = clf.predict([[i] for i in year_set])  # estimate future temperatures with the linear model
predict_temp_China = clf1.predict(
    [[i] for i in year_set_China])  # estimate China's future temperatures with the linear model
predict_temp_list = [i[0] for i in predict_temp]  # flatten to a 1-D list
predict_temp_China_list = [i[0] for i in predict_temp_China]  # flatten to a 1-D list

poly = PolynomialFeatures(degree=2)  # degree-2 terms for a polynomial forecast
x_poly = poly.fit_transform(x)
clf_ = LinearRegression()  # second model instance for the quadratic fit
clf_.fit(x_poly, y)
y_predict = clf_.predict(x_poly)  # fitted values on the training years
predict_temp_two_list = [i[0] for i in y_predict]  # flatten to a 1-D list

x, x1, x_dimensional = [], [], []  # empty lists to store the temperatures
for i in range(0, 22):
    x.append(
        opts.LineItem(name=year_set[i],
                      value=round(predict_temp_list[i], 2),
                      itemstyle_opts=opts.ItemStyleOpts(color='purple')))
for i in range(0, 22):
    x1.append(
        opts.LineItem(name=year_set_China[i],
                      value=round(predict_temp_China_list[i], 2),
                      itemstyle_opts=opts.ItemStyleOpts(color='blue')))
for i in range(0, 22):
Esempio n. 44
0
x_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

x_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(0, 26, 100)
yy = regressor.predict(xx.reshape(xx.shape[0], 1))
plt.plot(xx, yy)

quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)

regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y_train)

xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))

print(x_train)
print(x_train_quadratic)
print(x_test)
print(x_test_quadratic)

print('Simple linear regression r-squared', regressor.score(x_test, y_test))
print('Quadratic regression r-squared',
      regressor_quadratic.score(x_test_quadratic, y_test))
Esempio n. 45
0
# Multivariate linear regression:
model = LinearRegression()
model.fit(X, y)
X_test = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
y_test = [[11], [8.5], [15], [18], [11]]
predictions = model.predict(X_test)
for i, prediction in enumerate(predictions):
    print('Predicted: %s, Target: %s' % (prediction, y_test[i]))
print('R-squared: %.2f' % model.score(X_test, y_test))

# Multivariate polynomial regression
X_train = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
y_train = [[7], [9], [13], [17.5], [18]]

quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)

regressor_quadratic = LinearRegression()
# train
regressor_quadratic.fit(X_train_quadratic, y_train)

xx_quadratic = quadratic_featurizer.transform([[6, 3]])
print("预测结果: {}".format(regressor_quadratic.predict(xx_quadratic)))


def main():
    pass


if __name__ == '__main__':
    main()
Esempio n. 46
0
X2 = [[0], [10], [14], [25]]
model = LinearRegression()
model.fit(X, y)
print 'A 12" pizza should cost: $%.2f' % model.predict([12])[0]
y2 = model.predict(X2)
plt.scatter(X, y)
# plt.plot(X2, y2, 'r-')
import numpy as np
poly = PolynomialFeatures(degree=9)
X_p = poly.fit_transform(X)
print(len(X))
xx = np.linspace(0, 26, 1000)
regressor_p = LinearRegression()
regressor_p.fit(X_p, y)
print(xx.shape)
xx_p = poly.transform(xx.reshape(xx.shape[0], 1))
plt.plot(xx, regressor_p.predict(xx_p), c='r')


################# Sample 3 #################
"""
>>> import numpy as np
>>> print 'Residual sum of squares: %.2f' % np.mean((model.predict(X) - y) ** 2)
Residual sum of squares: 1.75
"""
import numpy as np
print('Residual sum of squares: %.2f' % np.mean((model.predict(X) - y) ** 2))


################# Sample 4 #################
"""
Esempio n. 47
0
    dftagdata = dfdata[FeatureCols]
    dffaildata = dfdata[faildatacols]
    scaler = StandardScaler().fit(dftagdata)
    dftagdata = scaler.transform(dftagdata)

    x_train, x_test, y_train, y_test = train_test_split(dftagdata,
                                                        dffaildata,
                                                        random_state=0)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    for d in deg:
        print('Degree - ' + str(d))
        poly = PolynomialFeatures(degree=d)
        x_train_poly = poly.fit_transform(x_train)
        x_test_poly = poly.transform(x_test)
        pln = LogisticRegression(C=C1,
                                 max_iter=miter).fit(x_train_poly, y_train)
        print(idx + ' Train Score - ' + str(pln.score(x_train_poly, y_train)) +
              ' Test Score - ' + str(pln.score(x_test_poly, y_test)))
        print(pln.predict(x_train_poly))
        print(y_train.reshape(1, -1)[0])
        print(pln.predict(x_test_poly))
        print(y_test.reshape(1, -1)[0])

plot_graph(df, tag=FeatureCols, version=['std'])

#plot_graph(df3, tag = ['5PT1B2', '5PT3B2', '5TT1B2'],version=['max', 'min'])
#plot_graph(df3, tag = ['5PT1B2', '5PT3B2', '5PT2C1', '5PT3C1', '5PT2G1','5PT3G1', '5PT1H1', '5PT4H1', '5TT1B2', '5TT3B2', '5TT2C1', '5TT3C1','5TT2G1', '5TT3G1', '5TT1H1', '5TT4H1'],version=['max', 'min'])
#plot_graph(df3, tag = ['20PICP2Choke', '20PICP1Choke','20PT17FLDC', '20PT18FLDC', '20PT27FLDC', '20PT28FLDC', '20PT214FLLA', '20PT224FLRE', '20TT115FLT1', '20TT125FLTS', '20TT215T2FLL', '20TT225T2FLL', '20ZT114SSFL', '20ZT124SSFL', '20ZT214T2FL', '20ZT224T2FL', '21FQI10518NR', '21FT40518D', '21FT40518GVFR', '21HY10535OFL', '21HY40534OTSL', '21LIC10516SP', '21LIC10620CVH', '21LIC10620SPH', '21LIC40516SPTA', '21LT10515PVPSO', '21LT10516PVPSO', '21LT10618PVPSO2', '21LT10620PVPSO2', '21LT40515PVTA', '21LT40516PVTA', '21LY10516OPSO2', '21LY10616OPSO2', '21LY10620OSH2', '21LY11516OTT', '21LY40516OUT', '21PT10505PVPS', '21PT10605PVPS2', '21PT40505PVTA', '21TT10508PVPSO', '21TT10608PVPSO2', '21TT11616PVOTHO', '30FT19107PVSH2', '30FT19108PV', '30FT29108PV', '30FT69521PVFCP', '30LIC69516CVFCO', '30LIC69518CVFCP', '30LT69514PVFC', '30LT69515PVFC', '30LT69516PVFC', '30LT69518PVFC', '30LY69518OFCP', '30PDIC19104SPPHO', '30PDT19104PVSH2', '30PDT19104PVSHS2', '30PDT19104PVSHD2', '30PDY19104OSPH2', '30PT69503PVFC', '30PT69512PV', '30PY69503OFCO', '37PT62301PVCS' ],version=['max', 'min'])
#
Esempio n. 48
0

if __name__ == "__main__":
    
    x,y = get_data()
    
    # Divide the data into Train, dev and test    
    x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,test_size = 0.3,random_state=9)
    x_dev,x_test,y_dev,y_test = train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
    
    
    
    #Prepare some polynomial features
    poly_features = PolynomialFeatures(2,interaction_only=True)
    poly_features.fit(x_train)
    x_train_poly = poly_features.transform(x_train)
    x_dev_poly   = poly_features.transform(x_dev)
    
    # Build model with polynomial features
    model_poly = build_model(x_train_poly,y_train)
    predicted_y = model_poly.predict(x_train_poly)
    print "\n Model Performance in Training set (Polynomial features)\n"
    model_worth(y_train,predicted_y)  

    # View model details
    view_model(model_poly)
    
    # Apply the model on dev set
    predicted_y = model_poly.predict(x_dev_poly)
    print "\n Model Performance in Dev set  (Polynomial features)\n"
    model_worth(y_dev,predicted_y)  
Esempio n. 49
0
for l in list:
    t = pd.DataFrame(l, columns=['year', 'sale'])
    X = t['year'].values.reshape(-1, 1)
    y = t['sale']

    pf = PolynomialFeatures(degree=3)
    Xp = pf.fit_transform(X)

    lr = LinearRegression()
    lr.fit(Xp, y)
    if l == double11:
        test = [[11, 2684]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        print('{}{}{}{}{}{}'.format('Double 11 R2: ', lr.score(Xp, y), ' predicted ', pred,
                                    ' actual: 2684, error: ', (pred - 2684) / 2684))
    if l == Thanksgiving:
        test = [[10, 1572]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        print('{}{}{}{}{}{}'.format('Thanksgiving R2: ', lr.score(Xp, y), ' predicted ', pred,
                                    ' actual: 1572, error: ', (pred - 1572) / 1572))
    if l == Blackfriday:
        test = [[10, 2360]]
        tt = pd.DataFrame(test, columns=['year', 'sale'])
        Xt = tt['year'].values.reshape(-1, 1)
        pred = lr.predict(pf.transform(Xt))
        print('{}{}{}{}{}{}'.format('Black Friday R2: ', lr.score(Xp, y), ' predicted ', pred,
Esempio n. 50
0
        for epoch in range(n_epochs):
            for batch_index in range(n_batches):
                X_batch, y_batch = random_batch(X_train, y_train, batch_size)
                sess.run(train_op, feed_dict={X: X_batch, y: y_batch})
            if epoch %100 == 0:
                loss_val = loss.eval({X: X_train, y:y_train})
                loss_val1 = loss1.eval({X: X_train, y:y_train})
                print("Epoch:", epoch, "\tLoss:", loss_val, "\tLoss1:", loss_val1)
        best_w = w.eval()
        return best_w
w_tf = logistic_regression(X_train_tf, y_train_tf)

def log_prob(X,w):
    score = np.dot(X,w)
    prob = 1/(1 + np.exp(-score))
    return prob
y_prob_tf = log_prob(X_test_tf, w_tf)
y_pred_tf = (y_prob_tf > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_tf))


from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=3)
X_train_tf2 = pf.fit_transform(X_train)
X_test_tf2 = pf.transform(X_test)

w_tf2 = logistic_regression(X_train_tf2, y_train_tf)
y_prob_tf2 = log_prob(X_test_tf2, w_tf2)
y_pred_tf2 = (y_prob_tf2 > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_tf2))
Esempio n. 51
0
plt.ylabel('Claims Paid')
plt.show()

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_object = PolynomialFeatures(degree = 5)
features_poly = poly_object.fit_transform(features)


lin_reg_2 = LinearRegression()
lin_reg_2.fit(features_poly, labels)

# lin_reg_2.predict([[1981]]) would fail: the raw year is not in the degree-5 feature space


print ("Predicting result with Polynomial Regression")
print(lin_reg_2.predict(poly_object.transform([[1981]])))  # map the year into the degree-5 feature space first

# Visualising the Polynomial Regression results
plt.scatter(features, labels, color = 'red')
plt.plot(features, lin_reg_2.predict(poly_object.transform(features)), color = 'blue')
plt.title('Polynomial Regression')
plt.xlabel('Year')
plt.ylabel('Claims Paid')
plt.show()



"""
https://towardsdatascience.com/polynomial-regression-bbe8b9d97491
"""
Esempio n. 52
0
# Define targets
targets = [CASUAL, REGISTERED]

# CV
n_folds = 10
rmsle_fold = np.zeros(n_folds)
skf = KFold(y.shape[0], n_folds, shuffle=True, random_state=0)
i = 0
for train, test in skf:
    X_train, X_test = X.loc[train, :], X.loc[test, :]
    y_train, y_test = y.loc[train], y.loc[test]

    # Expand polynomial basis features: fit on the training fold only, then apply to the held-out fold
    poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=True)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    # Train models
    model = {}
    for target in targets:
        # clf[target] = Lasso(random_state=0, alpha=1.0, normalize=True, max_iter=1000, tol=0.0001, positive=False,
        #                     selection='cyclic')
        model[target] = Ridge(random_state=0, alpha=1.0, normalize=True, max_iter=None, tol=0.001, solver='auto')
        model[target].fit(X_train, y_train[target])

    # Predict and clip
    y_pred = model[CASUAL].predict(X_test).clip(min=0) + model[REGISTERED].predict(X_test).clip(min=0)

    # Evaluate
    rmsle_fold[i] = rmsle(y_test[COUNT], y_pred)
    print('Fold %d/%d, RMSLE = %f' % (i + 1, n_folds, rmsle_fold[i]))
Esempio n. 53
0
feat_train = sc.fit_transform(feat_train)
feat_test = sc.transform(feat_test)
'''
'''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(feat_train , labels_train)
labels_pred = regressor.predict(feat_test)
'''

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
poln_object = PolynomialFeatures(degree=2)
feat_train_poln = poln_object.fit_transform(feat_train)
feat_test_poln = poln_object.transform(feat_test)
feat_test_poln_abs = poln_object.transform(
    np.array([24, 23, 91, 1007, 5, 19, 33, 25, 55, 999, 3, 6]).reshape(1, -1))
lin_reg_2 = LinearRegression()
lin_reg_2.fit(feat_train_poln, labels_train)
lin_2_reg_pred = lin_reg_2.predict(feat_test_poln)
lin_2_reg_pred2 = lin_reg_2.predict(feat_test_poln_abs)
score = lin_reg_2.score(feat_test_poln, labels_test)

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=lin_reg_2,
                             X=feat_train_poln,
                             y=labels_train,
                             cv=10)
print("mean accuracy is", accuracies.mean())
Esempio n. 54
0
    plt.show()

    pf = PolynomialFeatures(degree=6)
    reg = LogisticRegression(C=10)
    pipeline = Pipeline([("polynomial_features", pf),
                         ("logistic_regression", reg)])

    pipeline.fit(X, f_y)
    theta = reg.coef_.T

    u = np.linspace(-1, 1.5, 50)
    v = np.linspace(-1, 1.5, 50)

    X1, X2 = np.meshgrid(u, v)
    X1, X2 = X1.reshape(-1, 1), X2.reshape(-1, 1)
    temp_X = pf.transform(np.hstack((X1, X2)))

    z = np.dot(temp_X, theta).reshape(len(u), len(v))

    plt.plot(x1[f_y == 0], x2[f_y == 0], 'yo')
    plt.plot(x1[f_y == 1], x2[f_y == 1], 'bx')
    CS = plt.contour(u, v, z)
    plt.clabel(CS, inline=1, fontsize=10)
    plt.show()

    for lambda_coef in (0.03, 0.3, 0.1, 1, 3, 10):
        c = 1.0/lambda_coef
        pipeline.set_params(**{'logistic_regression__C': c})
        pipeline.fit(X, y.ravel())
        print('lambda = {}, Train Accuracy: {}'.format(lambda_coef, pipeline.score(X, y) * 100))
Esempio n. 55
0
class Config:
    def __init__(self, adict, pc):

        for k, v in adict.items():
            setattr(self, k, v)

        self.name = ", ".join(["%s: %s" % x for x in self.__dict__.items()])
        self.datetime = str(datetime.now())
        self.vals = adict
        self.learner = self.__class__.__name__.replace("Config", "")
        self.pc = pc

        # for SVR, allow feature selection only for linear models
        if (self.learner in ["SVRpoly", "SVRrbf", "SVRsigmoid"]
                and self.pc['feature_selection'] > 0):
            raise Exception(
                "For non-linear SVR, cannot use feature selection!")

        if pc['poly_degree'] > 0:
            self.poly_features = PolynomialFeatures(pc['poly_degree'],
                                                    interaction_only=True)
        else:
            self.poly_features = None

    def __str__(self):
        return self.name

    def train(self, data):
        if self.pc['rfe_step'] != 0 and self.pc['feature_selection'] > 0:
            model = self.rfe_fit(data)
        else:
            model = self.fit(data)
        return model

    def fit(self, data):
        if isinstance(self, (ConfigGB, ConfigXGBoost)) and self.early_stopping:
            model = self.init_model(early_stopping=self.early_stopping,
                                    num_train=data.trainX.shape[0])
            x = np.concatenate((data.trainX, data.valX))
            y = np.concatenate((data.trainY, data.valY))
        else:
            model = self.init_model()
            x, y = data.trainX, data.trainY

        if self.poly_features:
            LOGGER.info("Fitting polynomial features ...")
            model.fit(self.poly_features.fit_transform(x), y)
        else:
            model.fit(x, y)

        return model

    def rfe_fit(self, data):
        """Recursive feature elimination
        """
        if isinstance(self, (ConfigGB, ConfigXGBoost)):
            model = self.init_model(early_stopping=self.early_stopping,
                                    num_train=data.trainX.shape[0])
        else:
            model = self.init_model()
        num = int(data.trainX.shape[1] * self.pc['feature_selection'])
        if num < 1:
            raise Exception(f"There will be {num} after selection!, " +
                            "change the feature_selection setting")
        LOGGER.debug("RFE will select %d features" % num)
        step = self.pc['rfe_step']
        selector = RFE(model, n_features_to_select=num, step=step)
        if isinstance(self, (ConfigGB, ConfigXGBoost)) and self.early_stopping:
            x = np.concatenate((data.trainX, data.valX))
            y = np.concatenate((data.trainY, data.valY))
            return selector.fit(x, y)
        else:
            return selector.fit(data.trainX, data.trainY)

    def forecast(self, model, testX):
        """Return forecasts for all horizons up to `horizon`
        """

        results = []
        horizon = self.pc['horizon']
        lags = self.pc['lags']
        #LOGGER.debug("Forecasting test: %s" % testX)
        n_forecast_success = 0
        n_forecast_errors = 0
        first_forecast_error = None
        for i in range(len(testX) - horizon + 1):
            for j in range(horizon):
                instance = testX[i + j]
                if j == 0:
                    buf = []
                else:
                    # insert the buffer into instance
                    buf_len = len(buf)
                    if buf_len > lags:
                        buf = buf[-lags:]
                        buf_len = lags
                    start = lags - buf_len
                    end = start + buf_len
                    instance = np.concatenate(
                        (instance[:start], buf, instance[end:]))
                if self.poly_features:
                    poly_instance = self.poly_features.transform(
                        instance.reshape(1, -1))
                    pred_val = model.predict(poly_instance)[0]
                else:
                    pred_val = model.predict(instance.reshape(1, -1))[0]
                if np.isnan(pred_val):
                    #warnings.warn("Error forecasting %s, returning 0.5"
                    #              % instance.tolist())
                    if first_forecast_error is None:
                        first_forecast_error = instance.tolist()
                    n_forecast_errors += 1
                    pred_val = 0.5
                else:
                    n_forecast_success += 1
                buf.append(pred_val)
                #LOGGER.debug("Predicting with instance %s, result %s" %
                #      (instance, pred_val))
            results.append(pred_val)

        if first_forecast_error is not None:
            LOGGER.info(f"{n_forecast_success} successes and ",
                        f"{n_forecast_errors} errors during forecasting")
            LOGGER.debug(f"First instance with error: {first_forecast_error}")

        return np.array(results).reshape(-1, 1)
Esempio n. 56
0
def learning_curve_old(data, feature_cols, target_col, classifier, train_sizes, test_sizes=200000,
    random_state=None, balanced=True, normalise=True, degree=1, pickle_path=None):
    """ Compute the learning curve of a classiifer.

        Parameters
        ----------
        data : DataFrame
            The DataFrame containing all the data.

        feature_cols : array
            A list of column names in data that are used as features.

        target_col : str
            The column name of the target.

        classifier : Classifier object
            A classifier object that will be used to train and test the data.
            It should have the same interface as scikit-learn classifiers.

        train_sizes : array
            The list of the sample sizes that the classifier will be trained on.

        test_sizes : int or list of ints
            The sizes of the test set.

        random_state : int
            The value of the random state (used for reproducibility).

        balanced : boolean
            Whether to draw a class-balanced train/test split
            (via balanced_train_test_split) instead of a purely random one.

        normalise : boolean
            Whether the data should first be normalised to zero mean and unit variance.

        degree : int
            If greater than 1, the data will first be polynomially transformed
            with the given degree.

        pickle_path : str
            The path where the values of the learning curve will be saved.

        Returns
        -------
        lc_accuracy_test : array
            The list of balanced accuracy scores for the given sample sizes.

    """

    lc_accuracy_test = []

    if type(test_sizes) is int:
        test_sizes = [test_sizes] * len(train_sizes)

    for i, j in zip(train_sizes, test_sizes):
        gc.collect()
        # split data into test set and training set
        if balanced:
            X_train, X_test, y_train, y_test = balanced_train_test_split(
                data[feature_cols], data[target_col], train_size=i, test_size=j, random_state=random_state)
        else:
            X_train, X_test, y_train, y_test = train_test_split(np.array(data[feature_cols]),
                np.array(data[target_col]), train_size=i, test_size=j, random_state=random_state)

        rs = random_state if random_state is not None else 0
        X_train, y_train = shuffle(X_train, y_train, random_state=rs * 2)
        X_test, y_test = shuffle(X_test, y_test, random_state=rs * 3)

        if normalise:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        if degree > 1:
            poly_features = PolynomialFeatures(degree=degree, interaction_only=False, include_bias=True)
            X_train = poly_features.fit_transform(X_train)
            X_test = poly_features.transform(X_test)

        # train the classifier
        classifier.fit(X_train, y_train)

        # apply classifier on test set
        y_pred_test = classifier.predict(X_test)
        confusion_test = metrics.confusion_matrix(y_test, y_pred_test)
        lc_accuracy_test.append(balanced_accuracy_expected(confusion_test))

    # pickle learning curve
    if pickle_path:
        with open(pickle_path, 'wb') as f:
            pickle.dump(lc_accuracy_test, f, protocol=4) 
    
    return lc_accuracy_test
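
# A hedged usage sketch, not from the original source: `df` and its columns are
# synthetic, LogisticRegression stands in for any scikit-learn-style classifier,
# and the module-level imports the function relies on (train_test_split, shuffle,
# StandardScaler, PolynomialFeatures, metrics, gc) plus the
# balanced_accuracy_expected helper are assumed to be available.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(2000, 3), columns=['f1', 'f2', 'f3'])
df['label'] = (df['f1'] + df['f2'] > 0).astype(int)

sizes = [100, 300, 1000]
lc = learning_curve_old(df, feature_cols=['f1', 'f2', 'f3'], target_col='label',
                        classifier=LogisticRegression(), train_sizes=sizes,
                        test_sizes=500, random_state=21, balanced=False,
                        normalise=True, degree=2)
print(list(zip(sizes, lc)))  # balanced accuracy per training size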
Esempio n. 57
0
class Classifier:

    # def __init__(self, val_size, feature_cols, label_col, feature_degree,
    #              feature_scaling, including_classes, preprocess, shuffle):
    def __init__(self, **params):

        self.val_size = params.pop('val_size')
        self.feature_degree = params.pop('feature_degree')
        self.feature_scaling = params.pop('feature_scaling')
        self.including_classes = params.pop('including_classes')
        self.add_cluster_features = params.pop('add_cluster_features')
        self.shuffle = params.pop('shuffle') and self.add_cluster_features
        self.random_state = params.pop('random_state')

        self.model_params = params

        self.his = {
            'acc': None,
            'loss': None,
            'val_acc': None,
            'val_loss': None
        }
        self.model = None
        self.cm = None
        self.score = 0

    def preprocess_data(self, X, y):
        y = y.astype('int')

        self.le = LabelEncoder()
        y = self.le.fit_transform(y)

        self.num_labels = max(y) + 1
        self.labels = [i for i in range(self.num_labels)]
        self.labels_origin = self.le.inverse_transform(self.labels).tolist()

        X_train, X_val, y_train, y_val = split_data(X, y, self.val_size,
                                                    self.shuffle,
                                                    self.random_state)

        if self.add_cluster_features:
            X_train, y_train = add_cluster_features(X_train, y_train)
            X_val, y_val = add_cluster_features(X_val, y_val)

        self.poly = PolynomialFeatures(self.feature_degree, include_bias=False)
        X_train = self.poly.fit_transform(X_train)
        if len(X_val) > 0:
            X_val = self.poly.transform(X_val)

        self.num_samples, self.num_features = X_train.shape

        self.sc = StandardScaler()
        if self.feature_scaling:
            X_train = self.sc.fit_transform(X_train)
            if len(X_val) > 0:
                X_val = self.sc.transform(X_val)

        return X_train, X_val, y_train, y_val

    def preprocess_X(self, X):

        if self.add_cluster_features:
            X = add_features(X)

        X = self.poly.transform(X)
        if self.feature_scaling:
            X = self.sc.transform(X)

        return X

    def evaluate_helper(self, X, y, radius, verbose):
        prob = self.model.predict_proba(X)

        try:
            loss = log_loss(y, prob)
        except ValueError:  # log_loss raises ValueError, e.g. when a class is missing
            loss = -1

        pred = self.model.predict(X=X)
        accuracy = np.count_nonzero(y == pred) / len(y)

        if verbose:
            print('Accuracy: ', accuracy * 100, ' Loss: ', loss)

        pred = smoothen(pred, radius)
        accuracy = np.count_nonzero(y == pred) / len(y)
        if verbose and radius > 0:
            print('Accuracy after smoothening with radius =', radius, ': ',
                  accuracy * 100)

        self.cm = confusion_matrix(y, pred, labels=self.labels)

        return accuracy, loss

    def evaluate_test(self, radius=0, verbose=False):

        if len(self.X_val) > 0:
            if verbose:
                print('\nEvaluating on test set...')
            X = self.X_val
            y = self.y_val

            accuracy, loss = self.evaluate_helper(X, y, radius, verbose)

            self.score = accuracy * 100

    def evaluate(self, X, y, radius=0, verbose=False):

        X, y = filt_data(X, y, self.including_classes)

        X = self.preprocess_X(X)
        y = self.le.transform(y)

        accuracy, loss = self.evaluate_helper(X, y, radius, verbose)

        return {'acc': accuracy, 'loss': loss}

    def judge(self, X, y, radius=0, verbose=False, threshold=0.8):

        X, y = filt_data(X, y, self.including_classes)

        X = self.preprocess_X(X)
        y = self.le.transform(y)

        prob = self.model.predict_proba(X)
        pred = prob.argmax(axis=1)
        confidence = np.max(prob, axis=1)

        pred = judge(pred, prob, threshold=threshold)

        pred = smoothen(pred, radius)

        cm = confusion_matrix(y, pred, labels=self.labels + [-9999])

        return cm

    def probability(self, X):

        X = self.preprocess_X(X)

        return self.model.predict_proba(X)

    def predict(self, X=None, radius=0, threshold=0.0):

        X = self.preprocess_X(X)

        prob = self.model.predict_proba(X)
        pred = prob.argmax(axis=1)
        confidence = np.max(prob, axis=1)

        pred = self.le.inverse_transform(pred)
        pred = judge(pred, confidence, threshold=threshold, null_type=None)
        pred = smoothen(pred, radius)

        return pred

    def get_result(self, X, radius=0, threshold=0.0):

        X = self.preprocess_X(X)

        prob = self.model.predict_proba(X)
        pred = prob.argmax(axis=1)
        confidence = np.max(prob, axis=1)

        pred = self.le.inverse_transform(pred)
        pred = judge(pred, confidence, threshold=threshold, null_type=None)
        pred = smoothen(pred, radius)

        cum_prob = cumulate_prob(prob)

        return dict(target=pred.tolist(), prob=cum_prob.tolist())

    def get_cm_data_url(self, id):
        if self.cm is None:
            return None

        draw_confusion_matrix(self.cm, self.labels_origin + [''])
        img = id + '.png'
        plt.savefig(img)
        data_url = image_to_data_url(img)
        os.remove(img)

        return data_url

    @classmethod
    def load(cls, file_name):
        return joblib.load(file_name)
Esempio n. 58
0
# 
# Now let us consider ways of transforming the features. There are many feature transformations that allow linear methods to produce more complex decision surfaces. The most basic one is the polynomial feature transformation: in addition to the features themselves, you also include every monomial of degree up to $p$ that can be built from them. For $p=2$ the transformation looks as follows:
# 
# $$ \phi(x_i) = [x_{i,1}^2, ..., x_{i,D}^2, x_{i,1}x_{i,2}, ..., x_{i,D-1}x_{i,D}, x_{i,1}, ..., x_{i,D}, 1] $$
# 
# For example, for $D=2$ this gives $\phi(x_i) = [x_{i,1}^2, x_{i,2}^2, x_{i,1}x_{i,2}, x_{i,1}, x_{i,2}, 1]$. Let us see how these features behave on data sampled from Gaussians:

# In[ ]:

from sklearn.preprocessing import PolynomialFeatures

"""Инициализируем класс, который выполняет преобразование"""
transform = PolynomialFeatures(2)
"""Обучаем преобразование на обучающей выборке, применяем его к тестовой"""
example_data_train_poly = transform.fit_transform(example_data_train)
example_data_test_poly = transform.transform(example_data_test)
"""Обращаем внимание на параметр fit_intercept=False"""
optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=False), param_grid, cv=cv, n_jobs=-1)
optimizer.fit(example_data_train_poly, example_labels_train)
Z = optimizer.predict(transform.transform(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel2)
plt.scatter(data_0[:,0], data_0[:,1], color='red')
plt.scatter(data_1[:,0], data_1[:,1], color='blue')
plt.title('With class weights')
plt.show()


# As you can see, this feature transformation already allows building non-linear decision surfaces, which can fit the data more finely and capture more complex dependencies. The number of features in the new model:

# In[ ]:
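
# The body of this cell is not included in the snippet; a minimal sketch of the
# count it refers to, assuming the `transform` and `example_data_train_poly`
# objects defined above. For D input features and degree p, PolynomialFeatures
# yields C(D + p, p) columns (bias included), i.e. 6 for D = 2, p = 2.
print(example_data_train_poly.shape[1])  # 6 for the two-feature example
print(transform.n_output_features_)      # same count, from the fitted transformer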
Esempio n. 59
0
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feat_train = sc.fit_transform(feat_train)
feat_test = sc.transform(feat_test)
'''
'''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(feat_train , labels_train)
labels_pred = regressor.predict(feat_test)
'''

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
poln_object = PolynomialFeatures(degree=2)
feat_train_poln = poln_object.fit_transform(feat_train)
feat_test_poln = poln_object.transform(feat_test)
feat_test_poln_abs = poln_object.transform(
    np.array([24, 23, 91, 1007, 5, 19, 33, 25, 55, 999, 3, 6]).reshape(1, -1))
lin_reg_2 = LinearRegression()
lin_reg_2.fit(feat_train_poln, labels_train)
lin_2_reg_pred = lin_reg_2.predict(feat_test_poln)
lin_2_reg_pred2 = lin_reg_2.predict(feat_test_poln_abs)
score = lin_reg_2.score(feat_test_poln, labels_test)


from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=lin_reg_2,
                             X=feat_train_poln,
                             y=labels_train,
                             cv=10)
print("mean accuracy is", accuracies.mean())
print(accuracies.std())