Example #1
def SGDRegressor_pred(X_train, X_test, y_train_normalized, y_train_mean, y_test):
    # The learning rate:
    # ---constant: eta = eta0 [assign to the initial one, eta0]
    # ---optimal: eta = 1.0/(t+t0)
    # ---invscaling: eta = eta0 / pow(t, power_t) [default]
    clf = SGDRegressor(alpha=0.0001, eta0=0.001, max_iter=150, fit_intercept=False, shuffle=True, verbose=0)
    clf = clf.fit(X_train, y_train_normalized)

    # Convert predictions back to the original scale (sklearn's standardization utilities could be used for both encoding and decoding)
    predictions_train = clf.predict(X_train) + y_train_mean
    predictions = clf.predict(X_test) + y_train_mean

    score_test = clf.score(X_test, y_test)

    return predictions, predictions_train, score_test
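The learning-rate schedules listed in the comments above can be compared side by side. A minimal standalone sketch; the synthetic data and parameter values are illustrative assumptions, not part of the original example:

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(200)

for schedule in ("constant", "optimal", "invscaling"):
    # eta0 is used by "constant" and "invscaling"; "optimal" ignores it
    reg = SGDRegressor(learning_rate=schedule, eta0=0.01, max_iter=1000, tol=1e-3)
    reg.fit(X, y)
    print(schedule, round(reg.score(X, y), 4))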
Example #2
    def predict(self, df):

        # get time frame
        time_frame = settings.time_frame
        
        # copy of data
        df_copy = df.copy()

        from sklearn.linear_model import SGDRegressor
        from sklearn.metrics import mean_absolute_error, mean_squared_error
    
        # partition data
        X_train, y_train, X_val, y_val, X_test, y_test = self.partition(df_copy)
        
        # normalize features
        X_train_std, X_val_std, X_test_std = self.feature_scale(X_train, X_val, X_test)
        
        # instance of the SGD linear regressor
        lr = SGDRegressor()
        
        # fit model
        lr.fit(X_train_std, y_train)
        
        # predictions on validation set
        predictions = lr.predict(X_val_std)
    
        # R^2 score
        score = lr.score(X_val_std, y_val)
        
        # RMSE on the validation set
        test_error = mean_squared_error(y_val, predictions) ** .5
        print(test_error)
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    #X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #        X, y, weight, test_size=0.2, random_state=0)
    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")
    #clf = LogisticRegression( max_iter=100)

    X_train = X
    y_train = y

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)

    X_test = scaler.transform(X_test)  # apply same transformation to test data

    clf.fit(X_train, y_train, sample_weight=weight)

    print(clf.score(X_train,y_train,weight))

    y_pred = clf.predict(X_test)
    
    import joblib
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl') 
    sio.savemat('predict_y_forward.mat', {'y':y_pred})
Example #4
def predict_age():
    mask = ~np.isnan(train["Age"])
    age_train = train[mask]
    age_test = train[~mask]

    features = []
    features.append(embarked_enc.transform(age_train["Embarked"]))
    features.append(sex_enc.transform(age_train["Sex"]))
    features.append(title_enc.transform(age_train["Title"]))
    features.append(pclass_enc.transform(age_train["Pclass"]))

    age_clf = SGDRegressor()
    X = np.hstack(features)
    y = np.array(train["Age"][mask]).T
    age_clf.fit(X, y)

    features = []
    features.append(embarked_enc.transform(age_test["Embarked"]))
    features.append(sex_enc.transform(age_test["Sex"]))
    features.append(title_enc.transform(age_test["Title"]))
    features.append(pclass_enc.transform(age_test["Pclass"]))

    ages = age_clf.predict(np.hstack(features))
    j = 0
    for i in range(len(train)):
        if ~mask[i]:
            train.loc[i, "Age"] = ages[j]
            j += 1
Example #5
def predictScores(trainFeatures,trainTargets,testFeatures,testItemIds,isRegression = False):
    logging.info("Feature preparation done, fitting model...")
    
    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l2", alpha=1e-4)
                            
        print("trainFeatures rows::"+str(trainFeatures.shape[0]))
        print("trainTargets rows::"+str(len(trainTargets)))
        clf.fit(trainFeatures,trainTargets)
        logging.info("Predicting...")    
        predicted_scores = clf.predict(testFeatures)
    else:         
        clf = SGDClassifier(loss="log_loss",
                            penalty="l2",
                            alpha=1e-4,
                            class_weight="balanced")
                            
        print("trainFeatures rows::"+str(trainFeatures.shape[0]))
        print("trainTargets rows::"+str(len(trainTargets)))
        clf.fit(trainFeatures,trainTargets)
        logging.info("Predicting...")    
        predicted_scores = clf.predict_proba(testFeatures).T[1]    
    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
Example #6
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_error'):
        """construct."""
        self.set_params(r, d, nbits, discrete,
                        normalization, inner_normalization,
                        penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_error'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
def gradiantDescent(trainData,testData,trainOuts,testOuts):
    clf = SGDRegressor(loss="squared_error")
    print(clf.fit(trainData,trainOuts))
    print(clf.coef_)
    predictions = clf.predict(testData)
    print(predictions)
    misses,error = sup.crunchTestResults(predictions,testOuts,.5)
    print(1-error)
Example #8
def sgd(X_train, y_train, X_validate, y_validate, X_test, cw, alpha, regression=False):
  #cw = 2.5
  if regression:
    clf = SGDRegressor(alpha=alpha)
  else:
    #clf = SGDClassifier(class_weight = {1:cw}, alpha=alpha)
    clf = SGDClassifier(class_weight = {1:cw}, alpha=alpha, loss='log_loss')
  print(clf)
  training_data_size = y_train.shape[0]
  n_iter = 3
  mb_size = 100
  iter_mb = minibatch_generator(training_data_size, mb_size = mb_size, n_iter = n_iter)
  total = 0
  n_total_batch = n_iter*training_data_size//mb_size
  t0 = time()
  recent_auc = []
  for n_batch, batch in enumerate(iter_mb):
    x, y = X_train[batch], y_train[batch]
    if regression:
      sw = np.ones(y.shape[0])
      sw[np.where(y==1)[0]] = cw
      clf.partial_fit(x, y, sample_weight=sw)
    else:
      clf.partial_fit(x, y, classes = [1, 0])
    total += y.shape[0]
    if (n_batch+1)%1000 == 0:
      if regression:
        #y_pred_validate_val = clf.decision_function(X_validate)
        y_pred_validate_val = clf.predict(X_validate)
      else:
        #y_pred_validate_val = clf.decision_function(X_validate)
        y_pred_validate_val = clf.predict_proba(X_validate)[:,1]
      print('auc:%.3f, %d samples in %ds (cw: %.2f)' % (AUC(y_validate, y_pred_validate_val), total, time()-t0, cw))
    if n_batch>n_total_batch-100:
      if regression:
        y_pred_validate_val = clf.predict(X_validate)
      else:
        y_pred_validate_val = clf.predict_proba(X_validate)[:,1]
      recent_auc.append(AUC(y_validate, y_pred_validate_val))
  latest_auc_avg = np.mean(recent_auc)
  print('cw=%.2f, avg auc of last %d batches: %.3f' % (cw, len(recent_auc), latest_auc_avg))
  if regression:
    return clf.predict(X_test)
  else:
    return clf.predict_proba(X_test)[:,1]
Example #9
def sgd_text_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    if use_cache:
        fhand = open(cache_name, 'rb')
        data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    np.random.seed(seed=123)
    model = SGDRegressor(eta0=1000, fit_intercept=True, l1_ratio=0.15,
                         learning_rate='invscaling', loss='huber', max_iter=200,
                         penalty='l1', power_t=.1, random_state=123,
                         shuffle=True, verbose=0, warm_start=False)
    model.fit(x_train, y_train)
    test_pred = model.predict(x_test)
    valid_pred = model.predict(x_valid)
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    fhand = open(cache_name, 'wb')
    pickle.dump(data_dict, fhand)
    fhand.close()
    return test_pred, valid_pred
Example #10
    def fit(self, U, Y):
        self.initialize()
        #learn X
        #X = self.getX(U,Y)
        X = self.getXBatched(U,Y,TSData.batchSize)
        print("Starting to train the model...")

        #clf = ElasticNet(alpha=5,l1_ratio=0.5,max_iter=50000)
        #for x1,y1 in izip(X,Y):
        #    clf.partial_fit(x1[np.newaxis,:], y1)
        # If not using a generator, materialize it into an array
        X = np.array(list(X))
        #X = np.array(X)
        print(X.shape)
        print(Y.shape)
        clf = SGDRegressor(max_iter=100)
        clf.fit(X,np.ravel(Y))
        print(metrics.mean_absolute_error(clf.predict(X),Y))
        print(TSData().getScore(Y, clf.predict(X)))
        self.clf = clf
def test_multi_target_regression_partial_fit():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0))

    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])

    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)
Example #12
def sgd_regressor(x, y, alpha):
    kf = KFold(n_splits=3)
    scores = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        scaler = StandardScaler()
        scaler.fit(X_train)
        x_train = scaler.transform(X_train)
        x_test = scaler.transform(X_test)
        clf = SGDRegressor(loss='squared_error', alpha=alpha)
        clf.fit(x_train, y_train)
        scores.append(mean_squared_error(clf.predict(x_test), y_test) ** 0.5)
    # print 'SGDRegressor'
    return np.mean(scores)
def predictLinearRegress(attributeList, starTargetList):

    print("\nLinear Regression")

    starTargetList = np.array(starTargetList)
    Xtrain, Xtest, Ytrain, Ytest = ml.splitData(attributeList, starTargetList, 0.75)

    lr = ml.linear.linearRegress(Xtrain, Ytrain)

    yHatInitial = lr.predict(Xtest)
    print("MSE test: ", mean_squared_error(yHatInitial, Ytest))
    print("RMSE test: ", math.sqrt(mean_squared_error(yHatInitial, Ytest)))


    incorrect = 0
    total = 0
    for i, value in enumerate(yHatInitial):
        if(abs(yHatInitial[i] - Ytest[i]) > 0.5):
            incorrect += 1
        total += 1

    ratioIncorrect = float(float(incorrect) / float(total))
    print("Ratio incorrect: " + str(ratioIncorrect))


    onesCol = np.ones((len(Xtrain),1))
    Xtrain = np.concatenate((onesCol, Xtrain), 1)
    onesCol = np.ones((len(Xtest),1))
    Xtest = np.concatenate((onesCol, Xtest), 1)
    m, n = np.shape(Xtrain)

    clf = SGDRegressor(loss="squared_loss")
    clf.fit(Xtrain, Ytrain)
    yHat = clf.predict(Xtest)

    print("MSE after GD: ", mean_squared_error(yHat, Ytest))
    print("RMSE after GD: ", math.sqrt(mean_squared_error(yHat, Ytest)))

    incorrect = 0
    total = 0
    for i, value in enumerate(yHat):
        if(abs(yHat[i] - Ytest[i]) > 0.5):
            incorrect += 1
        total += 1

    ratioIncorrect = float(float(incorrect) / float(total))
    print("Ratio incorrect: " + str(ratioIncorrect))
Example #14
def predictCrossValidatedScore(trainFeatures,trainTargets,trainItemIds,isRegression = False):
    logging.info("Feature preparation done, fitting model...")
                           
    randomPermutation = random.sample(range(trainFeatures.shape[0]), trainFeatures.shape[0])
    numPointsTrain = int(trainFeatures.shape[0]*0.5)
    
    dataTrainFeatures = trainFeatures[randomPermutation[:numPointsTrain]]
    dataValidationFeatures = trainFeatures[randomPermutation[numPointsTrain:]]
    
    dataTrainTargets = [trainTargets[i] for i in randomPermutation[:numPointsTrain]]
    dataValidationTargets = [trainTargets[i] for i in randomPermutation[numPointsTrain:]]

    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l1",
                           alpha=1e-4)
                            
        print("trainFeatures rows::"+str(trainFeatures.shape[0]))
        print("trainTargets rows::"+str(len(trainTargets)))
        clf.fit(dataTrainFeatures,dataTrainTargets)
        logging.info("Predicting...")    
        predicted_scores = clf.predict(dataValidationFeatures)   
    else:         
        clf = SGDClassifier(loss="log_loss",
                            penalty="l2",
                            alpha=1e-4,
                            class_weight="balanced")
                            
        print("trainFeatures rows::"+str(trainFeatures.shape[0]))
        print("trainTargets rows::"+str(len(trainTargets)))
        clf.fit(dataTrainFeatures,dataTrainTargets)
        logging.info("Predicting...")    
        predicted_scores = clf.predict_proba(dataValidationFeatures).T[1]
            
    error = mean_squared_error(dataValidationTargets,predicted_scores)
    print("Validation MSE: " + str(error))
Example #15
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


df = pd.read_csv("forestfires.txt", index_col=False, sep=" ")

X = df.iloc[:,0:-1].values
Y = df.iloc[:,-1].values
normalizer = Normalizer()

X = normalizer.fit_transform(X)
k_fold_cv = KFold(n_splits=10, shuffle=True)


sgdr = SGDRegressor()

for train_index, test_index in k_fold_cv.split(X):
	X_train, X_test = X[train_index], X[test_index]
	Y_train, Y_test = Y[train_index], Y[test_index]
	sgdr.fit(X_train, Y_train)
	pred = sgdr.predict(X_test)
	error = mean_squared_error(Y_test, pred)
	print(error)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#ols
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

#predicting the value
y_pred1 = lin_reg.predict(X_test)

#r2_score result
from sklearn.metrics import r2_score, mean_squared_error
r_squared1 = r2_score(y_test, y_pred1)
print("Coefficient of Determination using ols method = ", r_squared1)

#SGD
from sklearn.linear_model import SGDRegressor, LinearRegression
regressor = SGDRegressor(max_iter=10000, tol=1e-3)
regressor.fit(X_train, y_train)

#predicting the value
y_pred = regressor.predict(X_test)

#r2_score result
from sklearn.metrics import r2_score, mean_squared_error
r_squared = r2_score(y_test, y_pred)
print("Coefficient of Determination using sgd method = ", r_squared)
Example #17
def test_regressor_regularization(normalize, loss):
    rng = np.random.RandomState(0)
    transformer = RBFSampler(n_components=100, random_state=0, gamma=10)
    transformer.fit(X)
    X_trans = transformer.transform(X)
    if normalize:
        X_trans = StandardScaler().fit_transform(X_trans)
    y, coef = generate_target(X_trans, rng, -0.1, 0.1)
    y_train = y[:n_train]
    y_test = y[n_train:]
    # overfitting
    clf = AdamRegressor(transformer,
                        max_iter=300,
                        warm_start=True,
                        verbose=False,
                        fit_intercept=True,
                        loss=loss,
                        alpha=0.0001,
                        intercept_decay=1e-6,
                        random_state=0,
                        tol=0,
                        normalize=normalize)
    clf.fit(X_train[:100], y_train[:100])
    l2 = np.mean((y_train[:100] - clf.predict(X_train[:100]))**2)
    assert l2 < 0.01

    # underfitting
    clf_under = AdamRegressor(transformer,
                              max_iter=100,
                              warm_start=True,
                              verbose=False,
                              fit_intercept=True,
                              loss=loss,
                              alpha=100000,
                              random_state=0,
                              normalize=normalize)
    clf_under.fit(X_train, y_train)
    assert np.sum(clf_under.coef_**2) < np.sum(clf.coef_**2)

    # l1 regularization
    clf_l1 = AdamRegressor(transformer,
                           max_iter=100,
                           warm_start=True,
                           verbose=False,
                           fit_intercept=True,
                           loss=loss,
                           alpha=1000,
                           l1_ratio=0.9,
                           random_state=0,
                           normalize=normalize)
    clf_l1.fit(X_train, y_train)
    assert_almost_equal(np.sum(np.abs(clf_l1.coef_)), 0)

    # comparison with sgd
    sgd = SGDRegressor(alpha=0.01,
                       max_iter=100,
                       eta0=1,
                       learning_rate='constant',
                       fit_intercept=True,
                       random_state=0)
    sgd.fit(X_trans[:n_train], y_train)
    test_l2_sgd = np.mean((y_test - sgd.predict(X_trans[n_train:]))**2)
    clf = AdamRegressor(transformer,
                        max_iter=100,
                        warm_start=True,
                        verbose=False,
                        fit_intercept=True,
                        loss=loss,
                        alpha=0.01,
                        random_state=0,
                        normalize=normalize)

    clf.fit(X_train, y_train)
    test_l2 = np.mean((y_test - clf.predict(X_test))**2)
    assert test_l2 <= test_l2_sgd
    for j in jsondata.keys():
        if j in SubjDict:
            data[SubjDict[j]]=jsondata[j]
    train_X.append(data)
    train_Y.append(jsondata['Mathematics'])
f.close()

#fit/train data
train_X=numpy.array(train_X)
test_X=numpy.array(test_X)
rsmax=0
zmax=0
LR=SGDRegressor(epsilon=0.17,fit_intercept=False,penalty='elasticnet',
                loss='epsilon_insensitive',random_state=692,alpha=0.000001,
                max_iter=4).fit(train_X[:,1:],train_Y)
test_Y = LR.predict(test_X[:,1:])
for i in range(len(test_Y)):
    if test_Y[i]<2:
        test_Y[i]=2
    elif test_Y[i]>7:
        test_Y[i]=7


#### Predict the result    
if local:
    #import test output
    filename = "sample-test2.out.json"
    f = open(filename)
    z=0
    for x in test_Y:
        y=int(f.readline())
plt.plot(X, Y, 'b.')
plt.plot(X_test, Y_test, 'r-')

plt.show()

"""

# Example 2: ridge regression via direct gradient descent. penalty: which regularization
# penalty to apply; alpha: the regularization strength; max_iter: the number of iterations.
# SGDRegressor can stand in for Ridge Regression, Lasso, and Elastic Net alike.

ridge = SGDRegressor(penalty="elasticnet",
                     alpha=0.001,
                     l1_ratio=0.15,
                     max_iter=1000)
# Fit the training data
ridge.fit(X, Y)

# Print the intercept
print(ridge.intercept_)
# Print the weight vector W
print(ridge.coef_)

X_test = np.array([[0], [2]])
# Predict
Y_test = ridge.predict(X_test)

plt.plot(X, Y, 'b.')
plt.plot(X_test, Y_test, 'r-')

plt.show()
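The replacement claim in the comment above can be made concrete: switching the penalty argument makes a single SGDRegressor behave like each of the three regularized regressions. A minimal standalone sketch; the data generation and hyper-parameters are illustrative assumptions:

import numpy as np
from sklearn.linear_model import SGDRegressor

X = 2 * np.random.rand(100, 1)
Y = (4 + 3 * X + np.random.randn(100, 1)).ravel()

# penalty="l2" ~ Ridge, penalty="l1" ~ Lasso, penalty="elasticnet" ~ Elastic Net
for penalty in ("l2", "l1", "elasticnet"):
    reg = SGDRegressor(penalty=penalty, alpha=0.001, l1_ratio=0.15, max_iter=1000)
    reg.fit(X, Y)
    print(penalty, reg.intercept_, reg.coef_)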
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)

sgd_reg = SGDRegressor(max_iter=1,
                       tol=-np.inf,
                       penalty=None,
                       eta0=0.0005,
                       warm_start=True,
                       learning_rate="constant",
                       random_state=42)

n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))

best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])

plt.annotate(
    'Best model',
    xy=(best_epoch, best_val_rmse),
    xytext=(best_epoch, best_val_rmse + 1),
    ha="center",
    arrowprops=dict(facecolor='black', shrink=0.05),
    fontsize=16,
)
Example #21
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error

# A model linear in its parameters (or in its inputs) is a linear model, but linearity
# in the parameters does not imply a linear input-output relationship
# Loss function: squared error
# Optimization methods:
# Normal equation: w = (X^T X)^{-1} X^T y; solving it directly is slow, suited to small datasets
# Gradient descent

# Load the dataset
data = load_boston()
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(data.data,
                                                    data.target,
                                                    random_state=22)
# Feature engineering

trans = StandardScaler()
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)
# Linear regression via SGD
estimator = SGDRegressor(eta0=0.001, max_iter=10000)
print(x_train.shape, y_train.shape)
estimator.fit(x_train, y_train)

# Model evaluation: measured with mean squared error
print(estimator.coef_)  # regression coefficients
print(estimator.intercept_)  # bias (intercept)
y_pred = estimator.predict(x_test)
score = mean_squared_error(y_test, y_pred)
print(score)
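The normal equation quoted in the comments above, w = (X^T X)^(-1) X^T y, can be checked directly against an SGD fit. A minimal standalone sketch with assumed synthetic data:

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(22)
X = rng.rand(200, 2)
y = X @ np.array([3.0, -1.0]) + 0.5

Xb = np.hstack([np.ones((len(X), 1)), X])   # prepend a bias column
w = np.linalg.inv(Xb.T @ Xb) @ Xb.T @ y     # w = (X^T X)^-1 X^T y
print("normal equation:", w)

sgd = SGDRegressor(eta0=0.001, max_iter=10000)
sgd.fit(X, y)
print("gradient descent:", sgd.intercept_, sgd.coef_)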
Y_test = ss_Y.transform(Y_test)

y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1,1))
the 1-D array needs to be converted into a 2-D array
"""
"""Below, linear regression and stochastic-gradient regression are each used to estimate the model parameters and predict"""
lr = LinearRegression()
lr.fit(X_train, Y_train)
lr_Y_predict = lr.predict(X_test)

sgdr = SGDRegressor(max_iter=5)
# set the maximum number of iterations to 5

sgdr.fit(X_train, Y_train.ravel())
sgdr_Y_predict = sgdr.predict(X_test)
"""
我的Y是2D的形式(shapes, 1),要把二维的形式改成1D的形式(shapes, )
这就可以对fit输入的Y_train作y_train.ravel()这样的转换
即把sgdr.fit(Y_train, Y_train)代码修改为sgdr.fit(X_train, Y_train.ravel())
warning就会消失了
"""
"""接下来要评估准确性"""
print('the value of default measurement of LinearRegression is:', lr.score(
    X_test, Y_test))

print('the value of R_squared of LinearRegression is', r2_score(
    Y_test, lr_Y_predict))

print('the mean squared error of LinearRegression is', mean_squared_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
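A tiny standalone illustration (assumed arrays) of the shape conversions described in the docstrings above:

import numpy as np

y = np.array([1.0, 2.0, 3.0])
y2d = y.reshape(-1, 1)   # shape (3, 1): the 2-D column StandardScaler expects
y1d = y2d.ravel()        # shape (3,): the 1-D target SGDRegressor.fit expects
print(y2d.shape, y1d.shape)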
Example #23
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

np.random.seed(0)
x, y = make_regression(n_samples=100, n_features=1, noise=10)
plt.scatter(x, y)

model = SGDRegressor(max_iter = 1000, eta0 = 0.001)
model.fit(x, y)

print('Coeff R2 = ', model.score(x, y))
plt.scatter(x, y)
plt.plot(x, model.predict(x), c='pink', lw = 3)
Example #24
from sklearn.preprocessing import StandardScaler

plt.figure()  # create the figure
plt.title('single variable')  # figure title
plt.xlabel('x')  # x-axis label
plt.ylabel('y')  # y-axis label
plt.grid(True)  # whether to draw grid lines

X_scaler = StandardScaler()
y_scaler = StandardScaler()
X = [[50], [100], [150], [200], [250], [300], [50], [100], [150], [200], [250],
     [300], [50], [100], [150], [200], [250], [300], [50], [100], [150], [200],
     [250], [300], [50], [100], [150], [200], [250], [300], [50], [100], [150],
     [200], [250], [300], [50], [100], [150]]
y = [[150], [200], [250], [280], [310], [330], [150], [200], [250], [280],
     [310], [330], [150], [200], [250], [280], [310], [330], [150], [200],
     [250], [280], [310], [330], [150], [200], [250], [280], [310], [330],
     [150], [200], [250], [280], [310], [330], [150], [200], [250]]
X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
X_test = [[40], [400]]  # held out for a final visual check
X_test = X_scaler.transform(X_test)

plt.plot(X, y, 'k.')

model = SGDRegressor()
model.fit(X, y.ravel())
y_result = model.predict(X_test)
plt.plot(X_test, y_result, 'g-')

plt.show()  # display the figure
Example #25
for i in range(0,N):
    temp = input().split("\t")
    dates.append(datetime.strptime(temp[0], '%m/%d/%Y %H:%M:%S'))
    try:
        stock_list.append(float(temp[1]))
    except ValueError:
        stock_list.append(np.nan)
        missing_list.append(int(i))


df_stock = pd.DataFrame({"date":dates,"price":stock_list})
missing_dates = df_stock[df_stock['price'].isnull()]['date'].values
missing_dates = missing_dates.astype('datetime64[D]').astype(int)
missing_dates = [[x] for x in missing_dates]

df_stock = df_stock.dropna()
X= [[x] for x in df_stock['date'].values]
y=  df_stock['price'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, shuffle=False)


mdl = SGDRegressor(shuffle=False, max_iter=5000, learning_rate='optimal', random_state=0, n_iter_no_change=30)

mdl.fit(X_train, y_train)

y_pred = mdl.predict(missing_dates)

for pred in y_pred:
    print(pred)
Example #26
    data = comm.bcast(data, root=0)
    n_jobs = get_args().n_jobs

    chunk_len = int(len(data['grid']) / n_jobs)
    offset = chunk_len * rank

    data_chunk = numpy.asarray(data['grid'])[offset:(offset + chunk_len)]
    min_mse = 1000
    best_param = {}
    for params in data_chunk:
        model = SGDRegressor(alpha=params[1],
                             max_iter=params[0],
                             random_state=settings.seed)

        model.fit(data['X'], data['y'])
        preds = model.predict(data['X_t'])

        pred = preds.reshape(len(preds))
        real = data['y_t']

        mse = mean_squared_error(real, pred)

        if mse < min_mse:
            min_mse = mse
            best_param = params

    if rank == 0:
        result.append([min_mse, best_param])
        for i in range(1, n_jobs):
            p_res = comm.recv(source=i)
            result.append(p_res)
Example #27
# In[13]:

sgd = SGDRegressor()
sgd


# In[14]:

sgd = SGDRegressor().fit(topics.values, nums.favorite_count)


# Well, that was **much** faster...

# In[15]:

predicted_favorites = sgd.predict(topics.values)
predicted_favorites


# In[16]:

np.sum(predicted_favorites >= 1)


# Well that seems more "balanced" at least.  
# And it's nice to have a continuous score.  

# In[17]:

np.sum(nums.favorite_count.values >= 1)
# y_test = ss_y.transform(pd.column_or_1d(y_test))

# The new shape must be compatible with the original; if one dimension is -1, NumPy infers that dimension from the remaining ones.

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
# linear regression

from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(max_iter=5)  # at most 5 iterations
sgd.fit(x_train, y_train.ravel())  # flatten y down to 1-D
sgd_y_predict = sgd.predict(x_test)
# stochastic gradient descent

print("The value of default measurement of LinearRegression is ",
      lr.score(x_test, y_test))
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("The value of R-squared of LinearRegression is ",
      r2_score(y_test, lr_y_predict))
print(
    "The mean of squared error of LinearRegression is ",
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(lr_y_predict)))
print(
    "The mean of absolute error of LinearRegression is ",
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(lr_y_predict)))
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: ridge_regression.py

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
"""
ridge_reg = Ridge(alpha=1, solver='sag')
ridge_reg.fit(X, y)
print(ridge_reg.predict([[1.5]]))
print(ridge_reg.intercept_)
print(ridge_reg.coef_)
"""
sgd_reg = SGDRegressor(penalty='l2', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
print("W0=", sgd_reg.intercept_)
print("W1=", sgd_reg.coef_)

print('MAE:', mean_absolute_error(testing_labels,preds), '\n')
 
# PCA + Bayesian Ridge Regression
br = BayesianRidge()
br.fit(reduced_training_features, training_labels)
preds = br.predict(reduced_testing_features)
score = br.score(reduced_testing_features,testing_labels)
print('PCA + Bayesian Ridge Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds))
 
# Stochastic Gradient Descent Regression
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(training_features, training_labels)
preds = sgd.predict(testing_features)
score = sgd.score(testing_features,testing_labels)
print('SGD Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds), '\n')
 
# PCA + Stochastic Gradient Descent Regression
sgd = SGDRegressor()
sgd.fit(reduced_training_features, training_labels)
preds = sgd.predict(reduced_testing_features)
score = sgd.score(reduced_testing_features,testing_labels)
print('PCA + SGD Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds))
 
# Polynomial Regression
print(submission_features.head(3))
# # Prediction

# In[58]:
X_test = np.asarray(submission_features)[:, :-2]
y_true = np.asarray(submission_features)[:, -2]
clf = SGDRegressor()
y_pred = np.zeros(len(X_test))

local_df = features[features.DATE < df2.DATE[0] - DateOffset(days=3)]

X_train = np.asarray(local_df)[:, :-2]
y_train = np.asarray(local_df)[:, -2]

clf.partial_fit(X_train, y_train)
y_pred[0] = clf.predict([X_test[0]])[0]

for i in trange(1, len(X_test)):
    local_df = features[(features.DATE > df2.DATE[i - 1]) & (features.DATE < (df2.DATE[i] - DateOffset(days=3)))]
    X_train = np.asarray(local_df)[:, :-2]
    y_train = np.asarray(local_df)[:, -2]
    if X_train.shape[0] != 0:
        clf.partial_fit(X_train, y_train)
    y_pred[i] = clf.predict([X_test[i]])[0]

# In[59]:

y_pred_round = [int(math.ceil(x)) if x > 0 else 0 for x in y_pred]
# print(y_pred_round)

# # Output
Example #32
from sklearn.linear_model import SGDRegressor

X = [[0, 0], [2, 1], [5, 4]]
y = [0, 2, 2]

# penalty: the regularization term added to the loss; one of none, l1, l2, elasticnet
# the l2 penalty corresponds to ridge regression; the l1 penalty corresponds to Lasso
reg = SGDRegressor(penalty="l2", max_iter=10000)
reg.fit(X, y)

print(reg.predict([[4, 3]]))
print(reg.intercept_)
print(reg.coef_)
Example #33
train_prepared = full_pipeline.fit_transform(train_set)

test_prepared = full_pipeline.transform(test_set)  # transform only: re-fitting on the test set would leak

from copy import deepcopy
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty="l2",
                       learning_rate="constant", eta0=0.0005)

minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(train_prepared, train_label)  # continues where it left off
    y_val_predict = sgd_reg.predict(test_prepared)
    val_error = mean_squared_error(y_val_predict, test_label)
    print(val_error)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = deepcopy(sgd_reg)  # deepcopy keeps the fitted weights; clone would reset them

print(best_epoch)

test = pd.read_csv("test.csv",index_col="Id")

missed_cols = ['Utilities_NoSeWa', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other', 'Heating_Floor', 'Heating_OthW', 'Electrical_Mix', 'GarageQual_Ex', 'PoolQC_Fa', 'MiscFeature_TenC']
for col in missed_cols:
    test[col] = [0] * 1459
Example #34
    def stacklearning(self):
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew,
        "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without "
                                                                      "new":allwithoutNew,
                   "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descenta
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd.fit(X,y)
        valy =  (10 **(sgd.predict(exX))).tolist()

        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
class Model(object):
    def __init__(self, params):
        self.model_class = params['class']
        self.model = {}
        self.feature_constructor = None
        self.all_possible_decisions = []
        self.X = []
        self.y = []
        self.buffer = 0

    def initialize(self):
        if self.model_class == 'scikit':
            self.model = SGDRegressor(loss='squared_error', alpha=0.1, max_iter=10, shuffle=True, eta0=0.0001)
            self.feature_constructor = FeatureHasher(n_features=200, dtype=np.float64, input_type='dict')

        elif self.model_class == 'lookup':
            self.model = {}

    def clean_buffer(self):
        self.X = []
        self.y = []
        self.buffer = 0

    def return_design_matrix(self, all_decision_states, reward=None):
        if self.model_class == 'lookup_table':
            return all_decision_states, reward

        elif self.model_class == 'scikit':
            X, y = [], []
            for decision_state in all_decision_states:
                information, decision_taken = decision_state
                tr = {}
                tr['-'.join([str(information[1]), decision_taken])] = 1
                tr['-'.join([str(information[0]), decision_taken])] = 1
                tr['-'.join([str(information[0]), str(information[1]), decision_taken])] = 1

                X.append(tr)
                y.extend([reward])
            X = self.feature_constructor.transform(X).toarray()

            return X, y

    def fit(self, X, y):
        if self.model_class == 'scikit':
            # X, y = self.shuffle_data(X, y)
            self.model.partial_fit(X, y)
            print(self.model.score(X, y))

        if self.model_class == 'lookup_table':
            for decision_state in X:
                if decision_state not in self.model:
                    for d in self.all_possible_decisions:
                        self.model[(decision_state[0], d)] = DecisionState()

                self.model[decision_state].count += 1
                updated_value = self.model[decision_state].value_estimate + (1.0 / self.model[decision_state].count) * (
                y - self.model[decision_state].value_estimate)
                self.model[decision_state].value_estimate = updated_value

    def predict(self, X):
        if self.model_class == 'scikit':
            return self.model.predict(X)

        if self.model_class == 'lookup_table':
            if X not in self.model:
                for d in self.all_possible_decisions:
                    self.model[(X[0], d)] = DecisionState()
            return self.model[X].value_estimate

    @staticmethod
    def shuffle_data(a, b):
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return a[p], b[p]
Example #36
# Import LinearRegression from sklearn.linear_model.
from sklearn.linear_model import LinearRegression

# Initialize a LinearRegression with the default configuration
lr = LinearRegression()
# Estimate the parameters from the training data.
lr.fit(X_train, y_train)
# Predict on the test data.
lr_y_predict = lr.predict(X_test)

# Import SGDRegressor from sklearn.linear_model
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)
"""
使用三种回归评价机制以及两种调用R-squared评价模块的方法,对本节模型的回归性能做出评价。
"""
# Use LinearRegression's built-in evaluation module and print the result.
print('The value of default measurement of LinearRegression is',
      lr.score(X_test, y_test))

# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics for evaluating regression performance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Use the r2_score module and print the result
print('The value of R-squared of LinearRegression is',
      r2_score(y_test, lr_y_predict))
# Use the mean_squared_error module and print the result.
print(
Example #37
 
 # forget the oldest
 train_data_n_frames = train_data_n_frames[1:]
 render()
 n_frames_reward -= rewards[0]
 rewards = rewards[1:]
 
 # try predict all actions
 action = 0  # env.action_space.sample()
 curr_max_reward_for_action = 0.
 before_action_observation = observation
 for try_action in range(env.action_space.n):
     try_data = u.concatNewStep(train_data_n_frames,
                                observation,
                                try_action)
     predicted = rf.predict([u.to1D(try_data)])[0]
     print(predicted)
     if (predicted > curr_max_reward_for_action):
         action = try_action
         curr_max_reward_for_action = predicted
 #print(curr_max_reward_for_action)
 # apply the best predicted action
 observation, reward, done, info = env.step(action)
 print('\naction: (' + str(action) + ') reward: ' + str(reward))
 n_frames_reward += reward
 rewards = np.hstack((rewards, reward))
 
 train_data_n_frames = u.concatNewStep(train_data_n_frames,
                                       before_action_observation,
                                       action)
 if done:
Example #38
    print('Linear Regression Train RMSE:', train_rmse)
    train_r2 = r2_score(y_train, y_train_pred)
    print('Linear Regression Train R^2:', train_r2)

    # predictions on the test set
    y_test_pred = lin_reg.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)
    print('Linear Regression Test RMSE:', test_rmse)
    print('Linear Regression Test R^2:', test_r2)

    # LinearRegression vs SGDRegressor
    sgd_reg = SGDRegressor(random_state=1)  # create the model
    sgd_reg.fit(X_train, y_train)  # train the model
    y_train_pred = sgd_reg.predict(X_train)  # predictions on the training set
    # -> RMSE and R2-score on the training set
    y_test_pred = sgd_reg.predict(X_test)  # predictions on the test set
    # -> RMSE and R2-score on the test set

    # Using a scaler -> Pipeline
    pipe1 = Pipeline([('scaler', StandardScaler()),
                      ('regressor', LinearRegression())])
    pipe1.fit(X_train, y_train)  # fit
    y_train_pred = pipe1.predict(X_train)  # predictions on the training set
    # -> Train RMSE, R2-score
    y_test_pred = pipe1.predict(X_test)  # predictions on the test set

    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_test_scale = scaler.transform(X_test)
    Y_scaler = StandardScaler()

    X_train = X_scaler.fit_transform(X_train)
    Y_train = Y_scaler.fit_transform(Y_train)
    X_test = X_scaler.transform(X_test)
    Y_test = Y_scaler.transform(Y_test)

    print(X_train[0:5])


    print(len(X_train))
    print(Y_test)

    clf = SGDRegressor(loss="squared_error")
    scores = cross_val_score(clf,X_train,Y_train,cv=5)
    print(scores)
    print(np.mean(scores))

    clf.fit(X_train,Y_train)  # SGDRegressor has no fit_transform; fit is what is meant here

    pred  = clf.predict(X_test)

    print(clf.score(X_test,Y_test))




    # correlation(X_train,Y_train)
    # feature_selection(X_train,Y_train)
    scatter_plot(X_train,Y_train)
Example #40
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# File: elastic_net.py

import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))

sgd_reg = SGDRegressor(penalty='elasticnet', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
def main():
    inmatesMap = mapCreator()
    featureVector = createFeatureVector()

    allInmateCrimes = []
    allInmateCrimesYValues = []
    allInmates = []
    allInmateYValues = []
    for inmate in inmatesMap:
        if 'IncarcerationDate' not in inmatesMap[inmate]:
            continue
        if inmatesMap[inmate]['PrisonReleaseDate'] == '':
            inmatesMap[inmate]['PrisonReleaseDate'] = inmatesMap[inmate]['IncarcerationDate'] + datetime.timedelta(days=36525)
        if (inmatesMap[inmate]["PrisonReleaseDate"] - inmatesMap[inmate]["IncarcerationDate"]).days <= 0:
            continue

        currentPerson = extractFeatures(inmatesMap[inmate], featureVector)

        sentenceLength = (inmatesMap[inmate]["PrisonReleaseDate"] - inmatesMap[inmate]["IncarcerationDate"]).days
        if 'CURRENT_OFFENSES' in inmatesMap[inmate]:
            for offense in inmatesMap[inmate]['CURRENT_OFFENSES']:
                crimeDescription = "CURRENT_" + offense["adjudicationcharge_descr"]
                allInmateCrimes.append(crimeDescription)
                allInmateCrimesYValues.append(sentenceLength)

        allInmates.append(currentPerson)
        # allInmateYValues.append(inmatesMap[inmate]["prisonterm"])
        allInmateYValues.append(sentenceLength)

    X = allInmates[:10000]
    y = allInmateYValues[:10000]

    # print testSet
    # print testSetY

    sgd = SGDRegressor(loss='epsilon_insensitive', fit_intercept=True, learning_rate='constant', max_iter=4, penalty=None, epsilon=0)
    sgd.fit(X, y)
    sgdPredictedSetY = []
    sgdTrueSetY = []
    for i in range(10001, 20001):
        sgdTrueSetY.append(allInmateYValues[i])
        sgdPredictedSetY.append(sgd.predict([allInmates[i]])[0])
    percentErrors = []
    print("SGD Mean absolute test error:", util.mean_absolute_percentage_error(sgdTrueSetY, sgdPredictedSetY, percentErrors))
    print("SGD Standard deviation:", np.std(np.array(percentErrors)))


    svr = svm.SVR()
    svr.fit(X, y)
    svrPredictedSetY = []
    svrTrueSetY = []
    for i in range(10001, 20001):
        print("true value:", allInmateYValues[i])
        print("predicted value:", svr.predict([allInmates[i]]))
        print("Difference in true and predicted values:", allInmateYValues[i] - svr.predict([allInmates[i]]))
        svrTrueSetY.append(allInmateYValues[i])
        svrPredictedSetY.append(svr.predict([allInmates[i]]))
    percentErrors = []
    print("SVR Mean absolute test error:", util.mean_absolute_percentage_error(svrTrueSetY, svrPredictedSetY, percentErrors))
    print("SVR Standard deviation:", np.std(np.array(percentErrors)))


    # baselineTest(allInmateCrimes[:10000], allInmateCrimesYValues[:10000])

    nbAllInmates = nbTestTransform(allInmates)
    nbAllInmateYValues = nbRound(allInmateYValues)
    nbTestSet = [nbAllInmates[i] for i in range(0, 10000)]
    nbTestSetY = [nbAllInmateYValues[i] for i in range(0, 10000)]
    nb = BernoulliNB()
    nb.fit(np.array(nbTestSet), np.array(nbTestSetY))
    nbTrueSentenceLength = []
    nbTestSentenceLength = []
    for i in range(10001, 20001):
        nbTrueSentenceLength.append(nbAllInmateYValues[i] * 10.0)
        nbTestSentenceLength.append(nb.predict(nbAllInmates[i] * 10.0))
    # print nbTrueSentenceLength
    # print nbTestSentenceLength
    percentErrors = []
    print "Naive Bayes Mean absolute test error:", util.mean_absolute_percentage_error(nbTrueSentenceLength, nbTestSentenceLength, percentErrors)
    print "Naive Bayes standard deviation:", np.std(np.array(percentErrors))
Beispiel #42
0
import math
import random
from collections import deque

import gym
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures


class SGDPolyCartPoleSolver:
    def __init__(self, n_episodes=1000, max_env_steps=None, gamma=0.9, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.005, alpha=0.0001, batch_size=32, monitor=False):
        self.memory = deque(maxlen=100000)
        self.env = gym.make('CartPole-v0')

        if monitor:  # whether or not to display video
            self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)

        # hyper-parameter setting
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.feature_tuning = PolynomialFeatures(interaction_only=True)
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
        self.model = SGDRegressor(
                alpha=self.alpha,
                learning_rate='optimal',
                shuffle=False,
                warm_start=True)

        # Initialize feature tuning
        self.feature_tuning.fit(np.reshape(np.hstack((self.env.reset(), 0)), [1, 5]))
        # Prime the model with a single dummy update so predict() works before training
        self.model.partial_fit(self.preprocess_state(self.env.reset(), 0), [0])

    def remember(self, state, action, reward, next_state, done):
        """In this method, the (s, a, r, s') tuple is stored in the memory"""
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        """Chooses the next action according to the model trained and the policy"""

        qsa = np.asarray([self.model.predict(self.preprocess_state(state, a))
                          for a in range(self.env.action_space.n)]).flatten()

        return self.env.action_space.sample() if (np.random.random() <= epsilon) \
            else np.argmax(qsa)  # exploits the current knowledge if the random number > epsilon, otherwise explores

    def get_epsilon(self, episode):
        """Returns an epsilon that decays over time until a minimum epsilon value is reached; in this case the minimum
        value is returned"""
        return max(self.epsilon_min, self.epsilon * math.exp(-self.epsilon_decay * episode))

    def preprocess_state(self, state, action):
        """State and action are stacked horizontally and its features are combined as a polynomial to be passed as an
        input of the approximator"""

        # poly_state converts the horizontal stack into a combination of its parameters i.e.
        # [1, s_1, s_2, s_3, s_4, a_1, s_1 s_2, s_1 s_3, ...]
        poly_state = self.feature_tuning.transform(np.reshape(np.hstack((state, action)), [1, 5]))
        return poly_state

    def replay(self, batch_size):
        """Previously stored (s, a, r, s') tuples are replayed (that is, are added into the model). The size of the
        tuples added is determined by the batch_size parameter"""

        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))

        for state, action, reward, next_state, done in minibatch:
            qsa_s_prime = np.asarray([self.model.predict(self.preprocess_state(next_state, a))
                                      for a in range(self.env.action_space.n)])

            # Q-learning target: the raw reward for terminal transitions,
            # otherwise reward + gamma * max_a' Q(s', a')
            qsa_s = reward if done \
                else reward + self.gamma * np.max(qsa_s_prime)

            x_batch.append(self.preprocess_state(state, action)[0])
            y_batch.append(qsa_s)

        self.model.partial_fit(np.array(x_batch), np.array(y_batch))

    def run(self):
        """Main loop that controls the execution of the agent"""

        scores100 = deque(maxlen=100)
        scores = []
        for e in range(self.n_episodes):
            state = self.env.reset()
            done = False
            t = 0  # t counts the number of time-steps the pole has been kept up
            while not done:
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                self.remember(state, action, reward, next_state, done)

                self.replay(self.batch_size)

                state = next_state
                t += 1

            scores100.append(t)
            scores.append(t)
            mean_score = np.mean(scores100)
            if e % 100 == 0:
                print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        # noinspection PyUnboundLocalVariable
        print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))
        return scores
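A minimal usage sketch for the solver above (assuming the imports added at the top of this example):

if __name__ == '__main__':
    agent = SGDPolyCartPoleSolver(n_episodes=200)
    episode_scores = agent.run()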
Beispiel #43
0
plt.show()

#%% Ridge Regression closed form

from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

#%% Ridge Regression SGD

from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

#%% Lasso Regression

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

np.random.seed(42)
m = 20
X = 3 * np.random.rand(m, 1)
y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5
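The Lasso example is cut off at the snippet boundary; a minimal continuation consistent with the imports and data above might be:

lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])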
Beispiel #44
0
        pred = users_current.sum(axis=1)/users_current.getnnz(axis=1)
        print pred, 'pred last'
    else:
        movie_current = R_m[probe_movies[i:min(i+batch_size, probe_num)], :]
        m_mean = movie_current.sum(axis=1)/movie_current.getnnz(axis=1)
        print movie_current.getnnz(axis=1), 'sd'
        m_stdev = np.sqrt((np.sum(np.power((movie_current - m_mean*(movie_current!=0)),2),axis=1).flatten())/movie_current.getnnz(axis=1)).T
        pred = movie_current.sum(axis=1)/movie_current.getnnz(axis=1)+(u_mean-all_movie_avg)*(m_stdev/all_movie_stdev)
        print pred, 'pred last'
        '''

    m_mean = movie_avg[probe_movies[i:min(i + batch_size, probe_num)]]
    u_mean = np.array([u_mean]).T
    m_mean = np.array([m_mean]).T
    preding = np.concatenate((u_mean, m_mean), axis=1)
    pred = lin_model.predict(preding)
    #print u_mean, m_mean, pred
    given = probe_ratings[i:min(i + batch_size, probe_num)]

    pred = np.maximum(
        np.minimum(pred, (5 - 3 - 0.60951619727280626) * np.ones(len(pred))),
        (1 - 3 - 0.60951619727280626) * np.ones(len(pred)))

    probe_se += np.sum(np.power((given - pred), 2))

    #print math.sqrt(probe_se/(i+batch_size))

print 'trained avg1', math.sqrt(probe_se / probe_num)

# sgd fitter
lin_model = SGDRegressor()
Beispiel #45
0
lr = LinearRegression()
# Estimate the parameters from the training data.
lr.fit(X_train, y_train)
# Make regression predictions on the test data.
lr_y_predict = lr.predict(X_test)


# Import SGDRegressor from sklearn.linear_model.
from sklearn.linear_model import SGDRegressor

# Initialize the SGDRegressor linear model with its default configuration.
sgdr = SGDRegressor()
# Estimate the parameters from the training data.
sgdr.fit(X_train, y_train)
# Make regression predictions on the test data.
sgdr_y_predict = sgdr.predict(X_test)

# Use LinearRegression's built-in scoring method and print the result.
print('The value of default measurement of LinearRegression is', lr.score(X_test, y_test))

# Import r2_score, mean_squared_error, and mean_absolute_error from sklearn.metrics to evaluate regression performance.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Use the r2_score module and print the result.
print('The value of R-squared of LinearRegression is', r2_score(y_test, lr_y_predict))

# Use the mean_squared_error module and print the result.
print('The mean squared error of LinearRegression is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))

# Use the mean_absolute_error module and print the result.
print('The mean absolute error of LinearRegression is', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))
Beispiel #46
0
sgd_regressor = SGDRegressor(
    eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")

rmse_val_score = []
rmse_train_score = []
model_list = []

X_train, X_val, y_train, y_val = train_test_split(
    X_train_dataset,y_train_dataset, test_size=0.2, random_state=42)
sgd_regressor.fit(X_train,y_train)

# kf = KFold(n_splits=100, shuffle=True)
# for train_index, test_index in kf.split(X_train_dataset):

for i in range(300):

    y_pred = sgd_regressor.predict(X_train)
    y_true = y_train
    rmse_train_score.append(rmse(y_pred, y_true))

    y_pred = sgd_regressor.predict(X_val)
    y_true = y_val
    rmse_val_score.append(rmse(y_pred, y_true))
    model_list.append(sgd_regressor)

    # Save the learned parameters so the freshly constructed regressor below
    # can resume from them (a manual warm start via coef_init/intercept_init)
    coef = sgd_regressor.coef_.copy()
    intercept = sgd_regressor.intercept_.copy()

    sgd_regressor = SGDRegressor(
        eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")

    sgd_regressor.fit(X_train,y_train, coef_init=coef, intercept_init=intercept)
Beispiel #47
0
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

plt.figure() # create the figure
plt.title('single variable') # chart title
plt.xlabel('x') # x-axis label
plt.ylabel('y') # y-axis label
plt.grid(True) # whether to draw grid lines

X_scaler = StandardScaler()
y_scaler = StandardScaler()
#X = [[50],[100],[150],[200],[250],[300]]
#y = [[150],[200],[250],[280],[310],[330]]
X = [[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300],[50],[100],[150],[200],[250],[300]]
y = [[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330],[150],[200],[250],[280],[310],[330]]
X = X_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
X_test = [[40],[400]] # used for the final evaluation
X_test = X_scaler.transform(X_test)

plt.plot(X, y, 'k.')

model = SGDRegressor()
model.fit(X, y.ravel())
y_result = model.predict(X_test)
print(y_result)
plt.plot(X_test, y_result, 'g-')

plt.show() # display the chart
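Note that y_result above is in standardized units; to report predictions on the original scale, the scaler's inverse transform can be applied, e.g.:

y_result_orig = y_scaler.inverse_transform(y_result.reshape(-1, 1))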

Beispiel #48
0
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Method 1
ridge_reg = Ridge(alpha=1, solver="auto")  # alpha is the regularization strength; solver="auto" picks the solver automatically
ridge_reg.fit(X, y)
"""
In newer versions of sklearn all inputs must be 2-D arrays, even a single
row or column (e.g. one sample), so .reshape(1, -1) is used to convert.
"""
print(ridge_reg.predict(np.array(1).reshape(1, -1)))  # predicted value
print(ridge_reg.intercept_)  # intercept
print(ridge_reg.coef_)  # coefficients

# Method 2
# penalty is the regularization term, "l1" or "l2" (default "l2"); max_iter is the maximum number of iterations (default 1000)
sgd_reg = SGDRegressor(penalty="l2", max_iter=10000)
sgd_reg.fit(X, y.ravel())  # ravel flattens y from a column vector to a 1-D array

print(sgd_reg.predict(np.array(1).reshape(1, -1)))  # predicted value
print(sgd_reg.intercept_)  # intercept
print(sgd_reg.coef_)  # coefficients
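For reference, Ridge with a direct solver computes the closed-form solution of the regularized normal equations; a minimal numpy sketch of that closed form, reusing the X and y generated above (the intercept column is left unpenalized):

alpha = 1.0                              # same regularization strength as Ridge(alpha=1)
X_b = np.c_[np.ones((100, 1)), X]        # prepend a bias column
A = np.identity(X_b.shape[1])
A[0, 0] = 0                              # do not regularize the intercept
theta_ridge = np.linalg.solve(X_b.T.dot(X_b) + alpha * A, X_b.T.dot(y))
print(theta_ridge)                       # roughly [[4], [3]], shrunk slightly toward zero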
Beispiel #49
0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)
# ----------------------------------------------------
# Applying SGDRegressor Model
SGDRegressionModel = SGDRegressor(random_state=33)
SGDRegressionModel.fit(X_train, y_train) 

# Calculating Details
print('SGD Regression Train Score is : ',SGDRegressionModel.score(X_train, y_train))
print('SGD Regression Test Score is : ',SGDRegressionModel.score(X_test, y_test))
print('SGD Regression Coef is : ', SGDRegressionModel.coef_)
print('SGD Regression intercept is : ', SGDRegressionModel.intercept_)
print('-'*25)
# ----------------------------------------------------

# Calculating Prediction
y_pred = SGDRegressionModel.predict(X_test)
print('Pred Value for SGD Regression is : ', y_pred[:5])
print('True Value for SGD Regression is : ', y_test[:5])

# ----------------------------------------------------
# Calculating Mean Absolute Error
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')  # it can be raw_values
print('Mean Absolute Error Value is : ', MAEValue)
# ----------------------------------------------------
# Calculating Mean Squared Error
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')  # it can be raw_values
print('Mean Squared Error Value is : ', MSEValue)
# ----------------------------------------------------
# Calculating Median Absolute Error
MdSEValue = median_absolute_error(y_test, y_pred)
print('Median Absolute Error Value is : ', MdSEValue)
Beispiel #50
0
scaled = scaler.transform(X)

scaled_df = pd.DataFrame(scaled, columns= X.columns)

scaled_df[:5]

X = scaled_df
X[:5]

from sklearn.linear_model import LinearRegression
model = LinearRegression()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

model.fit(X_train, y_train)

pred = model.predict(X_test)

from sklearn import metrics
metrics.r2_score(y_test, pred)

from sklearn.linear_model import SGDRegressor
mod = SGDRegressor()

mod.fit(X_train, y_train)

predict = mod.predict(X_test)

metrics.r2_score(y_test, predict)

Beispiel #51
0
pprint.pprint("Testing with test data...")

test_data = list()
test_diff = list()
predict_diff = list()
for index in test_indices:
    tmp = data[index][1:5]
    my_tmp = list()
    for item in tmp:
        my_tmp.append(float(item))
    test_data.append(my_tmp)
    test_diff.append(float(data[index][4]) - float(data[index][1]))
# #
prediction_results_close = clf.predict(test_data)
prediction_results_open = clf2.predict(test_data)

for i in xrange(len(prediction_results_close)):
    p_diff = prediction_results_close[i] - prediction_results_open[i]
    predict_diff.append(p_diff)


print test_diff
print predict_diff

test_inc = 0
for diff in test_diff:
    if diff > 0:
        test_inc += 1
Beispiel #52
0
print('{:.4f}'.format(line.slope))
# 0.29

sms['line'] = line.predict(sms['topic4'])


##########################

from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(max_iter=20000)  # older sklearn versions used n_iter
sgd = sgd.fit(sms[['topic4']], sms['vader'])
print('{:.4f}'.format(sgd.coef_[0]))
# 0.2930

sms['sgd'] = sgd.predict(sms[['topic4']])


##########################

from nlpia.models import OneNeuronRegressor

nn = OneNeuronRegressor(alpha=100, n_iter=200)
nn = nn.fit(sms[['topic4']], sms['vader'])
print(nn.W[0, 1])
# 0.29386408

sms['neuron'] = nn.predict(sms[['topic4']])


##########################
Beispiel #53
0
from sklearn.pipeline import Pipeline

polynomial_regression = Pipeline([
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("lin_reg", LinearRegression()),
])
plot_learning_curves(polynomial_regression, X, y)


#Ridge Regression 
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

#Stochastic Gradient Descent 
sgd_reg = SGDRegressor(penalty="12")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

#Build classifier to detect Iris-Virginica type
from sklearn import datasets
iris = datasets.load_iris()
iris
list(iris.keys())
X = iris["data"][:, 3:] #iris["data"] returns matrix. [:,3:] all rows and column from 3 until end column
X
y = (iris["target"] == 2).astype(np.int) #astype : 
y

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
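A quick hedged check of the fitted classifier, using petal widths around the decision boundary:

log_reg.predict([[1.7], [1.5]])          # expected: array([1, 0]) for typical runs
log_reg.predict_proba([[1.7]])           # class probabilities for a 1.7 cm petal width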
Beispiel #54
0
    ytrain, yval = y_train[train_index], y_train[val_index]

    model = SGDRegressor(penalty='l2',
                         loss='squared_epsilon_insensitive',
                         max_iter=200,
                         tol=0.00001,
                         epsilon=0.0001,
                         learning_rate='invscaling',
                         fit_intercept=False,
                         alpha=1e-10,
                         l1_ratio=0.09,
                         shuffle=True,
                         verbose=0,
                         random_state=1001)
    model.fit(Xtrain, ytrain)
    sgd_scores_val = model.predict(Xval)
    sgd_RMSLE = np.sqrt(mean_squared_error(yval, sgd_scores_val))  # RMSLE, assuming y is already log-scaled
    print('\n Fold %02d SGD RMSLE: %.6f' % ((i + 1), sgd_RMSLE))
    sgd_y_pred = model.predict(X_test)

    model = Ridge(alpha=4.75,
                  solver='sag',
                  fit_intercept=False,
                  random_state=1001,
                  max_iter=1000)
    model.fit(Xtrain, ytrain)
    ridge_scores_val = model.predict(Xval)
    ridge_RMSLE = np.sqrt(mean_squared_error(yval, ridge_scores_val))
    print(' Fold %02d Ridge RMSLE: %.6f' % ((i + 1), ridge_RMSLE))
    ridge_y_pred = model.predict(X_test)
Beispiel #55
0
for l in trainf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    Y.append(int(sl[1]))
    xx=map(float,sl[2:])
    X.append(xx)

#print X
clf = SGDRegressor(loss='squared_epsilon_insensitive',n_iter=1000)
clf = clf.fit(X, Y)
#print clf
#scores = cross_val_score(clf, X, Y)
#print scores
#print clf.score(X,Y)
print clf.coef_
YY=clf.predict(X)
print roc_auc_score(Y,YY)

del X

pf=open('clf-linearReg.pkl','wb')  # binary mode for pickle
s = pickle.dump(clf, pf)
pf.close()

X=[]
L=[]
testf=open(sys.argv[2])
for l in testf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    xx=map(float,sl[1:])
Beispiel #56
0
                                      test_size=0.2,
                                      random_state=1)
    test_data = pd.DataFrame.from_records(X_t)
    test_data.to_csv('./Titles1/testing' + str(step) + '.csv',
                     header=False,
                     index=False)
    with open("./Titles1/testing_label" + str(step) + ".csv", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(y_t)

    lr.partial_fit(X, y)  # partial_fit does not overwrite the model's previous parameters
    print('Step {} is done!\n'.format(step))

#### Test the training dataset
## The last X and y
predictions = lr.predict(X)
print('predictions: ', predictions[0:10])
print('the true upvote: ', y[0:10])
mse = mean_squared_error(predictions, y)
print(mse)

##############################################################################
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(lr, open(filename, 'wb'))

###### load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.predict(X_test)

mse_list = []
Beispiel #57
0
"""
Ordinary Least Squares with SGD
===============================

A simple ordinary least squares example with stochastic
gradient descent: we draw the linear least-squares
solution for a random set of points in the plane.
"""
print __doc__

import numpy as np
import pylab as pl

from sklearn.linear_model import SGDRegressor

# this is our data set: just a straight
# line with some Gaussian noise
xmin, xmax = -5, 5
n_samples = 100
X = [[i] for i in np.linspace(xmin, xmax, n_samples)]
Y = 2 + 0.5 * np.linspace(xmin, xmax, n_samples) \
      + np.random.randn(n_samples, 1).ravel()

# fit the regressor
clf = SGDRegressor(alpha=0.1, n_iter=20)
clf.fit(X, Y)

# and plot the result
pl.scatter(X, Y, color='black')
pl.plot(X, clf.predict(X), color='blue', linewidth=3)
pl.show()
Beispiel #58
0
print(' ')
print(' SGD REGRESSOR:')
print(' ')

regressor_sgd = SGDRegressor(
    loss='squared_loss',
    alpha=0.1,
    penalty='l2',
    tol=1e-5,
    max_iter=100000,
)

regressor_sgd = regressor_sgd.fit(x_treino, y_treino)

y_resposta_treino = regressor_sgd.predict(x_treino)
y_resposta_teste = regressor_sgd.predict(x_teste)

print('  Metric   IN-sample (train)     OUT-of-sample ')
print(' -------   -----------------   --------------- ')

mse_in = mean_squared_error(y_treino, y_resposta_treino)
rmse_in = math.sqrt(mse_in)
r2_in = r2_score(y_treino, y_resposta_treino)

mse_out = mean_squared_error(y_teste, y_resposta_teste)
rmse_out = math.sqrt(mse_out)
r2_out = r2_score(y_teste, y_resposta_teste)

print(' %7s   %17.4f   %15.4f ' % ('mse', mse_in, mse_out))
print(' %7s   %17.4f   %15.4f ' % ('rmse', rmse_in, rmse_out))
Beispiel #59
0
    X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test,
                                                       y_train, y_test)

    #print(X_train)
    #X_values = np.delete(raw_data, raw_data.shape[1]-1, 1)
    #Y_values = raw_data[:,raw_data.shape[1]-1]

    weights_sk = np.full(
        (1, X_train.shape[1]), 1.0
    )  # do not reuse the weights, since sklearn works in place on the coef_init matrix!
    intercept_sk = 1
    weights_own = np.full((1, X_train.shape[1]), 1.0)
    intercept_own = 1

    sk_gdc = SGDRegressor()
    sk_gdc.fit(
        X_train, y_train, coef_init=weights_sk, intercept_init=intercept_sk
    )  # coef_init plays the same role as our weights, for comparison (sklearn does not pass w_0!)
    print("Weights and intercept found by sk:", weights_sk, intercept_sk)

    own_gdc = OwnGradientDescentRegressor(debug_output=True)
    print(weights_own, weights_own.shape)
    weights_own, intercept_own = own_gdc.fit(X_train,
                                             y_train,
                                             coef_init=weights_own,
                                             intercept_init=intercept_own)
    print("Weights and intercept found by own:", weights_own, intercept_own)

    print("Prediction with sk-learn:", sk_gdc.predict(X_test))
    print("Prediction with own-imp:", own_gdc.predict(X_test))
Beispiel #60
0
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))  # change 1: reshape y to 2-D for the scaler
y_test = ss_y.transform(y_test.reshape(-1, 1))  # change 2: same reshape for the test targets

# Use the linear regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y = lr.predict(X_test)

# Use the stochastic gradient descent (SGD) model
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor(max_iter=5)  # set max_iter explicitly; otherwise sklearn emits a warning
sgdr.fit(X_train, y_train.ravel())  # ravel flattens y to 1-D, which also avoids a warning
sgdr_y = sgdr.predict(X_test)

# LinearRegression's built-in scoring function
print('The value of default measurement of LinearRegression is ',
      lr.score(X_test, y_test))

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Use the r2_score module
print('The value of R_squared of LinearRegression is ', r2_score(y_test, lr_y))

# Use the mean_squared_error module
print(
    'The mean squared error of LinearRegression is ',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(lr_y)))