def ls_sklearn_sgd(x, y):
    # Parameter estimation by sklearn SGD
    from sklearn.linear_model import SGDRegressor

    sgd = SGDRegressor(fit_intercept=True)
    sgd.fit(x.reshape((-1, 1)), y)  # -1 infers the sample count; N was undefined here
    beta_0_sk = sgd.intercept_
    beta_1_sk = sgd.coef_[0]
    return beta_0_sk, beta_1_sk
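
A quick usage sketch on synthetic data (the sample size, noise level, and true coefficients here are illustrative):

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=200)
y = 2.0 + 0.5 * x + rng.normal(scale=0.1, size=200)
beta_0, beta_1 = ls_sklearn_sgd(x, y)
print(beta_0, beta_1)  # should land near (2.0, 0.5)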
Example #2
from sklearn.base import clone
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ParameterGrid

def initialize_regressors(sgd_grid):
    # Build one unfitted SGDRegressor per parameter combination in the grid
    base = SGDRegressor()
    regressions_to_fit = []
    for params in ParameterGrid(sgd_grid):
        base.set_params(**params)
        regressions_to_fit.append(clone(base))
    return regressions_to_fit
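
For context, a small grid one might pass in (the keys and values are illustrative; any valid SGDRegressor parameters work):

sgd_grid = {"penalty": ["l2", "l1"], "alpha": [1e-4, 1e-3]}
models = initialize_regressors(sgd_grid)
print(len(models))  # 4 unfitted regressors, one per grid point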
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
    from sklearn.preprocessing import StandardScaler

    #X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #        X, y, weight, test_size=0.2, random_state=0)
    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")

    X_train = X
    y_train = y

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)

    X_test = scaler.transform(X_test)  # apply same transformation to test data

    clf.fit(X_train, y_train, sample_weight=weight)

    print(clf.score(X_train, y_train, sample_weight=weight))

    y_pred = clf.predict(X_test)

    import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl')
    sio.savemat('predict_y_forward.mat', {'y': y_pred})
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    model = SGDRegressor()
    model.fit(features, values)
    intercept = model.intercept_  # these were previously undefined names
    params = model.coef_
    return intercept, params
Example #5
def predict_age():
    mask = ~np.isnan(train["Age"])
    age_train = train[mask]
    age_test = train[~mask]

    # Each encoder is assumed to return a 1-D array, so stack the results as columns
    features = []
    features.append(embarked_enc.transform(age_train["Embarked"]))
    features.append(sex_enc.transform(age_train["Sex"]))
    features.append(title_enc.transform(age_train["Title"]))
    features.append(pclass_enc.transform(age_train["Pclass"]))

    age_clf = SGDRegressor()
    X = np.column_stack(features)
    y = np.array(train["Age"][mask])
    age_clf.fit(X, y)

    features = []
    features.append(embarked_enc.transform(age_test["Embarked"]))
    features.append(sex_enc.transform(age_test["Sex"]))
    features.append(title_enc.transform(age_test["Title"]))
    features.append(pclass_enc.transform(age_test["Pclass"]))

    ages = age_clf.predict(np.column_stack(features))
    j = 0
    for i in range(len(train)):
        if ~mask[i]:
            train.loc[i, "Age"] = ages[j]
            j += 1
Example #6
def build_sgd_regressor(X_test, X_train_full, y_train_full):

    # print("Building SGD regressor...")

    # "modified_huber" is a classification-only loss; SGDRegressor supports
    # "huber", "epsilon_insensitive", "squared_epsilon_insensitive" and the squared loss
    sgd = SGDRegressor(loss="huber", penalty="elasticnet", max_iter=20000, alpha=0.1, epsilon=0.01)
    preds = sgd.fit(X_train_full, y_train_full).predict(X_test)
    return preds
    def __init__(self, name, actions):
        self.debug = True
        self.actions = actions
        self.initial_data = np.array([[3, 3.80, 0.40, 1.50], [3.0783744000000004, 1.7999999999999998, 0.04, 2.5], 
                                    [3.6603792000000004, 5.8500000000000005, 0.08, -1.5], [2.8383936000000003, 5.8500000000000005, 0.04, -3.0],
                                    [4.5679104000000015, 5.8500000000000005, 0.04, -2.0], [2.885976, 4.05, 0.04, 1.0]])

        self.initial_labels = np.array([[-1.0], [-0.2], [-0.5], [-.25], [0.0], [-2.1]])
        self.model_name = str(name)
        self.observation_samples = np.array([self.sample_states() for obv in range(10000)])
        self.featurizer = sklearn.pipeline.FeatureUnion([("rbf1", RBFSampler(gamma=5.0, n_components=100)),
                                                         ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                                                         ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                                                         ("rbf4", RBFSampler(gamma=0.5, n_components=100))])
        self.featurizer.fit(self.observation_samples)
        self.feature_scaler = StandardScaler()
        self.feature_scaler.fit(self.observation_samples)

        if self.model_name == 'svr':
            self.model = SVR(kernel='rbf')
            #self.model.fit(self.initial_data, self.initial_labels)

        elif self.model_name == 'extra_trees':
            self.model = ExtraTreesRegressor().fit(self.initial_data, self.initial_labels.ravel())  # flatten (n, 1) labels

        elif self.model_name == 'sgd':
            self.models = {}
            for a in range(len(self.actions)):
                model = SGDRegressor(learning_rate="constant")
                model.partial_fit([self.featurize_state([3.6603792000000004, 2.8500000000000005, 0.08])], [0])
                self.models[a] = model
            #self.model = SGDRegressor(penalty='none')
            #self.model.fit(self.feature_scaler.transform(self.initial_data), self.initial_labels)
        else:
            self.model = None
Example #8
    def predict(self, df):

        # get time frame
        time_frame = settings.time_frame
        
        # copy of data
        df_copy = df.copy()

        from sklearn.linear_model import SGDRegressor
        from sklearn.metrics import mean_squared_error

        # partition data
        X_train, y_train, X_val, y_val, X_test, y_test = self.partition(df_copy)

        # normalize features
        X_train_std, X_val_std, X_test_std = self.feature_scale(X_train, X_val, X_test)

        # SGD-based linear regressor
        lr = SGDRegressor()

        # fit model
        lr.fit(X_train_std, y_train)

        # predictions on validation set
        predictions = lr.predict(X_val_std)

        # R^2 score
        score = lr.score(X_val_std, y_val)

        # validation RMSE
        val_error = mean_squared_error(y_val, predictions) ** .5
        print(val_error)
Example #9
def linear_regression_GD(features, values):
    means, std_devs, features = normalized_features(features)
    model = SGDRegressor(eta0=0.001)
    results = model.fit(features, values)
    intercept = results.intercept_
    params = results.coef_
    return intercept, params
Example #10
class EdenRegressor(BaseEstimator, RegressorMixin):
    """Build a regressor for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 normalization=True, inner_normalization=True,
                 penalty='elasticnet', loss='squared_loss'):
        """construct."""
        self.set_params(r, d, nbits, discrete,
                        normalization, inner_normalization,
                        penalty, loss)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   normalization=True, inner_normalization=True,
                   penalty='elasticnet', loss='squared_loss'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.model = SGDRegressor(
            loss=loss, penalty=penalty,
            average=True, shuffle=True,
            max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        x = self.transform(graphs)
        self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        return self.predict(graphs)
Example #11
def linear_regression(features, values):

    sgd = SGDRegressor()
    sgd.fit(features, values)  # fit expects (X, y): features first, then targets
    intercept = sgd.intercept_
    params = sgd.coef_  # get_params() returns hyperparameters, not coefficients

    return intercept, params
    def __init__(self, env, feature_transformer, learning_rate):
        self.env = env
        self.models = []
        self.feature_transformer = feature_transformer
        for i in range(env.action_space.n):
            model = SGDRegressor(learning_rate=learning_rate)
            model.partial_fit(feature_transformer.transform([env.reset()]), [0])
            self.models.append(model)
def gradientDescent(trainData, testData, trainOuts, testOuts):
	clf = SGDRegressor(loss="squared_loss")
	print(clf.fit(trainData, trainOuts))
	print(clf.coef_)
	predictions = clf.predict(testData)
	print(predictions)
	misses, error = sup.crunchTestResults(predictions, testOuts, .5)
	print(1 - error)  # 1 - error rate
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    clf = SGDRegressor(max_iter=20)
    clf.fit(features, values)
    intercept,params = clf.intercept_,clf.coef_ 
    return intercept, params
def linear_regression(features, values):

    
    model = SGDRegressor(max_iter=1000)
    results = model.fit(features, values)
    intercept = results.intercept_
    params = results.coef_
    
    return intercept, params
Example #16
def slim_train(A, l1_reg=0.001, l2_reg=0.0001):
    """
    Computes the W matrix of SLIM.

    This link is useful for understanding the parameters used:

        http://web.stanford.edu/~hastie/glmnet_matlab/intro.html

    Basically, where glmnet minimizes:

        Sum( y_i - B_0 - x_i^T B )^2 + penalty

    we instantiate it, for each column j, as:

        Sum( a_j - 0 - A^T w_j )^2 + penalty

    Remember that we want to learn w_j. If this mathematical notation is
    unclear, I suggest you read section III of:

        http://glaros.dtc.umn.edu/gkhome/slim/overview
    """
    alpha = l1_reg + l2_reg
    l1_ratio = l1_reg / alpha

    model = SGDRegressor(
        penalty='elasticnet',
        fit_intercept=False,
        alpha=alpha,
        l1_ratio=l1_ratio,
    )

    # TODO: get dimensions in the right way
    m, n = A.shape

    # Fit each column of W separately
    W = lil_matrix((n, n))

    for j in range(n):
        if j % 50 == 0:
            print('-> %2.2f%%' % ((j / float(n)) * 100))

        aj = A[:, j].copy()
        # We need to remove the column j before training
        A[:, j] = 0

        model.fit(A, aj.toarray().ravel())
        # We need to reinstate the matrix
        A[:, j] = aj

        w = model.coef_

        # Removing negative values because it makes no sense in our approach
        w[w < 0] = 0

        for el in w.nonzero()[0]:
            W[(el, j)] = w[el]

    return W
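
A minimal usage sketch (the interaction matrix is illustrative; A should be a user-item matrix, and lil_matrix keeps the column zeroing above cheap):

import numpy as np
from scipy.sparse import lil_matrix

rng = np.random.default_rng(0)
A = lil_matrix((rng.random((50, 20)) < 0.2).astype(float))  # 50 users x 20 items
W = slim_train(A)
scores = A @ W  # higher score = item more likely to be relevant to the user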
def SGD_Regression(kf, data, label, k):
	val = 0
	for train, test in kf:
		X_train, X_test, y_train, y_test = data[train,:], data[test,:], label[train], label[test]
		log = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, max_iter=5)
		logit = log.fit(X_train, y_train)
		y_pred = logit.predict(X_test)
		val += metrics.mean_squared_error(y_test, y_pred)
	return val / k  # average MSE over the k folds (was hard-coded to 3)
	# print("SGD_Regression, Mean Squared Error", "{0:.4f}".format(val / k))
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    regressor = SGDRegressor()
    result = regressor.fit(features, values)
    intercept = result.intercept_
    params = result.coef_
   
    return intercept, params
Example #19
def perform_sgd_regression(features, values):

    clf = SGDRegressor(max_iter=20)
    clf = clf.fit(features, values)
    intercept = clf.intercept_
    params = clf.coef_
    print("intercept:")
    print(intercept)
    print("params:")
    for i in range(len(params)):
        print("%s: %f" % (features.columns.values[i], params[i]))
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    X = features
    y = values
    clf = SGDRegressor(max_iter=20)
    results = clf.fit(X, y)
    intercept = results.intercept_[0]
    params = results.coef_
    return intercept, params
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    
    model = SGDRegressor(max_iter=100)
    model.fit(features, values)
    print('SCORE: ', model.score(features, values))
    intercept = model.intercept_
    params = model.coef_
    
    return intercept, params
    def __init__(self):
        # We create a separate model for each action in the environment's
        # action space. Alternatively we could somehow encode the action
        # into the features, but this way it's easier to code up.
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            # We need to call partial_fit once to initialize the model
            # or we get a NotFittedError when trying to make a prediction
            # This is quite hacky.
            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)
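
The methods usually paired with this constructor look roughly like the sketch below (a hypothetical completion, assuming numpy is imported as np and featurize_state/env come from the surrounding class):

    def predict(self, s, a=None):
        # Q-value estimates for one state: one action if given, else all actions
        features = self.featurize_state(s)
        if a is not None:
            return self.models[a].predict([features])[0]
        return np.array([m.predict([features])[0] for m in self.models])

    def update(self, s, a, target):
        # One SGD step on the model belonging to the chosen action
        features = self.featurize_state(s)
        self.models[a].partial_fit([features], [target])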
def sgd(pd, pl, qd, ql):
    params = {'loss':['squared_loss', 'huber', 'epsilon_insensitive',
                     'squared_epsilon_insensitive'],
                'alpha':expon(scale=1),
                'epsilon':expon(scale=1),
                'l1_ratio':uniform(),
                'penalty':[ 'l2', 'l1', 'elasticnet']}
    clf = SGDRegressor()
    #clf = RandomizedSearchCV(clf, params, n_jobs=2, n_iter=10, verbose=10)
    print("Training SGD regressor")  # was misleadingly labeled "Linear SVM Randomly"
    clf.fit(pd, pl)
    print("Score: " + str(clf.score(qd, ql)))
    return clf
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """

    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    clf = SGDRegressor()
    model = clf.fit(features,values)
    params = model.coef_
    intercept = model.intercept_
    return intercept, params
Example #25
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    
    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    regressor = SGDRegressor(max_iter=20)
    regressor.fit(features, values)
    intercept = regressor.intercept_
    params = regressor.coef_
    return intercept, params
Example #26
def predictScores(trainFeatures,trainTargets,testFeatures,testItemIds,isRegression = False):
    logging.info("Feature preparation done, fitting model...")
    
    predicted_scores = []
    if isRegression:
        clf = SGDRegressor(penalty="l2",
                           alpha=1e-4)

        print("trainFeatures rows::" + str(trainFeatures.shape[0]))
        print("trainTargets rows::" + str(len(trainTargets)))
        clf.fit(trainFeatures, trainTargets)
        logging.info("Predicting...")
        predicted_scores = clf.predict(testFeatures)
    else:
        clf = SGDClassifier(loss="log",
                            penalty="l2",
                            alpha=1e-4,
                            class_weight="balanced")  # "auto" was renamed to "balanced"

        print("trainFeatures rows::"+str(trainFeatures.shape[0]))
        print("trainTargets rows::"+str(len(trainTargets)))
        clf.fit(trainFeatures,trainTargets)
        logging.info("Predicting...")    
        predicted_scores = clf.predict_proba(testFeatures).T[1]    
    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
def main(train_file, model_file):
    #train_x, train_y = load_sparse_trainingData_memory(train_file, 2 * get_len_vector())
    train_x, train_y = load_long_training_data_memory()
    #train_x, train_y = load_trainingData(train_file)
    logging('len of y: %d' % train_y.shape)
    logging(train_x.shape)
    #LR = LinearRegression(copy_X = False, normalize = True)
    LR = SGDRegressor(verbose=1)
    logging("training model...")
    starttime = datetime.now()
    LR.fit(train_x, train_y)
    logging("training model, eplased time:%s" % str(datetime.now() - starttime))
    logging("saving model")
    joblib.dump(LR, model_file)
def linear_regression(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """
    
    y = values
    X = features
    clf = SGDRegressor()
    clf.fit(X, y)
    
    intercept = clf.intercept_
    params = clf.coef_
    
    return intercept, params
def linear_regression_gradient_descent(features, values):
    """
    Perform linear regression given a data set with an arbitrary number of features.
    """

    ###########################
    ### YOUR CODE GOES HERE ###
    ###########################
    clf = SGDRegressor(max_iter=15)
    results = clf.fit(features, values)
    intercept= results.intercept_
    params = results.coef_

    return intercept, params
Example #30
def SGDRegressor_pred(X_train, X_test, y_train_normalized, y_train_mean, y_test):
    # The learning rate schedule:
    # ---constant:   eta = eta0 (stays at the initial value, eta0)
    # ---optimal:    eta = 1.0 / (alpha * (t + t0))
    # ---invscaling: eta = eta0 / pow(t, power_t) [default]
    clf = SGDRegressor(alpha=0.0001, eta0=0.001, max_iter=150, fit_intercept=False, shuffle=True, verbose=0)
    clf = clf.fit(X_train, y_train_normalized)

    # Convert back (sklearn's standardization utilities could handle both encoding and decoding)
    predictions_train = clf.predict(X_train) + y_train_mean
    predictions = clf.predict(X_test) + y_train_mean

    score_test = clf.score(X_test, y_test)

    return predictions, predictions_train, score_test
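
For context, the normalized targets this function expects can be produced like this (a sketch; X_train, X_test, y_train, and y_test are assumed to exist):

y_train_mean = y_train.mean()
y_train_normalized = y_train - y_train_mean  # zero-mean targets, matching fit_intercept=False
predictions, predictions_train, score_test = SGDRegressor_pred(
    X_train, X_test, y_train_normalized, y_train_mean, y_test)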
Example #31
    plt.show()


'''
Spot check Algorithms
'''
Rmodels = []
Rmodels.append(('LR',
                LinearRegression(copy_X=True,
                                 fit_intercept=True,
                                 n_jobs=None,
                                 normalize=True)))
Rmodels.append(('LGBM', LGBMRegressor()))
Rmodels.append(('XGB', XGBRegressor()))
#Rmodels.append(('CatBoost', CatBoostRegressor()))
Rmodels.append(('SGD', SGDRegressor()))
Rmodels.append(('KernelRidge', KernelRidge()))
Rmodels.append(('ElasticNet', ElasticNet()))
Rmodels.append(('BayesianRidge', BayesianRidge()))
Rmodels.append(('GradientBoosting', GradientBoostingRegressor()))
Rmodels.append(('SVR', SVR(gamma='auto')))  # kernel = linear, svr
Rmodels.append(('NN', MLPRegressor(solver='lbfgs')))  #neural network
Rmodels.append(('KNN', KNeighborsRegressor()))  # kneighbor
Rmodels.append(('RF', RandomForestRegressor(
    n_estimators=10)))  # Ensemble method - collection of many decision trees

# decision tree
# Gradient boosting
Cmodels = []
Cmodels.append(('Logistic', LogisticRegression()))
Cmodels.append(('SVM', SVC(gamma='auto')))
Example #32
def run_sgd_reg():
    sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
    return sgd_reg.fit(X_train_scaled, y_train)
# Model configuration

base = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=True, epsilon=0.001, loss="epsilon_insensitive", tol=0.1)),
    MaxAbsScaler(),
    StackingEstimator(estimator=RidgeCV()),
    Normalizer(norm="l2"),
    StackingEstimator(estimator=LinearSVR(C=0.5, dual=False, epsilon=0.1, loss="squared_epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.4, min_samples_leaf=2, min_samples_split=4, n_estimators=100)),
    MinMaxScaler(),    
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=SGDRegressor()),
    RobustScaler(),
    StackingEstimator(estimator=LinearSVR(C=15.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.75, tol=0.001)),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.1, max_depth=1, min_child_weight=6, n_estimators=100, nthread=1, objective="reg:squarederror", subsample=0.6500000000000001)),
    MinMaxScaler(),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.2, min_samples_leaf=2, min_samples_split=4, n_estimators=100)),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    MaxAbsScaler(),
    RandomForestRegressor(bootstrap=False, max_features=0.05, min_samples_leaf=1, min_samples_split=4, n_estimators=100)
)

parameters = {'base_estimator': base,
              'n_estimators': 100,          #default = 50
              'learning_rate': 0.3,         #default = 1.0
              'loss': 'linear',
Example #34
# SGD (stochastic gradient descent) in scikit-learn
# Note: SGDRegressor only fits linear models
from time import time
from sklearn.linear_model import SGDRegressor
from GradientDescent.own_SGD import X_train_standard, X_test_standard, y_train, y_test

# Initialize
sgd_reg = SGDRegressor()
# Train
start_time1 = time()
sgd_reg.fit(X_train_standard, y_train)
end_time1 = time()
r2 = sgd_reg.score(X_test_standard, y_test)  # capture the score instead of discarding it

if __name__ == "__main__":
    print(end_time1 - start_time1)
    print(r2)
Example #35
class SGDPolyCartPoleSolver:
    def __init__(self, n_episodes=1000, max_env_steps=None, gamma=0.9, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.005, alpha=0.0001, batch_size=32, monitor=False):
        self.memory = deque(maxlen=100000)
        self.env = gym.make('CartPole-v0')

        if monitor:  # whether or not to display video
            self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)

        # hyper-parameter setting
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.feature_tuning = PolynomialFeatures(interaction_only=True)
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
        self.model = SGDRegressor(
                alpha=self.alpha,
                learning_rate='optimal',
                shuffle=False,
                warm_start=True)

        # Initialize feature tuning
        self.feature_tuning.fit(np.reshape(np.hstack((self.env.reset(), 0)), [1, 5]))
        # Initialize model
        self.model.partial_fit(self.preprocess_state(self.env.reset(), 0), [0])

    def remember(self, state, action, reward, next_state, done):
        """In this method, the (s, a, r, s') tuple is stored in the memory"""
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        """Chooses the next action according to the model trained and the policy"""

        qsa = np.asarray([self.model.predict(self.preprocess_state(state, a))
                          for a in range(self.env.action_space.n)]).flatten()

        return self.env.action_space.sample() if (np.random.random() <= epsilon) \
            else np.argmax(qsa)  # exploits the current knowledge if the random number > epsilon, otherwise explores

    def get_epsilon(self, episode):
        """Returns an epsilon that decays over time until a minimum epsilon value is reached; in this case the minimum
        value is returned"""
        return max(self.epsilon_min, self.epsilon * math.exp(-self.epsilon_decay * episode))

    def preprocess_state(self, state, action):
        """State and action are stacked horizontally and its features are combined as a polynomial to be passed as an
        input of the approximator"""

        # poly_state converts the horizontal stack into a combination of its parameters i.e.
        # [1, s_1, s_2, s_3, s_4, a_1, s_1 s_2, s_1 s_3, ...]
        poly_state = self.feature_tuning.transform(np.reshape(np.hstack((state, action)), [1, 5]))
        return poly_state

    def replay(self, batch_size):
        """Previously stored (s, a, r, s') tuples are replayed (that is, are added into the model). The size of the
        tuples added is determined by the batch_size parameter"""

        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))

        for state, action, reward, next_state, done in minibatch:
            qsa_s_prime = np.asarray([self.model.predict(self.preprocess_state(next_state, a))
                                      for a in range(self.env.action_space.n)])

            qsa_s = reward if done \
                else reward + self.gamma * np.max(qsa_s_prime)

            x_batch.append(self.preprocess_state(state, action)[0])
            y_batch.append(qsa_s)

        self.model.partial_fit(np.array(x_batch), np.array(y_batch))

    def run(self):
        """Main loop that controls the execution of the agent"""

        scores100 = deque(maxlen=100)
        scores = []
        for e in range(self.n_episodes):
            state = self.env.reset()
            done = False
            t = 0  # t counts the number of time-steps the pole has been kept up
            while not done:
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                self.remember(state, action, reward, next_state, done)

                self.replay(self.batch_size)

                state = next_state
                t += 1

            scores100.append(t)
            scores.append(t)
            mean_score = np.mean(scores100)
            if e % 100 == 0:
                print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        # noinspection PyUnboundLocalVariable
        print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))
        return scores
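
A minimal driver for the solver (a sketch; the episode count is illustrative):

if __name__ == '__main__':
    agent = SGDPolyCartPoleSolver(n_episodes=500)
    scores = agent.run()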
Example #36
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=89),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(
        C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive",
        tol=0.0001)), FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(), FeatureAgglomeration(affinity="l2", linkage="complete"),
    StackingEstimator(estimator=RidgeCV()), RidgeCV())
Example #37
def run(config, train=True):
    """
    Trains our pipeline according to the configuration provided.
    """

    train_dir = config["train_dir"]
    val_dir = config["val_dir"]

    print("Reading in data...")

    train_data = NucleiDataset(train_dir).load()
    val_data = NucleiDataset(val_dir).load()

    x_train = train_data.images_
    y_train = train_data.masks_  # value in 0, 1, ..., n
    y_train_bin = (y_train > 0).astype(y_train.dtype)  # value in {0, 1}
    x_val = val_data.images_
    y_val = val_data.masks_
    y_val_bin = (y_val > 0).astype(y_val.dtype)

    print("Preprocessing data...")

    preprocesser = Preprocesser()

    x_train_pre = preprocesser.fit_transform(x_train)
    x_val_pre = preprocesser.transform(x_val)

    bilateral_d = 2
    bilateral_sigma_color = 75
    bilateral_sigma_space = 75
    equalize_hist_clip_limit = 0.03
    dialation_kernel = disk(radius=3)
    dialation_iters = 1

    print("Transforming data...")

    print(x_train_pre.min())
    print(x_train_pre.max())
    print(x_val_pre.min())
    print(x_val_pre.max())

    transformer = BasisTransformer(
        bilateral_d=bilateral_d,
        bilateral_sigma_color=bilateral_sigma_color,
        bilateral_sigma_space=bilateral_sigma_space,
        equalize_hist_clip_limit=equalize_hist_clip_limit,
        dialation_kernel=dialation_kernel,
        dialation_iters=dialation_iters)

    x_train_feat = transformer.fit_transform(x_train_pre)
    x_val_feat = transformer.transform(x_val_pre)  # fit on training data only

    sgd_params = {
        "regressor":
        SGDRegressor(penalty='elasticnet', l1_ratio=0.11, max_iter=5,
                     tol=None),
        "batch_size":
        1000,
        "num_iters":
        25000,
    }
    pa_params = {
        "regressor": PassiveAggressiveRegressor(C=.2, max_iter=5, tol=None),
        "batch_size": 1000,
        "num_iters": 25000,
    }

    sgd = MiniBatchRegressor(**sgd_params)
    pa = MiniBatchRegressor(**pa_params)

    print("Fitting linear models...")

    sgd.fit(x_train_feat, y_train_bin)
    pa.fit(x_train_feat, y_train_bin)

    x_train_extended = extend_features(x_train_feat, sgd, pa)
    x_val_extended = extend_features(x_val_feat, sgd, pa)

    #   Now we train UNet
    numchannels = x_train_extended.shape[-1]
    unet_config = {
        "numchannels": numchannels,
        "epochs": 50,
        "callbacks": [],
        "weights": none
    }
    unet = UNet(**unet_config)

    if unet_config["weights"] is not None:
        unet.load_weights(unet_config["weights"])

    print("Fitting UNet...")

    unet.fit(x_train_extended, y_train_bin, x_val_extended, y_val_bin)

    #   begin inference and print out test scores
    x_train_pred = unet.predict(x_train_extended)
    x_val_pred = unet.predict(x_val_extended)

    segmenter_params = {"nms_min_distance": 3, "watershed_line": True}
    segmenter = NucleiSegmenter(**segmenter_params)

    print("Segmenting nuclei...")

    train_components = segmenter.fit_transform(x_train_pred, x_train_pre)
    val_components = segmenter.fit_transform(x_val_pred, x_val_pre)
Example #38
def dict_method_reg():
    dict_method = {}
    # 1st part
    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5,
                                        weights='uniform',
                                        algorithm='auto',
                                        leaf_size=30,
                                        p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{
        'n_neighbors': [3, 4, 5, 6, 7],
        "weights": ['uniform', "distance"],
        "leaf_size": [10, 20, 30]
    }]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})
    """1SVR"""
    me1 = SVR(kernel='rbf',
              gamma='auto',
              degree=3,
              tol=1e-3,
              epsilon=0.1,
              shrinking=False,
              max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{
        'C': [10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01],
        'kernel': ker
    }]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})
    """5kernelridge"""
    me5 = kernel_ridge.KernelRidge(alpha=1,
                                   gamma="scale",
                                   degree=3,
                                   coef0=1,
                                   kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{
        'alpha': [100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01, 0.001, 1e-4, 1e-5],
        'kernel':
        ker
    }]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})
    """6GPR"""
    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                    alpha=1e-10,
                                                    optimizer='fmin_l_bfgs_b',
                                                    n_restarts_optimizer=0,
                                                    normalize_y=False,
                                                    copy_X_train=True,
                                                    random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-3, 1e-2], 'kernel': ker}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """6RFR"""
    me7 = RandomForestRegressor(n_estimators=100,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                bootstrap=True,
                                oob_score=False,
                                random_state=None,
                                verbose=0,
                                warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5, 6], 'min_samples_split': [2, 3]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})
    """7GBR"""
    me8 = GradientBoostingRegressor(loss='ls',
                                    learning_rate=0.1,
                                    n_estimators=100,
                                    subsample=1.0,
                                    criterion='friedman_mse',
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.,
                                    max_depth=3,
                                    min_impurity_decrease=0.,
                                    min_impurity_split=None,
                                    init=None,
                                    random_state=None,
                                    max_features=None,
                                    alpha=0.9,
                                    verbose=0,
                                    max_leaf_nodes=None,
                                    warm_start=False,
                                    presort='auto')
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 3],
        'learning_rate': [0.1, 0.05]
    }]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt = DecisionTreeRegressor(criterion="mse",
                               splitter="best",
                               max_features=None,
                               max_depth=5,
                               min_samples_split=4)
    me9 = AdaBoostRegressor(dt,
                            n_estimators=200,
                            learning_rate=0.05,
                            loss='linear',
                            random_state=0)
    cv9 = 5
    scoring9 = 'explained_variance'
    param_grid9 = [{'n_estimators': [100, 200], 'learning_rate': [0.1, 0.05]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})
    '''DTR'''
    me10 = DecisionTreeRegressor(criterion="mse",
                                 splitter="best",
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.,
                                 max_features=None,
                                 random_state=0,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.,
                                 min_impurity_split=None,
                                 presort=False)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{
        'max_depth': [2, 3, 4, 5, 6, 7, 8],
        "min_samples_split": [2, 3, 4],
        "min_samples_leaf": [1, 2]
    }]
    dict_method.update({'DTR-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0,
                      l1_ratio=0.7,
                      fit_intercept=True,
                      normalize=False,
                      precompute=False,
                      max_iter=1000,
                      copy_X=True,
                      tol=0.0001,
                      warm_start=False,
                      positive=False,
                      random_state=None)

    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.3, 0.5, 0.8]
    }]
    dict_method.update(
        {"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(
        alpha=1.0,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=0.001,
        warm_start=False,
        positive=False,
        random_state=None,
    )

    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [
        {
            'alpha': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100,
                1000
            ]
        },
    ]
    dict_method.update({"LASSO-set": [me12, cv12, scoring12, param_grid12]})
    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06,
                        alpha_2=1e-06,
                        compute_score=False,
                        copy_X=True,
                        fit_intercept=True,
                        lambda_1=1e-06,
                        lambda_2=1e-06,
                        n_iter=300,
                        normalize=False,
                        tol=0.01,
                        verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{
        'alpha_1': [1e-07, 1e-06, 1e-05],
        'alpha_2': [1e-07, 1e-06, 1e-05]
    }]
    dict_method.update({'BRR-set': [me2, cv2, scoring2, param_grid2]})
    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001,
                       average=False,
                       epsilon=0.1,
                       eta0=0.01,
                       fit_intercept=True,
                       l1_ratio=0.15,
                       learning_rate='invscaling',
                       loss='squared_loss',
                       max_iter=1000,
                       penalty='l2',
                       power_t=0.25,
                       random_state=0,
                       shuffle=True,
                       tol=0.01,
                       verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{
        'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
        'loss': ['squared_loss', "huber"],
        "penalty": ["l1", "l2"]
    }]
    dict_method.update({'SGDR-set': [me3, cv3, scoring3, param_grid3]})
    """PassiveAggressiveRegressor"""
    me14 = PassiveAggressiveRegressor(C=1.0,
                                      fit_intercept=True,
                                      max_iter=1000,
                                      tol=0.001,
                                      early_stopping=False,
                                      validation_fraction=0.1,
                                      n_iter_no_change=5,
                                      shuffle=True,
                                      verbose=0,
                                      loss='epsilon_insensitive',
                                      epsilon=0.1,
                                      random_state=None,
                                      warm_start=False,
                                      average=False)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{
        'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01]
    }]
    dict_method.update({'PAR-set': [me14, cv14, scoring14, param_grid14]})

    return dict_method
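
The sets returned here are meant to be unpacked into a grid search. A minimal driver (a sketch, assuming X and y exist and the global ker/kernel objects the function references are defined):

from sklearn.model_selection import GridSearchCV

for name, (estimator, cv, scoring, param_grid) in dict_method_reg().items():
    gs = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    gs.fit(X, y)
    print(name, gs.best_score_, gs.best_params_)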
Example #39
blend_train = []
blend_test = []

train_time = timer(None)
kf = KFold(n_splits=folds, shuffle=True, random_state=1001)  # random_state requires shuffle=True
for i, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    start_time = timer(None)
    Xtrain, Xval = X_train[train_index], X_train[val_index]
    ytrain, yval = y_train[train_index], y_train[val_index]

    model = SGDRegressor(penalty='l2',
                         loss='squared_epsilon_insensitive',
                         max_iter=200,
                         tol=0.00001,
                         epsilon=0.0001,
                         learning_rate='invscaling',
                         fit_intercept=False,
                         alpha=1e-10,
                         l1_ratio=0.09,
                         shuffle=True,
                         verbose=0,
                         random_state=1001)
    model.fit(Xtrain, ytrain)
    sgd_scores_val = model.predict(Xval)
    sgd_RMSLE = np.sqrt(mean_squared_error(yval, sgd_scores_val))
    print('\n Fold %02d SGD RMSLE: %.6f' % ((i + 1), sgd_RMSLE))
    sgd_y_pred = model.predict(X_test)

    model = Ridge(alpha=4.75,
                  solver='sag',
                  fit_intercept=False,
                  random_state=1001,
Example #40
from nlpia.models import LinearRegressor

line = LinearRegressor()
line = line.fit(sms['topic4'], sms['vader'])
print('{:.4f}'.format(line.slope))
# 0.29

sms['line'] = line.predict(sms['topic4'])


##########################

from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(max_iter=20000)
sgd = sgd.fit(sms[['topic4']], sms['vader'])
print('{:.4f}'.format(sgd.coef_[0]))
# 0.2930

sms['sgd'] = sgd.predict(sms[['topic4']])


##########################

from nlpia.models import OneNeuronRegressor

nn = OneNeuronRegressor(alpha=100, n_iter=200)
nn = nn.fit(sms[['topic4']], sms['vader'])
print(nn.W[0, 1])
# 0.29386408
import json
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, SGDRegressor, PassiveAggressiveClassifier, PassiveAggressiveRegressor, Perceptron
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler

penalty = ['l1', 'l2', 'elasticnet'][2]

sgd_classifiers = [
    ('SGD-h', SGDClassifier(loss='hinge', penalty=penalty)),
    ('SGD-l', SGDClassifier(loss='log', penalty=penalty)),
    ('SGD-mh', SGDClassifier(loss='modified_huber', penalty=penalty)),
    ('SGD-sh', SGDClassifier(loss='squared_hinge', penalty=penalty)),
    ('SGD-p', SGDClassifier(loss='perceptron', penalty=penalty))
]

sgd_regressors = [('SGD-sl', SGDRegressor(loss='squared_loss',
                                          penalty=penalty)),
                  ('SGD-h', SGDRegressor(loss='huber', penalty=penalty)),
                  ('SGD-ei',
                   SGDRegressor(loss='epsilon_insensitive', penalty=penalty)),
                  ('SGD-sei',
                   SGDRegressor(loss='squared_epsilon_insensitive',
                                penalty=penalty))]

selected_classifiers = [('SGD', SGDClassifier(loss='hinge', penalty='l1')),
                        ('Perceptron-I', Perceptron(penalty='l1')),
                        ('Perceptron-II', Perceptron(penalty='l2')),
                        ('Perceptron-III', Perceptron(penalty='elasticnet')),
                        ('PA-I', PassiveAggressiveClassifier(loss='hinge')),
                        ('PA-II',
                         PassiveAggressiveClassifier(loss='squared_hinge'))]
def learning_schedule(t):
    return t0 / (t + t1)


# Assumes X_b (X with a bias column), y, m = len(X_b) and n_epochs are defined earlier.
t0, t1 = 5, 50  # learning-schedule hyperparameters (illustrative values)
theta = np.random.randn(2, 1)

for epoch in range(n_epochs):
    for i in range(m):
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients

print("SGD: ", theta.flatten())

# =============================================================================
# SGD using sklearn's built-in SGDRegressor class
# =============================================================================

from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter=500,
                       penalty=None,
                       eta0=0.1,
                       early_stopping=True)
sgd_reg.fit(X, y.ravel())

print(sgd_reg.intercept_, sgd_reg.coef_)
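
As a sanity check, both loops above should approach the closed-form least-squares solution (a sketch, reusing X_b and y from the snippet):

theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
print("Normal equation:", theta_best.flatten())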
Example #43
def create(X, X_column_types, y, y_column_types, arm, **kwargs):
  
    method = kwargs.get("method", None)
    #method = kwargs.get("method", "Binary_operators")
    #method = kwargs.get("method", "Binning")
    #method = kwargs.pop("method", "Cluster")
    categorical_cols = [c for c, t in zip(X.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT, DATATYPE_CATEGORY_STRING]]
    numerical_cols = [c for c, t in zip(X.columns, X_column_types) if t == DATATYPE_NUMBER]
    categorical = X[categorical_cols]
    numerical = X[numerical_cols]
    # feature selection using Genetic Algorithm
    if method == "fs_GA":
        print("fs_GA")
        enc = OneHotEncoder()
        enc.fit(categorical)
        Data_cat=pd.DataFrame(enc.transform(categorical).toarray())
        X_data = pd.concat([numerical, Data_cat], axis=1)
        
        if y_column_types[0] == DATATYPE_NUMBER:
            y = y
        else:
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            le.fit(y)
            y = le.transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X_data, y, train_size=0.8, random_state=42)

        def get_fitness(individual):
            if y_column_types[0] == DATATYPE_NUMBER:
                rg = RandomForestRegressor(random_state=42)
            else:
                rg = RandomForestClassifier(random_state=42)
                
            columns = [column for (column, binary_value) in zip(X_train.columns, individual) if binary_value]
            training_set = X_train[columns]
            test_set = X_test[columns]
            rg.fit(training_set.values, y_train)
            preds = rg.predict(test_set.values)
            return 100 / np.sqrt(mean_squared_error(y_test, preds))

        individual = [1] * 100
        get_fitness(individual)

        def get_population_fitness(population):
            return sorted([(individual, get_fitness(individual)) for individual in population], key=lambda tup: tup[1], reverse=True)

        def crossover(individual_a, individual_b):
            crossing_point = random.randint(0, 99)
            offspring_a = individual_a[0:crossing_point] + individual_b[crossing_point:100]
            offspring_b = individual_b[0:crossing_point] + individual_a[crossing_point:100]
            return offspring_a, offspring_b

        def tournament(current_population):
            index = sorted(random.sample(range(0, 20), 5))
            tournament_members  = [current_population[i] for i in index]
            total_fitness = sum([individual[1] for individual in tournament_members])
            probabilities = [individual[1] / total_fitness for individual in tournament_members]
            index_a, index_b = np.random.choice(5, size=2, p=probabilities)
            return crossover(tournament_members[index_a][0], tournament_members[index_b][0])

        def mutation(individual):
            mutation_point = random.randint(0, 99)
            if(individual[mutation_point]):
                individual[mutation_point] = 0
            else:
                individual[mutation_point] = 1

        def build_next_generation(current_population, mutation_rate):
            next_generation = []
            next_generation.append(current_population[0][0]) # elitism
            next_generation.append(current_population[random.randint(1,19)][0]) # randomness
    
            for i in range(9): # tournaments
                offspring_a, offspring_b = tournament(current_population)
                next_generation.append(offspring_a)
                next_generation.append(offspring_b)
    
            for individual in next_generation: # mutation
                if(random.randint(1,mutation_rate) == 1):
                    mutation(individual)
            return next_generation
    

        def run_ga(current_population, num_of_generations, mutation_rate=1000):
            fittest_individuals = []
            for i in range(num_of_generations):
                current_population = get_population_fitness(current_population) # get pop fitness
                fittest_individuals.append(current_population[0]) # record fittest individual (for graphing and analysis)
                current_population = build_next_generation(current_population, mutation_rate) # make new population
            return fittest_individuals  # return after all generations, not inside the loop


        initial_population = [[random.randint(0, 1) for i in range(100)] for i in range(20)]
        high_mutation_fittest = run_ga(initial_population, 100, mutation_rate=5)



        high_mutation_fitness = [ind[1] for ind in high_mutation_fittest]
        for item in high_mutation_fittest[:-1]:
            if item[1] == max(high_mutation_fitness):
                top_performer = item
                break
        #print("Total features included: " + str(top_performer[0].count(1)))

        selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
        excluded_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if not binary_value]
        X = X[selected_features]
        categorical_cols = [c for c, t in zip(X.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT, DATATYPE_CATEGORY_STRING]]
        numerical_cols = [c for c, t in zip(X.columns, X_column_types) if t == DATATYPE_NUMBER]
        categorical = X[categorical_cols]
        numerical = X[numerical_cols]
        
    if method == "Binary_operators":
        print("binaryoperators")
        Binary_operator = pd.DataFrame()
        #Apply binary operators
        for i in range(numerical.shape[1]):
            a = numerical.iloc[:,i]
            for j in range(i+1, numerical.shape[1]):
                b = numerical.iloc[:,j]
                result = a*b
                Binary_operator = pd.concat([Binary_operator, result], axis=1)
        # apply addition operation 
        for i in range(numerical.shape[1]):
            a = numerical.iloc[:,i]
            for j in range(i+1, numerical.shape[1]):
                b = numerical.iloc[:,j]
                result = a+b
                Binary_operator = pd.concat([Binary_operator, result], axis=1)
        numerical = Binary_operator.copy()
        
    if method == "Binning":
        print("Binning")
        num_discretizer=pd.DataFrame()
        for i in range(numerical.shape[1]):
            
            d_f1 = pd.DataFrame(pd.cut(numerical.iloc[:,i], 6, labels=False, duplicates='drop'))
            d_f2 = pd.DataFrame(pd.cut(numerical.iloc[:,i], 4, labels=False, duplicates='drop'))
            
            num_discretizer = pd.concat([num_discretizer, d_f1, d_f2], axis=1)
            
    else:
        # discritize the numerical features
        num_discretizer=pd.DataFrame()
        for i in range(numerical.shape[1]):
            d_f = pd.DataFrame(pd.cut(numerical.iloc[:,i], 10, labels=False))
            d_f2 = pd.DataFrame(pd.cut(numerical.iloc[:,i], 5, labels=False))
            d_f3 = pd.DataFrame(pd.cut(numerical.iloc[:,i], 4, labels=False))
            d_f4 = pd.DataFrame(pd.cut(numerical.iloc[:,i], 3, labels=False))
    
            num_discretizer = pd.concat([num_discretizer, d_f, d_f2, d_f3, d_f4], axis=1)
    
    # function to rename the duplicate columns
    def df_column_uniquify(df):
        df_columns = df.columns
        new_columns = []
        for item in df_columns:
            counter = 0
            newitem = item
            while newitem in new_columns:
                counter += 1
                newitem = "{}_{}".format(item, counter)
            new_columns.append(newitem)
        df.columns = new_columns
        return df
    
    
    num_discretizer=df_column_uniquify(num_discretizer)
    
    
    # Categorical features encoding 
    cat_list=[]
    for i in range(categorical.shape[1]):
        if (len(categorical.iloc[:, i].unique()) >= 2):
            cat_list.append(categorical.keys()[i])
            
    categorical = categorical[cat_list]
    # We cluster the categorical features by most frequently repated and kmeans clustering
    if method == "cluster":
        print("clustering")
        categorical_strg = [c for c, t in zip(categorical.columns, X_column_types) if t in [DATATYPE_CATEGORY_STRING]]
        categorical_int = [c for c, t in zip(categorical.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT]]
        strg_cate = categorical[categorical_strg]
        int_cate = categorical[categorical_int]
        frequent = pd.DataFrame()
        frequent_col = []
        if len(strg_cate.columns)>=1:
            # clustering the string categorical variables by top 10 frequently repeated values
            for i in range(len(strg_cate.columns)):
                if (strg_cate[strg_cate.columns[i]].nunique() > 10):
                    frequent_col.append(strg_cate.columns[i])
                    n=10
                    top=strg_cate[strg_cate.columns[i]].value_counts()[:n].index.tolist()
                    for label in top:
                        strg_cate[label]= np.where(strg_cate[strg_cate.columns[i]]==label, 1, 0)
                        df1 = strg_cate[[strg_cate.columns[i]]+top]
                        frequent = pd.concat([frequent, df1.drop([strg_cate.columns[i]], axis=1)], axis=1)
            if len(frequent_col)>=1:
                strg_cate=strg_cate.drop(frequent_col, axis=1)
            else:
                strg_cate = strg_cate.copy()
                
        if len(int_cate.columns)>=1:
            # clustering the interger categorical variables by using kmeans clustering    
            int_col=[]    
            for i in range(len(int_cate.columns)):
                if (int_cate[int_cate.columns[i]].nunique() > 10):
                    x = int_cate.iloc[:,i:i+1]
                    kmeans = KMeans(10)
                    kmeans.fit(x)
                    cluster = kmeans.fit_predict(x)
                    int_cate[int_cate.columns[i] + '_cluster']=cluster
                    int_col.append(int_cate.columns[i])
            if len(int_col)>=1:
                int_cate = int_cate.drop(int_col, axis=1)
            else:
                int_cate = int_cate.copy()
            
        if (len(strg_cate.columns)>0 or len(int_cate.columns)>0):
            categorical = pd.concat([strg_cate, int_cate], axis=1)

        enc = OneHotEncoder()
        enc.fit(categorical)
        Data_cat=pd.DataFrame(enc.transform(categorical).toarray())
        
        if len(frequent_col)>=1:
            original_feats = pd.concat([numerical, Data_cat, frequent], axis=1)
            num_discret = pd.concat([numerical, Data_cat, frequent, num_discretizer], axis=1)
            
        else:
            original_feats = pd.concat([numerical, Data_cat], axis=1)
            num_discret = pd.concat([numerical, Data_cat, num_discretizer], axis=1)
            
            
            
        
    else:
        # One hot encode the categorical_features      
        #Data_cat = pd.get_dummies(categorical)
        enc = OneHotEncoder()
        enc.fit(categorical)
        Data_cat=pd.DataFrame(enc.transform(categorical).toarray())
    
        original_feats = pd.concat([numerical, Data_cat], axis=1)
    
        num_discret = pd.concat([numerical, Data_cat, num_discretizer], axis=1)

   
    
    # Select the best half of the discretized features by mini-batch gradient descent
    mini_batches = []
    batch_size = 32
    data = np.hstack((num_discret, (y.values).reshape(-1, 1)))
    np.random.shuffle(data)
    n_minibatches = data.shape[0] // batch_size

    for i in range(n_minibatches):
        mini_batch = data[i * batch_size:(i + 1) * batch_size, :]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
    if data.shape[0] % batch_size != 0:
        # pick up the final partial batch once, instead of duplicating it
        mini_batch = data[n_minibatches * batch_size:]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
        
    if (y_column_types[0] == DATATYPE_NUMBER):
        model = SGDRegressor(loss="squared_loss", penalty="l1")
        for X_mini, Y_mini in mini_batches:
            model.partial_fit(X_mini, Y_mini.ravel())
        coefs = model.coef_
    else:
        model = SGDClassifier(loss="log", penalty="l1")
        for X_mini, Y_mini in mini_batches:
            model.partial_fit(X_mini, Y_mini.ravel(), classes=np.unique(y))
        coefs = model.coef_[0]
            
    # offset of the discretized block inside num_discret (numerical + one-hot [+ frequent])
    num = num_discret.shape[1] - num_discretizer.shape[1]
    # rank the discretized features by absolute coefficient size and keep the best half
    h = np.argsort(np.abs(coefs[num:]))[::-1][:int(num_discretizer.shape[1] / 2)]
    best_half_sorted = [x + num for x in h]
    best_dicretized = num_discret.iloc[:, best_half_sorted]
    
    
    total = pd.concat([categorical, best_dicretized], axis=1)
    
    # one-hot encode the integer discretized features
    enc = OneHotEncoder()
    enc.fit(best_dicretized)
    dicretized_ohe=pd.DataFrame(enc.transform(best_dicretized).toarray())
   
    
    

    
    # combine cat_ohe and disretized_ohe  features 
    Data = pd.concat([Data_cat, dicretized_ohe], axis=1)
    
    # Rename the features which has duplicates 
    Data = df_column_uniquify(Data)
    

    second_order = pd.DataFrame()
    final_feats = pd.DataFrame()
    cnt = 0
    cnt_1 = 0
    for i in range(len(total.columns)-1):
        a= Data.iloc[:,[o for o in range(cnt, cnt+len(total.iloc[:, i].unique()))]]
        cnt = cnt+len(total.iloc[:, i].unique())
        cnt_1 = cnt
        for j in range(i+1, len(total.columns)):
            b= Data.iloc[:,[p for p in range(cnt_1, cnt_1+len(total.iloc[:, j].unique()))]]
            cnt_1 = cnt_1+len(total.iloc[:, j].unique())
            first = pd.DataFrame()
            for k in range(a.shape[0]):
                c = a.iloc[[k]].values
                d = b.iloc[[k]].values
        
                result = np.outer(c, d).ravel()
                first = pd.concat([first, pd.Series(result).to_frame().T], ignore_index=True)
        
   
            second_order = pd.concat([second_order, first], axis =1)
    second_order = df_column_uniquify(second_order)
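
    # A vectorized alternative to the row-wise outer-product loop above
    # (a sketch; a and b are the one-hot column blocks of a feature pair):
    #   pairwise = np.einsum('ij,ik->ijk', a.values, b.values).reshape(len(a), -1)
    #   first = pd.DataFrame(pairwise)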
    
    firstorder_select = pd.concat([original_feats, second_order], axis=1)
        
    # select the second-order features using mini-batch SGD

    mini_batches = [] 
    batch_size=32
    data = np.hstack((firstorder_select, (y.values).reshape(-1, 1)))
    np.random.shuffle(data)
    n_minibatches = data.shape[0] // batch_size

    for i in range(n_minibatches):
        mini_batch = data[i * batch_size:(i + 1) * batch_size, :]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
    if data.shape[0] % batch_size != 0:
        mini_batch = data[n_minibatches * batch_size:]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
        #create_mini_batches(gen_feats, y, 32)
        
    if (y_column_types[0] == DATATYPE_NUMBER):
        model = SGDRegressor(loss="squared_loss", penalty="l1")
        for X_mini, Y_mini in mini_batches:
            model.partial_fit(X_mini, Y_mini.ravel())
        coefs = model.coef_
    else:
        model = SGDClassifier(loss="log", penalty="l1")
        for X_mini, Y_mini in mini_batches:
            model.partial_fit(X_mini, Y_mini.ravel(), classes=np.unique(y))
        coefs = model.coef_[0]
            
    num1 = len(original_feats.columns)
    # keep the 10 second-order features with the largest absolute coefficients
    g = np.argsort(np.abs(coefs[num1:]))[::-1][:10]
    selected_sorted = [x + num1 for x in g]
    selected_best = firstorder_select.iloc[:, selected_sorted]
    selected = selected_best.copy()
    new_col_types = X_column_types+[DATATYPE_CATEGORY_INT]*len(selected_best.columns)
    total_feats = pd.concat([original_feats, selected_best], axis=1)
    final_feats = pd.concat([X, selected_best], axis=1)            
    
    # Higher-order feature generation
   
    if len(categorical.columns)>2:
        for _ in range(len(categorical.columns) - 2):  # repeat once per extra categorical feature
           cnt = 0
           Higher_order = pd.DataFrame()
           for i in range(len(total.columns)):     
               a= Data.iloc[:,[o for o in range(cnt, cnt+len(total.iloc[:, i].unique()))]]
               cnt = cnt+len(total.iloc[:, i].unique())
               for j in range(selected_best.shape[1]):
                   b= selected_best.iloc[:,j]
                   second = pd.DataFrame()
                   for k in range(a.shape[0]):
                       c = a.iloc[[k]].values
                       d = b.iloc[[k]].values
                       result_1 = np.outer(c, d).ravel()
                       second = pd.concat([second, pd.Series(result_1).to_frame().T], ignore_index=True)
                       
                   Higher_order = pd.concat([Higher_order, second], axis =1)
                   
           Higher_order=df_column_uniquify(Higher_order)
           
           High_order_sel = pd.concat([total_feats, Higher_order], axis=1)
           mini_batches = [] 
           batch_size=32
           data = np.hstack((High_order_sel, (y.values).reshape(-1, 1)))
           np.random.shuffle(data)
           n_minibatches = data.shape[0] // batch_size

           for i in range(n_minibatches):
               mini_batch = data[i * batch_size:(i + 1) * batch_size, :]
               X_mini = mini_batch[:, :-1]
               Y_mini = mini_batch[:, -1].reshape((-1, 1))
               mini_batches.append((X_mini, Y_mini))
           if data.shape[0] % batch_size != 0:
               mini_batch = data[n_minibatches * batch_size:]
               X_mini = mini_batch[:, :-1]
               Y_mini = mini_batch[:, -1].reshape((-1, 1))
               mini_batches.append((X_mini, Y_mini))
               #create_mini_batches(gen_feats, y, 32)
           if (y_column_types[0] == DATATYPE_NUMBER):
               model = SGDRegressor(loss="squared_loss", penalty="l1")
               for X_mini, Y_mini in mini_batches:
                   model.partial_fit(X_mini, Y_mini.ravel())
               coefs = model.coef_
           else:
               model = SGDClassifier(loss="log", penalty="l1")
               for X_mini, Y_mini in mini_batches:
                   model.partial_fit(X_mini, Y_mini.ravel(), classes=np.unique(y))
               coefs = model.coef_[0]
               
               
           num2 = len(total_feats.columns)
           # keep the 5 best higher-order features by absolute coefficient size
           sort = np.argsort(np.abs(coefs[num2:]))[::-1][:5]
           selected_sorted = [x + num2 for x in sort]
           selected_best = High_order_sel.iloc[:, selected_sorted]
           selected = pd.concat([selected, selected_best], axis=1)
           total_feats = pd.concat([total_feats, selected_best], axis=1)
           final_feats = pd.concat([final_feats, selected_best], axis=1)
           
    
        transformed_X = final_feats
        new_col_types = X_column_types+[DATATYPE_CATEGORY_INT]*len(selected.columns)
        
           
    else:
        
        transformed_X = final_feats
        
    return None, transformed_X, new_col_types
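
# The same batching logic appears three times above; a minimal sketch of the
# helper the commented-out create_mini_batches calls refer to (assuming a 2-D
# NumPy feature matrix X and a 1-D target y) could replace it:
def create_mini_batches(X, y, batch_size=32):
    data = np.hstack((X, y.reshape(-1, 1)))
    np.random.shuffle(data)
    n_minibatches = data.shape[0] // batch_size
    mini_batches = [(data[i * batch_size:(i + 1) * batch_size, :-1],
                     data[i * batch_size:(i + 1) * batch_size, -1])
                    for i in range(n_minibatches)]
    if data.shape[0] % batch_size != 0:
        rest = data[n_minibatches * batch_size:]
        mini_batches.append((rest[:, :-1], rest[:, -1]))
    return mini_batches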
        
Beispiel #44
0
def create_sgd(arguments):
    return SGDRegressor(**arguments)
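
# Usage (illustrative; the argument dict is hypothetical):
# create_sgd({'penalty': 'l2', 'max_iter': 1000})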


    preds = []

    from sklearn.linear_model import (LinearRegression, SGDRegressor)
    import lightgbm as lgb

    sgdr = SGDRegressor(
        penalty='l2',
        random_state=SEED)

    lgb_params = {
                  'feature_fraction': 0.75,
                  'metric': 'rmse',
                  'nthread': 1,
                  'min_data_in_leaf': 2**7,
                  'bagging_fraction': 0.75,
Beispiel #46
0
def train_SGDRegressor(X_train, y_train):
    model = SGDRegressor()
    model.fit(X_train, y_train)
    return model
user_avg = np.zeros(nusers)
user_std = np.zeros(nusers)
for i in range(0, nusers, batch_size):
    users_current = R_u[i:min(i + batch_size, nusers), :]
    batch_avg = ((users_current.sum(axis=1).flatten()) /
                 users_current.getnnz(axis=1))
    user_avg[i:min(i + batch_size, nusers)] = batch_avg
    # std via E[X^2] - (E[X])^2 (abs guards tiny negative values from rounding)
    user_std[i:min(i + batch_size, nusers)] = np.sqrt(
        abs(
            users_current.power(2).sum(axis=1).flatten() /
            users_current.getnnz(axis=1) - np.square(batch_avg)))
print('done avging', movie_avg, user_avg)

# sgd fitter
lin_model = SGDRegressor()

rat_num = len(probe_ratings)

for i in range(0, rat_num, batch_size):

    given = probe_ratings[i:min(i + batch_size, rat_num)]
    u_mean = user_avg[probe_users[i:min(i + batch_size, rat_num)]]
    m_mean = movie_avg[probe_movies[i:min(i + batch_size, rat_num)]]
    u_mean = np.array([u_mean]).T
    m_mean = np.array([m_mean]).T

    preding = np.concatenate((u_mean, m_mean), axis=1)

    lin_model.partial_fit(preding, given)
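
# Once partially fitted, the model can score further user/movie pairs with the
# same two-column layout (a sketch, reusing the names above):
# preds = lin_model.predict(np.concatenate((u_mean, m_mean), axis=1))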
Beispiel #48
0
model_2 = BayesianRidge()
model_3 = LassoLars(alpha=0.3, fit_intercept=False, normalize=True)
model_4 = DecisionTreeRegressor(min_samples_leaf=20)
model_5 = RandomForestRegressor(n_estimators=30)
model_6 = KNeighborsRegressor(n_neighbors=30)
model_7 = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=10)
# max_iter must be an int; np.ceil returns a float
model_8 = SGDRegressor(max_iter=int(np.ceil(10**6 / max([len(x) for x in train_X_drug_catg]))), tol=1e-3, loss="squared_loss", penalty=None)

model_list = [model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8]
model_name = ['Linear', 'Bayesian', 'LassoLars', 'DecisionTree', 'RandomForest', 'KNeighbors', 'XGB', 'SGD']
model_dict = dict(zip(model_list, model_name))

for model_x in model_list:
  
  for which_drg_catg in range(len(drug_catgrs)):
    for z in store_list_drug_catg[which_drg_catg]:
Beispiel #49
0
def train(features, labels):
    lr = SGDRegressor()
    lr.fit(features, labels)
    weights = lr.coef_
    return weights
Beispiel #50
0
    # print(appIndex, score, timerEnd-timerStart)

    return prediction


# initialize vars -------------------------------------------------------------

# define scaler for normalization
scaler = StandardScaler()

# define PCA for dimensionality reduction
pca = IncrementalPCA()

# define regression model
regressor = SGDRegressor(random_state=42, max_iter=1, tol=1e-3)
regressors, predictions = None, None

# load training data one batch at a time
trainFiles = glob(DATA_SUBFOLDER + '/trainBatch*')
timerStartLocal = time.time()
for fileNum, file in enumerate(trainFiles):

    # load batch
    timerStartLocal2 = time.time()
    data, labels = load(file)

    #init scaler and pca
    scaler.partial_fit(data)
    # pca.partial_fit(data)
Beispiel #51
0
def train_model(features, targets, *params):
    model = SGDRegressor()
    model.fit(features, targets)
    return model
                                                                           ('tfidf_transformer', TfidfTransformer())])
                                                 )


## prediction
def fillna_and_log(x):
    x = x.copy()
    x[np.isnan(x)] = 0
    return np.log(1 + x)


from sklearn.linear_model import SGDRegressor, SGDClassifier

title_word_1_2gram_dtm_0_predict_log_price = PredictionFeature('title_word_1_2gram_dtm_0_predict_log_price',
                                                               title_word_1_2gram_dtm_0,
                                                               SGDRegressor(penalty='elasticnet', l1_ratio=0.7,
                                                                            random_state=132, n_iter=20), price,
                                                               y_transformer=fillna_and_log, keep_true=True,
                                                               true_name='log_price')
title_word_1_2gram_dtm_0_predict_is_test = PredictionFeature('title_word_1_2gram_dtm_0_predict_is_test',
                                                             title_word_1_2gram_dtm_0, \
                                                             SGDClassifier(penalty='elasticnet', l1_ratio=0.7,
                                                                           random_state=132, n_iter=20), is_test,
                                                             y_transformer=None, keep_true=False,
                                                             only_predict=True, predict_binary_probability=True,
                                                             true_name='')

title_description_dtm_0_predict_log_price = PredictionFeature('title_description_dtm_0_predict_log_price',
                                                              title_description_dtm_0,
                                                              SGDRegressor(penalty='elasticnet', l1_ratio=0.7,
                                                                           random_state=133, n_iter=30), price,
                                                              y_transformer=fillna_and_log)
Beispiel #53
0
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)

plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.title('SGD')
plt.axis([0, 2, 0, 15])
plt.show()

#%% SGD Scikit-Learn

params = []
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1)
sgd_reg.fit(X, y)
params.append(sgd_reg.intercept_)
params.append(sgd_reg.coef_)

#%% Mini-Batch Gradient Descent

theta_path_mgd = []
n_iterations = 50
minibatch_size = 20
theta = np.random.randn(2, 1)  # random initialization

t0, t1 = 10, 1000


def learning_schedule(t):
Beispiel #54
0
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test,
                                                       y_train, y_test)

    #print(X_train)
    #X_values = np.delete(raw_data, raw_data.shape[1]-1, 1)
    #Y_values = raw_data[:,raw_data.shape[1]-1]

    weights_sk = np.full(
        (1, X_train.shape[1]), 1.0
    )  #do not reuse the weights since sk-learn does inplace work with the coef_init matrix!
    intercept_sk = 1
    weights_own = np.full((1, X_train.shape[1]), 1.0)
    intercept_own = 1

    sk_gdc = SGDRegressor()
    sk_gdc.fit(
        X_train, y_train, coef_init=weights_sk, intercept_init=intercept_sk
    )  # coef_init matches our weights for comparison (sklearn does not pass w_0!)
    print("Weights and intercept found by sk:", weights_sk, intercept_sk)

    own_gdc = OwnGradientDescentRegressor(debug_output=True)
    print(weights_own, weights_own.shape)
    weights_own, intercept_own = own_gdc.fit(X_train,
                                             y_train,
                                             coef_init=weights_own,
                                             intercept_init=intercept_own)
    print("Weights and intercept found by own:", weights_own, intercept_own)

    print("Prediction with sk-learn:", sk_gdc.predict(X_test))
    print("Prediction with own-imp:", own_gdc.predict(X_test))
Beispiel #55
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Filename: elastic_net.py

import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'yasaka'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))  # predict expects a 2-D array

sgd_reg = SGDRegressor(penalty='elasticnet', n_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
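
# For reference, sklearn's ElasticNet minimizes
#   (1 / (2 * n_samples)) * ||y - Xw||^2
#       + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2,
# so l1_ratio interpolates between pure ridge (0) and pure lasso (1).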
eta0 = 0.01
max_iter = 100






from sklearn.model_selection import train_test_split


X_train_dataset, X_test, y_train_dataset, y_test = train_test_split(
            X_scaled,y, test_size=0.2, random_state=42)

sgd_regressor = SGDRegressor(
    eta0=eta0, max_iter=max_iter, warm_start=True, learning_rate="constant")

rmse_val_score = []
rmse_train_score = []
model_list = []

X_train, X_val, y_train, y_val = train_test_split(
    X_train_dataset,y_train_dataset, test_size=0.2, random_state=42)
sgd_regressor.fit(X_train,y_train)
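
# warm_start=True makes each later fit() continue from the current
# coefficients instead of re-initialising them, which the manual training
# loop below appears to rely on.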

# kf = KFold(n_splits=100, shuffle=True)
# for train_index, test_index in kf.split(X_train_dataset):

for i in range(300):

    y_pred = sgd_regressor.predict(X_train)
Beispiel #57
0
    train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
    train_rmse = np.sqrt(train_mse)
    print('Linear Regression Train RMSE:', train_rmse)
    train_r2 = r2_score(y_train, y_train_pred)
    print('Linear Regression Train R^2:', train_r2)

    # predictions on the test set
    y_test_pred = lin_reg.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)
    print('Linear Regression Test RMSE:', test_rmse)
    print('Linear Regression Test R^2:', test_r2)

    # LinearRegression vs SGDRegressor
    sgd_reg = SGDRegressor(random_state=1)  # create the model
    sgd_reg.fit(X_train, y_train)  # train the model
    y_train_pred = sgd_reg.predict(X_train)  # predictions on the training set
    # -> training-set RMSE, R2-score
    y_test_pred = sgd_reg.predict(X_test)  # predictions on the test set
    # -> test-set RMSE, R2-score

    # use a Scaler -> Pipeline
    pipe1 = Pipeline([('scaler', StandardScaler()),
                      ('regressor', LinearRegression())])
    pipe1.fit(X_train, y_train)  # fit
    y_train_pred = pipe1.predict(X_train)  # train predictions
    # -> train RMSE, R2-score
    y_test_pred = pipe1.predict(X_test)  # test predictions

    scaler = StandardScaler()
Beispiel #58
0
# We delete five uninformative columns (original indices 1-5)
x = np.delete(x, [1, 2, 3, 4, 5], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    shuffle=True)

# Data preprocessing
preproc = [("missing", SimpleImputer()), ("var", VarianceThreshold(0.01)),
           ("poly", PolynomialFeatures(1)), ("standardize", StandardScaler())]

pipe = Pipeline(preproc + [('model', SGDRegressor())])

params_grid = [{
    "model": [SGDRegressor(max_iter=500)],
    "model__loss": [
        'huber', 'squared_loss', 'epsilon_insensitive',
        'squared_epsilon_insensitive'
    ],
    "model__penalty": ['l1', 'l2'],
    "model__alpha":
    np.logspace(-5, 5, 5),
    "poly__degree": [1, 2]
}, {
    "model": [LinearRegression()],
    "poly__degree": [1, 2]
}, {
Beispiel #59
0
scaled = scaler.transform(X)

scaled_df = pd.DataFrame(scaled, columns= X.columns)

scaled_df[:5]

X = scaled_df
X[:5]

from sklearn.linear_model import LinearRegression
model = LinearRegression()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

model.fit(X_train, y_train)

pred = model.predict(X_test)

from sklearn import metrics
metrics.r2_score(y_test, pred)

from sklearn.linear_model import SGDRegressor
mod = SGDRegressor()

mod.fit(X_train, y_train)

predict = mod.predict(X_test)

metrics.r2_score(y_test, predict)
Beispiel #60
0
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from script import features as f

train, test, feature_names = f.get_sets(1)
feature_dict = train[feature_names].to_dict()  # avoid shadowing the built-in dict
print(feature_dict)
X_train, X_test, y_train, y_test = train_test_split(train[feature_names],
                                                    train.Sales.values,
                                                    test_size=0.10,
                                                    random_state=2)

clf = make_pipeline(StandardScaler(),
                    SGDRegressor(loss='squared_loss', penalty='l2'))
f.score("SGDRegressor", clf, X_train, y_train, X_test, y_test)
f.kfold_validation("SGDRegressor", clf, X_train, y_train)
f.submission(clf, train, test, feature_names)

# clf = DecisionTreeRegressor()
# f.score("DecisionTreeRegressor", clf, X_train, y_train, X_test, y_test)
# f.kfold_validation("DecisionTreeRegressor", clf, X_train, y_train)
#
# clf = RandomForestRegressor(n_jobs=-1, n_estimators=25)
# f.score("RandomForestRegressor", clf, X_train, y_train, X_test, y_test)
# f.kfold_validation("RandomForestRegressor", clf, X_train, y_train)

# clf = GradientBoostingRegressor()
# f.score("GradientBoostingRegressor", clf, X_train, y_train, X_test, y_test)
# f.kfold_validation("GradientBoostingRegressor", clf, X_train, y_train)