Example #1
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""
    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""
        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = ss.fit_one(x).transform_one(x)
        cr.fit_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
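A quick way to see what the test above relies on: with a constant learning rate, alpha=0, and sklearn's squared-loss gradient (y_pred - y_true), one SGDRegressor step from zero weights equals one hand-computed gradient step. A minimal sketch with made-up data (dense input assumed, where sklearn applies no extra intercept decay):

import numpy as np
from sklearn.linear_model import SGDRegressor

x = np.array([[1.0, 2.0]])
y = np.array([3.0])
lr = 0.01

sk = SGDRegressor(learning_rate='constant', eta0=lr, alpha=0.0)
sk.partial_fit(x, y)          # one step from zero weights

# manual step: w <- w - lr * (y_pred - y) * x, b <- b - lr * (y_pred - y)
w, b = np.zeros(2), 0.0
grad = (w @ x[0] + b) - y[0]
w -= lr * grad * x[0]
b -= lr * grad
print(np.allclose(w, sk.coef_), np.isclose(b, sk.intercept_[0]))  # True True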
Example #2
def lgdModel(server=1, intercept=None, coef=None):
    """ Iterate a generalized linear model

    :param server: the id of the server
    :type server: integer
    :param intercept: an intercept parameter
    :type intercept: float
    :param coef: a coefficient
    :type coef: float

    """

    clf = linear_model.SGDRegressor(tol=None,
                                    max_iter=1,
                                    verbose=0,
                                    warm_start=False,
                                    early_stopping=False)
    # The server ID
    n = server

    # Load data from local storage
    # TODO remove hardwiring
    Data_Location = './server_dirs/' + str(n) + '/'
    df = pd.read_csv(Data_Location + 'regression_data.csv')

    # Extract explanatory and target variables
    X = df[['X']]
    y = df['Y']

    # Estimate model (initial or update mode)
    if intercept is None or coef is None:
        clf.fit(X, y)
    else:
        clf.fit(X, y, intercept_init=intercept, coef_init=coef)

    # Return the current parameter estimates
    fitted_params = {
        'intercept': clf.intercept_[0],
        'coefficient': clf.coef_[0]
    }
    return fitted_params
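A hypothetical driver loop for lgdModel, sketching the intended use of the coef_init/intercept_init warm start across servers (the server IDs and update order here are assumptions, not part of the function above):

params = lgdModel(server=1)  # initial fit on server 1's data
for server_id in (2, 3):     # update rounds on the other servers
    params = lgdModel(server=server_id,
                      intercept=params['intercept'],
                      coef=params['coefficient'])
print(params)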
Example #3
def __initialize_model(model_name, lamda=0, hyper_parameters={}):
    """
    initialize machine learning model.

    Args:
        model_name: learning algorithm name
        lamda: coefficient of the regularization term
        hyper_parameters: other parameters for algorithms
               See parameters for RandomForest regression in scikit-learn

    Returns:
        an initialized classifier
    """
    if model_name == constants.MODEL_NAME_LASSO:
        # note: alpha in scikit-learn represents lamda, the constant that
        # multiplies the regularization term
        clf_lasso = linear_model.Lasso(alpha=lamda)
        return clf_lasso
    elif model_name == constants.MODEL_NAME_ELASTICNET:
        clf_elasticnet = ElasticNet(alpha=lamda)
        return clf_elasticnet
    elif model_name == constants.MODEL_NAME_RIDGE:
        clf_ridge = linear_model.Ridge(alpha=lamda)
        return clf_ridge
    elif model_name == constants.MODEL_NAME_RIDGECV:
        clf_ridgecv = linear_model.RidgeCV(alphas=constants.lamdaArray)
        return clf_ridgecv
    elif model_name == constants.MODEL_NAME_LARS:
        clf_lars = linear_model.Lars(n_nonzero_coefs=1)
        return clf_lars
    elif model_name == constants.MODEL_NAME_BAYESIAN:
        clf_bayesian = linear_model.BayesianRidge()
        return clf_bayesian
    elif model_name == constants.MODEL_NAME_SGD:
        clf_sgd = linear_model.SGDRegressor(alpha=lamda)
        return clf_sgd
    elif model_name == constants.MODEL_NAME_RANDOM_FOREST:
        clf_random_forest = RandomForestRegressor(**hyper_parameters,
                                                  random_state=0,
                                                  n_jobs=-1)
        return clf_random_forest
    else:
        raise ValueError('unknown model name: ' + str(model_name))
Example #4
def main():
    # Read the data from the train.csv file (into a pandas data object)
    data = pd.read_csv('./data/train.csv')
    yData = data["y"]
    xData = data.drop(['Id', 'y'], axis=1)

    # Transform the data according to the model
    print(f'The input data looks like: {data.head()}\n')

    headers = xData.columns

    xData[['x6', 'x7', 'x8', 'x9',
           'x10']] = data[headers].applymap(lambda x: x * x)
    xData[['x11', 'x12', 'x13', 'x14',
           'x15']] = data[headers].applymap(math.exp)
    xData[['x16', 'x17', 'x18', 'x19',
           'x20']] = data[headers].applymap(math.cos)
    xData['x21'] = 1

    print(f'The feature transformed data looks like: {xData.head()}')

    clf = linear_model.SGDRegressor()
    clf.fit(xData.to_numpy(), yData.to_numpy())

    predict = clf.predict(xData)

    print(f'Linear Coefficients: {clf.coef_}\n')

    print(f'Mean squared error: {mean_squared_error(yData, predict)}\n')

    print(
        f'Root mean squared error: {math.sqrt(mean_squared_error(yData, predict))}\n'
    )

    # Write the CSV result
    pd.DataFrame(clf.coef_).to_csv('./data/result.csv',
                                   header=False,
                                   index=False)

    return
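SGDRegressor is sensitive to feature scale, and the exp() features above can be very large; a sketch of the same fit with standardized inputs (the scaler is an addition, not part of the original script):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

pipe = make_pipeline(StandardScaler(), linear_model.SGDRegressor())
pipe.fit(xData.to_numpy(), yData.to_numpy())
print(pipe.named_steps['sgdregressor'].coef_)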
Example #5
    def models_evaluation(self):
        classifiers = [  # Allows for easy selection for SMVI testing
            svm.SVR(),
            linear_model.SGDRegressor(),
            linear_model.BayesianRidge(),
            linear_model.LassoLars(),
            linear_model.ARDRegression(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.LinearRegression()
        ]

        prediction_length = 10000

        trainingData_stock, trainingScores_stock, predictionData_stock = self.get_model_data(
            prediction_length, self.joint_data_frame['# of Tweets'].tolist(),
            self.joint_data_frame['Stock Volume'].tolist())
        trainingData_base, trainingScores_base, predictionData_base = self.get_model_data(
            prediction_length, self.joint_data_frame['# of Tweets'].tolist(),
            self.joint_data_frame['Base Volume'].tolist())

        predicted_stock = classifiers[2].fit(
            trainingData_stock,
            trainingScores_stock).predict(predictionData_stock)
        predicted_base = classifiers[2].fit(
            trainingData_base,
            trainingScores_base).predict(predictionData_base)

        Stock_SMVI = (sum(predicted_stock) /
                      prediction_length) / len(trainingData_stock)
        Base_SMVI = (sum(predicted_base) /
                     prediction_length) / len(trainingData_base)

        os.system('clear')
        print('Stock SMVI: ', Stock_SMVI)
        print('Base SMVI: ', Base_SMVI)
        # Taking the difference between the stock and base SMVI removes the
        # effect of market-wide moves (e.g. a crash) common to both series.
        self.SMVI = abs(abs(Stock_SMVI) - abs(Base_SMVI))
        print('Real SMVI (Unscaled): ', self.SMVI)
Example #6
    def __init__(self, df, run_prefix, max_iter, cv_count):
        self.run_prefix = run_prefix
        self.max_iter = max_iter
        self.cv_count = cv_count
       
        self.y_tune = df.PHENO
        self.X_tune = df.drop(columns=['PHENO'])
        self.IDs_tune = self.X_tune.ID
        self.X_tune = self.X_tune.drop(columns=['ID'])

        best_algo_name_in = run_prefix + '.best_algorithm.txt'
        best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
        self.best_algo = str(best_algo_df.iloc[0,0])

        self.algorithms = [
            linear_model.LinearRegression(),
            ensemble.RandomForestRegressor(),
            ensemble.AdaBoostRegressor(),
            ensemble.GradientBoostingRegressor(),
            linear_model.SGDRegressor(),
            svm.SVR(),
            neural_network.MLPRegressor(),
            neighbors.KNeighborsRegressor(),
            ensemble.BaggingRegressor(),
            xgboost.XGBRegressor()
        ]

        # Initialize a few variables we will be using later 
        self.log_table = None
        self.best_algo_name_in = None
        self.best_algo_df = None
        self.hyperparameters = None
        self.scoring_metric = None
        self.cv_tuned = None 
        self.cv_baseline = None 
        self.algo = None
        self.searchCVResults = None
        self.rand_search = None
        self.algo_tuned = None
        self.tune_out = None
Example #7
    def execute(self):

        # create model
        model = linear_model.SGDRegressor()

        # recursively eliminate features
        rfecv = RFECV(estimator=model,
                      step=1,
                      scoring="neg_mean_squared_error")
        rfecv.fit(self.partitions.x_train, self.partitions.y_train)
        # (the original also called rfecv.transform() here and discarded the
        # result, which has no effect; see the usage sketch below)

        # number of best features
        self.n_features = rfecv.n_features_

        # which categories are best
        self.best_features = rfecv.support_

        # rank features best (1) to worst
        self.feature_ranking = rfecv.ranking_

        return self.n_features, self.best_features, self.feature_ranking
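A short usage sketch for the method above: RFECV.transform() returns the reduced feature matrix, so its result should be kept, e.g. to retrain on the selected features (names follow the class above):

x_train_reduced = rfecv.transform(self.partitions.x_train)
model.fit(x_train_reduced, self.partitions.y_train)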
Example #8
def UnivariateStochasticTool():
    regressor = linear_model.SGDRegressor(alpha=0.01, max_iter=1000)

    xx = [[el] for el in trainGdp]
    regressor.partial_fit(xx, trainOutputs)
    w0, w1 = regressor.intercept_[0], regressor.coef_[0]
    w = [w0, w1]
    print("-----with tool-----")
    print("Regression for attribute: GDP")
    print("\tThe learnt model: f(X,w) = " + str(w0) + " + " + str(w1) + " * X")

    computedOutputs = regressor.predict([[x] for x in testGdp])
    print("\tPrediction error (tool): ",
          str(mean_squared_error(testOutputs, computedOutputs)))
    print("\tPrediction error (manual): ",
          str(meanSquareError(testOutputs, computedOutputs)))

    plotDataForUni(gdpData, outputs, w, "Train & test data")
    plotDataForUni(trainGdp, trainOutputs, w,
                   "Train data and the learnt model")
    plotData2ForUni(testGdp, testOutputs, computedOutputs,
                    "Computed vs real test data")
Example #9
    def choose_model(self, X, y):
        """
        Automatic model chooser.

        :param X: data
        :param y: target

        :type X: ndarray or scipy.sparse matrix, (n_samples, n_features)
        :type y: ndarray, shape (n_samples,) or (n_samples, n_targets)
        """
        #{'linear', 'polynomial', 'logistic', 'logisticcv', 'elasticnet', 'elasticnetcv', 'orthogonal', 'orthogonalcv', 'theil', 'sgd', 'perceptron', 'passive_aggressive'}
        models = {
            'linear': linear_model.LinearRegression(),
            'logistic': linear_model.LogisticRegression(),
            'elasticnet': linear_model.ElasticNet(),
            'orthogonal': linear_model.OrthogonalMatchingPursuit(),
            'theil': linear_model.TheilSenRegressor(),
            'sgd': linear_model.SGDRegressor(),
            'passive_aggressive': linear_model.PassiveAggressiveRegressor()
        }
        scores = {}
        for name, model in models.items():
            scores[name] = []

        # StratifiedShuffleSplit expects class labels; for a continuous
        # target, ShuffleSplit is the appropriate splitter.
        ss = ShuffleSplit(n_splits=10, test_size=0.25)
        for train_index, test_index in ss.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for name, model in models.items():
                model.fit(X_train, y_train)
                scores[name].append(
                    metrics.mean_squared_error(y_test, model.predict(X_test)))

        #Choose http://blog.minitab.com/blog/adventures-in-statistics-2/how-to-choose-the-best-regression-model
        best_score = float('inf')
        for name, model in models.items():
            if scores[name][-1] < best_score:
                best_score = scores[name][-1]
                self._model = model
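Selecting on the last fold alone is noisy; a sketch of the same choice using the mean score across folds instead (numpy assumed imported as np):

mean_scores = {name: float(np.mean(s)) for name, s in scores.items()}
best_name = min(mean_scores, key=mean_scores.get)
self._model = models[best_name]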
Example #10
def SGD():

    train_X, train_y, test_X, test_y, nonescaled_y = pre_process()

    clf = linear_model.SGDRegressor()

    for i in range(len(train_X)):
        X, y = train_X[i:i + 1], train_y[i:i + 1]
        clf.partial_fit(X, y)

    predsgdr = clf.predict(test_X)

    pred_vals = [
        (pred * (config.column1_max - config.column1_min)) + config.column1_min
        for pred in predsgdr
    ]

    pred_vals = np.asarray(pred_vals)

    get_scores("---------SGDRegressor----------", pred_vals, nonescaled_y)

    plot(nonescaled_y, pred_vals, "SGDRegressor Prediction Vs Truth.png")
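The manual rescaling above inverts a min-max normalisation by hand. If the scaling had been done with an sklearn scaler, its own inverse_transform does the same thing; a self-contained sketch:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled = scaler.fit_transform(np.array([[3.0], [7.0], [11.0]]))
restored = scaler.inverse_transform(scaled)  # recovers 3, 7, 11
print(restored.ravel())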
Example #11
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # TODO: Create a decision tree regressor object
       
    regressor1 = DecisionTreeRegressor()
    regressor2 = linear_model.SGDRegressor()
    #regressor3 = SVC()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    tree_params = {'max_depth' : [3, 6, 9, 20, 100], 'min_samples_split':[2, 3, 4, 5]}
    sgd_params = {'loss': ['squared_loss', 'huber'], 'penalty': ['none', 'l2', 'l1', 'elasticnet'], 'max_iter': [10, 75, 100, 500]}
    #svm_params = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
    
    
    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Updated cv_sets and scoring parameter
    grid = GridSearchCV(regressor1, tree_params, scoring = scoring_fnc, cv = cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    #print("grid fit")
    grid = grid.fit(X, y)

    # Updated cv_sets and scoring parameter
    #grid = GridSearchCV(regressor2, sgd_params, scoring = scoring_fnc, cv = cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    #grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
Example #12
    def load_default(self, machine_list='basic'):
        """
        Loads 4 different scikit-learn regressors by default. The advanced list adds more machines. 

        Parameters
        ----------
        machine_list: optional, 'basic', 'advanced', or list of strings
            Preset name or an explicit list of default machine names to be
            loaded.

        Returns
        -------
        self : returns an instance of self.
        """

        if machine_list == 'basic':
            machine_list = ['tree', 'ridge', 'random_forest', 'svm']
        if machine_list == 'advanced':
            machine_list=['lasso', 'tree', 'ridge', 'random_forest', 'svm', 'bayesian_ridge', 'sgd']

        self.estimators_ = {}
        for machine in machine_list:
            try:
                if machine == 'lasso':
                    self.estimators_['lasso'] = linear_model.LassoCV(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'tree':
                    self.estimators_['tree'] = DecisionTreeRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'ridge':
                    self.estimators_['ridge'] = linear_model.RidgeCV().fit(self.X_k_, self.y_k_)
                if machine == 'random_forest':
                    self.estimators_['random_forest'] = RandomForestRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'svm':
                    self.estimators_['svm'] = LinearSVR(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'sgd':
                    self.estimators_['sgd'] = linear_model.SGDRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'bayesian_ridge':
                    self.estimators_['bayesian_ridge'] = linear_model.BayesianRidge().fit(self.X_k_, self.y_k_)
            except ValueError:
                continue
        return self
Example #13
def get_model(model_type, c=0, epsilon=0, gamma=0):

    if model_type == RBF:
        model = svm.SVR(kernel='rbf', C=c, epsilon=epsilon, gamma=gamma)
    elif model_type == POLY2:
        model = svm.SVR(kernel='poly', C=c, degree=2, epsilon=epsilon)
    elif model_type == POLY3:
        model = svm.SVR(kernel='poly', C=c, degree=3, epsilon=epsilon)
    elif model_type == POLY4:
        model = svm.SVR(kernel='poly', C=c, degree=4, epsilon=epsilon)
    elif model_type == LIN:
        model = svm.SVR(kernel='linear', C=c, epsilon=epsilon)
    elif model_type == Rand_F:
        model = ensemble.RandomForestRegressor()
    elif model_type == SGD:
        model = linear_model.SGDRegressor()
    elif model_type == KRR:
        model = kernel_ridge.KernelRidge(kernel='linear', alpha=1/(2*c))
    elif model_type == DT:
        model = DecisionTreeRegressor()
    else:
        raise ValueError('unknown model type: ' + str(model_type))
    return model
Example #14
def compute_params_SGDR(diamonds, prices, validation, validation_prices, _it,
                        _lr):
    np_X = numpy.array(diamonds, dtype=float)
    np_X_validation = numpy.array(validation, dtype=float)

    np_Y = prices
    np_Y_validation = validation_prices

    # Note: ndarray.transpose() is a no-op on 1-D targets and its return
    # value was being discarded, so no reshaping is needed here.

    regr = linear_model.SGDRegressor(max_iter=_it, eta0=_lr)
    regr.fit(np_X, np_Y)
    diamonds_y_pred = regr.predict(np_X_validation)

    print('Coefficients: \n', regr.coef_)
    print('Intercept: \n', regr.intercept_)

    print("Mean squared error: %.2f" %
          mean_squared_error(np_Y_validation, diamonds_y_pred))
    print("R2 Score: %.2f" % r2_score(np_Y_validation, diamonds_y_pred))

    return regr
Example #15
def run_sgdreg(down_station, input_list, include_time, sample_size,
               network_type, _tol, _eta0):
    start_time_run = time.time()

    result_dir = util.get_result_dir(down_station, network_type, _tol, _eta0,
                                     sample_size)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    (y_train, x_train, y_cv, x_cv, _, _, _, _, train_y_max, train_y_min, _, _,
     _, _, _) = data.construct(down_station, input_list, include_time,
                               sample_size, network_type)

    sgdreg = linear_model.SGDRegressor(max_iter=100000, tol=_tol, eta0=_eta0)
    sgdreg.fit(x_train, y_train)
    y_pred = sgdreg.predict(x_cv)

    predict.plot_prediction(y_pred, result_dir, y_cv, train_y_max, train_y_min)

    elapsed_time_run = time.time() - start_time_run
    print(
        time.strftime("Fitting time : %H:%M:%S",
                      time.gmtime(elapsed_time_run)))
Example #16
def BivariateStochasticTool():
    regressor = linear_model.SGDRegressor(alpha=0.01, max_iter=1000)

    regressor.fit(trainInputs, trainOutputs)
    w0, w1, w2 = regressor.intercept_[0], regressor.coef_[0], regressor.coef_[1]
    w = [w0, w1, w2]
    print("-----with tool-----")
    print("Regression for attributes: GDP & Freedom")
    print("\tThe learnt model: f(X,w) = " + str(w0) + " + " + str(w1) +
          " * X1 + " + str(w2) + " * X2")

    computedTestOutputs = regressor.predict(testInputs)
    print("\tPrediction error (tool): ",
          str(mean_squared_error(testOutputs, computedTestOutputs)))
    print("\tPrediction error (manual): ",
          str(meanSquareError(testOutputs, computedTestOutputs)))

    plotDataForBi(gdpData, freedomData, outputs, w, "Train & test data")
    plotDataForBi(trainGdp, trainFreedom, trainOutputs, w,
                  "Train data and the learnt model")
    plotData2ForBi(testGdp, testFreedom, testOutputs, computedTestOutputs,
                   "Computed(green) vs real(red) test data")
Example #17
    def _fit_regression(self, dataset, target, level=None, features=None):
        """Fits a regression -- to be implemented by subclasses.

    This method updates self.model[target] with the trained model and does
    not return anything.

    Args:
      dataset: src.data.dataset.Dataset, the data which is to be used
        for fitting.
      target: string, the name of the target variable.
      level: string, the target's sub-class. If this isn't specified, the system
        will assume that the target is monolithic.
      features: list(string), a subset of dataset.vocab which is to be used
        while fitting.

    Returns:
      regression_base.ModelResult, the fitted parameters.
    """
        iterator = self._iter_minibatches(dataset=dataset,
                                          target_name=target['name'],
                                          features=features,
                                          batch_size=self.params['batch_size'],
                                          level=level)

        print('REGRESSION: fitting target %s' % target['name'])
        model = linear_model.SGDRegressor(penalty=self.regularizer or 'none',
                                          alpha=self.lmbda,
                                          learning_rate='constant',
                                          eta0=self.params.get('lr', 0.001))

        for _ in tqdm(range(self.params['num_train_steps'])):
            xi, yi, x_features = next(iterator)
            model.partial_fit(xi, yi)

        return ModelResult(model=model,
                           weights=self._sklearn_weights(model, x_features),
                           response_type='continuous')
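The minibatch partial_fit pattern above generalises to any out-of-core stream; a minimal self-contained sketch with synthetic batches:

import numpy as np
from sklearn.linear_model import SGDRegressor

model = SGDRegressor(learning_rate='constant', eta0=0.001)
rng = np.random.default_rng(0)
for _ in range(100):                      # 100 minibatches of 32 samples
    xb = rng.random((32, 4))
    yb = xb @ np.array([1.0, 2.0, -1.0, 0.5])
    model.partial_fit(xb, yb)
print(model.coef_)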
Example #18
    def __init__(self, env, use_kernel=False, **agent_params):
        self.env = env
        self.use_kernel = use_kernel

        if use_kernel:
            # Sample feature space and define scaler to detrend data
            observation_samples = np.array(
                [env.observation_space.sample() for x in range(10000)])
            self.detrend = preprocessing.StandardScaler()
            self.detrend.fit(observation_samples)

            # Use detrended data to generate feature space with RBF kernels
            self.featurizer = pipeline.FeatureUnion([
                ("rbf1", RBFSampler(gamma=3.0, n_components=100)),
                ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                ("rbf4", RBFSampler(gamma=0.5, n_components=100))
            ])
            self.featurizer.fit(self.detrend.transform(observation_samples))

        # Generate linear value function model for each action in our action space
        self.models = []
        initReward = np.array(0)
        for k in range(env.action_space.n):
            self.models.append(
                linear_model.SGDRegressor(learning_rate="constant"))
            random_features = self.map_to_features(self.env.reset())
            self.models[k].partial_fit(random_features.reshape(1, -1),
                                       initReward.ravel())

        self.agent_params = {
            "epsilon_min": 0.01,
            "decay_rate": 0.02,
            "discount": 0.99,
            "iter": 1000
        }
        self.agent_params.update(agent_params)
Example #19
def testing_using_crossvalidation_regression(df, label, features, alpha,
                                             l1_ratio, penalty, loss, epsilon,
                                             label_std):
    """Fit a model, then test it using 5-fold crossvalidation

    Parameters
    ----------
    df : pandas.DataFrame
        pandas dataframe of features and labels
    features : list of strings
        list of feature labels to use in model training
    alpha : float
        weighting of the regularization term
    l1_ratio : float
        the Elastic Net mixing parameter, 0 <= l1_ratio <= 1
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1
    penalty : string
        penalty specification, 'none', 'l2', 'l1', or 'elasticnet'

    Returns
    -------
    float
        average crossvalidation score (accuracy)
    """
    reg = linear_model.SGDRegressor(alpha=alpha,
                                    loss=loss,
                                    penalty=penalty,
                                    l1_ratio=l1_ratio,
                                    epsilon=epsilon,
                                    max_iter=1000)
    scores = model_selection.cross_val_score(reg,
                                             df[features],
                                             df[label],
                                             cv=5,
                                             scoring='neg_mean_absolute_error')
    return -1.0 * scores.mean() / label_std
Example #20
def train(training_data):
    training_data_cutoff = int(floor(len(training_data) * .7))
    random.shuffle(training_data)

    x_data = []
    y_data = []

    for sample in training_data:
        x_data.append(sample["amplitudes"])
        y_data.append(sample["rating"])

    reg = linear_model.SGDRegressor()
    reg.fit(x_data[:training_data_cutoff], y_data[:training_data_cutoff])

    predicted = reg.predict(x_data[training_data_cutoff:])
    actual = y_data[training_data_cutoff:]

    return Bunch({
        "predicted": predicted,
        "actual": actual,
        "x_data_test": x_data[training_data_cutoff:],
        "y_data_test": y_data[training_data_cutoff:],
        "reg": reg
    })
Example #21
def find_one_LRSGD_parameter_all_CV(data,
                                    columns_used,
                                    output,
                                    grid_size,
                                    feature_elim=False):
    SGD_model_eval = []
    print("Feature ELim: ", feature_elim)
    if feature_elim == False:
        k = 'all'
        for penalty,alpha,lr,eta0 in itertools.product(['l1','l2','elasticnet'],[0.1,0.001,0.01,0.0001],\
                                                    ['constant','optimal'],[1,0.1,0.001,0.01,0.0001]):
            model = linear_model.SGDRegressor(penalty=penalty, alpha=alpha, learning_rate=lr, \
                                          eta0=eta0, random_state=4, shuffle=False)

            #now run it for all CV and find average error
            error = run_validation_CV(data, columns_used, output, model)
            SGD_model_eval.append([penalty, alpha, lr, eta0, k, error])

    SGD_model_eval = pd.DataFrame(SGD_model_eval,
                                  columns=['penalty', 'alpha', 'lr', 'eta0', 'k', 'RMSE']) \
                       .groupby(by=['penalty', 'alpha', 'lr', 'eta0', 'k']).sum()
    # idxmin gives the parameter combination with the lowest RMSE
    # (argmin returns a positional index in recent pandas)
    print(SGD_model_eval.RMSE.idxmin())
    SGD_model_eval.to_csv("../results/SGD/%s/%s_param_CV_error.csv" %
                          (grid_size, output))
    return SGD_model_eval.RMSE.idxmin()
Example #22
    def detect(self, results, job=None, type='alert'):
        logger = logging.getLogger(__name__)
        logger.debug(results)
        name = job['name']
        logger.info('Processing results for: %s' % (name))

        df = pd.read_csv(job['data'])
        n = len(df)
        sgd = linear_model.SGDRegressor()

        mapper = DataFrameMapper([('value', None)])
        df_train = df[:n // 2]['value'].to_numpy()
        df_test = df[n // 2:]['value'].to_numpy()

        logger.info([df_train])
        #        ft_df = fft(df['value'].as_matrix())
        #        ft_x = fftfreq(len(df['value']))

        df.plot(title=name)
        #        r = sgd.fit(df_train['value'].to_numpy(), df_test['value'].to_numpy())
        #        logger.info(r)
        self.decompose(df['value'])
        plt.show()
        logger.info(df)
Example #23
    def trainRegressionModel(self, training_dataset):
        # Create matrix:
        features = self.fe.calculateFeatures(training_dataset, input='file')
        Xtr = []
        Ytr = []
        f = open(training_dataset)
        c = -1
        for line in f:
            data = line.strip().split('\t')
            cands = [cand.strip().split(':')[1] for cand in data[3:]]
            indexes = [int(cand.strip().split(':')[0]) for cand in data[3:]]
            featmap = {}
            for cand in cands:
                c += 1
                featmap[cand] = features[c]
            for i in range(0, len(cands) - 1):
                for j in range(i + 1, len(cands)):
                    indexi = indexes[i]
                    indexj = indexes[j]
                    indexdiffji = indexj - indexi
                    indexdiffij = indexi - indexj
                    positive = featmap[cands[i]]
                    negative = featmap[cands[j]]
                    v1 = np.concatenate((positive, negative))
                    v2 = np.concatenate((negative, positive))
                    Xtr.append(v1)
                    Xtr.append(v2)
                    Ytr.append(indexdiffji)
                    Ytr.append(indexdiffij)
        f.close()
        Xtr = np.array(Xtr)
        Ytr = np.array(Ytr)

        model = linear_model.SGDRegressor()
        model.fit(Xtr, Ytr)
        return model
Example #24
    def __init__(self,
                 history_length,
                 prediction_horizon,
                 difference_learning,
                 averaging,
                 streaming,
                 settings=None):
        super().__init__(history_length,
                         prediction_horizon,
                         difference_learning,
                         averaging=averaging,
                         streaming=streaming)
        eta0 = 0.0001
        epochs = 1
        if settings:
            eta0 = settings['eta0']
            epochs = settings.get('epochs', 1)
        self.models_ = []
        for i in range(self.observation_dimension):
            self.models_.append(
                linear_model.SGDRegressor(verbose=False,
                                          learning_rate='constant',
                                          eta0=eta0))
        self.epochs_ = epochs
Example #25
def regr(df, mod, modScale, sgdScale=1, ForMod=1):
    hold = 0
    sgd = linear_model.SGDRegressor(max_iter=1000,
                                    alpha=0.0001,
                                    penalty='elasticnet')
    if sgdScale == 1:
        scaler = StandardScaler()
        normalized = scaler.fit_transform(df.iloc[:, 1:9])
        xTs = pd.DataFrame(normalized)
        sgd.fit(pd.DataFrame(normalized), df.iloc[:, 0])
        if ForMod == 2:
            predDat, uniPred, arimaPred = arimaRNN(df, mod)
        elif ForMod == 1:
            predDat, uniPred = rscript(df)
        else:
            predDat, uniPred, mod = esRNN(df, mod, modScale=0)
        # normalise whichever forecast was produced (the original if/if/else
        # re-ran esRNN for ForMod == 2 and left `norm` undefined for ForMod == 1)
        if len(predDat) == 1:
            norm = scaler.fit_transform(np.array(predDat).reshape(-1, 1))
        else:
            norm = scaler.fit_transform(np.array(predDat).reshape(1, -1))
        print(norm)
        multiPred = sgd.predict(norm)
    else:
        sgd.fit(df.iloc[:, 1:9], df.iloc[:, 0])
        if ForMod == 2:
            predDat, uniPred, arimaPred = arimaRNN(df, mod)
            hold = arimaPred
        elif ForMod == 1:
            predDat, uniPred = rscript(df)
        else:
            predDat, uniPred, mod = esRNN(df, mod, modScale=0)
        multiPred = sgd.predict([np.asarray(predDat)])
    naive = df.iloc[:, 0].mean()
    m = df.iloc[:, 0].rolling(2).mean()
    ma = m[m.shape[0] - 1]
    return uniPred, multiPred.item(0), naive, ma, mod, hold
Example #26
def get_stats(path):
    info = pd.read_csv(path)
    info = info.dropna()

    f = info['price'] < 100000
    info = info[f]  # Get information only about flats with price < 100'000

    X = info[['type', 'size', 'locality']].values
    scaler_X = preprocessing.StandardScaler().fit(X)
    X = scaler_X.transform(X)
    y = info['price'].values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=20)

    estimators = [
        linear_model.LinearRegression(),
        linear_model.Ridge(alpha=0.1),
        linear_model.Lasso(alpha=0.1),
        linear_model.ElasticNet(alpha=0.01, l1_ratio=0.25),
        linear_model.BayesianRidge(n_iter=500),
        linear_model.OrthogonalMatchingPursuit(),
        linear_model.SGDRegressor(max_iter=2500, epsilon=0.01),
        SVR(kernel='rbf', epsilon=0.01, C=20)
    ]

    estimator_values = np.array([])

    for e in estimators:
        e.fit(X_train, y_train)
        this_err = metrics.median_absolute_error(y_test, e.predict(X_test))
        estimator_values = np.append(estimator_values, this_err)

    return estimator_values
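A small readability sketch for the loop above: pairing each estimator with its error makes the returned array self-describing when printed:

for e, err in zip(estimators, estimator_values):
    print(type(e).__name__, err)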
Example #27
    zero_count = ratings[-1].count(0)
    zero_feat_count = features[-1].count(0)
    if (zero_count > 4 or zero_feat_count > 4):
        ratings.pop()
        features.pop()
features = np.array(features)
ratings = np.array(ratings)
not_feature_index = [4, 10]
features = np.delete(features, not_feature_index, axis=1)
features = preprocessing.scale(features)
for i in range(22):
    min_error = np.inf
    best_alpha = -1
    for c in range(20):
        # set alpha before building the model; the original updated `al`
        # after construction, so each fit reused a stale (or zero) value
        al = 0.01 * (c + 1)
        clf = linear_model.SGDRegressor(penalty='l1', alpha=al, max_iter=100)
        ans = cross_val_predict(clf, features, ratings[:, i], cv=5)
        if top_row[ratings_start_point +
                   i] == 'cspostur' or top_row[ratings_start_point +
                                               i] == 'cseyecon':
            ans = cross_val_predict(clf,
                                    features[12:, :],
                                    ratings[12:, i],
                                    cv=5)
            if (min_error > mean_squared_error(ratings[12:, i], ans)):
                min_error = mean_squared_error(ratings[12:, i], ans)
                best_clf = clf.fit(features[12:, :], ratings[12:, i])
                best_alpha = al
                ssreg = np.sum((ans - np.mean(ratings[12:, i]))**2)
                sstot = np.sum((ratings[12:, i] - np.mean(ratings[12:, i]))**2)
Example #28
trainOutputs, testOutputs = statisticalNormalisation(trainO, testO)
#tool data normalisation
# toolTrainInputs=tool_normalisation(trainI)

myGD = GD(len(trainInputs[0]))
myGD.train(trainInputs, trainOutputs)
# myGD.train(trainI, trainO)
model = "The MANUAL BATCH learnt model: " + str(myGD.intercept)
for i in range(len(myGD.coef)):
    model += " + " + str(myGD.coef[i]) + " * x" + str(i + 1)
print(model)
computedTestOutputs = myGD.predict(testInputs)
err = myGD.eroare(computedTestOutputs, testOutputs)

#tool
toolRegressor = linear_model.SGDRegressor(alpha=0.01)
for ep in range(1000):
    toolRegressor.partial_fit(trainInputs, trainOutputs)

model = "The TOOL learnt model: " + str(toolRegressor.intercept_[0])
for i in range(len(toolRegressor.coef_)):
    model += " + " + str(toolRegressor.coef_[i]) + " * x" + str(i + 1)
print(model)

toolComputed = toolRegressor.predict(testInputs)
print("Eroare tool regresor:" +
      str(mean_squared_error(toolComputed, testOutputs)))
print("Eroare tool pentru regresorul meu:" +
      str(mean_squared_error(computedTestOutputs, testOutputs)))
print("Eroare fara tool:" + str(err))
Example #29

# Predict on the test data: y_pred
y_pred = svr.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(svr.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))


##########################################
# SGD

from sklearn import linear_model

clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)

clf.fit(X_train, y_train) 

# Calling the score method, which compares the predicted values to the actual values

y_score = clf.score(X_test, y_test)

# The score is directly comparable to R-Square
print(y_score)




##################################
# comparing results to evaluate model
Example #30
#See Ridge Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge
lr = linear_model.Ridge(alpha=1.0,
                        fit_intercept=True,
                        normalize=False,
                        copy_X=True,
                        max_iter=None,
                        tol=0.001,
                        solver='auto',
                        random_state=None)

#See SGD Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor
sgd = linear_model.SGDRegressor(loss='squared_loss',
                                penalty='l2',
                                alpha=0.0001,
                                l1_ratio=0.15,
                                fit_intercept=True,
                                max_iter=1000,
                                tol=0.001,
                                shuffle=True,
                                verbose=0,
                                epsilon=0.1,
                                random_state=None,
                                learning_rate='invscaling',
                                eta0=0.01,
                                power_t=0.25,
                                early_stopping=False,
                                validation_fraction=0.1,
                                n_iter_no_change=5,
                                warm_start=False,
                                average=False)
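The parameters above mirror older scikit-learn documentation defaults; note that in scikit-learn >= 1.2 the loss is spelled 'squared_error' and Ridge no longer accepts normalize. The docs also recommend scaling inputs before SGD, e.g. via a pipeline; a minimal sketch with made-up data:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

rng = np.random.default_rng(0)
X = rng.random((100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.random(100)

model = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
model.fit(X, y)
print(model.score(X, y))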