Example #1
import numpy as np
from sklearn.linear_model import Ridge


def ridge_optimized_fit(m_structure_list, a_min, a_max, step):
    """Scan alpha over [a_min, a_max) in increments of `step` and return the
    ridge coefficients that minimize the leave-one-out cross-validation error."""
    # Keep only the non-paramagnetic structures and their energies.
    structures = [mat for mat in m_structure_list
                  if mat.mag_phase != "pera" and mat.phase_name != "pm"]
    energies = np.array([mat.enrg for mat in structures])
    x_rows = len(energies)
    num_beg = m_structure_list[0].num_beg_rules
    num_cluster = m_structure_list[0].num_cluster_rules
    num_j = m_structure_list[0].num_j_rules
    x_colm = num_beg + num_cluster + num_j
    # Design matrix: one row of summed correlation terms per kept structure.
    # (The original indexed m_structure_list with the filtered row counter,
    # mixing up rows, and wrote the cluster/J blocks at overlapping offsets.)
    x_matrix = np.zeros((x_rows, x_colm))
    for i, mat in enumerate(structures):
        for j in range(num_beg):
            x_matrix[i, j] = mat.BEG_sums[j]
        for k in range(num_cluster):
            x_matrix[i, num_beg + k] = mat.Cluster_sums[k]
        for l in range(num_j):
            x_matrix[i, num_beg + num_cluster + l] = mat.J_sums[l]

    alpha = a_min
    alpha_opt = a_min
    old_CV2_min = np.inf
    while alpha < a_max:
        # Leave-one-out cross-validation score for the current alpha.
        CV2 = 0.0
        for i in range(x_rows):
            mask = np.arange(x_rows) != i
            r_fit = Ridge(alpha=alpha)
            r_fit.fit(x_matrix[mask], energies[mask])
            # Predict the held-out energy with the fitted model; note that
            # get_params() only returns hyperparameters, not coefficients.
            Ei = r_fit.predict(x_matrix[i].reshape(1, -1))[0]
            CV2 += (energies[i] - Ei) ** 2
        CV2 /= x_rows
        if CV2 < old_CV2_min:
            old_CV2_min = CV2
            alpha_opt = alpha
        alpha += step  # the original loop never advanced alpha and ran forever

    # Refit on the full data set with the optimal alpha.
    r_fit = Ridge(alpha=alpha_opt)
    r_fit.fit(x_matrix, energies)
    Js = r_fit.coef_
    print(alpha_opt)
    print(old_CV2_min)
    return Js
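
The original version of this example read the fitted coefficients with get_params(), which only returns an estimator's hyperparameters. A minimal sketch of the distinction on toy data (all names hypothetical):

import numpy as np
from sklearn.linear_model import Ridge

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.1, 1.1, 1.9, 3.2])

model = Ridge(alpha=0.5).fit(X, y)
print(model.get_params())  # hyperparameters, e.g. {'alpha': 0.5, 'fit_intercept': True, ...}
print(model.coef_)         # learned weights, available only after fit()
print(model.intercept_)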
Example #2
import numpy as np
import sklearn.metrics as sm
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


def ridge_model(xy):
    # Ridge model
    x = xy[:, 0].reshape(-1, 1)
    y = xy[:, 1]
    model = Ridge()
    # Ridge requires non-negative regularization; the original grid started at -1.
    alpha_can = np.linspace(0.01, 10, 30)
    model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    model.fit(x, y)
    print(model.best_params_)
    pred_y = model.predict(x)
    params = model.get_params()
    print(params)
    ridge_r2 = sm.r2_score(y, pred_y)
    ridge_absolute = sm.mean_absolute_error(y, pred_y)
    ridge_squared = sm.mean_squared_error(y, pred_y)
    ridge_median = sm.median_absolute_error(y, pred_y)
    drawing_ridge(xy, x, pred_y, model.best_params_)
    return {
        'ridge_score': {
            'ridge_r2': round(ridge_r2, 5),
            'ridge_absolute': round(ridge_absolute, 5),
            'ridge_squared': round(ridge_squared, 5),
            'ridge_median': round(ridge_median, 5)
        }
    }
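
Note that get_params() on the fitted GridSearchCV above returns the search object's own settings (estimator, cv, param_grid, ...), not the tuned ridge. The winning hyperparameters live on the refit best estimator; a small self-contained sketch (synthetic data, names hypothetical):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
x = rng.rand(50, 1)
y = 3.0 * x.ravel() + rng.normal(scale=0.1, size=50)

search = GridSearchCV(Ridge(), param_grid={'alpha': np.linspace(0.01, 10, 30)}, cv=5)
search.fit(x, y)
print(search.best_params_)                           # e.g. {'alpha': 0.01}
print(search.best_estimator_.get_params()['alpha'])  # the tuned Ridge's alpha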
Example #3
    def test_regressor_modifications(self):
        regressor = Ridge(alpha=1e-8)
        pcovr = self.model(mixing=0.5, regressor=regressor)

        # PCovR regressor matches the original
        self.assertTrue(regressor.get_params() == pcovr.regressor.get_params())

        # PCovR regressor updates its parameters
        # to match the original regressor
        regressor.set_params(alpha=1e-6)
        self.assertTrue(regressor.get_params() == pcovr.regressor.get_params())

        # Fitting regressor outside PCovR fits the PCovR regressor
        regressor.fit(self.X, self.Y)
        self.assertTrue(hasattr(pcovr.regressor, "coef_"))

        # PCovR regressor doesn't change after fitting
        pcovr.fit(self.X, self.Y)
        regressor.set_params(alpha=1e-4)
        self.assertTrue(hasattr(pcovr.regressor_, "coef_"))
        self.assertTrue(
            regressor.get_params() != pcovr.regressor_.get_params())
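
The test above relies on the scikit-learn convention that an estimator passed to a constructor is stored by reference, while fit() works on a clone stored under a trailing-underscore attribute. A minimal sketch of that generic pattern (not PCovR's actual implementation):

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.linear_model import Ridge


class WrappedRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, regressor=None):
        # Stored by reference: external set_params() calls stay visible here.
        self.regressor = regressor

    def fit(self, X, y):
        # fit() clones, so later changes to the original object
        # no longer affect the fitted regressor_.
        self.regressor_ = clone(self.regressor) if self.regressor is not None else Ridge()
        self.regressor_.fit(X, y)
        return self

    def predict(self, X):
        return self.regressor_.predict(X)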
Example #4
with open('valid_sets.json', 'w') as fp:
    dump(valid_sets, fp)

# # Loading the model

# In[ ]:

dir = abspath('')
enc_categ = load(join(dir, 'enc_categ.pkl'))
enc_text = load(join(dir, 'enc_text.pkl'))
clf = load(join(dir, 'model.pkl'))

clf

# # Prediction on a random example

# In[ ]:

X_rand = random(
    m=1,  # number of objects (rows)
    n=24627,
    random_state=clf.get_params()['random_state']  # optional
)
X_rand

# In[ ]:

pred = clf.predict(X=X_rand)
pred
Example #5
ridge_score = cvs(ridge_reg,
                  X_train,
                  Y_train,
                  scoring="neg_mean_squared_error",
                  cv=10)
ridge_score = np.sqrt(-ridge_score)
display_scores(reg_score)
display_scores(ridge_score)

#%%
# Grid search to find good hyperparameters
from sklearn.model_selection import GridSearchCV

# get parameters for hyperparameter tuning
#print(regressor.get_params().keys())
print(ridge_reg.get_params().keys())

#param_grid_reg = [{'fit_intercept': [True, False], 'normalize': [True, False]}]

param_grid_ridge = [{
    'alpha': [1e-3, 1e-2, 1e-1, 1],
    'fit_intercept': [False],
    'normalize': [True, False],
    'solver': ['cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}, {
    'alpha': [1e-3, 1e-2, 1e-1, 1],
    'fit_intercept': [True],
    'normalize': [True, False],
    'solver': ['sparse_cg', 'sag']
}]
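
The grids above are defined but never run in this fragment; a minimal sketch of fitting them (reusing ridge_reg, X_train and Y_train from this example; note that the normalize option was removed in scikit-learn 1.2, so the grid as written targets older releases):

grid_search = GridSearchCV(ridge_reg,
                           param_grid_ridge,
                           scoring="neg_mean_squared_error",
                           cv=10)
grid_search.fit(X_train, Y_train)
print(grid_search.best_params_)
print(np.sqrt(-grid_search.best_score_))  # RMSE of the best configuration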
Example #6
import numpy as np
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

X = clf.transform(X)
print("pca X: ", X, len(X), len(X[0]))
X = PolynomialFeatures(degree=1).fit_transform(X)
print("poly features: ", X, len(X), len(X[0]))
# `normalize` expects a boolean, not the string "True"
clf = Ridge(alpha=0.001, max_iter=100000, normalize=True, fit_intercept=True)

# params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100]}
# clf = GridSearchCV(Ridge(normalize=True, max_iter=100000), params, cv=5, verbose=True)
X = X[:, 1:len(X[0])]  # drop the bias column added by PolynomialFeatures
clf.fit(X, y)
# y_pred = clf.predict(X)
# print(y_pred, y)
# print(np.sum((y_pred - y) ** 2) / len(X))
# print(clf.score(X, y))
# print(clf.best_estimator_)
# print(clf.best_score_)
# print(clf.best_params_)
# print(clf.cv_results_)

print("coef:", clf.coef_)
print(clf.n_iter_)
y_pred = clf.predict(X)
print("y:", y)
print("y_pred:", y_pred)
print(np.sum((y_pred - y)**2) / len(X))
print(clf.score(X, y))
print(clf.get_params())
joblib.dump(clf, "poly_Ridge_20k.pkl")

# clf = joblib.load('filename.pkl')
Example #7
def main():
    #load data
    dnn = VGG16
    #dnn = ResNet50
    #dnn = VGG19

    X_train, y_train, X_test, y_test = load_data(hdf5_path)

    # if features are not obtained yet
    #get_features(hdf5_path, dnn, 'block2_pool');

    print("Loading feature from HDF5 files...")
    hdf5_manip = MyHDF5()

    hdf5_manip.hfile = os.path.join(hdf5_path, dnn.__name__,
                                    "train_features.hdf5")
    train_features = hdf5_manip.load()

    hdf5_manip.hfile = os.path.join(hdf5_path, dnn.__name__,
                                    "test_features.hdf5")
    test_features = hdf5_manip.load()

    hdf5_manip.hfile = os.path.join(hdf5_path, "y_train.hdf5")
    y_train = hdf5_manip.load()

    hdf5_manip.hfile = os.path.join(hdf5_path, "y_test.hdf5")
    y_test = hdf5_manip.load()

    train_model = False
    alpha = 15.0

    if train_model:
        print("Training linear regression model...")
        clf = Ridge(alpha=alpha)
        tic = time.time()
        clf.fit(train_features, y_train)
        toc = time.time()
        print("Time elapsed: {0} seconds".format(toc - tic))

        print("Predicting...")
        predictions = clf.predict(test_features)

        mse = metrics.mean_squared_error(y_test, predictions)
        print("MSE: {0}".format(mse))

    pickle_manip = MyPickle()
    pickle_manip.pfile = os.path.join(pickle_path, dnn.__name__,
                                      "clf_all_" + str(alpha) + ".sav")

    if train_model:
        # Store the model in a pickle file
        pickle_manip.dump(clf)

    clf = pickle_manip.load()
    print("\nRidge model: ")
    print(clf.get_params())
    print(clf.score(train_features, y_train))

    print("\nModel on Train data:")
    pred = clf.predict(train_features)
    print(pred)
    print(metrics.mean_squared_error(y_train, pred))

    print("\nModel on Test data:")
    predictions = clf.predict(test_features)
    print(predictions)
    mse = metrics.mean_squared_error(y_test, predictions)
    print("\nMSE: {0}".format(mse))

    #### Test results

    extract_features = False

    if extract_features:
        # image parameters
        width = 50
        height = 50
        channels = 3

        # grab all filenames
        extensions = [".jpg"]
        file_names = [
            fn for fn in os.listdir(test_img_path) if any(
                fn.endswith(ext) for ext in extensions)
        ]

        # initialize image array which holds frames
        num_imgs = len(file_names)
        X = np.empty((num_imgs, height, width, channels), dtype='float32')

        print("\nConverting jpegs to numpy arrays...\n")

        img_size = (height, width)

        for idx in range(num_imgs):
            if idx % 1000 == 0:
                print("Converting image {0}".format(file_names[idx]))
            file_path = os.path.join(test_img_path, file_names[idx])
            img_manip = MyImage(file_path, img_size)
            img = img_manip.conv_jpg2array()
            X[idx] = img

        print("\nWriting X to HDF5...")
        hdf5_manip = MyHDF5()

        hdf5_manip.hfile = os.path.join(test_out_path, "X.hdf5")
        hdf5_manip.write(X)

        base_model = dnn(weights='imagenet', include_top=False)

        # newer Keras releases expect the `inputs=`/`outputs=` keyword names here
        model = Model(inputs=base_model.input,
                      outputs=base_model.get_layer('block2_pool').output)

        print("Computing X features...")
        X_features = model.predict(X)
        print("X features shape, before reshape: {0}".format(X_features.shape))
        X_features = np.reshape(X_features, (X_features.shape[0], -1))
        print("X features shape, after reshape: {0}".format(X_features.shape))

        print("Writing X feature to HDF5 files...")
        hdf5_manip = MyHDF5()

        hdf5_manip.hfile = os.path.join(test_out_path, dnn.__name__,
                                        "X_features.hdf5")
        hdf5_manip.write(X_features)
    else:
        print("Loading X feature from HDF5 files...")
        hdf5_manip = MyHDF5()

        hdf5_manip.hfile = os.path.join(test_out_path, dnn.__name__,
                                        "X_features.hdf5")
        X_features = hdf5_manip.load()

    print(X_features.shape)
    predictions = clf.predict(X_features)
    print(predictions)
Example #8
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import Ridge

data = pd.read_csv("../resources/data.csv")

r = Ridge()
r.set_params(alpha=10)
print(r.get_params()['alpha'])
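
LeaveOneOut and the CSV are imported and loaded above but never used; a minimal sketch of wiring them together (assuming, hypothetically, that the last column of data.csv is the target):

from sklearn.model_selection import cross_val_score

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# One fit per sample; each score is the negative squared error of the
# single held-out point.
scores = cross_val_score(r, X, y, cv=LeaveOneOut(),
                         scoring="neg_mean_squared_error")
print(-scores.mean())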
Example #9
class Regressor(object):

    """Wraps scikit regressors"""

    def __init__(self, modelname='Linear', num_bagged_est=None, random_state=None, **kwargs):
        """Construct a regressor
    
        Parameters
        ----------
        modelname : str, model name to be used as regressor
            Available models:
            - "XGBoost", 
            - "LightGBM",
            - "Keras", 
            - "RandomForest", 
            - "ExtraTrees", 
            - "Tree", 
            - "Bagging", 
            - "AdaBoost" 
            - "Linear"
        num_bagged_est: int or None
            Number of estimators to be averaged after bagged fitting. 
            If None then bagged fitting is not performed. 
        random_state:  int, RandomState instance or None, optional, default=None
            If int, random_state is the seed used by the random number generator;
            If RandomState instance, random_state is the random number generator; 
            If None, the random number generator is the RandomState instance used by models. 
        **kwargs : default = None
            Parameters of the corresponding regressor.
            Examples : n_estimators, max_depth, ...
        """
        if not _IS_SKLEARN_INSTALLED:
            raise ValueError('Scikit-learn is required for this module')
            
        self.__modelname = modelname
        if self.__modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
            raise ValueError('Package XGBoost is not installed.')
        elif self.__modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
            raise ValueError('Package LightGBM is not installed.')
        elif self.__modelname == "Keras" and not _IS_KERAS_INSTALLED:
            raise ValueError('Package Keras is not installed.')

        self.__regressor = None
        self.__set_regressor(self.__modelname)
        self.set_params(**kwargs)
        
        self.__num_bagged_est = num_bagged_est
        if type(self.__num_bagged_est) != int and self.__num_bagged_est is not None:
            raise ValueError("num_bagged_est must be either None or an integer.")
        self.__random_state = random_state
        if type(self.__random_state) != int and self.__random_state is not None:
            raise ValueError("random_state must be either None or an integer.")
        self.set_params(random_state=self.__random_state)
        
        self.__fitOK = False
        self.__bagged_est = None
        
    def get_params(self, deep=True):

        params = {}
        params.update({"modelname": self.__modelname,
                       "num_bagged_est": self.__num_bagged_est,
                       "random_state": self.__random_state})
        params.update(self.__regressor.get_params())

        return params
    
    def set_params(self, **params):

        self.__fitOK = False
        self.__bagged_est = None

        if 'modelname' in params.keys():
            self.__set_regressor(params['modelname'])
            del params['modelname']
            if self.__modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
                raise ValueError('Package XGBoost is not installed.')
            elif self.__modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
                raise ValueError('Package LightGBM is not installed.')
            elif self.__modelname == "Keras" and not _IS_KERAS_INSTALLED:
                raise ValueError('Package Keras is not installed.')
                    
        if 'num_bagged_est' in params.keys():
            self.__num_bagged_est = params['num_bagged_est']
            del params['num_bagged_est']
            if type(self.__num_bagged_est) != int and self.__num_bagged_est is not None:
                raise ValueError("num_bagged_est must be either None or an integer.")
                
        if 'random_state' in params.keys():
            self.__random_state = params['random_state']
            if 'random_state' not in self.__regressor.get_params().keys():
                del params['random_state']
            if type(self.__random_state) != int and self.__random_state is not None:
                raise ValueError("random_state must be either None or an integer.")
        
        if 'build_fn' in params.keys() and self.get_estimator_name == 'Keras':
            setattr(self.__regressor, 'build_fn', params['build_fn'])
            del params['build_fn']
            
        self.__regressor.set_params(**params)
                    
    def __set_regressor(self, modelname):

        self.__modelname = modelname

        if(modelname == 'XGBoost'):
            self.__regressor = XGBRegressor()

        elif(modelname == "LightGBM"):
            self.__regressor = LGBMRegressor()
        
        elif(modelname == "Keras"):
            self.__regressor = KerasRegressor(build_fn=Sequential())
            
        elif(modelname == 'RandomForest'):
            self.__regressor = RandomForestRegressor()

        elif(modelname == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor()

        elif(modelname == 'Tree'):
            self.__regressor = DecisionTreeRegressor()

        elif(modelname == "Bagging"):
            self.__regressor = BaggingRegressor()

        elif(modelname == "AdaBoost"):
            self.__regressor = AdaBoostRegressor()

        elif(modelname == "Linear"):
            self.__regressor = Ridge()

        else:
            raise ValueError(
                "Model name invalid. Please choose between LightGBM " +
                "(if installed), XGBoost(if installed), Keras(if installed)," +
                "RandomForest, ExtraTrees, Tree, Bagging, AdaBoost or Linear")
            
    def fit(self, X, y, **kwargs):
        """Fit model. In case num_bagged_est is not None then additionally 
        performing a type of bagging ensamble - ensamble from the same models, 
        but with different seed values/reshuffled data which aims to decrease
        variance of the predictions.
        
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.  
            
        Returns
        -------
        object
            self
        """
        y = self.__process_target(y)
            
        if self.__num_bagged_est is None:
            self.__regressor.fit(X, y, **kwargs)

        else:
            if not hasattr(self.__regressor, 'random_state'):
                warnings.warn("The regressor " + str(self.__modelname) +
                              " has no random_state attribute and only random" +
                              " shuffling will be used.")

            self.__bagged_est = []
            for i in range(0, self.__num_bagged_est):
                # random_state may be None; fall back to 0 so the offset works.
                seed = (self.__random_state if self.__random_state is not None else 0) + i
                X_shuff, y_shuff = shuffle(X, y, random_state=seed)
                # Fit a fresh copy each round; reusing the shared estimator
                # object would leave every bagged entry pointing at the same
                # fitted model.
                est = self.get_estimator_copy()
                if hasattr(est, 'random_state'):
                    est.set_params(random_state=seed)
                est.fit(X_shuff, y_shuff, **kwargs)
                self.__bagged_est.append(est)
                
        self.__fitOK = True
        
        return self
    
    def predict(self, X):

        """Predicts the target.
        
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            
        Returns
        -------
        array of shape = [n_samples, ] 
            The target to be predicted.
        """

        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:
            if self.__num_bagged_est is None:
                return self.__regressor.predict(X)
            else:
                bagged_pred = np.zeros(X.shape[0])
                for c, est in enumerate(self.__bagged_est): 
                    bagged_pred += est.predict(X) / self.__num_bagged_est
                    
        else:
            raise ValueError("You must call the fit function before !")
        
        return bagged_pred
 
    def transform(self, X):

        """Transforms X.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array-like or sparse matrix of shape = [n_samples, n_features]
            The transformed X.
        """

        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            return self.__regressor.transform(X)
        else:
            raise ValueError("You must call the fit function before !")


    def score(self, X, y, sample_weight=None):

        """Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        Returns
        -------
        float
            R^2 of self.predict(df) wrt. y.
        """

        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            return self.__regressor.score(X, y, sample_weight)
        else:
            raise ValueError("You must call the fit function before !")
            
    def cross_val_predict(self, X, y, cv=None, scoring=None, **kwargs):
        
        """Performing cross validation hold out predictions for stacking.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.
        scoring : callable, default: None
                A callable to evaluate the predictions on the cv set.
                None, accuracy score
        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.         
        Returns
        -------
        array of shape = [n_samples, ]
            The hold out target
        """
        y = self.__process_target(y)
        
        y_pred = np.zeros(X.shape[0]) 
        
        cv = check_cv(cv, y, classifier=False)
        n_splits = cv.get_n_splits(X, y)
           
        if scoring is None:
            scoring = make_scorer(accuracy_score)
            
        i = 0 
        score_mean = 0.0
        print("Starting hold out prediction with {} splits.".format(n_splits))
        for train_index, cv_index in cv.split(X, y): 
            X_train = X[train_index]    
            y_train = y[train_index]
            X_cv = X[cv_index]
            y_cv = y[cv_index]
            
            est = self.get_estimator()
            est.fit(X_train, y_train, **kwargs)
            y_pred_cv = est.predict(X_cv)
            
#            score = scoring(y_cv, y_pred_proba_cv)                        
            
#            print("Train size: {} ::: cv size: {} score (fold {}/{}): {:.4f}".format(len(train_index), len(cv_index), i + 1, n_splits, score)) 
#            score_mean += score / float(n_splits)
            
            y_pred[cv_index] = y_pred_cv
            
            i += 1 
        
#        print("Mean score: {:.4f}".format(score_mean))    

        return y_pred
        
    def cross_validate(self, X, y, cv=None, scoring=None, **kwargs):
        """Performing a cross validation method.
        
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.
        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.
        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.
        scoring : 
            For scikit learn models:
                string, callable, list/tuple, dict or None, default: None
                A single string or a callable to evaluate the predictions on the test set.
                None, the estimator’s default scorer (if available) is used.
            For LightGBM:
                callable or None, optional (default=None)
                Customized evaluation function.
                Note: should return (eval_name, eval_result, is_higher_better) or list of such tuples.
            For XGBoost:
                callable or None, optional (default=None)
                Customized evaluation function.  
        **kwargs : default = None
            Additional fitting arguments.  
            
        Returns
        -------
        object
            self
        """  
        y = self.__process_target(y)
        
        if self.get_estimator_name == 'LightGBM':
            params = self.__regressor.get_params()
            data = lgb.Dataset(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = lgb.cv(params, data, feval=scoring, folds=cv, **kwargs)
        
        elif self.get_estimator_name == 'XGBoost':
            params = self.__regressor.get_xgb_params()
            data = xgb.DMatrix(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = xgb.cv(params, data, feval=scoring, folds=cv, **kwargs)

        else:  
            ret = cross_validate(self.__regressor, X, y, cv=cv, scoring=scoring)
        
        return ret
    
    def __process_target(self, y):
        
        y = np.array(y, dtype='float') 
               
        return y
    
    def get_estimator(self):

        return self.__regressor  # was self.__classifier, which is never defined

    def get_estimator_copy(self):

        return make_copy(self.__regressor)
    
    @property
    def feature_importances_(self):  
        if self.__fitOK:
            
            if hasattr(self.__regressor, 'feature_importances_'):
                return self.__regressor.feature_importances_
            else:
                raise ValueError('The regressor ' + self.get_estimator_name + 
                                 ' does not have feature_importances_ attribute.')
                
        else:
            
            raise ValueError("You must call the fit function before !")
            
    @property
    def get_estimator_name(self):
        
        return self.__modelname
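
A brief usage sketch of the wrapper above (toy data; assumes the class and its scikit-learn imports are in scope):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

reg = Regressor(modelname='Linear', alpha=0.5, random_state=42)
print(reg.get_params()['alpha'])  # kwargs are forwarded to the underlying Ridge
reg.fit(X, y)
print(reg.score(X, y))

# Bagged variant: averages five Ridge fits on reshuffled copies of the data.
bagged = Regressor(modelname='Linear', num_bagged_est=5, random_state=42)
bagged.fit(X, y)
print(bagged.predict(X[:3]))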
Example #10
# In[20]:

mean_absolute_error(y_valid, ridge_regr.predict(X_valid_sparse))

# #### Try RidgeCV

# In[37]:

kf = KFold(n_splits=3, shuffle=True, random_state=42)
alphas = np.logspace(-2, 6, 10)
ridge = Ridge(random_state=42)

# In[15]:

ridge.get_params()

# In[41]:

params = {'alpha': np.logspace(-2, 2, 30)}

# In[42]:

# %%time
grid = GridSearchCV(ridge, param_grid=params,
                    scoring='neg_mean_absolute_error',
                    cv=kf, n_jobs=-1, verbose=1)
grid.fit(X_train_part_sparse, y_train_part)

# In[44]:

mean_absolute_error(y_valid, grid.predict(X_valid_sparse))
Example #11
def ridge(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(1, 100, 100)

    random_grid = {'alpha': alpha}

    ridge = Ridge(random_state=42)

    # Look at the parameters currently used by our ridge model
    print('Parameters currently in use:\n')
    pprint(ridge.get_params())

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    ridge_random = RandomizedSearchCV(estimator=ridge,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(),
                                      verbose=2,
                                      random_state=42,
                                      n_jobs=-1)

    # Fit the random search model
    ridge_random.fit(train_X, train_y)
    pprint(ridge_random.best_params_)
    cv_result_rd = ridge_random.cv_results_

    BestPara_random = ridge_random.best_params_
    print(BestPara_random)

    ## Grid search of parameters, using 3 fold cross validation based on Random search
    from sklearn.model_selection import GridSearchCV

    # Refine the alpha grid around the best value from the random search
    alpha = np.linspace(BestPara_random["alpha"] - 1,
                        BestPara_random["alpha"] + 1, 10)

    # Create the random grid
    grid_grid = {'alpha': alpha}

    ridge_grid = GridSearchCV(estimator=ridge,
                              param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(),
                              verbose=2,
                              n_jobs=-1)
    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_

    pprint(ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Fit the base line search model
    ridge.fit(train_X, train_y)

    #prediction
    predict_y = ridge_random.predict(test_X)
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)

    # Performance metrics

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(
            predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(
            mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = (mean_squared_error(predict_y_grid,
                                         test_y))  #,squared = False))
    errors_Random_CV = (mean_squared_error(predict_y,
                                           test_y))  #,squared = False))
    errors_baseline = (mean_squared_error(predict_y_base,
                                          test_y))  #,squared = False))

    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('ridge results:', results)

    if True:
        fig = plt.figure(figsize=(15, 8))
        x_axis = range(3)

        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        #plt.show()
        plt.savefig('ridge_error_compare.png')

        print('min index:', results.index(min(results)))

        if results.index(min(results)) == 0:  # `is` on int literals is unreliable
            model = ridge_grid
            pred_y = predict_y_grid
        elif results.index(min(results)) == 1:
            model = ridge_random
            pred_y = predict_y  # was predict_y_random, which is undefined
        else:
            model = ridge  # was `mode`, a typo
            pred_y = predict_y_base

        #feature importance
        #predictors = x_train.columns
        #coef = Series(lreg.coef_,predictors).sort_values()
        #coef.plot(kind='bar', title='Model Coefficients, kflod:'+str(kfold))

        #plt.show()
        #plt.savefig('ridge_feature_importance.png')

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(pred_y))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, pred_y, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        #plt.show()
        plt.savefig('ridge_prediction.png')

    return ridge_grid.predict, ridge_grid.best_estimator_
Example #12
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV

y = dataMat[:, 0]  # target variable y


# ======== Ridge regression ========
alphas = 10**np.linspace(-3, 3, 100)
# Ridge takes a single alpha, so the original `Ridge(alpha=alphas)` was wrong;
# RidgeCV accepts a list of candidates and picks the best one by cross-validation.
model = RidgeCV(alphas=alphas, store_cv_values=True)
model.fit(X, y)   # fit the linear model
print(model.alpha_)  # best alpha found by cross-validation
print(model.cv_values_.shape)  # per-sample loss values for each alpha
plt.plot(alphas, model.cv_values_.mean(axis=0))
plt.plot(model.alpha_, min(model.cv_values_.mean(axis=0)), "bo")
print(model.coef_)  # coefficients
print(model.intercept_)  # intercept
print(model.score(X, y))  # R^2, goodness of fit
print(model.get_params())  # get parameter info
print(model.set_params(fit_intercept=False))  # reset parameters
# print('best cross-validated alpha', model.alpha_)  # only available with RidgeCV
# Predict with the model
predicted = model.predict(X)

# Scatter plot: x on the horizontal axis, y on the vertical axis
plt.scatter(X, y, marker='x')
plt.plot(X, predicted, c='r')

# Label the x and y axes
plt.xlabel("x")
plt.ylabel("y")

# Show the figure
plt.show()
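
For reference, the same alpha sweep can be run with GridSearchCV, though RidgeCV's built-in leave-one-out scheme is much cheaper for this model; a small sketch on synthetic data:

import numpy as np
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X_demo = rng.rand(60, 1)
y_demo = 2.5 * X_demo.ravel() + rng.normal(scale=0.2, size=60)

alphas = 10**np.linspace(-3, 3, 100)
loo_model = RidgeCV(alphas=alphas).fit(X_demo, y_demo)  # efficient leave-one-out
grid_model = GridSearchCV(Ridge(), {'alpha': alphas}, cv=5).fit(X_demo, y_demo)
print(loo_model.alpha_, grid_model.best_params_['alpha'])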
Example #13
def ridge(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(0, 1, 10)
    random_grid = {'alpha': alpha}

    ridge = Ridge(random_state=42)

    from pprint import pprint

    # Look at the parameters currently used by our ridge model
    print('Parameters currently in use:\n')
    pprint(ridge.get_params())

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    ridge_random = RandomizedSearchCV(estimator=ridge,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(),
                                      verbose=2,
                                      random_state=42,
                                      n_jobs=-1)

    # Fit the random search model
    ridge_random.fit(train_X, train_y)
    pprint(ridge_random.best_params_)
    cv_result_rd = ridge_random.cv_results_

    BestPara_random = ridge_random.best_params_

    ## Grid search of parameters, using 3 fold cross validation based on Random search
    from sklearn.model_selection import GridSearchCV

    # Refine the alpha grid around the best value from the random search.
    # The original used range(), which fails because the best alpha is a
    # float; a small linspace window keeps the search continuous.
    alpha = np.linspace(max(BestPara_random["alpha"] - 0.1, 0.0),
                        BestPara_random["alpha"] + 0.1, 10)

    # Create the random grid
    grid_grid = {'alpha': alpha}

    ridge_grid = GridSearchCV(estimator=ridge,
                              param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(),
                              verbose=2,
                              n_jobs=-1)
    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_

    pprint(ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Fit the base line search model
    ridge.fit(train_X, train_y)

    #prediction
    predict_y = ridge_random.predict(test_X)
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)
    # Performance metrics
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import mean_squared_error

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(
            predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(
            mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = (mean_squared_error(predict_y_grid, test_y,
                                         squared=False))
    errors_Random_CV = (mean_squared_error(predict_y, test_y, squared=False))
    errors_baseline = (mean_squared_error(predict_y_base,
                                          test_y,
                                          squared=False))

    x_axis = range(3)
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    plt.bar(x_axis, results)
    plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
    plt.show()

    # =============================================================================
    #     #feature importance
    #     num_feature = len(rf_random.best_estimator_.feature_importances_)
    #     plt.figure(figsize=(12,6))
    #     plt.bar(range(0,num_feature*4,4),rf_random.best_estimator_.feature_importances_)
    #     label_name = X.keys()
    #     plt.xticks(range(0,num_feature*4,4), label_name)
    #     plt.title("Feature Importances")
    #     plt.show()
    # =============================================================================

    #feature importance: Ridge has no feature_importances_ attribute,
    #so use the magnitude of the fitted coefficients instead
    coef_magnitude = np.abs(ridge_grid.best_estimator_.coef_)
    num_feature = len(coef_magnitude)
    plt.figure(figsize=(24, 6))
    plt.bar(range(0, num_feature * 4, 4), coef_magnitude)

    label_name = X.keys()

    plt.xticks(range(0, num_feature * 4, 4), label_name)
    plt.title("Feature Importances" + ",kfold=" + str(kfold))
    plt.show()
    fig = plt.figure(figsize=(20, 8))
    ax = fig.gca()
    x_label = range(0, len(predict_y_grid))
    plt.title("kfold=" + str(kfold))
    ax.plot(x_label, predict_y_grid, 'r--', label="predict")
    ax.plot(x_label, test_y, label="ground_truth")
    ax.set_ylim(0, 200)
    ax.legend()
    plt.show()

    return ridge_grid.predict, ridge_grid.best_estimator_
Example #14
def single_model(args):
    import h5py
    import pandas as pd
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, prepare_output_file, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    #phenotypes = pd.read_table(args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: ' + args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)

    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0**np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(sample_indices,
                                                    parent_table,
                                                    k_female=5,
                                                    k_male=5)
        cv_metrics = {}
        cv_metrics['mse'] = np.zeros((len(alphas), train_masks.shape[0]))
        cv_metrics['r2'] = np.zeros((len(alphas), train_masks.shape[0]))
        pbar = tqdm(total=len(alphas) * train_masks.shape[0])
        for i, alpha in enumerate(alphas):
            for j in range(train_masks.shape[0]):
                model = Ridge(alpha=alpha)
                model.fit(X[train_masks[j]], y[train_masks[j]])
                y_pred = model.predict(X[test_masks[j]])
                cv_metrics['mse'][i, j] = mean_squared_error(
                    y[test_masks[j]], y_pred)
                cv_metrics['r2'][i, j] = r2_score(y[test_masks[j]], y_pred)
                pbar.update(1)
        pbar.close()
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optimized alpha = %f' % best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s' % repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        kernel = Linear(input_dim=2, name='linear')
        # GPy expects a 2-D target array
        model = GPRegression(X_train, y_train.reshape(-1, 1), kernel=kernel)
        model.optimize()
        # GPy's predict returns (mean, variance); keep the mean so the
        # r2 report below also works for this branch
        y_pred = np.ravel(model.predict(X)[0])
        y_pred_train = y_pred[sample_indices]

    else:
        raise ValueError('unknown model name: ' + args.model_name)

    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key in cv_metrics.keys():
                g.create_dataset(key, data=cv_metrics[key])
Example #15
        scores_dict = {}
        mse_dict = {}
        mse_dict_raw = {}
        mae_dict = {}
        mae_dict_raw = {}
        mape_dict = {}

        scores_dict_f3 = {}
        mse_dict_f3 = {}
        mse_dict_f3_raw = {}
        mae_dict_f3 = {}
        mae_dict_f3_raw = {}
        mape_dict_f3 = {}
        test_start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

        getparams_dict = aux_reg_regressor.get_params(deep=True)
        print("getparams_dict: ", getparams_dict)
        getparams_df = pd.DataFrame.from_dict(data=getparams_dict,
                                              orient='index')
        getparams_df.to_csv(analysis_path + model_id + str(model)[:-4] +
                            "getparams.csv")
        model_as_pkl_filename = analysis_path + model_id + str(
            model)[:-4] + ".pkl"
        joblib.dump(aux_reg_regressor, filename=model_as_pkl_filename)
        #np.savetxt(analysis_path + "rf5getparams.txt",fmt='%s',X=str(aux_reg_regressor.get_params(deep=True)))
        #np.savetxt(analysis_path + "rf5estimatorparams.txt",fmt='%s',X=aux_reg_regressor.estimator_params) USELESS
        #np.savetxt(analysis_path + "rf5classes.txt",fmt='%s',X=aux_reg_regressor.classes_)
        #np.savetxt(analysis_path + "rf5baseestim.txt",fmt='%s',X=aux_reg_regressor.base_estimator_)

        #TODO: CHANGE THIS BACK IF CUT SHORT!!
        for files in combined_filenames:
Example #16
def ridge(X, Y, kfold=3, feature_set=None):

    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # base
    ridge = Ridge(random_state=42)
    ridge.fit(train_X, train_y)
    print('Base Parameters in use:')
    print(ridge.get_params())

    # grid search over log-spaced alphas from 1e-8 to 1e5
    alpha = np.logspace(-8, 5, 14).tolist()

    grid_grid = {'alpha': alpha}
    ridge_grid = GridSearchCV(estimator=ridge,
                              param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(),
                              verbose=2,
                              n_jobs=-1)

    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_
    print("grid search, best parameter:", ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Refine around the best alpha; note this assumes the winning alpha is
    # large (>= ~11), otherwise range(100, 10 * alpha, 10) is empty.
    new_measure = int(ridge_grid.best_params_['alpha'])
    new_alpha = [x for x in range(100, new_measure * 10, 10)]
    print(new_alpha)
    grid_grid = {'alpha': new_alpha}
    ridge_grid2 = GridSearchCV(estimator=ridge,
                               param_grid=grid_grid,
                               scoring='neg_mean_squared_error',
                               cv=ps2.split(),
                               verbose=2,
                               n_jobs=-1)

    # Fit the grid search model
    ridge_grid2.fit(train_X, train_y)
    BestPara_grid = ridge_grid2.best_params_
    print("grid search, best parameter:", ridge_grid2.best_params_)
    cv_results_grid2 = ridge_grid2.cv_results_

    #prediction
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)
    predict_y_grid2 = ridge_grid2.predict(test_X)

    # Performance metrics
    errors_Grid_CV = np.sqrt(mean_squared_error(predict_y_grid, test_y))
    errors_Grid2_CV = np.sqrt(mean_squared_error(predict_y_grid2, test_y))
    errors_baseline = np.sqrt(mean_squared_error(predict_y_base, test_y))

    results = [errors_Grid2_CV, errors_Grid_CV, errors_baseline]
    print('ridge results:', results)

    return ridge_grid2
Example #17
data = data.dropna()

#Train test split:
data.dropna(axis=1, how='any', inplace=True)
a = len(data.T) - 1  # The last column is the label
X = data.iloc[:, range(0, a)]
Y = data.iloc[:, a]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.34)

#fit Ridge regression model:
model = Ridge(alpha=1).fit(X_train, Y_train)
pred = model.predict(X_test)

#Get scores and analysis
score_r = model.score(X_test, Y_test)
print("Model = {}, R^2 = {}".format(model.get_params(), score_r))
print(score_r)

# X_test1.to_csv('xtest.csv',na_rep='NA',index=False)
# X_test1['dataset'].to_csv('ytest.csv',na_rep='NA',index=False)
#print model.coef_
#print X_test.columns
#print zip(model.coef_, X_test.columns)

#print pred
#print Y_test

# Squared deviation normalized by the product of the means
# (note: no square root is taken, despite the NRMSD name)
NRMSD = (pred - Y_test)**2 / (pred.mean() * Y_test.mean())
NRMSD = NRMSD.mean()
print("NRMSD:")
print(NRMSD)
Example #18
ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

clf = Ridge(alpha=2.0)
clf.fit(X_train, Y_train)
clf.score(X_cv, Y_cv)
clf.get_params(deep=True)
arreglo = clf.predict(X_train)
readerWriter = ReaderAndWriter()
readerWriter.write_file([arreglo], 'answer1.txt')


def date_substraction(date1, date2):
    d1 = date1.split("-")
    d2 = date2.split("-")
    yearDiff = int(d1[0]) - int(d2[0])
    monthDiff = int(d1[1]) - int(d2[1])
    dayDiff = int(d1[2]) - int(d2[2])
    return dayDiff + 30 * monthDiff + 365 * yearDiff


def days_between(d1, d2):