Beispiel #1
0
def OMP_cv(problem, **kwargs):
    r"""Select problem data via cross-validated Orthogonal Matching Pursuit.

    The value vectors in ``problem.data`` are stacked row-wise, transposed,
    and fitted against the goal values with ``OrthogonalMatchingPursuitCV``.
    Data entries whose fitted coefficient magnitude exceeds the tolerance
    are returned together with the model's score on the same data.

    Requirements
    ------------
    kwargs['choose'] must be a positive integer

    kwargs['coef_tolerance'] must be a nonnegative float

    Returns
    -------
    output : tuple
        (optimum, maximum)

    """
    stacked = numpy.array([entry['data']['values'] for entry in problem.data])
    goal_values = problem.goal['data']['values']
    selector = OrthogonalMatchingPursuitCV(max_iter=kwargs['choose'])
    selector.fit(stacked.T, goal_values)
    threshold = kwargs['coef_tolerance']
    # keep every datum whose coefficient survives the tolerance cut
    optimum = []
    for position, coefficient in enumerate(selector.coef_):
        if abs(coefficient) > threshold:
            optimum.append(problem.data[position])
    maximum = selector.score(stacked.T, goal_values)
    return (optimum, maximum)
Beispiel #2
0
def plot_omp():
    """Demonstrate sparse-signal recovery with Orthogonal Matching Pursuit.

    Generates a sparse coded signal y = Xw with a known number of non-zero
    coefficients and plots four stem panels: the true sparse signal, the OMP
    reconstruction from noise-free measurements, the OMP reconstruction from
    noisy measurements, and a reconstruction whose sparsity level is chosen
    by cross-validation (OrthogonalMatchingPursuitCV).
    """
    n_components, n_features = 512, 100
    n_nonzero_coefs = 17

    # generate the data:  y = Xw  with  |w|_0 = n_nonzero_coefs
    y, X, w = make_sparse_coded_signal(n_samples=1,
                                       n_components=n_components,
                                       n_features=n_features,
                                       n_nonzero_coefs=n_nonzero_coefs,
                                       random_state=0)

    idx, = w.nonzero()

    # distort the clean signal
    y_noisy = y + 0.05 * np.random.randn(len(y))

    def _stem_panel(position, title, indices, heights):
        # One row of the 4x1 grid; every panel shares the same x-range.
        plt.subplot(4, 1, position)
        plt.xlim(0, 512)
        plt.title(title)
        plt.stem(indices, heights, use_line_collection=True)

    plt.figure(figsize=(7, 7))

    # panel 1: the true sparse signal
    _stem_panel(1, "Sparse signal", idx, w[idx])

    # panel 2: noise-free reconstruction at the known sparsity level
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
    omp.fit(X, y)
    coef = omp.coef_
    idx_r, = coef.nonzero()
    _stem_panel(2, "Recovered signal from noise-free measurements",
                idx_r, coef[idx_r])

    # panel 3: same model refit on the noisy measurements
    omp.fit(X, y_noisy)
    coef = omp.coef_
    idx_r, = coef.nonzero()
    _stem_panel(3, "Recovered signal from noisy measurements",
                idx_r, coef[idx_r])

    # panel 4: number of non-zeros selected by cross-validation
    omp_cv = OrthogonalMatchingPursuitCV()
    omp_cv.fit(X, y_noisy)
    coef = omp_cv.coef_
    idx_r, = coef.nonzero()
    _stem_panel(4, "Recovered signal from noisy measurements with CV",
                idx_r, coef[idx_r])

    plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
    plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit',
                 fontsize=16)
    plt.show()
Beispiel #3
0
def test_omp_cv():
    """CV-selected OMP must recover the known sparsity and match plain OMP."""
    target = y[:, 0]
    expected_coef = gamma[:, 0]
    cv_model = OrthogonalMatchingPursuitCV(normalize=True, fit_intercept=False,
                                           max_iter=10, cv=5)
    cv_model.fit(X, target)
    assert_equal(cv_model.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(cv_model.coef_, expected_coef)
    # Plain OMP at the CV-chosen sparsity should yield identical coefficients.
    plain_model = OrthogonalMatchingPursuit(
        normalize=True, fit_intercept=False,
        n_nonzero_coefs=cv_model.n_nonzero_coefs_)
    plain_model.fit(X, target)
    assert_array_almost_equal(cv_model.coef_, plain_model.coef_)
Beispiel #4
0
def test_omp_cv():
    """Check OrthogonalMatchingPursuitCV against the known ground truth."""
    y_col, gamma_col = y[:, 0], gamma[:, 0]
    ompcv = OrthogonalMatchingPursuitCV(
        normalize=True, fit_intercept=False, max_iter=10, cv=5)
    ompcv.fit(X, y_col)
    # CV should select exactly the true number of non-zero coefficients.
    assert_equal(ompcv.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(ompcv.coef_, gamma_col)
    # A non-CV fit at that sparsity must reproduce the same solution.
    reference = OrthogonalMatchingPursuit(
        normalize=True, fit_intercept=False,
        n_nonzero_coefs=ompcv.n_nonzero_coefs_)
    reference.fit(X, y_col)
    assert_array_almost_equal(ompcv.coef_, reference.coef_)
class _OrthogonalMatchingPursuitCVImpl:
    """Thin adapter that forwards fit/predict to the wrapped ``Op`` estimator."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters and build the underlying model once.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, omitting ``y`` when it is not supplied."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)
Beispiel #6
0
def test_omp_cv():
    """OMP-CV should find the true sparsity and agree with a plain OMP refit."""
    # FIXME: This test is unstable on Travis, see issue #3190 for more detail.
    check_skip_travis()
    response = y[:, 0]
    truth = gamma[:, 0]
    cv_fit = OrthogonalMatchingPursuitCV(normalize=True, fit_intercept=False,
                                         max_iter=10, cv=5)
    cv_fit.fit(X, response)
    assert_equal(cv_fit.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(cv_fit.coef_, truth)
    refit = OrthogonalMatchingPursuit(normalize=True, fit_intercept=False,
                                      n_nonzero_coefs=cv_fit.n_nonzero_coefs_)
    refit.fit(X, response)
    assert_array_almost_equal(cv_fit.coef_, refit.coef_)
Beispiel #7
0
def train_regression_model(X, y, model_type='elastic cv', cv=3, extra_params=None):
    '''Wrapper function to train various regression models with X,y input,
       where extra params can be passed to override any default parameters.

    Parameters
    ----------
    X, y : array-like
        Training features and targets, passed straight to the chosen model.
    model_type : str
        One of 'linear', 'elastic cv', 'omp cv', 'lars cv', 'ridge cv',
        'full lightgbm' (case-insensitive).
    cv : int
        Number of cross-validation folds for the CV-based models.
    extra_params : dict, optional
        Extra keyword arguments, currently only forwarded to Train_Light_GBM.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        For an unrecognized model_type (previously this crashed with
        UnboundLocalError at the final fit call).
    '''
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if extra_params is None:
        extra_params = {}

    model_type = model_type.lower()

    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)

    elif model_type == 'elastic cv':
        model = ElasticNetCV(cv=cv)

    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)

    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)

    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)

    elif model_type == 'full lightgbm':
        # This helper trains internally, so return its result directly.
        model = Train_Light_GBM(X, y, int_cv=cv, regression=True, **extra_params)
        return model

    else:
        raise ValueError('unknown model_type: {!r}'.format(model_type))

    model.fit(X, y)
    return model
Beispiel #8
0
def test_omp_cv():
    """Validate the cross-validated OMP estimator against ground truth."""
    # FIXME: This test is unstable on Travis, see issue #3190 for more detail.
    check_skip_travis()
    target, expected = y[:, 0], gamma[:, 0]
    cv_estimator = OrthogonalMatchingPursuitCV(normalize=True,
                                               fit_intercept=False,
                                               max_iter=10,
                                               cv=5)
    cv_estimator.fit(X, target)
    assert_equal(cv_estimator.n_nonzero_coefs_, n_nonzero_coefs)
    assert_array_almost_equal(cv_estimator.coef_, expected)
    # Fitting plain OMP at the selected sparsity must match coefficient-wise.
    baseline = OrthogonalMatchingPursuit(
        normalize=True,
        fit_intercept=False,
        n_nonzero_coefs=cv_estimator.n_nonzero_coefs_)
    baseline.fit(X, target)
    assert_array_almost_equal(cv_estimator.coef_, baseline.coef_)
Beispiel #9
0
def createOrthogonalMatchingPursuitRegressor(params=None):
    """Build an OMP regressor, choosing the CV variant when requested.

    Returns a dict with the chosen estimator and its tuning parameters.
    """
    info("Creating Orthogonal Matching Pursuit Regressor", ind=4)

    ## Params: fold caller overrides into the defaults of both variants
    params = mergeParams(OrthogonalMatchingPursuit(), params)
    params = mergeParams(OrthogonalMatchingPursuitCV(), params)
    tuneParams = getOrthogonalMatchingPursuitRegressorParams()

    ## estimator — note the check is identity with True, not mere truthiness
    use_builtin_cv = params.get('cv') is True
    if use_builtin_cv:
        info("Using Built-In Cross Validation With Parameters", ind=4)
        reg = OrthogonalMatchingPursuitCV()
    else:
        info("Without Parameters", ind=4)
        reg = OrthogonalMatchingPursuit()

    return {"estimator": reg, "params": tuneParams}
def get_model_by_name(model_name):
    """Return a freshly constructed estimator for the given display name.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the known names (same contract as
        the original direct dict lookup).
    """
    # Lazy factories: only the requested estimator is actually constructed.
    factories = {
        'Linear Regression': lambda: LinearRegression(),
        'Lars CV': lambda: LarsCV(cv=10),
        'Lasso CV': lambda: LassoCV(cv=10),
        'Ridge CV': lambda: RidgeCV(cv=10),
        'Elastic Net CV': lambda: ElasticNetCV(cv=10),
        'Orthogonal Matching Pursuit CV': lambda: OrthogonalMatchingPursuitCV(cv=10),
        'Decision Tree Regressor': lambda: DecisionTreeRegressor(max_depth=3),
    }
    return factories[model_name]()
Beispiel #11
0
    def predict(self):
        """
        Fit scikit-learn's OrthogonalMatchingPursuitCV
        (https://scikit-learn.org) on the training split, predict the test
        features, and score the predictions against the provided y_test.

        Returns
        -------
        The accuracy as computed by OneHotPredictor.get_accuracy, also
        stored on ``self.acc``.
        """
        # NOTE(review): the original assigned an unused local
        # ``n_nonzero_coefs = 17`` that never reached the estimator; removed.
        algorithm = OrthogonalMatchingPursuitCV()
        algorithm.fit(self.X_train, self.y_train)
        y_pred = list(algorithm.predict(self.X_test))
        self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
        return self.acc
Beispiel #12
0
 def test_model_orthogonal_matching_pursuit_cv(self):
     """Convert a fitted OMP-CV regressor to ONNX and validate the dump."""
     fitted_model, X = fit_regression_model(OrthogonalMatchingPursuitCV())
     onnx_model = convert_sklearn(
         fitted_model, "orthogonal matching pursuit cv",
         [("input", FloatTensorType([None, X.shape[1]]))])
     self.assertIsNotNone(onnx_model)
     # Round-trip the model and data through the ONNX dump helper.
     dump_data_and_model(X,
                         fitted_model,
                         onnx_model,
                         verbose=False,
                         basename="SklearnOrthogonalMatchingPursuitCV-Dec4")
Beispiel #13
0
    def get_feature_coefficients(self, norm_prior=1):
        """
        Estimate feature coefficients with a penalized linear regression.

        Linear models penalized with the L1 norm have sparse solutions: many
        of their estimated coefficients are zero.

        Args:
            norm_prior: 0 for the sparsest (L0 / OMP) result, 1 (default)
                for Lasso (L1), 2 for Ridge (L2); anything else prints an
                error and leaves the coefficients as None.
        """
        model = None
        alphas = np.logspace(-4, -0.5, 30)
        tuned_parameters = [{'alpha': alphas}]
        coefficient_value = None
        n_folds = 3
        if norm_prior == 0:
            # L0-norm via cross-validated Orthogonal Matching Pursuit
            model = OrthogonalMatchingPursuitCV()
            model.fit(self.X_df.values, self.y_df.values)
            coefficient_value = model.coef_
        elif norm_prior == 1:
            # L1-norm: Lasso with alpha chosen by grid search
            lasso = Lasso(random_state=0)
            gridsearch = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
            gridsearch.fit(self.X_df.values, self.y_df.values)
            coefficient_value = gridsearch.best_estimator_.coef_
        elif norm_prior == 2:
            # L2-norm: Ridge with alpha chosen by grid search
            ridge = Ridge(random_state=0)
            gridsearch = GridSearchCV(ridge, tuned_parameters, cv=n_folds, refit=False)
            gridsearch.fit(self.X_df.values, self.y_df.values)
            coefficient_value = gridsearch.best_estimator_.coef_
        else:
            print("invalid norm!")

        self.coef_ = coefficient_value
        return coefficient_value
Beispiel #14
0
def solve_preconditioned_orthogonal_matching_pursuit(basis_matrix_func,
                                                     samples,
                                                     values,
                                                     precond_func,
                                                     tol=1e-8):
    """Solve a preconditioned sparse-regression problem with OMP.

    The basis matrix and right-hand side are both row-weighted by the
    preconditioner before fitting. A fixed-tolerance OMP is used when a
    positive tolerance is given (or only one basis column exists);
    otherwise the sparsity is chosen by cross-validation.

    Returns the coefficient column vector, with the fitted intercept
    folded into the first entry.
    """
    matrix = basis_matrix_func(samples)
    weights = precond_func(matrix, samples)
    matrix = matrix * weights[:, np.newaxis]
    rhs = values * weights[:, np.newaxis]
    single_column = matrix.shape[1] == 1
    if single_column or tol > 0:
        solver = OrthogonalMatchingPursuit(tol=tol)
    else:
        solver = OrthogonalMatchingPursuitCV(cv=min(samples.shape[1], 10))
    fitted = solver.fit(matrix, rhs)
    coef = solver.coef_
    coef[0] += fitted.intercept_
    return coef[:, np.newaxis]
Beispiel #15
0
def train_regression_model(X, y, model_type='elastic', cv=3):
    """Train one of several regression models on (X, y) and return it fitted.

    Parameters
    ----------
    X, y : array-like
        Training features and targets.
    model_type : str
        'linear', 'elastic'/'elastic cv', 'omp cv', 'lars cv', 'ridge cv',
        'simple xgboost', 'simple lightgbm' or 'full lightgbm'.
    cv : int
        Fold count for the CV-based models / the lightgbm helper.

    Raises
    ------
    ValueError
        For an unknown model_type.

    Bug fix: the default model_type 'elastic' matched no branch, so calling
    with defaults crashed with UnboundLocalError at ``model.fit``; 'elastic'
    is now accepted as an alias for 'elastic cv', and unknown types fail
    fast with an explicit ValueError.
    """
    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)
    elif model_type in ('elastic', 'elastic cv'):
        model = ElasticNetCV(cv=cv)
    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)
    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)
    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)
    elif model_type == 'simple xgboost':
        model = XGBRegressor()
    elif model_type == 'simple lightgbm':
        model = LGBMRegressor()
    elif model_type == 'full lightgbm':
        # This helper trains internally and returns the finished model.
        model = train_light_gbm_regressor(X, y, cv, n_params=10, test_size=.2)
        return model
    else:
        raise ValueError('unknown model_type: {!r}'.format(model_type))

    model.fit(X, y)
    return model
Beispiel #16
0
def fit_linear_model(basis_matrix, train_vals, solver_type, **kwargs):
    """Fit a sparse linear model with the requested cross-validated solver.

    ``solver_type`` selects one of 'lasso_lars', 'lasso', 'lars' or 'omp';
    the fold count comes from ``kwargs['cv']``. Returns the coefficient
    column vector (intercept folded into the first entry) and the CV score.
    """
    solvers = {
        'lasso_lars': LassoLarsCV(cv=kwargs['cv']).fit,
        'lasso': LassoCV(cv=kwargs['cv']).fit,
        'lars': LarsCV(cv=kwargs['cv']).fit,
        'omp': OrthogonalMatchingPursuitCV(cv=kwargs['cv'], verbose=5).fit
    }
    assert train_vals.ndim == 2
    # Guard clause: fail fast with the list of supported solvers.
    if solver_type not in solvers:
        msg = f'Solver type {solver_type} not supported\n'
        msg += 'Supported solvers are:\n'
        msg += ''.join(f'\t{key}\n' for key in solvers)
        raise Exception(msg)

    fit = solvers[solver_type]
    res = fit(basis_matrix, train_vals[:, 0])

    cv_score = res.score(basis_matrix, train_vals[:, 0])
    coef = res.coef_[:, np.newaxis]
    coef[0] = res.intercept_
    return coef, cv_score
Beispiel #17
0
def check_w(w=[12, 24, 36, 48, 60]):
    '''
    Robustness check over candidate w_min values: for each one, save the
    prediction results (Avew window) and the out-of-sample R-square.

    Parameters
    ----------
    w: possible w_min  (list)
    '''
    for w_min in w:
        # linear ML predictions, each fitted on the averaged window
        ridge_pred = linear_prediction(RidgeCV(), w_min=w_min,
                                       window_type="Avew")
        lasso_pred = linear_prediction(LassoCV(cv=5), w_min=w_min,
                                       window_type="Avew")
        enet_pred = linear_prediction(ElasticNetCV(cv=5), w_min=w_min,
                                      window_type="Avew")
        lars_pred = linear_prediction(LarsCV(cv=5), w_min=w_min,
                                      window_type="Avew")
        omp_pred = linear_prediction(OrthogonalMatchingPursuitCV(cv=5),
                                     w_min=w_min,
                                     window_type="Avew")
        sink_pred = MR(w_min=w_min, window_type="Avew")
        all_pre = pd.DataFrame({
            'Kintchen Sink': sink_pred,
            "ridge": ridge_pred,
            "lasso": lasso_pred,
            "elasticnet": enet_pred,
            "lars": lars_pred,
            "OMP": omp_pred,
        })
        # forecast combination: mean of all columns except the first
        all_pre['FC'] = all_pre.iloc[:, 1:].mean(axis=1)
        # save the prediction results
        all_pre.to_csv(
            os.path.join(path, "稳健性检验", "w_min", "预测结果",
                         "w_min=" + str(w_min) + ".csv"))
        # R2 test
        R2_test(all_pre, name="w_min=" + str(w_min) +
                ".csv")  #then you need move the result on your own
Beispiel #18
0
        def choose_ML_alg(self):
            """Return the list of candidate regressors to evaluate.

            Estimators are constructed fresh on every call, in the same
            order as before.
            """
            candidates = [
                RANSACRegressor(),
                HuberRegressor(),
                LinearRegression(),
                ElasticNet(),
                ElasticNetCV(),
                Lars(),
                Lasso(),
                LassoLars(),
                LassoLarsIC(),
                OrthogonalMatchingPursuit(),
                OrthogonalMatchingPursuitCV(),
                Ridge(),
                SGDRegressor(),
                RandomForestRegressor(),
                GradientBoostingRegressor(),
                AdaBoostRegressor(),
                NGBRegressor(Dist=Normal),
                DecisionTreeRegressor()
            ]
            return candidates
Beispiel #19
0
def _ompcv(*,
           train,
           test,
           x_predict=None,
           metrics,
           copy=True,
           fit_intercept=True,
           normalize=True,
           max_iter=None,
           cv=None,
           n_jobs=None,
           verbose=False):
    """Fit OrthogonalMatchingPursuitCV and evaluate it with the chosen metric.

    Parameters
    ----------
    train, test : tuple
        (features, targets) pairs for fitting and evaluation.
    x_predict : array-like, optional
        Extra feature matrix to predict on after evaluation.
    metrics : str
        One of 'mse', 'rmse' or 'mae'.
    The remaining keyword arguments are forwarded to the estimator.

    Returns
    -------
    tuple
        (model_name, accuracy, predictions); predictions is None when
        ``x_predict`` is not given.

    Raises
    ------
    ValueError
        For an unsupported ``metrics`` value (previously this surfaced
        later as UnboundLocalError on ``accuracy``).

    For more info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.OrthogonalMatchingPursuitCV.html#sklearn.linear_model.OrthogonalMatchingPursuitCV
    """

    model = OrthogonalMatchingPursuitCV(fit_intercept=fit_intercept,
                                        copy=copy,
                                        normalize=normalize,
                                        max_iter=max_iter,
                                        cv=cv,
                                        n_jobs=n_jobs,
                                        verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'OrthogonalMatchingPursuitCV'
    y_hat = model.predict(test[0])

    # The metric choices are mutually exclusive; unknown names fail fast.
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'mse', 'rmse' or 'mae', got "
                         + repr(metrics))

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Beispiel #20
0
# Evaluate a battery of regressors, threading the results frame `df`
# through each test_regressor call in turn.
# NOTE(review): assumes `test_regressor` and the initial `df` are defined
# earlier in this script — confirm against the surrounding file.
df = test_regressor(
    MLPRegressor(random_state=1,
                 activation='logistic',
                 solver='sgd',
                 learning_rate='adaptive',
                 learning_rate_init=0.013000000000000001,
                 early_stopping=True,
                 hidden_layer_sizes=(140, 140),
                 max_iter=10000,
                 momentum=0.9697272727272728), df)
# linear models with built-in cross-validation
df = test_regressor(LassoCV(cv=5), df)
df = test_regressor(LassoLarsCV(cv=5), df)
df = test_regressor(RidgeCV(cv=5), df)
df = test_regressor(LinearRegression(), df)
df = test_regressor(ElasticNetCV(cv=5), df)
df = test_regressor(OrthogonalMatchingPursuitCV(cv=5), df)
df = test_regressor(ARDRegression(compute_score=True, copy_X=True), df)
# test_regressor(LogisticRegressionCV(cv=5)) - it's used for classification
df = test_regressor(SGDRegressor(), df)
df = test_regressor(PassiveAggressiveRegressor(), df)
df = test_regressor(RANSACRegressor(), df)
df = test_regressor(TheilSenRegressor(copy_X=True), df)
df = test_regressor(HuberRegressor(), df)
# ensemble models
df = test_regressor(AdaBoostRegressor(n_estimators=1000), df)
df = test_regressor(BaggingRegressor(n_estimators=1000), df)
df = test_regressor(ExtraTreesRegressor(n_estimators=1000), df)
df = test_regressor(GradientBoostingRegressor(n_estimators=1000), df)
df = test_regressor(RandomForestRegressor(n_estimators=1000), df)
df = test_regressor(GaussianProcessRegressor(), df)
# df = test_regressor(IsotonicRegression(), df) - has errors
df = test_regressor(LinearSVR(), df)
Beispiel #21
0
# Run grid search for the current algorithm configuration and collect
# per-fold results.
# NOTE(review): assumes `dataname`, `fold`, `model_fn`, `algname`,
# `modelCV` and the accumulator lists are defined earlier — confirm.
dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn, algname, modelCV)

# extend the running result lists (dataset, algorithm, fold, absolute
# error, squared error, complexity)
dataset_l   += dat_l
algoritmo_l += alg_l
fold_l      += f_l
mae_l       += ab_l
rmse_l      += sq_l
cplx_l      += cp_l

# report the mean squared error over all folds for this algorithm
mse = np.mean(sq_l)
print(f'{algname}: {mse}')
print('\n')

# repeat the same run with an OMP-CV base model
algname  = 'IT-ELM (OMP)'
modelCV  = OrthogonalMatchingPursuitCV(n_jobs=-1)

dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn, algname, modelCV)

dataset_l   += dat_l
algoritmo_l += alg_l
fold_l      += f_l
mae_l       += ab_l
rmse_l      += sq_l
cplx_l      += cp_l

mse = np.mean(sq_l)
print(f'{algname}: {mse}')
print('\n')

Beispiel #22
0
# NOTE(review): this fragment assumes `omp`, `X`, `y_noisy` and pyplot
# setup (figure, first subplot) exist earlier in the script — confirm.
# Plot the non-zero coefficients of the noise-free OMP reconstruction.
coef = omp.coef_
(idx_r, ) = coef.nonzero()
plt.subplot(4, 1, 2)
plt.xlim(0, 512)
plt.title("Recovered signal from noise-free measurements")
plt.stem(idx_r, coef[idx_r], use_line_collection=True)

# plot the noisy reconstruction
omp.fit(X, y_noisy)
coef = omp.coef_
(idx_r, ) = coef.nonzero()
plt.subplot(4, 1, 3)
plt.xlim(0, 512)
plt.title("Recovered signal from noisy measurements")
plt.stem(idx_r, coef[idx_r], use_line_collection=True)

# plot the noisy reconstruction with number of non-zeros set by CV
omp_cv = OrthogonalMatchingPursuitCV(normalize=False)
omp_cv.fit(X, y_noisy)
coef = omp_cv.coef_
(idx_r, ) = coef.nonzero()
plt.subplot(4, 1, 4)
plt.xlim(0, 512)
plt.title("Recovered signal from noisy measurements with CV")
plt.stem(idx_r, coef[idx_r], use_line_collection=True)

# final layout: tighten margins and add the overall title
plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
plt.suptitle("Sparse signal recovery with Orthogonal Matching Pursuit",
             fontsize=16)
plt.show()
Beispiel #23
0
        explain_weights(clf, unknown_argument=True)


@pytest.mark.parametrize(['reg'], [
    [ElasticNet(random_state=42)],
    [ElasticNetCV(random_state=42)],
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(random_state=42)],
    [LassoLars(alpha=0.01)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1, random_state=42)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(random_state=42)],
    [LinearRegression()],
    [LinearSVR(random_state=42)],
    [TheilSenRegressor(random_state=42)],
])
def test_explain_linear_regression(boston_train, reg):
    """Each linear regressor's explained weights must match its coefficients."""
    assert_explained_weights_linear_regressor(boston_train, reg)


@pytest.mark.parametrize(['reg'], [
    [Lasso(random_state=42)],
    [Lasso(fit_intercept=False, random_state=42)],
Beispiel #24
0
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'Medu', 'famsup'
]

# convert categorical data to dummy variables
student_data = handle_cat_data(cat_data, student_data)

# split testing and training data, stratified on 'failures' so train and
# test keep the same target distribution
X_train, X_test, y_train, y_test = train_test_split(
    student_data.drop('failures', axis=1),
    student_data.failures,
    test_size=0.25,
    stratify=student_data.failures)

# display names, in the same order as the estimators below
# (fixed typo: "Orthongonal" -> "Orthogonal")
reg_algs_names = [
    'Linear Regression', 'Ridge Regression', 'Lasso Regression',
    'Elastic Net Regression', 'Orthogonal Matching Pursuit CV',
    'MLP Regressor'
]

reg_algs = [
    LinearRegression(normalize=True),
    Ridge(alpha=0, normalize=True),
    Lasso(alpha=0.01, normalize=False),
    ElasticNet(random_state=0),
    OrthogonalMatchingPursuitCV(cv=8, normalize=True),
    MLPRegressor(max_iter=1000)
]

run_reg_models(reg_algs_names, reg_algs, X_train, X_test, y_train, y_test)
Beispiel #25
0
def run(seed):
    """Train the full 'age' prediction pipeline for one random seed.

    Builds five intermediate score features (fnc, agg, pca, im, dl) from
    model blends, trains a second-level model zoo on several feature
    subsets, stacks the out-of-fold predictions with RGF and BayesianRidge,
    and writes models/predictions under ./models and ./predicts.

    NOTE(review): relies on files under ./data and ./predicts produced by
    earlier pipeline stages — confirm they exist before calling.
    """

    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test: rows with labels are train, the rest are test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)

    # apply biases (site2 rows get the site-specific offsets)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')

    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge(),
        HuberRegressor(epsilon=2.5, alpha=1),
        OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 5, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 5, names)

    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')

    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)

    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)

    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)

    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)

    # add scores (the blended OOF/test predictions saved above)
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix +
              '_score'] = np.load(folder_preds +
                                  '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create differents datasets
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)

    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]

    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2,
                        names,
                        is_blend=False)

    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names

    model_stacker_rgf = RGFRegressor(max_leaf=1000,
                                     reg_depth=25,
                                     verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)

    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)

    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())

    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))

    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names

    # final prediction: weighted blend of the two stackers
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
Beispiel #26
0
def run(seed):
    """Train the full domain1_var1 pipeline for one random seed.

    Side effects: reads feature/bias files under ./data and ./predicts,
    writes fitted models under ./models/domain1_var1/ and predictions
    under ./predicts/domain1_var1/.

    Stages:
      I-V.  Fit a small model pack on each feature family (fnc, agg,
            pca, im, dl) and persist its out-of-fold "score" and test
            prediction as .npy files.
      VI.   Re-load those scores as extra features, fit the final
            5-model blend and save the stacked seed prediction CSV.

    Parameters
    ----------
    seed : int
        Seed forwarded to TrendsModelSklearn; also embedded in every
        output file name so per-seed artifacts do not collide.
    """

    # create folders for scores models and preds
    folder_models = './models/domain1_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain1_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load biases (per-column additive corrections; *_site variants apply
    # only to rows detected as Site2)
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    # NOTE(review): DataFrame.append was removed in pandas >= 2.0;
    # pd.concat([ids_df, extra_site]) is the modern equivalent.
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    # the 200-PCA features are split across 6 chunk files; concatenate
    # them column-wise, keeping 'Id' only from the first chunk
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data (column lists per feature family; first column is 'Id')
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test: rows with labels are train, the rest test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    # NOTE(review): query() returns a new frame, but the subsequent
    # `del`/`.loc` mutations can raise SettingWithCopyWarning on some
    # pandas versions; adding .copy() would silence it — confirm intent.
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain1_var1'].copy().reset_index(drop=True)
    # index positions of rows where the target is present (used in VI)
    d11_index = list(train['domain1_var1'].dropna().index)

    # apply biases: site-specific correction for detected Site2 ids,
    # generic correction for everything else (test rows only)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling (scalers are fit on train+test jointly)
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')

    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')

    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)

    # define models
    names = ['ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')

    # prepare datasets for im score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')

    # prepare datasets for dl score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)

    # add scores: OOF scores align with labeled rows only (d11_index)
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d11_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create differents datasets
    # linear (IC_20 is excluded from every feature set)
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca_cols,
                                                  sc=MinMaxScaler())

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'OMP', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=5, kernel='rbf'),
        NuSVR(C=5, kernel='rbf'),
        OrthogonalMatchingPursuitCV(),
        KernelRidge(kernel='poly', degree=2, alpha=10)
    ]

    # dataset list pairs positionally with `pack`:
    # GP & SVM1 -> score features, SVM2 -> kernel, OMP & KR -> linear
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain1_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain1_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    d11_prediction = pd.DataFrame()
    d11_prediction['Id'] = test['Id'].values
    d11_prediction['pred'] = preds
    d11_prediction.to_csv(folder_preds +
                          'domain1_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain1_var1 seed pred is saved as',
          folder_preds + 'domain1_var1_stack_seed{}.csv'.format(seed))
 def __init__(self, **hyperparams):
     """Store the given hyperparameters and build the wrapped model.

     All keyword arguments are kept verbatim in ``self._hyperparams``
     and forwarded unchanged to ``Op`` (the wrapped estimator class,
     defined elsewhere in this file) to create ``self._wrapped_model``.
     """
     self._hyperparams = hyperparams
     self._wrapped_model = Op(**self._hyperparams)
Beispiel #28
0
# Evaluate a battery of linear-family regressors that share the same
# alpha/max_iter hyperparameters, collecting one summary dict per model.
candidate_models = (
    ('Ridge', Ridge(alpha=alpha, max_iter=max_iter)),
    ('Ridge CV', RidgeCV(alphas=alphas)),
    ('Kernel Ridge', KernelRidge(alpha=alpha)),
    ('Elastic Net', ElasticNet(alpha=alpha, max_iter=max_iter)),
    ('Elastic Net CV', ElasticNetCV(alphas=alphas, max_iter=max_iter)),
    ('Bayesian Ridge', BayesianRidge(n_iter=max_iter)),
    ('Orthogonal Matching Pursuit', OrthogonalMatchingPursuit()),
    ('Orthogonal Matching Pursuit CV', OrthogonalMatchingPursuitCV()),
)
for label, estimator in candidate_models:
    models_summary.append(evaluate_model(label, estimator))

# Render a markdown-style table row per model, best confidence first.
print('Models sorted by confidence')
row_template = '| {} | {}% | {} | {} | {} |'
for summary in sorted(models_summary, key=itemgetter('confidence'), reverse=True):
    print(row_template.format(
        summary['name'],
        round(summary['confidence'], 4),
        round(summary['mae'], 3),
        round(summary['mse'], 3),
        round(summary['rmse'], 3),
    ))

print('Models sorted by RSME')
for model_summary in sorted(models_summary, key=itemgetter('rmse')):
    print('| {} | {}% | {} | {} | {} |'.format(
        model_summary['name'],
Beispiel #29
0
def _stem_panel(panel, weights, panel_title):
    # Draw one subplot (panel of a 4x1 grid) showing the nonzero
    # coefficients of *weights* as a stem plot over the 0..512 range.
    support, = weights.nonzero()
    plt.subplot(4, 1, panel)
    plt.xlim(0, 512)
    plt.title(panel_title)
    plt.stem(support, weights[support])


# panel 2: reconstruction from the clean measurements (coef fitted above)
_stem_panel(2, coef, "Recovered signal from noise-free measurements")

# panel 3: refit on the noisy measurements
omp.fit(X, y_noisy)
_stem_panel(3, omp.coef_, "Recovered signal from noisy measurements")

# panel 4: let cross-validation choose the number of non-zeros
omp_cv = OrthogonalMatchingPursuitCV()
omp_cv.fit(X, y_noisy)
_stem_panel(4, omp_cv.coef_, "Recovered signal from noisy measurements with CV")

plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit',
             fontsize=16)
plt.show()
Beispiel #30
0
def GetAllModelsForComparison(X_train, Y_train):
    """Return a name -> estimator instance mapping for model comparison.

    Fix: the original dict literal repeated several keys (e.g.
    'BaseEstimator' six times, 'ClassifierMixin' four times,
    'SGDClassifier'/'SGDRegressor'/'RegressorMixin' and others twice).
    Duplicate keys in a dict literal silently overwrite each other, so
    the extra entries only built throwaway instances; each key is now
    listed exactly once, preserving first-occurrence order.

    Parameters
    ----------
    X_train, Y_train :
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    dict
        Mapping of class name to a default-constructed instance.
        NOTE(review): several entries (mixins, StandardScaler,
        LabelBinarizer, Parallel, ...) are not predictive models —
        confirm they are intentionally part of the comparison.
    """
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
Beispiel #31
0
# panel 2: reconstruction from the clean measurements
# (idx_r / coef were computed just above this fragment)
pl.subplot(4, 1, 2)
pl.xlim(0, 512)
pl.title("Recovered signal from noise-free measurements")
pl.stem(idx_r, coef[idx_r])


def _draw_recovery(panel, weights, caption):
    # One stem subplot showing the nonzero entries of *weights*.
    nz, = weights.nonzero()
    pl.subplot(4, 1, panel)
    pl.xlim(0, 512)
    pl.title(caption)
    pl.stem(nz, weights[nz])


# panel 3: refit on the noisy observations
omp.fit(X, y_noisy)
_draw_recovery(3, omp.coef_, "Recovered signal from noisy measurements")

# panel 4: let cross-validation choose the number of non-zeros
omp_cv = OrthogonalMatchingPursuitCV()
omp_cv.fit(X, y_noisy)
_draw_recovery(4, omp_cv.coef_, "Recovered signal from noisy measurements with CV")

pl.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
pl.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit',
            fontsize=16)
pl.show()
Beispiel #32
0
	build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto")
	build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto")
	build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto")
	build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto")
	build_auto(HistGradientBoostingRegressor(max_iter = 31, random_state = 13), "HistGradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(cv = 3), "LarsAuto")
	build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto")
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto")
	build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto")
	build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True)
	build_auto(RidgeCV(), "RidgeAuto")
	build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto")
	build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto")
	build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto")
	build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto")

# Also exercise target-transforming wrappers on the Auto dataset:
# a plain-target tree, and a linear model fit on log(y) with exp inverse.
if "Auto" in datasets:
	build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
	build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

def build_auto_isotonic(regressor, auto_isotonic_X, name):
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
Beispiel #33
0
def RunMP(aligned_data_root_path, output_path):
    do_compute_individual_k_motifs = True
    do_compute_anchored_chains = False
    do_compute_semantic_segmentation = False
    do_compute_multimodal_mp = False
    window_size = 1300
    #window_size = 1500
    data_dict = LoadAlignedTILESData(aligned_data_root_path)

    #plt.ion()

    pids = list(data_dict.keys())[0:1]
    streams = ['HeartRatePPG', 'StepCount']

    # Compute motifs from the individual MP using a greedy method
    if do_compute_individual_k_motifs:
        num_motifs = 2
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            fitbit_df = fitbit_df.iloc[0:10000, :]  # HACK

            for stream in streams:
                exclusion_signal = fitbit_df[stream].copy()
                # Keep a NaN'd version for MP and interpolated one for OMP
                #nan_replace_value = -1000000
                #fitbit_df[stream][np.isnan(fitbit_df[stream])] = nan_replace_value
                #fitbit_df_smooth = fitbit_df[stream].interpolate(method='linear', axis=0, inplace=False)
                #fitbit_df_smooth = fitbit_df[stream].copy()
                fitbit_df_smooth = exclusion_signal.copy()

                if np.isnan(fitbit_df_smooth[0]
                            ):  # Fill NaNs at the beginning and end
                    idx = 0
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx += 1
                    fitbit_df_smooth[0:idx] = fitbit_df_smooth[idx]
                if np.isnan(fitbit_df_smooth[fitbit_df_smooth.shape[0] - 1]):
                    idx = fitbit_df_smooth.shape[0] - 1
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx -= 1
                    fitbit_df_smooth[idx:] = fitbit_df_smooth[idx]

                # Use Matrix Profile methods to learn a motif dictionary
                motifs = []
                while len(motifs) < num_motifs:
                    #fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size) # TODO - use the exclusion_signal
                    fitbit_mp = stumpy.stump(
                        exclusion_signal,
                        m=window_size)  # TODO - use the exclusion_signal
                    fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                    for motif_idx in range(len(fitbit_mp_argsort)):
                        stream_motif_idx = fitbit_mp_argsort[motif_idx]
                        num_nan = np.sum(
                            np.isnan(exclusion_signal.
                                     values[stream_motif_idx:stream_motif_idx +
                                            window_size]))

                        # Avoid finding bad motifs
                        if num_nan >= 5.0 * window_size / 6.0:
                            continue
                        if stream == 'HeartRatePPG':
                            pass
                        break
                    motif_left_idx = fitbit_mp_argsort[motif_idx]
                    motif = fitbit_df_smooth[motif_left_idx:motif_left_idx +
                                             window_size]
                    motif[motif ==
                          0] = 1e-12  # OMP requires non-zeros in the support
                    motifs.append(motif)
                    plt.plot(range(motif_left_idx,
                                   motif_left_idx + window_size),
                             motifs[-1],
                             'g-',
                             linewidth=5)

                # Build a redundant dictionary from the motifs
                num_repetitions = len(fitbit_df_smooth) - window_size
                dictionary_mat = csr_matrix(
                    (len(motifs) * num_repetitions, len(fitbit_df_smooth)))
                for motif_idx in range(len(motifs)):
                    motif_values = motifs[motif_idx].values
                    for repeat_idx in range(num_repetitions):
                        # SLOW: TODO - find better way of generating this matrix.  Maybe I can change the sparse encoding directly and just push extra zeros in front of the motif sequence? Better yet, why not abandon the matrix representation and just use a list of motifs and their starting index in the signal
                        dictionary_mat[motif_idx * num_repetitions +
                                       repeat_idx, repeat_idx:repeat_idx +
                                       window_size] = motif_values

                # Reconstruct the signal using the motif dictionary
                # TODO : Write my own OMP with exclusion of each atom's support. Gram mat?
                # TODO : Use L1 optimization (Lasso)?
                #omp = OrthogonalMatchingPursuit(n_nonzero_coefs=2, fit_intercept=False)
                omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
                omp.fit(dictionary_mat.T, fitbit_df_smooth)
                intercept = omp.intercept_
                coef = omp.coef_
                idx_r = coef.nonzero()
                num_nonzero = omp.n_nonzero_coefs_

                #max_nonzero = 20
                #skip_nan_percent = 0.1
                #coef = np.zeros((dictionary_mat.T.shape[1],1))
                #intercept = np.zeros((dictionary_mat.T.shape[0],1))
                #for num_nonzero in range(1,max_nonzero+1):
                #   # Reconstruct the signal using the motif dictionary
                #   best_dict_idx = -1
                #   best_error = np.inf
                #   best_dict_support = None
                #   for dict_idx in range(dictionary_mat.shape[0]):
                #      # SLOW
                #      dict_vec = dictionary_mat[dict_idx,:].toarray().reshape(-1,)

                #      # Find the support
                #      left_support_idx = 0
                #      right_support_idx = len(dict_vec)-1
                #      while dict_vec[left_support_idx] == 0 and left_support_idx < len(dict_vec):
                #         left_support_idx += 1
                #      while dict_vec[right_support_idx] == 0 and right_support_idx >= 0:
                #         right_support_idx -= 1

                #      # Skip mostly NaN regions
                #      if np.sum(np.isnan(exclusion_signal[left_support_idx:right_support_idx+1])) > skip_nan_percent*(right_support_idx-left_support_idx+1):
                #         continue

                #      # Find the best match
                #      residual = exclusion_signal[left_support_idx:right_support_idx+1] - dict_vec[left_support_idx:right_support_idx+1]
                #      np.nan_to_num(residual, copy=False) # Replace NaN with zero
                #      error = np.dot(residual, residual)
                #      if error < best_error:
                #         best_error = error
                #         coef_val = 1 # TODO - constrain between 0.5 and 2?
                #         best_dict_idx = dict_idx
                #         best_dict_support = (left_support_idx, right_support_idx)

                #   if best_dict_idx < 0:
                #      print("No best next dictionary element found")
                #      break

                #   # Update coef
                #   coef_nonzero = (coef != 0).reshape(-1,)
                #   if np.sum(coef_nonzero) > 0:
                # NOTE(review): the commented-out block below is an abandoned
                # overfitting check — it compared the residual error before and
                # after adding the newly selected dictionary atom and broke out
                # of the loop when the error grew. Kept for reference; consider
                # deleting (commented-out code should not linger).
                #      dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #      coef_reduced = coef[coef_nonzero]

                #      #prev_fit_signal = np.matmul(dictionary_mat.T, coef)
                #      prev_fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      prev_residual = fitbit_df_smooth - prev_fit_signal.reshape(-1,)
                #      np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #      prev_error = np.dot(prev_residual, prev_residual)

                #      coef[best_dict_idx] = coef_val
                #      #fit_signal = np.matmul(dictionary_mat.T, coef)
                #      fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #      np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #      fit_error = np.dot(fit_residual, fit_residual)
                #   else:
                #      prev_residual = fitbit_df_smooth- np.zeros(len(fitbit_df_smooth))
                #      np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #      prev_error = np.dot(prev_residual, prev_residual)

                #      coef[best_dict_idx] = coef_val
                #      coef_nonzero = (coef != 0).reshape(-1,)
                #      dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #      coef_reduced = coef[coef_nonzero]

                #      fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #      np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #      fit_error = np.dot(fit_residual, fit_residual)

                #   if best_dict_support is not None:
                #      exclusion_signal[best_dict_support[0]:best_dict_support[1]+1] = np.inf

                #   if prev_error < fit_error:
                #      print("Avoiding overfitting...")
                #      coef[best_dict_idx,0] = 0
                #      break

                # Rebuild the fitted signal from the nonzero OMP coefficients
                # only: restrict the dictionary to the selected atoms, map back
                # to signal space, and add the fitted intercept.
                coef_nonzero = (coef != 0).reshape(-1, )
                dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                coef_reduced = coef[coef_nonzero]
                # .toarray() suggests dictionary_mat is a scipy sparse matrix
                # (atoms x samples) — TODO confirm the type upstream.
                fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(),
                                       coef_reduced) + intercept
                # Overlay the raw stream (blue solid) with the sparse
                # reconstruction (red dashed) over the full time range.
                plt.plot(range(fitbit_df[stream].shape[0]), fitbit_df[stream],
                         'b-')
                #plt.plot(range(fitbit_df_smooth.shape[0]), fitbit_df_smooth, 'k-')
                plt.plot(range(fitbit_df[stream].shape[0]), fit_signal, 'r--')
                plt.title('OMP (%d coefs) + MP Motifs (%d motifs)' %
                          (num_nonzero, num_motifs))
                plt.xlabel('Time')
                plt.ylabel(stream)
                plt.show()
                # Early exit: only the first pid/stream combination is plotted.
                return
                # NOTE(review): unreachable — the `return` above always exits
                # first; this leftover debugging hook should be removed.
                pdb.set_trace()

    # Compute individual matrix profiles (stump)
    # For each participant and each Fitbit stream: compute the matrix profile,
    # optionally derive time-series chains and a FLUSS semantic segmentation,
    # pick the best motif that is not dominated by NaNs, and plot everything.
    if do_compute_anchored_chains or do_compute_semantic_segmentation:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            for stream in streams:
                # Matrix profile with subsequence length `window_size`.
                fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size)

                if do_compute_anchored_chains:
                    # Columns 2 and 3 are used as the left / right nearest-
                    # neighbor index arrays fed to the chain computation.
                    left_mp_idx = fitbit_mp[:, 2]
                    right_mp_idx = fitbit_mp[:, 3]
                    #atsc_idx = 10
                    #anchored_chain = stumpy.atsc(left_mp_idx, right_mp_idx, atsc_idx)
                    all_chain_set, unanchored_chain = stumpy.allc(
                        left_mp_idx, right_mp_idx)

                if do_compute_semantic_segmentation:
                    # FLUSS regime-change detection on the matrix profile
                    # index column; n_regimes=2 looks for a single boundary.
                    subseq_len = window_size
                    correct_arc_curve, regime_locations = stumpy.fluss(
                        fitbit_mp[:, 1],
                        L=subseq_len,
                        n_regimes=2,
                        excl_factor=5)

                # Find the first motif with nearly no NaN values in the stream signal
                # Candidates are visited in order of increasing matrix-profile
                # distance (best motif first).
                fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                for motif_idx in range(len(fitbit_mp_argsort)):
                    stream_motif_idx = fitbit_mp_argsort[motif_idx]
                    num_nan = np.sum(
                        np.isnan(fitbit_df[stream].
                                 values[stream_motif_idx:stream_motif_idx +
                                        window_size]))

                    # Avoid finding bad motifs
                    # Reject windows where >= 5/6 of the samples are NaN.
                    if num_nan >= 5.0 * window_size / 6.0:
                        continue
                    if stream == 'HeartRatePPG':
                        # Extra HR-quality filters are currently disabled.
                        pass
                        # Check for flat heart rate
                        #nan_like_value = 70
                        #num_valid = np.count_nonzero((fitbit_df[stream] - nan_like_value)[stream_motif_idx:stream_motif_idx+window_size])
                        #if num_valid < window_size - 2:
                        #   continue

                        # Check for linear heart rate over time
                        #residual_threshold = window_size*(4.0**2)
                        #p, res, rank, sing_vals, rcond = np.polyfit(range(window_size), fitbit_df[stream][stream_motif_idx:stream_motif_idx+window_size], deg=1, full=True)
                        #if res < residual_threshold:
                        #   continue
                    break

                # Stacked plot: raw signal (+motif highlights), matrix profile,
                # and (optionally) the FLUSS arc curve.
                num_subplots = 3 if do_compute_semantic_segmentation else 2
                fig, axs = plt.subplots(num_subplots,
                                        sharex=True,
                                        gridspec_kw={'hspace': 0})
                plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                             fontsize='30')
                axs[0].plot(fitbit_df[stream].values)
                # Shade the chosen motif window and its next-best neighbor.
                # NOTE(review): the fixed height 2000 assumes the signal stays
                # below that value — verify for each stream.
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                axs[0].set_ylabel(stream, fontsize='20')
                axs[1].plot(fitbit_mp[:, 0])
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx],
                               linestyle="dashed")
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                               linestyle="dashed")
                axs[1].set_ylabel('Matrix Profile', fontsize='20')

                if do_compute_anchored_chains:
                    # Overlay each link of the unanchored chain on the raw
                    # signal with a thicker line.
                    for i in range(unanchored_chain.shape[0]):
                        y = fitbit_df[stream].iloc[
                            unanchored_chain[i]:unanchored_chain[i] +
                            window_size]
                        x = y.index.values
                        axs[0].plot(x, y, linewidth=3)

                if do_compute_semantic_segmentation:
                    # Arc curve plus the detected regime boundary on both the
                    # raw-signal and arc-curve axes.
                    axs[2].plot(range(correct_arc_curve.shape[0]),
                                correct_arc_curve,
                                color='C1')
                    axs[0].axvline(x=regime_locations[0], linestyle="dashed")
                    axs[2].axvline(x=regime_locations[0], linestyle="dashed")

                plt.show()

    # Compute multi-dimensional matrix profiles (mstump)
    if do_compute_multimodal_mp:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            # Stack all requested streams into a (n_streams, n_samples) array.
            data = fitbit_df.loc[:, streams].values
            mp, mp_indices = stumpy.mstump(data.T, m=window_size)
            #print("Stumpy's mstump function does not handle NaN values. Skipping multi-dimensional MP")
            #break

            # TODO - This code is copied from above. Fix and finish it once mstump supports NaN
            # Find the first motif with nearly no NaN values in the stream signal
            # NOTE(review): everything below still reads `fitbit_mp` and
            # `stream` — leftovers from the single-stream section above — not
            # the freshly computed `mp`/`mp_indices`. If the stump section did
            # not run first this raises NameError; the TODO above only partly
            # acknowledges this.
            fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
            for motif_idx in range(len(fitbit_mp_argsort)):
                stream_motif_idx = fitbit_mp_argsort[motif_idx]
                num_nan = np.sum(
                    np.isnan(fitbit_df[stream].
                             values[stream_motif_idx:stream_motif_idx +
                                    window_size]))

                # Avoid finding bad motifs
                # Stricter than the section above: at most one NaN allowed.
                if num_nan >= 2:
                    continue
                if stream == 'HeartRatePPG':
                    # Check for flat heart rate
                    # A long run at exactly 70 bpm is treated as a fill value.
                    nan_like_value = 70
                    num_valid = np.count_nonzero(
                        (fitbit_df[stream] -
                         nan_like_value)[stream_motif_idx:stream_motif_idx +
                                         window_size])
                    if num_valid < window_size - 2:
                        continue

                    # Check for linear heart rate over time
                    # Reject windows that a degree-1 fit explains too well
                    # (residual below threshold ~ linear ramp, not a motif).
                    residual_threshold = window_size * (4.0**2)
                    p, res, rank, sing_vals, rcond = np.polyfit(
                        range(window_size),
                        fitbit_df[stream][stream_motif_idx:stream_motif_idx +
                                          window_size],
                        deg=1,
                        full=True)
                    if res < residual_threshold:
                        continue
                break

            # Plot raw signal with the two best motif windows shaded, plus the
            # matrix profile (same layout as the single-stream section).
            fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
            plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                         fontsize='30')
            axs[0].plot(fitbit_df[stream].values)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            axs[0].set_ylabel(stream, fontsize='20')
            axs[1].plot(fitbit_mp[:, 0])
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx], linestyle="dashed")
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                           linestyle="dashed")
            axs[1].set_ylabel('Matrix Profile', fontsize='20')
            plt.show()

    # Leave interactive mode and show one final empty "dummy" figure —
    # presumably so the earlier figures stay on screen until the user closes
    # this blocking window (TODO confirm; otherwise this can be dropped).
    plt.ioff()
    plt.figure()
    plt.plot()
    plt.title('Dummy plot')
    plt.show()
    return