Example #1
def mean_absolute_error(y_true, y_pred):
    """
    Mean absolute error and its standard deviation.
    
    If you need only mean absolute error, use 
    :func:`sklearn.metrics.mean_absolute_error`
    
    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = np.abs(y_true - y_pred)
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)

    return mean, stdev
Example #2
    def set_x_d(self, treatment_var):
        """
        Function that assigns the role for the treatment variables in the multiple-treatment case.

        Parameters
        ----------
        treatment_var : str
            Active treatment variable that will be set to d.
        """
        if not isinstance(treatment_var, str):
            raise TypeError(
                'treatment_var must be of str type. '
                f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.'
            )
        if treatment_var not in self.d_cols:
            raise ValueError('Invalid treatment_var. '
                             f'{treatment_var} is not in d_cols.')
        if self.use_other_treat_as_covariate:
            # note that the following line needs to be adapted in case an intersection of x_cols and d_cols is allowed
            # (see https://github.com/DoubleML/doubleml-for-py/issues/83)
            xd_list = self.x_cols + self.d_cols
            xd_list.remove(treatment_var)
        else:
            xd_list = self.x_cols
        assert_all_finite(self.data.loc[:, treatment_var])
        if self.force_all_x_finite:
            assert_all_finite(self.data.loc[:, xd_list],
                              allow_nan=self.force_all_x_finite == 'allow-nan')
        self._d = self.data.loc[:, treatment_var]
        self._X = self.data.loc[:, xd_list]
Example #3
    def fit(self, X, y=None):
        """
        Saves the `r` vector (log-ratio vector) that will be applied during transformation

        Parameters
        ----------
        X: array-like of shape = [n_samples, n_features]
            Feature matrix representing the vectorized input

        """
        # checks
        X = check_array(
            X.toarray() if isinstance(X, sparse.csr.csr_matrix) else X)
        if not isinstance(X, np.ndarray) and not isinstance(
                X, sparse.csr.csr_matrix):
            raise TypeError(
                "data type of X must be dense or sparse array; type = {}".
                format(type(X)))
        assert_all_finite(X)
        # get type of feature_matrix
        fm_type = None
        if isinstance(X, np.ndarray):
            fm_type = "dense"
        elif isinstance(X, sparse.csr.csr_matrix):
            fm_type = "sparse"
        # get p, not_p
        _p, _not_p = self._get_p_not_p(X)
        # get r
        self._r = self._get_r(_p, _not_p)
        # ensure is_fitted
        self.X_ = X
        self.y_ = y
        return self
Example #4
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance
    
    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'mean absolute error':
        skm.mean_absolute_error(y_true, y_pred),
        'root mean squared error':
        np.sqrt(np.maximum(skm.mean_squared_error(y_true, y_pred), 0.)),
        'n_samples':
        y_true.size,
        'true': {
            'mean': np.mean(y_true),
            'stdev': np.std(y_true)
        },
        'predicted': {
            'mean': np.mean(y_pred),
            'stdev': np.std(y_pred)
        }
    }

    # display statistics
    if disp:
        print(json.dumps(stats,
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': '),
                         ensure_ascii=False),
              file=sys.stderr)

    return stats
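A minimal, made-up usage sketch for score_predictor_report (it assumes numpy, json, sys, and sklearn.metrics as skm are imported, as the function body requires):

import numpy as np

stats = score_predictor_report(
    np.array([3, 4, 5, 5]), np.array([3.1, 3.9, 4.8, 5.2]), disp=False)
# stats['mean absolute error'] and stats['root mean squared error'] are floats;
# stats['true'] and stats['predicted'] hold the mean and stdev of each array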
Example #5
def _binary_clf_curve(y_true, y_score):
    check_consistent_length(y_true, y_score, None)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    # make y_true a boolean vector
    y_true = (y_true == 1)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true)[threshold_idxs]
    fps = 1 + threshold_idxs - tps

    return fps, tps, y_score[threshold_idxs]
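A small worked example for _binary_clf_curve (a sketch with toy data; it assumes numpy plus the sklearn helpers used above, column_or_1d, assert_all_finite, check_consistent_length, and stable_cumsum, are in scope):

import numpy as np

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
# sorted descending, the scores are [0.8, 0.4, 0.35, 0.1], so
# tps == [1, 1, 2, 2], fps == [0, 1, 1, 2], thresholds == [0.8, 0.4, 0.35, 0.1]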
Example #6
def mean_absolute_error(y_true, y_pred):
    """
    Mean absolute error and its standard deviation.
    
    If you need only mean absolute error, use 
    :func:`sklearn.metrics.mean_absolute_error`
    
    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = np.abs(y_true - y_pred)
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)

    return mean, stdev
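A quick, made-up usage example for this mean_absolute_error variant (assumes numpy and the sklearn validation helpers are imported):

import numpy as np

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.5, 2.0, 2.0, 4.0])
mean, stdev = mean_absolute_error(y_true, y_pred)
# absolute errors are [0.5, 0.0, 1.0, 0.0], so mean == 0.375; stdev is the
# population standard deviation of those four errors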
Example #7
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'n_samples': y_true.size,
        'true': {
            'mean': np.mean(y_true),
            'stdev': np.std(y_true)
        },
        'predicted': {
            'mean': np.mean(y_pred),
            'stdev': np.std(y_pred)
        }
    }

    # AUC is computed only when the score array contains both 0 and 1
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        print(json.dumps(stats,
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': '),
                         ensure_ascii=False),
              file=sys.stderr)

    return stats
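A hypothetical call to item_finder_report with binary ground truth (assumes is_binary_score and the other helpers used in the body are in scope):

import numpy as np

stats = item_finder_report(np.array([0, 1, 1, 0]),
                           np.array([0.2, 0.9, 0.6, 0.4]), disp=False)
# y_true contains both 0 and 1, so stats also receives an
# 'area under the curve' entry computed via skm.roc_auc_score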
Example #8
    def fit(self, X, y=None):
        """Compute the Deterministic Shared Response Model
        Parameters
        ----------
        X : list of 2D arrays, element i has shape=[voxels_i, samples]
            Each element in the list contains the fMRI data of one subject.
        y : not used
        """
        logger.info('Starting Deterministic SRM')

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects "
                             "({0:d}) to train the model.".format(len(X)))

        # Check for input data sizes
        if X[0].shape[1] < self.features:
            raise ValueError(
                "There are not enough samples to train the model with "
                "{0:d} features.".format(self.features))

        # Check if all subjects have same number of TRs
        number_trs = X[0].shape[1]
        number_subjects = len(X)
        for subject in range(number_subjects):
            assert_all_finite(X[subject])
            if X[subject].shape[1] != number_trs:
                raise ValueError("Different number of samples between subjects"
                                 ".")

        # Run SRM
        self.w_, self.s_ = self._srm(X)

        return self
Example #9
 def _set_y_z(self):
     assert_all_finite(self.data.loc[:, self.y_col])
     self._y = self.data.loc[:, self.y_col]
     if self.z_cols is None:
         self._z = None
     else:
         assert_all_finite(self.data.loc[:, self.z_cols])
         self._z = self.data.loc[:, self.z_cols]
Example #10
 def transform(self, df, *_):
     X = df[df['Age'].isnull()]
     X = X.drop(['Age'], axis=1)
     y = pd.Series(self._model.predict(X))
     y.index = X.index
     df.loc[y.index, 'Age'] = y
     assert_all_finite(df)
     return df
Example #11
def test_int_overflow_mutual_info_score():
    # Test overflow in mutual_info_classif
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] *
                 (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 +
                 [1] * 20)

    assert_all_finite(mutual_info_score(x.ravel(), y.ravel(), log_base='e'))
Example #12
def test_int_overflow_mutual_info_score():
    # Test overflow in mutual_info_classif
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] * (3271 +
                 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 +
                 [1] * 20)

    assert_all_finite(mutual_info_score(x.ravel(), y.ravel()))
Example #13
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test overflow in mutual_info_classif and fowlkes_mallows_score
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] *
                 (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 +
                 [1] * 20)

    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
Example #14
def _validate_mcmc_fit_input(X_train, y_train, X_test):

        check_consistent_length(X_train, y_train)
        assert_all_finite(y_train)
        y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

        assert X_train.shape[1] == X_test.shape[1]
        X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64,
                              order="F")
        X_test = check_array(X_test, accept_sparse="csc", dtype=np.float64,
                             order="F")
        return X_train, y_train, X_test
Example #15
    def fit(self, X: pd.DataFrame, y=None):
        cols = self.cols or X.columns.tolist()

        if not self.fill:
            assert_all_finite(X, allow_nan=False)

        self.categories_ = dict()

        for col in cols:
            cutoff = _encode_python(X[col].fillna('_MISSING').astype(str))
            self.categories_[col] = cutoff
        return self
Example #16
def entropy(*args):
    xy = list(zip(*args))
    # probability of each distinct (joint) outcome
    proba = [float(xy.count(c)) / len(xy) for c in dict.fromkeys(xy)]
    # sum p * log2(p) over the outcomes; nan/inf terms (which would
    # arise from p == 0) are replaced by 0 before summing
    entropy = -np.sum(
        [0.0 if math.isnan(p * np.log2(p)) or math.isinf(p * np.log2(p))
         else p * np.log2(p) for p in proba])
    assert_all_finite(entropy)
    return entropy
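A tiny sanity check for the entropy helper above (assumes numpy as np and math are imported): a fair binary variable carries exactly one bit.

h = entropy([0, 0, 1, 1])
# the distinct outcomes have proba == [0.5, 0.5], so h == 1.0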
Example #17
    def fit(self, X, y=None):
        """Compute the probabilistic Shared Response Model

        Parameters
        ----------
        X :  list of 2D arrays, element i has shape=[voxels_i, samples]
            Each element in the list contains the fMRI data of one subject.

        y : not used
        """
        logger.info('Starting Probabilistic SRM')

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects "
                             "({0:d}) to train the model.".format(len(X)))

        # Check for input data sizes
        number_subjects = len(X)
        number_subjects_vec = self.comm.allgather(number_subjects)
        for rank in range(self.comm.Get_size()):
            if number_subjects_vec[rank] != number_subjects:
                raise ValueError(
                    "Not all ranks have same number of subjects")

        # Collect size information
        shape0 = np.zeros((number_subjects,), dtype=int)
        shape1 = np.zeros((number_subjects,), dtype=int)

        for subject in range(number_subjects):
            if X[subject] is not None:
                assert_all_finite(X[subject])
                shape0[subject] = X[subject].shape[0]
                shape1[subject] = X[subject].shape[1]

        shape0 = self.comm.allreduce(shape0, op=MPI.SUM)
        shape1 = self.comm.allreduce(shape1, op=MPI.SUM)

        # Check if all subjects have same number of TRs
        number_trs = np.min(shape1)
        for subject in range(number_subjects):
            if shape1[subject] < self.features:
                raise ValueError(
                    "There are not enough samples to train the model with "
                    "{0:d} features.".format(self.features))
            if shape1[subject] != number_trs:
                raise ValueError("Different number of samples between subjects"
                                 ".")
        # Run SRM
        self.sigma_s_, self.w_, self.mu_, self.rho2_, self.s_ = self._srm(X)

        return self
Example #18
    def fit(self, X, y=None):
        """Compute the probabilistic Shared Response Model

        Parameters
        ----------
        X :  list of 2D arrays, element i has shape=[voxels_i, samples]
            Each element in the list contains the fMRI data of one subject.

        y : not used
        """
        logger.info('Starting Probabilistic SRM')

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects "
                             "({0:d}) to train the model.".format(len(X)))

        # Check for input data sizes
        number_subjects = len(X)
        number_subjects_vec = self.comm.allgather(number_subjects)
        for rank in range(self.comm.Get_size()):
            if number_subjects_vec[rank] != number_subjects:
                raise ValueError("Not all ranks have same number of subjects")

        # Collect size information
        shape0 = np.zeros((number_subjects, ), dtype=int)
        shape1 = np.zeros((number_subjects, ), dtype=int)

        for subject in range(number_subjects):
            if X[subject] is not None:
                assert_all_finite(X[subject])
                shape0[subject] = X[subject].shape[0]
                shape1[subject] = X[subject].shape[1]

        shape0 = self.comm.allreduce(shape0, op=MPI.SUM)
        shape1 = self.comm.allreduce(shape1, op=MPI.SUM)

        # Check if all subjects have same number of TRs
        number_trs = np.min(shape1)
        for subject in range(number_subjects):
            if shape1[subject] < self.features:
                raise ValueError(
                    "There are not enough samples to train the model with "
                    "{0:d} features.".format(self.features))
            if shape1[subject] != number_trs:
                raise ValueError("Different number of samples between subjects"
                                 ".")
        # Run SRM
        self.sigma_s_, self.w_, self.mu_, self.rho2_, self.s_ = self._srm(X)

        return self
Example #19
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # AUC is computed only when the score array contains both 0 and 1
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        print(
            json.dumps(
                stats, sort_keys=True, indent=4, separators=(',', ': '),
                ensure_ascii=False), file=sys.stderr)

    return stats
Example #20
def item_finder_statistics(y_true, y_pred):
    """
    Full Statistics of prediction performance

    * n_samples
    * mean_absolute_error: mean, stdev
    * mean_squared_error: mean, rmse, stdev
    * predicted: mean, stdev
    * true: mean, stdev

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    # descriptive statistics of ground predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # AUC is computed only when the score array contains both 0 and 1
    if is_binary_score(y_true, allow_uniform=False):

        # AUC (area under the curve)
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats
Example #21
def item_finder_statistics(y_true, y_pred):
    """
    Full Statistics of prediction performance

    * n_samples
    * mean_absolute_error: mean, stdev
    * mean_squared_error: mean, rmse, stdev
    * predicted: mean, stdev
    * true: mean, stdev

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    # descriptive statistics of ground predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # AUC is computed only when the score array contains both 0 and 1
    if is_binary_score(y_true, allow_uniform=False):

        # AUC (area under the curve)
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats
Example #22
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance
    
    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'mean absolute error': skm.mean_absolute_error(y_true, y_pred),
        'root mean squared error':
            np.sqrt(np.maximum(skm.mean_squared_error(y_true, y_pred), 0.)),
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # display statistics
    if disp:
        print(json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False),
            file=sys.stderr)

    return stats
Example #23
def gpflow_predict(model, Xin):
    Xin = check_array(Xin, copy=False, warn_on_dtype=True, dtype=FLOAT_DTYPES)
    fmean, fvar, _, _, _ = model._build_predict(Xin)  # pylint: disable=protected-access
    y_mean_var = model.likelihood.predict_mean_and_var(fmean, fvar)
    y_mean = y_mean_var[0]
    y_var = y_mean_var[1]
    y_std = tf.sqrt(y_var)

    session = model.enquire_session(session=None)
    with session.as_default():
        y_mean_value = session.run(y_mean)
        y_std_value = session.run(y_std)
        assert_all_finite(y_mean_value)
        assert_all_finite(y_std_value)
        return GPRResult(y_mean_value, y_std_value)
Example #24
 def transform(self, df, *_):
     assert_all_finite(df)
     interaction = {}
     for c0 in df:
         for c1 in df:
             interaction['{}*{}'.format(c0, c1)] = df[c0] * df[c1]
             if c0 != c1:
                 interaction['{}-{}'.format(c0, c1)] = df[c0] - df[c1]
                 interaction['{}/{}'.format(
                     c0, c1)] = df[c0] / df[c1].replace(0, 1)
     df_interaction = pd.DataFrame(interaction)
     df_interaction.index = df.index
     df = pd.concat([df, df_interaction], axis=1)
     assert_all_finite(df)
     return df
Example #25
 def fit_transform(self, X, y=None):
     """Runs fit() and transform() together."""
     if np.any(y):
         X, y = check_X_y(
             X.toarray() if isinstance(X, sparse.csr.csr_matrix) else X, y)
     else:
         X = check_array(
             X.toarray() if isinstance(X, sparse.csr.csr_matrix) else X)
     assert_all_finite(X)
     if y is None:
         # fit method of arity 1 (unsupervised transformation)
         return self.fit(X).transform(X)
     else:
         # fit method of arity 2 (supervised transformation)
         return self.fit(X, y).transform(X)
Example #26
    def predict(self, X):
        """Predict using the factorization machine

        Parameters
        ----------
        X : sparse matrix, shape = [n_samples, n_features]


        Returns
        -------

        array, shape = [n_samples]
           Predicted target values per element in X.
        """
        assert_all_finite(X)
        return self.fm.predict(X)
Example #27
def _validate_mcmc_fit_input(X_train, y_train, X_test):

    check_consistent_length(X_train, y_train)
    assert_all_finite(y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    assert X_train.shape[1] == X_test.shape[1]
    X_train = check_array(X_train,
                          accept_sparse="csc",
                          dtype=np.float64,
                          order="F")
    X_test = check_array(X_test,
                         accept_sparse="csc",
                         dtype=np.float64,
                         order="F")
    return X_train, y_train, X_test
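A hypothetical invocation with small dense arrays (numpy and the sklearn helpers check_array, check_consistent_length, and assert_all_finite are assumed imported):

import numpy as np

X_train = np.array([[1., 0.], [0., 1.], [1., 1.]])
y_train = np.array([1., -1., 1.])
X_test = np.array([[0., 1.]])
X_train, y_train, X_test = _validate_mcmc_fit_input(X_train, y_train, X_test)
# arrays come back validated: float64 dtype, Fortran order, CSC accepted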
Example #28
def tf_optimize(model,
                Xnew_arr,
                learning_rate=0.01,
                maxiter=100,
                ucb_beta=3.,
                active_dims=None,
                bounds=None):
    Xnew_arr = check_array(Xnew_arr,
                           copy=False,
                           warn_on_dtype=True,
                           dtype=FLOAT_DTYPES)

    Xnew = tf.Variable(Xnew_arr, name='Xnew', dtype=settings.float_type)
    if bounds is None:
        lower_bound = tf.constant(-np.inf, dtype=settings.float_type)
        upper_bound = tf.constant(np.inf, dtype=settings.float_type)
    else:
        lower_bound = tf.constant(bounds[0], dtype=settings.float_type)
        upper_bound = tf.constant(bounds[1], dtype=settings.float_type)
    Xnew_bounded = tf.minimum(tf.maximum(Xnew, lower_bound), upper_bound)

    if active_dims:
        indices = []
        updates = []
        n_rows = Xnew_arr.shape[0]
        for c in active_dims:
            for r in range(n_rows):
                indices.append([r, c])
                updates.append(Xnew_bounded[r, c])
        part_X = tf.scatter_nd(indices, updates, Xnew_arr.shape)
        Xin = part_X + tf.stop_gradient(-part_X + Xnew_bounded)
    else:
        Xin = Xnew_bounded

    beta_t = tf.constant(ucb_beta, name='ucb_beta', dtype=settings.float_type)
    y_mean_var = model.likelihood.predict_mean_and_var(
        *model._build_predict(Xin))
    loss = tf.subtract(y_mean_var[0],
                       tf.multiply(beta_t, y_mean_var[1]),
                       name='loss_fn')
    opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-6)
    train_op = opt.minimize(loss)
    variables = opt.variables()
    init_op = tf.variables_initializer([Xnew] + variables)
    session = model.enquire_session(session=None)
    with session.as_default():
        session.run(init_op)
        for i in range(maxiter):
            session.run(train_op)
        Xnew_value = session.run(Xnew_bounded)
        y_mean_value, y_var_value = session.run(y_mean_var)
        loss_value = session.run(loss)
        assert_all_finite(Xnew_value)
        assert_all_finite(y_mean_value)
        assert_all_finite(y_var_value)
        assert_all_finite(loss_value)
        return GPRGDResult(y_mean_value, y_var_value, loss_value, Xnew_value)
Example #29
def score_histogram(x, score_domain=(1, 5, 1)):
    """
    Histogram of scores 

    Parameters
    ----------
    x : array, shape=(n_samples), dtype=float or int
        A set of scores
    score_domain : array, shape=(3,) OR int, optional 
        Domain of scores, represented by a triple of the minimum, the maximum,
        and strides of the score, if array-like.  
        The range between the minimum and the maximum is divided into the
        specified number of bins, if int.
        default=(1, 5, 1).
    Returns
    -------
    hist : array_like, shape=(n_score_levels,)
        The number of data in each bin
    scores : array_like, shape=(n_score_levels + 1,)
        sequences of possible scores
    """

    # check inputs
    assert_all_finite(x)
    if isinstance(score_domain, (int, np.integer)):
        bins = score_domain
    else:
        assert_all_finite(score_domain)
        bins = generate_score_bins(score_domain)

    # making histogram
    hist, bins = np.histogram(x, bins=bins)

    # candidates of possible scores
    if isinstance(score_domain, (int, np.integer)):
        scores = (bins[1:] + bins[:-1]) / 2
    else:
        scores = np.hstack([
            np.arange(score_domain[0],
                      score_domain[1],
                      score_domain[2],
                      dtype=float), score_domain[1]
        ])

    # return statistics
    return hist, scores
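A hypothetical call to score_histogram with the default score_domain=(1, 5, 1) (a sketch; numpy and the generate_score_bins helper are assumed to be in scope):

import numpy as np

hist, scores = score_histogram(np.array([1, 2, 2, 3, 4, 5, 5, 5]))
# scores == [1., 2., 3., 4., 5.]; hist counts the samples assigned to
# each score level by the bins from generate_score_bins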
Example #30
    def fit(self, X: pd.DataFrame, y):
        # store a mapping from feature value to woe value
        self.mapping_ = dict()
        cols = self.cols or X.columns.tolist()
        conditional_cols = self.conditional_cols or []

        for col in cols:
            if col not in conditional_cols:
                # missing value can not be handled by WoeEncoder
                # since np.nan will fail the equality check
                assert_all_finite(X[col])

            woe_value = woe(X[col], y,
                            conditional=col in conditional_cols,
                            na_values=self.na_values)
            self.mapping_[col] = woe_value
        return self
Example #31
def score_histogram(x, score_domain=(1, 5, 1)):
    """
    Histogram of scores 

    Parameters
    ----------
    x : array, shape=(n_samples), dtype=float or int
        A set of scores
    score_domain : array, shape=(3,) OR int, optional 
        Domain of scores, represented by a triple of the minimum, the maximum,
        and strides of the score, if array-like.  
        The range between the minimum and the maximum is divided into the
        specified number of bins, if int.
        default=(1, 5, 1).
    Returns
    -------
    hist : array_like, shape=(n_score_levels,)
        The number of data in each bin
    scores : array_like, shape=(n_score_levels + 1,)
        sequences of possible scores
    """

    # check inputs
    assert_all_finite(x)
    if isinstance(score_domain, (int, np.integer)):
        bins = score_domain
    else:
        assert_all_finite(score_domain)
        bins = generate_score_bins(score_domain)

    # making histogram
    hist, bins = np.histogram(x, bins=bins)

    # candidates of possible scores
    if isinstance(score_domain, (int, np.integer)):
        scores = (bins[1:] + bins[:-1]) / 2
    else:
        scores = np.hstack(
            [np.arange(score_domain[0], score_domain[1], score_domain[2],
                       dtype=float),
             score_domain[1]])

    # return statistics
    return hist, scores
Example #32
    def transform(self, X: pd.DataFrame, y=None):
        check_is_fitted(self, 'categories_')
        x = X.copy()

        for col in self.cols or X.columns:
            if col not in x:
                msg = 'Column {} is not found in the DataFrame'.format(col)
                if self.error == 'raise':
                    raise ValueError(msg)
                if self.error == 'warn':
                    warnings.warn(msg)

            if not self.fill:
                assert_all_finite(x[col], allow_nan=False)
            else:
                x[col] = x[col].fillna('_MISSING').astype(str)
            cutoff = self.categories_[col]
            _, x[col] = _encode_python(x[col], uniques=cutoff, encode=True, unseen=self.unseen)
        return x
Example #33
def test_baseline_categorical_crossentropy():
    rng = np.random.RandomState(0)

    prediction_dim = 4
    loss = _LOSSES['categorical_crossentropy']()
    for y_train in (np.zeros(shape=100), np.ones(shape=100)):
        y_train = y_train.astype(np.float32)
        baseline_prediction = loss.get_baseline_prediction(
            y_train, prediction_dim)
        assert_all_finite(baseline_prediction)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim)
    assert baseline_prediction.shape == (1, prediction_dim)
    for k in range(prediction_dim):
        p = (y_train == k).mean()
        assert_almost_equal(baseline_prediction[:, k], np.log(p))
Example #34
def test_baseline_poisson():
    rng = np.random.RandomState(0)

    loss = _LOSSES["poisson"](sample_weight=None)
    y_train = rng.poisson(size=100).astype(np.float64)
    # Sanity check, make sure at least one sample is non-zero so we don't take
    # log(0)
    assert y_train.sum() > 0
    baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
    assert np.isscalar(baseline_prediction)
    assert baseline_prediction.dtype == y_train.dtype
    assert_all_finite(baseline_prediction)
    # Make sure baseline prediction produces the log of the mean of all targets
    assert_almost_equal(np.log(y_train.mean()), baseline_prediction)

    # Test baseline for y_true = 0
    y_train.fill(0.0)
    baseline_prediction = loss.get_baseline_prediction(y_train, None, 1)
    assert_all_finite(baseline_prediction)
Example #35
                def on_next(obj):
                    nonlocal self
                    X = obj[["p_log", "q_log"]]
                    check_is_fitted(self, ["is_fitted"])
                    utils.assert_all_finite(X)
                    X = utils.as_float_array(X)
                    self._update_clustering(X)

                    obj_2 = {
                        "i_min": np.min(obj[["i"]]),
                        "i_max": np.max(obj[["i"]]),
                        "cluster": self.clustering,
                        "X": X
                    }

                    if "start_time" in obj.keys():
                        obj_2["start_time"] = obj.iloc[-1]["start_time"]

                    observer.on_next(obj_2)
Example #36
    def fit(self, X):
        """Compute the Robust Shared Response Model

        Parameters
        ----------

        X : list of 2D arrays, element i has shape=[voxels_i, timepoints]
            Each element in the list contains the fMRI data of one subject.
        """
        logger.info('Starting RSRM')

        # Check that the regularizer value is positive
        if 0.0 >= self.lam:
            raise ValueError("Gamma parameter should be positive.")

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects in the input "
                             "data to train the model.")

        # Check for input data sizes
        if X[0].shape[1] < self.features:
            raise ValueError(
                "There are not enough timepoints to train the model with "
                "{0:d} features.".format(self.features))

        # Check if all subjects have same number of TRs for alignment
        number_trs = X[0].shape[1]
        number_subjects = len(X)
        for subject in range(number_subjects):
            assert_all_finite(X[subject])
            if X[subject].shape[1] != number_trs:
                raise ValueError("Different number of alignment timepoints "
                                 "between subjects.")

        # Create a new random state
        self.random_state_ = np.random.RandomState(self.rand_seed)

        # Run RSRM
        self.w_, self.r_, self.s_ = self._rsrm(X)

        return self
Example #37
    def fit(self, X):
        """Compute the Robust Shared Response Model

        Parameters
        ----------

        X : list of 2D arrays, element i has shape=[voxels_i, timepoints]
            Each element in the list contains the fMRI data of one subject.
        """
        logger.info('Starting RSRM')

        # Check that the regularizer value is positive
        if 0.0 >= self.lam:
            raise ValueError("Gamma parameter should be positive.")

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects in the input "
                             "data to train the model.")

        # Check for input data sizes
        if X[0].shape[1] < self.features:
            raise ValueError(
                "There are not enough timepoints to train the model with "
                "{0:d} features.".format(self.features))

        # Check if all subjects have same number of TRs for alignment
        number_trs = X[0].shape[1]
        number_subjects = len(X)
        for subject in range(number_subjects):
            assert_all_finite(X[subject])
            if X[subject].shape[1] != number_trs:
                raise ValueError("Different number of alignment timepoints "
                                 "between subjects.")

        # Create a new random state
        self.random_state_ = np.random.RandomState(self.rand_seed)

        # Run RSRM
        self.w_, self.r_, self.s_ = self._rsrm(X)

        return self
Example #38
def test_baseline_categorical_crossentropy():
    rng = np.random.RandomState(0)

    prediction_dim = 4
    loss = _LOSSES['categorical_crossentropy']()
    for y_train in (np.zeros(shape=100), np.ones(shape=100)):
        y_train = y_train.astype(np.float64)
        baseline_prediction = loss.get_baseline_prediction(y_train,
                                                           prediction_dim)
        assert baseline_prediction.dtype == y_train.dtype
        assert_all_finite(baseline_prediction)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim)
    assert baseline_prediction.shape == (prediction_dim, 1)
    for k in range(prediction_dim):
        p = (y_train == k).mean()
        assert np.allclose(baseline_prediction[k, :], np.log(p))
Example #39
def test_baseline_categorical_crossentropy():
    rng = np.random.RandomState(0)

    prediction_dim = 4
    loss = _LOSSES["categorical_crossentropy"](sample_weight=None)
    for y_train in (np.zeros(shape=100), np.ones(shape=100)):
        y_train = y_train.astype(np.float64)
        baseline_prediction = loss.get_baseline_prediction(
            y_train, None, prediction_dim)
        assert baseline_prediction.dtype == y_train.dtype
        assert_all_finite(baseline_prediction)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, None,
                                                       prediction_dim)
    assert baseline_prediction.shape == (prediction_dim, 1)
    for k in range(prediction_dim):
        p = (y_train == k).mean()
        assert np.allclose(baseline_prediction[k, :], np.log(p))
Example #40
def test_baseline_binary_crossentropy():
    rng = np.random.RandomState(0)

    loss = _LOSSES['binary_crossentropy']()
    for y_train in (np.zeros(shape=100), np.ones(shape=100)):
        y_train = y_train.astype(np.float32)
        baseline_prediction = loss.get_baseline_prediction(y_train, 1)
        assert_all_finite(baseline_prediction)
        assert_almost_equal(loss.inverse_link_function(baseline_prediction),
                            y_train[0])

    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    y_train = rng.randint(0, 2, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, 1)
    assert baseline_prediction.shape == tuple()  # scalar
    p = y_train.mean()
    assert_almost_equal(baseline_prediction, np.log(p / (1 - p)))
Example #41
def test_multinomial_loss_fit_intercept_only():
    """Test that fit_intercept_only returns the mean functional for CCE."""
    rng = np.random.RandomState(0)
    n_classes = 4
    loss = HalfMultinomialLoss(n_classes=n_classes)
    # Same logic as test_specific_fit_intercept_only. Here inverse link
    # function = softmax and link function = log - symmetry term.
    y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64)
    baseline_prediction = loss.fit_intercept_only(y_true=y_train)
    assert baseline_prediction.shape == (n_classes, )
    p = np.zeros(n_classes, dtype=y_train.dtype)
    for k in range(n_classes):
        p[k] = (y_train == k).mean()
    assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p)))
    assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :]))

    for y_train in (np.zeros(shape=10), np.ones(shape=10)):
        y_train = y_train.astype(np.float64)
        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
        assert baseline_prediction.dtype == y_train.dtype
        assert_all_finite(baseline_prediction)
Example #42
def test_baseline_binary_crossentropy():
    rng = np.random.RandomState(0)

    loss = _LOSSES['binary_crossentropy']()
    for y_train in (np.zeros(shape=100), np.ones(shape=100)):
        y_train = y_train.astype(np.float64)
        baseline_prediction = loss.get_baseline_prediction(y_train, 1)
        assert_all_finite(baseline_prediction)
        assert np.allclose(loss.inverse_link_function(baseline_prediction),
                           y_train[0])

    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    y_train = rng.randint(0, 2, size=100).astype(np.float64)
    baseline_prediction = loss.get_baseline_prediction(y_train, 1)
    assert baseline_prediction.shape == tuple()  # scalar
    assert baseline_prediction.dtype == y_train.dtype
    p = y_train.mean()
    assert np.allclose(baseline_prediction, np.log(p / (1 - p)))
Example #43
def mean_squared_error(y_true, y_pred):
    """
    Root mean squared error, mean squared error, and its standard deviation.

    If you need only RMSE, use
    :func:`sklearn.metrics.mean_squared_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    rmse : float
        root mean squared error
    mean : float
        mean of squared errors
    stdev : float
        standard deviation of squared errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = (y_true - y_pred) ** 2
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)
    rmse = np.sqrt(np.maximum(mean, 0.))

    return rmse, mean, stdev
Example #44
def mean_absolute_percentage_error(y_true, y_pred): 
    """
    Use of this metric is not recommended; for illustration only. 
    See other regression metrics on sklearn docs:
      http://scikit-learn.org/stable/modules/classes.html#regression-metrics
    Use like any other metric
    >>> y_true = [3, -0.5, 2, 7]; y_pred = [2.5, -0.3, 2, 8]
    >>> mean_absolute_percentage_error(y_true, y_pred)
    0.17738095238095238
    """
    y_true = np.asanyarray(y_true)
    y_pred = np.asanyarray(y_pred)
    assert_all_finite(y_true)
    assert_all_finite(y_pred)
    #Filter zero values in y_true
    sel = (y_true != 0)
    y_true = y_true[sel]
    y_pred = y_pred[sel]
    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)
#     return np.abs((y_true - y_pred) / y_true.astype(np.float32)).sum()/float(district_num * dateslot_num)
    return np.mean(np.abs((y_true - y_pred) / y_true.astype(np.float32)))
Example #45
    def fit(self, X, pairs):
        """ Fit model with specified loss.

        Parameters
        ----------
        X : scipy.sparse.csc_matrix, (n_samples, n_features)

        pairs : ndarray, shape = (n_compares, 2)
                Each row `i` defines a pair of samples such that
                the first scores higher than the second:
                FM(X[i, 0]) > FM(X[i, 1]).
        """
        X = X.T
        X = check_array(X, accept_sparse="csc", dtype=np.float64)
        assert_all_finite(pairs)

        pairs = pairs.astype(np.float64)
        # check that pairs contain only integral values
        assert_array_equal(pairs, pairs.astype(np.int32))
        assert pairs.max() <= X.shape[1]
        assert pairs.min() >= 0
        self.w0_, self.w_, self.V_ = ffm.ffm_fit_sgd_bpr(self, X, pairs)
        return self
Example #46
    def fit(self, X, y=None):
        """Compute the probabilistic Shared Response Model

        Parameters
        ----------
        X :  list of 2D arrays, element i has shape=[voxels_i, samples]
            Each element in the list contains the fMRI data of one subject.

        y : not used
        """
        if self.verbose:
            print('Running Probabilistic SRM')  # noqa FIXME

        # Check the number of subjects
        if len(X) <= 1:
            raise ValueError("There are not enough subjects "
                             "({0:d}) to train the model.".format(len(X)))

        # Check for input data sizes
        if X[0].shape[1] < self.features:
            raise ValueError(
                "There are not enough samples to train the model with "
                "{0:d} features.".format(self.features))

        # Check if all subjects have same number of TRs
        number_trs = X[0].shape[1]
        number_subjects = len(X)
        for subject in range(number_subjects):
            assert_all_finite(X[subject])
            if X[subject].shape[1] != number_trs:
                raise ValueError(
                    "Different number of samples between subjects.")

        # Run SRM
        self.sigma_s_, self.w_, self.mu_, self.rho2_, self.s_ = self._srm(X)

        return self
Example #47
def testLearner(d_dfData, s_symbol, d_dfFeatures, d_dfClass, b_scaling, b_pca, fc_learnerFactory, i_trainPeriod, b_Plot = False):
    t1 = datetime.now()
    
    df_data = d_dfFeatures[s_symbol]
    #print df_data.to_csv()
    df_classData = d_dfClass[s_symbol]
    #print df_classData.to_csv()
    success = float(0)
    success_up = float(0)
    success_down = float(0)
    
    count = 0
    for i in range(i_trainPeriod, df_data.index.size - i_forwardlook + 1):
        day = df_data.index[i]
        na_data = df_data.iloc[i - i_trainPeriod:i].values
        y_train = df_classData.iloc[i - i_trainPeriod:i].values.ravel()
        x_predict = df_data.iloc[i].values
        #print "{} - nans: data:{} class:{} x_predict:{} price:{}".format(day, np.count_nonzero(np.isnan(na_data)), np.count_nonzero(np.isnan(y_train)), np.count_nonzero(np.isnan(x_predict)), d_dfData['close'][s_symbol][day])
        try:
            assert_all_finite(na_data)
            assert_all_finite(y_train)
            assert_all_finite(x_predict)
        except ValueError:
            continue
        if b_scaling == True:
            scaler = preprocessing.StandardScaler().fit(na_data)
            x_train = scaler.transform(na_data)
        else:
            x_train = na_data
        
        
        if b_pca == True:
            pca = decomposition.PCA(n_components = 40)
            pca.fit(x_train)
            x_train = pca.transform(x_train)
            x_predict = pca.transform(x_predict.reshape(1, -1))
        
        i_prediction = fc_learnerFactory(x_train, y_train, x_predict)
        if (i_prediction == df_classData.iloc[i][0]):
            success += 1
            if (i_prediction == 1):
                success_up += 1
            else:
                success_down += 1
        count += 1
        #sys.stdout.write(str(all_count - count) + " to go\r")
    if count == 0:
        print(s_symbol + " no prediction")
    else:
        print(s_symbol + " success rate: " + str(success / count) + " up: " + str(success_up / count) + " down: " + str(success_down / count) + " count: " + str(count))
Example #48
np.ravel(yKrn)
safe_asarray(yKrn)   #.ravel()
np.asarray_chkfinite(yKrn)
print("l885: yKrn", type(yKrn) )
#yKrn = yKrn.astype(numpy.float32, copy=False)


#safe_asarray(yKrn).ravel()#print(len(yKrn) )
np.array(yKrn,float);  as_float_array(XKrn);  as_float_array(yKrn)

#yKrn.astype(float)
np.float64(yKrn);  np.asarray( yKrn )
#warn_if_not_float(yKrn)
XKrn = XKrn[np.logical_not(np.isnan(XKrn))]; yKrn = yKrn[np.logical_not(np.isnan(yKrn))]
assert_all_finite(XKrn); assert_all_finite(yKrn)    #X_vec = np.vectorize(XKrn) #y_vec = np.vectorize(yKrn)
XKrn.ravel(), yKrn.ravel()
print("912: XKrn row, yKrn row", XKrn.shape, yKrn.shape )
#new_list =[ (F, T) [boolean test] for x in old_list ]
#chk0 = [ (0.001)  [i == True] for i in yKrn ]


print ("minX", np.min(XKrn) )
print ("y shape", yKrn.shape)
indices =  (yKrn == 0).nonzero()       #   np.nonzero(yKrn)
print ("**zero-valued yKrn", indices, yKrn[indices])
for i in indices:
        yKrn[i]=1e-10
indx = (XKrn == 0).nonzero()
for i in indx:
        XKrn[i]=1e-10
Example #49
def score_predictor_statistics(y_true, y_pred, score_domain=(1, 5, 1)):
    """
    Full Statistics of prediction performance
    
    * n_samples
    * mean_absolute_error: mean, stdev
    * mean_squared_error: mean, rmse, stdev 
    * predicted: mean, stdev
    * true: mean, stdev

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    score_domain : array, shape=(3,)
        Domain of scores, represented by a triple: start, end, and stride
        default=(1, 5, 1).

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # a list of possible score levels
    stats['score levels'] = np.hstack([
        np.arange(score_domain[0], score_domain[1], score_domain[2],
                  dtype=float), score_domain[1]])

    # mean absolute error
    mean, stdev = mean_absolute_error(y_true, y_pred)
    stats['mean absolute error'] = {'mean': mean, 'stdev': stdev}

    # root mean squared error
    rmse, mean, stdev = mean_squared_error(y_true, y_pred)
    stats['mean squared error'] = {'rmse': rmse, 'mean': mean, 'stdev': stdev}

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    hist, _ = score_histogram(y_true, score_domain=score_domain)
    stats['true']['histogram'] = hist
    stats['true']['histogram density'] = (hist / hist.sum())

    # descriptive statistics of ground predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    hist, _ = score_histogram(y_pred, score_domain=score_domain)
    stats['predicted']['histogram'] = hist
    stats['predicted']['histogram density'] = (hist / hist.sum())

    return stats
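A made-up end-to-end call to score_predictor_statistics (it builds on the mean_absolute_error, mean_squared_error, and score_histogram helpers shown elsewhere in this collection):

import numpy as np

stats = score_predictor_statistics(np.array([3, 4, 5, 5]),
                                   np.array([3.1, 3.9, 4.8, 5.2]))
# stats['score levels'] == [1., 2., 3., 4., 5.] for the default domain;
# stats['true'] and stats['predicted'] also include per-level histograms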
Example #50
    def fit(self, X, y, Z):
        """Compute the Semi-Supervised Shared Response Model

        Parameters
        ----------

        X : list of 2D arrays, element i has shape=[voxels_i, n_align]
            Each element in the list contains the fMRI data for alignment of
            one subject. There are n_align samples for each subject.

        y : list of arrays of int, element i has shape=[samples_i]
            Each element in the list contains the labels for the data samples
            in Z.

        Z : list of 2D arrays, element i has shape=[voxels_i, samples_i]
            Each element in the list contains the fMRI data of one subject
            for training the MLR classifier.

        """
        logger.info('Starting SS-SRM')

        # Check that the alpha value is in range (0.0,1.0)
        if 0.0 >= self.alpha or self.alpha >= 1.0:
            raise ValueError("Alpha parameter should be in range (0.0, 1.0)")

        # Check that the regularizer value is positive
        if 0.0 >= self.gamma:
            raise ValueError("Gamma parameter should be positive.")

        # Check the number of subjects
        if len(X) <= 1 or len(y) <= 1 or len(Z) <= 1:
            raise ValueError("There are not enough subjects in the input "
                             "data to train the model.")

        if not (len(X) == len(y)) or not (len(X) == len(Z)):
            raise ValueError("Different number of subjects in data.")

        # Check for input data sizes
        if X[0].shape[1] < self.features:
            raise ValueError(
                "There are not enough samples to train the model with "
                "{0:d} features.".format(self.features))

        # Check if all subjects have same number of TRs for alignment
        # and if alignment and classification data have the same number of
        # voxels per subject. Also check that there are labels for all the
        # classification samples.
        number_trs = X[0].shape[1]
        number_subjects = len(X)
        for subject in range(number_subjects):
            assert_all_finite(X[subject])
            assert_all_finite(Z[subject])
            if X[subject].shape[1] != number_trs:
                raise ValueError("Different number of alignment samples "
                                 "between subjects.")
            if X[subject].shape[0] != Z[subject].shape[0]:
                raise ValueError("Different number of voxels between alignment"
                                 " and classification data (subject {0:d})"
                                 ".".format(subject))
            if Z[subject].shape[1] != y[subject].size:
                raise ValueError("Different number of samples and labels in "
                                 "subject {0:d}.".format(subject))

        # Map the classes to [0..C-1]
        new_y = self._init_classes(y)

        # Run SS-SRM
        self.w_, self.s_, self.theta_, self.bias_ = self._sssrm(X, Z, new_y)

        return self
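To make the expected input shapes concrete, here is a hypothetical setup for this SS-SRM fit (all sizes are invented, and `model` stands for an instance of the class above):

import numpy as np

rng = np.random.RandomState(0)
n_subjects, n_voxels, n_align, n_samples = 2, 50, 30, 10
X = [rng.randn(n_voxels, n_align) for _ in range(n_subjects)]       # alignment data
Z = [rng.randn(n_voxels, n_samples) for _ in range(n_subjects)]     # classification data
y = [rng.randint(0, 3, size=n_samples) for _ in range(n_subjects)]  # labels for Z
# model.fit(X, y, Z), assuming model.features <= n_align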