    def process_phonescreen_day_data(self, user_id: str, touchstream: List[DataPoint],
                                     input_touchstream: DataStream, gm: GaussianMixture):
        """
        Analyze the phone touch screen gap to find typing, pause between typing, reading
        and unknown sessions. It uses the Gaussian Mixture algorithm to find different peaks
        in a mixture of 4 different gaussian distribution of screen touch gap.

        :param str user_id: UUID of the stream owner
        :param List(DataPoint) touchstream: Phone touch screen stream data
        :param DataStream input_touchstream: DataStream object of phone touch screen
        :param DataStream input_categorystream: DataStream object of app category stream
        :param GaussianMixture gm: GaussianMixture object created from all day data of the user
        :return:
        """
        touchstream = sorted(touchstream, key=lambda x: x.start_time)

        # appusage = self.get_appusage_duration_by_category(categorystream, ["Communication", "Productivity"])
        # tapping_gap = self.appusage_interval_list(touchstream, appusage)
        #         if len(tapping_gap) < 50:
        #             self.CC.logging.log("Not enough screen touch data")
        #             return
        tapping_gap = []
        for i in range(1, len(touchstream)):
            tapping_gap.append(touchstream[i].sample - touchstream[i-1].sample)

        tapping_gap = sorted(tapping_gap)
        if len(tapping_gap) == 0:
            self.CC.logging.log("Not enough screen touch data")
            return

        #gm = GaussianMixture(n_components = 4, max_iter = 500)#, covariance_type = 'spherical')
        X = (np.array(tapping_gap)/1000).reshape(-1, 1)
        #gm.fit(X)

        P = gm.predict(X)
        mx = np.zeros(gm.get_params()['n_components'])
        mn = np.full(gm.get_params()['n_components'], np.inf)
        for i in range(len(P)):
            x = P[i]
            mx[x] = max(mx[x], X[i][0])
            mn[x] = min(mn[x], X[i][0])

        intervals = []
        for i in range(len(mx)):
            intervals.append((mn[i], mx[i]))
        intervals = sorted(intervals)

        try:
            data = self.label_appusage_intervals(touchstream, intervals,
                                                 ["typing", "pause", "reading", "unknown"])
            if data:
                self.store_stream(filepath="phone_touch_type_all_app.json",
                                  input_streams=[input_touchstream],
                                  user_id=user_id, data=data, localtime=False)
        except Exception as e:
            self.CC.logging.log("Exception:", str(e))
            self.CC.logging.log(str(traceback.format_exc()))
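
The commented-out lines above hint at how the gm argument is expected to be prepared: a 4-component
mixture fitted on the user's full-day touch gaps before the per-day processing. A minimal sketch of
that preparation step, using a hypothetical array of gaps (all_day_gaps_ms), might look like this:

import numpy as np
from sklearn.mixture import GaussianMixture

# Hypothetical inter-touch gaps (milliseconds) collected over a whole day.
all_day_gaps_ms = np.array([120, 340, 90, 2500, 15000, 800, 60, 4200], dtype=float)

# Fit a 4-component mixture on gaps converted to seconds, mirroring the
# commented-out gm construction in process_phonescreen_day_data.
gm = GaussianMixture(n_components=4, max_iter=500)
gm.fit((all_day_gaps_ms / 1000).reshape(-1, 1))
# gm can now be passed into process_phonescreen_day_data(...)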
def Gaussian_Mixture_Model(X_train, y_train, X_test, y_test, max_iter):
    '''
    Isomap demonstrated that the data is distributed in overlapping groups.
    Therefore, samples should be allocated to digits based on a
    Gaussian distribution.
    '''
    n_components = 10
    model = GaussianMixture(n_components=n_components, max_iter=max_iter)
    # Note: GaussianMixture is unsupervised, so fit() ignores y_train.
    classifier = model.fit(X_train, y_train)
    testing_model = model.predict(X_test)
    # Raw component indices are compared with y_test; this is only meaningful
    # if the mixture components happen to align with the digit labels.
    score = accuracy_score(y_test, testing_model)
#     cv_scores = cross_val_score(classifier, X_test, y_test, cv = 3)

    print(' ')
    print('===== Gaussian Mixture Model =====')
    print('score:', score)
#     print('cross validation scores:', cv_scores)
    
    # Visualize parameters in a table.
    visualize_params(model.get_params())
    
    # Visualize actual labels versus predicted labels.
    visualize_heatmap(y_test, testing_model, 'Gaussian Mixture Model')    
    
    return score
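
GaussianMixture is unsupervised, so the component indices returned by predict have no inherent
correspondence to the digit labels, and comparing them directly with y_test can understate
performance. One common remedy, shown here only as a hedged sketch (map_components_to_labels is
not part of the original code), is to relabel each component with the majority training label it
captures before scoring:

import numpy as np
from sklearn.metrics import accuracy_score

def map_components_to_labels(model, X_train, y_train, X_test):
    """Relabel predicted mixture components with the majority class seen during training."""
    train_components = model.predict(X_train)
    y_train = np.asarray(y_train)
    mapping = {}
    for k in np.unique(train_components):
        labels_in_k = y_train[train_components == k]
        mapping[k] = np.bincount(labels_in_k).argmax()  # majority vote
    return np.array([mapping[k] for k in model.predict(X_test)])

# Example: score = accuracy_score(y_test, map_components_to_labels(model, X_train, y_train, X_test))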
Example 3
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data using Gaussian Mixture Models.

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param max_n_clusters: maximal number of clusters for automatic selection for number of clusters.
        if None, then use n_clusters from arguments
    :param use_csi: if True, then cluster stability index will be calculated (may take a lot of time)
    :param random_state: random state for GaussianMixture clusterer
    :param kwargs: keyword arguments for sklearn.mixture.GaussianMixture
    :return: np.array of clusters
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters,
                                      random_state, **kwargs)
    else:
        kmargs = {
            i: j
            for i, j in kwargs.items()
            if i in GaussianMixture().get_params()
        }
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    km.labels_ = cl
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
Example 4
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data using Gaussian Mixture Models.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection for number of clusters. If ``None``, then uses ```n_clusters`` from arguments. Default: `None```
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated. IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for GaussianMixture clusterer.
    kwargs: optional
        Parameters for ``sklearn.mixture.GaussianMixture``

    Returns
    --------
    Array of clusters

    Return type
    --------
    np.array
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters,
                                      random_state, **kwargs)
    else:
        kmargs = {
            i: j
            for i, j in kwargs.items()
            if i in GaussianMixture().get_params()
        }
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    km.labels_ = cl
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
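
A rough usage sketch of this helper, assuming the surrounding retentioneering utilities
(find_best_n_clusters, calc_all_metrics) are importable and using a hypothetical feature matrix:

import numpy as np
import pandas as pd

# Hypothetical features: one row per user/session, numeric feature columns.
features = pd.DataFrame(np.random.rand(200, 5),
                        index=["user_%i" % i for i in range(200)])

# Cluster with an explicit number of components; skip the (slow) stability index.
clusters, metrics = GMM(features, use_csi=False, n_components=4)
print(clusters[:10])
print(metrics)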
Example 5
def run_sample(word2vec_model: str, sample_size, averaged_category_map,
               averaged_comment_map):
    # Convert article-comment map into a matrix of averaged comment weight vectors
    print("Converting comment map to matrix...")
    comment_matrix = convert_to_matrix(averaged_comment_map)
    print()

    if sample_size is not None:
        print("Selecting " + str(sample_size) + " samples...", end="")
        sample_indices = np.random.choice(comment_matrix.shape[0],
                                          size=sample_size,
                                          replace=False)
        samples = comment_matrix[sample_indices, :]
    else:
        sample_size = len(comment_matrix)
        print("Selecting " + str(sample_size) + " samples...", end="")
        samples = comment_matrix
    print(" Done.")
    print()

    # Set up GMM and fit to data
    print("Fitting using GMM...")
    gmm = GaussianMixture(n_components=50, verbose=2, verbose_interval=1)
    gmm.fit(samples)
    print()

    gmm_data = {
        "params": gmm.get_params(),
        "weights": gmm.weights_,
        "means": gmm.means_,
        "covariances": gmm.covariances_,
        "precisions": gmm.precisions_,
        "precisions_cholesky": gmm.precisions_cholesky_
    }

    np.save(
        "../resources/gmm_" + str(sample_size) + "_" + word2vec_model + ".npy",
        gmm_data)
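
Because the parameters are stored as a Python dict via np.save, reading them back requires
allow_pickle=True. A hedged sketch of restoring an equivalent estimator from such a file (the
file name below simply follows the naming pattern used above):

import numpy as np
from sklearn.mixture import GaussianMixture

gmm_data = np.load("../resources/gmm_1000_model.npy", allow_pickle=True).item()

# Rebuild the estimator from the stored constructor parameters, then attach the fitted attributes.
gmm = GaussianMixture(**gmm_data["params"])
gmm.weights_ = gmm_data["weights"]
gmm.means_ = gmm_data["means"]
gmm.covariances_ = gmm_data["covariances"]
gmm.precisions_ = gmm_data["precisions"]
gmm.precisions_cholesky_ = gmm_data["precisions_cholesky"]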
def gmm():
    gmm = GaussianMixture(n_components=3).fit(iris_X)
    print(gmm.get_params())  # get_params() takes no data argument
    labels = gmm.predict(iris_X)
    # For each true iris class (samples 0-49, 50-99, 100-149), count how many
    # samples fall outside that class's majority predicted component.
    count1 = 0
    count2 = 0
    count3 = 0
    sum2 = np.sum(labels[:50] == 0)
    sum0 = np.sum(labels[:50] == 1)
    sum1 = np.sum(labels[:50] == 2)
    if sum1 > sum2 and sum1 > sum0:
        count1 = sum2 + sum0
    elif sum0 > sum1 and sum0 > sum2:
        count1 = sum1 + sum2
    else:
        count1 = sum1 + sum0
    sum2 = np.sum(labels[50:100] == 0)
    sum0 = np.sum(labels[50:100] == 1)
    sum1 = np.sum(labels[50:100] == 2)
    if sum1 > sum2 and sum1 > sum0:
        count2 = sum2 + sum0
    elif sum0 > sum1 and sum0 > sum2:
        count2 = sum1 + sum2
    else:
        count2 = sum1 + sum0
    sum2 = np.sum(labels[100:150] == 0)
    sum0 = np.sum(labels[100:150] == 1)
    sum1 = np.sum(labels[100:150] == 2)
    if sum1 > sum2 and sum1 > sum0:
        count3 = sum2 + sum0
    elif sum0 > sum1 and sum0 > sum2:
        count3 = sum1 + sum2
    else:
        count3 = sum1 + sum0
    accuracy = 1 - (count1 + count2 + count3) / 150
    return accuracy
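
The three counting blocks above each compute, for one true iris class (50 samples), how many
samples fall outside that class's majority predicted component. A more compact, roughly
equivalent sketch (ties are resolved toward the majority component) could be:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.mixture import GaussianMixture

def gmm_purity_accuracy(X, n_classes=3, class_size=50):
    """Cluster with a GMM and score each true class by its majority predicted component."""
    labels = GaussianMixture(n_components=n_classes).fit_predict(X)
    errors = 0
    for start in range(0, n_classes * class_size, class_size):
        counts = np.bincount(labels[start:start + class_size], minlength=n_classes)
        errors += class_size - counts.max()  # samples outside the majority component
    return 1 - errors / (n_classes * class_size)

print(gmm_purity_accuracy(load_iris().data))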
Example 7
class pcm(object):
    """Profile Classification Model class constructor

        Consume and return :mod:`xarray` objects

    """
    def __init__(self,
                 K:int,
                 features:dict(),
                 scaling=1,
                 reduction=1, maxvar=15,
                 classif='gmm', covariance_type='full',
                 verb=False,
                 debug=False,
                 timeit=False, timeit_verb=False,
                 chunk_size='auto',
                 backend='sklearn'):
        """Create the PCM instance

        Parameters
        ----------
        K: int
            The number of classes, or clusters, in the classification model.

        features: dict()
            The vertical axis to use for each feature,
            e.g. {'temperature': np.arange(-2000, 0, 1)}

        scaling: int (default: 1)
            Define the scaling method:

            - 0: No scaling
            - **1: Center on sample mean and scale by sample std**
            - 2: Center on sample mean only

        reduction: int (default: 1)
            Define the dimensionality reduction method:

            - 0: No reduction
            - **1: Reduction using :class:`sklearn.decomposition.PCA`**

        maxvar: float (default: 15)
            Maximum feature variance to preserve in the reduced dataset using :class:`sklearn.decomposition.PCA`. In %.

        classif: str (default: 'gmm')
            Define the classification method.
            The only method available as of now is a Gaussian Mixture Model.
            See :class:`sklearn.mixture.GaussianMixture` for more details.

        covariance_type: str (default: 'full')
            Define the type of covariance matrix shape to be used in the default classifier GMM.
            It can be 'full' (default), 'tied', 'diag' or 'spherical'.

        verb: boolean (default: False)
            More verbose output

        timeit: boolean (default: False)
            Register time of operation for performance evaluation

        timeit_verb: boolean (default: False)
            Print time of operation during execution

        chunk_size: 'auto' or int
            Sampling chunk size, (array of features after pre-processing)

        backend: str
            Statistic library backend, 'sklearn' (default) or 'dask_ml'

        """
        if K == 0:
            raise PCMClassError("Can't create a PCM with K=0")
        if K is None:
            raise PCMClassError("K must be defined to create a PCM")
        if not bool(features):
            raise PCMFeatureError("Can't create a PCM without features")

        if   scaling==0: with_scaler = 'none'; with_mean=False; with_std = False
        elif scaling==1: with_scaler = 'normal'; with_mean=True; with_std = True
        elif scaling==2: with_scaler = 'center'; with_mean=True; with_std = False
        else: raise NameError('scaling must be 0, 1 or 2')
        
        if   reduction==0: with_reducer = False
        elif reduction==1: with_reducer = True
        else: raise NameError('reduction must be 0 or 1')
        
        if classif=='gmm': with_classifier = 'gmm';
        else: raise NameError("classifier must be 'gmm' (no other methods implemented at this time)")

        #todo check validity of the dict of features

        self._props = {'K': int(K),
                       'F': len(features),
                        'llh': None,
                        'COVARTYPE': covariance_type,
                        'with_scaler': with_scaler,
                        'with_reducer': with_reducer,
                        'with_classifier': with_classifier,
                        'maxvar': maxvar,
                        'features': collections.OrderedDict(features),
                        'chunk_size': chunk_size,
                        'backend': backend}
        self._xmask = None # xarray mask for nd-array used at pre-processing steps
        self._register = collections.OrderedDict() # Will register mutable instances of sub-modules like 'plot'

        self._verb = verb #todo _verb is a property, should be set/get with a decorator
        self._debug = debug

        self._interpoler = collections.OrderedDict()
        self._scaler = collections.OrderedDict()
        self._scaler_props = collections.OrderedDict()
        self._reducer = collections.OrderedDict()
        self._homogeniser = collections.OrderedDict()

        # Load estimators for a specific backend:
        bck = StatisticsBackend(backend, scaler='StandardScaler', reducer='PCA')

        for feature_name in features:
            feature_axis = self._props['features'][feature_name]
            if isinstance(feature_axis, xr.DataArray):
                self._props['features'][feature_name] = feature_axis.values

            # self._scaler[feature_name] = preprocessing.StandardScaler(with_mean=with_mean,
            #                                             with_std=with_std)
            if 'none' not in self._props['with_scaler']:
                self._scaler[feature_name] = bck.scaler(with_mean=with_mean, with_std=with_std)
            else:
                self._scaler[feature_name] = NoTransform()
            self._scaler_props[feature_name] = {'units': '?'}

            is_slice = np.all(feature_axis == None)
            if not is_slice:
                self._interpoler[feature_name] = Vertical_Interpolator(axis=feature_axis, debug=self._debug)
                if np.prod(feature_axis.shape) == 1:
                    # Single level: no need to reduce
                    if self._debug: print('Single level, no need to reduce', np.prod(feature_axis.ndim))
                    self._reducer[feature_name] = NoTransform()
                else:
                    # Multi-vertical-levels, set reducer:
                    if with_reducer:
                        self._reducer[feature_name] = bck.reducer(n_components=self._props['maxvar'],
                                                                  svd_solver='full')
                    else:
                        self._reducer[feature_name] = NoTransform()
            else:
                self._interpoler[feature_name] = NoTransform()
                self._reducer[feature_name] = NoTransform()
                if self._debug: print("%s is single level, no need to reduce" % feature_name)

            self._homogeniser[feature_name] = {'mean': 0, 'std': 1}

        self._classifier = GaussianMixture(n_components=self._props['K'],
                                          covariance_type=self._props['COVARTYPE'],
                                          init_params='kmeans',
                                          max_iter=1000,
                                          tol=1e-6)

        # Define the "context" to execute some functions inner code
        # (useful for time benchmarking)
        self._context = self.__empty_context # Default is empty, do nothing
        self._context_args = dict()
        if timeit:
            self._context = self.__timeit_context
            self._context_args = {'maxlevel': 3, 'verb':timeit_verb}
            self._timeit = dict()

        # Define statistics for the fit method:
        self._fit_stats = dict({'datetime': None, 'n_samples_seen_': None, 'score': None, 'etime': None})

    @contextmanager
    def __timeit_context(self, name, opts=dict()):
        default_opts = {'maxlevel': np.inf, 'verb':False}
        for key in opts:
            if key in default_opts:
                default_opts[key] = opts[key]
        level = len([i for i in range(len(name)) if name.startswith('.', i)])
        if level <= default_opts['maxlevel']:
            startTime = time.time()
            yield
            elapsedTime = time.time() - startTime
            trailingspace = " " * level
            trailingspace = " "
            if default_opts['verb']:
                # print('... time in {} {}: {} ms'.format(trailingspace, name, int(elapsedTime * 1000)))
                print('{} {}: {} ms'.format(trailingspace, name, int(elapsedTime * 1000)))
            if name in self._timeit:
                self._timeit[name].append(elapsedTime * 1000)
            else:
                self._timeit[name] = list([elapsedTime*1000])
        else:
            yield

    @contextmanager
    def __empty_context(self, name, *args, **kargs):
        yield

    def __call__(self, **kwargs):
        self.__init__(**kwargs)
    
    def __iter__(self):
        self.__i = 0
        return self
    
    def __next__(self):
        if self.__i < self.K:
            i = self.__i
            self.__i += 1
            return i
        else:
            raise StopIteration()

    def __repr__(self):
        return self.display(deep=self._verb)

    def ravel(self, da, dim=None, feature_name=str()):
        """ Extract from N-d array a X(feature,sample) 2-d array and vertical dimension z

            Parameters
            ----------
            da: :class:`xarray.DataArray`
                The DataArray to process

            dim: str
                Name of the vertical dimension in the input :class:`xarray.DataArray`

            feature_name: str
                Target PCM feature name for the input :class:`xarray.DataArray`

            Returns
            -------
            X: :class:`xarray.DataArray`
                A new DataArray with dimension ['n_sampling','n_features']
                Note that data are always :class:`dask.array.Array`.

            z: :class:`numpy.array`
                The vertical axis of data

            sampling_dims: dict()
                Dictionary where keys are :class:`xarray.Dataset` variable names of features
                and values are another dictionary with the list of sampling dimension in
                ``DIM_SAMPLING`` key and the name of the vertical axis in the ``DIM_VERTICAL`` key.

            Examples
            --------
            This function is meant to be used internally only

            __author__: [email protected]

        """

        # Is this a thick array or a slice ?
        is_slice = np.all(self._props['features'][feature_name] == None)

        # Load mask where all features are available for this PCM:
        mask_stacked = self._xmask

        if is_slice:
            # No vertical dimension to use, simple stacking
            sampling_dims = list(da.dims)
            # Apply all-features mask:
            X = da.stack({'sampling': sampling_dims})
            X = X.where(mask_stacked == 1, drop=True).expand_dims('dummy').transpose()#.values
            z = np.empty((1,))
        else:
            if not dim:
                # Try to infer the vertical dimension name looking for the CF 'axis' attribute in all dimensions
                dim_found = False
                for this_dim in da.dims:
                    if ('axis' in da[this_dim].attrs) and (da[this_dim].attrs['axis'] == 'Z'):
                        dim = this_dim
                        dim_found = True
                if not dim_found:
                    raise PCMFeatureError("You must specify a vertical dimension name: "\
                                          "use argument 'dim' or "\
                                          "specify DataSet dimension the attribute 'axis' to 'Z' (CF1.6)")
            elif dim not in da.dims:
                raise ValueError("Vertical dimension %s not found in this DataArray" % dim)

            sampling_dims = list(da.dims)
            sampling_dims.remove(dim)
            X = da.stack({'sampling': sampling_dims}) #todo Improve performance for this operation !
            # Apply all-features mask:
            X = X.where(mask_stacked == 1, drop=True).transpose()
            z = da[dim].values

        X = X.chunk(chunks={'sampling': self._props['chunk_size']})
        return X, z, sampling_dims

    def unravel(self, ds, sampling_dims, X):
        """ Create a DataArray from a numpy array and sampling dimensions """

        # Load mask where all features are available for this PCM:
        mask_stacked = self._xmask

        #
        coords = list()
        size = list()
        for dim in sampling_dims:
            coords.append(ds[dim])
            size.append(len(ds[dim]))
        da = xr.DataArray(np.empty((size)), coords=coords)
        da = da.stack({'sampling': sampling_dims})
        da = da.where(mask_stacked == 1, drop=True).transpose()
        da.values = X
        da = da.unstack('sampling')

        if (np.prod(da.shape) != mask_stacked.shape[0]):
            if self._debug:
                print("\tUnravelled data not matching mask dimension, re-indexing")
            mask = mask_stacked.unstack()
            da = da.reindex_like(mask)

        return da

    @property
    def K(self):
        """Return the number of classes"""
        return self._props['K']

    @property
    def F(self):
        """Return the number of features"""
        return self._props['F']

    @property
    def features(self):
        """Return features definition dictionnary"""
        return self._props['features']

    @property
    def plot(self):
        """Access plotting functions"""
        # Create a mutable instance on 1st call so that later changes will be reflected in future calls
        # https://stackoverflow.com/a/8140747
        if 'plot' not in self._register:
            self._register['plot'] = [_PlotMethods(self)]
        return self._register['plot'][0]

    @property
    def stat(self):
        """Access statistics functions"""
        return _StatMethods(self)

    @property
    def timeit(self):
        """ Return a :class:`pandas.DataFrame` with Execution time of method called on this instance """

        def get_multindex(times):
            """ Create multi-index pandas """
            # Get max levels:
            dpt = list()
            [dpt.append(len(key.split("."))) for key in times]
            max_dpt = np.max(dpt)
            # Read index:
            levels_1 = list()
            levels_2 = list()
            levels_3 = list()
            levels_4 = list()
            if max_dpt == 1:
                for key in times:
                    levels = key.split(".")
                    levels_1.append(levels[0])
                return max_dpt, [levels_1]
            elif max_dpt == 2:
                for key in times:
                    levels = key.split(".")
                    if len(levels) == 1:
                        levels_1.append(levels[0])
                        levels_2.append('total')
                    if len(levels) == 2:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                return max_dpt, [levels_1,levels_2]
            elif max_dpt == 3:
                for key in times:
                    levels = key.split(".")
        #             print(len(levels), levels)
                    if len(levels) == 1:
                        levels_1.append(levels[0])
                        levels_2.append('total')
                        levels_3.append('')
                    if len(levels) == 2:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                        levels_3.append('total')
                    if len(levels) == 3:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                        levels_3.append(levels[2])
                return max_dpt, [levels_1,levels_2,levels_3]
            elif max_dpt == 4:
                for key in times:
                    levels = key.split(".")
                    if len(levels) == 1:
                        levels_1.append(levels[0])
                        levels_2.append('total')
                        levels_3.append('')
                        levels_4.append('')
                    if len(levels) == 2:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                        levels_3.append('total')
                        levels_4.append('')
                    if len(levels) == 3:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                        levels_3.append(levels[2])
                        levels_4.append('total')
                    if len(levels) == 4:
                        levels_1.append(levels[0])
                        levels_2.append(levels[1])
                        levels_3.append(levels[2])
                        levels_4.append(levels[3])
                return max_dpt, [levels_1,levels_2,levels_3,levels_4]

        times = self._timeit
        max_dpt, arrays = get_multindex(times)
        if max_dpt == 1:
            index = pd.Index(arrays[0], names=['Method'])
            df = pd.Series([np.sum(times[key]) for key in times], index=index)
            # df = df.T
        elif max_dpt == 2:
            tuples = list(zip(*arrays))
            index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method'])
            df = pd.Series([np.sum(times[key]) for key in times], index=index)
            df = df.unstack(0)
            df = df.drop('total')
            df = df.T
        elif max_dpt == 3:
            tuples = list(zip(*arrays))
            index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method', 'Sub-sub-method'])
            df = pd.Series([np.sum(times[key]) for key in times], index=index)
    #         df = df.unstack(0)
        elif max_dpt == 4:
            tuples = list(zip(*arrays))
            index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method', 'Sub-sub-method',
                                                             'Sub-sub-sub-method'])
            df = pd.Series([np.sum(times[key]) for key in times], index=index)

        return df

    @property
    def backend(self):
        """Return the name of the statistic backend"""
        return self._props['backend']

    @property
    def fitstats(self):
        """ Estimator fit properties

            The number of samples processed by the estimator
            Will be reset on new calls to fit, but increments across partial_fit calls.
        """
        return self._fit_stats

    @docstring(io.to_netcdf.__doc__)
    def to_netcdf(self, ncfile, **ka):
        """ Save PCM to netcdf file

            Parameters
            ----------
            ncfile : str
                Path to the netcdf file to write
        """
        return io.to_netcdf(self, ncfile, **ka)

    def display(self, deep=False):
        """Display detailed parameters of the PCM
            Unlike get_params, this does not return a dictionary.
            Set the boolean option 'deep' to True to display all properties.
        """
        summary = [("<pcm '%s' (K: %i, F: %i)>")%(self._props['with_classifier'],
                                                  self._props['K'],
                                                  len(self._props['features']))]
        
        # PCM core properties:
        prop_info = ('Number of classes: %i') % self._props['K']
        summary.append(prop_info)
        prop_info = ('Number of features: %i') % len(self._props['features'])
        summary.append(prop_info)

        prop_info = ('Feature names: %s') % (repr(self._props['features'].keys()))
        summary.append(prop_info)

        # prop_info = ('Feature axis: [%s, ..., %s]') % (repr(self._props['features'][0]),
        #                                                repr(self._props['feature_axis'][-1]))
        # summary.append(prop_info)
        
        prop_info = ('Fitted: %r') % hasattr(self, 'fitted')
        summary.append(prop_info)

        # PCM workflow parameters:
        for feature in self._props['features']:
            prop_info = "Feature: '%s'" % feature
            summary.append(prop_info)
            summary.append("\t Interpoler: %s"%(type(self._interpoler[feature])))

            # prop_info = ('\t Sample Scaling: %r') %
            # summary.append(prop_info)
            summary.append("\t Scaler: %r, %s"%(self._props['with_scaler'], type(self._scaler[feature])))

            if (deep):
                # summary.append("\t\t Scaler properties:")
                d = self._scaler[feature].get_params(deep=deep)
                for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))

            # prop_info = ('\t Dimensionality Reduction: %r') %
            # summary.append(prop_info)
            summary.append("\t Reducer: %r, %s"%(self._props['with_reducer'], type(self._reducer[feature])))

            if (deep):
                # summary.append("\t\t Reducer properties:")
                d = self._reducer[feature].get_params(deep=deep)
                for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))
        # return '\n'.join(summary)

        # prop_info = ('Classification: %r') %
        # summary.append(prop_info)
        summary.append("Classifier: %r, %s"%(self._props['with_classifier'], type(self._classifier)))
        #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE']
        #summary.append(prop_info)
        if (hasattr(self,'fitted')):
            prop_info = ('\t log likelihood of the training set: %f') % self._props['llh']
            summary.append(prop_info)
        
        if (deep):
            summary.append("\t Classifier properties:")
            d = self._classifier.get_params(deep=deep)
            for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))
        
        # Done
        return '\n'.join(summary)

    def preprocessing_this(self, da, dim=None, feature_name=str(), action='?'):
        """Pre-process data before anything

        Possible pre-processing steps:

        - interpolation,
        - scaling,
        - reduction

        Parameters
        ----------
        da: :class:`xarray.DataArray`
            The DataArray to process

        dim: str
            Name of the vertical dimension in the input :class:`xarray.DataArray`

        feature_name: str
            Target PCM feature name for the input :class:`xarray.DataArray`

        Returns
        -------
        X: np.array
            Pre-processed feature, with dimensions (N_SAMPLE, N_FEATURES)

        sampling_dims: list()
            List of the input :class:`xarray.DataArray` dimensions stacked as sampling points

        """
        this_context = str(action)+'.1-preprocess.2-feature_'+feature_name
        with self._context(this_context + '.total', self._context_args):

            # MAKE THE ND-ARRAY A 2D-ARRAY
            with self._context(this_context + '.1-ravel', self._context_args):
                X, z, sampling_dims = self.ravel(da, dim=dim, feature_name=feature_name)
                if self._debug:
                    print("\t", "X RAVELED with success", str(LogDataType(X)))

            # INTERPOLATION STEP:
            with self._context(this_context + '.2-interp', self._context_args):
                X = self._interpoler[feature_name].transform(X, z)
                if self._debug:
                    if isinstance(self._interpoler[feature_name], NoTransform):
                        print("\t", "X INTERPOLATED with success (NoTransform)", str(LogDataType(X)))
                    else:
                        print("\t", "X INTERPOLATED with success", str(LogDataType(X)))
                    # print(X.values.flags['WRITEABLE'])
                    # After the interpolation step, we must not have nan in the 2d array:
                    assert_all_finite(X, allow_nan=False)

            # FIT STEPS:
            # We need to fit pre-processing methods in order to re-use them when
            # predicting a new dataset

            # SCALING:
            with self._context(this_context+'.3-scale_fit', self._context_args):
                if not hasattr(self, 'fitted'):
                    self._scaler[feature_name].fit(X.data)
                    if 'units' in da.attrs:
                        self._scaler_props[feature_name]['units'] = da.attrs['units']

            with self._context(this_context + '.4-scale_transform', self._context_args):
                try:
                    X.data = self._scaler[feature_name].transform(X.data, copy=False)
                except ValueError:
                    if self._debug: print("\t\t Fail to scale.transform without copy, fall back on copy=True")
                    try:
                        X.data = self._scaler[feature_name].transform(X.data, copy=True)
                    except ValueError:
                        if self._debug: print("\t\t Fail to scale.transform with copy, fall back on input copy")
                        X.data = self._scaler[feature_name].transform(X.data.copy())
                        pass
                    except:
                        if self._debug: print(X.values.flags['WRITEABLE'])
                        raise
                    pass
                except:
                    raise

                if self._debug:
                    print("\t", "X SCALED with success)", str(LogDataType(X)))

            # REDUCTION:
            with self._context(this_context + '.5-reduce_fit', self._context_args):
                if (not hasattr(self, 'fitted')) and (self._props['with_reducer']):

                    if self.backend == 'dask_ml':
                        # We have to convert any type of data array into a Dask array because
                        # dask_ml cannot handle anything else (!)
                        #todo Raise an issue on dask_ml github to ask why is this choice made
                        # Related issues:
                        #   https://github.com/dask/dask-ml/issues/6
                        #   https://github.com/dask/dask-ml/issues/541
                        #   https://github.com/dask/dask-ml/issues/542
                        X.data = dask.array.asarray(X.data, chunks=X.shape)

                    if isinstance(X.data, dask.array.Array):
                        self._reducer[feature_name].fit(X.data)
                    else:
                        self._reducer[feature_name].fit(X)

            with self._context(this_context + '.6-reduce_transform', self._context_args):
                X = self._reducer[feature_name].transform(X.data) # Reduction, return np.array

                # After reduction the new array is [ sampling, reduced_dim ]
                X = xr.DataArray(X,
                                 dims=['sampling', 'n_features'],
                                 coords={'sampling': range(0, X.shape[0]),
                                         'n_features': np.arange(0,X.shape[1])})
                if self._debug:
                    print("\t", "X REDUCED with success)", str(LogDataType(X)))


        # Output:
        return X, sampling_dims

    def preprocessing(self, ds, features=None, dim=None, action='?', mask=None):
        """ Dataset pre-processing of feature(s)

        Depending on pyXpcm set-up, pre-processing steps can be:

        - interpolation,
        - scaling,
        - reduction

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        Returns
        -------
        X: np.array
            Pre-processed set of features, with dimensions (N_SAMPLE, N_FEATURES)

        sampling_dims: list()
            List of the input :class:`xarray.Dataset` dimensions stacked as sampling points

        """
        this_context = str(action)+'.1-preprocess'
        with self._context(this_context, self._context_args):
            if self._debug:
                print("> Start preprocessing for action '%s'" % action)

            # How do we find feature variable in this dataset ?
            features_dict = ds.pyxpcm.feature_dict(self, features=features)

            # Determine mask where all features are defined for this PCM:
            with self._context(this_context + '.1-mask', self._context_args):
                if not mask:
                    mask = ds.pyxpcm.mask(self, features=features, dim=dim)
                    # Stack all-features mask:
                    mask = mask.stack({'sampling': list(mask.dims)})
                self._xmask = mask

            # Pre-process all features and build the X array
            X = np.empty(())
            Xlabel = list() # Construct a list of string labels for each feature dimension (useful for plots)
            F = self.F # Nb of features

            for feature_in_pcm in features_dict:
                feature_in_ds = features_dict[feature_in_pcm]
                if self._debug:
                    print( ("\n\t> Preprocessing xarray dataset '%s' as PCM feature '%s'")\
                           %(feature_in_ds, feature_in_pcm) )

                if ('maxlevel' in self._context_args) and (self._context_args['maxlevel'] <= 2):
                    a = this_context + '.2-features'
                else:
                    a = this_context
                with self._context(a, self._context_args):
                    da = ds[feature_in_ds]
                    x, sampling_dims = self.preprocessing_this(da,
                                                               dim=dim,
                                                               feature_name=feature_in_pcm,
                                                               action=action)
                    xlabel = ["%s_%i"%(feature_in_pcm, i) for i in range(0, x.shape[1])]
                    if self._debug:
                        print("\t%s pre-processed with success, "  % feature_in_pcm, str(LogDataType(x)))

                with self._context(this_context + '.3-homogeniser', self._context_args):
                    # Store full array mean and std during fit:
                    if F>1:
                        # For more than 1 feature, we need to make them comparable,
                        # so we normalise each features by their global stats:
                        # FIT:
                        if (action == 'fit') or (action == 'fit_predict'):
                            self._homogeniser[feature_in_pcm]['mean'] = x.mean().values
                            self._homogeniser[feature_in_pcm]['std'] = x.std().values
                            #todo _homogeniser should be a proper standard scaler
                        # TRANSFORM:
                        x = (x-self._homogeniser[feature_in_pcm]['mean'])/\
                            self._homogeniser[feature_in_pcm]['std']
                        if self._debug and action == 'fit':
                            print(("\tHomogenisation for fit of %s") % (feature_in_pcm))
                        elif self._debug:
                            print(("\tHomogenisation of %s using fit data") % (feature_in_pcm))
                    elif self._debug:
                        print(("\tNo need for homogenisation of %s") % (feature_in_pcm))

                if np.prod(X.shape) == 1:
                    X = x
                    Xlabel = xlabel
                else:
                    X = np.append(X, x, axis=1)
                    [Xlabel.append(i) for i in xlabel]

            with self._context(this_context + '.4-xarray', self._context_args):
                self._xlabel = Xlabel
                if self._debug:
                    print("\tFeatures array shape and type for xarray:",
                          X.shape, type(X), type(X.data))
                X = xr.DataArray(X, dims=['n_samples', 'n_features'],
                                 coords={'n_samples': range(0, X.shape[0]), 'n_features': Xlabel})

            if self._debug:
                print("> Preprocessing done, working with final X (%s) array of shape:" % type(X), X.shape,
                      " and sampling dimensions:", sampling_dims)
        return X, sampling_dims

    def fit(self, ds, features=None, dim=None):
        """Estimate PCM parameters

        For a PCM, the fit method consists of the following operations:

        - pre-processing
            - interpolation to the ``feature_axis`` levels of the model
            - scaling
            - reduction
        - estimate classifier parameters

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        Returns
        -------
        self
        """
        with self._context('fit', self._context_args) :
            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='fit')

            # CLASSIFICATION-MODEL TRAINING:
            with self._context('fit.fit', self._context_args):
                self._classifier.fit(X)

            with self._context('fit.score', self._context_args):
                self._props['llh'] = self._classifier.score(X)

            # Furthermore gather some information about the fit:
            self._fit_stats['score'] = self._props['llh']
            self._fit_stats['datetime'] = datetime.utcnow()
            if 'n_samples_seen_' not in self._classifier.__dict__:
                self._fit_stats['n_samples_seen_'] = X.shape[0]
            else:
                self._fit_stats['n_samples_seen_'] = self._classifier.n_samples_seen_
            if 'n_iter_' in self._classifier.__dict__:
                self._fit_stats['n_iter_'] = self._classifier.n_iter_

        # Done:
        self.fitted = True
        return self

    def predict(self, ds, features=None, dim=None, inplace=False, name='PCM_LABELS'):
        """Predict labels for profile samples

        This method adds these properties to the PCM object:

        - ``llh``: The log likelihood of the model with regard to new data

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        inplace: boolean, False by default
            If False, return a :class:`xarray.DataArray` with predicted labels
            If True, return the input :class:`xarray.Dataset` with labels added as a new :class:`xarray.DataArray`

        name: str, default is 'PCM_LABELS'
            Name of the :class:`xarray.DataArray` with labels

        Returns
        -------
        :class:`xarray.DataArray`
            Component labels (if option 'inplace' = False)

        *or*

        :class:`xarray.Dataset`
            Input dataset with Component labels as a 'PCM_LABELS' new :class:`xarray.DataArray`
            (if option 'inplace' = True)
        """
        with self._context('predict', self._context_args):
            # Check if the PCM is trained:
            validation.check_is_fitted(self, 'fitted')

            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='predict')

            # CLASSIFICATION PREDICTION:
            with self._context('predict.predict', self._context_args):
                labels = self._classifier.predict(X)
            with self._context('predict.score', self._context_args):
                llh = self._classifier.score(X)

            # Create a xarray with labels output:
            with self._context('predict.xarray', self._context_args):
                da = self.unravel(ds, sampling_dims, labels).rename(name)
                da.attrs['long_name'] = 'PCM labels'
                da.attrs['units'] = ''
                da.attrs['valid_min'] = 0
                da.attrs['valid_max'] = self._props['K']-1
                da.attrs['llh'] = llh

            # Add labels to the dataset:
            if inplace:
                return ds.pyxpcm.add(da)
            else:
                return da

    def fit_predict(self, ds, features=None, dim=None, inplace=False, name='PCM_LABELS'):
        """Estimate PCM parameters and predict classes.

        This method adds these properties to the PCM object:

        - ``llh``: The log likelihood of the model with regard to new data

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        inplace: boolean, False by default
            If False, return a :class:`xarray.DataArray` with predicted labels
            If True, return the input :class:`xarray.Dataset` with labels added as a new :class:`xarray.DataArray`

        name: string ('PCM_LABELS')
            Name of the DataArray holding labels.

        Returns
        -------
        :class:`xarray.DataArray`
            Component labels (if option 'inplace' = False)

        *or*

        :class:`xarray.Dataset`
            Input dataset with component labels as a 'PCM_LABELS' new :class:`xarray.DataArray` (if option 'inplace' = True)

        """
        with self._context('fit_predict', self._context_args):

            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='fit_predict')

            # CLASSIFICATION-MODEL TRAINING:
            with self._context('fit_predict.fit', self._context_args):
                self._classifier.fit(X)
            with self._context('fit_predict.score', self._context_args):
                self._props['llh'] = self._classifier.score(X)

            # Furthermore gather some information about this fit:
            self._fit_stats['score'] = self._props['llh']
            if 'n_samples_seen_' not in self._classifier.__dict__:
                self._fit_stats['n_samples_seen_'] = X.shape[0]
            else:
                self._fit_stats['n_samples_seen_'] = self._classifier.n_samples_seen_
            if 'n_iter_' in self._classifier.__dict__:
                self._fit_stats['n_iter_'] = self._classifier.n_iter_

            # Done:
            self.fitted = True

            # CLASSIFICATION PREDICTION:
            with self._context('fit_predict.predict', self._context_args):
                labels = self._classifier.predict(X)

            # Create a xarray with labels output:
            with self._context('fit_predict.xarray', self._context_args):
                da = self.unravel(ds, sampling_dims, labels).rename(name)
                da.attrs['long_name'] = 'PCM labels'
                da.attrs['units'] = ''
                da.attrs['valid_min'] = 0
                da.attrs['valid_max'] = self._props['K']-1
                da.attrs['llh'] = self._props['llh']

            # Add labels to the dataset:
            if inplace:
                return ds.pyxpcm.add(da)
            else:
                return da

    def predict_proba(self, ds, features=None, dim=None, inplace=False, name='PCM_POST', classdimname='pcm_class'):
        """Predict posterior probability of each components given the data

        This method adds these properties to the PCM instance:

        - ``llh``: The log likelihood of the model with regard to new data

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        inplace: boolean, False by default
            If False, return a :class:`xarray.DataArray` with predicted probabilities
            If True, return the input :class:`xarray.Dataset` with probabilities added as a new :class:`xarray.DataArray`

        name: str, default is 'PCM_POST'
            Name of the DataArray with prediction probability (posteriors)

        classdimname: str, default is 'pcm_class'
            Name of the dimension holding classes

        Returns
        -------
        :class:`xarray.DataArray`
            Probability of each Gaussian (state) in the model given each
            sample (if option 'inplace' = False)

        *or*

        :class:`xarray.Dataset`
            Input dataset with Component Probability as a 'PCM_POST' new :class:`xarray.DataArray`
            (if option 'inplace' = True)


        """
        with self._context('predict_proba', self._context_args):

            # Check if the PCM is trained:
            validation.check_is_fitted(self, 'fitted')

            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='predict_proba')

            # CLASSIFICATION PREDICTION:
            with self._context('predict_proba.predict', self._context_args):
                post_values = self._classifier.predict_proba(X)
            with self._context('predict_proba.score', self._context_args):
                self._props['llh'] = self._classifier.score(X)

            # Create a xarray with posteriors:
            with self._context('predict_proba.xarray', self._context_args):
                P = list()
                for k in range(self.K):
                    X = post_values[:, k]
                    x = self.unravel(ds, sampling_dims, X)
                    P.append(x)
                da = xr.concat(P, dim=classdimname).rename(name)
                da.attrs['long_name'] = 'PCM posteriors'
                da.attrs['units'] = ''
                da.attrs['valid_min'] = 0
                da.attrs['valid_max'] = 1
                da.attrs['llh'] = self._props['llh']

            # Add posteriors to the dataset:
            if inplace:
                return ds.pyxpcm.add(da)
            else:
                return da

    def score(self, ds, features=None, dim=None):
        """Compute the per-sample average log-likelihood of the given data

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        Returns
        -------
        log_likelihood: float
            In the case of a GMM classifier, this is the Log likelihood of the Gaussian mixture given data

        """
        with self._context('score', self._context_args):

            # Check if the PCM is trained:
            validation.check_is_fitted(self, 'fitted')

            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='score')

            # COMPUTE THE PREDICTION SCORE:
            with self._context('score.score', self._context_args):
                llh = self._classifier.score(X)

        return llh

    def bic(self, ds, features=None, dim=None):
        """Compute Bayesian information criterion for the current model on the input dataset

        Only for a GMM classifier

        Parameters
        ----------
        ds: :class:`xarray.Dataset`
            The dataset to work with

        features: dict()
            Definitions of PCM features in the input :class:`xarray.Dataset`.
            If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'.

        dim: str
            Name of the vertical dimension in the input :class:`xarray.Dataset`

        Returns
        -------
        bic: float
            The lower the better
        """
        with self._context('bic', self._context_args):

            # Check classifier:
            if self._props['with_classifier'] != 'gmm':
                raise Exception( ("BIC is only available for the 'gmm' classifier ('%s')")%\
                                 (self._props['with_classifier']) )

            def _n_parameters(_classifier):
                """Return the number of free parameters in the model. See sklearn code"""
                _, n_features = _classifier.means_.shape
                if _classifier.covariance_type == 'full':
                    cov_params = _classifier.n_components * n_features * (n_features + 1) / 2.
                elif _classifier.covariance_type == 'diag':
                    cov_params = _classifier.n_components * n_features
                elif _classifier.covariance_type == 'tied':
                    cov_params = n_features * (n_features + 1) / 2.
                elif _classifier.covariance_type == 'spherical':
                    cov_params = _classifier.n_components
                mean_params = n_features * _classifier.n_components
                return int(cov_params + mean_params + _classifier.n_components - 1)

            # Check if the PCM is trained:
            validation.check_is_fitted(self, 'fitted')

            # PRE-PROCESSING:
            X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='bic')

            # COMPUTE THE log-likelihood:
            with self._context('bic.score', self._context_args):
                llh = self._classifier.score(X)

            # COMPUTE BIC:
            N_samples = X.shape[0]
            bic = (-2 * llh * N_samples + _n_parameters(self._classifier) * np.log(N_samples))

        return bic
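
As a rough end-to-end usage sketch of this class (the dataset, the variable name 'TEMP' and the
dimension name 'depth' are hypothetical; see the pyXpcm documentation for the real API):

import numpy as np
import xarray as xr

ds = xr.open_dataset("profiles.nc")  # assumed to contain a 'TEMP' variable on a 'depth' axis

# Define the model: K classes, one feature interpolated onto a fixed vertical axis.
z = np.arange(-1000.0, 0.0, 10.0)
m = pcm(K=8, features={'temperature': z})

# Fit on the dataset, mapping the PCM feature name to the dataset variable name,
# then attach predicted labels to the dataset in place.
m.fit(ds, features={'temperature': 'TEMP'}, dim='depth')
ds = m.predict(ds, features={'temperature': 'TEMP'}, dim='depth', inplace=True)
print(ds['PCM_LABELS'])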
def main():

    path = 'X_new.txt'
    X = np.loadtxt(path)
    N = X.shape[0]
    gmm = GaussianMixture(n_components=3,
                          covariance_type='spherical',
                          tol=0.001,
                          reg_covar=1e-06,
                          max_iter=200,
                          n_init=10,
                          init_params='random')

    gmm.fit(X)
    print(gmm.get_params())
    print(gmm.means_)
    print(gmm.covariances_)

    labels = gmm.predict(X)
    mu = gmm.means_

    sorted_gs = np.argsort(gmm.means_[:, 0])
    sorted_colors = ['b', 'g', 'r']
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])

    plt.scatter(X[:, 0], X[:, 1], c=c)
    plt.scatter(mu[:, 0], mu[:, 1], c='k')
    plt.show()

    sorted_gs = np.argsort(gmm.means_[:, 2])
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])

    plt.scatter(X[:, 2], X[:, 3], c=c)
    plt.scatter(mu[:, 2], mu[:, 3], c='k')
    plt.show()

    sorted_gs = np.argsort(gmm.means_[:, 3])
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])

    plt.scatter(X[:, 3], X[:, 4], c=c)
    plt.scatter(mu[:, 3], mu[:, 4], c='k')
    # plt.clf()
    plt.show()
    # pdb.set_trace()

    # mus = np.random.rand(3)
    stds = np.sqrt(gmm.covariances_.copy())
    pis = gmm.weights_.copy()
    exps = np.zeros([N, 3])

    iter_max = 150

    ##EM algorithm without library functions##
    ##initialize means##
    init_inds = np.random.randint(N, size=3)
    mus = X[init_inds]
    print("init mus:", mus)

    covs = [np.identity(5) * (stds[i]**2) for i in range(3)]
    pis = np.random.rand(3)
    pis = pis / sum(pis)
    new_mus = np.zeros_like(mus)
    tol = 0.001

    for i in range(iter_max):
        n = [multivariate_normal(mus[j], covs[j]) for j in range(3)]

        for j in range(N):
            for k in range(3):

                exps[j, k] = pis[k] * (n[k].pdf(X[j]) + 1e-15)
            # print (exps[j,:])
            exps[j, :] = exps[j, :] / np.sum(exps[j, :])

        ##MAXIMIZE!!!##
        for k in range(3):
            pis[k] = np.sum(exps[:, k]) / N
            new_mus[k] = np.sum(
                exps[:, k].reshape(-1, 1) * X, axis=0) / np.sum(
                    exps[:, k])  #exps = Nxk , X = nxd , mus = d

        if (np.linalg.norm(new_mus - mus) < tol):
            print("Breaking at iter ", i)
            break
        else:
            mus = new_mus.copy()

    print("post optimization:")
    print(mus)
    print(covs)
    print(pis)
    gmm.means_ = np.array(mus)

    labels = gmm.predict(X)
    # mu = gmm.means_

    sorted_gs = np.argsort(gmm.means_[:, 0])
    sorted_colors = ['b', 'g', 'r']
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])

    plt.scatter(X[:, 0], X[:, 1], c=c)
    plt.scatter(mus[:, 0], mus[:, 1], c='k')
    plt.show()
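
The hand-rolled EM loop in main() evaluates the responsibilities with a double Python loop and, in its M-step, only updates the means and mixing weights (the covariances keep their initial values). A vectorized E-step with the same behaviour could look like the following sketch (names are illustrative):

# Sketch of a vectorized E-step equivalent to the double loop in main() above.
# Names are illustrative; covariances are still treated as fixed, matching the
# simplified M-step of the original loop.
import numpy as np
from scipy.stats import multivariate_normal

def e_step(X, mus, covs, pis):
    N, K = X.shape[0], len(pis)
    resp = np.empty((N, K))
    for k in range(K):
        # pdf() accepts the whole (N, d) array at once, removing the per-sample loop
        resp[:, k] = pis[k] * (multivariate_normal(mus[k], covs[k]).pdf(X) + 1e-15)
    return resp / resp.sum(axis=1, keepdims=True)
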
Example 9
class PCM(BaseEstimator, ClassifierMixin):
    """Profile Classification Model

        Parameters
        ----------

        Methods
        -------

        Examples
        --------

    """
    def __init__(self,
                 n_components=1,
                 axis=9999,
                 scaling=1,
                 reduction=1,
                 classifier='gmm',
                 COVARTYPE='full',
                 maxvar=99.9,
                 verb=False):
        """Create the PCM instance
        """
        if scaling == 0:
            with_scaler = 'none'
            with_mean = False
            with_std = False
        elif scaling == 1:
            with_scaler = 'normal'
            with_mean = True
            with_std = True
        elif scaling == 2:
            with_scaler = 'center'
            with_mean = True
            with_std = False
        else:
            raise NameError('scaling must be 0, 1 or 2')

        if reduction == 0: with_reducer = False
        elif reduction == 1: with_reducer = True
        else: raise NameError('reduction must be 0 or 1')

        if classifier == 'gmm': with_classifier = 'gmm'
        else:
            raise NameError(
                "classifier must be 'gmm' (no other methods at this time)")

        self._props = {
            'K': int(n_components),
            'llh': None,
            'COVARTYPE': COVARTYPE,
            'with_scaler': with_scaler,
            'with_reducer': with_reducer,
            'with_classifier': with_classifier,
            'maxvar': maxvar,
            'DPTmodel': np.float32(axis)
        }
        self._trained = False  #todo _trained is a property, should be set/get with a decorator
        self._verb = verb  #todo _verb is a property, should be set/get with a decorator
        self._version = '0.4'

    # def __call__(self, **kwargs):
    #     self.__init__(**kwargs)

    def get_params(self, deep=True):
        # suppose this estimator has parameters "alpha" and "recursive"
        return self._props

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self._props[parameter] = value
        return self

    def set_config(self, **kargs):
        """Set-up all processing steps according to PCM properties"""

        self.set_params(**kargs)

        self._interpoler = self.__Interp(self._props['DPTmodel'])

        # Recover the scaler options implied by the 'with_scaler' property set at init:
        with_mean = self._props['with_scaler'] in ('normal', 'center')
        with_std = self._props['with_scaler'] == 'normal'
        self._scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                                    with_std=with_std)
        self._reducer = PCA(n_components=self._props['maxvar'] / 100,
                            svd_solver='full')
        self._classifier = GaussianMixture(
            n_components=self._props['K'],
            covariance_type=self._props['COVARTYPE'],
            init_params='kmeans',
            max_iter=1000,
            tol=1e-6)
        return self

    class __Interp:
        """ Internal machinery for the interpolation of vertical profiles
            
            This class is called once at PCM instance initialisation and
            whenever data to be classified are not on the PCM vertical axis.
            
        """
        def __init__(self, DPTmodel):
            self.zi = DPTmodel
            self.doINTERPz = False

        def isnecessary(self, C, z):
            """Check wether or not the input data vertical axis is different
                from the PCM one, if not, avoid interpolation
            """
            #todo We should be smarter and recognize occurences of z in DPTmodel
            # or viceversa in order to limit interpolation as much as possible !
            z = np.float32(z)
            #self.doINTERPz = not np.array_equal(self.zi,z)
            self.doINTERPz = not np.array_equiv(self.zi, z)
            return self.doINTERPz

        def mix(self, x):
            """ 
                Homogenize the upper water column:
                set the first (NaN) value to the first non-NaN value
            """
            #izmixed = np.argwhere(np.isnan(x))
            izok = np.where(~np.isnan(x))[0][0]
            #x[izmixed] = x[izok]
            x[0] = x[izok]
            return x

        def fit_transform(self, C, z):
            """
                Interpolate data on the PCM vertical axis
            """
            if (self.isnecessary(C, z)):
                [Np, Nz] = C.shape
                # Possibly Create a mixed layer for the interpolation to work
                # smoothly at the surface
                if ((z[0] < 0.) & (self.zi[0] == 0.)):
                    z = np.concatenate((np.zeros(1), z))
                    x = np.empty((Np, 1))
                    x.fill(np.nan)
                    C = np.concatenate((x, C), axis=1)
                    np.apply_along_axis(self.mix, 1, C)
                # Linear interpolation of profiles onto the model grid:
                #f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic')
                f = interpolate.interp2d(z, np.arange(Np), C, kind='linear')
                C = f(self.zi, np.arange(Np))
            return C

    def display(self, deep=False):
        """Display detailled parameters of the PCM
            This is not a get_params because it doesn't return a dictionnary
            Set Boolean option 'deep' to True for all properties display
        """
        summary = [("<pcm '%s' (K: %i, Z: %i)>") %
                   (self._props['with_classifier'], self._props['K'],
                    self._props['DPTmodel'].size)]

        # PCM core properties:
        prop_info = ('Number of class: %i') % self._props['K']
        summary.append(prop_info)

        # prop_info = ('Vertical axis: %s') % self._props['DPTmodel']
        prop_info = ('Vertical axis: [%s, ..., %s]') % (repr(
            self._props['DPTmodel'][0]), repr(self._props['DPTmodel'][-1]))
        summary.append(prop_info)

        prop_info = ('Trained: %r') % self._trained
        summary.append(prop_info)

        # PCM workflow parameters:
        prop_info = ('Vertical Interpolation: %r') % self._interpoler.doINTERPz
        summary.append(prop_info)
        summary.append("\t Interpoler: %s" % (type(self._interpoler)))

        prop_info = ('Sample Scaling: %r') % self._props['with_scaler']
        summary.append(prop_info)
        summary.append("\t Scaler: %s" % (type(self._scaler)))

        if (deep):
            summary.append("\t Scaler properties:")
            d = self._scaler.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        prop_info = (
            'Dimensionality Reduction: %r') % self._props['with_reducer']
        summary.append(prop_info)
        summary.append("\t Reducer: %s" % (type(self._reducer)))
        #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar']
        #summary.append(prop_info)

        if (deep):
            summary.append("\t Reducer properties:")
            d = self._reducer.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        prop_info = ('Classification: %r') % self._props['with_classifier']
        summary.append(prop_info)
        summary.append("\t Classifier: %s" % (type(self._classifier)))
        #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE']
        #summary.append(prop_info)
        if (self._trained):
            prop_info = ('\t log likelihood: %f') % self._props['llh']
            summary.append(prop_info)

        if (deep):
            summary.append("\t Classifier properties:")
            d = self._classifier.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        # Done
        return '\n'.join(summary)

    # def __repr__(self):
    #     return self.display(deep=self._verb)

    def copy(self):
        """Return a deep copy of the PCM instance"""
        return copy.deepcopy(self)

    def preprocessing(self, X, Z):
        """"Pre-process data for classification

            Preprocessing steps:
                interpolation,
                scaling,
                reduction.

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            X : array-like, shape (N_p=n_samples, n_reduced_scaled_interpolated_features)
                List of profiles pre-processed for classification
        """

        # INTERPOLATION:
        X = self._interpoler.fit_transform(X, Z)

        # SCALING:
        self._scaler.fit(X)
        X = self._scaler.transform(X)

        # REDUCTION:
        if self._props['with_reducer']:
            self._reducer.fit(X)
            X = self._reducer.transform(X)

        # Output:
        return X

    def fit(self, X, y=None, axis=None, **kargs):
        """Estimate PCM parameters
           
            For a PCM, the fit method consists of the following operations:
                - interpolation to the Depth levels of the model
                - scaling
                - reduction
                - estimate classifier parameters

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            axis: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            self
        """
        self.set_config(**kargs)

        # PRE-PROCESSING:
        X = self.preprocessing(X, axis)

        # CLASSIFICATION-MODEL TRAINING:
        self._classifier.fit(X)
        self._props['llh'] = self._classifier.score(X)

        # Done:
        self._trained = True
        return self

    def score(self, X, y=None, axis=None):
        """Compute the per-sample average log-likelihood of the given data X
        """
        if (not self._trained):
            raise ValueError("Can't predict before fitting !")

        # PRE-PROCESSING:
        X = self.preprocessing(X, axis)

        return self._classifier.score(X)

    def predict(self, X, y=None, axis=None):
        """Predict the labels for the profile samples in X using trained PCM

           This method adds this property to the PCM instance:
              llh: The log likelihood of the model with regard to new data

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.

            Returns
            -------
            labels : xarray.DataArray, shape (N_p = n_samples)
                Component labels.
        """
        # if not self._trained:
        #     raise ValueError("Can't predict before fitting !")
        validation.check_is_fitted(
            self,
            '_trained',
            msg=
            "This %(name)s instance is not fitted yet. Call ‘fit’ with appropriate arguments before using this method."
        )

        # PRE-PROCESSING:
        X = self.preprocessing(X, axis)

        # CLASSIFICATION PREDICTION:
        labels = self._classifier.predict(X)
        self._props['llh'] = self._classifier.score(X)

        # Prepare xarray for output:
        labels = xr.DataArray(labels, dims='samples', name='LABELS')
        labels.attrs['llh'] = self._props['llh']

        # done:
        return labels
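
A minimal usage sketch for the PCM estimator defined above; the synthetic profiles, the vertical axis values and the number of classes are illustrative assumptions:

# Illustrative usage of the PCM estimator above on synthetic profiles
# (axis values and the number of classes are assumptions, not from the source).
import numpy as np

Z = np.arange(0., -500., -10.)        # vertical axis shared by data and model
X = np.random.randn(300, Z.size)      # 300 profiles, one profile per row

m = PCM(n_components=3, axis=Z, scaling=1, reduction=1)
m.fit(X, axis=Z)                      # interpolate -> scale -> reduce -> fit the GMM
labels = m.predict(X, axis=Z)         # xarray.DataArray of component labels
print(m.score(X, axis=Z), labels.values[:10])
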
Example 10
        predict.append(1)
correct = 0
for i in range(len(predict)):
    if (i >= 0 and i <= len(left) - 1):
        if (predict[i] == 0):
            correct += 1
    else:
        if (predict[i] == 1):
            correct += 1
print("SImiliarity Score: ", metrics.adjusted_rand_score(predict, true))
#print('Correct Prediction Rate: ', correct/2000)
b = np.array(predict)
np.savetxt('predict.txt', predict, delimiter=',', fmt='%1d')
a = np.array(mixer)
np.savetxt('iter.txt', a, delimiter=',', fmt='%1.8f')
loglik.pop(0)
print('Number of iterations ', j + 1)
#plt.subplot(2,2,2)
plt.plot(loglik)
plt.xlabel('iteration')
plt.ylabel('log likelihood')
plt.show()
sample = data.reshape(len(data), 1)
model = GaussianMixture(2, init_params='random')
model.fit(sample)
l = model.predict(sample)
dd = model.get_params()
print(model.lower_bound_)
data = data.reshape(-1, 1)
s = model.score(data)
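
After fitting, model.lower_bound_ holds the per-sample lower bound on the log-likelihood reached at convergence, while model.score(data) recomputes the average log-likelihood on the data passed to it; for a converged fit evaluated on the training data the two values are typically close. A small sanity-check sketch (the synthetic data are an assumption):

# Sanity-check sketch: compare the converged lower bound with the average
# log-likelihood on the training data (the synthetic data are illustrative).
import numpy as np
from sklearn.mixture import GaussianMixture

demo = np.concatenate([np.random.normal(0, 1, 500), np.random.normal(5, 1, 500)])
demo = demo.reshape(-1, 1)

m = GaussianMixture(2, init_params='random', random_state=0).fit(demo)
print(m.lower_bound_, m.score(demo))  # expected to be close after convergence
print(m.n_iter_, m.converged_)
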
Example 11
class PCM:
    """
        Common base class for a Profile Classification Model
    """
    def __init__(self, K, DPTmodel, scaling=1, reduction=1, classif='gmm', COVARTYPE='full', maxvar=99.9, verb=False):
        """Create the PCM instance
        """
        if   scaling==0: with_scaler = 'none'; with_mean=False; with_std = False
        elif scaling==1: with_scaler = 'normal';  with_mean=True; with_std = True
        elif scaling==2: with_scaler = 'center';  with_mean=True; with_std = False
        else: raise NameError('scaling must be 0, 1 or 2')
        
        if   reduction==0: with_reducer = False
        elif reduction==1: with_reducer = True
        else: raise NameError('reduction must be 0 or 1')
        
        if classif=='gmm': with_classifier = 'gmm';
        else: raise NameError("classifier must be 'gmm' (no other methods at this time)")
        
        self._props = {'K': int(K),
                        'llh': None,
                        'COVARTYPE': COVARTYPE,
                        'with_scaler': with_scaler,
                        'with_reducer': with_reducer,
                        'with_classifier': with_classifier,
                        'maxvar': maxvar,
                        'DPTmodel': np.float32(DPTmodel)}
        self._trained = False #todo _trained is a property, should be set/get with a decorator
        self._verb = verb #todo _verb is a property, should be set/get with a decorator
        
        self._interpoler = self.__Interp(self._props['DPTmodel'])
        
        self._scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                                    with_std=with_std)
        self._reducer = PCA(n_components=self._props['maxvar']/100,
                           svd_solver='full')
        self._classifier = GaussianMixture(n_components=self._props['K'],
                                          covariance_type=self._props['COVARTYPE'],
                                          init_params='kmeans',
                                          max_iter=1000,
                                          tol=1e-6)
        self._version = '0.3'

    def __call__(self, **kwargs):
        self.__init__(**kwargs)
    
    def __iter__(self):
        self.__i = 0
        return self
    
    def __next__(self):  # Python 3 iterator protocol
        if self.__i < self.K:
            i = self.__i
            self.__i += 1
            return i
        else:
            raise StopIteration()

    class __Interp:
        """ Internal machinery for the interpolation of vertical profiles
            
            This class is called once at PCM instance initialisation and
            whenever data to be classified are not on the PCM vertical axis.
            
        """
        def __init__(self,DPTmodel):
            self.zi = DPTmodel
            self.doINTERPz = False
        
        def isnecessary(self,C,z):
            """Check wether or not the input data vertical axis is different
                from the PCM one, if not, avoid interpolation
            """
            #todo We should be smarter and recognize occurences of z in DPTmodel
            # or viceversa in order to limit interpolation as much as possible !
            z = np.float32(z)
            #self.doINTERPz = not np.array_equal(self.zi,z)
            self.doINTERPz = not np.array_equiv(self.zi,z)
            return self.doINTERPz
        
        def mix(self,x):
            """ 
                Homogenize the upper water column:
                set the first (NaN) value to the first non-NaN value
            """
            #izmixed = np.argwhere(np.isnan(x))
            izok = np.where(~np.isnan(x))[0][0]
            #x[izmixed] = x[izok]
            x[0] = x[izok]
            return x
        
        def fit_transform(self,C,z):
            """
                Interpolate data on the PCM vertical axis
            """
            if (self.isnecessary(C,z)):
                [Np, Nz] = C.shape            
                # Possibly Create a mixed layer for the interpolation to work 
                # smoothly at the surface
                if ((z[0]<0.) & (self.zi[0] == 0.)):
                    z = np.concatenate((np.zeros(1),z))
                    x = np.empty((Np,1))
                    x.fill(np.nan)
                    C = np.concatenate((x,C),axis=1)
                    np.apply_along_axis(self.mix,1,C)
                # Linear interpolation of profiles onto the model grid:
                #f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic')
                f = interpolate.interp2d(z, np.arange(Np), C, kind='linear')
                C = f(self.zi, np.arange(Np))
            return C

    @property
    def K(self):
        """Return the number of class K in the PCM"""
        return self._props['K']

    def display(self, deep=False):
        """Display detailled parameters of the PCM
            This is not a get_params because it doesn't return a dictionnary
            Set Boolean option 'deep' to True for all properties display
        """
        summary = [("<pcm '%s' (K: %i, Z: %i)>")%(self._props['with_classifier'],self._props['K'],self._props['DPTmodel'].size)]
        
        # PCM core properties:
        prop_info = ('Number of class: %i') % self._props['K']
        summary.append(prop_info)
        
        # prop_info = ('Vertical axis: %s') % self._props['DPTmodel']
        prop_info = ('Vertical axis: [%s, ..., %s]') % (repr(self._props['DPTmodel'][0]),repr(self._props['DPTmodel'][-1]))
        summary.append(prop_info)
        
        prop_info = ('Trained: %r') % self._trained
        summary.append(prop_info)
        
        # PCM workflow parameters:
        prop_info = ('Vertical Interpolation: %r') % self._interpoler.doINTERPz
        summary.append(prop_info)    
        summary.append("\t Interpoler: %s"%(type(self._interpoler)))
        
        prop_info = ('Sample Scaling: %r') % self._props['with_scaler']
        summary.append(prop_info)
        summary.append("\t Scaler: %s"%(type(self._scaler)))
        
        if (deep):
            summary.append("\t Scaler properties:")
            d = self._scaler.get_params(deep=deep)
            for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))
        
        prop_info = ('Dimensionality Reduction: %r') % self._props['with_reducer']
        summary.append(prop_info)       
        summary.append("\t Reducer: %s"%(type(self._reducer)))
        #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar']
        #summary.append(prop_info) 
        
        if (deep):
            summary.append("\t Reducer properties:")
            d = self._reducer.get_params(deep=deep)
            for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))
        
        prop_info = ('Classification: %r') % self._props['with_classifier']
        summary.append(prop_info) 
        summary.append("\t Classifier: %s"%(type(self._classifier)))
        #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE']
        #summary.append(prop_info)
        if (self._trained):
            prop_info = ('\t log likelihood: %f') % self._props['llh']
            summary.append(prop_info)
        
        if (deep):
            summary.append("\t Classifier properties:")
            d = self._classifier.get_params(deep=deep)
            for p in d: summary.append(("\t\t %s: %r")%(p,d[p]))
        
        # Done
        return '\n'.join(summary)
    
    def __repr__(self):
        return self.display(deep=self._verb)
    
    def copy(self):
        """Return a deep copy of the PCM instance"""
        return copy.deepcopy(self)

    def preprocessing(self, X, Z):
        """"Pre-process data for classification

            Preprocessing steps:
                interpolation,
                scaling,
                reduction.

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            X : array-like, shape (N_p=n_samples, n_reduced_scaled_interpolated_features)
                List of profiles pre-processed for classification
        """

        # INTERPOLATION:
        X = self._interpoler.fit_transform(X, Z)

        # SCALING:
        self._scaler.fit(X)
        X = self._scaler.transform(X)

        # REDUCTION:
        if self._props['with_reducer']:
            self._reducer.fit(X)
            X = self._reducer.transform(X)

        # Output:
        return X

    def fit(self, X, Z):
        """Estimate PCM parameters
           
            For a PCM, the fit method consists of the following operations:
                - interpolation to the Depth levels of the model
                - scaling
                - reduction
                - estimate classifier parameters

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            self
        """
        # PRE-PROCESSING:
        X = self.preprocessing(X, Z)
        
        # CLASSIFICATION-MODEL TRAINING:
        self._classifier.fit(X)
        self._props['llh'] = self._classifier.score(X)
        
        # Done:
        self._trained = True
        return self
    
    def predict(self, X, Z):
        """Predict the labels for the profile samples in X using trained PCM

           This method adds this property to the PCM instance:
              llh: The log likelihood of the model with regard to new data

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            labels : xarray.DataArray, shape (N_p = n_samples)
                Component labels.
        """
        # if not self._trained:
        #     raise ValueError("Can't predict before fitting !")
        validation.check_is_fitted(self, '_trained',msg="This %(name)s instance is not fitted yet. Call ‘fit’ with appropriate arguments before using this method.")

        # PRE-PROCESSING:
        X = self.preprocessing(X, Z)

        # CLASSIFICATION PREDICTION:
        labels = self._classifier.predict(X)
        self._props['llh'] = self._classifier.score(X)

        # Prepare xarray for output:
        labels = xr.DataArray(labels, dims='samples', name='LABELS')
        labels.attrs['llh'] = self._props['llh']

        # done:
        return labels
    
    def fit_predict(self, X, Z):
        """Estimate PCM parameters and predict classes
           
            Train a PCM and predict classes in a single step

           This method adds this property to the PCM instance:
              llh: The log likelihood of the model with regard to new data

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            labels : xarray.DataArray, shape (N_p = n_samples)
                Component labels.
        """
        # PRE-PROCESSING:
        X = self.preprocessing(X, Z)
        
        # CLASSIFICATION-MODEL TRAINING:
        self._classifier.fit(X)
        
        # CLASSIFICATION PREDICTION:
        labels = self._classifier.predict(X)
        self._props['llh'] = self._classifier.score(X)

        # Prepare xarray for output:
        labels = xr.DataArray(labels, dims='samples', name='LABELS')
        labels.attrs['llh'] = self._props['llh']

        # Done:
        self._trained = True
        return labels
    
    def predict_proba(self, X, Z):
        """Predict posterior probability of each component given the data
           
           This method adds these properties to the PCM instance:
               llh: The log likelihood of the model with regard to new data

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles

            Returns
            -------
            post : array, shape (n_samples, n_components)
                Returns the probability of each Gaussian (state) in
                the model given each sample.
        """
        if (not self._trained):
            raise ValueError("Can't predict before fitting !")
        
        # PRE-PROCESSING:
        X = self.preprocessing(X, Z)
        
        # CLASSIFICATION PREDICTION:
        post = self._classifier.predict_proba(X)
        self._props['llh'] = self._classifier.score(X)

        # Prepare xarray for output:
        post = xr.DataArray(post, dims=['samples', 'components'], name='POST')
        post.attrs['llh'] = self._props['llh']

        # done:
        return post

    def quant(self, X, Z=None, labels=None, q=[0.05, 0.5, 0.95], verb=False):
        """Compute the qth quantiles of the data for each PCM component.

            Usage A:
                pcm.quant(X, labels=L, q=[0.05,0.5,0.95])
                    This usage will use labels L to compute component percentiles of X
            Usage B:
                pcm.quant(X, Z=DEPTH, q=[0.05,0.5,0.95])
                    This usage will classify data X at depth Z and then compute percentiles.
                    Be careful, if you re-fit a model, you may not end up with something coherent
                    from previous calculation of labels and posteriors, as components will show
                    up in different orders

            Parameters
            ----------
            X : array-like, shape (N_p=n_samples, N_z=n_features)
                List of N_z-dimensional data profile. Each row
                corresponds to a single profile.
            Z: array-like, shape (N_z=n_features,)
                Vertical axis of profiles
            labels: array, shape (N_p=n_samples,)
                Component labels.
            q: float in the range of [0,1] (or sequence of floats), shape (n_quantiles,1)
                Quantile(s) to compute, which must be between 0 and 1 inclusive.

            Returns
            -------
            Q : xarray.DataArray, shape (K, n_quantiles, N_z=n_features)

        """
        if labels is None:
            labels = self.fit_predict(X,Z)
        elif Z is None:
            if not self._trained:
                raise ValueError("Can't compute quantiles without a fitted model !")

        #
        if (not isinstance(X,xr.core.dataarray.DataArray)):
            XR = xr.DataArray(X, dims=['samples', 'features'])
        else:
            XR = xr.DataArray(X.values, dims=['samples', 'features'])

        if (not isinstance(labels,xr.core.dataarray.DataArray)):
            LR = xr.DataArray(labels, dims=['samples'])
        else:
            LR = xr.DataArray(labels.values, dims=['samples'])

        DS = xr.Dataset({'DATA': XR, 'LABELS': LR})
        varname = 'DATA'
        Q = [] # list of results
        for label, group in DS.groupby('LABELS'):
            if verb:
                print ("Using %0d profiles of %s in class %i") % (group['samples'].shape[0], varname, label)
            quant = group[varname].quantile(q, dim='samples')
            Q.append(quant)
        Q = xr.concat(Q, dim='components') # Transform the list into a DataArray
        Q.name = 'QUANTILES'

        # Done:
        return Q
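
A minimal sketch of the two quant() usages documented in its docstring, assuming a PCM instance m built from the class above and profile data X with vertical axis Z (all three names are assumptions):

# Illustrative calls to PCM.quant() following "Usage A" and "Usage B" from the
# docstring above; m, X and Z are assumed to exist and are not from the source.

# Usage A: reuse labels from a previous prediction
labels = m.fit_predict(X, Z)
QA = m.quant(X, labels=labels, q=[0.05, 0.5, 0.95])

# Usage B: classify on the fly, then compute per-class quantiles
QB = m.quant(X, Z=Z, q=[0.05, 0.5, 0.95])
print(QA.name, QA.shape)  # 'QUANTILES', one (quantile, feature) slice per class
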
fig, axes = plt.subplots()

plt.scatter(Xnew[:, 0], Xnew[:, 1], c=labels, s=1, cmap='viridis')
plt.scatter(ra, dec, s=36, color='red', marker='^')
plt.xlabel('RA')
plt.ylabel('DEC')

gmm_means = []
gmm_cov = []

for i in range(0, len(gmm.means_)):

    print(i, gmm.covariances_[i][0][0])

    gmm_means.append([gmm.means_[i][0], gmm.means_[i][1]])
    gmm_cov.append([[gmm.covariances_[i][0][0], gmm.covariances_[i][0][1]],
                    [gmm.covariances_[i][1][0], gmm.covariances_[i][1][1]]])

plot_results(gmm_means, gmm_cov, fig, axes)

plt.show()

print(gmm.get_params())

#labels_1 = gmm.fit_predict(Xnew)

#plt.scatter(Xnew[:, 0], Xnew[:, 1], c=labels_1, s=7, cmap='viridis')

#labels_2 = gmm.get_params(deep=True)
#print (labels_2)
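
plot_results() is not included in this fragment; a common way to implement such a helper is to draw one covariance ellipse per component from the 2x2 matrices collected above. The sketch below is an assumption that only mirrors the call signature used in the snippet:

# Hypothetical plot_results() matching the call above: one 2-sigma covariance
# ellipse per component. This is an assumption; the original helper is not shown.
import numpy as np
from matplotlib.patches import Ellipse

def plot_results(means, covariances, fig, ax, n_std=2.0):
    # 'fig' is unused here; it is kept only to mirror the call signature above.
    for mean, cov in zip(np.asarray(means), np.asarray(covariances)):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]                 # major axis first
        vals, vecs = vals[order], vecs[:, order]
        angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))
        width, height = 2 * n_std * np.sqrt(vals)
        ax.add_patch(Ellipse(mean, width, height, angle=angle,
                             facecolor='none', edgecolor='red', lw=1))
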
Example 13
File: gmm.py Project: tadeze/ADMV
                                         p_k=param[2])


if __name__ == '__main__':
    import pandas as pd
    from util.common import metric
    df = pd.read_csv(
        '/home/tadeze/projects/missingvalue/datasets/anomaly/yeast/fullsamples/yeast_1.csv'
    )
    train_data = df.iloc[:, 1:].to_numpy().astype(np.float64)
    # train_lbl = df.iloc[:, 0]
    train_lbl = (df.iloc[:, 0] == "anomaly").astype(int).tolist()
    gmms = GaussianMixture(n_components=3)
    gmms.fit(train_data)
    score = -gmms.score_samples(train_data)
    print(gmms.get_params(False))
    print(len(score))
    print(metric(train_lbl, score))
    from pypr.clustering import gmm

    cen_lst, cov_lst, p_k, logL = gmm.em_gm(train_data, max_iter=100, K=3)
    score = [
        -gmm.gm_log_likelihood(
            train_data[i, :], center_list=cen_lst, cov_list=cov_lst, p_k=p_k)
        for i in range(0, train_data.shape[0])
    ]
    #print score
    print(metric(train_lbl, score))

    # Marginalize the
    #m_cen_lst, m_cov_lst, m_p_k
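
In this example the anomaly score is the negative log-likelihood per sample, so the less probable a point is under the mixture, the higher its score. A minimal thresholding sketch, assuming a per-sample score array like the one computed above and a purely illustrative 5% contamination rate:

# Sketch: turn per-sample negative log-likelihoods into binary anomaly flags by
# thresholding at an assumed contamination rate (5% here, purely illustrative).
import numpy as np

contamination = 0.05
scores = np.asarray(score)                               # 'score' as computed above
threshold = np.percentile(scores, 100 * (1 - contamination))
flags = (scores > threshold).astype(int)                 # 1 = flagged as anomaly
print(flags.sum(), "points flagged out of", len(flags))
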
Example 14
class PCM:
    """
        Common base class for a Profile Classification Model
    """
    def __init__(self,
                 K,
                 DPTmodel,
                 scaling=1,
                 reduction=1,
                 classif='gmm',
                 COVARTYPE='full',
                 maxvar=99.9,
                 verb=False):
        #todo: check inputs validity
        if (scaling == 0):
            with_scaler = False
            with_mean = False
            with_std = False
        elif (scaling == 1):
            with_scaler = True
            with_mean = True
            with_std = True
        elif (scaling == 2):
            with_scaler = True
            with_mean = True
            with_std = False
        else:
            raise NameError('scaling must be 0, 1 or 2')

        if (reduction == 0): with_reducer = False
        elif (reduction == 1): with_reducer = True
        else: raise NameError('reduction must be 0 or 1')

        if (classif == 'gmm'): with_classifier = 'gmm'
        else:
            raise NameError(
                "classifier must be 'gmm' (no other methods at this time)")

        self._props = {
            'K': int(K),
            'COVARTYPE': COVARTYPE,
            'with_scaler': with_scaler,
            'with_reducer': with_reducer,
            'with_classifier': with_classifier,
            'maxvar': maxvar,
            'DPTmodel': np.float32(DPTmodel)
        }
        self._trained = False
        self._verb = verb
        self.K = self._props['K']
        self._scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                                    with_std=with_std)
        self._reducer = PCA(n_components=self._props['maxvar'] / 100,
                            svd_solver='full')
        self._classifier = GaussianMixture(
            n_components=self._props['K'],
            covariance_type=self._props['COVARTYPE'],
            init_params='kmeans',
            max_iter=1000,
            tol=1e-6)
        self.interpoler = PCM.Interp(self._props['DPTmodel'])
        self._version = '0.1'

    def __iter__(self):
        self.__i = 0
        return self

    def __next__(self):  # Python 3 iterator protocol
        if self.__i < self.K:
            i = self.__i
            self.__i += 1
            return i
        else:
            raise StopIteration()

    class Interp:
        """ 
            Internal machinery for the interpolation of vertical profiles
            
            This class is called once at PCM instance initialisation and
            whenever data to be classified are not on the PCM vertical axis.
            
        """
        def __init__(self, DPTmodel):
            self.zi = DPTmodel
            self.doINTERPz = False

        def isnecessary(self, C, z):
            """
                Check whether or not the data vertical axis is different
                from the PCM one, if not, avoid interpolation
            """
            z = np.float32(z)
            # self.doINTERPz = not np.array_equal(self.zi,z)
            self.doINTERPz = not np.array_equiv(self.zi, z)
            return self.doINTERPz

        def mix(self, x):
            """ 
                Homogenize the upper water column:
                Set 1st nan value to the first non-NaN value
            """
            # izmixed = np.argwhere(np.isnan(x))
            izok = np.where(~np.isnan(x))[0][0]
            # x[izmixed] = x[izok]
            x[0] = x[izok]
            return x

        def fit_transform(self, C, z):
            """
                Interpolate data on the PCM vertical axis
            """
            if self.isnecessary(C, z):
                [Np, Nz] = C.shape
                # Possibly Create a mixed layer for the interpolation to work
                # smoothly at the surface
                if (z[0] < 0.) & (self.zi[0] == 0.):
                    z = np.concatenate((np.zeros(1), z))
                    x = np.empty((Np, 1))
                    x.fill(np.nan)
                    C = np.concatenate((x, C), axis=1)
                    np.apply_along_axis(self.mix, 1, C)
                # Linear interpolation of profiles onto the model grid:
                # f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic')
                f = interpolate.interp2d(z, np.arange(Np), C, kind='linear')
                C = f(self.zi, np.arange(Np))
            return C

    def display(self, deep=True):
        """
            Display detailed parameters of the PCM
            This is not get_params because it doesn't return a dictionary
        """
        summary = [("<pcm '%s' (K: %i, Z: %i)>") %
                   (self._props['with_classifier'], self._props['K'],
                    self._props['DPTmodel'].size)]

        # PCM core properties:
        prop_info = ('Number of class: %i') % self._props['K']
        summary.append(prop_info)

        prop_info = ('Vertical axis: %s') % repr(self._props['DPTmodel'])
        summary.append(prop_info)

        prop_info = ('Trained: %r') % self._trained
        summary.append(prop_info)

        # PCM workflow parameters:
        prop_info = ('Vertical Interpolation: %r') % self.interpoler.doINTERPz
        summary.append(prop_info)
        summary.append("\t Interpoler: %s" % (type(self.interpoler)))

        prop_info = ('Sample Normalisation: %r') % self._props['with_scaler']
        summary.append(prop_info)
        summary.append("\t Normaliser: %s" % (type(self._scaler)))

        if (deep):
            summary.append("\t Normaliser properties:")
            d = self._scaler.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        prop_info = (
            'Dimensionality Reduction: %r') % self._props['with_reducer']
        summary.append(prop_info)
        summary.append("\t Reducer: %s" % (type(self._reducer)))
        #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar']
        #summary.append(prop_info)

        if (deep):
            summary.append("\t Reducer properties:")
            d = self._reducer.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        prop_info = ('Classification: %r') % self._props['with_classifier']
        summary.append(prop_info)
        summary.append("\t Classifier: %s" % (type(self._classifier)))
        #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE']
        #summary.append(prop_info)
        if (self._trained):
            prop_info = ('\t log likelihood: %f') % self.llh
            summary.append(prop_info)

        if (deep):
            summary.append("\t Classifier properties:")
            d = self._classifier.get_params(deep=deep)
            for p in d:
                summary.append(("\t\t %s: %r") % (p, d[p]))

        # Done
        return '\n'.join(summary)

    def __repr__(self):
        return self.display(deep=self._verb)

    def fit(self, X, Z):
        """
            For a PCM, the fit method consists of the following operations:
                - interpolation to the Depth levels of the model
                - scaling
                - reduction
                - estimate GMM parameters                
        """
        # CHECK INPUTS
        #todo we should check for errors/inconsistencies in inputs

        # INTERPOLATION:
        X = self.interpoler.fit_transform(X, Z)

        # SCALING:
        self._scaler.fit(X)
        X = self._scaler.transform(X)

        # REDUCTION:
        if (self._props['with_reducer']):
            self._reducer.fit(X)
            X = self._reducer.transform(X)

        # CLASSIFICATION-MODEL TRAINING:
        self._classifier.fit(X)
        self.llh = self._classifier.score(X)

        # Done:
        self._trained = True
        return self

    def predict(self, X, Z):
        """
            Using the self PCM properties, predict the class of new data
        """
        # CHECK INPUTS
        #todo we should check for errors/inconsistencies in inputs
        print(self._trained)
        if not self._trained:
            raise NameError("Can't predict before fitting !")

        # INTERPOLATION:
        X = self.interpoler.fit_transform(X, Z)

        # SCALING:
        X = self._scaler.transform(X)

        # REDUCTION:
        if (self._props['with_reducer']):
            X = self._reducer.transform(X)

        # CLASSIFICATION PREDICTION:
        self.LABELS = self._classifier.predict(X)
        self.llh = self._classifier.score(X)

        # done:
        return self

    def fit_predict(self, X, Z):
        """
            Train a PCM and predict classes in a single step
        """
        # CHECK INPUTS
        #todo we should check for errors/inconsistencies in inputs

        # INTERPOLATION:
        X = self.interpoler.fit_transform(X, Z)

        # SCALING:
        self._scaler.fit(X)
        X = self._scaler.transform(X)

        # REDUCTION:
        if (self._props['with_reducer']):
            self._reducer.fit(X)
            X = self._reducer.transform(X)

        # CLASSIFICATION-MODEL TRAINING:
        self._classifier.fit(X)

        # CLASSIFICATION PREDICTION:
        self.LABELS = self._classifier.predict(X)
        self.llh = self._classifier.score(X)

        # Done:
        self._trained = True
        return self
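
Once the profiles already share the PCM vertical axis, the scale -> reduce -> classify chain implemented by this class can be expressed as a scikit-learn Pipeline; a rough equivalent sketch follows (the number of classes is an assumption, the other values mirror the constructor defaults above):

# Rough scikit-learn Pipeline equivalent of the scale -> reduce -> classify chain
# above, assuming profiles are already on the PCM vertical axis (no interpolation).
# n_components=8 for the GMM is an assumption; K has no default in the class above.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

pcm_like = Pipeline([
    ('scale', StandardScaler(with_mean=True, with_std=True)),
    ('reduce', PCA(n_components=0.999, svd_solver='full')),   # keep 99.9% of variance
    ('gmm', GaussianMixture(n_components=8, covariance_type='full',
                            init_params='kmeans', max_iter=1000, tol=1e-6)),
])
# labels = pcm_like.fit(X).predict(X)   # X: (n_profiles, n_levels), assumed available
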
Example 15
    y_train_lbl, classes=range(10))  # TODO: sparse_output=True?
predicted_class = np.empty((labeled_rows + unlabeled_rows))
predicted_class[0:labeled_rows] = y_train_lbl

# https://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html
tol = 0.001
max_iter = 100
# n = all_rows = X_train_all.shape[0]
# for P(x)
gm = GaussianMixture(max_iter=1,
                     n_components=10,
                     weights_init=thetas[2],
                     means_init=thetas[0],
                     precisions_init=thetas[1])
gm.fit(X_train_lbl)  #needed for predict
print(gm.get_params()['means_init'] == thetas[0])

#ll_old = 0      # log-likelihood?
for i in range(max_iter):
    print('\nIteration: ', i)
    print()

    # E-step: compute γ_y(x) = P(z=y | x, Σ, μ, w) for unlabeled data
    #gammas = np.zeros((10, unlabeled_rows))
    #for y in range(10):
    #for i in range(unlabeled_rows):      # for each unlabled x-row
    # w_y*P(X | Σ_y, μ_y) / P(X)
    gammas[labeled_rows:] = gm.predict_proba(
        X_train_unlbl)  # shape: unlbl samples * 10

    # M-step: MLE for μ_y, Σ_y, w_y
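
The fragment stops before the M-step; for a semi-supervised mixture of this kind the M-step is the usual weighted maximum-likelihood update applied to the full responsibility matrix (one-hot rows for the labeled samples, predict_proba rows for the unlabeled ones). A hedged standalone sketch of those updates, with illustrative names that are not taken from the original code:

# Hedged sketch of the M-step described by the comment above: weighted MLE updates
# of weights, means and covariances from a full responsibility matrix.
# All names are illustrative; this is not the original (truncated) implementation.
import numpy as np

def m_step(X_all, gammas, reg_covar=1e-6):
    """X_all: (n, d) labeled+unlabeled data; gammas: (n, K) responsibilities."""
    n, d = X_all.shape
    Nk = gammas.sum(axis=0)                      # effective number of samples per class
    weights = Nk / n
    means = (gammas.T @ X_all) / Nk[:, None]
    covs = np.empty((gammas.shape[1], d, d))
    for k in range(gammas.shape[1]):
        diff = X_all - means[k]
        covs[k] = (gammas[:, k, None] * diff).T @ diff / Nk[k] + reg_covar * np.eye(d)
    return weights, means, covs
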