def process_phonescreen_day_data(self, user_id: str, touchstream: List[DataPoint],
                                 input_touchstream: DataStream, gm: GaussianMixture):
    """
    Analyze gaps between phone touch-screen events to identify typing, pause-between-typing,
    reading and unknown sessions. A Gaussian Mixture model with four components is used to
    separate the distribution of screen-touch gaps into these four groups.

    :param str user_id: UUID of the stream owner
    :param List(DataPoint) touchstream: Phone touch screen stream data
    :param DataStream input_touchstream: DataStream object of the phone touch screen stream
    :param GaussianMixture gm: GaussianMixture object fitted on the user's all-day data
    :return:
    """
    touchstream = sorted(touchstream, key=lambda x: x.start_time)
    # appusage = self.get_appusage_duration_by_category(categorystream, ["Communication", "Productivity"])
    # tapping_gap = self.appusage_interval_list(touchstream, appusage)
    # if len(tapping_gap) < 50:
    #     self.CC.logging.log("Not enough screen touch data")
    #     return

    # Gaps between consecutive touch events
    tapping_gap = []
    for i in range(1, len(touchstream)):
        tapping_gap.append(touchstream[i].sample - touchstream[i - 1].sample)
    tapping_gap = sorted(tapping_gap)
    if len(tapping_gap) == 0:
        self.CC.logging.log("Not enough screen touch data")
        return

    # gm = GaussianMixture(n_components=4, max_iter=500)  # , covariance_type='spherical')
    X = (np.array(tapping_gap) / 1000).reshape(-1, 1)
    # gm.fit(X)
    P = gm.predict(X)

    # Min/max gap observed in each mixture component
    mx = np.zeros(gm.get_params()['n_components'])
    mn = np.full(gm.get_params()['n_components'], np.inf)
    for i in range(len(P)):
        x = P[i]
        mx[x] = max(mx[x], X[i][0])
        mn[x] = min(mn[x], X[i][0])

    intervals = []
    for i in range(len(mx)):
        intervals.append((mn[i], mx[i]))
    intervals = sorted(intervals)

    try:
        data = self.label_appusage_intervals(touchstream, intervals,
                                             ["typing", "pause", "reading", "unknown"])
        if data:
            self.store_stream(filepath="phone_touch_type_all_app.json",
                              input_streams=[input_touchstream],
                              user_id=user_id, data=data, localtime=False)
    except Exception as e:
        self.CC.logging.log("Exception:", str(e))
        self.CC.logging.log(str(traceback.format_exc()))
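# A minimal, self-contained sketch of the idea above: fit a 4-component GaussianMixture to
# inter-touch gaps and turn each component into a (min, max) gap interval. The synthetic gap
# distributions and the sorted-interval-to-label mapping are illustrative assumptions only,
# not the cerebralcortex pipeline.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Synthetic inter-touch gaps in seconds: fast typing, short pauses, reading, long idle.
gaps = np.concatenate([
    rng.normal(0.3, 0.05, 500),    # typing
    rng.normal(2.0, 0.3, 200),     # pause between typing
    rng.normal(15.0, 3.0, 100),    # reading
    rng.normal(120.0, 20.0, 50),   # unknown / idle
]).reshape(-1, 1)

gm_demo = GaussianMixture(n_components=4, max_iter=500, random_state=0).fit(gaps)
components = gm_demo.predict(gaps)

# Min/max gap per component, sorted so the shortest-gap component maps to "typing"
# and the longest to "unknown".
n_components = gm_demo.get_params()['n_components']
intervals = sorted(
    (gaps[components == k].min(), gaps[components == k].max())
    for k in range(n_components)
)
for (lo, hi), label in zip(intervals, ["typing", "pause", "reading", "unknown"]):
    print(f"{label:8s} gap range: {lo:7.2f}s .. {hi:7.2f}s")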
def Gaussian_Mixture_Model(X_train, y_train, X_test, y_test, max_iter):
    '''
    Isomap demonstrated that the data is distributed in overlapping groups. Therefore,
    samples should be allocated to digits based on Gaussian distributions.
    '''
    n_components = 10
    model = GaussianMixture(n_components=n_components, max_iter=max_iter)
    # GaussianMixture is unsupervised: fit() ignores y_train, so the accuracy below is only
    # meaningful if the component indices happen to line up with the digit labels.
    classifier = model.fit(X_train, y_train)
    testing_model = model.predict(X_test)
    score = accuracy_score(y_test, testing_model)
    # cv_scores = cross_val_score(classifier, X_test, y_test, cv=3)

    print(' ')
    print('===== Gaussian Mixture Model =====')
    print('score:', score)
    # print('cross validation scores:', cv_scores)

    # Visualize parameters in a table.
    visualize_params(model.get_params())
    # Visualize actual labels versus predicted labels.
    visualize_heatmap(y_test, testing_model, 'Gaussian Mixture Model')

    return score
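# Hedged sketch (sklearn's digits dataset is an assumption here) of how to make the accuracy
# above meaningful: GaussianMixture components carry arbitrary indices, so map each component
# to the digit it most often contains in the training data before scoring.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = GaussianMixture(n_components=10, max_iter=200, random_state=0).fit(X_train)

# Majority digit per component, estimated on the training split.
train_components = model.predict(X_train)
component_to_digit = {k: np.bincount(y_train[train_components == k]).argmax()
                      for k in np.unique(train_components)}

raw = model.predict(X_test)
mapped = np.array([component_to_digit.get(k, -1) for k in raw])
print("raw accuracy:   ", accuracy_score(y_test, raw))
print("mapped accuracy:", accuracy_score(y_test, mapped))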
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds clusters of users in data using Gaussian Mixture Models.

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param max_n_clusters: maximal number of clusters for automatic selection of the number of clusters.
        If None, then n_clusters from the keyword arguments is used.
    :param use_csi: if True, then the cluster stability index will be calculated (may take a lot of time)
    :param random_state: random state for the GaussianMixture clusterer
    :param kwargs: keyword arguments for sklearn.mixture.GaussianMixture
    :return: np.array of cluster labels and a dict of clustering metrics
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters, random_state, **kwargs)
    else:
        # Keep only kwargs that are valid GaussianMixture constructor parameters
        kmargs = {i: j for i, j in kwargs.items() if i in GaussianMixture().get_params()}
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    km.labels_ = cl
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds clusters of users in data using Gaussian Mixture Models.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection of the number of clusters.
        If ``None``, then uses ``n_clusters`` from arguments. Default: ``None``
    use_csi: bool, optional
        If ``True``, then the cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for the GaussianMixture clusterer.
    kwargs: optional
        Parameters for ``sklearn.mixture.GaussianMixture``

    Returns
    --------
    Array of cluster labels and a dict of clustering metrics

    Return type
    --------
    np.array, dict
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters, random_state, **kwargs)
    else:
        # Keep only kwargs that are valid GaussianMixture constructor parameters
        kmargs = {i: j for i, j in kwargs.items() if i in GaussianMixture().get_params()}
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    km.labels_ = cl
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
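# A generic, hedged sketch of what the automatic selection step can look like when
# max_n_clusters is given; the real find_best_n_clusters helper may use different criteria,
# and the blob data below is purely illustrative.
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=600, centers=4, random_state=0)
data = pd.DataFrame(X, index=[f"user_{i}" for i in range(len(X))])

def pick_n_components_by_bic(data, max_n_clusters, random_state=0, **kwargs):
    """Return GaussianMixture kwargs with the BIC-optimal n_components."""
    valid = {k: v for k, v in kwargs.items() if k in GaussianMixture().get_params()}
    bics = {}
    for n in range(1, max_n_clusters + 1):
        gm = GaussianMixture(n_components=n, random_state=random_state, **valid).fit(data.values)
        bics[n] = gm.bic(data.values)
    best = min(bics, key=bics.get)
    return {**valid, 'n_components': best, 'random_state': random_state}

kmargs = pick_n_components_by_bic(data, max_n_clusters=8, covariance_type='full')
clusters = GaussianMixture(**kmargs).fit_predict(data.values)
print(kmargs['n_components'], np.bincount(clusters))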
def run_sample(word2vec_model: str, sample_size, averaged_category_map, averaged_comment_map):
    # Convert article-comment map into a matrix of averaged comment weight vectors
    print("Converting comment map to matrix...")
    comment_matrix = convert_to_matrix(averaged_comment_map)
    print()

    if sample_size is not None:
        print("Selecting " + str(sample_size) + " samples...", end="")
        sample_indices = np.random.choice(comment_matrix.shape[0], size=sample_size, replace=False)
        samples = comment_matrix[sample_indices, :]
    else:
        sample_size = len(comment_matrix)
        print("Selecting " + str(sample_size) + " samples...", end="")
        samples = comment_matrix
    print(" Done.")
    print()

    # Set up GMM and fit to data
    print("Fitting using GMM...")
    gmm = GaussianMixture(n_components=50, verbose=2, verbose_interval=1)
    gmm.fit(samples)
    print()

    gmm_data = {
        "params": gmm.get_params(),
        "weights": gmm.weights_,
        "means": gmm.means_,
        "covariances": gmm.covariances_,
        "precisions": gmm.precisions_,
        "precisions_cholesky": gmm.precisions_cholesky_
    }
    np.save(
        "../resources/gmm_" + str(sample_size) + "_" + word2vec_model + ".npy",
        gmm_data)
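# A small follow-up sketch showing how the dictionary saved above can be loaded back and used
# to rebuild an equivalent GaussianMixture; the tiny random data and file name are placeholders.
# np.save stores the dict as a pickled 0-d object array, so np.load needs allow_pickle=True.
import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.default_rng(0).normal(size=(200, 3))
gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
gmm_data = {
    "params": gmm.get_params(),
    "weights": gmm.weights_,
    "means": gmm.means_,
    "covariances": gmm.covariances_,
    "precisions": gmm.precisions_,
    "precisions_cholesky": gmm.precisions_cholesky_,
}
np.save("gmm_example.npy", gmm_data)

loaded = np.load("gmm_example.npy", allow_pickle=True).item()
restored = GaussianMixture(**loaded["params"])
restored.weights_ = loaded["weights"]
restored.means_ = loaded["means"]
restored.covariances_ = loaded["covariances"]
restored.precisions_ = loaded["precisions"]
restored.precisions_cholesky_ = loaded["precisions_cholesky"]
# The restored model scores new data without re-fitting.
print(np.allclose(restored.score_samples(X), gmm.score_samples(X)))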
def gmm():
    gmm = GaussianMixture(n_components=3).fit(iris_X)
    # get_params() takes no data argument; it only reports the estimator's hyper-parameters.
    print(gmm.get_params())
    labels = gmm.predict(iris_X)

    # The iris samples come in three blocks of 50, one per species. For each block,
    # count the samples that do not fall into that block's majority mixture component.
    misclassified = 0
    for start in (0, 50, 100):
        block = labels[start:start + 50]
        counts = [np.sum(block == k) for k in range(3)]
        misclassified += len(block) - max(counts)

    accuracy = 1 - misclassified / 150
    return accuracy
class pcm(object): """Profile Classification Model class constructor Consume and return :mod:`xarray` objects """ def __init__(self, K:int, features:dict(), scaling=1, reduction=1, maxvar=15, classif='gmm', covariance_type='full', verb=False, debug=False, timeit=False, timeit_verb=False, chunk_size='auto', backend='sklearn'): """Create the PCM instance Parameters ---------- K: int The number of class, or cluster, in the classification model. features: dict() The vertical axis to use for each features. eg: {'temperature':np.arange(-2000,0,1)} scaling: int (default: 1) Define the scaling method: - 0: No scaling - **1: Center on sample mean and scale by sample std** - 2: Center on sample mean only reduction: int (default: 1) Define the dimensionality reduction method: - 0: No reduction - **1: Reduction using :class:`sklearn.decomposition.PCA`** maxvar: float (default: 99.9) Maximum feature variance to preserve in the reduced dataset using :class:`sklearn.decomposition.PCA`. In %. classif: str (default: 'gmm') Define the classification method. The only method available as of now is a Gaussian Mixture Model. See :class:`sklearn.mixture.GaussianMixture` for more details. covariance_type: str (default: 'full') Define the type of covariance matrix shape to be used in the default classifier GMM. It can be ‘full’ (default), ‘tied’, ‘diag’ or ‘spherical’. verb: boolean (default: False) More verbose output timeit: boolean (default: False) Register time of operation for performance evaluation timeit_verb: boolean (default: False) Print time of operation during execution chunk_size: 'auto' or int Sampling chunk size, (array of features after pre-processing) backend: str Statistic library backend, 'sklearn' (default) or 'dask_ml' """ if K==0: raise PCMClassError("Can't create a PCM with K=0") if K is None: raise PCMClassError("K must be defined to create a PMC") if not bool(features): raise PCMFeatureError("Can't create a PCM without features") if scaling==0: with_scaler = 'none'; with_mean=False; with_std = False elif scaling==1: with_scaler = 'normal'; with_mean=True; with_std = True elif scaling==2: with_scaler = 'center'; with_mean=True; with_std = False else: raise NameError('scaling must be 0, 1 or 2') if reduction==0: with_reducer = False elif reduction==1: with_reducer = True else: raise NameError('reduction must be 0 or 1') if classif=='gmm': with_classifier = 'gmm'; else: raise NameError("classifier must be 'gmm' (no other methods implemented at this time)") #todo check validity of the dict of features self._props = {'K': np.int(K), 'F': len(features), 'llh': None, 'COVARTYPE': covariance_type, 'with_scaler': with_scaler, 'with_reducer': with_reducer, 'with_classifier': with_classifier, 'maxvar': maxvar, 'features': collections.OrderedDict(features), 'chunk_size': chunk_size, 'backend': backend} self._xmask = None # xarray mask for nd-array used at pre-processing steps self._register = collections.OrderedDict() # Will register mutable instances of sub-modules like 'plot' self._verb = verb #todo _verb is a property, should be set/get with a decorator self._debug = debug self._interpoler = collections.OrderedDict() self._scaler = collections.OrderedDict() self._scaler_props = collections.OrderedDict() self._reducer = collections.OrderedDict() self._homogeniser = collections.OrderedDict() # Load estimators for a specific backend: bck = StatisticsBackend(backend, scaler='StandardScaler', reducer='PCA') for feature_name in features: feature_axis = self._props['features'][feature_name] if 
isinstance(feature_axis, xr.DataArray): self._props['features'][feature_name] = feature_axis.values # self._scaler[feature_name] = preprocessing.StandardScaler(with_mean=with_mean, # with_std=with_std) if 'none' not in self._props['with_scaler']: self._scaler[feature_name] = bck.scaler(with_mean=with_mean, with_std=with_std) else: self._scaler[feature_name] = NoTransform() self._scaler_props[feature_name] = {'units': '?'} is_slice = np.all(feature_axis == None) if not is_slice: self._interpoler[feature_name] = Vertical_Interpolator(axis=feature_axis, debug=self._debug) if np.prod(feature_axis.shape) == 1: # Single level: no need to reduce if self._debug: print('Single level, not need to reduce', np.prod(feature_axis.ndim)) self._reducer[feature_name] = NoTransform() else: # Multi-vertical-levels, set reducer: if with_reducer: self._reducer[feature_name] = bck.reducer(n_components=self._props['maxvar'], svd_solver='full') else: self._reducer[feature_name] = NoTransform() else: self._interpoler[feature_name] = NoTransform() self._reducer[feature_name] = NoTransform() if self._debug: print("%s is single level, no need to reduce" % feature_name) self._homogeniser[feature_name] = {'mean': 0, 'std': 1} self._classifier = GaussianMixture(n_components=self._props['K'], covariance_type=self._props['COVARTYPE'], init_params='kmeans', max_iter=1000, tol=1e-6) # Define the "context" to execute some functions inner code # (useful for time benchmarking) self._context = self.__empty_context # Default is empty, do nothing self._context_args = dict() if timeit: self._context = self.__timeit_context self._context_args = {'maxlevel': 3, 'verb':timeit_verb} self._timeit = dict() # Define statistics for the fit method: self._fit_stats = dict({'datetime': None, 'n_samples_seen_': None, 'score': None, 'etime': None}) @contextmanager def __timeit_context(self, name, opts=dict()): default_opts = {'maxlevel': np.inf, 'verb':False} for key in opts: if key in default_opts: default_opts[key] = opts[key] level = len([i for i in range(len(name)) if name.startswith('.', i)]) if level <= default_opts['maxlevel']: startTime = time.time() yield elapsedTime = time.time() - startTime trailingspace = " " * level trailingspace = " " if default_opts['verb']: # print('... time in {} {}: {} ms'.format(trailingspace, name, int(elapsedTime * 1000))) print('{} {}: {} ms'.format(trailingspace, name, int(elapsedTime * 1000))) if name in self._timeit: self._timeit[name].append(elapsedTime * 1000) else: self._timeit[name] = list([elapsedTime*1000]) else: yield @contextmanager def __empty_context(self, name, *args, **kargs): yield def __call__(self, **kwargs): self.__init__(**kwargs) def __iter__(self): self.__i = 0 return self def __next__(self): if self.__i < self.K: i = self.__i self.__i += 1 return i else: raise StopIteration() def __repr__(self): return self.display(deep=self._verb) def ravel(self, da, dim=None, feature_name=str): """ Extract from N-d array a X(feature,sample) 2-d array and vertical dimension z Parameters ---------- da: :class:`xarray.DataArray` The DataArray to process dim: str Name of the vertical dimension in the input :class:`xarray.DataArray` feature_name: str Target PCM feature name for the input :class:`xarray.DataArray` Returns ------- X: :class:`xarray.DataArray` A new DataArray with dimension ['n_sampling','n_features'] Note that data are always :class:`dask.array.Array`. 
z: :class:`numpy.array` The vertical axis of data sampling_dims: dict() Dictionary where keys are :class:`xarray.Dataset` variable names of features and values are another dictionary with the list of sampling dimension in ``DIM_SAMPLING`` key and the name of the vertical axis in the ``DIM_VERTICAL`` key. Examples -------- This function is meant to be used internally only __author__: [email protected] """ # Is this a thick array or a slice ? is_slice = np.all(self._props['features'][feature_name] == None) # Load mask where all features are available for this PCM: mask_stacked = self._xmask if is_slice: # No vertical dimension to use, simple stacking sampling_dims = list(da.dims) # Apply all-features mask: X = da.stack({'sampling': sampling_dims}) X = X.where(mask_stacked == 1, drop=True).expand_dims('dummy').transpose()#.values z = np.empty((1,)) else: if not dim: # Try to infer the vertical dimension name looking for the CF 'axis' attribute in all dimensions dim_found = False for this_dim in da.dims: if ('axis' in da[this_dim].attrs) and (da[this_dim].attrs['axis'] == 'Z'): dim = this_dim dim_found = True if not dim_found: raise PCMFeatureError("You must specify a vertical dimension name: "\ "use argument 'dim' or "\ "specify DataSet dimension the attribute 'axis' to 'Z' (CF1.6)") elif dim not in da.dims: raise ValueError("Vertical dimension %s not found in this DataArray" % dim) sampling_dims = list(da.dims) sampling_dims.remove(dim) X = da.stack({'sampling': sampling_dims}) #todo Improve performance for this operation ! # Apply all-features mask: X = X.where(mask_stacked == 1, drop=True).transpose() z = da[dim].values X = X.chunk(chunks={'sampling': self._props['chunk_size']}) return X, z, sampling_dims def unravel(self, ds, sampling_dims, X): """ Create a DataArray from a numpy array and sampling dimensions """ # Load mask where all features are available for this PCM: mask_stacked = self._xmask # coords = list() size = list() for dim in sampling_dims: coords.append(ds[dim]) size.append(len(ds[dim])) da = xr.DataArray(np.empty((size)), coords=coords) da = da.stack({'sampling': sampling_dims}) da = da.where(mask_stacked == 1, drop=True).transpose() da.values = X da = da.unstack('sampling') if (np.prod(da.shape) != mask_stacked.shape[0]): if self._debug: print("\tUnravelled data not matching mask dimension, re-indexing") mask = mask_stacked.unstack() da = da.reindex_like(mask) return da @property def K(self): """Return the number of classes""" return self._props['K'] @property def F(self): """Return the number of features""" return self._props['F'] @property def features(self): """Return features definition dictionnary""" return self._props['features'] @property def plot(self): """Access plotting functions""" # Create a mutable instance on 1st call so that later changes will be reflected in future calls # https://stackoverflow.com/a/8140747 if 'plot' not in self._register: self._register['plot'] = [_PlotMethods(self)] return self._register['plot'][0] @property def stat(self): """Access statistics functions""" return _StatMethods(self) @property def timeit(self): """ Return a :class:`pandas.DataFrame` with Execution time of method called on this instance """ def get_multindex(times): """ Create multi-index pandas """ # Get max levels: dpt = list() [dpt.append(len(key.split("."))) for key in times] max_dpt = np.max(dpt) # Read index: levels_1 = list() levels_2 = list() levels_3 = list() levels_4 = list() if max_dpt == 1: for key in times: levels = key.split(".") levels_1.append(levels[0]) 
return max_dpt, [levels_1] elif max_dpt == 2: for key in times: levels = key.split(".") if len(levels) == 1: levels_1.append(levels[0]) levels_2.append('total') if len(levels) == 2: levels_1.append(levels[0]) levels_2.append(levels[1]) return max_dpt, [levels_1,levels_2] elif max_dpt == 3: for key in times: levels = key.split(".") # print(len(levels), levels) if len(levels) == 1: levels_1.append(levels[0]) levels_2.append('total') levels_3.append('') if len(levels) == 2: levels_1.append(levels[0]) levels_2.append(levels[1]) levels_3.append('total') if len(levels) == 3: levels_1.append(levels[0]) levels_2.append(levels[1]) levels_3.append(levels[2]) return max_dpt, [levels_1,levels_2,levels_3] elif max_dpt == 4: for key in times: levels = key.split(".") if len(levels) == 1: levels_1.append(levels[0]) levels_2.append('total') levels_3.append('') levels_4.append('') if len(levels) == 2: levels_1.append(levels[0]) levels_2.append(levels[1]) levels_3.append('total') levels_4.append('') if len(levels) == 3: levels_1.append(levels[0]) levels_2.append(levels[1]) levels_3.append(levels[2]) levels_4.append('total') if len(levels) == 4: levels_1.append(levels[0]) levels_2.append(levels[1]) levels_3.append(levels[2]) levels_4.append(levels[3]) return max_dpt, [levels_1,levels_2,levels_3,levels_4] times = self._timeit max_dpt, arrays = get_multindex(times) if max_dpt == 1: index = pd.Index(arrays[0], names=['Method']) df = pd.Series([np.sum(times[key]) for key in times], index=index) # df = df.T elif max_dpt == 2: tuples = list(zip(*arrays)) index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method']) df = pd.Series([np.sum(times[key]) for key in times], index=index) df = df.unstack(0) df = df.drop('total') df = df.T elif max_dpt == 3: tuples = list(zip(*arrays)) index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method', 'Sub-sub-method']) df = pd.Series([np.sum(times[key]) for key in times], index=index) # df = df.unstack(0) elif max_dpt == 4: tuples = list(zip(*arrays)) index = pd.MultiIndex.from_tuples(tuples, names=['Method', 'Sub-method', 'Sub-sub-method', 'Sub-sub-sub-method']) df = pd.Series([np.sum(times[key]) for key in times], index=index) return df @property def backend(self): """Return the name of the statistic backend""" return self._props['backend'] @property def fitstats(self): """ Estimator fit properties The number of samples processed by the estimator Will be reset on new calls to fit, but increments across partial_fit calls. 
""" return self._fit_stats @docstring(io.to_netcdf.__doc__) def to_netcdf(self, ncfile, **ka): """ Save PCM to netcdf file Parameters ---------- path : str Path to file """ return io.to_netcdf(self, ncfile, **ka) def display(self, deep=False): """Display detailed parameters of the PCM This is not a get_params because it doesn't return a dictionary Set Boolean option 'deep' to True for all properties display """ summary = [("<pcm '%s' (K: %i, F: %i)>")%(self._props['with_classifier'], self._props['K'], len(self._props['features']))] # PCM core properties: prop_info = ('Number of class: %i') % self._props['K'] summary.append(prop_info) prop_info = ('Number of feature: %i') % len(self._props['features']) summary.append(prop_info) prop_info = ('Feature names: %s') % (repr(self._props['features'].keys())) summary.append(prop_info) # prop_info = ('Feature axis: [%s, ..., %s]') % (repr(self._props['features'][0]), # repr(self._props['feature_axis'][-1])) # summary.append(prop_info) prop_info = ('Fitted: %r') % hasattr(self, 'fitted') summary.append(prop_info) # PCM workflow parameters: for feature in self._props['features']: prop_info = "Feature: '%s'" % feature summary.append(prop_info) summary.append("\t Interpoler: %s"%(type(self._interpoler[feature]))) # prop_info = ('\t Sample Scaling: %r') % # summary.append(prop_info) summary.append("\t Scaler: %r, %s"%(self._props['with_scaler'], type(self._scaler[feature]))) if (deep): # summary.append("\t\t Scaler properties:") d = self._scaler[feature].get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) # prop_info = ('\t Dimensionality Reduction: %r') % # summary.append(prop_info) summary.append("\t Reducer: %r, %s"%(self._props['with_reducer'], type(self._reducer[feature]))) if (deep): # summary.append("\t\t Reducer properties:") d = self._reducer[feature].get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) # return '\n'.join(summary) # prop_info = ('Classification: %r') % # summary.append(prop_info) summary.append("Classifier: %r, %s"%(self._props['with_classifier'], type(self._classifier))) #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE'] #summary.append(prop_info) if (hasattr(self,'fitted')): prop_info = ('\t log likelihood of the training set: %f') % self._props['llh'] summary.append(prop_info) if (deep): summary.append("\t Classifier properties:") d = self._classifier.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) # Done return '\n'.join(summary) def preprocessing_this(self, da, dim=None, feature_name=str(), action='?'): """Pre-process data before anything Possible pre-processing steps: - interpolation, - scaling, - reduction Parameters ---------- da: :class:`xarray.DataArray` The DataArray to process dim: str Name of the vertical dimension in the input :class:`xarray.DataArray` feature_name: str Target PCM feature name for the input :class:`xarray.DataArray` Returns ------- X: np.array Pre-processed feature, with dimensions (N_SAMPLE, N_FEATURES) sampling_dims: list() List of the input :class:`xarray.DataArray` dimensions stacked as sampling points """ this_context = str(action)+'.1-preprocess.2-feature_'+feature_name with self._context(this_context + '.total', self._context_args): # MAKE THE ND-ARRAY A 2D-ARRAY with self._context(this_context + '.1-ravel', self._context_args): X, z, sampling_dims = self.ravel(da, dim=dim, feature_name=feature_name) if self._debug: print("\t", "X RAVELED with success", str(LogDataType(X))) # INTERPOLATION STEP: with 
self._context(this_context + '.2-interp', self._context_args): X = self._interpoler[feature_name].transform(X, z) if self._debug: if isinstance(self._interpoler[feature_name], NoTransform): print("\t", "X INTERPOLATED with success (NoTransform)", str(LogDataType(X))) else: print("\t", "X INTERPOLATED with success", str(LogDataType(X))) # print(X.values.flags['WRITEABLE']) # After the interpolation step, we must not have nan in the 2d array: assert_all_finite(X, allow_nan=False) # FIT STEPS: # We need to fit pre-processing methods in order to re-use them when # predicting a new dataset # SCALING: with self._context(this_context+'.3-scale_fit', self._context_args): if not hasattr(self, 'fitted'): self._scaler[feature_name].fit(X.data) if 'units' in da.attrs: self._scaler_props[feature_name]['units'] = da.attrs['units'] with self._context(this_context + '.4-scale_transform', self._context_args): try: X.data = self._scaler[feature_name].transform(X.data, copy=False) except ValueError: if self._debug: print("\t\t Fail to scale.transform without copy, fall back on copy=True") try: X.data = self._scaler[feature_name].transform(X.data, copy=True) except ValueError: if self._debug: print("\t\t Fail to scale.transform with copy, fall back on input copy") X.data = self._scaler[feature_name].transform(X.data.copy()) pass except: if self._debug: print(X.values.flags['WRITEABLE']) raise pass except: raise if self._debug: print("\t", "X SCALED with success)", str(LogDataType(X))) # REDUCTION: with self._context(this_context + '.5-reduce_fit', self._context_args): if (not hasattr(self, 'fitted')) and (self._props['with_reducer']): if self.backend == 'dask_ml': # We have to convert any type of data array into a Dask array because # dask_ml cannot handle anything else (!) #todo Raise an issue on dask_ml github to ask why is this choice made # Related issues: # https://github.com/dask/dask-ml/issues/6 # https://github.com/dask/dask-ml/issues/541 # https://github.com/dask/dask-ml/issues/542 X.data = dask.array.asarray(X.data, chunks=X.shape) if isinstance(X.data, dask.array.Array): self._reducer[feature_name].fit(X.data) else: self._reducer[feature_name].fit(X) with self._context(this_context + '.6-reduce_transform', self._context_args): X = self._reducer[feature_name].transform(X.data) # Reduction, return np.array # After reduction the new array is [ sampling, reduced_dim ] X = xr.DataArray(X, dims=['sampling', 'n_features'], coords={'sampling': range(0, X.shape[0]), 'n_features': np.arange(0,X.shape[1])}) if self._debug: print("\t", "X REDUCED with success)", str(LogDataType(X))) # Output: return X, sampling_dims def preprocessing(self, ds, features=None, dim=None, action='?', mask=None): """ Dataset pre-processing of feature(s) Depending on pyXpcm set-up, pre-processing steps can be: - interpolation, - scaling, - reduction Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. 
dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` Returns ------- X: np.array Pre-processed set of features, with dimensions (N_SAMPLE, N_FEATURES) sampling_dims: list() List of the input :class:`xarray.Dataset` dimensions stacked as sampling points """ this_context = str(action)+'.1-preprocess' with self._context(this_context, self._context_args): if self._debug: print("> Start preprocessing for action '%s'" % action) # How do we find feature variable in this dataset ? features_dict = ds.pyxpcm.feature_dict(self, features=features) # Determine mask where all features are defined for this PCM: with self._context(this_context + '.1-mask', self._context_args): if not mask: mask = ds.pyxpcm.mask(self, features=features, dim=dim) # Stack all-features mask: mask = mask.stack({'sampling': list(mask.dims)}) self._xmask = mask # Pre-process all features and build the X array X = np.empty(()) Xlabel = list() # Construct a list of string labels for each feature dimension (useful for plots) F = self.F # Nb of features for feature_in_pcm in features_dict: feature_in_ds = features_dict[feature_in_pcm] if self._debug: print( ("\n\t> Preprocessing xarray dataset '%s' as PCM feature '%s'")\ %(feature_in_ds, feature_in_pcm) ) if ('maxlevel' in self._context_args) and (self._context_args['maxlevel'] <= 2): a = this_context + '.2-features' else: a = this_context with self._context(a, self._context_args): da = ds[feature_in_ds] x, sampling_dims = self.preprocessing_this(da, dim=dim, feature_name=feature_in_pcm, action=action) xlabel = ["%s_%i"%(feature_in_pcm, i) for i in range(0, x.shape[1])] if self._debug: print("\t%s pre-processed with success, " % feature_in_pcm, str(LogDataType(x))) with self._context(this_context + '.3-homogeniser', self._context_args): # Store full array mean and std during fit: if F>1: # For more than 1 feature, we need to make them comparable, # so we normalise each features by their global stats: # FIT: if (action == 'fit') or (action == 'fit_predict'): self._homogeniser[feature_in_pcm]['mean'] = x.mean().values self._homogeniser[feature_in_pcm]['std'] = x.std().values #todo _homogeniser should be a proper standard scaler # TRANSFORM: x = (x-self._homogeniser[feature_in_pcm]['mean'])/\ self._homogeniser[feature_in_pcm]['std'] if self._debug and action == 'fit': print(("\tHomogenisation for fit of %s") % (feature_in_pcm)) elif self._debug: print(("\tHomogenisation of %s using fit data") % (feature_in_pcm)) elif self._debug: print(("\tNo need for homogenisation of %s") % (feature_in_pcm)) if np.prod(X.shape) == 1: X = x Xlabel = xlabel else: X = np.append(X, x, axis=1) [Xlabel.append(i) for i in xlabel] with self._context(this_context + '.4-xarray', self._context_args): self._xlabel = Xlabel if self._debug: print("\tFeatures array shape and type for xarray:", X.shape, type(X), type(X.data)) X = xr.DataArray(X, dims=['n_samples', 'n_features'], coords={'n_samples': range(0, X.shape[0]), 'n_features': Xlabel}) if self._debug: print("> Preprocessing done, working with final X (%s) array of shape:" % type(X), X.shape, " and sampling dimensions:", sampling_dims) return X, sampling_dims def fit(self, ds, features=None, dim=None): """Estimate PCM parameters For a PCM, the fit method consists in the following operations: - pre-processing - interpolation to the ``feature_axis`` levels of the model - scaling - reduction - estimate classifier parameters Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM 
features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` Returns ------- self """ with self._context('fit', self._context_args) : # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='fit') # CLASSIFICATION-MODEL TRAINING: with self._context('fit.fit', self._context_args): self._classifier.fit(X) with self._context('fit.score', self._context_args): self._props['llh'] = self._classifier.score(X) # Furthermore gather some information about the fit: self._fit_stats['score'] = self._props['llh'] self._fit_stats['datetime'] = datetime.utcnow() if 'n_samples_seen_' not in self._classifier.__dict__: self._fit_stats['n_samples_seen_'] = X.shape[0] else: self._fit_stats['n_samples_seen_'] = self._classifier.n_samples_seen_ if 'n_iter_' in self._classifier.__dict__: self._fit_stats['n_iter_'] = self._classifier.n_iter_ # Done: self.fitted = True return self def predict(self, ds, features=None, dim=None, inplace=False, name='PCM_LABELS'): """Predict labels for profile samples This method add these properties to the PCM object: - ``llh``: The log likelihood of the model with regard to new data Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` inplace: boolean, False by default If False, return a :class:`xarray.DataArray` with predicted labels If True, return the input :class:`xarray.Dataset` with labels added as a new :class:`xarray.DataArray` name: str, default is 'PCM_LABELS' Name of the :class:`xarray.DataArray` with labels Returns ------- :class:`xarray.DataArray` Component labels (if option 'inplace' = False) *or* :class:`xarray.Dataset` Input dataset with Component labels as a 'PCM_LABELS' new :class:`xarray.DataArray` (if option 'inplace' = True) """ with self._context('predict', self._context_args): # Check if the PCM is trained: validation.check_is_fitted(self, 'fitted') # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='predict') # CLASSIFICATION PREDICTION: with self._context('predict.predict', self._context_args): labels = self._classifier.predict(X) with self._context('predict.score', self._context_args): llh = self._classifier.score(X) # Create a xarray with labels output: with self._context('predict.xarray', self._context_args): da = self.unravel(ds, sampling_dims, labels).rename(name) da.attrs['long_name'] = 'PCM labels' da.attrs['units'] = '' da.attrs['valid_min'] = 0 da.attrs['valid_max'] = self._props['K']-1 da.attrs['llh'] = llh # Add labels to the dataset: if inplace: return ds.pyxpcm.add(da) else: return da def fit_predict(self, ds, features=None, dim=None, inplace=False, name='PCM_LABELS'): """Estimate PCM parameters and predict classes. This method add these properties to the PCM object: - ``llh``: The log likelihood of the model with regard to new data Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. 
If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` inplace: boolean, False by default If False, return a :class:`xarray.DataArray` with predicted labels If True, return the input :class:`xarray.Dataset` with labels added as a new :class:`xarray.DataArray` name: string ('PCM_LABELS') Name of the DataArray holding labels. Returns ------- :class:`xarray.DataArray` Component labels (if option 'inplace' = False) *or* :class:`xarray.Dataset` Input dataset with component labels as a 'PCM_LABELS' new :class:`xarray.DataArray` (if option 'inplace' = True) """ with self._context('fit_predict', self._context_args): # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='fit_predict') # CLASSIFICATION-MODEL TRAINING: with self._context('fit_predict.fit', self._context_args): self._classifier.fit(X) with self._context('fit_predict.score', self._context_args): self._props['llh'] = self._classifier.score(X) # Furthermore gather some information about this fit: self._fit_stats['score'] = self._props['llh'] if 'n_samples_seen_' not in self._classifier.__dict__: self._fit_stats['n_samples_seen_'] = X.shape[0] else: self._fit_stats['n_samples_seen_'] = self._classifier.n_samples_seen_ if 'n_iter_' in self._classifier.__dict__: self._fit_stats['n_iter_'] = self._classifier.n_iter_ # Done: self.fitted = True # CLASSIFICATION PREDICTION: with self._context('fit_predict.predict', self._context_args): labels = self._classifier.predict(X) # Create a xarray with labels output: with self._context('fit_predict.xarray', self._context_args): da = self.unravel(ds, sampling_dims, labels).rename(name) da.attrs['long_name'] = 'PCM labels' da.attrs['units'] = '' da.attrs['valid_min'] = 0 da.attrs['valid_max'] = self._props['K']-1 da.attrs['llh'] = self._props['llh'] # Add labels to the dataset: if inplace: return ds.pyxpcm.add(da) else: return da def predict_proba(self, ds, features=None, dim=None, inplace=False, name='PCM_POST', classdimname='pcm_class'): """Predict posterior probability of each components given the data This method adds these properties to the PCM instance: - ``llh``: The log likelihood of the model with regard to new data Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. 
dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` inplace: boolean, False by default If False, return a :class:`xarray.DataArray` with predicted probabilities If True, return the input :class:`xarray.Dataset` with probabilities added as a new :class:`xarray.DataArray` name: str, default is 'PCM_POST' Name of the DataArray with prediction probability (posteriors) classdimname: str, default is 'pcm_class' Name of the dimension holding classes Returns ------- :class:`xarray.DataArray` Probability of each Gaussian (state) in the model given each sample (if option 'inplace' = False) *or* :class:`xarray.Dataset` Input dataset with Component Probability as a 'PCM_POST' new :class:`xarray.DataArray` (if option 'inplace' = True) """ with self._context('predict_proba', self._context_args): # Check if the PCM is trained: validation.check_is_fitted(self, 'fitted') # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, dim=dim, action='predict_proba') # CLASSIFICATION PREDICTION: with self._context('predict_proba.predict', self._context_args): post_values = self._classifier.predict_proba(X) with self._context('predict_proba.score', self._context_args): self._props['llh'] = self._classifier.score(X) # Create a xarray with posteriors: with self._context('predict_proba.xarray', self._context_args): P = list() for k in range(self.K): X = post_values[:, k] x = self.unravel(ds, sampling_dims, X) P.append(x) da = xr.concat(P, dim=classdimname).rename(name) da.attrs['long_name'] = 'PCM posteriors' da.attrs['units'] = '' da.attrs['valid_min'] = 0 da.attrs['valid_max'] = 1 da.attrs['llh'] = self._props['llh'] # Add posteriors to the dataset: if inplace: return ds.pyxpcm.add(da) else: return da def score(self, ds, features=None, dim=None): """Compute the per-sample average log-likelihood of the given data Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` Returns ------- log_likelihood: float In the case of a GMM classifier, this is the Log likelihood of the Gaussian mixture given data """ with self._context('score', self._context_args): # Check if the PCM is trained: validation.check_is_fitted(self, 'fitted') # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, action='score') # COMPUTE THE PREDICTION SCORE: with self._context('score.score', self._context_args): llh = self._classifier.score(X) return llh def bic(self, ds, features=None, dim=None): """Compute Bayesian information criterion for the current model on the input dataset Only for a GMM classifier Parameters ---------- ds: :class:`xarray.Dataset` The dataset to work with features: dict() Definitions of PCM features in the input :class:`xarray.Dataset`. If not specified or set to None, features are identified using :class:`xarray.DataArray` attributes 'feature_name'. 
dim: str Name of the vertical dimension in the input :class:`xarray.Dataset` Returns ------- bic: float The lower the better """ with self._context('bic', self._context_args): # Check classifier: if self._props['with_classifier'] != 'gmm': raise Exception( ("BIC is only available for the 'gmm' classifier ('%s')")%\ (self._props['with_classifier']) ) def _n_parameters(_classifier): """Return the number of free parameters in the model. See sklearn code""" _, n_features = _classifier.means_.shape if _classifier.covariance_type == 'full': cov_params = _classifier.n_components * n_features * (n_features + 1) / 2. elif _classifier.covariance_type == 'diag': cov_params = _classifier.n_components * n_features elif _classifier.covariance_type == 'tied': cov_params = n_features * (n_features + 1) / 2. elif _classifier.covariance_type == 'spherical': cov_params = _classifier.n_components mean_params = n_features * _classifier.n_components return int(cov_params + mean_params + _classifier.n_components - 1) # Check if the PCM is trained: validation.check_is_fitted(self, 'fitted') # PRE-PROCESSING: X, sampling_dims = self.preprocessing(ds, features=features, action='bic') # COMPUTE THE log-likelihood: with self._context('bic.score', self._context_args): llh = self._classifier.score(X) # COMPUTE BIC: N_samples = X.shape[0] bic = (-2 * llh * N_samples + _n_parameters(self._classifier) * np.log(N_samples)) return bic
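# A standalone check, on synthetic data, that the manual BIC formula used in bic() above
# (-2 * llh * N + n_parameters * log(N), with sklearn's per-sample average log-likelihood)
# reproduces GaussianMixture.bic().
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=4, n_features=3, random_state=0)
gm = GaussianMixture(n_components=4, covariance_type='full', random_state=0).fit(X)

def n_parameters(clf):
    """Free parameters of a fitted GaussianMixture (same bookkeeping as _n_parameters above)."""
    _, n_features = clf.means_.shape
    if clf.covariance_type == 'full':
        cov_params = clf.n_components * n_features * (n_features + 1) / 2.
    elif clf.covariance_type == 'diag':
        cov_params = clf.n_components * n_features
    elif clf.covariance_type == 'tied':
        cov_params = n_features * (n_features + 1) / 2.
    elif clf.covariance_type == 'spherical':
        cov_params = clf.n_components
    mean_params = n_features * clf.n_components
    return int(cov_params + mean_params + clf.n_components - 1)

llh = gm.score(X)                      # per-sample average log-likelihood
n_samples = X.shape[0]
manual_bic = -2 * llh * n_samples + n_parameters(gm) * np.log(n_samples)
print(manual_bic, gm.bic(X))           # the two values coincide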
def main():
    path = 'X_new.txt'
    X = np.loadtxt(path)
    N = X.shape[0]

    gmm = GaussianMixture(n_components=3, covariance_type='spherical', tol=0.001,
                          reg_covar=1e-06, max_iter=200, n_init=10, init_params='random')
    gmm.fit(X)
    print(gmm.get_params())
    print(gmm.means_)
    print(gmm.covariances_)

    labels = gmm.predict(X)
    mu = gmm.means_

    sorted_gs = np.argsort(gmm.means_[:, 0])
    sorted_colors = ['b', 'g', 'r']
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])
    plt.scatter(X[:, 0], X[:, 1], c=c)
    plt.scatter(mu[:, 0], mu[:, 1], c='k')
    plt.show()

    sorted_gs = np.argsort(gmm.means_[:, 2])
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])
    plt.scatter(X[:, 2], X[:, 3], c=c)
    plt.scatter(mu[:, 2], mu[:, 3], c='k')
    plt.show()

    sorted_gs = np.argsort(gmm.means_[:, 3])
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])
    plt.scatter(X[:, 3], X[:, 4], c=c)
    plt.scatter(mu[:, 3], mu[:, 4], c='k')
    # plt.clf()
    plt.show()
    # pdb.set_trace()

    # mus = np.random.rand(3)
    stds = np.sqrt(gmm.covariances_.copy())
    pis = gmm.weights_.copy()
    exps = np.zeros([N, 3])
    iter_max = 150

    ## EM algorithm without library functions ##
    ## initialize means ##
    init_inds = np.random.randint(N, size=3)
    mus = X[init_inds]
    print("init mus:", mus)
    covs = [np.identity(5) * (stds[i]**2) for i in range(3)]
    pis = np.random.rand(3)
    pis = pis / sum(pis)
    new_mus = np.zeros_like(mus)
    tol = 0.001

    for i in range(iter_max):
        n = [multivariate_normal(mus[j], covs[j]) for j in range(3)]
        for j in range(N):
            for k in range(3):
                exps[j, k] = pis[k] * (n[k].pdf(X[j]) + 1e-15)
            # print (exps[j,:])
            exps[j, :] = exps[j, :] / np.sum(exps[j, :])

        ## MAXIMIZE!!! ##
        for k in range(3):
            pis[k] = np.sum(exps[:, k]) / N
            new_mus[k] = np.sum(exps[:, k].reshape(-1, 1) * X, axis=0) / np.sum(exps[:, k])
            # exps = Nxk , X = nxd , mus = d

        if (np.linalg.norm(new_mus - mus) < tol):
            print("Breaking at iter ", i)
            break
        else:
            mus = new_mus.copy()

    print("post optimization:")
    print(mus)
    print(covs)
    print(pis)

    gmm.means_ = np.array(mus)
    labels = gmm.predict(X)
    # mu = gmm.means_

    sorted_gs = np.argsort(gmm.means_[:, 0])
    sorted_colors = ['b', 'g', 'r']
    color_dict = {}
    for i, g in enumerate(sorted_gs):
        color_dict[g] = sorted_colors[i]
    c = np.array([color_dict[i] for i in labels])
    plt.scatter(X[:, 0], X[:, 1], c=c)
    plt.scatter(mus[:, 0], mus[:, 1], c='k')
    plt.show()
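# A compact sketch (synthetic blobs) showing that the hand-rolled E-step above — responsibilities
# proportional to pi_k * N(x | mu_k, Sigma_k), renormalised per sample — matches
# GaussianMixture.predict_proba once the model is fitted.
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=300, centers=3, random_state=1)
gm = GaussianMixture(n_components=3, covariance_type='full', random_state=0).fit(X)

pdfs = np.column_stack([
    multivariate_normal(gm.means_[k], gm.covariances_[k]).pdf(X)
    for k in range(3)
])
resp = gm.weights_ * pdfs
resp /= resp.sum(axis=1, keepdims=True)
print(np.allclose(resp, gm.predict_proba(X)))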
class PCM(BaseEstimator, ClassifierMixin): """Profile Classification Model Parameters ---------- Methods ------- Examples -------- """ def __init__(self, n_components=1, axis=9999, scaling=1, reduction=1, classifier='gmm', COVARTYPE='full', maxvar=99.9, verb=False): """Create the PCM instance """ if scaling == 0: with_scaler = 'none' with_mean = False with_std = False elif scaling == 1: with_scaler = 'normal' with_mean = True with_std = True elif scaling == 2: with_scaler = 'center' with_mean = True with_std = False else: raise NameError('scaling must be 0, 1 or 2') if reduction == 0: with_reducer = False elif reduction == 1: with_reducer = True else: raise NameError('reduction must be 0 or 1') if classifier == 'gmm': with_classifier = 'gmm' else: raise NameError( "classifier must be 'gmm' (no other methods at this time)") self._props = { 'K': np.int(n_components), 'llh': None, 'COVARTYPE': COVARTYPE, 'with_scaler': with_scaler, 'with_reducer': with_reducer, 'with_classifier': with_classifier, 'maxvar': maxvar, 'DPTmodel': np.float32(axis) } self._trained = False #todo _trained is a property, should be set/get with a decorator self._verb = verb #todo _verb is a property, should be set/get with a decorator self._version = '0.4' # def __call__(self, **kwargs): # self.__init__(**kwargs) def get_params(self, deep=True): # suppose this estimator has parameters "alpha" and "recursive" return self._props def set_params(self, **parameters): for parameter, value in parameters.items(): self._props[parameter] = value return self def set_config(self, **kargs): """Set-up all processing steps according to PCM properties""" self.set_params(kargs) self._interpoler = self.__Interp(self._props['DPTmodel']) self._scaler = preprocessing.StandardScaler(with_mean=with_mean, with_std=with_std) self._reducer = PCA(n_components=self._props['maxvar'] / 100, svd_solver='full') self._classifier = GaussianMixture( n_components=self._props['K'], covariance_type=self._props['COVARTYPE'], init_params='kmeans', max_iter=1000, tol=1e-6) return self class __Interp: """ Internal machinery for the interpolation of vertical profiles This class is called once at PCM instance initialisation and whenever data to be classified are not on the PCM vertical axis. """ def __init__(self, DPTmodel): self.zi = DPTmodel self.doINTERPz = False def isnecessary(self, C, z): """Check wether or not the input data vertical axis is different from the PCM one, if not, avoid interpolation """ #todo We should be smarter and recognize occurences of z in DPTmodel # or viceversa in order to limit interpolation as much as possible ! z = np.float32(z) #self.doINTERPz = not np.array_equal(self.zi,z) self.doINTERPz = not np.array_equiv(self.zi, z) return self.doINTERPz def mix(self, x): """ Homogeneize the upper water column: Set 1st nan value to the first non-NaN value """ #izmixed = np.argwhere(np.isnan(x)) izok = np.where(~np.isnan(x))[0][0] #x[izmixed] = x[izok] x[0] = x[izok] return x def fit_transform(self, C, z): """ Interpolate data on the PCM vertical axis """ if (self.isnecessary(C, z)): [Np, Nz] = C.shape # Possibly Create a mixed layer for the interpolation to work # smoothly at the surface if ((z[0] < 0.) 
& (self.zi[0] == 0.)): z = np.concatenate((np.zeros(1), z)) x = np.empty((Np, 1)) x.fill(np.nan) C = np.concatenate((x, C), axis=1) np.apply_along_axis(self.mix, 1, C) # Linear interpolation of profiles onto the model grid: #f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic') f = interpolate.interp2d(z, np.arange(Np), C, kind='linear') C = f(self.zi, np.arange(Np)) return C def display(self, deep=False): """Display detailled parameters of the PCM This is not a get_params because it doesn't return a dictionnary Set Boolean option 'deep' to True for all properties display """ summary = [("<pcm '%s' (K: %i, Z: %i)>") % (self._props['with_classifier'], self._props['K'], self._props['DPTmodel'].size)] # PCM core properties: prop_info = ('Number of class: %i') % self._props['K'] summary.append(prop_info) # prop_info = ('Vertical axis: %s') % self._props['DPTmodel'] prop_info = ('Vertical axis: [%s, ..., %s]') % (repr( self._props['DPTmodel'][0]), repr(self._props['DPTmodel'][-1])) summary.append(prop_info) prop_info = ('Trained: %r') % self._trained summary.append(prop_info) # PCM workflow parameters: prop_info = ('Vertical Interpolation: %r') % self._interpoler.doINTERPz summary.append(prop_info) summary.append("\t Interpoler: %s" % (type(self._interpoler))) prop_info = ('Sample Scaling: %r') % self._props['with_scaler'] summary.append(prop_info) summary.append("\t Scaler: %s" % (type(self._scaler))) if (deep): summary.append("\t Scaler properties:") d = self._scaler.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) prop_info = ( 'Dimensionality Reduction: %r') % self._props['with_reducer'] summary.append(prop_info) summary.append("\t Reducer: %s" % (type(self._reducer))) #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar'] #summary.append(prop_info) if (deep): summary.append("\t Reducer properties:") d = self._reducer.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) prop_info = ('Classification: %r') % self._props['with_classifier'] summary.append(prop_info) summary.append("\t Classifier: %s" % (type(self._classifier))) #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE'] #summary.append(prop_info) if (self._trained): prop_info = ('\t log likelihood: %f') % self._props['llh'] summary.append(prop_info) if (deep): summary.append("\t Classifier properties:") d = self._classifier.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) # Done return '\n'.join(summary) # def __repr__(self): # return self.display(deep=self._verb) def copy(self): """Return a deep copy of the PCM instance""" return copy.deepcopy(self) def preprocessing(self, X, Z): """"Pre-process data for classification Preprocessing steps: interpolation, scaling, reduction. Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. 
Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- X : array-like, shape (N_p=n_samples, n_reduced_scaled_interpolated_features) List of profiles pre-processed for classification """ # INTERPOLATION: X = self._interpoler.fit_transform(X, Z) # SCALING: self._scaler.fit(X) X = self._scaler.transform(X) # REDUCTION: if self._props['with_reducer']: self._reducer.fit(X) X = self._reducer.transform(X) # Output: return X def fit(self, X, y=None, axis=None, **kargs): """Estimate PCM parameters For a PCM, the fit method consists in the following operations: - interpolation to the Depth levels of the model - scaling - reduction - estimate classifier parameters Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. axis: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- self """ self.set_config(kargs) # PRE-PROCESSING: X = self.preprocessing(X, axis) # CLASSIFICATION-MODEL TRAINING: self._classifier.fit(X) self._props['llh'] = self._classifier.score(X) # Done: self._trained = True return self def score(self, X, y=None, axis=None): """Compute the per-sample average log-likelihood of the given data X """ if (not self._trained): raise ValueError("Can't predict before fitting !") # PRE-PROCESSING: X = self.preprocessing(X, axis) return self._classifier.score(X) def predict(self, X, y=None, axis=None): """Predict the labels for the profile samples in X using trained PCM This method add these properties to the PCM data property: llh: The log likelihood of the model with regard to new data Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. Returns ------- labels : xarray.DataArray, shape (N_p = n_samples) Component labels. """ # if not self._trained: # raise ValueError("Can't predict before fitting !") validation.check_is_fitted( self, '_trained', msg= "This %(name)s instance is not fitted yet. Call ‘fit’ with appropriate arguments before using this method." ) # PRE-PROCESSING: X = self.preprocessing(X, axis) # CLASSIFICATION PREDICTION: labels = self._classifier.predict(X) self._props['llh'] = self._classifier.score(X) # Prepare xarray for output: labels = xr.DataArray(labels, dims='samples', name='LABELS') labels.attrs['llh'] = self._props['llh'] # done: return labels
    predict.append(1)

correct = 0
for i in range(len(predict)):
    if (i >= 0 and i <= len(left) - 1):
        if (predict[i] == 0):
            correct += 1
    else:
        if (predict[i] == 1):
            correct += 1

print("Similarity Score: ", metrics.adjusted_rand_score(predict, true))
#print('Correct Prediction Rate: ', correct/2000)
b = np.array(predict)
np.savetxt('predict.txt', predict, delimiter=',', fmt='%1d')
a = np.array(mixer)
np.savetxt('iter.txt', a, delimiter=',', fmt='%1.8f')

loglik.pop(0)
print('Number of iterations ', j + 1)
#plt.subplot(2,2,2)
plt.plot(loglik)
plt.xlabel('iteration')
plt.ylabel('log likelihood')
plt.show()

sample = data.reshape(len(data), 1)
model = GaussianMixture(2, init_params='random')
model.fit(sample)
l = model.predict(sample)
dd = model.get_params()
print(model.lower_bound_)
data = data.reshape(-1, 1)
s = model.score(data)
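# A short illustration (1-D synthetic mixture) of the quantities printed above: after EM has
# converged, GaussianMixture.lower_bound_ is the average per-sample log-likelihood of the
# training data, so it agrees with model.score() on that same data up to the convergence tolerance.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
data_demo = np.concatenate([rng.normal(-2, 1, 500), rng.normal(3, 1, 500)])
sample_demo = data_demo.reshape(-1, 1)

model_demo = GaussianMixture(2, init_params='random', random_state=0).fit(sample_demo)
print(model_demo.lower_bound_)
print(model_demo.score(sample_demo))
print(model_demo.get_params())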
class PCM: """ Common base class for a Profile Classification Model """ def __init__(self, K, DPTmodel, scaling=1, reduction=1, classif='gmm', COVARTYPE='full', maxvar=99.9, verb=False): """Create the PCM instance """ if scaling==0: with_scaler = 'none'; with_mean=False; with_std = False elif scaling==1: with_scaler = 'normal'; with_mean=True; with_std = True elif scaling==2: with_scaler = 'center'; with_mean=True; with_std = False else: raise NameError('scaling must be 0, 1 or 2') if reduction==0: with_reducer = False elif reduction==1: with_reducer = True else: raise NameError('reduction must be 0 or 1') if classif=='gmm': with_classifier = 'gmm'; else: raise NameError("classifier must be 'gmm' (no other methods at this time)") self._props = {'K': np.int(K), 'llh': None, 'COVARTYPE': COVARTYPE, 'with_scaler': with_scaler, 'with_reducer': with_reducer, 'with_classifier': with_classifier, 'maxvar': maxvar, 'DPTmodel': np.float32(DPTmodel)} self._trained = False #todo _trained is a property, should be set/get with a decorator self._verb = verb #todo _verb is a property, should be set/get with a decorator self._interpoler = self.__Interp(self._props['DPTmodel']) self._scaler = preprocessing.StandardScaler(with_mean=with_mean, with_std=with_std) self._reducer = PCA(n_components=self._props['maxvar']/100, svd_solver='full') self._classifier = GaussianMixture(n_components=self._props['K'], covariance_type=self._props['COVARTYPE'], init_params='kmeans', max_iter=1000, tol=1e-6) self._version = '0.3' def __call__(self, **kwargs): self.__init__(**kwargs) def __iter__(self): self.__i = 0 return self def next(self): if self.__i < self.K: i = self.__i self.__i += 1 return i else: raise StopIteration() class __Interp: """ Internal machinery for the interpolation of vertical profiles This class is called once at PCM instance initialisation and whenever data to be classified are not on the PCM vertical axis. """ def __init__(self,DPTmodel): self.zi = DPTmodel self.doINTERPz = False def isnecessary(self,C,z): """Check wether or not the input data vertical axis is different from the PCM one, if not, avoid interpolation """ #todo We should be smarter and recognize occurences of z in DPTmodel # or viceversa in order to limit interpolation as much as possible ! z = np.float32(z) #self.doINTERPz = not np.array_equal(self.zi,z) self.doINTERPz = not np.array_equiv(self.zi,z) return self.doINTERPz def mix(self,x): """ Homogeneize the upper water column: Set 1st nan value to the first non-NaN value """ #izmixed = np.argwhere(np.isnan(x)) izok = np.where(~np.isnan(x))[0][0] #x[izmixed] = x[izok] x[0] = x[izok] return x; def fit_transform(self,C,z): """ Interpolate data on the PCM vertical axis """ if (self.isnecessary(C,z)): [Np, Nz] = C.shape # Possibly Create a mixed layer for the interpolation to work # smoothly at the surface if ((z[0]<0.) 
& (self.zi[0] == 0.)): z = np.concatenate((np.zeros(1),z)) x = np.empty((Np,1)) x.fill(np.nan) C = np.concatenate((x,C),axis=1) np.apply_along_axis(self.mix,1,C) # Linear interpolation of profiles onto the model grid: #f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic') f = interpolate.interp2d(z, np.arange(Np), C, kind='linear') C = f(self.zi, np.arange(Np)) return C @property def K(self): """Return the number of class K in the PCM""" return self._props['K'] def display(self, deep=False): """Display detailled parameters of the PCM This is not a get_params because it doesn't return a dictionnary Set Boolean option 'deep' to True for all properties display """ summary = [("<pcm '%s' (K: %i, Z: %i)>")%(self._props['with_classifier'],self._props['K'],self._props['DPTmodel'].size)] # PCM core properties: prop_info = ('Number of class: %i') % self._props['K'] summary.append(prop_info) # prop_info = ('Vertical axis: %s') % self._props['DPTmodel'] prop_info = ('Vertical axis: [%s, ..., %s]') % (repr(self._props['DPTmodel'][0]),repr(self._props['DPTmodel'][-1])) summary.append(prop_info) prop_info = ('Trained: %r') % self._trained summary.append(prop_info) # PCM workflow parameters: prop_info = ('Vertical Interpolation: %r') % self._interpoler.doINTERPz summary.append(prop_info) summary.append("\t Interpoler: %s"%(type(self._interpoler))) prop_info = ('Sample Scaling: %r') % self._props['with_scaler'] summary.append(prop_info) summary.append("\t Scaler: %s"%(type(self._scaler))) if (deep): summary.append("\t Scaler properties:") d = self._scaler.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) prop_info = ('Dimensionality Reduction: %r') % self._props['with_reducer'] summary.append(prop_info) summary.append("\t Reducer: %s"%(type(self._reducer))) #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar'] #summary.append(prop_info) if (deep): summary.append("\t Reducer properties:") d = self._reducer.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) prop_info = ('Classification: %r') % self._props['with_classifier'] summary.append(prop_info) summary.append("\t Classifier: %s"%(type(self._classifier))) #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE'] #summary.append(prop_info) if (self._trained): prop_info = ('\t log likelihood: %f') % self._props['llh'] summary.append(prop_info) if (deep): summary.append("\t Classifier properties:") d = self._classifier.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r")%(p,d[p])) # Done return '\n'.join(summary) def __repr__(self): return self.display(deep=self._verb) def copy(self): """Return a deep copy of the PCM instance""" return copy.deepcopy(self) def preprocessing(self, X, Z): """"Pre-process data for classification Preprocessing steps: interpolation, scaling, reduction. Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. 
Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- X : array-like, shape (N_p=n_samples, n_reduced_scaled_interpolated_features) List of profiles pre-processed for classification """ # INTERPOLATION: X = self._interpoler.fit_transform(X, Z) # SCALING: self._scaler.fit(X) X = self._scaler.transform(X) # REDUCTION: if self._props['with_reducer']: self._reducer.fit(X) X = self._reducer.transform(X) # Output: return X def fit(self, X, Z): """Estimate PCM parameters For a PCM, the fit method consists in the following operations: - interpolation to the Depth levels of the model - scaling - reduction - estimate classifier parameters Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- self """ # PRE-PROCESSING: X = self.preprocessing(X, Z) # CLASSIFICATION-MODEL TRAINING: self._classifier.fit(X) self._props['llh'] = self._classifier.score(X) # Done: self._trained = True return self def predict(self, X, Z): """Predict the labels for the profile samples in X using trained PCM This method add these properties to the PCM data property: llh: The log likelihood of the model with regard to new data Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- labels : xarray.DataArray, shape (N_p = n_samples) Component labels. """ # if not self._trained: # raise ValueError("Can't predict before fitting !") validation.check_is_fitted(self, '_trained',msg="This %(name)s instance is not fitted yet. Call ‘fit’ with appropriate arguments before using this method.") # PRE-PROCESSING: X = self.preprocessing(X, Z) # CLASSIFICATION PREDICTION: labels = self._classifier.predict(X) self._props['llh'] = self._classifier.score(X) # Prepare xarray for output: labels = xr.DataArray(labels, dims='samples', name='LABELS') labels.attrs['llh'] = self._props['llh'] # done: return labels def fit_predict(self, X, Z): """Estimate PCM parameters and predict classes Train a PCM and predict classes in a single step This method add these properties to the PCM data property: llh: The log likelihood of the model with regard to new data Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- labels : xarray.DataArray, shape (N_p = n_samples) Component labels. """ # PRE-PROCESSING: X = self.preprocessing(X, Z) # CLASSIFICATION-MODEL TRAINING: self._classifier.fit(X) # CLASSIFICATION PREDICTION: labels = self._classifier.predict(X) self._props['llh'] = self._classifier.score(X) # Prepare xarray for output: labels = xr.DataArray(labels, dims='samples', name='LABELS') labels.attrs['llh'] = self._props['llh'] # Done: self._trained = True return labels def predict_proba(self, X, Z): """Predict posterior probability of each component given the data This method adds these properties to the PCM instance: llh: The log likelihood of the model with regard to new data Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. 
Z: array-like, shape (N_z=n_features,) Vertical axis of profiles Returns ------- post : array, shape (n_samples, n_components) Returns the probability of each Gaussian (state) in the model given each sample. """ if (not self._trained): raise ValueError("Can't predict before fitting !") # PRE-PROCESSING: X = self.preprocessing(X, Z) # CLASSIFICATION PREDICTION: post = self._classifier.predict_proba(X) self._props['llh'] = self._classifier.score(X) # Prepare xarray for output: post = xr.DataArray(post, dims={'samples','components'}, name='POST') post.attrs['llh'] = self._props['llh'] # done: return post def quant(self, X, Z=None, labels=None, q=[0.05, 0.5, 0.95], verb=False): """Compute the qth quantiles of the data for each PCM component. Usage A: pcm.quant(X, labels=L, q=[0.05,0.5,0.95]) This usage will use labels L to compute component percentiles of X Usage B: pcm.quant(X, Z=DEPTH, q=[0.05,0.5,0.95]) This usage will classify data X at depth Z and then compute percentiles. Be careful, if you re-fit a model, you may not end up with something coherent from previous calculation of labels and posteriors, as components will show up in different orders Parameters ---------- X : array-like, shape (N_p=n_samples, N_z=n_features) List of N_z-dimensional data profile. Each row corresponds to a single profile. Z: array-like, shape (N_z=n_features,) Vertical axis of profiles labels: array, shape (N_p=n_samples,) Component labels. q: float in the range of [0,1] (or sequence of floats), shape (n_quantiles,1) Quantile(s) to compute, which must be between 0 and 1 inclusive. Returns ------- Q : xarray.DataArray, shape (K, n_quantiles, N_z=n_features) """ if labels is None: labels = self.fit_predict(X,Z) elif Z is None: if not self._trained: raise ValueError("Can't compute quantiles without a fitted model !") # if (not isinstance(X,xr.core.dataarray.DataArray)): XR = xr.DataArray(X, dims=['samples', 'features']) else: XR = xr.DataArray(X.values, dims=['samples', 'features']) if (not isinstance(labels,xr.core.dataarray.DataArray)): LR = xr.DataArray(labels, dims=['samples']) else: LR = xr.DataArray(labels.values, dims=['samples']) DS = xr.Dataset({'DATA': XR, 'LABELS': LR}) varname = 'DATA' Q = [] # list of results for label, group in DS.groupby('LABELS'): if verb: print ("Using %0d profiles of %s in class %i") % (group['samples'].shape[0], varname, label) quant = group[varname].quantile(q, dim='samples') Q.append(quant) Q = xr.concat(Q, dim='components') # Transform the list into a DataArray Q.name = 'QUANTILES' # Done: return Q
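# A minimal usage sketch of the PCM class above, on synthetic data. The depth axes,
# the number of profiles and K=3 are made-up inputs for illustration only; the sketch
# assumes the class and its module-level imports (numpy, scipy, sklearn, xarray) are
# available, and is not the original author's workflow.
import numpy as np

if __name__ == '__main__':
    DPT = np.linspace(0., -1000., 20)        # PCM vertical axis (20 model levels)
    Z = np.linspace(0., -1000., 50)          # data vertical axis (differs -> interpolation)
    X = np.random.randn(300, Z.size)         # 300 synthetic profiles

    m = PCM(K=3, DPTmodel=DPT, scaling=1, reduction=1, maxvar=99.9)
    labels = m.fit_predict(X, Z)             # xarray.DataArray of component labels
    post = m.predict_proba(X, Z)             # posterior probability per component
    Q = m.quant(X, labels=labels, q=[0.05, 0.5, 0.95])
    print(m)                                 # __repr__ -> display() summary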
fig, axes = plt.subplots()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=labels, s=1, cmap='viridis')
plt.scatter(ra, dec, s=36, color='red', marker='^')
plt.xlabel('RA')
plt.ylabel('DEC')

gmm_means = []
gmm_cov = []
for i in range(0, len(gmm.means_)):
    print(i, gmm.covariances_[i][0][0])
    gmm_means.append([gmm.means_[i][0], gmm.means_[i][1]])
    gmm_cov.append([[gmm.covariances_[i][0][0], gmm.covariances_[i][0][1]],
                    [gmm.covariances_[i][1][0], gmm.covariances_[i][1][1]]])

plot_results(gmm_means, gmm_cov, fig, axes)
plt.show()
print(gmm.get_params())

# labels_1 = gmm.fit_predict(Xnew)
# plt.scatter(Xnew[:, 0], Xnew[:, 1], c=labels_1, s=7, cmap='viridis')
# labels_2 = gmm.get_params(deep=True)
# print(labels_2)
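# The snippet above calls plot_results() without defining it. A minimal sketch of such a
# helper is given here, assuming it should draw one covariance ellipse per GMM component;
# the function name matches the call above, but the 2-sigma scaling and styling are
# illustrative choices, not the original author's implementation.
import numpy as np
from matplotlib.patches import Ellipse

def plot_results(means, covariances, fig, ax, n_std=2.0):
    """Draw GMM component means and n_std covariance ellipses on ax."""
    for mean, cov in zip(means, covariances):
        # Eigen-decomposition of the 2x2 covariance gives the ellipse axes and orientation
        vals, vecs = np.linalg.eigh(np.asarray(cov))
        order = vals.argsort()[::-1]
        vals, vecs = vals[order], vecs[:, order]
        angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))
        width, height = 2 * n_std * np.sqrt(vals)
        ax.add_patch(Ellipse(xy=mean, width=width, height=height,
                             angle=angle, edgecolor='k', facecolor='none', lw=1))
        ax.plot(mean[0], mean[1], 'k+', markersize=8)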
        p_k=param[2])


if __name__ == '__main__':
    import pandas as pd
    from util.common import metric

    df = pd.read_csv(
        '/home/tadeze/projects/missingvalue/datasets/anomaly/yeast/fullsamples/yeast_1.csv'
    )
    train_data = df.iloc[:, 1:].to_numpy().astype(np.float64)
    # train_lbl = df.iloc[:, 0]
    train_lbl = (df.iloc[:, 0] == "anomaly").astype(int)  # needed below by metric()

    gmms = GaussianMixture(n_components=3)
    gmms.fit(train_data)
    score = -gmms.score_samples(train_data)
    print(gmms.get_params(deep=False))
    print(len(score))
    print(metric(train_lbl, score))

    from pypr.clustering import gmm
    cen_lst, cov_lst, p_k, logL = gmm.em_gm(train_data, max_iter=100, K=3)
    score = [
        -gmm.gm_log_likelihood(
            train_data[i, :], center_list=cen_lst, cov_list=cov_lst, p_k=p_k)
        for i in range(0, train_data.shape[0])
    ]
    # print(score)
    print(metric(train_lbl, score))
    # Marginalize the
    # m_cen_lst, m_cov_lst, m_p_k
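# The negative per-sample log-likelihood used above can be turned into a simple anomaly
# flag by thresholding at a chosen quantile. This is a minimal sketch on synthetic data;
# the 95th-percentile cut-off is an arbitrary illustrative choice, not part of the
# original pipeline.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (950, 4)), rng.uniform(-8, 8, (50, 4))])

gm = GaussianMixture(n_components=3, random_state=0).fit(X)
score = -gm.score_samples(X)               # higher = less likely under the mixture
flag = score > np.percentile(score, 95)    # top 5% of scores flagged as anomalies
print(flag.sum(), "points flagged")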
class PCM: """ Common base class for a Profile Classification Model """ def __init__(self, K, DPTmodel, scaling=1, reduction=1, classif='gmm', COVARTYPE='full', maxvar=99.9, verb=False): #todo: check inputs validity if (scaling == 0): with_scaler = False with_mean = False with_std = False elif (scaling == 1): with_scaler = True with_mean = True with_std = True elif (scaling == 2): with_scaler = True with_mean = True with_std = False else: raise NameError('scaling must be 0, 1 or 2') if (reduction == 0): with_reducer = False elif (reduction == 1): with_reducer = True else: raise NameError('reduction must be 0, 1') if (classif == 'gmm'): with_classifier = 'gmm' else: raise NameError( "classifier must be 'gmm' (no other methods at this time)") self._props = { 'K': np.int(K), 'COVARTYPE': COVARTYPE, 'with_scaler': with_scaler, 'with_reducer': with_reducer, 'with_classifier': with_classifier, 'maxvar': maxvar, 'DPTmodel': np.float32(DPTmodel) } self._trained = False self._verb = verb self.K = self._props['K'] self._scaler = preprocessing.StandardScaler(with_mean=with_mean, with_std=with_std) self._reducer = PCA(n_components=self._props['maxvar'] / 100, svd_solver='full') self._classifier = GaussianMixture( n_components=self._props['K'], covariance_type=self._props['COVARTYPE'], init_params='kmeans', max_iter=1000, tol=1e-6) self.interpoler = PCM.Interp(self._props['DPTmodel']) self._version = '0.1' def __iter__(self): self.__i = 0 return self def next(self): if self.__i < self.K: i = self.__i self.__i += 1 return i else: raise StopIteration() class Interp: """ Internal machinery for the interpolation of vertical profiles This class is called once at PCM instance initialisation and whenever data to be classified are not on the PCM vertical axis. """ def __init__(self, DPTmodel): self.zi = DPTmodel self.doINTERPz = False def isnecessary(self, C, z): """ Check whether or not the data vertical axis is different from the PCM one, if not, avoid interpolation """ z = np.float32(z) # self.doINTERPz = not np.array_equal(self.zi,z) self.doINTERPz = not np.array_equiv(self.zi, z) return self.doINTERPz def mix(self, x): """ Homogenize the upper water column: Set 1st nan value to the first non-NaN value """ # izmixed = np.argwhere(np.isnan(x)) izok = np.where(~np.isnan(x))[0][0] # x[izmixed] = x[izok] x[0] = x[izok] return x def fit_transform(self, C, z): """ Interpolate data on the PCM vertical axis """ if self.isnecessary(C, z): [Np, Nz] = C.shape # Possibly Create a mixed layer for the interpolation to work # smoothly at the surface if (z[0] < 0.) 
& (self.zi[0] == 0.): z = np.concatenate((np.zeros(1), z)) x = np.empty((Np, 1)) x.fill(np.nan) C = np.concatenate((x, C), axis=1) np.apply_along_axis(self.mix, 1, C) # Linear interpolation of profiles onto the model grid: # f = interpolate.interp2d(z, np.arange(Np), C, kind='cubic') f = interpolate.interp2d(z, np.arange(Np), C, kind='linear') C = f(self.zi, np.arange(Np)) return C def display(self, deep=True): """ Display detailed parameters of the PCM This is not get_params because it should return a dictionnary """ summary = [("<pcm '%s' (K: %i, Z: %i)>") % (self._props['with_classifier'], self._props['K'], self._props['DPTmodel'].size)] # PCM core properties: prop_info = 'Number of class: %i'.format(self._props['K']) summary.append(prop_info) prop_info = ('Vertical axis: %s') % repr(self._props['DPTmodel']) summary.append(prop_info) prop_info = ('Trained: %r') % self._trained summary.append(prop_info) # PCM workflow parameters: prop_info = ('Vertical Interpolation: %r') % self.interpoler.doINTERPz summary.append(prop_info) summary.append("\t Interpoler: %s" % (type(self.interpoler))) prop_info = ('Sample Normalisation: %r') % self._props['with_scaler'] summary.append(prop_info) summary.append("\t Normaliser: %s" % (type(self._scaler))) if (deep): summary.append("\t Normaliser properties:") d = self._scaler.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) prop_info = ( 'Dimensionality Reduction: %r') % self._props['with_reducer'] summary.append(prop_info) summary.append("\t Reducer: %s" % (type(self._reducer))) #prop_info = ('\t Maximum Variance: %0.2f%%') % self._props['maxvar'] #summary.append(prop_info) if (deep): summary.append("\t Reducer properties:") d = self._reducer.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) prop_info = ('Classification: %r') % self._props['with_classifier'] summary.append(prop_info) summary.append("\t Classifier: %s" % (type(self._classifier))) #prop_info = ('GMM covariance type: %s') % self._props['COVARTYPE'] #summary.append(prop_info) if (self._trained): prop_info = ('\t log likelihood: %f') % self.llh summary.append(prop_info) if (deep): summary.append("\t Classifier properties:") d = self._classifier.get_params(deep=deep) for p in d: summary.append(("\t\t %s: %r") % (p, d[p])) # Done return '\n'.join(summary) def __repr__(self): return self.display(deep=self._verb) def fit(self, X, Z): """ For a PCM, the fit method consists in the following operations: - interpolation to the Depth levels of the model - scaling - reduction - estimate GMM parameters """ # CHECK INPUTS #todo we should check for errors/inconsistencies in inpts # INTERPOLATION: X = self.interpoler.fit_transform(X, Z) # SCALING: self._scaler.fit(X) X = self._scaler.transform(X) # REDUCTION: if (self._props['with_reducer']): self._reducer.fit(X) X = self._reducer.transform(X) # CLASSIFICATION-MODEL TRAINING: self._classifier.fit(X) self.llh = self._classifier.score(X) # Done: self._trained = True return self def predict(self, X, Z): """ Using the self PCM properties, predict the class of new data """ # CHECK INPUTS #todo we should check for errors/inconsistencies in inpts print self._trained if not self._trained: raise NameError("Can't predict before fitting !") # INTERPOLATION: X = self.interpoler.fit_transform(X, Z) # SCALING: X = self._scaler.transform(X) # REDUCTION: if (self._props['with_reducer']): X = self._reducer.transform(X) # CLASSIFICATION PREDICTION: self.LABELS = self._classifier.predict(X) self.llh = 
self._classifier.score(X) # done: return self def fit_predict(self, X, Z): """ Train a PCM and predict classes in a single step """ # CHECK INPUTS #todo we should check for errors/inconsistencies in inpts # INTERPOLATION: X = self.interpoler.fit_transform(X, Z) # SCALING: self._scaler.fit(X) X = self._scaler.transform(X) # REDUCTION: if (self._props['with_reducer']): self._reducer.fit(X) X = self._reducer.transform(X) # CLASSIFICATION-MODEL TRAINING: self._classifier.fit(X) # CLASSIFICATION PREDICTION: self.LABELS = self._classifier.predict(X) self.llh = self._classifier.score(X) # Done: self._trained = True return self
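# Both PCM variants reduce dimensionality with PCA(n_components=maxvar/100,
# svd_solver='full'): a float in (0, 1) tells scikit-learn to keep just enough
# components to explain that fraction of the variance. A minimal check on synthetic,
# made-up data (illustration only, not part of the PCM code):
import numpy as np
from sklearn.decomposition import PCA

X = np.random.randn(200, 50) @ np.random.randn(50, 50)   # correlated features
reducer = PCA(n_components=99.9 / 100, svd_solver='full').fit(X)
print(reducer.n_components_,                        # number of retained components
      reducer.explained_variance_ratio_.sum())      # >= 0.999 by construction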
        y_train_lbl, classes=range(10))  # TODO: sparse_output=True?

predicted_class = np.empty((labeled_rows + unlabeled_rows))
predicted_class[0:labeled_rows] = y_train_lbl

# https://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html
tol = 0.001
max_iter = 100
# n = all_rows = X_train_all.shape[0]  # for P(x)

gm = GaussianMixture(max_iter=1, n_components=10, weights_init=thetas[2],
                     means_init=thetas[0], precisions_init=thetas[1])
gm.fit(X_train_lbl)  # needed for predict
print(gm.get_params()['means_init'] == thetas[0])

# Responsibilities γ_y(x) for every sample; initialised here so the slice
# assignment inside the loop has somewhere to write (shape: all samples * 10).
gammas = np.zeros((labeled_rows + unlabeled_rows, 10))

# ll_old = 0  # log-likelihood?
for i in range(max_iter):
    print('\nIteration: ', i)
    print()
    # E-step: compute γ_y(x) = P(z=y | x, Σ, μ, w) for unlabeled data
    # gammas = np.zeros((10, unlabeled_rows))
    # for y in range(10):
    #     for i in range(unlabeled_rows):  # for each unlabeled x-row
    #         w_y * P(X | Σ_y, μ_y) / P(X)
    gammas[labeled_rows:] = gm.predict_proba(X_train_unlbl)  # shape: unlbl samples * 10
    # M-step: MLE for μ_y, Σ_y, w_y
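# The fragment above stops at the M-step comment. A minimal sketch of that step is given
# below, assuming `gammas` holds one-hot responsibilities for the labeled rows and
# predict_proba outputs for the unlabeled rows, and that `X_train_all` stacks both blocks;
# these names and the update itself are reconstructions, not the original code.
import numpy as np

def m_step(X_train_all, gammas, reg_covar=1e-6):
    """Weighted MLE updates for mixture weights, means and covariances."""
    n, d = X_train_all.shape
    Nk = gammas.sum(axis=0)                                  # effective count per class
    weights = Nk / n
    means = (gammas.T @ X_train_all) / Nk[:, None]
    covs = np.empty((gammas.shape[1], d, d))
    for k in range(gammas.shape[1]):
        diff = X_train_all - means[k]
        covs[k] = (gammas[:, k, None] * diff).T @ diff / Nk[k]
        covs[k] += reg_covar * np.eye(d)                     # keep covariances well-conditioned
    return weights, means, covs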