def __init__(self, n_clusters=200):
    """Store the cluster count and build a 1-NN Euclidean classifier.

    n_clusters : number of clusters, kept on the instance unchanged
    """
    self.n_clusters = n_clusters
    # Placeholder attribute; this constructor never assigns a real value.
    self.d = None
    self.knn = KNeighborsTimeSeriesClassifier(
        metric='euclidean',
        weights='uniform',
        n_neighbors=1,
        n_jobs=-1,
    )
class kNNClassifier:
    """Two-stage k-nearest-neighbour classifier for time series.

    ``fit`` trains a Euclidean k-NN model on the full training set.
    ``predict`` either uses that model directly (``mac_neighbours is None``)
    or, for every test series, retrieves its ``mac_neighbours`` closest
    Euclidean candidates and classifies the series with a DTW k-NN model
    fitted on those candidates only.
    """

    def __init__(self, n_neighbours=5, mac_neighbours=None, weights="uniform",
                 metric_params=None, n_jobs=-1):
        """Store the hyperparameters; no model is built here.

        n_neighbours   : k used by both the Euclidean and the DTW models
        mac_neighbours : size of the Euclidean candidate set, or None to
                         skip the DTW refinement stage
        weights        : forwarded to KNeighborsTimeSeriesClassifier
        metric_params  : forwarded to the DTW model; None means "no extra
                         parameters" and is stored as a fresh empty dict
        n_jobs         : forwarded to KNeighborsTimeSeriesClassifier
        """
        self.n_neighbours = n_neighbours
        self.mac_neighbours = mac_neighbours
        self.weights = weights
        # A literal ``{}`` default would be one dict shared by every
        # instance; create a new one per instance instead.
        self.metric_params = {} if metric_params is None else metric_params
        self.n_jobs = n_jobs

    def get_params(self, deep=True):
        """Return the constructor hyperparameters as a dict.

        ``deep`` is accepted for API compatibility and is not used.
        """
        return {
            "n_neighbours": self.n_neighbours,
            "mac_neighbours": self.mac_neighbours,
            "weights": self.weights,
            "metric_params": self.metric_params,
            "n_jobs": self.n_jobs
        }

    def set_params(self, **parameters):
        """Set each given keyword as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X_train, y_train):
        """Keep the training data and fit the Euclidean k-NN model.

        Returns ``self``.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.model = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.n_neighbours,
            metric="euclidean",
            weights=self.weights,
            n_jobs=self.n_jobs).fit(self.X_train, self.y_train)
        return self

    def predict(self, X_test):
        """Predict labels for ``X_test``.

        Returns the Euclidean model's prediction when ``mac_neighbours`` is
        None; otherwise a list holding, for each test series, the prediction
        returned by its local DTW model.
        """
        if self.mac_neighbours is None:
            return self.model.predict(X_test)

        k_neighbors = self.model.kneighbors(
            X_test, n_neighbors=self.mac_neighbours, return_distance=False)
        y_hat = []
        for idx, k in enumerate(k_neighbors):
            # Use a local variable for the refinement model: assigning it to
            # self.model would replace the fitted Euclidean model and break
            # every later call to predict().
            local_model = KNeighborsTimeSeriesClassifier(
                n_neighbors=self.n_neighbours,
                metric="dtw",
                weights=self.weights,
                n_jobs=self.n_jobs,
                metric_params=self.metric_params).fit(
                    self.X_train[k], self.y_train[k])
            # NOTE(review): X_test[idx] drops the leading sample axis;
            # confirm the classifier interprets it as a single series.
            y_hat.append(local_model.predict(X_test[idx]))
        return y_hat
def __init__(self, n_neighbors):
    '''
    set up a k-nearest-neighbour classifier that compares series with
    dynamic time warping

    hyperparameters:
        n_neighbors : number of neighbors on which to make classification decision
    '''
    dtw_classifier = KNeighborsTimeSeriesClassifier(metric="dtw",
                                                    n_neighbors=n_neighbors)
    self.knn_clf = dtw_classifier
    self.n_neighbors = n_neighbors
def fit(self, X_train, y_train):
    """Remember the training set and fit the Euclidean k-NN model on it.

    The data is kept on the instance (``X_train`` / ``y_train``) and the
    fitted classifier is stored as ``self.model``. Returns ``self``.
    """
    self.X_train, self.y_train = X_train, y_train
    euclidean_knn = KNeighborsTimeSeriesClassifier(
        n_jobs=self.n_jobs,
        weights=self.weights,
        metric="euclidean",
        n_neighbors=self.n_neighbours,
    )
    self.model = euclidean_knn.fit(self.X_train, self.y_train)
    return self
def test_variable_length_knn():
    """1-NN with DTW and soft-DTW must recover the labels of its own
    variable-length training series."""
    series = [[1, 2, 3, 4],
              [1, 2, 3],
              [2, 5, 6, 7, 8, 9],
              [3, 5, 6, 7, 8]]
    X = to_time_series_dataset(series)
    y = [0, 0, 1, 1]

    for metric_name in ("dtw", "softdtw"):
        clf = KNeighborsTimeSeriesClassifier(metric=metric_name, n_neighbors=1)
        clf.fit(X, y)
        assert_allclose(clf.predict(X), [0, 0, 1, 1])
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
    """Initialise the base primitive, then create the unfitted classifier
    and scaler from the hyperparameters."""
    super().__init__(hyperparams=hyperparams, random_seed=random_seed)

    n_neighbors = self.hyperparams["n_neighbors"]
    distance_metric = self.hyperparams["distance_metric"]
    sample_weighting = self.hyperparams["sample_weighting"]
    self._knn = KNeighborsTimeSeriesClassifier(
        n_neighbors=n_neighbors,
        metric=distance_metric,
        weights=sample_weighting,
    )
    self._scaler = TimeSeriesScalerMinMax()
    # Set to True by fit().
    self._is_fit = False
def test_serialize_knn_classifier():
    """Run the shared not-fitted and params/predict checks on a
    default-constructed k-NN classifier."""
    rng = numpy.random.RandomState(0)
    n_series, length, n_dims = 15, 10, 3

    # Random inputs with integer labels drawn from {0, 1, 2}.
    X = rng.randn(n_series, length, n_dims)
    y = rng.randint(low=0, high=3, size=n_series)

    knc = KNeighborsTimeSeriesClassifier()
    _check_not_fitted(knc)

    knc.fit(X, y)
    _check_params_predict(knc, X, ['predict'])
class Knn():
    '''
    k-nearest-neighbour time series classifier using the dynamic time
    warping metric; inputs are min-max scaled before fitting and predicting
    '''

    def __init__(self, n_neighbors):
        '''
        create the underlying DTW k-NN classifier

        hyperparameters:
            n_neighbors : number of neighbors on which to make classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(metric="dtw",
                                                      n_neighbors=n_neighbors)

    def __ScaleData(self, input_data):
        '''
        rescale input data with a freshly created min-max scaler

        parameters:
            input_data : input data to rescale
        '''
        scaler = TimeSeriesScalerMinMax()
        return scaler.fit_transform(input_data)

    def fit(self, X_train, y_train):
        '''
        scale the training series and fit the KNN classifier on them

        parameters:
            X_train : training time series
            y_train : training labels
        '''
        self.knn_clf.fit(self.__ScaleData(X_train), y_train)

    def predict(self, X_test):
        '''
        scale the test series and classify them

        parameters:
            X_test: test time series on which to predict classes

        returns: classifications for test data set
        '''
        scaled_test = self.__ScaleData(X_test)
        predictions = self.knn_clf.predict(scaled_test)
        return predictions
def test_variable_cross_val():
    """Check that estimators accept variable-length series under
    cross-validation."""
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    class_0 = [[1, 2, 3, 4], [1, 2, 3]]
    class_1 = [[2, 5, 6, 7, 8, 9], [3, 5, 6, 7, 8]]
    X = to_time_series_dataset(class_0 * 2 + class_1 * 2)
    y = [0] * 4 + [1] * 4

    rng = np.random.RandomState(0)
    cv = KFold(n_splits=2, shuffle=True, random_state=rng)

    estimators = [
        TimeSeriesSVC(kernel="gak", random_state=rng),
        TimeSeriesSVR(kernel="gak"),
        KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1),
        KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1),
    ]
    # TODO: cannot test for clustering methods since they don't have a
    # score method yet
    for estimator in estimators:
        cross_val_score(estimator, X=X, y=y, cv=cv)
def predict(self, X_test):
    """Predict labels for ``X_test``.

    When ``self.mac_neighbours`` is None the fitted model in ``self.model``
    answers directly. Otherwise, for every test series, its
    ``mac_neighbours`` nearest training series are retrieved from
    ``self.model`` and a DTW k-NN classifier fitted on just those
    candidates classifies the series.

    Returns the model's prediction in the first case, and a list with one
    per-series prediction in the second.
    """
    if self.mac_neighbours is None:
        return self.model.predict(X_test)

    k_neighbors = self.model.kneighbors(
        X_test, n_neighbors=self.mac_neighbours, return_distance=False)
    y_hat = []
    for idx, k in enumerate(k_neighbors):
        # Use a local variable for the refinement model: assigning it to
        # self.model would replace the fitted candidate-search model, so a
        # later predict() call would search in the wrong model.
        local_model = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.n_neighbours,
            metric="dtw",
            weights=self.weights,
            n_jobs=self.n_jobs,
            metric_params=self.metric_params).fit(
                self.X_train[k], self.y_train[k])
        # NOTE(review): X_test[idx] drops the leading sample axis; confirm
        # the classifier interprets it as a single series.
        y_hat.append(local_model.predict(X_test[idx]))
    return y_hat
def test_sax_scale():
    """Check SAX scaling round-trips and normalises, both standalone and
    inside a kNN classifier using the "sax" metric."""
    rng = np.random.RandomState(0)
    n, sz, d = 10, 10, 3
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3,
                                         alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)

    # Unscaling the scaled data must give back the input.
    round_trip = sax._unscale(sax._scale(X))
    np.testing.assert_array_almost_equal(X, round_trip)

    # Scaled data is expected to have zero mean and unit standard deviation.
    for expected, statistic in ((np.zeros((d, )), "mean"),
                                (np.ones((d, )), "std")):
        flattened = sax._scale(X).reshape((-1, d))
        np.testing.assert_array_almost_equal(expected,
                                             getattr(flattened, statistic)())

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(metric="sax",
                                             n_neighbors=1,
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    inner_sax = knn_sax._sax
    X_scale_unscale = inner_sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)
    knn_sax.predict(X)
def NearestCentroidClassification(X_train, X_test, y_train_n, y_test_n, dataset_name):
    '''
    Nearest-centroid classification: average the training series of every
    class, then assign each test series to the class whose mean is closest
    in Euclidean distance.

    :param X_train: if using DTAN, should already be aligned
    :param X_test: if using DTAN, should already be aligned
    :param y_train_n: numerical labels (not one-hot)
    :param y_test_n: numerical labels (not one-hot)
    :param dataset_name: name used in the printed result line
    :return: test set NCC accuracy
    '''
    # vars and placeholders
    input_shape = X_train.shape[1:]
    class_names = np.unique(y_train_n, axis=0)
    n_classes = len(class_names)
    aligned_means = np.zeros((n_classes, input_shape[0], input_shape[1]))

    # Train set within class Euclidean mean.
    # Rows are indexed by position in class_names rather than by the label
    # value, so labels that are not exactly 0..n_classes-1 (e.g. 1-based or
    # non-contiguous) still map to the correct centroid.
    for row, class_num in enumerate(class_names):
        train_class_idx = y_train_n == class_num  # get indices
        aligned_means[row, :] = np.mean(X_train[train_class_idx], axis=0)
    ncc_labels = np.asarray(class_names)

    # Nearest neighbor classification - using euclidean distance
    knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="euclidean")
    knn_clf.fit(aligned_means, ncc_labels)

    predicted_labels = knn_clf.predict(X_test)
    acc = accuracy_score(y_test_n, predicted_labels)

    print(f"{dataset_name} - NCC results: {acc}")
    # The accuracy is returned as the docstring promises; callers that
    # ignored the former None result are unaffected.
    return acc
def main():
    """Run the neighbour classifier on the downsampled ROB and LAP
    recordings and print predictions next to the actual timestamps."""
    dm = EMGDataManager(PATH_TO_DATA,
                        path_to_timestamps=PATH_TO_ACTION_LABELS,
                        downsampler=True)

    clf = KNeighborsTimeSeriesClassifier(n_neighbors=N_NEIGHBOURS,
                                         metric="dtw",
                                         d=1)
    nbr_clf_rob = NeighbourClassifier(clf)
    nbr_clf_lap = NeighbourClassifier(clf)

    # we can only use downsampled data here
    runs = (
        (nbr_clf_rob, dm.get_ROB_data_downsampled, dm.get_ROB_metadata,
         "ROB Prediction", "ROB Actual"),
        (nbr_clf_lap, dm.get_LAP_data_downsampled, dm.get_LAP_metadata,
         "LAP Prediction", "LAP Actual"),
    )
    for neighbour_clf, load_signals, load_metadata, pred_title, actual_title in runs:
        # LOADING DATA
        mus1, mus2, mus3, mus4, mus5, mus6 = load_signals()
        _, _, timestamps = load_metadata()
        neighbour_clf.load_data(mus1, mus2, mus3, mus4, mus5, mus6,
                                timestamps)

        # PREDICTIONS
        predictions = neighbour_clf.predict()
        actual = neighbour_clf.get_test_timestamps()

        print(pred_title)
        print(predictions)
        print(actual_title)
        print(actual)
def NN1_DTWClassifier(X_train, Y_train):
    """Fit a 1-nearest-neighbour DTW classifier on the training data and
    return the fitted classifier."""
    classifier = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    classifier.fit(X_train, Y_train)
    return classifier
seq = np.genfromtxt(rep + dataset, delimiter=' ', dtype=str, encoding="utf8") ids, counts = np.unique(seq[:, 0], return_counts=True) No = ids.shape[0] D = seq.shape[1] - 3 arr = np.asarray((ids, counts)).T Max_Seq_Len = np.max(arr[:, 1].astype(np.int)) out_X = np.zeros((No, Max_Seq_Len, D)) out_Y = np.zeros((No, )) for idx, id in enumerate(ids): seq_cpy = seq[seq[:, 0] == id] out_X[idx] = seq_cpy[:, 3:] out_Y[idx] = seq_cpy[0, 2] return out_X, out_Y x_train, y_train = convert_mts(rep, ds_train) x_test, y_test = convert_mts(rep, ds_test) clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw") y_test_pred = clf.fit(x_train, y=y_train).predict(x_test) print("the accuracy score of the testing data is : " + accuracy_score(y_test, y_test_pred))
def get_models(device, include_knn=False):
    """
    Build the list of models used in the comparison.

    :param device: a PyTorch device used for training / inference
    :param include_knn: True if k-NN DTW model should be used
    :return: list of models
    """
    # Optimizer settings are identical for both signal-based networks.
    logistic = FeatureBasedClassifier(
        LogisticRegression(n_jobs=-1,
                           class_weight='balanced',
                           C=100.0,
                           max_iter=1000))
    random_forest = FeatureBasedClassifier(
        RandomForestClassifier(
            class_weight='balanced',
            max_depth=20,
            min_samples_leaf=5,
            n_estimators=1000,
            n_jobs=-1,
        ),
        pipeline=get_default_rf_pipeline())
    xgboost = FeatureBasedClassifier(
        XGBClassifier(
            colsample_bytree=0.8,
            gamma=0.1,
            learning_rate=0.1,
            max_depth=7,
            min_child_weight=4,
            n_estimators=1000,
            nthread=8,
            subsample=0.8,
        ),
        pipeline=None)
    mlp = FeatureBasedClassifier(
        MLPClassifier(early_stopping=True,
                      hidden_layer_sizes=(512, ),
                      batch_size=128))
    resnet = SignalBasedClassifier(
        ResNet18,
        device,
        batch_size=32,
        epochs=20,
        optimizer_args=dict(lr=3e-4, weight_decay=1e-4))
    cnn_gru = SignalBasedClassifier(
        CnnGru,
        device,
        batch_size=32,
        epochs=10,
        gru_dropout=0,
        gru_layers=1,
        optimizer_args=dict(lr=3e-4, weight_decay=1e-4))

    models = [logistic, random_forest, xgboost, mlp, resnet, cnn_gru]

    if not include_knn:
        return models

    dtw_knn = KNeighborsTimeSeriesClassifier(
        metric='dtw',
        n_jobs=-1,
        n_neighbors=1,
    )
    models.append(DistanceBasedClassifier(dtw_knn))
    return models
# Split the shuffled data in two halves: first half for training, second
# half for testing.
X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
# `ind` holds positions in the fitted (training) set, so the neighbours'
# classes must be looked up in y_train, not y_test.
print("First nearest neighbor class:", y_train[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    D3M primitive wrapping tslearn's KNeighborsTimeSeriesClassifier to run
    k nearest neighbor classification on time series data. Series are
    min-max scaled before being handed to the classifier.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(
                    git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)),
                ),
            },
        ],
        "python_path": "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        n_neighbors = self.hyperparams["n_neighbors"]
        distance_metric = self.hyperparams["distance_metric"]
        sample_weighting = self.hyperparams["sample_weighting"]
        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=n_neighbors,
            metric=distance_metric,
            weights=sample_weighting,
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        """Return scaler, classifier and output columns; every entry is
        None while the primitive is not fitted."""
        if self._is_fit:
            return Params(
                scaler=self._scaler,
                classifier=self._knn,
                output_columns=self._output_columns,
            )
        return Params(scaler=None, classifier=None, output_columns=None)

    def set_params(self, *, params: Params) -> None:
        """Restore state from ``params``; the primitive counts as fitted
        only when none of the values is None."""
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = not any(value is None for value in params.values())

    def _get_cols(self, input_metadata):
        """private util function that looks up the grouping column(s)

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """
        grouping_key = ("https://metadata.datadrivendiscovery.org/types/GroupingKey", )
        return input_metadata.list_columns_with_semantic_types(grouping_key)

    def _get_value_col(self, input_metadata):
        """private util function that picks the value column

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            int -- index of the last column annotated as Attribute
        """
        attribute_type = ("https://metadata.datadrivendiscovery.org/types/Attribute", )
        attribute_columns = input_metadata.list_columns_with_semantic_types(
            attribute_type)
        # Relies on the time series formatter putting the value column last.
        return attribute_columns[-1]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Stores the training series and labels on the primitive

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        self._output_columns = outputs.columns

        targets = np.array(outputs)
        # One target row per series; all series share the same length.
        series_count = targets.shape[0]
        series_length = inputs.shape[0] // series_count

        value_col = self._get_value_col(inputs.metadata)
        values = inputs.iloc[:, value_col].values
        self._X_train = values.reshape(series_count, series_length)
        self._y_train = np.array(targets).reshape(-1)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Scales the stored training series and fits the KNN model on them

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        train_scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(train_scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Classifies the time series contained in ``inputs``

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
                for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # The number of series is the number of distinct grouping keys.
        key_columns = self._get_cols(inputs.metadata)
        series_count = inputs.iloc[:, key_columns[0]].nunique()
        series_length = inputs.shape[0] // series_count

        value_col = self._get_value_col(inputs.metadata)
        values = inputs.iloc[:, value_col].values
        x_vals = values.reshape(series_count, series_length)

        # make predictions
        preds = self._knn.predict(self._scaler.transform(x_vals))

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        predicted_target = "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            predicted_target,
        )
        return CallResult(result_df, has_finished=True)
def test_constrained_paths():
    """DTW restricted to a zero-radius Sakoe-Chiba band must predict like
    Euclidean k-NN, and soft-DTW with a tiny gamma must predict like DTW."""
    rng = np.random.RandomState(0)
    n, sz, d = 15, 10, 3
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    def fit_predict(model):
        # Fit on (X, y) and predict the same series back.
        return model.fit(X, y).predict(X)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = fit_predict(model_euc)

    sakoe_params = {
        "global_constraint": "sakoe_chiba",
        "sakoe_chiba_radius": 0
    }
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="dtw", metric_params=sakoe_params)
    y_pred_sakoe = fit_predict(model_dtw_sakoe)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = fit_predict(model_softdtw)

    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = fit_predict(model_dtw)
    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)
def create_model(self, model_type, X, y, model_params, search_params):
    """
    Runs a randomized hyper parameter search for the requested model type.
    Refer to sklearn and tslearn documentation for details.

    Sources:
    # https://www.kaggle.com/hatone/mlpclassifier-with-gridsearchcv
    # https://en.wikipedia.org/wiki/Hyperparameter_optimization#Grid_search
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    # alternative to grid search: https://github.com/sahilm89/lhsmdu

    Any failure is logged with its traceback and terminates the process:
    exit code 1 for TypeError / ValueError, exit code 2 for anything else.

    :param model_type: 'tssvc' or 'knn_classifier'
    :param X: pandas DataFrame or Series with the features
    :param y: pandas DataFrame or Series with the targets
    :param model_params: dict handed to RandomizedSearchCV as the
        parameter distributions
    :param search_params: list read by position: [0] n_jobs (also passed
        as the first positional argument of the model), [1] verbose,
        [2] cv, [3] n_iter, [4] whether to pickle the fitted search,
        [5] pickle output path, [6] test_size of the train/test split
    :return: dict with the fitted search ('clf') and the held-out
        'X_test' / 'y_test'
    """
    try:
        required = (X, y, model_params, search_params)
        if any(argument is None for argument in required):
            raise TypeError(self.messages.ILLEGAL_ARGUMENT_NONE_TYPE.value)

        pandas_types = (pandas.DataFrame, pandas.core.frame.DataFrame,
                        pandas.core.series.Series)
        if not isinstance(X, pandas_types):
            raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)
        if not isinstance(y, pandas_types):
            raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)

        params_ok = isinstance(model_params, dict)
        search_ok = isinstance(search_params, list)
        if not (params_ok and search_ok):
            raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)

        if model_type == 'knn_classifier':
            model = KNeighborsTimeSeriesClassifier(search_params[0])
        elif model_type == 'tssvc':
            model = TimeSeriesSVC(search_params[0])
        else:
            raise ValueError(self.messages.PROVIDED_MODE_DOESNT_EXIST.value)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=search_params[6], stratify=y)

        clf = RandomizedSearchCV(model,
                                 model_params,
                                 n_jobs=search_params[0],
                                 verbose=search_params[1],
                                 cv=search_params[2],
                                 n_iter=search_params[3])
        clf.fit(X_train, y_train)

        # Optionally persist the fitted search object.
        if search_params[4]:
            with open(r"{}".format(search_params[5]), "wb") as output_file:
                pickle.dump(clf, output_file)

        result = {'clf': clf, 'X_test': X_test, 'y_test': y_test}
        return result
    except (TypeError, ValueError):
        self.logger.error(traceback.format_exc())
        os._exit(1)
    except Exception:
        self.logger.error(traceback.format_exc())
        os._exit(2)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    D3M primitive wrapping tslearn's KNeighborsTimeSeriesClassifier to run
    k nearest neighbor classification on time series data. Series are
    min-max scaled before being handed to the classifier.

    Training inputs: 1) Feature dataframe, 2) Target dataframe
    Outputs: Dataframe with one predicted class per input time series

    Arguments:
        hyperparams {Hyperparams} -- D3M Hyperparameter object

    Keyword Arguments:
        random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # UUID created once with "uuid.uuid4()"; it must never change.
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        # Free-form keywords; there is no controlled vocabulary for them.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # Ordered list of dependencies (Python packages, system packages or
        # Docker images). The order matters when one package has to be
        # present before another package's setup.py can run, or when a
        # dependency is not available on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                .format(
                    git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)),
                ),
            },
        ],
        # Must equal the entry-point path registered in setup.py.
        "python_path": "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Values come from a controlled vocabulary in the schema; open a
        # merge request if a better-fitting entry is missing.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        n_neighbors = self.hyperparams["n_neighbors"]
        distance_metric = self.hyperparams["distance_metric"]
        sample_weighting = self.hyperparams["sample_weighting"]
        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=n_neighbors,
            metric=distance_metric,
            weights=sample_weighting,
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        """Return scaler, classifier and output columns; every entry is
        None while the primitive is not fitted."""
        if self._is_fit:
            return Params(scaler=self._scaler,
                          classifier=self._knn,
                          output_columns=self._output_columns)
        return Params(scaler=None, classifier=None, output_columns=None)

    def set_params(self, *, params: Params) -> None:
        """Restore state from ``params``; the primitive counts as fitted
        only when none of the values is None."""
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = not any(value is None for value in params.values())

    def _get_cols(self, input_metadata):
        """ private util function that looks up the grouping column(s)

            Arguments:
                input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

            Returns:
                list[int] -- list of column indices annotated with GroupingKey metadata
        """
        grouping_key = ("https://metadata.datadrivendiscovery.org/types/GroupingKey", )
        return input_metadata.list_columns_with_semantic_types(grouping_key)

    def _get_value_col(self, input_metadata):
        """ private util function that picks the value column

            Arguments:
                input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

            Returns:
                int -- index of the last column annotated as Attribute
        """
        attribute_type = ('https://metadata.datadrivendiscovery.org/types/Attribute', )
        attribute_columns = input_metadata.list_columns_with_semantic_types(
            attribute_type)
        # Relies on the time series formatter putting the value column last.
        return attribute_columns[-1]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Stores the training series and labels on the primitive

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
        """
        self._output_columns = outputs.columns

        targets = np.array(outputs)
        # One target row per series; all series share the same length.
        series_count = targets.shape[0]
        series_length = inputs.shape[0] // series_count

        value_col = self._get_value_col(inputs.metadata)
        values = inputs.iloc[:, value_col].values
        self._X_train = values.reshape(series_count, series_length)
        self._y_train = np.array(targets).reshape(-1)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """ Scales the stored training series and fits the KNN model on them

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Returns:
                CallResult[None]
        """
        train_scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(train_scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """ Classifies the time series contained in ``inputs``

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- dataframe with a column containing a predicted class
                    for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # The number of series is the number of distinct grouping keys.
        key_columns = self._get_cols(inputs.metadata)
        series_count = inputs.iloc[:, key_columns[0]].nunique()
        series_length = inputs.shape[0] // series_count

        value_col = self._get_value_col(inputs.metadata)
        values = inputs.iloc[:, value_col].values
        x_vals = values.reshape(series_count, series_length)

        # make predictions
        preds = self._knn.predict(self._scaler.transform(x_vals))

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        predicted_target = "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            predicted_target,
        )
        return CallResult(result_df, has_finished=True)
# Set seed numpy.random.seed(0) # Defining dataset and the number of segments data_loader = UCR_UEA_datasets() datasets = [('SyntheticControl', 16), ('GunPoint', 64), ('FaceFour', 128), ('Lightning2', 256), ('Lightning7', 128), ('ECG200', 32), ('Plane', 64), ('Car', 256), ('Beef', 128), ('Coffee', 128), ('OliveOil', 256)] # We will compare the accuracies & execution times of 1-NN using: # (i) MINDIST on SAX representations, and # (ii) euclidean distance on raw values knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax') knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean') accuracies = {} times = {} for dataset, w in datasets: X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset) ts_scaler = TimeSeriesScalerMeanVariance() X_train = ts_scaler.fit_transform(X_train) X_test = ts_scaler.fit_transform(X_test) # Fit 1-NN using SAX representation & MINDIST metric_params = {'n_segments': w, 'alphabet_size_avg': 10} knn_sax = clone(knn_sax).set_params(metric_params=metric_params) start = time.time()
from flask import Flask from flask_restful import Api, Resource, reqparse import numpy as np from tslearn.neighbors import KNeighborsTimeSeriesClassifier from tslearn.shapelets import LearningShapelets from os import path as os_path from pathlib import Path from tslearn.utils import to_time_series_dataset, to_time_series APP = Flask(__name__) API = Api(APP) # Load models from disk k_nn_model = KNeighborsTimeSeriesClassifier.from_pickle('./models/k_nn.pickle') shapelets_model = LearningShapelets.from_pickle( './models/learning_shapelets.pickle') working_dir_path = Path.cwd() filename = os_path.join(working_dir_path, './models/mlp_nn.pickle') mlp_nn_model = pickle.load(open(filename, 'rb')) filename = os_path.join(working_dir_path, './models/gak_svm.pickle') gak_svm_model = pickle.load(open(filename, 'rb')) class Classify(Resource): @staticmethod def post():
def test_constrained_paths():
    """DTW restricted to a zero-radius Sakoe-Chiba band must predict like
    Euclidean k-NN, soft-DTW with a tiny gamma must predict like DTW, and
    SAX neighbour distances must stay below the Euclidean ones."""
    rng = np.random.RandomState(0)
    n, sz, d = 15, 10, 3
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    def fit_predict(model):
        # Fit on (X, y) and predict the same series back.
        return model.fit(X, y).predict(X)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = fit_predict(model_euc)

    sakoe_params = {
        "global_constraint": "sakoe_chiba",
        "sakoe_chiba_radius": 0
    }
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="dtw", metric_params=sakoe_params)
    y_pred_sakoe = fit_predict(model_dtw_sakoe)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = fit_predict(model_softdtw)

    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = fit_predict(model_dtw)
    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)

    sax_params = {
        "alphabet_size_avg": 6,
        "n_segments": 10
    }
    model_sax = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="sax", metric_params=sax_params)
    model_sax.fit(X, y)

    # The MINDIST of SAX is a lower bound of the euclidean distance
    euc_dist, _ = model_euc.kneighbors(X, n_neighbors=5)
    sax_dist, _ = model_sax.kneighbors(X, n_neighbors=5)

    # First column will contain zeroes
    sax_tail = sax_dist[:, 1:]
    euc_tail = euc_dist[:, 1:]
    np.testing.assert_array_less(sax_tail, euc_tail)
def fit(self, X, y):
    """
    Fit early classifier.

    The series are clustered, one classifier per truncation length ``t``
    (from ``self.min_t`` to the full length) is trained on half of the
    data, and per-cluster confusion statistics are estimated on the other
    half.

    Parameters
    ----------
    X : array-like of shape (n_series, n_timestamps, n_features)
        Training data, where `n_series` is the number of time series,
        `n_timestamps` is the number of timestamps in the series
        and `n_features` is the number of features recorded at each
        timestamp.

    y : array-like of shape (n_series,)
        Target values.

    Returns
    -------
    self : returns an instance of self.
    """
    X = check_array(X, allow_nd=True)
    X = check_dims(X)
    X = to_time_series_dataset(X)
    y_arr = np.array(y)
    label_set = np.unique(y_arr)

    self.cluster_ = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                     random_state=self.random_state)
    if self.base_classifier is not None:
        clf = self.base_classifier
    else:
        clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                             metric="euclidean")
    self.__n_classes_ = len(label_set)
    self._X_fit_dims = X.shape
    sz = X.shape[1]
    self.classifiers_ = {t: clone(clf) for t in range(self.min_t, sz + 1)}
    # Zero-initialised (rather than np.empty) so that any (t, k) slot that
    # is skipped below holds a well-defined null matrix instead of
    # uninitialised memory. Slots that are filled are overwritten anyway.
    self.pyhatyck_ = np.zeros((sz - self.min_t + 1,
                               self.n_clusters,
                               self.__n_classes_,
                               self.__n_classes_))
    c_k = self.cluster_.fit_predict(X)
    # Half of the data trains the classifiers, the other half is used to
    # estimate their confusion matrices; the split is stratified on the
    # cluster assignment.
    X1, X2, c_k1, c_k2, y1, y2 = train_test_split(
        X, c_k, y_arr,
        test_size=0.5,
        stratify=c_k,
        random_state=self.random_state
    )

    label_to_ind = {lab: ind for ind, lab in enumerate(label_set)}
    y_ = np.array([label_to_ind.get(lab, self.__n_classes_ + 1)
                   for lab in y_arr])
    vector_of_ones = np.ones((X.shape[0], ))
    # Class/cluster co-occurrence counts, then normalised per cluster
    # (each column is divided by its sum).
    self.pyck_ = coo_matrix(
        (vector_of_ones, (y_, c_k)),
        shape=(self.__n_classes_, self.n_clusters),
    ).toarray()
    self.pyck_ /= self.pyck_.sum(axis=0, keepdims=True)
    for t in range(self.min_t, sz + 1):
        self.classifiers_[t].fit(X1[:, :t], y1)
        for k in range(0, self.n_clusters):
            index = (c_k2 == k)
            # `index` is a boolean mask as long as the held-out set, so
            # its length says nothing about cluster membership; test its
            # contents to detect a cluster with no held-out sample.
            if not np.any(index):
                continue
            X2_current_cluster = X2[index, :t]
            y2_current_cluster = y2[index]
            y2_hat = self.classifiers_[t].predict(X2_current_cluster)
            conf_matrix = confusion_matrix(y2_current_cluster, y2_hat,
                                           labels=label_set)
            # normalize parameter seems to be quite recent in sklearn,
            # so let's do it ourselves
            normalizer = conf_matrix.sum(axis=0, keepdims=True)
            normalizer[normalizer == 0] = 1  # Avoid divide by 0
            conf_matrix = conf_matrix / normalizer

            # pyhatyck_ stores
            # P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}
            # elements so it should have a null diagonal because of
            # the \delta_{y \neq \hat{y}} term
            np.fill_diagonal(conf_matrix, 0)
            self.pyhatyck_[t - self.min_t, k] = conf_matrix
    return self
# Everything is resolved relative to the current working directory, which is
# also appended to the module search path.
working_dir_path = Path.cwd()
sys.path.append(str(working_dir_path))

# Locations of the training curves and of their labels
train_curves_csv = os_path.join(working_dir_path, "./data/train_curves.csv")
train_labels_csv = os_path.join(working_dir_path,
                                "./data/train_clustering_result.csv")

# Load the dataset
raw_data = pd.read_csv(train_curves_csv, header=None)
time_series_train = to_time_series_dataset(raw_data)
labels_train = genfromtxt(train_labels_csv, delimiter=',')

# Define the model: 5-nearest-neighbours with the DTW metric, 4 parallel jobs
knn_classification_model = KNeighborsTimeSeriesClassifier(
    n_neighbors=5,
    metric="dtw",
    n_jobs=4,
)

# Fit the model using the training data
knn_classification_model.fit(time_series_train, labels_train)

# ---------------------------------------------------------------------------
# save model
# ---------------------------------------------------------------------------
print(
    "#############################################################################################"
)

# Current date and time rendered as day-month-year_hour-minute-second
now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.datasets import CachedDatasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

# The estimator is a two-step pipeline: min-max normalisation of the series
# followed by a KNN classifier. A grid search over the classifier's
# n_neighbors and weights hyper-parameters is wrapped around it, scored with
# a shuffled, stratified k-fold split.
n_splits = 3
knn_steps = [
    ('normalize', TimeSeriesScalerMinMax()),
    ('knn', KNeighborsTimeSeriesClassifier()),
]
knn_param_grid = {
    'knn__n_neighbors': [5, 25],
    'knn__weights': ['uniform', 'distance'],
}
pipeline = GridSearchCV(
    Pipeline(knn_steps),
    knn_param_grid,
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42),
)

# Only the training part of the cached "Trace" dataset is used here
X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Keep only the series with a strictly positive label (described in the
# original comment as classes 1, 2, 3)
positive_label_mask = y_train > 0
X_train = X_train[positive_label_mask]
y_train = y_train[positive_label_mask]

# Keep only the first 50 timeseries of both train and
correct = (preds == labels[:len(preds)]) score = float(sum(correct))/len(correct) return score # get train/test data/labels inDataFile = 'data/160k_f100_20190908-1401.txt' labels, data = dp.readProcFile(inDataFile) labels = np.array(labels) data = np.array(data) trainData, trainLabels, testData, testLabels = dp.splitTestTrainSets(data, labels, 0.8, 'Stratified') # z-normalisation trainData, testData = dp.znorm(trainData, testData) clf = KNeighborsTimeSeriesClassifier(n_jobs=-1) print "Fitting..." clf.fit(trainData, trainLabels) print "Scoring..." predictions = [] for i in range(len(testData)): if (i % 10 == 0) and (i > 0): print "{} complete...current score: {}".format(i, getScore(np.array(predictions), testLabels) ) predictions += clf.predict([testData[i]]).tolist() predictions = np.array(predictions) test_acc = getScore(predictions, testLabels) #test_acc = clf.score(testData, testLabels) print test_acc
def test_variable_length_knn():
    """Check 1-NN classification on series of different lengths.

    Four series of lengths 4, 3, 6 and 5 are packed into one dataset. With
    the "dtw" and "softdtw" metrics, and with the "sax" metric applied to
    mean-variance scaled data, a 1-NN classifier must predict the training
    labels when queried with the training set itself.
    """
    labels = [0, 0, 1, 1]
    dataset = to_time_series_dataset([[1, 2, 3, 4],
                                      [1, 2, 3],
                                      [9, 8, 7, 6, 5, 2],
                                      [8, 7, 6, 5, 3]])

    # Elastic metrics work directly on the raw dataset.
    for metric_name in ("dtw", "softdtw"):
        knn = KNeighborsTimeSeriesClassifier(metric=metric_name,
                                             n_neighbors=1)
        knn.fit(dataset, labels)
        assert_allclose(knn.predict(dataset), [0, 0, 1, 1])

    # The SAX variant is fitted and queried on the scaled dataset.
    sax_knn = KNeighborsTimeSeriesClassifier(metric="sax",
                                             n_neighbors=1,
                                             metric_params={'n_segments': 2})
    scaled_dataset = TimeSeriesScalerMeanVariance().fit_transform(dataset)
    sax_knn.fit(scaled_dataset, labels)
    assert_allclose(sax_knn.predict(scaled_dataset), [0, 0, 1, 1])
tmpSum +=1 # rsltCol.append([x,tmpSum]) rsltCol.append(tmpSum) return [rsltRow, rsltCol] def dataToSeries(dataset): rowArray = [] # colArray = [] for i in range(0, len(dataset)): row = mapper(dataset[i]) rowArray.append(row) # colArray.append(col) return to_time_series(rowArray) X_train, y_train, X_test, y_test = load_data('data/') X_train_ts = dataToSeries(X_train) X_test_ts = dataToSeries(X_test) knn_clf_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw") knn_clf_dtw.fit(X_train_ts, y_train) predicted_labels_dtw = knn_clf_dtw.predict(X_test_ts) print("knn with dtw: \n", accuracy_score(y_test, predicted_labels_dtw)) print("Classification report: \n", classification_report(y_test, predicted_labels_dtw)) print("Confusion matrix: \n", confusion_matrix(y_test, predicted_labels_dtw)) unlabaled = df = pd.read_csv("data/test.csv") unlabaled = unlabaled.values unlabaled_ts = dataToSeries(unlabaled) plt.imshow(unlabaled[165].reshape((28, 28))) predicted_label_dtw = knn_clf_dtw.predict(unlabaled_ts)