def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3,
                                         alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X,
                                         sax._unscale(sax._scale(X)))

    np.testing.assert_array_almost_equal(np.zeros((d, )),
                                         sax._scale(X).reshape((-1, d)).mean(axis=0))
    np.testing.assert_array_almost_equal(np.ones((d, )),
                                         sax._scale(X).reshape((-1, d)).std(axis=0))

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)

    knn_sax.predict(X)
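
As a standalone illustration, here is a minimal sketch of the SAX transform that test_sax_scale exercises (it assumes only tslearn's public SymbolicAggregateApproximation API):

import numpy as np
from tslearn.piecewise import SymbolicAggregateApproximation

rng = np.random.RandomState(0)
X = rng.rand(5, 12, 1)  # 5 univariate series of length 12
sax = SymbolicAggregateApproximation(n_segments=4, alphabet_size_avg=4)
X_sax = sax.fit_transform(X)  # integer symbols, shape (5, 4, 1)
print(X_sax.shape)
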
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    y = [0, 0, 1, 1]
    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])
Example #4
def test_serialize_knn_classifier():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    knc = KNeighborsTimeSeriesClassifier()

    _check_not_fitted(knc)

    knc.fit(X, y)

    _check_params_predict(knc, X, ['predict'])
Example #5
class Knn():
    def __init__(self, n_neighbors):
        '''
            initialize KNN class with dynamic time warping distance metric

            hyperparameters:
                n_neighbors           : number of neighbors on which to make classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors,
                                                      metric="dtw")

    def __ScaleData(self, input_data):
        ''' 
            scale input data to range [0,1]

            parameters:
                input_data        : input data to rescale
        '''

        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def fit(self, X_train, y_train):
        '''
            fit KNN classifier on training data

            parameters:
                X_train                : training time series
                y_train                : training labels
        '''
        # scale training data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        self.knn_clf.fit(X_train_scaled, y_train)

    def predict(self, X_test):
        '''
            classifications for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: classifications for test data set
        '''
        # scale test data to between 0 and 1
        X_test_scaled = self.__ScaleData(X_test)
        return self.knn_clf.predict(X_test_scaled)
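
A hedged usage sketch for the Knn wrapper above, on synthetic data:

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(20, 30, 1)        # 20 univariate series of length 30
y_train = rng.choice([0, 1], size=20)

model = Knn(n_neighbors=3)
model.fit(X_train, y_train)          # min-max scales, then fits the DTW k-NN
print(model.predict(rng.rand(5, 30, 1)))  # predicted labels for 5 new series
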
Example #6
File: NCC.py Project: zfbi/dtan
def NearestCentroidClassification(X_train, X_test, y_train_n, y_test_n,
                                  dataset_name):
    '''

    :param X_train: if using DTAN, should already be aligned
    :param X_test: if using DTAN, should already be aligned
    :param y_train_n: numerical labels (not one-hot)
    :param y_test_n: numerical labels (not one-hot)
    :param dataset_name:
    :return: test set NCC accuracy
    '''

    # vars and placeholders
    input_shape = X_train.shape[1:]
    n_classes = len(np.unique(y_train_n))
    class_names = np.unique(y_train_n, axis=0)

    aligned_means = np.zeros((n_classes, input_shape[0], input_shape[1]))
    ncc_labels = []

    # Train set within class Euclidean mean
    for class_num in class_names:
        train_class_idx = y_train_n == class_num  # get indices
        X_train_aligned_within_class = X_train[train_class_idx]
        aligned_means[int(class_num), :] = np.mean(
            X_train_aligned_within_class, axis=0)
        ncc_labels.append(class_num)

    ncc_labels = np.asarray(ncc_labels)

    # Nearest neighbor classification - using euclidean distance
    knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="euclidean")
    knn_clf.fit(aligned_means, ncc_labels)

    predicted_labels = knn_clf.predict(X_test)
    acc = accuracy_score(y_test_n, predicted_labels)

    print(f"{dataset_name} - NCC results: {acc}")
def test_constrained_paths():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = model_euc.fit(X, y).predict(X)
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                                     metric="dtw",
                                                     metric_params={
                                                         "global_constraint":
                                                         "sakoe_chiba",
                                                         "sakoe_chiba_radius":
                                                         0
                                                     })
    y_pred_sakoe = model_dtw_sakoe.fit(X, y).predict(X)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = model_softdtw.fit(X, y).predict(X)

    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = model_dtw.fit(X, y).predict(X)

    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)

    model_ctw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="ctw")
    # Just testing that things run, nothing smart here :(
    model_ctw.fit(X, y).predict(X)

    model_sax = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="sax",
                                               metric_params={
                                                   "alphabet_size_avg": 6,
                                                   "n_segments": 10
                                               })
    model_sax.fit(X, y)

    # The MINDIST of SAX is a lower bound of the euclidean distance
    euc_dist, _ = model_euc.kneighbors(X, n_neighbors=5)
    sax_dist, _ = model_sax.kneighbors(X, n_neighbors=5)

    # First column will contain zeroes
    np.testing.assert_array_less(sax_dist[:, 1:], euc_dist[:, 1:])
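
The zero-radius Sakoe-Chiba check above works because the band forces the warping path onto the diagonal, so DTW reduces to the Euclidean distance; a quick standalone sketch using tslearn.metrics:

import numpy as np
from tslearn.metrics import dtw

rng = np.random.RandomState(0)
s1, s2 = rng.randn(10, 3), rng.randn(10, 3)
d_dtw = dtw(s1, s2, global_constraint="sakoe_chiba", sakoe_chiba_radius=0)
np.testing.assert_allclose(d_dtw, np.linalg.norm(s1 - s2))
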
Example #8
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax",
                                         n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
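
For reference, the ragged inputs above work because to_time_series_dataset pads shorter series with NaN up to the longest length, and the DTW-based estimators ignore the padding:

from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 2, 3], [1, 2, 3, 4, 5]])
print(X.shape)      # (2, 5, 1): padded to the longest series
print(X[0, :, 0])   # [ 1.  2.  3. nan nan]
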
X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
print("First nearest neighbor class:", y_test[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
pipeline_model = Pipeline(steps=[('sax', sax_trans), ('knn', knn_clf)])
pipeline_model.fit(X_train, y_train)
predicted_labels = pipeline_model.predict(X_test)
print("\n4. Nearest neighbor classification using SAX features")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))
Example #10
def getScore(preds, labels):
    # fraction of predictions made so far that match the ground-truth labels
    correct = (preds == labels[:len(preds)])
    score = float(sum(correct)) / len(correct)

    return score

# get train/test data/labels
inDataFile = 'data/160k_f100_20190908-1401.txt'
labels, data = dp.readProcFile(inDataFile)
labels = np.array(labels)
data = np.array(data)
trainData, trainLabels, testData, testLabels = dp.splitTestTrainSets(data, labels, 0.8, 'Stratified')
# z-normalisation
trainData, testData = dp.znorm(trainData, testData)

clf = KNeighborsTimeSeriesClassifier(n_jobs=-1)

print "Fitting..."
clf.fit(trainData, trainLabels)

print "Scoring..."
predictions = []
for i in range(len(testData)):
    if (i % 10 == 0) and (i > 0):
        print "{} complete...current score: {}".format(i, getScore(np.array(predictions), testLabels) )
    predictions += clf.predict([testData[i]]).tolist()
    
predictions = np.array(predictions)
test_acc = getScore(predictions, testLabels)
#test_acc = clf.score(testData, testLabels)
print(test_acc)
Example #11
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start

    accuracies[dataset] = (acc_sax, acc_euclidean)
    times[dataset] = (time_sax, time_euclidean)

print_table(accuracies, times)
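
print_table and data_loader are not defined in this snippet; a hypothetical minimal print_table compatible with the call above could look like:

def print_table(accuracies, times):
    # Hypothetical stand-in: one row per dataset, SAX vs. Euclidean 1-NN.
    print(f"{'dataset':<20}{'acc_sax':>10}{'acc_euc':>10}"
          f"{'t_sax (s)':>12}{'t_euc (s)':>12}")
    for name in accuracies:
        acc_sax, acc_euc = accuracies[name]
        t_sax, t_euc = times[name]
        print(f"{name:<20}{acc_sax:>10.3f}{acc_euc:>10.3f}"
              f"{t_sax:>12.2f}{t_euc:>12.2f}")
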
Example #12
    raw_data = pd.read_csv(os_path.join(working_dir_path,
                                        "./data/train_curves.csv"),
                           header=None)
    time_series_train = to_time_series_dataset(raw_data)

    labels_train = genfromtxt(os_path.join(
        working_dir_path, "./data/train_clustering_result.csv"),
                              delimiter=',')

    # Define the model
    knn_classification_model = KNeighborsTimeSeriesClassifier(n_neighbors=5,
                                                              metric="dtw",
                                                              n_jobs=4)

    # fit the model using the training data
    knn_classification_model.fit(time_series_train, labels_train)

    #############################################################################################
    # save model
    #############################################################################################

    print(
        "#############################################################################################"
    )

    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    try:

        # save model to models folder
Example #13
                tmpSum += 1
        # rsltCol.append([x,tmpSum])
        rsltCol.append(tmpSum)
    return [rsltRow, rsltCol]

def dataToSeries(dataset):
    rowArray = []
    # colArray = []
    for i in range(0, len(dataset)):
        row = mapper(dataset[i])
        rowArray.append(row)
        # colArray.append(col)
    return to_time_series(rowArray)

X_train, y_train, X_test, y_test = load_data('data/')
X_train_ts = dataToSeries(X_train)
X_test_ts = dataToSeries(X_test)

knn_clf_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
knn_clf_dtw.fit(X_train_ts, y_train)
predicted_labels_dtw = knn_clf_dtw.predict(X_test_ts)
print("knn with dtw: \n", accuracy_score(y_test, predicted_labels_dtw))
print("Classification report: \n", classification_report(y_test, predicted_labels_dtw))
print("Confusion matrix: \n", confusion_matrix(y_test, predicted_labels_dtw))


unlabeled = pd.read_csv("data/test.csv")
unlabeled = unlabeled.values
unlabeled_ts = dataToSeries(unlabeled)
plt.imshow(unlabeled[165].reshape((28, 28)))
predicted_label_dtw = knn_clf_dtw.predict(unlabeled_ts)
Example #14
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm to time series data.
    The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """private util function that finds grouping column from input metadata

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this is assuming a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
                for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
def NN1_DTWClassifier(X_train, Y_train):

    knn1_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
    knn1_clf.fit(X_train, Y_train)
    return knn1_clf
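
A hedged usage sketch for NN1_DTWClassifier on synthetic data:

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(12, 20, 1)
Y_train = rng.choice(["a", "b"], size=12)

clf = NN1_DTWClassifier(X_train, Y_train)
print(clf.predict(rng.randn(3, 20, 1)))  # 1-NN DTW labels for 3 new series
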
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
        Primitive that applies the k nearest neighbor classification algorithm to time series data. 
        The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
        
        Training inputs: 1) Feature dataframe, 2) Target dataframe
        Outputs: Dataframe with predictions for specific time series at specific future time instances 
    
        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object
        
        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to even be able to run setup.py of another package. Or you have
        # a dependency which is not on PyPI.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(scaler=self._scaler,
                      classifier=self._knn,
                      output_columns=self._output_columns)

    def set_params(self, *, params: Params) -> None:
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """ private util function that finds grouping column from input metadata
        
        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame
        
        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
        # this is assuming a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """ Fits KNN model using training data from set_training_data and hyperparameters
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})
            
            Returns:
                CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's classifications for new time series data

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- dataframe with a column containing a predicted class 
                    for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
Example #17
    seq = np.genfromtxt(rep + dataset,
                        delimiter=' ',
                        dtype=str,
                        encoding="utf8")
    ids, counts = np.unique(seq[:, 0], return_counts=True)

    No = ids.shape[0]
    D = seq.shape[1] - 3
    arr = np.asarray((ids, counts)).T
    Max_Seq_Len = np.max(arr[:, 1].astype(int))

    out_X = np.zeros((No, Max_Seq_Len, D))
    out_Y = np.zeros((No, ))

    for idx, id in enumerate(ids):
        seq_cpy = seq[seq[:, 0] == id]
        out_X[idx] = seq_cpy[:, 3:]
        out_Y[idx] = seq_cpy[0, 2]
    return out_X, out_Y


x_train, y_train = convert_mts(rep, ds_train)
x_test, y_test = convert_mts(rep, ds_test)

clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw")

y_test_pred = clf.fit(x_train, y=y_train).predict(x_test)

print("the accuracy score of the testing data is : " +
      accuracy_score(y_test, y_test_pred))
Example #18
class AnomalyDetection(ClassifierMixin, BaseEstimator):
    """
    Anomaly detection with 1-NN and automatic calculation of optimal threshold.
    """
    def __init__(self, n_clusters=200):
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                  weights='uniform',
                                                  metric='euclidean',
                                                  n_jobs=-1)
        self.d = None
        self.n_clusters = n_clusters

    def fit(self, X, y):
        """
        Fit the algorithm according to the given training data.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features, n_channels)
            Training samples.
        y : array-like of shape (n_samples,)
            True labels for X.
        Returns
        -------
        self: object
            Fitted model
        """
        # Fit anomaly detection knn over k-means centroids
        X_good = X[np.where(y == 0)]
        X_bad = X[np.where(y != 0)]
        km = TimeSeriesKMeans(n_clusters=self.n_clusters,
                              metric="euclidean",
                              max_iter=100,
                              random_state=0,
                              n_jobs=-1).fit(X_good)
        self.knn.fit(km.cluster_centers_, np.zeros((self.n_clusters, )))

        # Calculate distances to all samples in good and bad
        d_bad, _ = self.knn.kneighbors(X_bad)
        d_good, _ = self.knn.kneighbors(X_good)

        # Calculate ROC
        y_true = np.hstack(
            (np.zeros(X_good.shape[0]), np.ones(X_bad.shape[0])))
        y_score = np.vstack((d_good, d_bad))
        fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)

        # Determine d by Youden index
        self.d = thresholds[np.argmax(tpr - fpr)]
        return self

    def predict(self, X):
        """
        Perform a classification on samples in X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features, n_channels)
            Test samples.
        Returns
        -------
        y_pred: array, shape (n_samples,)
            Predictions
        """
        # Binary predictions of anomaly detector
        y_pred = np.squeeze(np.where(self.knn.kneighbors(X)[0] < self.d, 0, 1))
        return y_pred
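
A minimal end-to-end sketch of the detector above on synthetic data (it assumes the imports the class relies on: numpy, tslearn's TimeSeriesKMeans and KNeighborsTimeSeriesClassifier, and scikit-learn's roc_curve):

import numpy as np

rng = np.random.RandomState(0)
X_good = rng.randn(200, 24, 1)        # label 0: normal series
X_bad = rng.randn(20, 24, 1) + 4.0    # label 1: shifted anomalies
X = np.vstack((X_good, X_bad))
y = np.hstack((np.zeros(200), np.ones(20)))

det = AnomalyDetection(n_clusters=10).fit(X, y)
print("threshold:", det.d)
print(det.predict(X[:5]))             # binary anomaly flags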