def predict_proba(self, X):
    """Predict class probabilities for all instances in X.

    Each constituent 1-NN classifier produces a probability estimate,
    which is weighted by that constituent's training accuracy; the
    weighted estimates are summed and normalised by the total weight.

    Parameters
    ----------
    X : array-like or pandas DataFrame with a single column
        The test input samples (coerced/validated by check_X).

    Returns
    -------
    output_probas : 2D np.ndarray of shape [n_instances, n_classes]
        Accuracy-weighted average of the constituents' probabilities.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)

    # Derivative DTW (DDTW) uses the regular DTW algorithm on data that
    # are transformed into derivatives. To increase the efficiency of
    # DDTW we pre-transform the data into derivatives once, and then
    # call the standard DTW algorithm on it, rather than transforming
    # each series on every distance calculation. Please note that using
    # DDTW elsewhere will not benefit from this speed enhancement.
    if ddtw_c in self.distance_measures or wddtw_c in self.distance_measures:
        der_X = DerivativeSlopeTransformer().fit_transform(X)
        der_X = np.array(
            [np.asarray([x]).reshape(len(x), 1) for x in der_X.iloc[:, 0]])
    else:
        der_X = None

    # reshape X for use with the efficient cython distance measures
    X = np.array(
        [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])

    output_probas = []
    train_sum = 0
    for c, estimator in enumerate(self.estimators_):
        # DDTW/WDDTW constituents were fitted on derivative data, so
        # they must also be queried with derivative data
        if (self.distance_measures[c] == ddtw_c
                or self.distance_measures[c] == wddtw_c):
            test_X_to_use = der_X
        else:
            test_X_to_use = X
        this_train_acc = self.train_accs_by_classifier[c]
        # weight each constituent's probabilities by its train accuracy
        this_probas = np.multiply(
            estimator.predict_proba(test_X_to_use), this_train_acc)
        output_probas.append(this_probas)
        train_sum += this_train_acc

    output_probas = np.sum(output_probas, axis=0)
    output_probas = np.divide(output_probas, train_sum)
    return output_probas
def setup_all_distance_measure_getter(proximity):
    """
    setup all distance measure getter functions from a proximity object
    :param proximity: a PT / PF / PS
    :return: a list of distance measure getters
    """
    derivative_transformer = _CachedTransformer(DerivativeSlopeTransformer())
    getters = [
        euclidean_distance_measure_getter,
        dtw_distance_measure_getter,
        setup_ddtw_distance_measure_getter(derivative_transformer),
        wdtw_distance_measure_getter,
        setup_wddtw_distance_measure_getter(derivative_transformer),
        msm_distance_measure_getter,
        lcss_distance_measure_getter,
        erp_distance_measure_getter,
        twe_distance_measure_getter,
    ]

    def pick_rand_distance_measure(proximity):
        """
        generate a distance measure from a range of parameters
        :param proximity: proximity object containing distance measures,
        ranges and dataset
        :return: a distance measure with no parameters
        """
        rng = proximity.random_state
        # draw one measure family, then one concrete parameter setting
        chosen_getter = rng.choice(getters)
        perm_space = chosen_getter(proximity.X)
        chosen_perm = pick_rand_param_perm_from_dict(perm_space, rng)
        measure = chosen_perm.pop("distance_measure")
        return distance_predefined_params(measure, **chosen_perm)

    return pick_rand_distance_measure
def fit(self, X, y):
    """Build an ensemble of 1-NN classifiers from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed,
        it must have a single column (this ensemble is not configured
        to handle multivariate data).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_pandas=True)

    # Derivative DTW (DDTW) uses the regular DTW algorithm on data that
    # are transformed into derivatives. To increase the efficiency of
    # DDTW we pre-transform the data into derivatives once, and then
    # call the standard DTW algorithm on it, rather than transforming
    # each series on every distance calculation. Please note that using
    # DDTW elsewhere will not benefit from this speed enhancement.
    if ddtw_c in self.distance_measures or wddtw_c in self.distance_measures:
        der_X = DerivativeSlopeTransformer().fit_transform(X)
        # reshape derivatives for the efficient cython distance measures
        der_X = np.array(
            [np.asarray([x]).reshape(len(x), 1) for x in der_X.iloc[:, 0]])
    else:
        der_X = None

    # reshape X for use with the efficient cython distance measures
    X = np.array(
        [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])

    self.train_accs_by_classifier = np.zeros(len(self.distance_measures))
    self.train_preds_by_classifier = [None] * len(self.distance_measures)
    self.estimators_ = [None] * len(self.distance_measures)
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    rand = np.random.RandomState(self.random_state)

    # The default EE uses all training instances for setting parameters,
    # and 100 parameter options per elastic measure. The
    # prop_train_in_param_finding and prop_of_param_options attributes
    # of this class can be used to control this however, using less
    # cases to optimise parameters on the training data and/or using
    # less parameter options.
    #
    # For using less training instances the appropriate number of cases
    # must be sampled from the data. This is achieved through the use
    # of a deterministic StratifiedShuffleSplit
    #
    # For using less parameter options a RandomizedSearchCV is used in
    # place of a GridSearchCV
    param_train_x = None
    der_param_train_x = None
    param_train_y = None

    # If using less cases for parameter optimisation, use the
    # StratifiedShuffleSplit:
    if self.proportion_train_in_param_finding < 1:
        if self.verbose > 0:
            print(  # noqa: T001
                "Restricting training cases for parameter optimisation: ",
                end="")
        sss = StratifiedShuffleSplit(
            n_splits=1,
            test_size=1 - self.proportion_train_in_param_finding,
            random_state=rand,
        )
        for train_index, _ in sss.split(X, y):
            param_train_x = X[train_index, :]
            param_train_y = y[train_index]
            if der_X is not None:
                der_param_train_x = der_X[train_index, :]
            if self.verbose > 0:
                print(  # noqa: T001
                    "using " + str(len(param_train_x))
                    + " training cases instead of " + str(len(X))
                    + " for parameter optimisation")
    # else, use the full training data for optimising parameters
    else:
        if self.verbose > 0:
            print(  # noqa: T001
                "Using all training cases for parameter optimisation")
        param_train_x = X
        param_train_y = y
        if der_X is not None:
            der_param_train_x = der_X

    self.constituent_build_times = []

    if self.verbose > 0:
        print(  # noqa: T001
            "Using " + str(100 * self.proportion_of_param_options)
            + " parameter options per measure")

    for dm in range(0, len(self.distance_measures)):
        this_measure = self.distance_measures[dm]

        # uses the appropriate training data as required (either full or
        # smaller sample as per the StratifiedShuffleSplit)
        param_train_to_use = param_train_x
        full_train_to_use = X

        # BUG FIX: the second condition previously read `dm is wddtw_c`,
        # comparing the integer loop index to the wddtw function — always
        # False, so WDDTW never used the pre-computed derivative data.
        if this_measure is ddtw_c or this_measure is wddtw_c:
            param_train_to_use = der_param_train_x
            full_train_to_use = der_X
            # DDTW/WDDTW are implemented as DTW/WDTW on derivative data
            if this_measure is ddtw_c:
                this_measure = dtw_c
            elif this_measure is wddtw_c:
                this_measure = wdtw_c

        start_build_time = time.time()
        if self.verbose > 0:
            if (self.distance_measures[dm] is ddtw_c
                    or self.distance_measures[dm] is wddtw_c):
                print(  # noqa: T001
                    "Currently evaluating "
                    + str(self.distance_measures[dm].__name__)
                    + " (implemented as " + str(this_measure.__name__)
                    + " with pre-transformed derivative data)")
            else:
                print(  # noqa: T001
                    "Currently evaluating "
                    + str(self.distance_measures[dm].__name__))

        # If 100 parameter options are being considered per measure,
        # use a GridSearchCV
        if self.proportion_of_param_options == 1:
            grid = GridSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(
                    metric=this_measure, n_neighbors=1, algorithm="brute"),
                param_grid=ElasticEnsemble._get_100_param_options(
                    self.distance_measures[dm], X),
                cv=LeaveOneOut(),
                scoring="accuracy",
                verbose=self.verbose,
            )
            grid.fit(param_train_to_use, param_train_y)
        # Else, used RandomizedSearchCV to randomly sample parameter
        # options for each measure
        else:
            grid = RandomizedSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(
                    metric=this_measure, n_neighbors=1, algorithm="brute"),
                param_distributions=ElasticEnsemble._get_100_param_options(
                    self.distance_measures[dm], X),
                cv=LeaveOneOut(),
                scoring="accuracy",
                n_iter=100 * self.proportion_of_param_options,
                random_state=rand,
                verbose=self.verbose,
            )
            grid.fit(param_train_to_use, param_train_y)

        # once the best parameter option has been estimated on the
        # training data, perform a final pass with this parameter option
        # to get the individual predictions with cross_val_predict
        # (Note: optimisation potentially possible here if a GridSearchCV
        # was used previously. TO-DO: determine how to extract
        # predictions for the best param option from GridSearchCV)
        best_model = KNeighborsTimeSeriesClassifier(
            algorithm="brute",
            n_neighbors=1,
            metric=this_measure,
            metric_params=grid.best_params_["metric_params"],
        )
        preds = cross_val_predict(best_model, full_train_to_use, y,
                                  cv=LeaveOneOut())
        acc = accuracy_score(y, preds)
        if self.verbose > 0:
            print(  # noqa: T001
                "Training accuracy for "
                + str(self.distance_measures[dm].__name__) + ": "
                + str(acc) + " (with parameter setting: "
                + str(grid.best_params_["metric_params"]) + ")")

        # Finally, reset the classifier for this measure and parameter
        # option, ready to be called for test classification
        best_model = KNeighborsTimeSeriesClassifier(
            algorithm="brute",
            n_neighbors=1,
            metric=this_measure,
            metric_params=grid.best_params_["metric_params"],
        )
        best_model.fit(full_train_to_use, y)
        end_build_time = time.time()

        self.constituent_build_times.append(
            str(end_build_time - start_build_time))
        self.estimators_[dm] = best_model
        self.train_accs_by_classifier[dm] = acc
        self.train_preds_by_classifier[dm] = preds
    self._is_fitted = True
    return self