def run_clustering_experiment(
    trainX,
    clusterer,
    results_path,
    trainY=None,
    testX=None,
    testY=None,
    cls_name=None,
    dataset_name=None,
    resample_id=0,
):
    """Run a clustering experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.
    The clusterer is always trained on the required input data trainX. Output to
    trainFold<resampleID>.csv will be the predicted clusters of trainX. If trainY
    is also passed, these are written to file. If the clusterer makes probabilistic
    predictions, these are also written to file. See write_results_to_uea_format
    for more on the output. Be warned, this method will always overwrite existing
    results; check before calling or use load_and_run_clustering_experiment instead.

    Parameters
    ----------
    trainX : pd.DataFrame or np.array
        The data to cluster.
    clusterer : BaseClusterer
        The clustering object.
    results_path : str
        Where to write the results to.
    trainY : np.array, default = None
        Train data true class labels, only used for file writing, ignored by the
        clusterer.
    testX : pd.DataFrame or np.array, default = None
        Test attribute data, if present it is used for predicting testY.
    testY : np.array, default = None
        Test data true class labels, only used for file writing, ignored by the
        clusterer.
    cls_name : str, default = None
        Name of the clusterer, written to the results file, ignored if None.
    dataset_name : str, default = None
        Name of problem, written to the results file, ignored if None.
    resample_id : int, default = 0
        Resample identifier, defaults to 0.
    """
    # Build the clusterer on train data, recording how long it takes
    start = int(round(time.time() * 1000))
    clusterer.fit(trainX)
    build_time = int(round(time.time() * 1000)) - start
    start = int(round(time.time() * 1000))
    train_preds = clusterer.predict(trainX)
    # predict_train_time = int(round(time.time() * 1000)) - start

    # Form predictions on the test data
    start = int(round(time.time() * 1000))
    preds = clusterer.predict(testX)
    test_time = int(round(time.time() * 1000)) - start
    second = str(clusterer.get_params())
    second = second.replace("\n", " ").replace("\r", " ")
    # TODO: refactor clusterers to return an array
    pr = np.array(preds)
    third = "," + str(build_time) + "," + str(test_time) + ",-1,-1,"
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=pr,
        dataset_name=dataset_name,
        y_true=testY,
        split="TEST",
        full_path=False,
    )

    # Write the train fold predictions
    if "Composite" in cls_name:
        second = "Para info too long!"
    else:
        second = str(clusterer.get_params())
        second = second.replace("\n", " ").replace("\r", " ")
    third = "FORMAT NOT FINALISED"
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=train_preds,
        dataset_name=dataset_name,
        y_true=trainY,
        split="TRAIN",
        full_path=False,
    )
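
# A minimal usage sketch for run_clustering_experiment, assuming this module's
# dependencies (numpy, time, write_results_to_uea_format) are available and that
# any estimator with fit/predict/get_params (e.g. sklearn's KMeans) is acceptable
# as the clusterer. The data, names, and output directory below are illustrative.
def _example_run_clustering_experiment():
    """Hypothetical demo: cluster random data and write TEST/TRAIN result files."""
    import tempfile

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(0)
    train_X = rng.normal(size=(30, 4))
    test_X = rng.normal(size=(10, 4))
    train_y = rng.integers(0, 3, size=30)
    test_y = rng.integers(0, 3, size=10)

    run_clustering_experiment(
        train_X,
        KMeans(n_clusters=3, n_init=10, random_state=0),
        results_path=tempfile.mkdtemp(),  # throwaway output directory
        trainY=train_y,
        testX=test_X,
        testY=test_y,
        cls_name="KMeans",
        dataset_name="RandomDemo",
        resample_id=0,
    )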
def run_classification_experiment(
    trainX,
    trainY,
    testX,
    testY,
    classifier,
    results_path,
    cls_name="",
    dataset="",
    resample_id=0,
    train_file=False,
):
    """Run a classification experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.

    Parameters
    ----------
    trainX : pd.DataFrame or np.array
        The data to train the classifier.
    trainY : np.array
        Training data class labels.
    testX : pd.DataFrame or np.array
        The data used to test the trained classifier.
    testY : np.array
        Testing data class labels.
    classifier : BaseClassifier
        Classifier to be used in the experiment.
    results_path : str
        Location of where to write results. Any required directories will be created.
    cls_name : str, default=""
        Name of the classifier.
    dataset : str, default=""
        Name of problem.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from file is
        used. Also used in output file name.
    train_file : bool, default=False
        Whether to generate train files or not. If true, it performs a 10-fold
        cross-validation on the train data and saves. If the classifier can produce
        its own estimates, those are used instead.
    """
    start = int(round(time.time() * 1000))
    classifier.fit(trainX, trainY)
    build_time = int(round(time.time() * 1000)) - start
    start = int(round(time.time() * 1000))
    probs = classifier.predict_proba(testX)
    preds = classifier.classes_[np.argmax(probs, axis=1)]
    test_time = int(round(time.time() * 1000)) - start
    ac = accuracy_score(testY, preds)
    if "Composite" in cls_name:
        second = "Para info too long!"
    else:
        second = str(classifier.get_params())
        second = second.replace("\n", " ").replace("\r", " ")
    third = (
        str(ac)
        + ","
        + str(build_time)
        + ","
        + str(test_time)
        + ",-1,-1,"
        + str(len(classifier.classes_))
    )
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=preds,
        predicted_probs=probs,
        dataset_name=dataset,
        y_true=testY,
        split="TEST",
        full_path=False,
    )
    if train_file:
        start = int(round(time.time() * 1000))
        if hasattr(classifier, "_get_train_probs"):
            # Normally this is only possible once the full model has been built
            train_probs = classifier._get_train_probs(trainX)
        else:
            train_probs = cross_val_predict(
                classifier, X=trainX, y=trainY, cv=10, method="predict_proba"
            )
        train_time = int(round(time.time() * 1000)) - start
        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(trainY, train_preds)
        if "Composite" in cls_name:
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
            second = second.replace("\n", " ").replace("\r", " ")
        third = (
            str(train_acc)
            + ","
            + str(train_time)
            + ",-1,-1,-1,"
            + str(len(classifier.classes_))
        )
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=results_path,
            estimator_name=cls_name,
            resample_seed=resample_id,
            y_pred=train_preds,
            predicted_probs=train_probs,
            dataset_name=dataset,
            y_true=trainY,
            split="TRAIN",
            full_path=False,
        )
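
# The train_file branch above falls back to a 10-fold cross_val_predict to obtain
# out-of-fold probability estimates on the training data when the classifier has
# no _get_train_probs. A small standalone sketch of that step, using a placeholder
# sklearn classifier and random data (not part of the original module):
def _example_train_probability_estimates():
    """Hypothetical demo of the cross-validated train-estimate fallback."""
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_predict

    rng = np.random.default_rng(1)
    X = rng.normal(size=(100, 6))
    y = rng.integers(0, 2, size=100)

    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    # Each row of train_probs is estimated by a model that never saw that row.
    train_probs = cross_val_predict(clf, X=X, y=y, cv=10, method="predict_proba")
    train_preds = np.unique(y)[np.argmax(train_probs, axis=1)]
    return train_probs, train_preds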
def run_classification_experiment(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier,
    results_path,
    cls_name="",
    dataset="",
    resample_id=0,
    train_file=False,
    test_file=True,
):
    """Run a classification experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.

    Parameters
    ----------
    X_train : pd.DataFrame or np.array
        The data to train the classifier.
    y_train : np.array
        Training data class labels.
    X_test : pd.DataFrame or np.array
        The data used to test the trained classifier.
    y_test : np.array
        Testing data class labels.
    classifier : BaseClassifier
        Classifier to be used in the experiment.
    results_path : str
        Location of where to write results. Any required directories will be created.
    cls_name : str, default=""
        Name of the classifier.
    dataset : str, default=""
        Name of problem.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from file is
        used. Also used in output file name.
    train_file : bool, default=False
        Whether to generate train files or not. If true, it performs a 10-fold
        cross-validation on the train data and saves. If the classifier can produce
        its own estimates, those are used instead.
    test_file : bool, default=True
        Whether to generate test files or not. If the classifier can generate its
        own train probabilities, the classifier will be built but no file will be
        output.
    """
    if not test_file and not train_file:
        raise Exception(
            "Both test_file and train_file are set to False. "
            "At least one must be output."
        )

    classifier_train_probs = (
        train_file and getattr(classifier, "time_limit_in_minutes", None) is not None
    )
    build_time = -1
    if test_file or classifier_train_probs:
        start = int(round(time.time() * 1000))
        classifier.fit(X_train, y_train)
        build_time = int(round(time.time() * 1000)) - start

    if test_file:
        start = int(round(time.time() * 1000))
        probs = classifier.predict_proba(X_test)
        test_time = int(round(time.time() * 1000)) - start

        if "composite" in cls_name.lower():
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
            second = second.replace("\n", " ").replace("\r", " ")

        preds = classifier.classes_[np.argmax(probs, axis=1)]
        acc = accuracy_score(y_test, preds)
        third = (
            str(acc)
            + ","
            + str(build_time)
            + ","
            + str(test_time)
            + ",-1,-1,"
            + str(len(classifier.classes_))
            + ",,-1,-1"
        )

        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            first_line_comment="PREDICTIONS,Generated by experiments.py on "
            + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            + ".",
            timing_type="MILLISECONDS",
            output_path=results_path,
            estimator_name=cls_name,
            resample_seed=resample_id,
            y_pred=preds,
            predicted_probs=probs,
            dataset_name=dataset,
            y_true=y_test,
            split="TEST",
            full_path=False,
        )

    if train_file:
        start = int(round(time.time() * 1000))
        if classifier_train_probs:
            # Normally this is only possible once the full model has been built
            train_probs = classifier._get_train_probs(X_train, y_train)
        else:
            # Cap the number of CV folds at the size of the smallest class
            cv_size = 10
            _, counts = np.unique(y_train, return_counts=True)
            min_class = np.min(counts)
            if min_class < cv_size:
                cv_size = min_class
            train_probs = cross_val_predict(
                classifier, X=X_train, y=y_train, cv=cv_size, method="predict_proba"
            )
        train_time = int(round(time.time() * 1000)) - start

        if "composite" in cls_name.lower():
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
            second = second.replace("\n", " ").replace("\r", " ")

        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(y_train, train_preds)
        third = (
            str(train_acc)
            + ","
            + str(build_time)
            + ",-1,-1,-1,"
            + str(len(classifier.classes_))
            + ",,"
            + str(train_time)
            + ","
            + str(build_time + train_time)
        )

        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            first_line_comment="PREDICTIONS,Generated by experiments.py on "
            + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            + ".",
            timing_type="MILLISECONDS",
            output_path=results_path,
            estimator_name=cls_name,
            resample_seed=resample_id,
            y_pred=train_preds,
            predicted_probs=train_probs,
            dataset_name=dataset,
            y_true=y_train,
            split="TRAIN",
            full_path=False,
        )
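
# A hedged usage sketch for the run_classification_experiment variant above,
# assuming it (and write_results_to_uea_format) are available in this module.
# The classifier, dataset name, and output directory are placeholders only.
def _example_run_classification_experiment():
    """Hypothetical demo: write a testFold0.csv for a sklearn classifier."""
    import tempfile

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.default_rng(2)
    X_train = rng.normal(size=(60, 5))
    y_train = rng.integers(0, 2, size=60)
    X_test = rng.normal(size=(20, 5))
    y_test = rng.integers(0, 2, size=20)

    run_classification_experiment(
        X_train,
        y_train,
        X_test,
        y_test,
        classifier=RandomForestClassifier(n_estimators=50, random_state=0),
        results_path=tempfile.mkdtemp(),  # throwaway output directory
        cls_name="RandomForest",
        dataset="RandomDemo",
        resample_id=0,
        train_file=False,  # only the TEST fold file is written
        test_file=True,
    )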
def write_files(
    self,
    output_results_path,
    output_classifier_name="EE",
    write_train=True,
    write_test=True,
    overwrite=False,
):
    """Write the results to file.

    Probably could be replaced with data_io.write_results_UEA.

    Parameters
    ----------
    output_results_path : str
        Path to where output results will be written.
    output_classifier_name : str
        The name of the composite ensemble classifier in the output files.
    write_train : boolean
        True will write train files for the ensemble, False will skip train files.
    write_test : boolean
        True will write test files for the ensemble, False will skip test files.
    overwrite : boolean
        If True, any existing train/test files will be overwritten. False prevents
        file overwriting.
    """
    if write_train is False and write_test is False:
        return

    if not overwrite:
        if write_train:
            full_path = (
                str(output_results_path)
                + "/"
                + str(output_classifier_name)
                + "/Predictions/"
                + str(self.dataset_name)
                + "/trainFold"
                + str(self.resample_id)
                + ".csv"
            )
            if os.path.exists(full_path):
                write_train = False
        if write_test is True:
            full_path = (
                str(output_results_path)
                + "/"
                + str(output_classifier_name)
                + "/Predictions/"
                + str(self.dataset_name)
                + "/testFold"
                + str(self.resample_id)
                + ".csv"
            )
            if os.path.exists(full_path):
                print(
                    full_path
                    + " already exists and overwrite set to false, not writing test"
                )
                write_test = False
        if write_train is False and write_test is False:
            return

    if write_train:
        train_probs = self.ee_train_dists
        train_preds = self.classes_[np.argmax(train_probs, axis=1)]
        acc = accuracy_score(self.actual_train_class_vals, train_preds)
        second = str(self.distance_measures)
        third = (
            str(acc)
            + ",NA,NA,-1,-1,"
            + str(len(self.classes_))
            + ","
            + str(self.classes_)
        )
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=output_results_path,
            classifier_name=output_classifier_name,
            resample_seed=self.resample_id,
            predicted_class_vals=train_preds,
            predicted_probs=train_probs,
            dataset_name=self.dataset_name,
            actual_class_vals=self.actual_train_class_vals,
            split="TRAIN",
        )

    if write_test:
        test_probs = self.ee_test_dists
        test_preds = self.classes_[np.argmax(test_probs, axis=1)]
        acc = accuracy_score(self.actual_test_class_vals, test_preds)
        second = str(self.distance_measures)
        third = (
            str(acc)
            + ",NA,NA,-1,-1,"
            + str(len(self.classes_))
            + ","
            + str(self.classes_)
        )
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=output_results_path,
            classifier_name=output_classifier_name,
            resample_seed=self.resample_id,
            predicted_class_vals=test_preds,
            predicted_probs=test_probs,
            dataset_name=self.dataset_name,
            actual_class_vals=self.actual_test_class_vals,
            split="TEST",
        )
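
# write_files above composes result paths as
# <output_results_path>/<classifier_name>/Predictions/<dataset>/<split>Fold<resample_id>.csv.
# A small helper mirroring that layout (an illustrative sketch, not part of the
# original API) can be handy when checking for existing results before a run:
def _example_results_file_path(
    output_results_path, classifier_name, dataset_name, resample_id, split="test"
):
    """Hypothetical helper returning the fold file path used by write_files."""
    import os

    return os.path.join(
        str(output_results_path),
        str(classifier_name),
        "Predictions",
        str(dataset_name),
        f"{split}Fold{resample_id}.csv",
    )


# e.g. _example_results_file_path("results", "EE", "GunPoint", 0) returns
# "results/EE/Predictions/GunPoint/testFold0.csv"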