def load_and_run_classification_experiment(
    problem_path,
    results_path,
    cls_name,
    dataset,
    classifier=None,
    resample_id=0,
    overwrite=False,
    build_train=False,
    predefined_resample=False,
):
    """Load a dataset and run a classification experiment.

    Loads the train and test splits of ``dataset`` from ``.ts`` files,
    optionally resamples them, and delegates to
    ``run_classification_experiment``. Unless ``overwrite`` is set, the
    experiment is skipped when the result files
    ``<results_path>/<cls_name>/Predictions/<dataset>/testResample<resample_id>.csv``
    and, if requested, ``trainResample<resample_id>.csv`` already exist.

    Parameters
    ----------
    problem_path : str
        Location of problem files, full path. A trailing path separator is
        optional.
    results_path : str
        Location of where to write results. Any required directories will be
        created.
    cls_name : str
        Determines which classifier to use, as defined in set_classifier.
        Also names the results sub-directory.
    dataset : str
        Name of problem. Files must be
        <problem_path>/<dataset>/<dataset>+"_TRAIN.ts", same for "_TEST".
    classifier : BaseClassifier, default=None
        Classifier to be used in the experiment. If none is provided, one is
        selected with ``set_classifier(cls_name, resample_id, build_train)``,
        i.e. using resample_id as a seed.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from
        file is used. Also used in the result file names.
    overwrite : bool, default=False
        If set to False, this will only build results if there is not a
        result file already present. If True, it will overwrite anything
        already there.
    build_train : bool, default=False
        Whether to generate train files or not. If true, it performs a
        10-fold cross-validation on the train data and saves. If the
        classifier can produce its own estimates, those are used instead.
    predefined_resample : bool, default=False
        Read a predefined resample from file instead of performing a
        resample. If True the file format must include the resample_id at the
        end of the dataset name, i.e.
        <problem_path>/<dataset>/<dataset>+<resample_id>+"_TRAIN.ts".

    Returns
    -------
    None
        Returns early, without loading any data, when every requested result
        file already exists and ``overwrite`` is False.

    Notes
    -----
    This assumes predict_proba is implemented, to avoid predicting twice. May
    break some classifiers though.
    """
    # Work out which result files still have to be produced. The results path
    # is deliberately built with "/" exactly as before so that the check keeps
    # matching the files written by earlier runs.
    build_test = True
    if not overwrite:
        predictions_dir = f"{results_path}/{cls_name}/Predictions/{dataset}"
        if os.path.exists(f"{predictions_dir}/testResample{resample_id}.csv"):
            build_test = False
        if build_train and os.path.exists(
            f"{predictions_dir}/trainResample{resample_id}.csv"
        ):
            build_train = False
        if not build_train and not build_test:
            # Everything requested is already on disk: nothing to do.
            return

    # os.path.join inserts the separator after problem_path only when it is
    # missing, so both "data" and "data/" resolve to the documented layout.
    dataset_dir = os.path.join(problem_path, dataset)
    if predefined_resample:
        # The resample was prepared offline; its id is part of the file name.
        file_stem = dataset + str(resample_id)
    else:
        file_stem = dataset
    X_train, y_train = load_ts(os.path.join(dataset_dir, file_stem + "_TRAIN.ts"))
    X_test, y_test = load_ts(os.path.join(dataset_dir, file_stem + "_TEST.ts"))

    # Only resample the default split; a predefined resample is used as-is.
    if not predefined_resample and resample_id != 0:
        X_train, y_train, X_test, y_test = stratified_resample(
            X_train, y_train, X_test, y_test, resample_id
        )

    if classifier is None:
        classifier = set_classifier(cls_name, resample_id, build_train)

    run_classification_experiment(
        X_train,
        y_train,
        X_test,
        y_test,
        classifier,
        results_path,
        cls_name=cls_name,
        dataset=dataset,
        resample_id=resample_id,
        train_file=build_train,
        test_file=build_test,
    )
def load_and_run_clustering_experiment(
    problem_path,
    results_path,
    cls_name,
    dataset,
    clusterer=None,
    resample_id=0,
    overwrite=False,
    format=".ts",
    train_file=False,
):
    """Load a dataset and run a clustering experiment.

    Loads the train and test splits of ``dataset`` from file, optionally
    resamples them, label-encodes the class values (the encoder is fitted on
    the train labels) and delegates to ``run_clustering_experiment``. Unless
    ``overwrite`` is set, the experiment is skipped when the result files
    ``<results_path>/<cls_name>/Predictions/<dataset>/testResample<resample_id>.csv``
    and, if ``train_file`` is set, ``trainResample<resample_id>.csv`` already
    exist.

    Parameters
    ----------
    problem_path : str
        Location of problem files, full path. A trailing path separator is
        optional.
    results_path : str
        Location of where to write results. Any required directories will be
        created.
    cls_name : str
        Determines which clusterer to use if clusterer is None. In this case,
        set_clusterer is called with this cls_name. Also names the results
        sub-directory.
    dataset : str
        Name of problem. Files must be <problem_path>/<dataset>/<dataset>+
        "_TRAIN"+format, same for "_TEST".
    clusterer : object, default=None
        Clusterer to be used in the experiment. If none is provided, one is
        selected with ``set_clusterer(cls_name, resample_id)``.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from
        file is used. Also used in the name of the result files that are
        checked for.
    overwrite : bool, default=False
        If False, this will only build results if there is not a result file
        already present. If True, it will overwrite anything already there.
    format : str, default=".ts"
        File extension appended to the data file names. The files are always
        read with ``load_ts``, so currently only ".ts" works.
    train_file : bool, default=False
        Whether train results are wanted. In this function it only influences
        the check for already existing results.

    Returns
    -------
    None
        Returns early, without loading any data, when every requested result
        file already exists and ``overwrite`` is False.
    """
    # build_test must be bound before the "nothing to do" check below; it was
    # previously only assigned when the test result file already existed,
    # which raised UnboundLocalError on a first run.
    build_test = True
    if not overwrite:
        # Built with "/" exactly as before so the check keeps matching the
        # files written by earlier runs.
        predictions_dir = f"{results_path}/{cls_name}/Predictions/{dataset}"
        if os.path.exists(f"{predictions_dir}/testResample{resample_id}.csv"):
            build_test = False
        if train_file and os.path.exists(
            f"{predictions_dir}/trainResample{resample_id}.csv"
        ):
            train_file = False
        if not train_file and not build_test:
            # Everything requested is already on disk: nothing to do.
            return

    # currently only works with .ts
    # os.path.join inserts the separator after problem_path only when it is
    # missing, so both "data" and "data/" resolve to the documented layout.
    dataset_dir = os.path.join(problem_path, dataset)
    trainX, trainY = load_ts(os.path.join(dataset_dir, dataset + "_TRAIN" + format))
    testX, testY = load_ts(os.path.join(dataset_dir, dataset + "_TEST" + format))
    if resample_id != 0:
        trainX, trainY, testX, testY = stratified_resample(
            trainX, trainY, testX, testY, resample_id
        )

    # Map class values to integers; the mapping is learned from the train
    # labels only and then applied to both splits.
    le = preprocessing.LabelEncoder()
    le.fit(trainY)
    trainY = le.transform(trainY)
    testY = le.transform(testY)

    if clusterer is None:
        clusterer = set_clusterer(cls_name, resample_id)

    # NOTE(review): resample_id and train_file are not forwarded here, so the
    # callee cannot know which resample it is writing results for. Confirm
    # against run_clustering_experiment's signature whether they should be.
    run_clustering_experiment(
        trainX,
        clusterer,
        trainY=trainY,
        testX=testX,
        testY=testY,
        cls_name=cls_name,
        dataset_name=dataset,
        results_path=results_path,
    )