Example #1
def run_clustering_experiment(
    trainX,
    clusterer,
    results_path,
    trainY=None,
    testX=None,
    testY=None,
    cls_name=None,
    dataset_name=None,
    resample_id=0,
):
    """
    Run a clustering experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv. The
    clusterer is always trained on the input data trainX, and the output to
    trainFold<resampleID>.csv is the predicted clusters of trainX. If trainY is
    also passed, these labels are written to file. If the clusterer makes
    probabilistic predictions, these are also written to file. See
    write_results_to_uea_format for more on the output. Be warned: this method
    will always overwrite existing results; check before calling, or use
    load_and_run_clustering_experiment instead.

    Parameters
    ----------
    trainX : pd.DataFrame or np.array
        The data to cluster.
    clusterer : BaseClusterer
        The clustering object
    results_path : str
        Where to write the results to
    trainY : np.array, default = None
        Train data true class labels, only used for file writing, ignored by the
        clusterer
    testX : pd.DataFrame or np.array, default = None
        Test attribute data, if present it is used for predicting testY
    testY : np.array, default = None
        Test data true class labels, only used for file writing, ignored by the
        clusterer
    cls_name : str, default = None
        Name of the clusterer, written to the results file, ignored if None
    dataset_name : str, default = None
        Name of problem, written to the results file, ignored if None
    resample_id : int, default = 0
        Resample identifier, defaults to 0

    """
    # Build the clusterer on train data, recording how long it takes
    start = int(round(time.time() * 1000))
    clusterer.fit(trainX)
    build_time = int(round(time.time() * 1000)) - start

    # Predict the clusters of the train data for the train results file
    train_preds = clusterer.predict(trainX)

    # Form predictions on testX, recording how long prediction takes
    start = int(round(time.time() * 1000))
    preds = clusterer.predict(testX)
    test_time = int(round(time.time() * 1000)) - start

    # str.replace returns a new string, so the result must be reassigned
    second = str(clusterer.get_params())
    second = second.replace("\n", " ").replace("\r", " ")
    # TODO: refactor clusterers to return an array
    pr = np.array(preds)
    third = "," + str(build_time) + "," + str(test_time) + ",-1,-1,"
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=pr,
        dataset_name=dataset_name,
        y_true=testY,
        split="TEST",
        full_path=False,
    )

    # Write the train cluster assignments to file
    if cls_name is not None and "Composite" in cls_name:
        second = "Para info too long!"
    else:
        second = str(clusterer.get_params())
    second = second.replace("\n", " ").replace("\r", " ")
    third = "FORMAT NOT FINALISED"
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=train_preds,
        dataset_name=dataset_name,
        y_true=trainY,
        split="TRAIN",
        full_path=False,
    )
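A minimal usage sketch for run_clustering_experiment, assuming it and
write_results_to_uea_format are importable from the surrounding experiments
module; the sklearn KMeans clusterer and the synthetic arrays here are
illustrative only, not part of the original code.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
trainX = rng.normal(size=(40, 10))   # 40 instances, 10 features each
testX = rng.normal(size=(20, 10))
testY = rng.integers(0, 2, size=20)  # true labels, used only when writing files

run_clustering_experiment(
    trainX,
    KMeans(n_clusters=2, n_init=10),
    results_path="results/",
    testX=testX,
    testY=testY,
    cls_name="kmeans",
    dataset_name="synthetic",
    resample_id=0,
)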
Example #2
def run_classification_experiment(
    trainX,
    trainY,
    testX,
    testY,
    classifier,
    results_path,
    cls_name="",
    dataset="",
    resample_id=0,
    train_file=False,
):
    """Run a classification experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.

    Parameters
    ----------
    trainX : pd.DataFrame or np.array
        The data to train the classifier.
    trainY : np.array
        Training data class labels.
    testX : pd.DataFrame or np.array
        The data used to test the trained classifier.
    testY : np.array
        Testing data class labels.
    classifier : BaseClassifier
        Classifier to be used in the experiment.
    results_path : str
        Location of where to write results. Any required directories will be created.
    cls_name : str, default=""
        Name of the classifier.
    dataset : str, default=""
        Name of problem.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from file is
        used. Also used in output file name.
    train_file : bool, default=False
        Whether to generate train files or not. If True, a 10-fold
        cross-validation is performed on the train data and the results are
        saved. If the classifier can produce its own estimates, those are used
        instead.
    """
    start = int(round(time.time() * 1000))
    classifier.fit(trainX, trainY)
    build_time = int(round(time.time() * 1000)) - start
    start = int(round(time.time() * 1000))
    probs = classifier.predict_proba(testX)
    preds = classifier.classes_[np.argmax(probs, axis=1)]
    test_time = int(round(time.time() * 1000)) - start
    ac = accuracy_score(testY, preds)
    if "Composite" in cls_name:
        second = "Para info too long!"
    else:
        second = str(classifier.get_params())
    # str.replace returns a new string, so reassign the result
    second = second.replace("\n", " ").replace("\r", " ")
    third = (str(ac) + "," + str(build_time) + "," + str(test_time) +
             ",-1,-1," + str(len(classifier.classes_)))
    write_results_to_uea_format(
        second_line=second,
        third_line=third,
        output_path=results_path,
        estimator_name=cls_name,
        resample_seed=resample_id,
        y_pred=preds,
        predicted_probs=probs,
        dataset_name=dataset,
        y_true=testY,
        split="TEST",
        full_path=False,
    )
    if train_file:
        start = int(round(time.time() * 1000))
        if hasattr(classifier, "_get_train_probs"):
            # normally only possible once the classifier has been built
            train_probs = classifier._get_train_probs(trainX)
        else:
            train_probs = cross_val_predict(
                classifier, X=trainX, y=trainY, cv=10, method="predict_proba"
            )
        train_time = int(round(time.time() * 1000)) - start
        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(trainY, train_preds)
        if "Composite" in cls_name:
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
        second = second.replace("\n", " ").replace("\r", " ")
        third = (str(train_acc) + "," + str(train_time) + ",-1,-1,-1," +
                 str(len(classifier.classes_)))
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=results_path,
            estimator_name=cls_name,
            resample_seed=resample_id,
            y_pred=train_preds,
            predicted_probs=train_probs,
            dataset_name=dataset,
            y_true=trainY,
            split="TRAIN",
            full_path=False,
        )
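A similarly hedged usage sketch for this version: any sklearn-style classifier
exposing fit, predict_proba, get_params and a classes_ attribute fits the
interface, and RandomForestClassifier here is an illustrative stand-in.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
trainX = rng.normal(size=(60, 10))
trainY = np.repeat([0, 1], 30)       # 30 instances per class
testX = rng.normal(size=(20, 10))
testY = np.repeat([0, 1], 10)

run_classification_experiment(
    trainX, trainY, testX, testY,
    RandomForestClassifier(n_estimators=50, random_state=0),
    results_path="results/",
    cls_name="randf",
    dataset="synthetic",
    resample_id=0,
    train_file=True,  # no _get_train_probs, so the 10-fold CV fallback runs
)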
Example #3
def run_classification_experiment(
    X_train,
    y_train,
    X_test,
    y_test,
    classifier,
    results_path,
    cls_name="",
    dataset="",
    resample_id=0,
    train_file=False,
    test_file=True,
):
    """Run a classification experiment and save the results to file.

    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.

    Parameters
    ----------
    X_train : pd.DataFrame or np.array
        The data to train the classifier.
    y_train : np.array
        Training data class labels.
    X_test : pd.DataFrame or np.array
        The data used to test the trained classifier.
    y_test : np.array
        Testing data class labels.
    classifier : BaseClassifier
        Classifier to be used in the experiment.
    results_path : str
        Location of where to write results. Any required directories will be created.
    cls_name : str, default=""
        Name of the classifier.
    dataset : str, default=""
        Name of problem.
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from file is
        used. Also used in output file name.
    train_file : bool, default=False
        Whether to generate train files or not. If True, a 10-fold
        cross-validation is performed on the train data and the results are
        saved. If the classifier can produce its own estimates, those are used
        instead.
    test_file : bool, default=True
        Whether to generate test files or not. If the classifier can generate
        its own train probabilities, the classifier will be built but no test
        file will be output.
    """
    if not test_file and not train_file:
        raise Exception(
            "Both test_file and train_file are set to False. "
            "At least one must be output."
        )

    classifier_train_probs = (
        train_file and getattr(classifier, "time_limit_in_minutes", None) is not None
    )
    build_time = -1

    if test_file or classifier_train_probs:
        start = int(round(time.time() * 1000))
        classifier.fit(X_train, y_train)
        build_time = int(round(time.time() * 1000)) - start

        if test_file:
            start = int(round(time.time() * 1000))
            probs = classifier.predict_proba(X_test)
            test_time = int(round(time.time() * 1000)) - start

            if "composite" in cls_name.lower():
                second = "Para info too long!"
            else:
                second = str(classifier.get_params())
            second = second.replace("\n", " ").replace("\r", " ")

            preds = classifier.classes_[np.argmax(probs, axis=1)]
            acc = accuracy_score(y_test, preds)
            third = (
                str(acc)
                + ","
                + str(build_time)
                + ","
                + str(test_time)
                + ",-1,-1,"
                + str(len(classifier.classes_))
                + ",,-1,-1"
            )

            write_results_to_uea_format(
                second_line=second,
                third_line=third,
                first_line_comment="PREDICTIONS,Generated by experiments.py on "
                + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
                + ".",
                timing_type="MILLISECONDS",
                output_path=results_path,
                estimator_name=cls_name,
                resample_seed=resample_id,
                y_pred=preds,
                predicted_probs=probs,
                dataset_name=dataset,
                y_true=y_test,
                split="TEST",
                full_path=False,
            )

    if train_file:
        start = int(round(time.time() * 1000))
        if classifier_train_probs:
            # normally only possible once the classifier has been built
            train_probs = classifier._get_train_probs(X_train, y_train)
        else:
            cv_size = 10
            _, counts = np.unique(y_train, return_counts=True)
            min_class = np.min(counts)
            if min_class < cv_size:
                cv_size = min_class

            train_probs = cross_val_predict(
                classifier, X=X_train, y=y_train, cv=cv_size, method="predict_proba"
            )
        train_time = int(round(time.time() * 1000)) - start

        if "composite" in cls_name.lower():
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
        second = second.replace("\n", " ").replace("\r", " ")

        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(y_train, train_preds)
        third = (
            str(train_acc)
            + ","
            + str(build_time)
            + ",-1,-1,-1,"
            + str(len(classifier.classes_))
            + ",,"
            + str(train_time)
            + ","
            + str(build_time + train_time)
        )

        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            first_line_comment="PREDICTIONS,Generated by experiments.py on "
            + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            + ".",
            timing_type="NANOSECONDS",
            output_path=results_path,
            estimator_name=cls_name,
            resample_seed=resample_id,
            y_pred=train_preds,
            predicted_probs=train_probs,
            dataset_name=dataset,
            y_true=y_train,
            split="TRAIN",
            full_path=False,
        )
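Before the write_files method below, a standalone sketch of the fold-count
clamp used in the cross-validation fallback above: the fold count is capped at
the size of the smallest class so that stratified splitting cannot fail. The
data here is illustrative only.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

y_train = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])  # smallest class has 2 members
X_train = np.random.default_rng(1).normal(size=(9, 4))

cv_size = 10
_, counts = np.unique(y_train, return_counts=True)
cv_size = min(cv_size, int(np.min(counts)))  # clamped from 10 down to 2

train_probs = cross_val_predict(
    RandomForestClassifier(n_estimators=10, random_state=0),
    X=X_train, y=y_train, cv=cv_size, method="predict_proba",
)
print(train_probs.shape)  # (9, 3): one probability per class per instance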
    def write_files(
        self,
        output_results_path,
        output_classifier_name="EE",
        write_train=True,
        write_test=True,
        overwrite=False,
    ):
        """
        Write the results to file.

        Probably could be replaced with data_io.write_results_UEA

        Parameters
        ----------
        output_results_path : str
            path to where output results will be written
        output_classifier_name : str
            the name of the composite ensemble classifier in the output files
        write_train : boolean
            true will write train files for the ensemble, false will skip training files
        write_test : boolean
            true will write test files for the ensemble, false will skip test files
        overwrite : boolean
            if True, any existing train/test files will be overwritten; if
            False, existing files are left untouched
        """
        if write_train is False and write_test is False:
            return

        if not overwrite:
            if write_train:
                full_path = (str(output_results_path) + "/" +
                             str(output_classifier_name) + "/Predictions/" +
                             str(self.dataset_name) + "/trainFold" +
                             str(self.resample_id) + ".csv")
                if os.path.exists(full_path):
                    print(
                        full_path +
                        " already exists and overwrite set to false, not writing Train"
                    )
                    write_train = False

            if write_test:
                full_path = (str(output_results_path) + "/" +
                             str(output_classifier_name) + "/Predictions/" +
                             str(self.dataset_name) + "/testFold" +
                             str(self.resample_id) + ".csv")
                if os.path.exists(full_path):
                    print(
                        full_path +
                        " already exists and overwrite set to false, not writing Test"
                    )
                    write_test = False

        if write_train is False and write_test is False:
            return
        if write_train:
            train_probs = self.ee_train_dists
            train_preds = self.classes_[np.argmax(train_probs, axis=1)]
            acc = accuracy_score(self.actual_train_class_vals, train_preds)
            second = str(self.distance_measures)
            third = (str(acc) + ",NA,NA,-1,-1," + str(len(self.classes_)) +
                     "," + str(self.classes_))
            write_results_to_uea_format(
                second_line=second,
                third_line=third,
                output_path=output_results_path,
                classifier_name=output_classifier_name,
                resample_seed=self.resample_id,
                predicted_class_vals=train_preds,
                predicted_probs=train_probs,
                dataset_name=self.dataset_name,
                actual_class_vals=self.actual_train_class_vals,
                split="TRAIN",
            )

        if write_test:
            test_probs = self.ee_test_dists
            test_preds = self.classes_[np.argmax(test_probs, axis=1)]
            acc = accuracy_score(self.actual_test_class_vals, test_preds)
            second = str(self.distance_measures)
            third = (str(acc) + ",NA,NA,-1,-1," + str(len(self.classes_)) +
                     "," + str(self.classes_))
            write_results_to_uea_format(
                second_line=second,
                third_line=third,
                output_path=output_results_path,
                classifier_name=output_classifier_name,
                resample_seed=self.resample_id,
                predicted_class_vals=test_preds,
                predicted_probs=test_probs,
                dataset_name=self.dataset_name,
                actual_class_vals=self.actual_test_class_vals,
                split="TEST",
            )
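The overwrite guard in write_files encodes the results layout
<output_results_path>/<classifier_name>/Predictions/<dataset_name>/
{train,test}Fold<resample_id>.csv. A small hypothetical helper, not part of
the code above, that captures the same convention:

import os

def fold_path(results_path, classifier_name, dataset_name, resample_id,
              split="test"):
    # Hypothetical helper mirroring the path convention used by the
    # overwrite checks in write_files above.
    return os.path.join(
        str(results_path), str(classifier_name), "Predictions",
        str(dataset_name), f"{split}Fold{resample_id}.csv",
    )

print(fold_path("results", "EE", "GunPoint", 0))
# results/EE/Predictions/GunPoint/testFold0.csv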