Example #1
    def __init__(self, km, fh):
        self.utils_cl = MyUtils()
        self.km = km
        self.fh = fh
        self.cm = ClusteringMachine()

        logging.info("Wrappers instantiated")
Example #2
    def __init__(self):
        logging.info(pk.getKhiopsInfo())
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))

        # path mgmt
        # use timestamp of each exec in paths
        self.fh = FileHelper()
        self.dictionary_file = os.path.join(self.this_file_dir, "dic",
                                            "series.kdic")
        self.classif_res = os.path.join(self.this_file_dir, "res",
                                        "khiops_res", "classif")
        self.coclus_res = os.path.join(self.this_file_dir, "res", "khiops_res",
                                       "coclus")
        self.pred_res = os.path.join(self.this_file_dir, "res", "khiops_res",
                                     "pred_res")

        self.fh.ensure_dirs_exist([
            self.dictionary_file, self.classif_res, self.coclus_res,
            self.pred_res
        ])

        self.ccr = CoclusteringResults()
        self.utils = MyUtils()
        logging.info("Khiops manager instantiated")
        logging.info("dictionary_file used: %s", self.dictionary_file)
Example #3
    def __init__(self, file_name):
        self.utils_cl = MyUtils()
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))
        self.fh = FileHelper()
        self.hm = HmmMachine()
        self.cm = ClusteringMachine()
        self.pm = PredictMachine()
        self.my_metric = "euclidean"
        self.file_name = file_name
        self.out_fcasts_f = os.path.join(self.this_file_dir, "res", "fcasts", file_name, "ecml")
        self.fh.ensure_dirs_exist([self.out_fcasts_f])
        logging.info("Instantiated ECML_operator")
Example #4
    }, requests.codes.bad, "error"),
    ({
        "email": "peter@klaven"
    }, requests.codes.bad, "error"),
    ({
        "email": "1",
        "password": "******"
    }, requests.codes.not_found, "error"),
    ({}, requests.codes.bad, "error"),
]

ids_list = [
    "successful", "w/o email", "w/o password", "invalid email/password",
    "w/o parameters"
]
u = MyUtils()


@pytest.fixture(scope='function', params=param_list, ids=ids_list)
def param_test(request):
    return request.param


class TestClass():

    login_request = "api/login"
    email = "peter@klaven"
    password = "******"

    @allure.feature("Login")
    @allure.testcase("Login")
Example #5
class Wrappers:
    def __init__(self, km, fh):
        self.utils_cl = MyUtils()
        self.km = km
        self.fh = fh
        self.cm = ClusteringMachine()

        logging.info("Wrappers instantiated")

    def launch_preds(
            self,
            data_fmt_clustering_train,
            data_fmt_clustering_valid_or_test,  # Train and test formatted to be fed into the classifier
            days_train,
            days_valid_or_test,
            l_ref,  # Days of train and test in a list
            cluster_and_values_train,  # To compute centroids
            nb_cluster,
            mean_day,
            fit,
            classifier):
        """
        Utility method used to launch predictions. This wrapper takes input
        from clustering or coclustering algorithms and generates the associated
        prediction MSEs.

        This method works both for clustering and coclustering group creation.

        ARGS:
        * data_fmt_clustering_train: Formatted data, ready to be fed into a
        classifier
        * data_fmt_clustering_valid_or_test: Same, but with only the last 20% remaining
        * days_train: all days present in train
        * days_valid_or_test: all days present in test
        * cluster_and_values_train: dic(cluster_num => values_associated)
        * nb_cluster: how many clusters for this pass
        * mean_day: mean day on train only
        * fit: classifier used
        * classifier: string label, which classifier are we using?
        """
        # Separate target from data
        train_full = data_fmt_clustering_train.loc[
            :, data_fmt_clustering_train.columns != 'y'].sort_index()
        train_target = data_fmt_clustering_train.loc[:, "y"].sort_index()

        test_full = data_fmt_clustering_valid_or_test.loc[
            :, data_fmt_clustering_valid_or_test.columns != 'y'].sort_index()
        test_target = data_fmt_clustering_valid_or_test.loc[:, "y"].sort_index()

        # Fit the classifier on the training data
        fit.fit(train_full, train_target)

        res = {}

        # Retrieve cluster names (we do not number them; we reference them by
        # their given name)
        clusts_names = cluster_and_values_train.keys()

        (centroids, e) = self.km.compute_centroids(cluster_and_values_train,
                                                   days_train, l_ref)

        # NOTE: the lines below are indeed strange... but we had to follow another API we had developed earlier.
        # That is why we format the data that way.
        y_pred = pd.DataFrame(fit.predict_proba(test_full),
                              columns=fit.classes_)
        y_pred["Predictedy"] = y_pred.idxmax(axis=1)
        y_pred.columns = ["Proby" + str(col) for col in y_pred.columns]
        y_pred.rename(columns={'ProbyPredictedy': 'Predictedy'}, inplace=True)

        # 1. w/ MODL & probabilistic prevision
        res[False] = self.km.process_pred_proba(y_pred, test_target,
                                                clusts_names, centroids,
                                                days_valid_or_test, nb_cluster,
                                                l_ref, mean_day)

        # 2. Oracle
        y_pred_or = y_pred.copy().reset_index(drop=True)
        y_pred_or["Predictedy"] = test_target.reset_index(drop=True)

        res[True] = self.km.process_pred_proba(y_pred_or,
                                               test_target,
                                               clusts_names,
                                               centroids,
                                               days_valid_or_test,
                                               nb_cluster,
                                               l_ref,
                                               mean_day,
                                               o=True)

        return (e, res)

    def simplify_coclus(self, n_clus_found, n_clus_target, mpi, mcn,
                        file_name_train, path_file_test, file_name_root,
                        wlabel):
        """
        Ad-hoc method to simplify a Khiops coclustering json file.
        """
        # loop configuration
        to_add = 5
        to_remove = 2
        mcn_init_ok = False

        # First, try to get the right number of clusters!
        logging.info("Keep %s percent of info", mpi)
        logging.info("Desired nb of cluster is %s", n_clus_target)

        identifier = mpi

        # Loop until we reach the desired number of clusters
        all_mpis = []
        while n_clus_found != n_clus_target:
            if mcn == 1 and len(all_mpis) < 20:
                logging.info("Simplifying using MPI")
                simplified_file = self.km.simplify_coclustering(
                    file_name_train, mpi=mpi)
                n_clus_found = self.km.get_cluster_number(file_name_train,
                                                          ref_id=mpi)
                logging.info(
                    "Found %s clusters in simplified coclus file (expected %s)",
                    n_clus_found, n_clus_target)
                all_mpis.append(mpi)
                if n_clus_found > n_clus_target:
                    self.fh.rm_simplified_outputs(file_name_train, mpi)
                    mpi = mpi - to_remove
                    logging.info(
                        "MPI was apparently too big, decreasing it by %s: mpi = %s",
                        to_remove, mpi)
                    if mpi in all_mpis:
                        to_remove = to_remove / 2
                        to_add = to_add / 2
                elif n_clus_found < n_clus_target:
                    self.fh.rm_simplified_outputs(file_name_train, mpi)
                    mpi = mpi + to_add
                    logging.info(
                        "MPI was apparently too small, increasing it by %s: mpi = %s",
                        to_add, mpi)
                identifier = mpi
            else:
                logging.info("Now simplifying using MCN")

                # Init new loop: use the cell number now, because it is more
                # precise than the MPI used so far
                if not mcn_init_ok:
                    self.km.simplify_coclustering(file_name_train, mpi=mpi)
                    mcn = self.km.get_cells_number(file_name_train, mpi)
                    to_add = math.floor(mcn * 0.2437)  # empirical coefficient
                    to_remove = math.floor(mcn * 0.097)
                    mcn = mcn + to_add
                    # once initialised, we do not want to re-run the block above
                    mcn_init_ok = True
                simplified_file = self.km.simplify_coclustering(
                    file_name_train, mcn=mcn)
                n_clus_found = self.km.get_cluster_number(file_name_train,
                                                          ref_id=mcn)
                logging.info(
                    "Found %s clusters in simplified coclus file (expected %s)",
                    n_clus_found, n_clus_target)
                if n_clus_found > n_clus_target:
                    self.fh.rm_simplified_outputs(file_name_train, mcn)
                    mcn = mcn - to_remove
                    logging.info(
                        "MCN was apparently too big, decreasing it by %s: mcn = %s",
                        to_remove, mcn)
                elif n_clus_found < n_clus_target:
                    self.fh.rm_simplified_outputs(file_name_train, mcn)
                    mcn = mcn + to_add
                    logging.info(
                        "MCN was apparently too small, increasing it by %s: mcn = %s",
                        to_add, mcn)
                identifier = mcn

        # Then, manage khiops files
        _, cluster_and_values_train = self.km.get_clusters(simplified_file)
        # Write labels of the test file for the future MODL transfer
        path_test_labels = self.fh.write_labels(path_file_test, identifier)
        # Write cluster and values on disk to compare clustering results
        self.fh.write_cav_on_disk(cluster_and_values_train, file_name_root,
                                  n_clus_found, wlabel)

        # Deploy dic and use the deployed dic to transfer (transfer = actually
        # apply the MODL model that was created)
        path_deployed_dic = self.km.deploy_coclustering(
            file_name_train, identifier)
        path_transfered = self.km.transfer_database(path_deployed_dic,
                                                    path_test_labels,
                                                    path_file_test, identifier)

        # Retrieve clusters attribution from khiops use of MODL
        cluster_and_values_test = self.km.get_clusters_from_dep(
            path_transfered)

        return ((mpi, mcn, n_clus_found), (cluster_and_values_train,
                                           cluster_and_values_test),
                simplified_file)

    def modl_wrap(self, df_train, df_valid_or_test, l_ref, mean_day, fit,
                  classifier, cluster_and_values_train,
                  cluster_and_values_valid_or_test, n_clus_found):

        # Create datasets before training a classifier
        data_fmt_train = self.utils_cl.format_data_for_classifier(
            df_train, cluster_and_values_train, "days_train")
        data_fmt_valid_or_test = self.utils_cl.format_data_for_classifier(
            df_valid_or_test, cluster_and_values_valid_or_test,
            "days_valid_or_test")

        (_, res) = self.launch_preds(data_fmt_train, data_fmt_valid_or_test,
                                     df_train, df_valid_or_test, l_ref,
                                     cluster_and_values_train, n_clus_found,
                                     mean_day, fit, classifier)

        # Extract results
        # For the simple algo (no oracle)
        (mses_non_proba, mses_proba, mean_mse_mean_day) = res[False][0]
        (mae_non_proba, mae_proba, mean_mae_mean_day) = res[False][1]
        (mase_non_proba, mase_proba, mean_mase_mean_day) = res[False][2]
        classifier_acc = res[False][3]
        (std_mse_non_proba, std_mse_proba, std_mse_mean_day) = res[False][4]
        (std_mae_non_proba, std_mae_proba, std_mae_mean_day) = res[False][5]
        (std_mase_non_proba, std_mase_proba, std_mase_mean_day) = res[False][6]

        # Then for the oracle algo
        (mse_non_proba_or, _, _) = res[True][0]
        (mae_non_proba_or, _, _) = res[True][1]
        (mase_non_proba_or, _, _) = res[True][2]
        (std_mse_non_proba_or, _, _) = res[True][4]
        (std_mae_non_proba_or, _, _) = res[True][5]
        (std_mase_non_proba_or, _, _) = res[True][6]

        return (((mses_non_proba, mses_proba, mae_non_proba, mae_proba,
                  mase_non_proba, mase_proba),
                 (mse_non_proba_or, mae_non_proba_or,
                  mase_non_proba_or), (mean_mse_mean_day, mean_mae_mean_day,
                                       mean_mase_mean_day), classifier_acc,
                 (std_mse_non_proba, std_mse_proba, std_mse_mean_day,
                  std_mae_non_proba, std_mae_proba, std_mae_mean_day,
                  std_mase_non_proba, std_mase_proba, std_mase_mean_day),
                 (std_mse_non_proba_or, std_mae_non_proba_or,
                  std_mase_non_proba_or)))

    def clustering_wrapper(self, func_and_label, days_train,
                           days_valid_or_test, mean_day_train, l_ref,
                           file_name, nb_cluster, fit, classifier):
        func, wlabel = func_and_label
        logging.info(
            "Clustering wrapper: process %s predictions for file %s with %s classifier",
            wlabel, file_name, classifier)

        # TRAIN: train clustering model
        (clustering, clusters_train) = func(days_train, nb_cluster)
        cc_train = self.cm.create_c_a_v(days_train, clusters_train)

        # TEST: apply the clustering model previously trained
        clusters_test = self.cm.apply_clustering(days_valid_or_test,
                                                 clustering)
        cc_valid_or_test = self.cm.create_c_a_v(days_valid_or_test,
                                                clusters_test)

        cav_clustering_train, _ = self.cm.format_clust_results(
            nb_cluster, cc_train)
        self.fh.write_cav_on_disk(cav_clustering_train, file_name, nb_cluster,
                                  wlabel)
        cav_clustering_valid_or_test, _ = self.cm.format_clust_results(
            nb_cluster, cc_valid_or_test)

        data_fmt_clustering_train = self.utils_cl.format_data_for_classifier(
            days_train, cav_clustering_train, "days_train")
        data_fmt_clustering_valid_or_test = self.utils_cl.format_data_for_classifier(
            days_valid_or_test, cav_clustering_valid_or_test,
            "days_valid_or_test")

        (empty_cluster_found, res) = self.launch_preds(
            data_fmt_clustering_train, data_fmt_clustering_valid_or_test,
            days_train, days_valid_or_test, l_ref, cav_clustering_train,
            nb_cluster, mean_day_train, fit, classifier)

        # Then for the simple algo (no oracle)
        (mses_non_proba, mses_proba, mean_mse_mean_day) = res[False][0]
        (mae_non_proba, mae_proba, mean_mae_mean_day) = res[False][1]
        (mase_non_proba, mase_proba, mean_mase_mean_day) = res[False][2]
        classifier_acc = res[False][3]
        (std_mse_non_proba, std_mse_proba, std_mse_mean_day) = res[False][4]
        (std_mae_non_proba, std_mae_proba, std_mae_mean_day) = res[False][5]
        (std_mase_non_proba, std_mase_proba, std_mase_mean_day) = res[False][6]

        # Then for the oracle algo
        (mse_non_proba_or, _, _) = res[True][0]
        (mae_non_proba_or, _, _) = res[True][1]
        (mase_non_proba_or, _, _) = res[True][2]
        (std_mse_non_proba_or, _, _) = res[True][4]
        (std_mae_non_proba_or, _, _) = res[True][5]
        (std_mase_non_proba_or, _, _) = res[True][6]

        return (empty_cluster_found,
                ((mses_non_proba, mses_proba, mae_non_proba, mae_proba,
                  mase_non_proba, mase_proba),
                 (mse_non_proba_or, mae_non_proba_or,
                  mase_non_proba_or), (mean_mse_mean_day, mean_mae_mean_day,
                                       mean_mase_mean_day), classifier_acc,
                 (std_mse_non_proba, std_mse_proba, std_mse_mean_day,
                  std_mae_non_proba, std_mae_proba, std_mae_mean_day,
                  std_mase_non_proba, std_mase_proba, std_mase_mean_day),
                 (std_mse_non_proba_or, std_mae_non_proba_or,
                  std_mase_non_proba_or)))
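The column gymnastics around predict_proba in launch_preds above (build a probability matrix, take the row-wise argmax as the predicted class, then prefix the columns) is easier to follow on a toy example. The cluster names and probabilities below are made up; the reshaping itself mirrors the original lines:

import pandas as pd

# Toy probability matrix, shaped like fit.predict_proba(test_full): one column
# per cluster label (fit.classes_), one row per test day. Names are made up.
proba = [[0.7, 0.2, 0.1],
         [0.1, 0.3, 0.6]]
classes = ["A10", "B4", "C7"]

y_pred = pd.DataFrame(proba, columns=classes)
y_pred["Predictedy"] = y_pred.idxmax(axis=1)   # most probable cluster per row
y_pred.columns = ["Proby" + str(col) for col in y_pred.columns]
y_pred.rename(columns={'ProbyPredictedy': 'Predictedy'}, inplace=True)

# y_pred now exposes columns ProbyA10, ProbyB4, ProbyC7 and Predictedy,
# with Predictedy equal to "A10" for the first row and "C7" for the second.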
Example #6
class ECMLMachine:

    def __init__(self, file_name):
        self.utils_cl = MyUtils()
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))
        self.fh = FileHelper()
        self.hm = HmmMachine()
        self.cm = ClusteringMachine()
        self.pm = PredictMachine()
        self.my_metric = "euclidean"
        self.file_name = file_name
        self.out_fcasts_f = os.path.join(self.this_file_dir, "res", "fcasts", file_name, "ecml")
        self.fh.ensure_dirs_exist([self.out_fcasts_f])
        logging.info("Instantiated ECML_operator")

    def get_results_univ(self, df, mean_day, n_pt_one_period, n_serie_concatenated=1):
        # Create datasets in the format we need
        logging.info("Computing ECML results")
        logging.info("Find best number of cluster")
        (df_train, df_valid, df_test)     = self.utils_cl.app_valid_test(df, n_pt_one_period)
        (df_train_compar, df_test_compar) = self.utils_cl.app_test(df, n_pt_one_period)
        
        last_day = df_train_compar["val_"][-len(df_test_compar):]

        # run the algorithms on df_valid to find the best k-means size to use
        mean_mse = self.do_it(
            [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 60, 70, 80, 100, 150, 200],
            df_train, df_valid, mean_day,
            n_pt_one_period)

        best_num_of_kmean = min(mean_mse, key=lambda t: t[1])[0]
        logging.info("Found best number of cluster: %s", best_num_of_kmean)
        # run algos using df_test with previously found best kmean
        res = self.do_it(
            [best_num_of_kmean], 
            df_train, df_test, mean_day,
            n_pt_one_period)

        # Retrieve results
        (mse_colin, mse_fake, mse_mean) =    (res[0][1], res[0][2], res[0][3])
        (mae_colin, mae_fake, mae_mean) =    (res[0][4], res[0][5], res[0][6])
        (mase_colin, mase_fake, mase_mean) = (res[0][7], res[0][8], res[0][9])

        (std_mse_colin, std_mse_fake, std_mse_mean) =    (res[0][10], res[0][11], res[0][12])
        (std_mae_colin, std_mae_fake, std_mae_mean) =    (res[0][13], res[0][14], res[0][15])
        (std_mase_colin, std_mase_fake, std_mase_mean) = (res[0][16], res[0][17], res[0][18])
        logging.info("Retrieve results")

        # Compute baselines
        logging.info("Compute baselines")
        (_, mse_ar_error, mae_ar_error, mase_ar_error)  = self.pm.do_forecast_ar_model(
            last_day, df_train_compar["val_"], df_test_compar["val_"])
        (mse_hw, mae_hw, mase_hw) = -1, -1, -1  # placeholder values; this baseline is not computed here

        return (
            mse_colin, mse_fake, mse_mean, 
            mae_colin, mae_fake, mae_mean,
            mase_colin, mase_fake, mase_mean,
            mse_ar_error, mae_ar_error, mase_ar_error, 
            mse_hw, mae_hw, mase_hw,
            best_num_of_kmean,
            std_mse_colin, std_mse_fake, std_mse_mean,
            std_mae_colin, std_mae_fake, std_mae_mean, 
            std_mase_colin, std_mase_fake, std_mase_mean
        )
        
    def do_it(self, 
              km_sizes, 
              df_train, df_valid_ou_test, mean_day,
              n_pt_one_period, n_serie_concatenated = 1):
        mean_mse = []
        for my_km_size in km_sizes:
            logging.info("ECML with km_size of %s", my_km_size)
            n_day_found_train = len(df_train["n_day_"].unique())
            logging.debug("There are %s days in train", n_day_found_train)

            #################################
            # I. TRAIN 
            #################################
            (km, comprehensive_clusters_df_train) = self.cm.do_kmeans_wrapper(
                df_train,
                km_size = my_km_size
            )

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # MM
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # compute MMs
            raw_hmm_data_df_train = list(
                self.hm.compute_raw_hmm(comprehensive_clusters_df_train, order = 1))
            # compute transition matrix
            transition_mat = self.hm.compute_hmm_transition_mat_1d(
                raw_hmm_data_df_train, my_km_size)

            #################################
            # II. VALID
            #################################
            # 1. apply km on df_valid data
            y_pred = self.cm.apply_clustering(df_valid_ou_test, km)

            # create tuple of known/wanted:
            # a) for ts data itself
            s_arr=[]
            for c in range(df_valid_ou_test["n_day_"].min() + 1, df_valid_ou_test["n_day_"].max() - 1):
                s_arr.append(
                    (
                        df_valid_ou_test[df_valid_ou_test["n_day_"] == c - 1]["val_"].values, 
                        df_valid_ou_test[df_valid_ou_test["n_day_"] == c]["val_"].values,
                        df_valid_ou_test[df_valid_ou_test["n_day_"] <= c]["val_"].values
                    )
                )
            
            # b) for ts labels
            s_l_arr=[]
            for c in range(1, len(y_pred) - 1):
                s_l_arr.append((y_pred[c - 1], y_pred[c]))

            precision_colin = [] 
            precision_fake  = []
            precision_mean  = []

            precision_colin_mae = [] 
            precision_fake_mae  = []
            precision_mean_mae  = []

            precision_colin_mase = [] 
            precision_fake_mase  = []
            precision_mean_mase  = []

            # compute predictions and mse
            for count in range(0, len(s_arr)):
                path_count = os.path.join(self.out_fcasts_f, str(count) + "_fcast_" + str(my_km_size) + "_kmsize.csv") 
                (known, guess, before_w)     = s_arr[count]
                known_w = np.pad(known, (0,len(before_w) - len(known)), "constant", constant_values=-42.42)
                guess_w = np.pad(guess, (0,len(before_w) - len(guess)), "constant", constant_values=-42.42)
                (known_l, guess_l) = s_l_arr[count]                            

                pd.DataFrame({"known": known_w[:], "guess": guess_w[:], "before": before_w[:]}) .to_csv(path_count,  sep = ";", index_label = False, index = False)

                pred = self.pm.predict_median_hmm(known_l, transition_mat, km)
                pred_fake = self.pm.predict_median_hmm(
                    known_l, transition_mat, km, 
                    real_class_of_following_day = guess_l) 
                
                precision_colin.append(self.utils_cl.compute_mse(guess, pred))
                precision_fake .append(self.utils_cl.compute_mse(guess, pred_fake))
                precision_mean .append(self.utils_cl.compute_mse(guess, mean_day))

                precision_colin_mae.append(self.utils_cl.compute_mae(guess, pred))
                precision_fake_mae .append(self.utils_cl.compute_mae(guess, pred_fake))
                precision_mean_mae .append(self.utils_cl.compute_mae(guess, mean_day))

                precision_colin_mase.append(self.utils_cl.compute_mase(known, guess, pred))
                precision_fake_mase .append(self.utils_cl.compute_mase(known, guess, pred_fake))
                precision_mean_mase .append(self.utils_cl.compute_mase(known, guess, mean_day))                        

            mean_mse.append(
                (
                    my_km_size, np.mean(precision_colin), np.mean(precision_fake), np.mean(precision_mean),
                    np.mean(precision_colin_mae), np.mean(precision_fake_mae), np.mean(precision_mean_mae),
                    np.mean(precision_colin_mase), np.mean(precision_fake_mase),np.mean(precision_mean_mase),
                    np.std(precision_colin), np.std(precision_fake), np.std(precision_mean),
                    np.std(precision_colin_mae), np.std(precision_fake_mae), np.std(precision_mean_mae),
                    np.std(precision_colin_mase), np.std(precision_fake_mase), np.std(precision_mean_mase)
                )
            )
        return mean_mse
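compute_mse, compute_mae and compute_mase live in MyUtils and are not shown on this page. Given how they are called above (compute_mase additionally receives the previous day, which suggests the naive previous-day forecast is the scaling denominator), a plausible sketch is the following; treat it as an assumption about MyUtils, not its actual code:

import numpy as np


def compute_mse(actual, predicted):
    actual, predicted = np.asarray(actual, dtype=float), np.asarray(predicted, dtype=float)
    return float(np.mean((actual - predicted) ** 2))


def compute_mae(actual, predicted):
    actual, predicted = np.asarray(actual, dtype=float), np.asarray(predicted, dtype=float)
    return float(np.mean(np.abs(actual - predicted)))


def compute_mase(previous_day, actual, predicted):
    # Assumption: the scaling denominator is the error of the naive forecast
    # that simply repeats the previous day, which matches the
    # compute_mase(known, guess, pred) call sites above.
    previous_day = np.asarray(previous_day, dtype=float)
    actual, predicted = np.asarray(actual, dtype=float), np.asarray(predicted, dtype=float)
    naive_mae = np.mean(np.abs(actual - previous_day))
    return float(np.mean(np.abs(actual - predicted)) / naive_mae) if naive_mae else float("inf")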
Example #7
# A bunch of clean-ups for empty files
from file_helper import FileHelper
fh = FileHelper(tstmp)
fh.clean_zips_folder()
fh.clean_res_folder(out_dir_res)
fh.ensure_dirs_exist([out_path_res, in_dir, out_dir_res, out_dir_khiops, out_dir_fcasts])

# After cleaning, zip the code which is executed now
fh.zip_code()

# Init objects
from khiops import KhiopsManager
km    = KhiopsManager()

from my_utils import MyUtils
utils = MyUtils()

from ecml_machine import ECMLMachine

from clustering_machine import ClusteringMachine
cm    = ClusteringMachine()

from wrappers import Wrappers
wrapper = Wrappers(km, fh)

# Read configuration file: which input data to use?
import json
with open(os.path.join(this_file_dir, "conf", "conf_lite.json"), 'r') as f:
    confs = json.load(f)

# Init res files
Example #8
class KhiopsManager:
    def __init__(self):
        logging.info(pk.getKhiopsInfo())
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))

        # path mgmt
        # use timestamp of each exec in paths
        self.fh = FileHelper()
        self.dictionary_file = os.path.join(self.this_file_dir, "dic",
                                            "series.kdic")
        self.classif_res = os.path.join(self.this_file_dir, "res",
                                        "khiops_res", "classif")
        self.coclus_res = os.path.join(self.this_file_dir, "res", "khiops_res",
                                       "coclus")
        self.pred_res = os.path.join(self.this_file_dir, "res", "khiops_res",
                                     "pred_res")

        self.fh.ensure_dirs_exist([
            self.dictionary_file, self.classif_res, self.coclus_res,
            self.pred_res
        ])

        self.ccr = CoclusteringResults()
        self.utils = MyUtils()
        logging.info("Khiops manager instantiated")
        logging.info("dictionary_file used: %s", self.dictionary_file)

    """KHIOPS COCLUSTERING TRAIN AND SIMPLIFICATIONS"""

    def train_coclustering(self, f):
        """
        Train a coclustering model in the simplest way possible
        """
        file_name = self.fh.get_file_name(f)
        logging.info("Train of coclustering for file %s", file_name)

        # Train coclustering model for variables "time_", "n_day_" and "val_"
        pk.trainCoclustering(dictionaryFile=self.dictionary_file,
                             dictionary="train",
                             dataTable=f,
                             coclusteringVariables=["time_", "n_day_", "val_"],
                             resultsDir=self.coclus_res,
                             fieldSeparator=";",
                             samplePercentage=100,
                             resultsPrefix=file_name + "_")

    def simplify_coclustering(self, file_name, mpi=100, mcn=999999):
        """
        Simplify a coclustering model in the simplest way possible
        """
        base_path = os.path.join(self.coclus_res, file_name)
        self.fh.ensure_dirs_exist([base_path])
        cf = os.path.join(self.coclus_res, file_name + "_Coclustering.khc")
        logging.info("Simplify coclustering for file %s", file_name)
        logging.info("MCN=%s, MPI=%s", mcn, mpi)

        if mcn != 999999:
            scf = file_name + "_Simplified-" + str(mcn) + ".khc"
        else:
            scf = file_name + "_Simplified-" + str(mpi) + ".khc"

        logging.info("scf=%s", scf)

        pk.simplifyCoclustering(
            coclusteringFile=cf,
            simplifiedCoclusteringFile=scf,
            resultsDir=base_path,
            maxCellNumber=mcn,
            maxPreservedInformation=mpi,
        )

        return str(os.path.join(base_path, scf)).replace(".khc", ".json")

    def deploy_coclustering(self, file_name, identifier):
        """
        Deploy a coclustering model
        """
        base_path = os.path.join(self.coclus_res, file_name)
        self.fh.ensure_dirs_exist([base_path])

        logging.info("Deploy coclustering for file %s", file_name)
        scf = os.path.join(
            self.coclus_res, file_name,
            file_name + "_Simplified-" + str(identifier) + ".khc")

        dep_prefix = file_name + "_Deployed-" + str(identifier) + "-"

        pk.prepareCoclusteringDeployment(dictionaryFile=self.dictionary_file,
                                         dictionary="root",
                                         coclusteringFile=scf,
                                         tableVariable="secondary",
                                         deployedVariable="n_day_",
                                         buildDistanceVariables=True,
                                         resultsPrefix=dep_prefix,
                                         resultsDir=base_path)
        return os.path.join(base_path, dep_prefix + "Coclustering.kdic")

    def transfer_database(self, d, deployed_path, f, identifier):
        """
        Transfer a coclustering model to use it
        """
        file_name = self.fh.get_file_name(f)
        out_path = os.path.join(
            self.coclus_res, file_name,
            file_name + "_Transf-" + str(identifier) + ".csv")

        logging.info("Transfering of coclustering for file %s into %s",
                     file_name, out_path)

        pk.transferDatabase(dictionaryFile=d,
                            dictionary="root",
                            dataTable=deployed_path,
                            additionalDataTables={"root`secondary": f},
                            fieldSeparator=";",
                            outputFieldSeparator=";",
                            outputDataTable=out_path)
        return out_path

    """JSON COCLUSTERING MANIPULATIONS"""

    def get_clusters(self, path):
        """
        From a given Khiops json file, extract clusters, and detailed info about
        them.

        Returns:
          * zips: more info about each cluster zipped together (i.e. all tuples that go together, 'aligned') => tuples (clus, values, value_frequencies, value_typicalities)
          * cluster_and_values: dic(cluster_num => values_associated)
        """
        cluster_and_values = {}
        zips = []
        with open(path) as f:
            data = json.load(f)["coclusteringReport"]["dimensionPartitions"]
            n_days = list(filter(lambda d: d['name'] == "n_day_", data))

            for idx, n_day in enumerate(n_days[0]["valueGroups"], start=1):
                # Ugly workaround to handle floats and ints
                # see https://goo.gl/8tYfhn
                values = list(map(int, map(float, n_day["values"])))
                clus = n_day["cluster"]
                value_frequencies = n_day["valueFrequencies"]
                value_typicalities = n_day["valueTypicalities"]

                zips.append(
                    list(
                        zip(clus, values, value_frequencies,
                            value_typicalities)))
                cluster_and_values[clus] = values
        return self.utils.flattify(zips), cluster_and_values

    @staticmethod
    def get_clusters_from_dep(path):
        """
        From a given Khiops transferred thing, extract clusters, and detailed
        info about them.

        Returns:
          * cluster_and_values: dic(cluster_num => values_associated)
        """
        with open(path) as f:
            df = pd.read_csv(f, sep=";")

            k = df["n_day_PredictedLabel"].unique()
            cluster_and_values = dict((key, []) for key in k)

            for index, row in df.iterrows():
                n_day_ = row["n_day_"]
                clus = row["n_day_PredictedLabel"]
                cluster_and_values[clus].append(n_day_)
        return cluster_and_values

    def get_cells_number(self, file_name, mpi=None):
        """
        From a given Khiops json file, extract the number of cells across all
        dimensions

        return:
          - cells
        """
        if mpi is not None:
            p = os.path.join(self.coclus_res, file_name,
                             file_name + "_Simplified-" + str(mpi) + ".json")
        else:
            p = os.path.join(self.coclus_res, file_name + "_Coclustering.json")
        with open(p) as f:
            cells = json.load(f)["coclusteringReport"]["summary"]["cells"]

        return int(cells)

    def get_cluster_number(self, file_name, ref_id=None):
        """
        From a given Khiops json file, extract the number of clusters for
        dimension "n_day_"

        return:
          - nb_of_cluster_found
        """
        if ref_id is not None:
            p = os.path.join(
                self.coclus_res, file_name,
                file_name + "_Simplified-" + str(ref_id) + ".json")
        else:
            p = os.path.join(self.coclus_res, file_name + "_Coclustering.json")
        with open(p) as f:
            data = json.load(f)["coclusteringReport"]["dimensionSummaries"]
            n_days = list(filter(lambda d: d['name'] == "n_day_", data))

        if not n_days:
            return False
        else:
            return int(n_days[0]["parts"])

    """UTILS"""

    @staticmethod
    def _compute_accuracy(len_y_pred, y_pred, y_pred_target):
        """
        Compute the accuracy of a classifier.
        """
        c = 0
        for i in range(0, len_y_pred):
            pred_group_day_ahead = str(y_pred.iloc[i]["Predictedy"])
            real_group_day_ahead = str(y_pred_target.values[i])
            if pred_group_day_ahead != real_group_day_ahead:
                c = c + 1
        train_acc = 100 - c * 100 / len_y_pred
        logging.debug("%s errors over %s values", c, len_y_pred)
        logging.debug("That is %s perc of accuracy", train_acc)
        return train_acc

    @staticmethod
    def __get_typicalitie(n_day, my_zip):
        """
        From a list of tuples my_zip, find the
        tuple which concerns n_day and retrieve its valTyp.

        PARAMETERS
        ------------------------
        - my_zip: tuples (group, n_day, valFreq, valTyp) 
        - n_day: int
        """
        items = [i for i in my_zip if n_day == i[1]]
        return items[0][3]

    @staticmethod
    def compute_centroids(c_a_v, df, l_ref):
        """
        This method computes mean days and centroids for given values

        PARAMETERS
        ------------------------
          * c_a_v: dic(cluster_num => values_associated)
          * df: dataset studied
          * l_ref: len of one ref day

        RETURNS
        -------------------------
          * centroids: dic(cluster_id: int => centroid: [])
        """
        centroids = {}
        e = False
        for k, v in c_a_v.items():
            # If there is at least one day in the cluster considered!
            if len(v) != 0:
                logging.info("Computing cluster %s centroid", k)
                centroids[k] = df[df['n_day_'].isin(map(
                    str, v))].groupby('time_')['val_'].agg('mean').values
            # Otherwise, mean day = 0
            else:
                logging.info("Cluster %s is empty", k)
                e = True
                centroids[k] = [0] * l_ref
        logging.info("Keys of centroids are %s", list(centroids.keys()))
        logging.debug("Centroids are %s", centroids)
        return centroids, e

    def process_pred_proba(self,
                           y_pred,
                           y_pred_target,
                           clusts_names,
                           centroids,
                           days_to_predict,
                           nb_cluster_found,
                           n_pt_per_day,
                           mean_day,
                           o=False):
        """
        This method processes results from the Khiops classification method to
        create predictions and compute MSEs. If Oracle mode is "True", a mock is
        used for the classifier's Y (i.e. the classifier knows the Y column
        exactly).

        WHAT IT DOES
        ------------------------
          * First thing (A): Compute the given classifier's accuracy against
            the known y_pred_target.
          * Second thing (B): Process the probabilistic classifier results
            without using the probabilities, keeping only the most probable
            class for day n + 1. NPA
          * Third thing (C): Process the probabilistic classifier results using
            the probabilities assigned to each cluster (e.g. day n + 1 could be
            10% in cluster 1, 50% in cluster 2, etc.). Use these weights to sum
            up the centroids and produce the result. PA
          * Last thing (D): Wrap up, average everything => compute the final
            results.

        PARAMETERS
        ------------------------
          * y_pred: Matrix results from classifier
          * y_pred_target: known target for the classifier on test data
          * clusts_names: names of all clusters
          * centroids: clusters centroids
          * days_to_predict: list of days that have been used for test
          * nb_cluster_found: value extracted from the Khiops coclustering file;
            used to parameterize loops and computations.
          * n_pt_per_day: if there is one point per hour of the day, then this
            value will be 24.
          * mean_day: pre-computed mean day for all training ensemble.
          * oracle: are we in oracle mode? ('Y' known)

        RETURN
        -------------------------
          * Tuple(mean_mse_non_proba, mean_mse_proba, classifier_acc,
            mean_mse_mean_day) => averaged MSEs!
            :param y_pred:
            :param clusts_names:
            :param centroids:
            :param days_to_predict:
            :param nb_cluster_found:
            :param n_pt_per_day:
            :param mean_day:
            :param y_pred_target:
            :param o:
        """
        len_y_pred = len(y_pred)

        # (A): Compute classifier accuracy
        classifier_acc = self._compute_accuracy(len_y_pred, y_pred,
                                                y_pred_target)
        if classifier_acc == 0.0:
            logging.info(
                "This classifier is not good at all and failed every time")

        # Init empty arrays
        mses_non_proba = []
        mses_proba = []
        mses_mean_day = []

        mae_non_proba = []
        mae_proba = []
        mae_mean_day = []

        mase_non_proba = []
        mase_proba = []
        mase_mean_day = []

        # Remove the last day, which has no day after
        n_days_ = days_to_predict["n_day_"].unique()[:-1]
        # Also remove the first day, which has no day before
        # Wrangling data to get the right types.
        n_days_int = list(map(int, n_days_))
        n_days_int.remove(min(n_days_int))
        n_days_ = list(map(str, n_days_int))

        for ref_in_classif_ensemble, z in enumerate(n_days_):
            """
            (B): NPA approach
            """
            indice_today = str(int(z) - 1)
            today_vals = days_to_predict[days_to_predict["n_day_"] ==
                                         indice_today]["val_"].values
            tomor_vals = days_to_predict[days_to_predict["n_day_"] ==
                                         z]["val_"].values
            tom_class_pred_cluster_y = y_pred.iloc[ref_in_classif_ensemble][
                "Predictedy"]
            tom_pred_y = centroids[str(tom_class_pred_cluster_y)]

            # Append mses
            mses_non_proba.append(
                self.utils.compute_mse(tomor_vals, tom_pred_y))
            mses_mean_day.append(self.utils.compute_mse(tomor_vals, mean_day))

            mae_non_proba.append(self.utils.compute_mae(
                tomor_vals, tom_pred_y))
            mae_mean_day.append(self.utils.compute_mae(tomor_vals, mean_day))

            mase_non_proba.append(
                self.utils.compute_mase(today_vals, tomor_vals, tom_pred_y))
            mase_mean_day.append(
                self.utils.compute_mase(today_vals, tomor_vals, mean_day))

            mean_days_pond = []
            p_tot = 0
            """
            (C): PA approach
            """
            for c in clusts_names:
                try:
                    # Get the probability of being in group number "c"
                    p = y_pred.iloc[ref_in_classif_ensemble]["Proby" + str(c)]

                    # p_tot variable for debug purposes
                    p_tot = p_tot + p
                except KeyError:
                    logging.debug("Did not find %s in the prediction matrix",
                                  c)
                    p = 0

                # Retrieve mean day for cluster c
                day_pond = centroids[c].copy()

                # Scale according to proba
                day_pond[:] = [x * p for x in day_pond]
                mean_days_pond.append(day_pond)

            logging.debug("p_tot is %s", p_tot)  # Here it should be 100.

            # Sum up everything to create prediction
            tomo_pred_with_pond = [0] * n_pt_per_day
            for d_p in mean_days_pond:
                for idx, e in enumerate(d_p):
                    tomo_pred_with_pond[idx] = tomo_pred_with_pond[idx] + e

            mses_proba.append(
                self.utils.compute_mse(tomor_vals, tomo_pred_with_pond))
            mae_proba.append(
                self.utils.compute_mae(tomor_vals, tomo_pred_with_pond))
            mase_proba.append(
                self.utils.compute_mase(today_vals, tomor_vals,
                                        tomo_pred_with_pond))

        # (D): Wrap up: average the MSEs (so, in the end, we have a mean of
        # Mean Squared Errors)
        # MMSES
        mean_mse_non_proba = np.mean(mses_non_proba)
        mean_mse_proba = np.mean(mses_proba)
        mean_mse_mean_day = np.mean(mses_mean_day)

        mean_mae_non_proba = np.mean(mae_non_proba)
        mean_mae_proba = np.mean(mae_proba)
        mean_mae_mean_day = np.mean(mae_mean_day)

        mean_mase_non_proba = np.mean(mase_non_proba)
        mean_mase_proba = np.mean(mase_proba)
        mean_mase_mean_day = np.mean(mase_mean_day)

        # MSTD
        std_mse_non_proba = np.std(mses_non_proba)
        std_mse_proba = np.std(mses_proba)
        std_mse_mean_day = np.std(mses_mean_day)

        std_mae_non_proba = np.std(mae_non_proba)
        std_mae_proba = np.std(mae_proba)
        std_mae_mean_day = np.std(mae_mean_day)

        std_mase_non_proba = np.std(mase_non_proba)
        std_mase_proba = np.std(mase_proba)
        std_mase_mean_day = np.std(mase_mean_day)

        return ((mean_mse_non_proba, mean_mse_proba, mean_mse_mean_day),
                (mean_mae_non_proba, mean_mae_proba,
                 mean_mae_mean_day), (mean_mase_non_proba, mean_mase_proba,
                                      mean_mase_mean_day), classifier_acc,
                (std_mse_non_proba, std_mse_proba, std_mse_mean_day),
                (std_mae_non_proba, std_mae_proba, std_mae_mean_day),
                (std_mase_non_proba, std_mase_proba, std_mase_mean_day))
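To make steps (B) and (C) of process_pred_proba concrete, here is a tiny self-contained numerical example of the two kinds of prediction it builds: the non-probabilistic one (NPA, the centroid of the most probable cluster) and the probability-weighted one (PA, the weighted sum of all centroids). Centroids and probabilities are made up:

import numpy as np

# Made-up example: 3 clusters, 4 points per day
centroids = {
    "A": np.array([1.0, 2.0, 3.0, 4.0]),
    "B": np.array([2.0, 2.0, 2.0, 2.0]),
    "C": np.array([0.0, 1.0, 0.0, 1.0]),
}
# Probabilities predicted for day n + 1 (they would come from the "Proby..." columns)
probas = {"A": 0.6, "B": 0.3, "C": 0.1}

# (B) NPA: the prediction is the centroid of the most probable cluster
best = max(probas, key=probas.get)
pred_npa = centroids[best]                                   # [1. 2. 3. 4.]

# (C) PA: every centroid is weighted by its probability, then summed up
pred_pa = sum(p * centroids[c] for c, p in probas.items())   # [1.2 1.9 2.4 3.1]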
Example #9
# columns for one-hot encoding
OHE_COLUMNS = ['sales', 'salary']

# choose the evaluation metric
# metrics
# accuracy, precision, recall, f1, auc
metrics = 'accuracy'
# metrics = 'precision'
# metrics = 'recall'
# metrics = 'f1'
# metrics = 'auc'

selector_method = 'RFE'
# selector_method = 'PCA'

myutils = MyUtils('./param_clf.yml', OBJ_TYPE, metrics)

print(myutils.get_str_timestamp())

# mode = 'train'
mode = 'score'

if mode == 'train':

    # load the data
    _, X, y = myutils.read_data(TRAIN_PATH)

    # preprocessing
    X = myutils.train_data__preprocessing_with_imputer(X, OHE_COLUMNS)

    # initial search
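The snippet is cut off here. train_data__preprocessing_with_imputer is project-specific and not shown; as far as OHE_COLUMNS is concerned, the one-hot step presumably boils down to something like this pandas sketch (the data is made up, only the column names come from the snippet):

import pandas as pd

OHE_COLUMNS = ['sales', 'salary']

# Made-up sample with the two categorical columns declared above
X = pd.DataFrame({
    "sales": ["IT", "support", "IT"],
    "salary": ["low", "high", "medium"],
    "satisfaction": [0.38, 0.80, 0.11],
})

# One-hot encode only the categorical columns, keep the numeric one unchanged
X_ohe = pd.get_dummies(X, columns=OHE_COLUMNS)
# X_ohe columns: satisfaction, sales_IT, sales_support,
#                salary_high, salary_low, salary_medium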
Example #10
class HmmMachine:

    def __init__(self):
        self.utils_cl = MyUtils()
        logging.info("HMM machine instantiated")

    def compute_raw_hmm(self, tuple_array, order=1):
        """
        From an array of tuples ((srv1, 2), (srv2, 5), (srv3, 4), ...):
        * sort it,
        * create a List(2,5,4,...),
        * compute HMM (2,5),(5,4), ...
        """
        # flatmap + keep only the label (second element) of each tuple; sorting is needed to exhibit the sequence of days!
        iterable = [x[1] for x in self.utils_cl.my_sort_func(tuple_array, 0)]

        i = iter(iterable)
        win = []
        for e in range(0, order + 1):
            win.append(next(i))
        yield win
        for e in i:
            win = win[1:] + [e]
            yield win


    @staticmethod
    def compute_hmm_transition_mat_1d(raw_hmm_data, n_cluster):
        """
        Knowing the number of clusters used and given the numeric results of
        the clustering, compute the HMM transition matrix, i.e. the percentages
        of going from one state to another.
        """
        order = len(raw_hmm_data[0]) - 1

        pow2 = int(n_cluster * n_cluster)
        pow3 = int(n_cluster * n_cluster * n_cluster)

        if order == 1:
            d = pd.DataFrame(0, index=np.arange(n_cluster),
                             columns=range(0, n_cluster))
            for tup in raw_hmm_data:
                d.at[tup[0], tup[1]] = d.at[tup[0], tup[1]] + 1
            # sum of lines
            sum_col = d.sum(1)
            for i in range(0, n_cluster):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        elif order == 2:
            d = pd.DataFrame(0, index=np.arange(
                pow2), columns=range(0, n_cluster))

            # First construct raw data
            for tup in raw_hmm_data:
                d.at[tup[0] * n_cluster + tup[1], tup[2]] = d.at[tup[0] * n_cluster + tup[1], tup[2]] + 1
            sum_col = d.sum(1)

            # Then turn the counts into percentages
            for i in range(0, pow2):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        elif order == 3:
            d = pd.DataFrame(0, index=np.arange(
                pow3), columns=range(0, n_cluster))

            for tup in raw_hmm_data:
                d.at[tup[0] * pow2 + tup[1] * n_cluster + tup[2], tup[3]
                     ] = d.at[tup[0] * pow2 + tup[1] * n_cluster + tup[2], tup[3]] + 1
            sum_col = d.sum(1)

            for i in range(0, pow3):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        return d

    @staticmethod
    def compute_hmm_transition_mat_2d(raw_hmm_data1, raw_hmm_data2, n_cluster):
        """
        Knowing the number of clusters used and given the numeric results of
        the clustering, compute the HMM transition matrix, i.e. the percentages
        of going from one state to another. Two-dimensional (i.e. using the
        sequences of two TS to feed more information into the predictions).
        """
        order = len(raw_hmm_data1[0]) - 1

        pow2 = int(n_cluster * n_cluster)
        pow3 = int(n_cluster * n_cluster * n_cluster)
        pow4 = int(n_cluster * n_cluster * n_cluster * n_cluster)

        raw_hmm_data = list(zip(raw_hmm_data1, raw_hmm_data2))

        if order == 1:
            d = pd.DataFrame(0, index=np.arange(pow2),
                             columns=range(0, pow2))
            for (tup1, tup2) in raw_hmm_data:
                d.at[tup1[0] * n_cluster + tup2[0], tup1[1] * n_cluster + tup2[1]
                     ] = d.at[tup1[0] * n_cluster + tup2[0], tup1[1] * n_cluster + tup2[1]] + 1
            # sum of lines
            sum_col = d.sum(1)
            for i in range(0, pow2):
                for j in range(0, pow2):
                    perc = 100 * (d.at[i, j] / sum_col[i])
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        return d