    def run_old(self, data_in):
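        """Original training run: trains one heat-map model per cluster on the
        full critical data; the cross-validated test pass is still a TODO
        (see the comments at the end of this method)."""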
        super().run(data_in)  # do not remove this!
        Logging.log("Training da heat...")

        # cross validation params
        iter_idx = 2  # Monte Carlo cross-validation - randomly re-split into training and test set this many times
        percentage_train = 0.8  # fraction of the data used as the training set

        # 1. transform to df and keep critical
        train_df = self._extract_critical_data_frame(data_in)
        train_df[self._field_in_train_cluster_id] = data_in[
            self._field_in_train_cluster_id]

        for cluster_id in list(train_df["train_cluster_id"].unique()):  # one model per cluster
            if self._test_mode and not (cluster_id == 3 or cluster_id == 1):
                continue

            print("\n\n TRAINING CLUSTER: " + str(cluster_id))
            cur_train_df = train_df[train_df[self._field_in_train_cluster_id]
                                    == cluster_id]

            # 2. scale data and remove outliers
            output_dfs, trained_scalers = self._preprocess_data(
                cur_train_df, self._remove_empty_features,
                self._nr_outlier_iterations, self._outlier_window_size,
                self._outlier_std_threshold)
            data_in[
                "CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_trained_scalers] = trained_scalers

            # 3. Train the model
            model_per_feature = self._build_heat_map_parallel(output_dfs)
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_grid_area] = self._grid_area

            # 4. Store the models
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model] = model_per_feature

        # TODO: the training above must use 80 % of the data, the test run then the remaining 20 %
        # 5. run the test pass here

        # 6. empty metrics
        metrics = dict()

        return data_in, metrics

    def _score_feature_quality(self, test_df, whole_df, model_per_feature,
                               cluster_id, data_in, r_min, r_max,
                               finetuner_index):
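        """Score the quality of each feature's heat curve on the given test split.

        For every test object, random segments from the early, middle and late
        third of its history are predicted; per feature, the distance between the
        average predicted risk of the hottest curve regions and the true risk is
        collected, together with PHM and RMSE scores.

        Returns a tuple (distances, phm_scores, rmse_scores), each a dict keyed
        by feature name.
        """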

        # once computed, store the results and reload them if needed
        print("_________SCORING: " + str(r_min) + " to " + str(r_max))

        # each model is a heat map
        tester = HeatmapConvolutionTester(smooth_per_feature=True,
                                          enable_all_print=False,
                                          visualize_summed_curve=False,
                                          visualize_per_feature_curve=False)
        abs_max_rul = whole_df["RUL"].max()  # 217
        segment_thrshld = 0.33 * abs_max_rul
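        # each test segment is later capped so that it spans at most roughly one third of the maximum observed RUL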
        distances = {}  # key feat name, value list_dist
        phm_scores = {}
        rmse_scores = {}
        tot = len(list(test_df["id"].unique()))
        oo = 0
        for object_id in list(test_df["id"].unique()):
            oo += 1
            Logging.log(
                str(oo) + " of " + str(tot) +
                " - Optimizing based on - OBJECT ID: " + str(object_id))
            cur_df1 = test_df[test_df['id'] == object_id]

            # predict once at a random point from the first 33 % of the history, once between 33 % and 66 %, and once between 66 % and 100 %
            le = int(numpy.ceil(len(cur_df1) / 3))
            z_to_33 = list(range(le))
            random.shuffle(z_to_33)
            if 2 * le > len(cur_df1):
                t_to_66 = []
                s_to_100 = []
                thrshlds = [z_to_33[0]]
            else:
                t_to_66 = list(range(le, 2 * le))
                s_to_100 = list(range(2 * le, len(cur_df1)))
                random.shuffle(t_to_66)
                random.shuffle(s_to_100)
                thrshlds = [z_to_33[0], t_to_66[0], s_to_100[0]]
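            # thrshlds now holds one randomly drawn cut-off index per available third of this object's history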

            cur_df3 = cur_df1.sort_values("RUL", ascending=False)
            for thrshld in thrshlds:
                current_test_df = cur_df3.iloc[:thrshld]

                dist = current_test_df["RUL"].max() - current_test_df["RUL"].min()
                if dist > segment_thrshld:
                    print("SHORTENED RUL AREA")
                    thrs = current_test_df["RUL"].min() + segment_thrshld
                    current_test_df = current_test_df[
                        current_test_df["RUL"] < thrs]

                # do prediction
                try:
                    predicted_risk, predicted_rul, m, all_feature_sum, per_feature_sum, feature_favorites = tester._predict_RUL(
                        data_in,
                        current_test_df,
                        cluster_id,
                        None, [],
                        0,
                        current_test_df["RUL"].min(),
                        test=True,
                        fine=finetuner_index)
                except Exception:
                    print("No prediction - test segment too short")
                    continue
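                # map the true RUL onto the risk axis via the tester's linear mapping risk = 1 + m * RUL (risk 1 at RUL 0)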
                true_RUL = current_test_df.iloc[-1]["RUL"]
                true_risk = 1 + m * true_RUL

                # post process

                # assess per_feature_sum
                for col_name in per_feature_sum.keys():
                    #print("--- Feature: " + col_name)
                    cur = per_feature_sum[col_name]
                    cur[1] = cur[1][0]
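                    # cur[0] holds the risk-axis grid, cur[1] the accumulated heat of this feature's curve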

                    #m_idx = numpy.argmax(cur[1])
                    #da_risk_found = cur[0][m_idx]
                    #predicted_rul_feature = (da_risk_found - 1)/m

                    if numpy.count_nonzero(cur[1]) == 0:
                        if col_name in distances:
                            distances[col_name].append(
                                [1])  # this curve did not help at all
                            phm_scores[col_name].append(["None"])
                            rmse_scores[col_name].append(["None"])
                        else:
                            distances[col_name] = [[1]]
                            phm_scores[col_name] = [["None"]]
                            rmse_scores[col_name] = [["None"]]
                        #print(" - Distance - 1")
                        continue

                    ten_perc = math.ceil(0.05 * len(cur[1]))  # top 5 % of the curve, despite the variable name
                    subs = int(0.1 * len(cur[1]))
                    ind = sorted(
                        numpy.argpartition(cur[1], -ten_perc)[-ten_perc:]
                    )  # indices of the highest-heat values, sorted ascending

                    # if the gap between selected indices exceeds subs (10 % of the grid), split into separate regions
                    gaps = numpy.where(numpy.diff(ind) > subs)[0]
                    if gaps.size == 0:
                        runner = [None]
                    else:
                        runner = sorted(gaps.tolist())
                    prev = 0
                    multi_dist = []
                    phm_scores_lst = []
                    rmse_scores_lst = []
                    for gap_idx in runner:
                        if gap_idx is None:
                            cur_subset_selection = ind
                        else:
                            gap_idx += 1
                            cur_subset_selection = ind[int(prev):int(gap_idx)]

                        values = cur[0][cur_subset_selection]

                        avg = numpy.average(values)
                        dist = avg - true_risk

                        # in practice this prefers earlier regions, since that does not break up the index as much
                        multi_dist.append(dist)
                        #print(" - Distance - " + str(dist))

                        # analog find phm and risk
                        da_risk_found = avg
                        predicted_rul_feature = (da_risk_found - 1) / m

                        phmScore = self.score_phm(
                            pd.DataFrame(
                                [[true_RUL, predicted_rul_feature, -1]],
                                columns=["RUL", "predicted_RUL", "object_id"]))
                        rmse = self.score_rmse(
                            numpy.array([true_RUL]),
                            numpy.array([predicted_rul_feature]))
                        phm_scores_lst.append(phmScore)
                        rmse_scores_lst.append(rmse)

                        prev = gap_idx
                        '''
                        print("\nFeature: "+ str(col_name)+ "\nplot all - true risk: " + str(true_risk))
                        plt.plot(cur[0], cur[1])
                        plt.plot(cur[0][cur_subset_selection], cur[1][cur_subset_selection], color='red')
                        #plt.plot(cur[0], medfilt(cur[1], 61), color = "green") # KERNEL MUST BE ODD 
                        plt.xlabel("risk - true risk = " + str(true_risk))
                        plt.ylabel("heat - "+str(col_name))
                        plt.show()
                        '''

                    # use this for evaluation
                    #phmScore = self.score_phm(pd.DataFrame([[true_RUL, predicted_rul_feature, -1]], columns = ["RUL", "predicted_RUL", "object_id"]))
                    #rmse = self.score_rmse(numpy.array([true_RUL]), numpy.array([predicted_rul_feature]))

                    if col_name in distances:
                        distances[col_name].append(
                            multi_dist)  # per-region distances for this test segment
                        phm_scores[col_name].append(phm_scores_lst)
                        rmse_scores[col_name].append(rmse_scores_lst)
                    else:
                        distances[col_name] = [multi_dist]
                        phm_scores[col_name] = [phm_scores_lst]
                        rmse_scores[col_name] = [rmse_scores_lst]

        return distances, phm_scores, rmse_scores

    def run(self, data_in):
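        """Train one heat-map model per cluster and score each feature's heat
        curve via Monte Carlo cross-validation on the held-out test splits."""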
        super().run(data_in)  # do not remove this!
        Logging.log("Training da heat...")

        # cross validation params
        iter_idx = 2  # Monte Carlo cross-validation - randomly re-split into training and test set this many times
        percentage_train = 0.8  # fraction of the data used as the training set

        # 1. transform to df and keep critical
        whole_df = self._extract_critical_data_frame(data_in)
        whole_df[self._field_in_train_cluster_id] = data_in[
            self._field_in_train_cluster_id]
        lst_of_train_n_test = self._split_to_subsets(
            whole_df, percentage_train, iter_idx)  # split frame multiple times
        # each element being [train_df, test_df]

        # 2. distance - scoring quality of a feature
        # key: cluster identifier, e.g. "c1"; value: list of per-split feature scores returned by _score_feature_quality
        dist_score = {}

        for train_test in lst_of_train_n_test:
            train_df = train_test[0]
            test_df = train_test[1]
            test_df["cluster_id"] = test_df["train_cluster_id"]

            for cluster_id in list(train_df["train_cluster_id"].unique()):  # one model per cluster
                #if cluster_id in [1, 5, 4, 0, 3]:  # already done
                #    continue

                if self._test_mode and not (cluster_id == 3
                                            or cluster_id == 1):
                    continue

                print("\n\n TRAINING CLUSTER: " + str(cluster_id))
                cur_train_df = train_df[train_df[
                    self._field_in_train_cluster_id] == cluster_id]
                cur_test_df = test_df[test_df["cluster_id"] == cluster_id]

                # 2. scale data and remove outliers
                output_dfs, trained_scalers = self._preprocess_data(
                    cur_train_df, self._remove_empty_features,
                    self._nr_outlier_iterations, self._outlier_window_size,
                    self._outlier_std_threshold)
                data_in[
                    "CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_trained_scalers] = trained_scalers

                # 3. Train the model
                model_per_feature = self._build_heat_map_parallel(output_dfs)
                data_in[
                    "CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_grid_area] = self._grid_area

                # 4. Store the models
                data_in["CL_" + str(cluster_id) + "_" +
                        self._field_out_train_model] = model_per_feature

                # 5. score the feature quality for cross validation
                score_dict = self._score_feature_quality(
                    cur_test_df, whole_df, model_per_feature, cluster_id,
                    data_in)
                print("Found scores: " + str(score_dict))
                idfr = "c" + str(cluster_id)
                if idfr not in dist_score:
                    dist_score[idfr] = [score_dict]
                else:
                    dist_score[idfr].append(score_dict)

                try:
                    pathh = os.path.join(r"C:\Users\q416435\Desktop\scores",
                                         "cluster_" + str(cluster_id) + ".csv")
                    print("Writing file to " + pathh)
                    self._csv_file = open(pathh, 'w')
                    for ke in score_dict.keys():
                        self._csv_writer = csv.writer(self._csv_file,
                                                      delimiter=';')
                        self._csv_writer.writerow([ke, str(score_dict[ke])])
                    self._csv_file.close()
                except Exception:
                    pass

        # 3. TODO: train on the whole data set now to obtain the final model

        # 4. TODO: keep only the optimal models based on dist_score
        # 5. empty metrics
        metrics = dict()

        return data_in, metrics