def run_old(self, data_in):
    super().run(data_in)  # do not remove this!
    Logging.log("Training the heat map models...")

    # cross validation params (unused here; run() below uses them)
    iter_idx = 2  # Monte Carlo cross-validation - randomly assign training and test set x times
    percentage_train = 0.8  # percentage of the data used as training set

    # 1. transform to df and keep critical
    train_df = self._extract_critical_data_frame(data_in)
    train_df[self._field_in_train_cluster_id] = data_in[self._field_in_train_cluster_id]

    for cluster_id in list(train_df["train_cluster_id"].unique()):  # one model per cluster
        if self._test_mode and not (cluster_id == 3 or cluster_id == 1):
            continue
        print("\n\n TRAINING CLUSTER: " + str(cluster_id))
        cur_train_df = train_df[train_df[self._field_in_train_cluster_id] == cluster_id]

        # 2. scale data and remove outliers
        output_dfs, trained_scalers = self._preprocess_data(
            cur_train_df, self._remove_empty_features,
            self._nr_outlier_iterations, self._outlier_window_size,
            self._outlier_std_threshold)
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_trained_scalers] = trained_scalers

        # 3. train the model
        model_per_feature = self._build_heat_map_parallel(output_dfs)
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_grid_area] = self._grid_area

        # 4. store the models
        data_in["CL_" + str(cluster_id) + "_" +
                self._field_out_train_model] = model_per_feature

    # TODO: training above must use 80% of the data, the test pass the remaining 20%
    # 5. run the test pass here

    # 6. empty metrics
    metrics = dict()
    return data_in, metrics
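
# run() below relies on self._split_to_subsets(whole_df, percentage_train,
# iter_idx), which is defined elsewhere in this class. The following is a
# minimal, illustrative sketch of what such a Monte Carlo cross-validation
# split could look like; the name _split_to_subsets_sketch and the grouping
# by the "id" column are assumptions, not the actual implementation.
def _split_to_subsets_sketch(self, whole_df, percentage_train, iter_idx):
    """Return iter_idx random [train_df, test_df] pairs.

    The split is done per object id so that all rows of one object land in
    the same subset.
    """
    subsets = []
    all_ids = list(whole_df["id"].unique())
    n_train = int(numpy.ceil(percentage_train * len(all_ids)))
    for _ in range(iter_idx):
        shuffled = list(all_ids)
        random.shuffle(shuffled)
        train_ids = set(shuffled[:n_train])
        train_df = whole_df[whole_df["id"].isin(train_ids)]
        test_df = whole_df[~whole_df["id"].isin(train_ids)]
        subsets.append([train_df, test_df])
    return subsets
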
def _score_feature_quality(self, test_df, whole_df, model_per_feature,
                           cluster_id, data_in, r_min=0, r_max=1,
                           finetuner_index=None):
    # defaults for r_min, r_max and finetuner_index are assumed here so that
    # run() can call this method without the fine-tuning arguments
    # TODO: once computed, persist the result and reload it when needed
    print("_________SCORING: " + str(r_min) + " to " + str(r_max))

    # each model is one heat map
    tester = HeatmapConvolutionTester(smooth_per_feature=True,
                                      enable_all_print=False,
                                      visualize_summed_curve=False,
                                      visualize_per_feature_curve=False)
    abs_max_rul = whole_df["RUL"].max()  # e.g. 217
    segment_thrshld = 0.33 * abs_max_rul
    distances = {}  # key: feature name, value: list of distance lists
    phm_scores = {}
    rmse_scores = {}

    tot = len(list(test_df["id"].unique()))
    oo = 0
    for object_id in list(test_df["id"].unique()):
        oo += 1
        Logging.log(str(oo) + " of " + str(tot) +
                    " - Optimizing based on - OBJECT ID: " + str(object_id))
        cur_df1 = test_df[test_df['id'] == object_id]

        # predict once at a random point within the first 33% of the
        # lifetime, once between 33% and 66%, and once between 66% and 100%
        le = int(numpy.ceil(len(cur_df1) / 3))
        z_to_33 = list(range(le))
        random.shuffle(z_to_33)
        if 2 * le > len(cur_df1):
            t_to_66 = []
            s_to_100 = []
            thrshlds = [z_to_33[0]]
        else:
            t_to_66 = list(range(le, 2 * le))
            s_to_100 = list(range(2 * le, len(cur_df1)))
            random.shuffle(t_to_66)
            random.shuffle(s_to_100)
            thrshlds = [z_to_33[0], t_to_66[0], s_to_100[0]]

        cur_df3 = cur_df1.sort_values("RUL", ascending=False)
        for thrshld in thrshlds:
            current_test_df = cur_df3.iloc[:thrshld]
            dist = current_test_df["RUL"].max() - current_test_df["RUL"].min()
            if dist > segment_thrshld:
                print("SHORTENED RUL AREA")
                thrs = current_test_df["RUL"].min() + segment_thrshld
                current_test_df = current_test_df[current_test_df["RUL"] < thrs]

            # do prediction
            try:
                (predicted_risk, predicted_rul, m, all_feature_sum,
                 per_feature_sum, feature_favorites) = tester._predict_RUL(
                     data_in, current_test_df, cluster_id, None, [], 0,
                     current_test_df["RUL"].min(), test=True,
                     fine=finetuner_index)
            except Exception:
                print("No prediction - test segment too short")
                continue

            true_RUL = current_test_df.iloc[-1]["RUL"]
            true_risk = 1 + m * true_RUL

            # post process - assess per_feature_sum
            for col_name in per_feature_sum.keys():
                cur = per_feature_sum[col_name]
                cur[1] = cur[1][0]  # unwrap nested heat array

                if numpy.count_nonzero(cur[1]) == 0:
                    # this curve did not help at all
                    if col_name in distances:
                        distances[col_name].append([1])
                        phm_scores[col_name].append(["None"])
                        rmse_scores[col_name].append(["None"])
                    else:
                        distances[col_name] = [[1]]
                        phm_scores[col_name] = [["None"]]
                        rmse_scores[col_name] = [["None"]]
                    continue

                top_count = math.ceil(0.05 * len(cur[1]))  # top 5% of the curve
                subs = int(0.1 * len(cur[1]))
                # indices of the highest 5% of heat values
                ind = sorted(numpy.argpartition(cur[1], -top_count)[-top_count:])

                # if a gap between selected indices is wider than subs
                # (~0.1 risk), split the selection into separate regions;
                # the final boundary is appended so that the region after
                # the last gap is scored as well
                gap_positions = numpy.where(numpy.diff(ind) > subs)[0]
                if len(gap_positions) == 0:
                    runner = [None]  # one contiguous region
                else:
                    runner = sorted(gap_positions.tolist()) + [len(ind) - 1]

                prev = 0
                multi_dist = []
                phm_scores_lst = []
                rmse_scores_lst = []
                for gap_idx in runner:
                    if gap_idx is None:
                        cur_subset_selection = ind
                    else:
                        gap_idx += 1
                        cur_subset_selection = ind[int(prev):int(gap_idx)]
                    values = cur[0][cur_subset_selection]
                    avg = numpy.average(values)
                    # in practice this prefers earlier regions, which keeps
                    # the index intact
                    dist = avg - true_risk
                    multi_dist.append(dist)

                    # analogously derive PHM score and RMSE for this region
                    da_risk_found = avg
                    predicted_rul_feature = (da_risk_found - 1) / m
                    phmScore = self.score_phm(
                        pd.DataFrame([[true_RUL, predicted_rul_feature, -1]],
                                     columns=["RUL", "predicted_RUL",
                                              "object_id"]))
                    rmse = self.score_rmse(
                        numpy.array([true_RUL]),
                        numpy.array([predicted_rul_feature]))
                    phm_scores_lst.append(phmScore)
                    rmse_scores_lst.append(rmse)
                    prev = gap_idx

                # (debug) plot cur[0] against cur[1] and highlight
                # cur_subset_selection to inspect a feature's heat curve
                # against true_risk

                if col_name in distances:
                    distances[col_name].append(multi_dist)
                    phm_scores[col_name].append(phm_scores_lst)
                    rmse_scores[col_name].append(rmse_scores_lst)
                else:
                    distances[col_name] = [multi_dist]
                    phm_scores[col_name] = [phm_scores_lst]
                    rmse_scores[col_name] = [rmse_scores_lst]

    return distances, phm_scores, rmse_scores
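
# score_phm and score_rmse are referenced above but defined elsewhere. For
# reference, the sketches below assume the asymmetric scoring of the PHM08
# data challenge (late predictions are penalized harder than early ones,
# with the published constants 13 and 10) and a plain RMSE; the method names
# with the _sketch suffix are illustrative assumptions, not the actual
# implementations.
def _score_phm_sketch(self, result_df):
    """PHM08-style score over d = predicted_RUL - RUL; lower is better."""
    d = result_df["predicted_RUL"].values - result_df["RUL"].values
    early = numpy.exp(-d[d < 0] / 13.0) - 1.0  # predicted too early
    late = numpy.exp(d[d >= 0] / 10.0) - 1.0   # predicted too late
    return float(numpy.sum(early) + numpy.sum(late))

def _score_rmse_sketch(self, true_rul, predicted_rul):
    """Root mean squared error between true and predicted RUL arrays."""
    return float(numpy.sqrt(numpy.mean((true_rul - predicted_rul) ** 2)))
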
def run(self, data_in):
    super().run(data_in)  # do not remove this!
    Logging.log("Training the heat map models...")

    # cross validation params
    iter_idx = 2  # Monte Carlo cross-validation - randomly assign training and test set x times
    percentage_train = 0.8  # percentage of the data used as training set

    # 1. transform to df and keep critical
    whole_df = self._extract_critical_data_frame(data_in)
    whole_df[self._field_in_train_cluster_id] = data_in[self._field_in_train_cluster_id]
    # split the frame multiple times; each element is [train_df, test_df]
    lst_of_train_n_test = self._split_to_subsets(whole_df, percentage_train,
                                                 iter_idx)

    # 2. distance - scoring the quality of a feature
    # key: cluster identifier, e.g. "c1"; value: list of score dicts
    # (feature_id -> score)
    dist_score = {}
    for train_test in lst_of_train_n_test:
        train_df = train_test[0]
        test_df = train_test[1].copy()  # copy to avoid writing into a slice
        test_df["cluster_id"] = test_df["train_cluster_id"]

        for cluster_id in list(train_df["train_cluster_id"].unique()):  # one model per cluster
            #if cluster_id in [1, 5, 4, 0, 3]:  # already done
            #    continue
            if self._test_mode and not (cluster_id == 3 or cluster_id == 1):
                continue
            print("\n\n TRAINING CLUSTER: " + str(cluster_id))
            cur_train_df = train_df[
                train_df[self._field_in_train_cluster_id] == cluster_id]
            cur_test_df = test_df[test_df["cluster_id"] == cluster_id]

            # 2. scale data and remove outliers
            output_dfs, trained_scalers = self._preprocess_data(
                cur_train_df, self._remove_empty_features,
                self._nr_outlier_iterations, self._outlier_window_size,
                self._outlier_std_threshold)
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_trained_scalers] = trained_scalers

            # 3. train the model
            model_per_feature = self._build_heat_map_parallel(output_dfs)
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_grid_area] = self._grid_area

            # 4. store the models
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model] = model_per_feature

            # 5. score the feature quality for cross validation
            # (_score_feature_quality returns distances, PHM scores and RMSE
            # scores; only the distances are aggregated here)
            score_dict, phm_scores, rmse_scores = self._score_feature_quality(
                cur_test_df, whole_df, model_per_feature, cluster_id, data_in)
            print("Found scores: " + str(score_dict))
            idfr = "c" + str(cluster_id)
            if idfr not in dist_score:
                dist_score[idfr] = [score_dict]
            else:
                dist_score[idfr].append(score_dict)

            try:
                # NOTE: hard-coded, machine-specific output path
                pathh = os.path.join(r"C:\Users\q416435\Desktop\scores",
                                     "cluster_" + str(cluster_id) + ".csv")
                print("Writing file to " + pathh)
                self._csv_file = open(pathh, 'w')
                self._csv_writer = csv.writer(self._csv_file, delimiter=';')
                for ke in score_dict.keys():
                    self._csv_writer.writerow([ke, str(score_dict[ke])])
                self._csv_file.close()
            except Exception:
                pass

    # 6. TODO: train on the whole data set now to get the final model
    # 7. TODO: keep only the optimal models based on dist_score

    # 8. empty metrics
    metrics = dict()
    return data_in, metrics
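
# Standalone illustration of the peak-region extraction used in
# _score_feature_quality: pick the top 5% of heat values via
# numpy.argpartition, then split the sorted index list into separate regions
# wherever consecutive indices are further apart than 10% of the curve
# length. The example curve below is made up.
if __name__ == "__main__":
    heat = numpy.zeros(200)
    heat[40:45] = [0.8, 0.9, 1.0, 0.9, 0.8]     # first peak
    heat[150:155] = [0.7, 0.9, 0.95, 0.9, 0.7]  # second, distant peak

    top_count = math.ceil(0.05 * len(heat))     # top 5% -> 10 indices
    subs = int(0.1 * len(heat))                 # max allowed gap: 20
    ind = sorted(numpy.argpartition(heat, -top_count)[-top_count:])
    gap_positions = numpy.where(numpy.diff(ind) > subs)[0]

    regions, prev = [], 0
    for gap_idx in list(gap_positions) + [len(ind) - 1]:
        regions.append(ind[prev:gap_idx + 1])
        prev = gap_idx + 1
    print(regions)  # [[40, 41, 42, 43, 44], [150, 151, 152, 153, 154]]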