Example #1
    def run(self, data):
        super().run(data)  # do not remove this!

        # preprocessing - scaling X
        scaler = preprocessing.StandardScaler()
        scaler = scaler.fit(data[self._key_in_train_x])
        temp = dict()
        temp['train_X_scaled'] = scaler.transform(data[self._key_in_train_x])
        temp['test_X_scaled'] = scaler.transform(data[self._key_in_test_x])

        # Train model for criticality
        self._model_crit, self._model_auc = PredictorWrapperClassification.\
            train(temp['train_X_scaled'], temp['test_X_scaled'], data[self._key_in_train_crit],
                  data[self._key_in_test_crit], data['meta_dataset_name'] + "_crit_pred", self._reload_if_existing)

        # select critical samples based on model
        data[self._key_out_train_x], data[self._key_out_train_crit] = self._crit_get_varargs(
            temp['train_X_scaled'], data[self._key_in_train_rul])
        data[self._key_out_test_x], data[self._key_out_test_crit] = self._crit_get_varargs(
            temp['test_X_scaled'], data[self._key_in_test_rul])

        # plot histograms comparing the full test population with the critical subset
        Logging.log("RUL histogram of complete test population")
        max_rul = data[self._key_in_test_rul].max()
        Visual.plot_hist(data[self._key_in_test_rul], max_x=max_rul)
        Logging.log("RUL histogram of sub test population labeled critical")
        Visual.plot_hist(data[self._key_out_test_crit], max_x=max_rul)

        # metrics
        metrics = dict()  # empty metrics
        metrics['model_auc'] = self._model_auc

        return data, metrics
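The scaling step at the top of this run() follows the usual fit-on-train, transform-both pattern. A minimal standalone sketch of that pattern with synthetic data (the array names below are illustrative only and not part of the pipeline above):

    import numpy as np
    from sklearn import preprocessing

    rng = np.random.default_rng(0)
    train_X = rng.normal(loc=5.0, scale=2.0, size=(100, 3))  # synthetic training features
    test_X = rng.normal(loc=5.0, scale=2.0, size=(20, 3))    # synthetic test features

    scaler = preprocessing.StandardScaler().fit(train_X)  # fit statistics on training data only
    train_X_scaled = scaler.transform(train_X)             # zero mean, unit variance per column
    test_X_scaled = scaler.transform(test_X)               # reuse the training statistics on the test data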
Example #2
    def run(self, data_in, extract_frame_override=False, train_dff=None):
        super().run(data_in)  # do not remove this!
        Logging.log("Training da heat...")

        # 1. transform to df and keep critical
        if not extract_frame_override:
            train_df = self._extract_critical_data_frame(data_in)
            train_df[self._field_in_train_cluster_id] = data_in[
                self._field_in_train_cluster_id]
        else:
            train_df = train_dff

        for cluster_id in list(train_df["train_cluster_id"].unique()):  # one model per cluster
            if self._test_mode and not (cluster_id == 3):
                continue

            print("\n\n TRAINING CLUSTER: " + str(cluster_id))
            cur_train_df = train_df[train_df[self._field_in_train_cluster_id] == cluster_id]

            # 2. scale data and remove outliers
            output_dfs, trained_scalers = self._preprocess_data(
                cur_train_df, self._remove_empty_features,
                self._nr_outlier_iterations, self._outlier_window_size,
                self._outlier_std_threshold)
            data_in[
                "CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_trained_scalers] = trained_scalers

            # 3. Train the model
            model_per_feature = self._build_heat_map_parallel(output_dfs)
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_grid_area] = self._grid_area

            # 4. Store the models
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model] = model_per_feature

        # 5. empty metrics
        metrics = dict()

        return data_in, metrics
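Example #2 partitions the training frame by cluster id and keeps one model per cluster in the shared data dict under a "CL_<id>_"-prefixed key. A minimal sketch of that pattern, assuming illustrative column names and using a plain scikit-learn classifier as a stand-in for the pipeline's own heat-map model:

    from sklearn.linear_model import LogisticRegression

    def train_per_cluster(train_df, feature_cols, target_col, data_in):
        # train_df is a pandas DataFrame; one independent model per cluster id,
        # stored in the shared dict under a prefixed key
        for cluster_id in train_df["train_cluster_id"].unique():
            cur_df = train_df[train_df["train_cluster_id"] == cluster_id]
            model = LogisticRegression().fit(cur_df[feature_cols], cur_df[target_col])
            data_in["CL_" + str(cluster_id) + "_model"] = model
        return data_in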
Example #3
    def _build_heat_map(self, output_dfs):
        ''' Using convolution, a heat map is generated for each feature over a 2-D grid of
            risk vs. feature value.
            :param output_dfs: list of dataframes, each with a column scaled_FEATURE_X (already
                               outlier-free and scaled) and a column RISK giving the risk for that
                               feature at that row
            :return: a dictionary mapping each feature name to [heat map, xi, yi], plus the risk
                     axis xi
        '''
        dimensions = {}
        for feature_df in output_dfs:  # each output_df has one risk and value
            Logging().log("Processing Feature: " + feature_df.columns[1])

            # Testmode
            if self._test_mode and (feature_df.columns[1]
                                    == "scaled_FEATURE_5"):
                print("Testing thus, break now!")
                break

            try:
                values = np.empty(len(feature_df))
                values.fill(1)

                # Assign X, Y, Z
                X = feature_df.RISK.to_numpy()
                Y = feature_df[feature_df.columns[1]].to_numpy()
                Z = values

                # create x-y points to be used in heatmap of identical size
                risk_min = 0
                risk_max = 1
                feature_min = min([
                    rm
                    for rm in [df[df.columns[1]].min() for df in output_dfs]
                    if not math.isnan(rm)
                ])
                feature_max = max([
                    rm
                    for rm in [df[df.columns[1]].max() for df in output_dfs]
                    if not math.isnan(rm)
                ])

                xi = np.linspace(risk_min, risk_max, self._grid_area)
                yi = np.linspace(feature_min, feature_max, self._grid_area)

                # Z is a matrix of x-y values interpolated (!)
                zi = griddata((X, Y),
                              Z, (xi[None, :], yi[:, None]),
                              method=self._interpol_method)
                zmin = 0
                zmax = 1
                zi[(zi < zmin) | (zi > zmax)] = None

                # Convolve each point with a Gaussian kernel, giving the heat value Z at point (xi, yi)
                # Advantage: keeps both horizontal and vertical influence
                grid_cur = np.nan_to_num(zi)

                # Smooth with a Gaussian kernel
                kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                          x_size=self._kernel_size,
                                          y_size=self._kernel_size)
                grad = scipy_convolve(grid_cur,
                                      kernel,
                                      mode='same',
                                      method='direct')

                # no constant/zero values shall be allowed -> first pass: horizontal
                # horizontal interpolation out to the edge
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[:, r]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 20
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[:len(replacement), r] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1])[::-1]
                        grad[nonzeros[-1]:, r] = replacement

                # vertical interpolation out to the edge
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[r, :]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 20
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[r, :len(replacement)] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1] + 1)[::-1]
                        grad[r, nonzeros[-1] - 1:] = replacement

                # Store the model in memory
                dimensions[feature_df.columns[1]] = [
                    copy.deepcopy(np.absolute(grad)),
                    copy.deepcopy(xi),
                    copy.deepcopy(yi)
                ]

                if self._visualize_heatmap:
                    fig, (ax_orig, ax_mag) = plt.subplots(1, 2)
                    ax_orig.imshow(grid_cur[::-1, ::-1], cmap='RdYlGn')
                    ax_orig.set_title('Original')
                    ax_mag.imshow(
                        np.absolute(grad)[::-1, ::-1], cmap='RdYlGn'
                    )  # https://matplotlib.org/examples/color/colormaps_reference.html
                    ax_mag.set_title('Heat')
                    fig.show()
                    plt.show()

            except Exception:
                Logging().log("Heat map construction failed for feature: " + str(feature_df.columns[1]))
                #traceback.print_exc()
                dimensions[feature_df.columns[1]] = None

        return dimensions, xi
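The core of _build_heat_map, interpolating sparse (risk, feature value) points onto a regular grid and then smoothing, can be sketched in isolation. The sketch below uses synthetic data and scipy.ndimage.gaussian_filter as a stand-in for the astropy Gaussian2DKernel convolution; the variable names are illustrative:

    import numpy as np
    from scipy.interpolate import griddata
    from scipy.ndimage import gaussian_filter

    rng = np.random.default_rng(1)
    risk = rng.uniform(0, 1, 500)                              # X: risk in [0, 1]
    feature = np.sin(3 * risk) + 0.1 * rng.normal(size=500)    # Y: scaled feature value
    z = np.ones_like(risk)                                     # Z: constant "mass" per observation

    grid_area = 100
    xi = np.linspace(0, 1, grid_area)
    yi = np.linspace(feature.min(), feature.max(), grid_area)

    # interpolate onto the regular grid (NaN outside the convex hull of the samples)
    zi = griddata((risk, feature), z, (xi[None, :], yi[:, None]), method="linear")

    # replace NaNs and smooth, yielding the 2-D heat map
    heat = gaussian_filter(np.nan_to_num(zi), sigma=3)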
    def run(self, data_in):
        super().run(data_in)  # do not remove this!
        Logging.log("Testing da heat...")

        # 1. transform to df and keep critical
        test_df = self._extract_critical_data_frame(data_in)

        # 2. assign cluster id, add column with id
        test_df = self._assign_cluster(data_in, test_df)
        test_df["predicted_rul"] = -1
        test_df["predicted_risk"] = -1

        abs_max_rul = test_df["RUL"].max()  # 217
        segment_thrshld = 0.33 * abs_max_rul
        if self._enable_all_print:
            print("THE MAXIMUM RUL IN THE DATA SET IS " + str(abs_max_rul))

        # 3. extract current relevant data - do this for all and append
        for object_id in list(test_df["id"].unique()):
            all_feature_sum = False
            cur_df1 = test_df[test_df['id'] == object_id]
            print("Current: OBJECT ID: " + str(object_id))

            timestamp_gap = 0  # per cluster the incoming data has to be shifted, otherwise it cannot be summed up
            last_ts = 0
            expected_rul = 99999999
            all_feature_favorites = []
            for cluster_id in list(cur_df1["cluster_id"].unique()):
                if self._test_mode and not (cluster_id == 3):
                    continue
                Logging.log("--------> Eval: CLUSTER ID: " + str(cluster_id))
                cur_df2 = cur_df1[cur_df1['cluster_id'] == cluster_id]
                cnt = 0
                cur_df3 = cur_df2.sort_values("RUL", ascending=False)

                # per object only one prediction is made (for the segment with maximal RUL)
                first = True
                for i in range(len(cur_df3)):

                    # 0. parallelize: only estimate the last one
                    current_test_df = cur_df3
                    if not first:
                        continue
                    first = False
                    Logging.log("--------> Eval: RUL RANGE: " +
                                str(current_test_df["RUL"].max()) + " to " +
                                str(current_test_df["RUL"].min()))

                    # 1. OPTIMIZATION - do not take everything, only at most the last 120
                    #    (otherwise results are skewed because the model was only trained up to 200)
                    dist = current_test_df["RUL"].max() - current_test_df["RUL"].min()
                    if dist > segment_thrshld:
                        if self._enable_all_print:
                            print("SHORTENED RUL AREA")
                        thrshld = current_test_df["RUL"].min() + segment_thrshld
                        current_test_df = current_test_df[current_test_df["RUL"] < thrshld]

                    # 4. run the tester for this data frame and add a predicted column
                    try:
                        skip = skip_features[int(cluster_id)]
                    except Exception:
                        skip = []

                    # 5. shift the input curve to align with the one processed next
                    if last_ts != 0:
                        cur_ts = current_test_df["TS"].max()
                        timestamp_gap = cur_ts - last_ts

                    # 6. keep the most urgent (smallest) expected RUL seen so far
                    if current_test_df["RUL"].min() < expected_rul:
                        expected_rul = current_test_df["RUL"].min()

                    predicted_risk, predicted_rul, m, all_feature_sum, per_feature_sum, feature_favorites = self._predict_RUL(
                        data_in, current_test_df, cluster_id, all_feature_sum,
                        skip, timestamp_gap, expected_rul)
                    all_feature_favorites += feature_favorites

                    # VARIANT 1 - weighted average with 1/x weights
                    print("USING WEIGHTED AVERAGE")
                    total_amount = 0
                    total_count = 0
                    for feat in all_feature_favorites:
                        weight = 1 / feat
                        total_count += (weight * feat)
                        total_amount += weight

                    wAvg = total_count / total_amount
                    predicted_risk = wAvg
                    predicted_rul = (predicted_risk - 1) / m
                    print("\n->>>>>> Estimated predicted RUL FINAL FINAL: " +
                          str(predicted_rul) + "\nUPDATE RISK: " +
                          str(predicted_risk))

                    # 7. if more than 2 features are below 0.53, take the average of those
                    rego = [a for a in all_feature_favorites if a < 0.53]
                    if len(rego) > 2:
                        predicted_risk = numpy.average(rego)
                        predicted_rul = (predicted_risk - 1) / m
                        print("Estimated predicted RUL UPDATED: " +
                              str(predicted_rul) + "\nUPDATE RISK: " +
                              str(predicted_risk))

                    # 5. the result belongs at the row of test_df where current_test_df["RUL"] is minimal
                    test_df.at[current_test_df.index[-1], "predicted_risk"] = predicted_risk
                    test_df.at[current_test_df.index[-1], "predicted_rul"] = predicted_rul

                    # 6. store the last timestamp, used for shifting the next segment
                    if current_test_df["TS"].max() > last_ts:
                        last_ts = current_test_df["TS"].max()

                    # 3. store to file
                    if self._write_csv:
                        cnt += 1

                        object_id = str(object_id)
                        cluster_id = str(cluster_id)

        # 5. metrics
        metrics = {}
        return data_in, metrics
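The risk-fusion rule in the loop above (a 1/x weighted average of the per-feature risk favourites, overridden by the plain average of the low-risk features when more than two of them fall below 0.53) can be written as a small standalone function. This is a sketch under the assumption that all risk values are strictly positive; the function name and threshold parameter are illustrative:

    import numpy as np

    def fuse_risk(feature_favorites, low_risk_threshold=0.53):
        # 1/x weighting: small risk values get large weights
        weights = [1.0 / f for f in feature_favorites]
        weighted_avg = sum(w * f for w, f in zip(weights, feature_favorites)) / sum(weights)

        # override: if several features agree on low risk, average only those
        low = [f for f in feature_favorites if f < low_risk_threshold]
        if len(low) > 2:
            return float(np.average(low))
        return weighted_avg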