Example No. 1
    def run(self, data):
        super().run(data)  # do not remove this!

        # preprocessing - scaling X
        scaler = preprocessing.StandardScaler()
        scaler = scaler.fit(data[self._key_in_train_x])
        temp = dict()
        temp['train_X_scaled'] = scaler.transform(data[self._key_in_train_x])
        temp['test_X_scaled'] = scaler.transform(data[self._key_in_test_x])

        # Train model for criticality
        self._model_crit, self._model_auc = PredictorWrapperClassification.\
            train(temp['train_X_scaled'], temp['test_X_scaled'], data[self._key_in_train_crit],
                  data[self._key_in_test_crit], data['meta_dataset_name'] + "_crit_pred", self._reload_if_existing)

        # select critical samples based on model
        data[self._key_out_train_x], data[self._key_out_train_crit] = \
            self._crit_get_varargs(temp['train_X_scaled'], data[self._key_in_train_rul])
        data[self._key_out_test_x], data[self._key_out_test_crit] = \
            self._crit_get_varargs(temp['test_X_scaled'], data[self._key_in_test_rul])

        # plot RUL histograms before and after selecting critical samples
        Logging.log("RUL histogram of complete test population")
        max_rul = data[self._key_in_test_rul].max()
        Visual.plot_hist(data[self._key_in_test_rul], max_x=max_rul)
        Logging.log("RUL histogram of sub test population labeled critical")
        Visual.plot_hist(data[self._key_out_test_crit], max_x=max_rul)

        # metrics
        metrics = dict()  # empty metrics
        metrics['model_auc'] = self._model_auc

        return data, metrics
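
A minimal, self-contained sketch of the scaling pattern used above: the StandardScaler is fitted on the training features only and then applied to both splits. The array names are illustrative stand-ins for the pipeline's data dictionary.

    import numpy as np
    from sklearn import preprocessing

    # illustrative arrays in place of data[self._key_in_train_x] / data[self._key_in_test_x]
    train_X = np.random.rand(100, 5)
    test_X = np.random.rand(20, 5)

    scaler = preprocessing.StandardScaler().fit(train_X)  # fit on training data only
    train_X_scaled = scaler.transform(train_X)            # zero mean, unit variance per column
    test_X_scaled = scaler.transform(test_X)              # reuse the training statistics
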
Example No. 2
    def run(self, data):
        super().run(data)  # do not remove this!

        threshold = 1

        # predict criticality
        self._model_risk, self._model_risk_rmse = PredictorWrapperRegression.train(
            data["train_X_scaled_crit_bounded_scaled_top"],
            data["test_X_scaled_crit_bounded_scaled_top"],
            np.array(data["train_risc"]), np.array(data["test_risc"]),
            data['meta_dataset_name'] + "_risc_pred", self._reload_if_existing)

        below_threshold = np.array(data["train_risc"] < threshold)
        above_threshold = np.array(data["train_risc"] >= threshold)

        n_above = sum(above_threshold)
        n_below = sum(below_threshold)
        Logging().log(
            'there are {} samples above and {} below the threshold'.format(
                n_above, n_below))
        n_above_percentage_to_be_balanced = n_below / (n_above + n_below)
        rand_nums = np.random.choice([1, 0],
                                     size=(n_above + n_below, ),
                                     p=[
                                         n_above_percentage_to_be_balanced,
                                         (1 -
                                          n_above_percentage_to_be_balanced)
                                     ])
        Logging().log(
            'randomly picked a proportion of {0:.1%} of all {1} samples'.format(
                n_above_percentage_to_be_balanced, n_above + n_below))

        above_threshold_picked = (rand_nums == 1) & above_threshold

        # Plot
        Visual().plot_scatter(
            self._model_risk.predict(
                data["test_X_scaled_crit_bounded_scaled_top"]),
            data["test_risc"])
        Visual().plot_scatter(
            self._model_risk.predict(
                data["train_X_scaled_crit_bounded_scaled_top"]),
            data["train_risc"])

        # aggregate metrics
        metrics = dict()
        metrics['model_rmse'] = self._model_risk_rmse

        return data, metrics
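
The picking mask near the end of this example depends on operator precedence: `&` binds more tightly than `==`, so the comparison has to be parenthesized. A small sketch with made-up arrays:

    import numpy as np

    rand_nums = np.array([1, 0, 1, 0])
    above_threshold = np.array([True, True, False, False])

    # compare first, then combine the boolean masks element-wise
    picked = (rand_nums == 1) & above_threshold   # -> [True, False, False, False]

    # without the parentheses the expression is evaluated as rand_nums == (1 & above_threshold),
    # which is a plain equality test against 0/1 and not the intended intersection of the two masks
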
    def train(X_train,
              X_test,
              y_train,
              y_test,
              model_filename,
              reload_if_existing,
              modeltype="RF"):
        """
        trains and evaluates a model based on the given data
        :param X_train: Features for training, expected to a numpy.ndarray.
        :param X_test: Features for testing, expected to a numpy.ndarray.
        :param y_train: Labels for training. Expected to an one-dimesional array.
        :param y_test: Labels for testing. Expected to an one-dimesional array.
        :param model_filename: Filename of model when serialized to disk
        :param reload_if_existing: Boolean indicating if model should be restored from disk if existing.
        :param modeltype: modeltype to train (RF, SVC or LRCV). RF is recommended since being fast to train and non-
                          linear - therefore usually yielding the best results.
        :return: 
        """
        if not reload_if_existing or not Path(model_filename).exists():
            Logging().log("training {}. ".format(modeltype))
            if modeltype == "RF":
                param_grid = {
                    'max_depth': [3, 5, 10, 15, 20],
                    'n_estimators': [3, 5, 10, 20]
                }

                clf = GridSearchCV(RandomForestRegressor(n_jobs=-1),
                                   param_grid)

                mdl = clf.fit(X_train, y_train)

            # output model quality
            rmse = RMSE.score(y_test, mdl.predict(X_test))
            Logging().log("Mean squared error: {0:.3}".format(rmse))

            # save model to file
            with open(model_filename, 'wb') as f:
                pickle.dump((mdl, rmse), f)

        else:
            Logging().log("restoring model from {}".format(model_filename))

            with open(model_filename, 'rb') as fid:
                mdl, rmse = pickle.load(fid)

        return mdl, rmse
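
A hypothetical call of this regression wrapper (it is invoked as `PredictorWrapperRegression.train` in Example No. 2); the arrays and the file name are illustrative. On a second run with `reload_if_existing=True` the pickled `(mdl, rmse)` pair is restored instead of retraining.

    import numpy as np

    X_train, X_test = np.random.rand(200, 8), np.random.rand(50, 8)
    y_train, y_test = np.random.rand(200), np.random.rand(50)

    mdl, rmse = PredictorWrapperRegression.train(
        X_train, X_test, y_train, y_test,
        model_filename="demo_risc_pred",   # pickle file written by the wrapper
        reload_if_existing=True,
        modeltype="RF")
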
Example No. 4
    def _scale(self, train_df):
        ''' centers the data around 0 and scales it 
        :param: train_df: Dataframe that contains the training data
        :return: dataframe with additional column scaled_FEATURE_X containing scaled features
        :return: trained_scalers: dictionary mapping each column name to its fitted scaler object, needed in the
                 testing phase to apply identical scaling
        '''

        Logging().log("Scaling Features...")
        trained_scalers = {}
        for col in train_df.columns:

            # 1. consider only relevant columns
            if HeatmapConvolutionTrainer.relevant_columns(col):
                continue

            # 2. standard scaler (fall back to a 2-D reshape for one-dimensional input)
            scaler = preprocessing.StandardScaler()
            try:
                scaler = scaler.fit(train_df[col])
            except ValueError:
                scaler = scaler.fit(train_df[col].values.reshape(-1, 1))
            try:
                train_df['scaled_' + col] = scaler.transform(train_df[col])
            except ValueError:
                train_df['scaled_' + col] = scaler.transform(
                    train_df[col].values.reshape(-1, 1))

            trained_scalers[col] = copy.deepcopy(scaler)

        return train_df, trained_scalers
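
A condensed sketch of the per-column scaling idea, assuming illustrative column names; the fitted scalers are kept so the test phase can apply the identical transformation:

    import pandas as pd
    from sklearn import preprocessing

    train_df = pd.DataFrame({"FEATURE_1": [1.0, 2.0, 3.0], "FEATURE_2": [10.0, 20.0, 30.0]})
    test_df = pd.DataFrame({"FEATURE_1": [1.5, 2.5], "FEATURE_2": [15.0, 25.0]})

    trained_scalers = {}
    for col in train_df.columns:
        scaler = preprocessing.StandardScaler().fit(train_df[[col]])  # 2-D input via df[[col]]
        train_df["scaled_" + col] = scaler.transform(train_df[[col]])
        trained_scalers[col] = scaler

    # testing phase: reuse the scaler that was fitted on the training data
    for col, scaler in trained_scalers.items():
        test_df["scaled_" + col] = scaler.transform(test_df[[col]])
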
Example No. 5
    def run(self, data_in, extract_frame_override=False, train_dff=None):
        super().run(data_in)  # do not remove this!
        Logging.log("Training da heat...")

        # 1. transform to df and keep critical
        if not extract_frame_override:
            train_df = self._extract_critical_data_frame(data_in)
            train_df[self._field_in_train_cluster_id] = data_in[
                self._field_in_train_cluster_id]
        else:
            train_df = train_dff

        for cluster_id in train_df["train_cluster_id"].unique():  # one model per cluster
            if self._test_mode and cluster_id != 3:
                continue

            print("\n\n TRAINING CLUSTER: " + str(cluster_id))
            cur_train_df = train_df[train_df[self._field_in_train_cluster_id]
                                    == cluster_id]

            # 2. scale data and remove outliers
            output_dfs, trained_scalers = self._preprocess_data(
                cur_train_df, self._remove_empty_features,
                self._nr_outlier_iterations, self._outlier_window_size,
                self._outlier_std_threshold)
            data_in[
                "CL_" + str(cluster_id) + "_" +
                self._field_out_train_model_trained_scalers] = trained_scalers

            # 3. Train the model
            model_per_feature = self._build_heat_map_parallel(output_dfs)
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model_grid_area] = self._grid_area

            # 4. Store the models
            data_in["CL_" + str(cluster_id) + "_" +
                    self._field_out_train_model] = model_per_feature

        # 5. empty metrics
        metrics = dict()

        return data_in, metrics
Example No. 6
 def _visualize_feature_series(self, train_df, cluster_ids):
     ''' plots each feature of the reduced training set with the 
         color of its assigned cluster
         :param train_df: dataframe of prepared features (only critical values!)
         :param cluster_ids: array of cluster ids corresponding to the reduced rows
     '''
     seaborn.set(style='ticks')
     train_df["train_cluster_id"] = cluster_ids
     
     for col in train_df.columns:
         if not col.startswith("FEATURE"): continue 
         Logging().log("CURRENT -> "+ col)
         _order = list(set(cluster_ids))
         fg = seaborn.FacetGrid(data=train_df, hue='train_cluster_id', hue_order=_order, aspect=1.61)
         fg.map(plt.scatter, 'RISK', col).add_legend()
         plt.show()
Example No. 7
 def _kmeans_parse(self, data):
     ''' extract all required parameters for k means clustering
     
     :param data: 2D array containing features in shape array([[ f1 f2 f3 f4, ...], [ f1 f2 f3 f4, ...], [ f1 f2 f3 f4, ...], ...])   
     
     :return n_samples: Number of input examples 
     :return n_features: Number of features per example
     :return n_clusters: Number of expected target clusters  
     '''        
     
     expected_cluster_number = self._algorithm_params[0]
     
     np.random.seed(42)    
     n_samples, n_features = data.shape
     n_clusters = expected_cluster_number  # number of clusters
     Logging().log("n_clusters: %d, \t n_samples %d, \t n_features %d" % (n_clusters, n_samples, n_features))
     return n_samples, n_features, n_clusters
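
The parsed values would typically feed straight into scikit-learn's KMeans; a minimal sketch with random data, where `n_clusters` stands in for `self._algorithm_params[0]`:

    import numpy as np
    from sklearn.cluster import KMeans

    data = np.random.rand(500, 4)        # rows are samples, columns are features
    n_samples, n_features = data.shape
    n_clusters = 3                       # would come from self._algorithm_params[0]

    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    cluster_ids = kmeans.labels_         # one cluster id per input row
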
Example No. 8
    def run_stage(self, key, data=None):
        """
        
        :param key: The key referencing the stage
        :param data: data dictionary
        :return: 
        """

        # current pipelineNode
        pipelineNode = self._stages[key]
        Logging().log("Running Stage: {}".format(
            pipelineNode.__class__.__name__))

        # always update data from previous stage
        if data is None:
            data = self._data

        # run stage
        self._data, self._metrics = pipelineNode.run(data)

        return self._data, self._metrics
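
The method relies on a simple stage contract: every stage's `run(data)` returns the (possibly extended) data dictionary plus a metrics dictionary, and the output of one stage becomes the input of the next. A minimal sketch with a hypothetical stage class:

    class DummyStage:
        def run(self, data):
            data["touched"] = True               # stages typically add keys to the shared dict
            return data, {"n_keys": len(data)}   # (data, metrics) as expected by run_stage

    data, metrics = {"raw": [1, 2, 3]}, {}
    for stage in [DummyStage(), DummyStage()]:
        data, metrics = stage.run(data)          # chain the stages like the pipeline does
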
    def run(self, data):
        super().run(data)  # do not remove this!

        # determine feature weights
        pf = PolyFitter()

        # create temporary dictionary for data that is not passed to the next stage
        temp = dict()
        X = data['train_X_scaled_crit_bounded_scaled']
        n_samples = X.shape[0]
        # features are zero-indexed, so the column count equals the index of the appended random feature
        n_idx_random = X.shape[1]
        rand_feature = np.random.randn(n_samples)
        temp['t_X_s_c_b_s_enhanced'] = np.c_[X, rand_feature]  # hstack
        data['meta_feature_weights'] = pf.get_weights(
            temp['t_X_s_c_b_s_enhanced'], data['train_risc'])

        # select top features
        if self._select_above_rand:
            Logging().log("selecting feature above random feature")

            # restore the selected feature indices from file in case they exist
            model_filename = data['meta_dataset_name'] + "_" + self._model_filename
            if not self._reload_if_existing or not Path(model_filename).exists():
                data['meta_feature_indices'] = pf.get_feature_idices_above_rand(
                    data['meta_feature_weights'], n_idx_random=n_idx_random)
                # save the selected indices to file
                with open(model_filename, 'wb') as f:
                    pickle.dump(data['meta_feature_indices'], f)

            else:
                Logging().log("restoring model from {}".format(model_filename))
                with open(model_filename, 'rb') as fid:
                    data['meta_feature_indices'] = pickle.load(fid)

            Logging().log("selected {} features from {}".format(
                len(data['meta_feature_indices']), n_idx_random))
        else:
            Logging().log("selecting top {} features".format(
                self._n_top_features))
            data['meta_feature_indices'] = pf.get_top_feature_idices(
                data['meta_feature_weights'], self._n_top_features)

        data[self._field_out_train_X_scaled_crit_bounded_scaled_top] = pf.get_top_features(
            data[self._field_in_train_X_scaled_crit_bounded_scaled],
            data['meta_feature_indices'])
        data[self._field_out_test_X_scaled_crit_bounded_scaled_top] = pf.get_top_features(
            data[self._field_in_test_X_scaled_crit_bounded_scaled],
            data['meta_feature_indices'])

        # aggregate metrics
        metrics = dict()  # empty

        return data, metrics
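
The selection logic above hinges on the random-probe trick: a pure-noise column is appended, feature weights are computed, and only real features that score higher than the probe are kept. A sketch with a simple correlation-based weight as a stand-in for `pf.get_weights`:

    import numpy as np

    X = np.random.rand(300, 10)                      # scaled training features
    y = 2.0 * X[:, 0] + 0.1 * np.random.rand(300)    # target driven mainly by feature 0

    n_idx_random = X.shape[1]                        # index the probe gets after the hstack
    X_enhanced = np.c_[X, np.random.randn(300)]      # append a pure-noise column

    # stand-in for pf.get_weights: absolute correlation of each column with the target
    weights = np.abs([np.corrcoef(X_enhanced[:, i], y)[0, 1]
                      for i in range(X_enhanced.shape[1])])

    # keep every real feature whose weight exceeds that of the random probe
    selected = [i for i in range(n_idx_random) if weights[i] > weights[n_idx_random]]
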
Example No. 10
    def _outlier_removal(self, train_df, remove_empty, nr_iterations,
                         split_windows, std_threshold):
        ''' outliers are removed from the training dataframe per feature by windowing and removing
            all values per window that are further away than std_threshold times the standard
            deviation
        :param: train_df: Dataframe that contains the training data
        :param: remove_empty: Boolean - if true, empty features are removed
        :param: nr_iterations: number of iterations of outlier removal per window
        :param: split_windows: the data is split into split_windows equal-length windows between the minimal risk and 1
        :param: std_threshold: data that is further away than std_threshold * std of the feature is removed
        :return: output_dfs: list of dataframes, each with a column scaled_FEATURE_X that is now outlier-free and a
                             column RISK holding the risk for that feature at its row
        '''
        if not self._remove_outliers:
            print("Outlier removal disabled!")
        # 1. Initialize
        output_dfs = []
        iteration = range(nr_iterations)

        first = True
        # Per feature and window
        for col in train_df.columns:

            # 2. only scaled features are considered
            if HeatmapConvolutionTrainer.scaled_relevant_columns(col): continue
            result_df = train_df.sort_values("RISK")

            # 3. iterate multiple times over window
            #   on each iteration remove outliers
            for i in iteration:
                sub_dfs = []
                indices = []
                rs = 0
                # 4. iterate over windows
                for r in np.linspace(result_df["RISK"].min(), 1,
                                     split_windows):

                    sub_df = result_df[(rs <= result_df["RISK"])
                                       & (r > result_df["RISK"])]
                    if self._remove_outliers:
                        sub_df = sub_df[(
                            (sub_df[col] - sub_df[col].mean()) /
                            sub_df[col].std()).abs() < std_threshold]
                    sub_dfs.append(sub_df)
                    rs = r
                result_df = pd.concat(sub_dfs)

            # 5. Remove empty features before merging
            if remove_empty and len(result_df[col].unique()) < 2:
                continue

            # 6. Merge result into the common output list
            output_dfs.append(result_df[["RISK", col]])

            # 7. Plot results
            if self._visualize_outlier:
                Logging().log("Pre - Standard Deviation vorher: " +
                              str(train_df[col].std()))
                Visual().plot_scatter(train_df["RISK"],
                                      train_df[col])  #, "RISK", "feature")
                Logging().log("Post - Standard Deviation nachher: " +
                              str(result_df[col].std()))
                Visual().plot_scatter(result_df["RISK"], result_df[col])

        return output_dfs
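
The core of the windowed outlier removal, reduced to a single feature column with made-up data; per window, values further than `std_threshold` standard deviations from the window mean are dropped:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"RISK": np.linspace(0, 1, 200),
                       "scaled_FEATURE_1": np.random.randn(200)})
    std_threshold, split_windows = 2.5, 10

    kept = []
    rs = df["RISK"].min()
    for r in np.linspace(df["RISK"].min(), 1, split_windows):
        window = df[(rs <= df["RISK"]) & (df["RISK"] < r)]
        col = window["scaled_FEATURE_1"]
        kept.append(window[((col - col.mean()) / col.std()).abs() < std_threshold])
        rs = r
    cleaned = pd.concat(kept)
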
Example No. 11
    def _build_heat_map(self, output_dfs):
        ''' a heat map is generated per feature by convolving each point of a 2-D grid of risk vs. feature value
            :param output_dfs: list of dataframes, each with a column scaled_FEATURE_X (outlier-free and scaled) and a
                               column RISK holding the risk for that feature at its row
            :return: a dictionary that maps the feature name to its 2-D heatmap
        '''
        dimensions = {}
        for feature_df in output_dfs:  # each output_df has one risk and value
            Logging().log("Processing Feature: " + feature_df.columns[1])

            # Testmode
            if self._test_mode and (feature_df.columns[1]
                                    == "scaled_FEATURE_5"):
                print("Testing thus, break now!")
                break

            try:
                values = np.empty(len(feature_df))
                values.fill(1)

                # Assign X Y Z
                X = feature_df.RISK.as_matrix()
                Y = feature_df[feature_df.columns[1]].as_matrix()
                Z = values

                # create x-y points to be used in heatmap of identical size
                risk_min = 0
                risk_max = 1
                feature_min = min([
                    rm
                    for rm in [df[df.columns[1]].min() for df in output_dfs]
                    if not math.isnan(rm)
                ])
                feature_max = max([
                    rm
                    for rm in [df[df.columns[1]].max() for df in output_dfs]
                    if not math.isnan(rm)
                ])

                xi = np.linspace(risk_min, risk_max, self._grid_area)
                yi = np.linspace(feature_min, feature_max, self._grid_area)

                # Z is a matrix of x-y values interpolated (!)
                zi = griddata((X, Y),
                              Z, (xi[None, :], yi[:, None]),
                              method=self._interpol_method)
                zmin = 0
                zmax = 1
                zi[(zi < zmin) | (zi > zmax)] = None

                # Convolve each point with a Gaussian kernel giving the heat value at point xi, yi being Z
                # Advantage: keeps horizontal and vertical influence
                grid_cur = np.nan_to_num(zi)

                # Smooth with a Gaussian kernel
                kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                          x_size=self._kernel_size,
                                          y_size=self._kernel_size)
                grad = scipy_convolve(grid_cur,
                                      kernel,
                                      mode='same',
                                      method='direct')

                # no constant/zero values shall be allowed -> first: horizontal
                # horizontal interpolation out to the edge
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[:, r]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 20
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[:len(replacement), r] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1])[::-1]
                        grad[nonzeros[-1]:, r] = replacement

                # vertical interpolation out to the edge
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[r, :]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 20
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[r, :len(replacement)] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1] + 1)[::-1]
                        grad[r, nonzeros[-1] - 1:] = replacement

                # Store the model in memory
                dimensions[feature_df.columns[1]] = [
                    copy.deepcopy(np.absolute(grad)),
                    copy.deepcopy(xi),
                    copy.deepcopy(yi)
                ]

                if self._visualize_heatmap:
                    fig, (ax_orig, ax_mag) = plt.subplots(1, 2)
                    ax_orig.imshow(grid_cur[::-1, ::-1], cmap='RdYlGn')
                    ax_orig.set_title('Original')
                    ax_mag.imshow(
                        np.absolute(grad)[::-1, ::-1], cmap='RdYlGn'
                    )  # https://matplotlib.org/examples/color/colormaps_reference.html
                    ax_mag.set_title('Heat')
                    fig.show()
                    plt.show()

            except Exception:
                Logging().log("skipping feature " + str(feature_df.columns[1]))
                #traceback.print_exc()
                dimensions[feature_df.columns[1]] = None

        return dimensions, xi
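
The heat-map construction boils down to: interpolate scattered (risk, feature value) observations onto a regular grid, replace the NaNs outside the convex hull with zeros, and smooth the result. A compact sketch that uses `scipy.ndimage.gaussian_filter` as a stand-in for the astropy `Gaussian2DKernel` convolution used above:

    import numpy as np
    from scipy.interpolate import griddata
    from scipy.ndimage import gaussian_filter

    # scattered observations: risk (x), feature value (y), constant "heat" of 1 per sample
    X = np.random.rand(200)
    Y = np.random.randn(200)
    Z = np.ones(200)

    grid_area = 100
    xi = np.linspace(0, 1, grid_area)                 # risk axis
    yi = np.linspace(Y.min(), Y.max(), grid_area)     # feature-value axis

    zi = griddata((X, Y), Z, (xi[None, :], yi[:, None]), method="linear")
    grid_cur = np.nan_to_num(zi)                      # NaNs outside the convex hull become 0

    heat = gaussian_filter(grid_cur, sigma=3)         # smoothed heat map
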
Example No. 12
    def _build_one_heat_map(self,
                            feature_df,
                            risk_min,
                            feature_min,
                            feature_max,
                            fine_tune=-1):
        Logging().log("Processing Feature: " + feature_df.columns[1])

        if fine_tune == -1:
            try:
                values = np.empty(len(feature_df))
                values.fill(1)

                # Assign X Y Z
                X = feature_df.RISK.as_matrix()
                Y = feature_df[feature_df.columns[1]].as_matrix()
                Z = values

                # create x-y points to be used in heatmap of identical size
                risk_min = feature_df.RISK.min()
                risk_max = 1

                xi = np.linspace(risk_min, risk_max, self._grid_area)
                yi = np.linspace(feature_min, feature_max, self._grid_area)

                # Z is a matrix of x-y values interpolated (!)
                zi = griddata((X, Y),
                              Z, (xi[None, :], yi[:, None]),
                              method=self._interpol_method)
                zmin = 0
                zmax = 1
                zi[(zi < zmin) | (zi > zmax)] = None

                # Convolve each point with a Gaussian kernel giving the heat value at point xi, yi being Z
                # Advantage: keeps horizontal and vertical influence
                grid_cur = np.nan_to_num(zi)

                # Smooth with a Gaussian kernel
                kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                          x_size=self._kernel_size,
                                          y_size=self._kernel_size)
                grad = scipy_convolve(grid_cur,
                                      kernel,
                                      mode='same',
                                      method='direct')

                # horizontal interpolation
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[:, r]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 4
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[:len(replacement), r] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1])[::-1]
                        grad[nonzeros[-1]:, r] = replacement

                # Store the model in memory
                feature_name = feature_df.columns[1]
                result = [
                    feature_name,
                    [
                        copy.deepcopy(np.absolute(grad)),
                        copy.deepcopy(xi),
                        copy.deepcopy(yi)
                    ], grid_cur
                ]

            except Exception:
                feature_name = feature_df.columns[1]
                Logging().log(str(feature_df.columns[1]) + ": Feature skipped")
                result = [feature_name, None, None]

        else:
            if fine_tune == 0:
                feature_df = feature_df[feature_df["RISK"] < 0.5]
            if fine_tune == 1:
                feature_df = feature_df[feature_df["RISK"] > 0.25]
                feature_df = feature_df[feature_df["RISK"] < 0.75]
            if fine_tune == 2:
                feature_df = feature_df[feature_df["RISK"] > 0.5]

            try:
                feature_df = self._remove_one_outlier(feature_df,
                                                      feature_df.columns[1])

                values = np.empty(len(feature_df))
                values.fill(1)

                # Assign X Y Z
                X = feature_df.RISK.as_matrix()
                Y = feature_df[feature_df.columns[1]].as_matrix()
                Z = values
                risk_min = feature_df.RISK.min()
                risk_max = 1

                xi = np.linspace(risk_min, risk_max, self._grid_area)
                yi = np.linspace(feature_min, feature_max, self._grid_area)

                # Z is a matrix of x-y values interpolated (!)
                zi = griddata((X, Y),
                              Z, (xi[None, :], yi[:, None]),
                              method=self._interpol_method)
                zmin = 0
                zmax = 1
                zi[(zi < zmin) | (zi > zmax)] = None

                # Convolve each point with a Gaussian kernel giving the heat value at point xi, yi being Z
                # Advantage: keeps horizontal and vertical influence
                grid_cur = np.nan_to_num(zi)

                # Smooth with a Gaussian kernel
                kernel = Gaussian2DKernel(stddev=self._std_gaus,
                                          x_size=self._kernel_size,
                                          y_size=self._kernel_size)
                grad = scipy_convolve(grid_cur,
                                      kernel,
                                      mode='same',
                                      method='direct')

                # vertical interpolation out to the edge
                for r in range(len(grad)):
                    # per dimension get first and last nonzero value
                    cur_line = grad[:, r]
                    nonzeros = numpy.where(cur_line > 0.0001)[0]
                    if list(nonzeros):
                        a = 4
                        # fill from 0 to nonzeros[0]
                        v = numpy.average(cur_line[nonzeros[0]:(nonzeros[0] +
                                                                a)])
                        replacement = numpy.linspace(0, v, nonzeros[0] +
                                                     a)[:(nonzeros[0])]
                        grad[:len(replacement), r] = replacement

                        # fill from nonzeros[-1] to len(grid)-1
                        v = numpy.average(cur_line[nonzeros[-1] -
                                                   a:(nonzeros[-1])])
                        replacement = numpy.linspace(
                            0, v,
                            len(cur_line) - nonzeros[-1])[::-1]
                        grad[nonzeros[-1]:, r] = replacement

                # Store the model in memory
                feature_name = feature_df.columns[1]
                result = [
                    "fine_" + str(fine_tune) + "_" + feature_name,
                    [
                        copy.deepcopy(np.absolute(grad)),
                        copy.deepcopy(xi),
                        copy.deepcopy(yi)
                    ], grid_cur
                ]

            except Exception:
                #traceback.print_exc()
                feature_name = feature_df.columns[1]
                result = [feature_name, None, None]

        return result
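
The edge-filling loops in the last two examples replace the leading and trailing zeros of each grid line with a linear ramp toward the first and last observed values. The same idea on a single 1-D line with made-up numbers:

    import numpy as np

    line = np.array([0.0, 0.0, 0.0, 0.4, 0.5, 0.6, 0.0, 0.0])
    a = 2                                      # averaging window near each edge

    nonzeros = np.where(line > 0.0001)[0]
    if list(nonzeros):
        # ramp from 0 up toward the average of the first a nonzero values
        v = np.average(line[nonzeros[0]:nonzeros[0] + a])
        line[:nonzeros[0]] = np.linspace(0, v, nonzeros[0] + a)[:nonzeros[0]]

        # mirrored ramp from the average of the last a nonzero values down to 0
        v = np.average(line[nonzeros[-1] - a:nonzeros[-1]])
        line[nonzeros[-1]:] = np.linspace(0, v, len(line) - nonzeros[-1])[::-1]
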
Example No. 13
    def run(self, data):
        super().run(data)  # do not remove this!

        # temporary dictionary: use this for all data that is not referenced by self._field
        temp = dict()

        # Assign and model risk
        rul_percentile_value = np.percentile(
            data[self._field_in_train_rul_crit], self._rul_percentile)
        Logging().log(
            "any rul value larger than {0:.1f} will be dropped.".format(
                rul_percentile_value))

        indices_train = np.array(
            data[self._field_in_train_rul_crit] <= rul_percentile_value)
        # _bounded = only samples with RUL within the percentile
        temp["train_rul_crit_bounded"] = data[
            self._field_in_train_rul_crit][indices_train]
        temp["train_X_scaled_crit_bounded"] = data[
            self._field_in_train_X_scaled_crit][indices_train]

        indices_test = np.array(
            data[self._field_in_test_rul_crit] <= rul_percentile_value)
        temp["test_rul_crit_bounded"] = data[
            self._field_in_test_rul_crit][indices_test]
        temp["test_X_scaled_crit_bounded"] = data[
            self._field_in_test_X_scaled_crit][indices_test]

        scaler = preprocessing.StandardScaler()
        scaler = scaler.fit(temp["train_X_scaled_crit_bounded"])

        data[self._field_out_train_X_scaled_crit_bounded_scaled] = scaler.transform(
            temp["train_X_scaled_crit_bounded"])
        data[self._field_out_test_X_scaled_crit_bounded_scaled] = scaler.transform(
            temp["test_X_scaled_crit_bounded"])

        # first, we calculate the risk for all critical samples based on the rul
        rul_min = np.min(temp["train_rul_crit_bounded"])
        rul_max = np.max(temp["train_rul_crit_bounded"])
        Logging().log(
            "max RUL in bounded training dataset is {} (RISK = 1), min is {} (RISK = 0)."
            .format(rul_max, rul_min))

        data[self._field_out_train_risc] = self._get_risc_target(
            temp["train_rul_crit_bounded"])
        data[self._field_out_test_risc] = self._get_risc_target(
            temp["test_rul_crit_bounded"])

        Visual().plot_scatter(temp["train_rul_crit_bounded"],
                              data[self._field_out_train_risc])

        #for field in ["train", "test", "valid"]:
        #    field_real = "rul_" + field
        #    if field_real in data:
        #        data["risk_" + field] = self._get_risc_target(data[field_real])
        #        Visual().plot_scatter(data[field_real], data["risk_" + field])

        # metrics
        metrics = dict()  # empty metrics

        return data, metrics
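
A condensed sketch of the bounding-and-rescaling step above, with illustrative arrays standing in for the pipeline's data dictionary:

    import numpy as np
    from sklearn import preprocessing

    train_rul = np.random.randint(1, 300, size=500).astype(float)
    train_X = np.random.rand(500, 6)

    # keep only samples whose RUL lies within the chosen percentile
    rul_percentile_value = np.percentile(train_rul, 95)
    indices_train = train_rul <= rul_percentile_value
    train_rul_bounded = train_rul[indices_train]
    train_X_bounded = train_X[indices_train]

    # re-scale the bounded feature matrix; the same scaler is reused for the test split
    scaler = preprocessing.StandardScaler().fit(train_X_bounded)
    train_X_bounded_scaled = scaler.transform(train_X_bounded)
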
    def run(self, data_in):
        super().run(data_in)  # do not remove this!
        Logging.log("Testing da heat...")

        # 1. transform to df and keep critical
        test_df = self._extract_critical_data_frame(data_in)

        # 2. assign cluster id, add column with id
        test_df = self._assign_cluster(data_in, test_df)
        test_df["predicted_rul"] = -1
        test_df["predicted_risk"] = -1

        abs_max_rul = test_df["RUL"].max()  # 217
        segment_thrshld = 0.33 * abs_max_rul
        if self._enable_all_print:
            print("THE MAXIMUM RUL IN THE DATA SET IS " + str(abs_max_rul))

        # 3. extract current relevant data - do this for all and append
        for object_id in list(test_df["id"].unique()):
            all_feature_sum = False
            cur_df1 = test_df[test_df['id'] == object_id]
            print("Current: OBJECT ID: " + str(object_id))

            timestamp_gap = 0  # per cluster the incoming data has to be shifted, otherwise it cannot be summed up
            last_ts = 0
            expected_rul = 99999999
            all_feature_favorites = []
            for cluster_id in list(cur_df1["cluster_id"].unique()):
                if self._test_mode and not (cluster_id == 3):
                    continue
                Logging.log("--------> Eval: CLUSTER ID: " + str(cluster_id))
                cur_df2 = cur_df1[cur_df1['cluster_id'] == cluster_id]
                cnt = 0
                cur_df3 = cur_df2.sort_values("RUL", ascending=False)

                # per object predict only the maximal
                first = True
                for i in range(len(cur_df3)):

                    # 0. for parallelization, only the last one is estimated
                    current_test_df = cur_df3
                    if not first:
                        continue
                    if first:
                        first = False
                    Logging.log("--------> Eval: RUL RANGE: " +
                                str(current_test_df["RUL"].max()) + " to " +
                                str(current_test_df["RUL"].min()))

                    # 1. OPTIMIZATION - do not take everything, only at most the last 120
                    #    (otherwise results are skewed, because training only went up to 200)
                    dist = current_test_df["RUL"].max() - current_test_df["RUL"].min()
                    if dist > segment_thrshld:
                        if self._enable_all_print:
                            print("SHORTENED RUL AREA")
                        thrshld = current_test_df["RUL"].min() + segment_thrshld
                        current_test_df = current_test_df[
                            current_test_df["RUL"] < thrshld]

                    # 4. run tester for this data frame and add column predicted
                    try:
                        skip = skip_features[int(cluster_id)]
                    except:
                        skip = []

                    # 5. shift the input curve to align with the one processed next
                    if last_ts != 0:
                        cur_ts = current_test_df["TS"].max()
                        timestamp_gap = cur_ts - last_ts

                    # 6. store last Timestamp for shifting if it is more urgent
                    if current_test_df["RUL"].min() < expected_rul:
                        expected_rul = current_test_df["RUL"].min()

                    predicted_risk, predicted_rul, m, all_feature_sum, per_feature_sum, feature_favorites = self._predict_RUL(
                        data_in, current_test_df, cluster_id, all_feature_sum,
                        skip, timestamp_gap, expected_rul)
                    all_feature_favorites += feature_favorites

                    # VARIANT 1 - weighted average with 1/x weights (harmonic mean)
                    print("USING WEIGHTED AVERAGE")
                    total_amount = 0
                    total_count = 0
                    for feat in all_feature_favorites:
                        weight = 1 / feat
                        total_count += (weight * feat)
                        total_amount += weight

                    wAvg = total_count / total_amount
                    predicted_risk = wAvg
                    predicted_rul = (predicted_risk - 1) / m
                    print("\n->>>>>> Estimated predicted RUL FINAL FINAL: " +
                          str(predicted_rul) + "\nUPDATE RISK: " +
                          str(predicted_risk))

                    # 7. if more than 2 features are below 0.53, take the average of those
                    rego = [a for a in all_feature_favorites if a < 0.53]
                    if len(rego) > 2:
                        predicted_risk = numpy.average(rego)
                        predicted_rul = (predicted_risk - 1) / m
                        print("Estimated predicted RUL UPDATED: " +
                              str(predicted_rul) + "\nUPDATE RISK: " +
                              str(predicted_risk))

                    # 5. result should be at location of test_df WHERE current_test_df["RUL"].min()
                    test_df = test_df.set_value(current_test_df.index[-1],
                                                "predicted_risk",
                                                predicted_risk)
                    test_df = test_df.set_value(current_test_df.index[-1],
                                                "predicted_rul", predicted_rul)

                    # 6. store last Timestamp for shifting if it is more urgent
                    if current_test_df["TS"].max() > last_ts:
                        last_ts = current_test_df["TS"].max()

                    # 3. store to file
                    if self._write_csv:
                        cnt += 1

                        object_id = str(object_id)
                        cluster_id = str(cluster_id)

        # 5. metrics
        metrics = {}
        return data_in, metrics
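
The "weighted average with 1/x" block in the middle of this example weights each candidate risk value by the inverse of its magnitude, which works out to the harmonic mean of the candidates. Isolated with made-up values:

    all_feature_favorites = [0.4, 0.5, 0.8]

    total_amount = 0.0   # sum of the weights
    total_count = 0.0    # sum of weight * value (each term equals 1 here)
    for feat in all_feature_favorites:
        weight = 1 / feat
        total_count += weight * feat
        total_amount += weight

    w_avg = total_count / total_amount   # == len(values) / sum(1/v), the harmonic mean (~0.52)
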
    def train(X_train,
              X_test,
              y_train,
              y_test,
              model_filename,
              reload_if_existing,
              modeltype="RF",
              cv_measure="roc_auc_score"):
        """
        trains and evaluates a model based on the given data
        :param X_train: Features for training, expected to a numpy.ndarray.
        :param X_test: Features for testing, expected to a numpy.ndarray.
        :param y_train: Labels for training. Expected to an one-dimesional array.
        :param y_test: Labels for testing. Expected to an one-dimesional array.
        :param model_filename: Filename of model when serialized to disk
        :param reload_if_existing: Boolean indicating if model should be restored from disk if existing.
        :param modeltype: modeltype to train (RF, SVC or LRCV). RF is recommended since being fast to train and non-
                          linear - therefore usually yielding the best results.
        :param cv_measure: possible cv_measure are ['accuracy', 'precision', 'recall', 'roc_auc']
        :return: 
        """
        if not reload_if_existing or not Path(model_filename).exists():
            Logging().log("training {}. ".format(modeltype))
            if modeltype == "LRCV":
                Logging().log("Optimizing for {}...".format(cv_measure))
                lr = LogisticRegressionCV(
                    Cs=[0.001, 0.01, 0.1, 1],
                    cv=5,
                    penalty='l1',
                    scoring=cv_measure,  # Changed from auROCWeighted
                    solver='liblinear',
                    tol=0.001,
                    n_jobs=mp.cpu_count())
                mdl = lr.fit(X_train, y_train)
                Logging().log("cross validated {0} (train) is {1:.3}".format(
                    cv_measure, max(np.mean(mdl.scores_[1],
                                            axis=0))))  # get CV train metrics
            elif modeltype is "SVC":
                # after ~2h of training: cross validated roc_auc=0.511 on rex
                clf = SVC()
                mdl = clf.fit(X_train, y_train)
            elif modeltype is "RF":
                # after ~2h of training: cross validated roc_auc=0.511 on rex

                param_grid = {
                    'max_depth': [3, 5, 10, 15, 20],
                    'n_estimators': [3, 5, 10, 20]
                }

                clf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                                   param_grid)

                mdl = clf.fit(X_train, y_train)

            # output model quality
            cross_val_res = cross_val_score(mdl,
                                            X_test,
                                            y_test,
                                            scoring='roc_auc')
            auc_test = np.mean(cross_val_res)
            Logging().log(
                "cross validated AUC (test) is {0:.3}".format(auc_test))

            # save model to file
            with open(model_filename, 'wb') as f:
                pickle.dump((mdl, auc_test), f)

        else:
            Logging().log("restoring model from {}".format(model_filename))

            with open(model_filename, 'rb') as fid:
                (mdl, auc_test) = pickle.load(fid)

        return mdl, auc_test
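
A self-contained sketch of the "RF" branch plus the ROC-AUC evaluation used above, with random illustrative data and no disk caching:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, cross_val_score

    X_train, y_train = np.random.rand(200, 8), np.random.randint(0, 2, 200)
    X_test, y_test = np.random.rand(100, 8), np.random.randint(0, 2, 100)

    param_grid = {'max_depth': [3, 5, 10], 'n_estimators': [5, 10, 20]}
    mdl = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid).fit(X_train, y_train)

    # same evaluation as above: mean cross-validated ROC AUC on the held-out split
    auc_test = np.mean(cross_val_score(mdl, X_test, y_test, scoring='roc_auc'))
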