Esempio n. 1
0
    def without_resampling(self, X_test, y_test, act_test, X_cnn_test,
                           X_lstm_test):
        """Predict and evaluate on the held-out test set without resampling.

        Args:
            X_test, y_test: pandas objects holding test inputs and targets.
            act_test: test-set activations (unused here; kept so the
                signature matches the resampling variants).
            X_cnn_test, X_lstm_test: optional 3D inputs for the CNN/LSTM
                models (empty arrays when absent).

        Returns:
            Tuple ``(pred_test_comb, y_test, result)`` where ``y_test`` is a
            2D ndarray and ``result`` is the evaluation table cast to float.

        Raises:
            ValueError: if the scaler has not been set.
        """
        if self.x_scaler is not None:
            X_test = X_test.values
            y_test = y_test.values
            # Pick the prediction module that matches this model's scope.
            if self.is_global:
                predict_module = global_predict(self.static_data)
            else:
                predict_module = cluster_predict(self.static_data,
                                                 self.cluster_name)

            self.logger.info(
                'Make predictions of testing set not used in training')
            self.logger.info('\n')  # was '/n': typo for a newline

            pred_test_comb = predict_module.predict(X_test,
                                                    X_cnn=X_cnn_test,
                                                    X_lstm=X_lstm_test,
                                                    fs_reduced=False)

            result_test_comb = predict_module.evaluate(pred_test_comb, y_test)

            # Downstream consumers expect 2D targets.
            if len(y_test.shape) == 1:
                y_test = y_test[:, np.newaxis]
            return pred_test_comb, y_test, result_test_comb.astype(float)
        else:
            raise ValueError('Scaler or data indices are not set')
    def predict_regressors(self, X1, y1, X_cnn, X_lstm):
        """Run every cluster regressor on the chronological test split and
        persist predictions and per-cluster metrics to ``path_data``.

        The test split is every row with index >= ``self.split_test``.
        For each cluster, only the rows whose fuzzy activation meets
        ``self.thres_act`` are predicted.

        NOTE(review): the ``else`` branch only sets
        ``combine_methods = ['average']`` — the method appears to continue
        beyond this excerpt; confirm against the full file.
        """
        data_path = self.static_data['path_data']
        if not self.split_test is None:
            # Chronological split: everything at/after split_test is test data.
            X_test = X1.loc[X1.index >= self.split_test]
            y_test = y1.loc[X1.index >= self.split_test]
            test_ind = np.where(X1.index >= self.split_test)[0]
            test_ind.sort()
            # CNN/LSTM tensors are optional; a shape of length 1 means "absent".
            if len(X_cnn.shape) > 1:
                X_cnn_test = X_cnn[test_ind]
            else:
                X_cnn_test = np.array([])
            if len(X_lstm.shape) > 1:
                X_lstm_test = X_lstm[test_ind]
            else:
                X_lstm_test = np.array([])

            pred_cluster = dict()
            act_test = self.clusterer.compute_activations(X_test)
            for clust in self.regressors.keys():
                if clust == 'Global':
                    # The global regressor predicts on the whole test set.
                    if len(self.regressors['Global']['models']) > 0:
                        predict_module = global_predict(self.static_data)
                        pred_cluster['Global'] = predict_module.predict(
                            X_test.values,
                            X_cnn=X_cnn_test,
                            X_lstm=X_lstm_test)
                        pred_cluster['Global'][
                            'metrics'] = predict_module.evaluate(
                                pred_cluster['Global'], y_test.values)
                        pred_cluster['Global']['dates'] = X_test.index
                        pred_cluster['Global']['index'] = np.arange(
                            0, X_test.shape[0])
                else:
                    # Per-cluster: only rows whose activation passes the
                    # threshold belong to this cluster.
                    dates = X_test.index[act_test[clust] >= self.thres_act]
                    nind = np.where(act_test[clust] >= self.thres_act)[0]
                    nind.sort()

                    x = X_test.loc[dates]
                    targ = y_test.loc[dates].values
                    if len(X_cnn_test.shape) > 1:
                        x_cnn = X_cnn_test[nind]
                    else:
                        x_cnn = np.array([])
                    if len(X_lstm_test.shape) > 1:
                        x_lstm = X_lstm_test[nind]
                    else:
                        x_lstm = np.array([])
                    predict_module = cluster_predict(self.static_data, clust)
                    pred_cluster[clust] = predict_module.predict(x.values,
                                                                 X_cnn=x_cnn,
                                                                 X_lstm=x_lstm)
                    pred_cluster[clust]['metrics'] = predict_module.evaluate(
                        pred_cluster[clust], targ)
                    pred_cluster[clust]['dates'] = dates
                    pred_cluster[clust]['index'] = nind
            # Second pass: pivot pred_cluster into one DataFrame per method
            # (rows = test dates, columns = clusters) and collect MAE metrics.
            predictions = dict()
            result_clust = pd.DataFrame()
            for clust in pred_cluster.keys():
                for method in pred_cluster[clust].keys():
                    if not method in {'dates', 'index', 'metrics'}:
                        if not method in predictions.keys():
                            predictions[method] = pd.DataFrame(
                                index=X_test.index,
                                columns=[cl for cl in pred_cluster.keys()])
                        # Align each cluster's predictions on its own dates.
                        predictions[method].loc[
                            pred_cluster[clust]['dates'],
                            clust] = pred_cluster[clust][method].ravel()
                    elif method in {'metrics'}:
                        result_clust = pd.concat([
                            result_clust,
                            pred_cluster[clust][method]['mae'].rename(clust)
                        ],
                                                 axis=1)
            result_clust.to_csv(
                os.path.join(data_path, 'result_of_clusters.csv'))
            joblib.dump(
                pred_cluster,
                os.path.join(data_path, 'predictions_by_cluster.pickle'))
            joblib.dump(
                predictions,
                os.path.join(data_path, 'predictions_by_method.pickle'))
            y_test.to_csv(os.path.join(data_path, 'target_test.csv'))
        else:
            # No test split configured: fall back to simple averaging.
            self.static_data['combine_methods'] = ['average']
Esempio n. 3
0
    def resampling_for_combine_obsolete(self, X_test, y_test, act_test,
                                        X_cnn_test, X_lstm_test):
        """Evaluate on the test set, then on an ADASYN-resampled set, and
        write the comparison to ``result_sampling.csv``.

        Resampled predictions are cached in ``self.data_dir`` as pickles and
        reused on subsequent calls.

        Returns:
            Tuple ``(pred_nwp_dl_resample, y_sampl, pred_test_comb)``.

        Raises:
            ValueError: if the scaler has not been set.
            NotImplementedError: when LSTM inputs are present (LSTM
                resampling is not supported).
        """
        if self.x_scaler is not None:
            if self.is_global:
                predict_module = global_predict(self.static_data)
            else:
                predict_module = cluster_predict(self.static_data,
                                                 self.cluster_name)

            self.logger.info(
                'Make predictions of testing set not used in training')
            self.logger.info('\n')  # was '/n': typo for a newline

            pred_test_comb = predict_module.predict(X_test.values,
                                                    X_cnn=X_cnn_test,
                                                    X_lstm=X_lstm_test,
                                                    fs_reduced=False)

            result_test_comb = predict_module.evaluate(pred_test_comb,
                                                       y_test.values)

            result_test_comb = result_test_comb.sort_values(by=['mae'])

            self.logger.info(
                'Make predictions of sampling set with nwp_sampler')
            self.logger.info('\n')  # was '/n'

            sampler_dl = DataSampler(self.static_data,
                                     self.cluster_name,
                                     self.x_scaler,
                                     method='ADASYN')
            # NOTE(review): forcing istrained to False makes the inner
            # `if not sampler_dl.istrained` always true — confirm intended.
            sampler_dl.istrained = False
            if not os.path.exists(
                    os.path.join(self.data_dir,
                                 'prediction_nwp_dl_resample.pickle')):
                if not sampler_dl.istrained:
                    # Only CNN (or plain tabular) resampling is implemented.
                    if len(X_cnn_test.shape) > 1 and len(
                            X_lstm_test.shape) > 1:
                        raise NotImplementedError(
                            'X_lstm sampling not implemented yet')
                    elif len(X_cnn_test.shape) > 1:
                        X_sampl, y_sampl, X_cnn_sampl = sampler_dl.nwp_dl_sampling(
                            X=X_test, y=y_test, act=act_test, X_cnn=X_cnn_test)
                    elif len(X_lstm_test.shape) > 1:
                        raise NotImplementedError(
                            'X_lstm sampling not implemented yet')
                    else:
                        X_sampl, y_sampl, X_cnn_sampl = sampler_dl.nwp_dl_sampling(
                            X=X_test, y=y_test, act=act_test)

                if len(X_cnn_test.shape) > 1 and len(X_lstm_test.shape) > 1:
                    raise NotImplementedError(
                        'X_lstm sampling not implemented yet')
                elif len(X_cnn_test.shape) > 1:
                    pred_nwp_dl_resample = predict_module.spark_predict(
                        X_sampl.values, X_cnn=X_cnn_sampl, fs_reduced=False)
                elif len(X_lstm_test.shape) > 1:
                    raise NotImplementedError(
                        'X_lstm sampling not implemented yet')
                else:
                    pred_nwp_dl_resample = predict_module.spark_predict(
                        X_sampl.values, fs_reduced=False)

                # Cache resampled predictions/targets for future calls.
                joblib.dump(
                    pred_nwp_dl_resample,
                    os.path.join(self.data_dir,
                                 'prediction_nwp_dl_resample.pickle'))
                joblib.dump(
                    y_sampl, os.path.join(self.data_dir,
                                          'y_resample_dl.pickle'))
            else:
                pred_nwp_dl_resample = joblib.load(
                    os.path.join(self.data_dir,
                                 'prediction_nwp_dl_resample.pickle'))
                y_sampl = joblib.load(
                    os.path.join(self.data_dir, 'y_resample_dl.pickle'))

            result_nwp_dl_resample = predict_module.evaluate(
                pred_nwp_dl_resample, y_sampl)

            # Side-by-side MAE comparison: plain test vs resampled.
            result = pd.concat(
                {
                    'on_test': result_test_comb['mae'],
                    'with_nwp_dl_resample': result_nwp_dl_resample['mae']
                },
                axis=1)
            result.to_csv(os.path.join(self.data_dir, 'result_sampling.csv'))

            return pred_nwp_dl_resample, y_sampl, pred_test_comb
        else:
            raise ValueError('Scaler or data indices are not set')
    def fit(self):
        """Train every configured method for this cluster, then the
        combination model, and persist the trained cluster.

        Workflow: load data, split off a test set, log a linear-regression
        baseline, run feature selection when needed, build CV folds, train
        each project method via ``fit_model``, train/evaluate the combiner,
        and save. If already trained, just logs and returns.

        Returns:
            ``self.to_dict()``.
        """
        if not self.istrained:
            X, y, act, X_cnn, X_lstm, test_indices = self.load_data()
            self.logger.info('Start training cluster %s', self.cluster_name)
            self.logger.info('\n')  # was '/n': typo for a newline
            self.variables = X.columns
            indices = X.index
            X, y, act, X_cnn, X_lstm, X_test, y_test, act_test, X_cnn_test, X_lstm_test = self.split_test_data(
                X,
                y,
                act,
                X_cnn=X_cnn,
                X_lstm=X_lstm,
                test_indices=test_indices)
            if X_test.shape[0] > 0:
                # Linear baseline on the linear-feature subset; its MAE is
                # logged as a reference objective for the trained models.
                lin_models = LinearRegression().fit(X[self.var_lin].values,
                                                    y.values.ravel())
                preds = lin_models.predict(X_test[self.var_lin].values).ravel()

                # 20 looks like the rated/normalisation constant — TODO
                # confirm against the project configuration.
                err = (preds - y_test.values.ravel()) / 20

                rms = np.sum(np.square(err))
                mae = np.mean(np.abs(err))
                # was print('rms = %s', rms): printed the literal '%s'
                # followed by the value; format explicitly instead.
                print('rms = %s' % rms)
                print('mae = %s' % mae)
                self.logger.info("Objective from linear models: %s", mae)
            X = X.values
            y = y.values / 20
            act = act.values

            # Downstream code expects 2D targets and activations.
            if len(y.shape) == 1:
                y = y[:, np.newaxis]
            if len(act.shape) == 1:
                act = act[:, np.newaxis]

            try:
                self.load(self.cluster_dir)
            except Exception:
                # Best-effort restore of a previous state; absence is fine.
                # (Was a bare except, which also swallowed SystemExit and
                # KeyboardInterrupt.)
                pass

            if hasattr(
                    self,
                    'features') and self.static_data['train_online'] == False:
                # Features already selected and online retraining disabled.
                pass
            else:
                if self.static_data['sklearn']['fs_status'] != 'ok':
                    X_train, X_test1, y_train, y_test1 = split_continuous(
                        X, y, test_size=0.15, random_state=42)

                    # Three random train/val splits drive feature selection.
                    cvs = []
                    for _ in range(3):
                        X_train1 = np.copy(X_train)
                        y_train1 = np.copy(y_train)
                        X_train1, X_val, y_train1, y_val = train_test_split(
                            X_train1, y_train1, test_size=0.15)
                        cvs.append([
                            X_train1, y_train1, X_val, y_val, X_test1, y_test1
                        ])
                    self.find_features(
                        cvs, self.static_data['sklearn']['fs_method'],
                        self.static_data['sklearn']['njobs'])

            cvs, mask_test1, X, y, act, X_cnn, X_lstm = self.split_dataset(
                X, y, act, X_cnn, X_lstm)
            self.indices = indices[:X.shape[0]]
            # Keep only the selected feature columns in every fold
            # (slots 0/2/4 are the train/val/test inputs).
            for i in range(3):
                cvs[i][0] = cvs[i][0][:, self.features]
                cvs[i][2] = cvs[i][2][:, self.features]
                cvs[i][4] = cvs[i][4][:, self.features]

            self.logger.info('Data info for cluster %s', self.cluster_name)
            self.logger.info('Number of variables %s', str(self.D))
            self.logger.info('Number of total samples %s', str(self.N_tot))
            self.logger.info('Number of training samples %s',
                             str(self.N_train))
            self.logger.info('Number of validation samples %s',
                             str(self.N_val))
            self.logger.info('Number of testing samples %s', str(self.N_test))
            self.logger.info('\n')  # was '/n'

            self.models = dict()
            for method in self.static_data['project_methods'].keys():
                if self.static_data['project_methods'][method][
                        'status'] == 'train':
                    self.logger.info('Training start of method %s', method)
                    self.logger.info('\n')  # was '/n'
                    if 'sklearn_method' in self.static_data['project_methods'][
                            method].keys():
                        optimize_method = self.static_data['project_methods'][
                            method]['sklearn_method']
                    else:
                        optimize_method = []
                    self.fit_model(cvs,
                                   method,
                                   self.static_data,
                                   self.cluster_dir,
                                   optimize_method,
                                   X_cnn=X_cnn,
                                   X_lstm=X_lstm,
                                   y=y,
                                   rated=1)
                    self.logger.info('Training end of method %s', method)

            self.logger.info('Training end for cluster %s', self.cluster_name)
            self.logger.info('\n')  # was '/n'
            self.logger.info('Start of training of Combination models')
            comb_model = combine_model(self.static_data,
                                       self.cluster_dir,
                                       x_scaler=self.x_scaler)
            if not comb_model.istrained and X_test.shape[0] > 0:
                comb_model.train(X_test, y_test, act_test, X_cnn_test,
                                 X_lstm_test)

                # Evaluate the freshly combined model on the held-out test set.
                predict_module = cluster_predict(self.static_data,
                                                 self.cluster_name)
                predictions = predict_module.predict(X_test.values,
                                                     X_cnn=X_cnn_test,
                                                     X_lstm=X_lstm_test)
                result = predict_module.evaluate(predictions, y_test.values)
                result.to_csv(os.path.join(self.data_dir, 'result_test.csv'))

            self.istrained = True
            self.save(self.cluster_dir)
        else:
            self.logger.info('Cluster of %s loaded successfully',
                             self.cluster_name)

        return self.to_dict()
    def fit(self, rule_model=None):
        """Train this cluster reusing the features and model settings of an
        already-trained main rule.

        Args:
            rule_model: dict produced by the main rule's training; must
                contain the keys ``'features'`` and ``'models'``.

        Returns:
            ``self.to_dict()``.

        Raises:
            ValueError: when training runs and ``rule_model`` is missing or
                lacks ``'features'``.
        """
        if not self.istrained:
            if rule_model is None:
                # Fail fast with a clear message; previously this crashed
                # later with AttributeError on rule_model.keys().
                raise ValueError(
                    'rule_model is required to train cluster %s' %
                    self.cluster_name)
            X, y, act, X_cnn, X_lstm, test_indices = self.load_data()
            self.variables = X.columns
            indices = X.index
            X, y, act, X_cnn, X_lstm, X_test, y_test, act_test, X_cnn_test, X_lstm_test = self.split_test_data(
                X,
                y,
                act,
                X_cnn=X_cnn,
                X_lstm=X_lstm,
                test_indices=test_indices)
            X = X.values
            # 20 looks like the rated/normalisation constant — TODO confirm.
            y = y.values / 20
            act = act.values

            # Downstream code expects 2D targets and activations.
            if len(y.shape) == 1:
                y = y[:, np.newaxis]
            if len(act.shape) == 1:
                act = act[:, np.newaxis]

            if 'features' not in rule_model:
                # was ValueError('... %s', self.cluster_name): the %s was
                # never interpolated — format explicitly.
                raise ValueError(
                    'the Main rule has not attribute features %s' %
                    self.cluster_name)
            self.features = rule_model['features']
            cvs, mask_test1, X, y, act, X_cnn, X_lstm = self.split_dataset(
                X, y, act, X_cnn, X_lstm)
            self.indices = indices[:X.shape[0]]
            # Keep only the selected feature columns in every fold
            # (slots 0/2/4 are the train/val/test inputs).
            for i in range(3):
                cvs[i][0] = cvs[i][0][:, self.features]
                cvs[i][2] = cvs[i][2][:, self.features]
                cvs[i][4] = cvs[i][4][:, self.features]

            self.models = dict()
            for method in self.static_data['project_methods'].keys():
                if self.static_data['project_methods'][method][
                        'status'] == 'train':

                    self.fit_model(cvs,
                                   method,
                                   self.static_data,
                                   self.cluster_dir,
                                   rule_model['models'],
                                   self.gpu,
                                   X_cnn=X_cnn,
                                   X_lstm=X_lstm,
                                   y=y,
                                   rated=1)

            comb_model = combine_model(self.static_data,
                                       self.cluster_dir,
                                       x_scaler=self.x_scaler)
            if not comb_model.istrained and X_test.shape[0] > 0:
                comb_model.train(X_test, y_test, act_test, X_cnn_test,
                                 X_lstm_test)

                # Evaluate the combined model on the held-out test set.
                predict_module = cluster_predict(self.static_data,
                                                 self.cluster_name)
                predictions = predict_module.predict(X_test.values,
                                                     X_cnn=X_cnn_test,
                                                     X_lstm=X_lstm_test)
                result = predict_module.evaluate(predictions, y_test.values)
                result.to_csv(os.path.join(self.data_dir, 'result_test.csv'))

            self.istrained = True
            self.save(self.cluster_dir)

        return self.to_dict()
Esempio n. 6
0
    def predict_regressors(self, X_test, X_cnn_test, X_lstm_test, y_test=None):
        """Produce final predictions for new data by running every cluster
        regressor and combining the results.

        Inputs are scaled with ``self.sc``; cluster membership comes from
        fuzzy activations thresholded at ``self.thres_act``. Metrics are
        computed only when ``y_test`` is supplied. Returned predictions are
        inverse-transformed to the original target scale and clipped at 0.

        Returns:
            dict mapping combine-method name to an ndarray of predictions.
        """
        data_path = self.static_data['path_data']
        pred_cluster = dict()
        # Scale the raw inputs with the fitted scaler, keeping the frame shape.
        X_test = pd.DataFrame(self.sc.transform(X_test.values),
                              columns=X_test.columns,
                              index=X_test.index)
        # Lazily build the clusterer if this instance was restored without one.
        if not hasattr(self, 'clusterer'):
            self.clusterer = clusterer(
                self.static_data['path_fuzzy_models'],
                self.static_data['clustering']['cluster_file'],
                self.static_data['type'])
        act_test = self.clusterer.compute_activations(X_test)
        act_test = self.check_if_all_nans(act_test)
        for clust in self.regressors.keys():
            if clust == 'Global':
                # The global regressor predicts on the whole test set.
                if len(self.regressors['Global']['models']) > 0:
                    predict_module = global_predict(self.static_data)
                    pred_cluster['Global'] = predict_module.predict(
                        X_test.values, X_cnn=X_cnn_test, X_lstm=X_lstm_test)
                    if y_test is not None:
                        pred_cluster['Global'][
                            'metrics'] = predict_module.evaluate(
                                pred_cluster['Global'],
                                self.scale_y.transform(y_test.values))
                    pred_cluster['Global']['dates'] = X_test.index
                    pred_cluster['Global']['index'] = np.arange(
                        0, X_test.shape[0])
            else:
                # Per-cluster: only rows whose activation passes the threshold.
                dates = X_test.index[act_test[clust] >= self.thres_act]
                nind = np.where(act_test[clust] >= self.thres_act)[0]
                nind.sort()

                x = X_test.loc[dates]
                if y_test is not None:
                    targ = y_test.loc[dates].values
                else:
                    targ = None
                # CNN/LSTM tensors are optional; shape of length 1 = "absent".
                if len(X_cnn_test.shape) > 1:
                    x_cnn = X_cnn_test[nind]
                else:
                    x_cnn = np.array([])
                if len(X_lstm_test.shape) > 1:
                    x_lstm = X_lstm_test[nind]
                else:
                    x_lstm = np.array([])
                predict_module = cluster_predict(self.static_data, clust)
                pred_cluster[clust] = predict_module.predict(x.values,
                                                             X_cnn=x_cnn,
                                                             X_lstm=x_lstm)
                if targ is not None and targ.shape[0] > 0:
                    pred_cluster[clust]['metrics'] = predict_module.evaluate(
                        pred_cluster[clust], self.scale_y.transform(targ))
                pred_cluster[clust]['dates'] = dates
                pred_cluster[clust]['index'] = nind
        # Second pass: pivot pred_cluster into one DataFrame per method
        # (rows = test dates, columns = clusters) and collect MAE metrics.
        predictions = dict()
        result_clust = pd.DataFrame()
        for clust in pred_cluster.keys():
            for method in pred_cluster[clust].keys():
                if not method in {'dates', 'index', 'metrics'}:
                    if not method in predictions.keys():
                        predictions[method] = pd.DataFrame(
                            index=X_test.index,
                            columns=[cl for cl in pred_cluster.keys()])
                    # Align each cluster's predictions on its own dates.
                    predictions[method].loc[
                        pred_cluster[clust]['dates'],
                        clust] = pred_cluster[clust][method].ravel()
                elif method in {'metrics'}:
                    result_clust = pd.concat([
                        result_clust,
                        pred_cluster[clust][method]['mae'].rename(clust)
                    ],
                                             axis=1)

        # Combine per-cluster predictions into one final series per method.
        combine_overall = Combine_overall_predict(self.static_data)
        predictions_final = combine_overall.predict(pred_cluster, predictions)

        for method, pred in predictions_final.items():
            # Back to the original target scale; negative outputs are
            # physically meaningless here, so clip at zero.
            pred = self.scale_y.inverse_transform(pred.reshape(-1, 1))
            pred[np.where(pred < 0)] = 0
            predictions_final[method] = pred

        if y_test is not None:
            result_clust.to_csv(
                os.path.join(data_path, 'result_of_clusters.csv'))

        return predictions_final