Beispiel #1
0
    def predict_lstm(self,
                     script_config,
                     model_package_name,
                     future_steps,
                     step_mins=15):
        """
        Use a trained, packaged LSTM neural network to predict measurements.

        The method reads a slice of the target sensor's time series through
        ``self.read_time_series_from_db``, scales the values with the scaler
        stored in the model package, asks ``predict_with_model`` for
        ``future_steps`` predictions and hands them to
        ``self.save_predictions``. Timing information is reported through
        ``self.log_success``.

        Args:
            script_config: configuration object. Attributes read here:
                ``n_past_steps``, ``date_col``, ``hr_col``, ``numeric_var``,
                ``sensor_var``, ``target_sensor`` and ``output_models_path``.
            model_package_name: path of the pickled model package. When falsy,
                the last ``*_model_hyperopt_package_*.model`` file (in sorted
                name order) found under ``output_models_path`` is used.
            future_steps: number of future steps, forwarded to
                ``predict_with_model``.
            step_mins: minutes between two consecutive predictions; turned
                into a ``numpy.timedelta64`` for ``self.save_predictions``.

        Raises:
            FileNotFoundError: ``model_package_name`` is falsy and no package
                file matches the search pattern.
        """

        # Read configuration data
        n_past_steps = script_config.n_past_steps

        self.log_success("using {} steps in the past".format(n_past_steps))

        date_col = script_config.date_col
        hr_col = script_config.hr_col
        numeric_var = script_config.numeric_var
        sensor_var = script_config.sensor_var
        target_sensor = script_config.target_sensor
        output_models_path = script_config.output_models_path

        # Read dataset slice from n past steps to build the datapoint
        raw_dataset = self.read_time_series_from_db(target_sensor, date_col,
                                                    hr_col, 'minute',
                                                    numeric_var, sensor_var,
                                                    n_past_steps)

        self.log_success("Dataset of shape {} read".format(raw_dataset.shape))

        # TODO: also fetch the minute column
        # Get the variable of interest from the dataset
        time_series_dset = get_interest_variable(raw_dataset, sensor_var,
                                                 date_col, hr_col, numeric_var,
                                                 target_sensor)
        self.log_success(
            "Got time series dataset of shape {} with columns {}".format(
                time_series_dset.shape, time_series_dset.columns))

        datapoint = time_series_dset[numeric_var].values

        self.log_success("Got datapoint {}".format(datapoint))

        if not model_package_name:
            # glob returns matches in arbitrary order, so sort before taking
            # the last one. NOTE(review): this picks the newest package only
            # if file names end in a sortable timestamp - confirm against the
            # code that writes the packages.
            candidates = sorted(
                glob.glob('{}/*_model_hyperopt_package_*.model'.format(
                    output_models_path)))
            if not candidates:
                raise FileNotFoundError(
                    'No *_model_hyperopt_package_*.model file found in {}'.
                    format(output_models_path))
            model_package_name = candidates[-1]

        self.log_success(
            'Using {} packaged model to test'.format(model_package_name))

        # Open question from the original author: is it OK to scale here?
        # The datapoint is scaled in this method before being handed to
        # predict_with_model.
        # NOTE: pickle.load can execute arbitrary code; only load packages
        # that come from a trusted location.
        with open(model_package_name, 'rb') as file_pi:
            model_package = pickle.load(file_pi)

        scaler = model_package['scaler']

        # ensure all data is float
        datapoint = datapoint.astype('float32')
        self.log_success("Got datapoint as float32 {}".format(datapoint))

        # the scaler is fed a single column, hence the reshape to (n, 1)
        datapoint_scaled = scaler.transform(datapoint.reshape(-1, 1))
        self.log_success("Got datapoint_scaled {}".format(datapoint_scaled))

        tic = time.time()
        # the second returned value (named mae by the original code) is not
        # used in this method
        pred, _ = predict_with_model(datapoint_scaled,
                                     model_package_name,
                                     future_steps=future_steps)
        prediction_time = time.time() - tic
        self.log_success('#{},{},prediction_time,{}'.format(
            model_package_name, future_steps, prediction_time))

        # writing predictions, starting from the last datetime that was read
        time_delta = np.timedelta64(step_mins, 'm')
        last_datetime = raw_dataset.tail(1).datetime.values[0]

        tic = time.time()
        self.save_predictions(target_sensor, pred, last_datetime, numeric_var,
                              time_delta)
        save_time = time.time() - tic
        self.log_success('#{},{},save_time,{}'.format(model_package_name,
                                                      future_steps, save_time))
Beispiel #2
0
def train_lstm(script_config, since_date, which_minutes, db=True):
    """Search an LSTM architecture with hyperopt, train it and save the results.

    Steps performed:

    1. Read the raw time series, from the database (``db`` true) or from
       ``script_config.input_csv``.
    2. Extract the variable of interest and build the supervised dataset and
       its scaler with ``get_dataset_from_series``.
    3. Run hyperopt ``fmin`` (TPE) over layer multipliers, dropout rates and
       the number of mid layers.
    4. Build a network from the selected values, fit it with ``fit_model`` and
       evaluate MAE and R2 on the train and test splits.

    Files written by this function, all named
    ``<output path><str(script_config)>_<label>_<timestamp>``: the pickled
    scaler, the pickled ``fmin`` result, a one-row results CSV and a pickled
    "model package" dict with keys ``model``, ``scaler``, ``test_mae`` and
    ``optimal_pars``. The model and history file names are only passed on to
    ``fit_model`` and to the search space.

    Args:
        script_config: configuration object. Attributes read here:
            ``n_past_steps``, ``input_csv``, ``date_col``, ``hr_col``,
            ``numeric_var``, ``sensor_var``, ``target_sensor``,
            ``output_models_path``, ``output_results_path``,
            ``hyperopt_pars`` (with keys ``mults``, ``dropout_rate_range``,
            ``mid_layers`` and ``max_evals``), ``model_loss``, ``optimizer``,
            ``early_stop_patience`` and ``epochs``.
        since_date: indexable whose first three items are year, month and day
            (each converted with ``int``). Only used when ``db`` is true.
        which_minutes: forwarded unchanged to ``read_time_series_from_db``.
            Only used when ``db`` is true.
        db: read from the database when true, from the CSV file otherwise.

    Returns:
        None.
    """

    time_stmp = datetime.now()
    # one timestamp string tags every artifact written by this run
    time_stmp_str = time_stmp.strftime("%Y-%m-%d_%H:%M:%S")

    n_past_steps = script_config.n_past_steps
    input_csv = script_config.input_csv
    _logger.debug("using {} steps in the past".format(n_past_steps))
    _logger.debug("using {} input dataset".format(input_csv))

    date_col = script_config.date_col
    hr_col = script_config.hr_col
    numeric_var = script_config.numeric_var
    sensor_var = script_config.sensor_var
    target_sensor = script_config.target_sensor
    output_models_path = script_config.output_models_path
    output_results_path = script_config.output_results_path

    hyperopt_pars = script_config.hyperopt_pars

    model_loss = script_config.model_loss
    optimizer = script_config.optimizer

    early_stop_patience = script_config.early_stop_patience
    epochs = script_config.epochs

    # Read the dataset
    if db:
        # read from the database
        date_since = datetime(int(since_date[0]), int(since_date[1]),
                              int(since_date[2]))
        raw_dataset = read_time_series_from_db(target_sensor, date_col,
                                               hr_col, 'minute',
                                               numeric_var, sensor_var,
                                               date_since,
                                               which_minutes)
    else:
        raw_dataset = read_time_series_from_csv(input_csv, date_col, hr_col,
                                                numeric_var, sensor_var)

    _logger.debug("Dataset of shape {} read".format(raw_dataset.shape))

    # Get the variable of interest from the dataset
    time_series_dset = get_interest_variable(raw_dataset, sensor_var, date_col,
                                             hr_col, numeric_var, target_sensor)
    _logger.debug("Got time series dataset of shape {} with columns {}".format(
        time_series_dset.shape, time_series_dset.columns))

    sup_dataset, scaler = get_dataset_from_series(time_series_dset, n_past_steps)
    _logger.debug("Got supervised dataset of shape {} with columns {}".format(
        sup_dataset.shape, sup_dataset.columns))

    # save the scaler object
    with open('{}{}_hyperopt_scaler_{}.pickle'.format(
            output_models_path,
            str(script_config),
            time_stmp_str), 'wb') as file_pi:
        pickle.dump(scaler, file_pi)

    # number of columns of the time-series frame
    n_features = time_series_dset.shape[1]
    dataset_splits = train_val_test_split(sup_dataset, n_past_steps, n_features,
                                          numeric_var)
    _logger.debug("Got split:")
    for key, split in dataset_splits.items():
        _logger.debug("{} shapes: {},{}".format(key, split['X'].shape,
                                                split['y'].shape))

    trainset = dataset_splits['trainset']

    out_model_name = '{}{}_hyperopt_model_{}.hdf5'.format(
        output_models_path,
        str(script_config),
        time_stmp_str)

    history_out_name = '{}{}_hyperopt_history_{}.pickle'.format(
        output_models_path,
        str(script_config),
        time_stmp_str)

    mults = hyperopt_pars['mults']
    dropout_rate_range = hyperopt_pars['dropout_rate_range']
    n_mid_layers = hyperopt_pars['mid_layers']

    _logger.debug("LSTM NNNet hyperpars to optimize on: mults:{}, dropout:{}, n mid layers:{}".format(
        mults, dropout_rate_range, n_mid_layers))

    tic = time.time()
    # Search space: the hp.* entries are sampled by hyperopt, every other
    # entry is a constant carried along for the objective function.
    space = hp.choice('nnet_config', [
        {'dataset_splits': dataset_splits,
         'mult_1': hp.choice('mult_1', mults),
         'dropout_rate_1': hp.uniform('dropout_rate_1',
                                      dropout_rate_range[0],
                                      dropout_rate_range[1]),
         'mult_mid': hp.choice('mult_mid', mults),
         'dropout_rate_mid': hp.uniform('dropout_rate_mid',
                                        dropout_rate_range[0],
                                        dropout_rate_range[1]),
         'mult_n': hp.choice('mult_n', mults),
         'dropout_rate_n': hp.uniform('dropout_rate_n',
                                      dropout_rate_range[0],
                                      dropout_rate_range[1]),
         'n_mid_layers': hp.choice('n_mid_layers', n_mid_layers),
         'model_loss': model_loss,
         'optimizer': optimizer,
         'target_var': numeric_var,
         'output_models_path': output_models_path,
         'early_stop_patience': early_stop_patience,
         'epochs': epochs,
         'time_stmp_str': time_stmp_str,
         'out_model_name': out_model_name,
         'history_out_name': history_out_name
         }
    ])

    optimal_pars = fmin(get_lstm_nnet_opt, space, algo=tpe.suggest,
                        max_evals=hyperopt_pars['max_evals'])

    opt_time = time.time() - tic
    _logger.debug("Hyper parameter optimization for optimal pars {} took {} seconds for {} datapoints".format(
        optimal_pars, opt_time, trainset['X'].shape[0]))

    # saving the hyper parameters
    with open('{}{}_hyperopt_optimal_pars_{}.pickle'.format(
            output_models_path,
            str(script_config),
            time_stmp_str), 'wb') as file_pi:
        pickle.dump(optimal_pars, file_pi)

    # Build a fresh network with the parameters selected by hyperopt. The
    # hp.choice entries of optimal_pars are used as positions in the option
    # lists (mults, n_mid_layers), hence the look-ups below.
    base_config_opt = {
        "first_layer": {
            "mult": int(mults[optimal_pars['mult_1']]),
            "dropout_rate": float(optimal_pars['dropout_rate_1'])
        },
        "last_layer": {
            "mult": int(mults[optimal_pars['mult_n']]),
            "dropout_rate": float(optimal_pars['dropout_rate_n'])
        }
    }
    mid_layers_config_opt = {
        "n_layers": int(n_mid_layers[optimal_pars['n_mid_layers']]),
        "mult": int(mults[optimal_pars['mult_mid']]),
        "dropout_rate": float(optimal_pars['dropout_rate_mid'])
    }

    lstm_nnet_arq = build_lstm_nnet(trainset['X'], base_config_opt,
                                    mid_layers_config_opt, model_loss, optimizer)
    # Assuming a Keras model: summary() prints and returns None, so formatting
    # its return value logs "None". Collect the text through print_fn instead
    # (extra keyword arguments some Keras versions pass are ignored). The
    # summary now goes to the log rather than to stdout.
    summary_lines = []
    lstm_nnet_arq.summary(
        print_fn=lambda line, **_ignored: summary_lines.append(line))
    _logger.debug("Build LSTM NNet with optimal parameters \n {}".format(
        "\n".join(summary_lines)))

    # only now train with the full trainset and the selected architecture
    tic = time.time()
    lstm_nnet = fit_model(
        lstm_nnet_arq,
        trainset,
        dataset_splits['valset'],
        target_sensor,
        numeric_var,
        output_models_path,
        early_stop_patience,
        epochs,
        time_stmp_str,
        out_model_name,
        history_out_name
    )
    train_time = time.time() - tic
    _logger.debug("Trained LSTM NNet {} took {} seconds for {} datapoints".format(
        lstm_nnet, train_time, trainset['X'].shape[0]))

    tic = time.time()
    train_mae = eval_regression_performance(trainset, lstm_nnet, scaler,
                                            measure=mean_absolute_error)
    train_eval_time = time.time() - tic
    _logger.debug("Train MAE {} and took {} seconds".format(train_mae, train_eval_time))

    tic = time.time()
    test_mae = eval_regression_performance(dataset_splits['testset'], lstm_nnet,
                                           scaler, measure=mean_absolute_error)
    test_eval_time = time.time() - tic
    _logger.debug("Test MAE {} and took {} seconds".format(test_mae, test_eval_time))

    train_r2 = eval_regression_performance(trainset, lstm_nnet, scaler,
                                           measure=r2_score)
    _logger.debug("Train R2 {}".format(train_r2))

    test_r2 = eval_regression_performance(dataset_splits['testset'], lstm_nnet,
                                          scaler, measure=r2_score)
    _logger.debug("Test R2 {}".format(test_r2))

    # Saving the training result as a one-row table
    results = pd.DataFrame({
        'sensor': [target_sensor],
        'target_variable': [numeric_var],
        'hyperopt_pars': [hyperopt_pars],
        'optimal_pars': [optimal_pars],
        'model_loss': [model_loss],
        'optimizer': [optimizer],
        'early_stop_patience': [early_stop_patience],
        'epochs': [epochs],
        'train_mae': [train_mae],
        'test_mae': [test_mae],
        'train_r2': [train_r2],
        'test_r2': [test_r2],
        'train_time': [train_time],
        'train_eval_time': [train_eval_time],
        'test_eval_time': [test_eval_time],
        'trainset_size': [trainset['X'].shape[0]]
    })

    results.to_csv(
        '{}{}_hyperopt_results_{}.csv'.format(
            output_results_path,
            str(script_config),
            time_stmp_str),
        index=False
    )

    # Pack model, scaler, test MAE and optimal parameters into one object to
    # use when predicting
    model_package = {'model': lstm_nnet, 'scaler': scaler,
                     'test_mae': test_mae, 'optimal_pars': optimal_pars}

    with open('{}{}_model_hyperopt_package_{}.model'.format(
            output_models_path,
            str(script_config),
            time_stmp_str), 'wb') as file_pi:
        pickle.dump(model_package, file_pi)
Beispiel #3
0
def train_lstm(script_config):
    """Train a fixed-architecture LSTM from a CSV time series and save its artifacts.

    The series is read from ``script_config.input_csv``, converted into a
    supervised dataset, and used to fit the network described by
    ``script_config.base_config`` and ``script_config.mid_layers_config``.
    MAE and R2 are then computed on the train and test splits.

    Files written by this function: the pickled scaler, a one-row results CSV
    and a pickled model package (dict with keys ``model``, ``scaler`` and
    ``test_mae``). Every file name contains ``str(script_config)`` and the
    timestamp of the run.

    Args:
        script_config: configuration object providing the column names, the
            target sensor, the output paths, the network configuration and
            the training settings read below.

    Returns:
        None.
    """
    # A single timestamp string tags every artifact of this run.
    started_at = datetime.now()
    run_stamp = started_at.strftime("%Y-%m-%d_%H:%M:%S")

    n_past_steps = script_config.n_past_steps
    input_csv = script_config.input_csv
    _logger.debug("using {} steps in the past".format(n_past_steps))
    _logger.debug("using {} input dataset".format(input_csv))

    # Column names and sensor selection
    date_col, hr_col = script_config.date_col, script_config.hr_col
    numeric_var, sensor_var = script_config.numeric_var, script_config.sensor_var
    target_sensor = script_config.target_sensor

    # Output locations
    output_models_path = script_config.output_models_path
    output_results_path = script_config.output_results_path

    # Network and optimizer configuration
    base_config = script_config.base_config
    mid_layers_config = script_config.mid_layers_config
    model_loss = script_config.model_loss
    optimizer = script_config.optimizer

    # Training settings
    early_stop_patience = script_config.early_stop_patience
    epochs = script_config.epochs

    # Read the dataset
    raw_frame = read_time_series_from_csv(
        input_csv, date_col, hr_col, numeric_var, sensor_var)
    _logger.debug("Dataset of shape {} read".format(raw_frame.shape))

    # Get the variable of interest from the dataset
    series_frame = get_interest_variable(
        raw_frame, sensor_var, date_col, hr_col, numeric_var, target_sensor)
    _logger.debug("Got time series dataset of shape {} with columns {}".format(
        series_frame.shape, series_frame.columns))

    supervised, scaler = get_dataset_from_series(series_frame, n_past_steps)
    _logger.debug("Got supervised dataset of shape {} with columns {}".format(
        supervised.shape, supervised.columns))

    # Save the scaler object
    scaler_path = '{}{}_scaler_{}.pickle'.format(
        output_models_path, str(script_config), run_stamp)
    with open(scaler_path, 'wb') as handle:
        pickle.dump(scaler, handle)

    n_features = series_frame.shape[1]
    splits = train_val_test_split(
        supervised, n_past_steps, n_features, numeric_var)
    _logger.debug("Got split:")
    for split_name, split in splits.items():
        _logger.debug("{} shapes: {},{}".format(
            split_name, split['X'].shape, split['y'].shape))

    trainset = splits['trainset']
    lstm_nnet = build_lstm_nnet(
        trainset['X'], base_config, mid_layers_config, model_loss, optimizer)
    _logger.debug("Got LSTM NNet {}".format(lstm_nnet))

    out_model_name = '{}{}_model_{}.hdf5'.format(
        output_models_path, str(script_config), run_stamp)
    history_out_name = '{}{}_history_{}.pickle'.format(
        output_models_path, str(script_config), run_stamp)

    fit_started = time.time()
    lstm_nnet = fit_model(
        lstm_nnet, trainset, splits['valset'], target_sensor, numeric_var,
        output_models_path, early_stop_patience, epochs, run_stamp,
        out_model_name, history_out_name)
    train_time = time.time() - fit_started
    _logger.debug("Trained LSTM NNet {} took {} seconds for {} datapoints".format(
        lstm_nnet, train_time, trainset['X'].shape[0]))

    def _score(split, measure):
        # Evaluate the fitted network on one split with the given metric.
        return eval_regression_performance(
            split, lstm_nnet, scaler, measure=measure)

    eval_started = time.time()
    train_mae = _score(trainset, mean_absolute_error)
    train_eval_time = time.time() - eval_started
    _logger.debug("Train MAE {} and took {} seconds".format(
        train_mae, train_eval_time))

    eval_started = time.time()
    test_mae = _score(splits['testset'], mean_absolute_error)
    test_eval_time = time.time() - eval_started
    _logger.debug("Test MAE {} and took {} seconds".format(
        test_mae, test_eval_time))

    train_r2 = _score(trainset, r2_score)
    _logger.debug("Train R2 {}".format(train_r2))

    test_r2 = _score(splits['testset'], r2_score)
    _logger.debug("Test R2 {}".format(test_r2))

    # Save the training result as a one-row table
    result_row = {
        'sensor': [target_sensor],
        'target_variable': [numeric_var],
        'base_nnet_config': [base_config],
        'mid_layers_config': [mid_layers_config],
        'model_loss': [model_loss],
        'optimizer': [optimizer],
        'early_stop_patience': [early_stop_patience],
        'epochs': [epochs],
        'train_mae': [train_mae],
        'test_mae': [test_mae],
        'train_r2': [train_r2],
        'test_r2': [test_r2],
        'train_time': [train_time],
        'train_eval_time': [train_eval_time],
        'test_eval_time': [test_eval_time],
        'trainset_size': [trainset['X'].shape[0]],
    }
    results_path = '{}{}_results_{}.csv'.format(
        output_results_path, str(script_config), run_stamp)
    pd.DataFrame(result_row).to_csv(results_path, index=False)

    # Pack model, scaler and test MAE into one object to use when predicting
    model_package = {
        'model': lstm_nnet,
        'scaler': scaler,
        'test_mae': test_mae,
    }
    package_path = '{}{}_model_package_{}.model'.format(
        output_models_path, str(script_config), run_stamp)
    with open(package_path, 'wb') as handle:
        pickle.dump(model_package, handle)
    def train_lstm(self, script_config, since_date, which_minutes):
        """Search an LSTM architecture with hyperopt, train it and save the results.

        The time series is read through ``self.read_time_series_from_db``,
        turned into a supervised dataset, and used for a hyperopt ``fmin``
        (TPE) search over layer multipliers, dropout rates and number of mid
        layers. A network built from the selected values is then fitted with
        ``fit_model`` and evaluated (MAE, R2 and maximum error) on the train
        and test splits. Progress is reported through ``self.log_success``.

        Files written by this method, all named
        ``<output path><str(script_config)>_<label>_<timestamp>``: the pickled
        scaler, the pickled ``fmin`` result, a one-row results CSV and a
        pickled model package (dict with keys ``model``, ``scaler``,
        ``test_mae`` and ``optimal_pars``).

        Args:
            script_config: configuration object. Attributes read here:
                ``n_past_steps``, ``date_col``, ``hr_col``, ``numeric_var``,
                ``sensor_var``, ``target_sensor``, ``output_models_path``,
                ``output_results_path``, ``hyperopt_pars`` (with keys
                ``mults``, ``dropout_rate_range``, ``mid_layers`` and
                ``max_evals``), ``model_loss``, ``optimizer``,
                ``early_stop_patience`` and ``epochs``.
            since_date: indexable whose first three items are year, month and
                day (each converted with ``int``).
            which_minutes: forwarded unchanged to
                ``self.read_time_series_from_db``.

        Returns:
            None.
        """

        time_stmp = datetime.now()
        # get a string with the timestamp to configure this training and save everything with this stamp
        time_stmp_str = time_stmp.strftime("%Y-%m-%d_%H:%M:%S")
        # we will look this number of steps in the past, reading from configuration
        n_past_steps = script_config.n_past_steps

        self.log_success("using {} steps in the past".format(n_past_steps))
        # extracting more configuration variables
        date_col = script_config.date_col
        hr_col = script_config.hr_col
        numeric_var = script_config.numeric_var
        sensor_var = script_config.sensor_var
        target_sensor = script_config.target_sensor
        output_models_path = script_config.output_models_path
        output_results_path = script_config.output_results_path
        # these are the hyper parameters related to the architecture of the Net to optimize over
        hyperopt_pars = script_config.hyperopt_pars
        # configuration values related to the optimizer
        model_loss = script_config.model_loss
        optimizer = script_config.optimizer
        # configuration values related to the training process
        early_stop_patience = script_config.early_stop_patience
        epochs = script_config.epochs

        date_since = datetime(int(since_date[0]), int(since_date[1]),
                              int(since_date[2]))
        # read the data from the database as time series
        raw_dataset = self.read_time_series_from_db(target_sensor, date_col,
                                                    hr_col, 'minute',
                                                    numeric_var, sensor_var,
                                                    date_since, which_minutes)
        self.log_success("Dataset of shape {} read".format(raw_dataset.shape))

        # we get the variable we want to train model on from the dataset
        time_series_dset = get_interest_variable(raw_dataset, sensor_var,
                                                 date_col, hr_col, numeric_var,
                                                 target_sensor)
        self.log_success(
            "Got time series dataset of shape {} with columns {}".format(
                time_series_dset.shape, time_series_dset.columns))
        # with this we turn the tabular data in time series examples
        sup_dataset, scaler = get_dataset_from_series(time_series_dset,
                                                      n_past_steps)
        self.log_success(
            "Got supervised dataset of shape {} with columns {}".format(
                sup_dataset.shape, sup_dataset.columns))

        # saving the scaler for future predictions if needed
        with open(
                '{}{}_hyperopt_scaler_{}.pickle'.format(
                    output_models_path, str(script_config), time_stmp_str),
                'wb') as file_pi:
            pickle.dump(scaler, file_pi)

        # number of columns of the time-series frame
        n_features = time_series_dset.shape[1]
        # splitting into train, validation and test sets
        dataset_splits = train_val_test_split(sup_dataset, n_past_steps,
                                              n_features, numeric_var)
        self.log_success("Got split:")
        for key, split in dataset_splits.items():
            self.log_success("{} shapes: {},{}".format(
                key, split['X'].shape, split['y'].shape))
        # get the trainset from the split
        trainset = dataset_splits['trainset']
        # generating the file names for the model, history and so on
        out_model_name = '{}{}_hyperopt_model_{}.hdf5'.format(
            output_models_path, str(script_config), time_stmp_str)

        history_out_name = '{}{}_hyperopt_history_{}.pickle'.format(
            output_models_path, str(script_config), time_stmp_str)
        # getting the array of multipliers to optimize on from the configuration
        mults = hyperopt_pars['mults']
        # also the dropout range
        dropout_rate_range = hyperopt_pars['dropout_rate_range']
        # and the number of mid layers
        n_mid_layers = hyperopt_pars['mid_layers']

        self.log_success(
            "LSTM NNNet hyperpars to optimize on: mults:{}, dropout:{}, n mid layers:{}"
            .format(mults, dropout_rate_range, n_mid_layers))

        tic = time.time()
        # Search space: the hp.* entries are sampled by hyperopt, every other
        # entry is a constant carried along for the objective function.
        space = hp.choice('nnet_config', [{
            'dataset_splits':
            dataset_splits,
            'mult_1':
            hp.choice('mult_1', mults),
            'dropout_rate_1':
            hp.uniform('dropout_rate_1', dropout_rate_range[0],
                       dropout_rate_range[1]),
            'mult_mid':
            hp.choice('mult_mid', mults),
            'dropout_rate_mid':
            hp.uniform('dropout_rate_mid', dropout_rate_range[0],
                       dropout_rate_range[1]),
            'mult_n':
            hp.choice('mult_n', mults),
            'dropout_rate_n':
            hp.uniform('dropout_rate_n', dropout_rate_range[0],
                       dropout_rate_range[1]),
            'n_mid_layers':
            hp.choice('n_mid_layers', n_mid_layers),
            'model_loss':
            model_loss,
            'optimizer':
            optimizer,
            'target_var':
            numeric_var,
            'output_models_path':
            output_models_path,
            'early_stop_patience':
            early_stop_patience,
            'epochs':
            epochs,
            'time_stmp_str':
            time_stmp_str,
            'out_model_name':
            out_model_name,
            'history_out_name':
            history_out_name
        }])
        # this is the actual call to the search process for minimization of loss
        optimal_pars = fmin(get_lstm_nnet_opt,
                            space,
                            algo=tpe.suggest,
                            max_evals=hyperopt_pars['max_evals'])

        opt_time = time.time() - tic
        self.log_success(
            "Hyper parameter optimization for optimal pars {} took {} seconds for {} datapoints"
            .format(optimal_pars, opt_time, trainset['X'].shape[0]))

        # saving the optimal architecture
        with open(
                '{}{}_hyperopt_optimal_pars_{}.pickle'.format(
                    output_models_path, str(script_config), time_stmp_str),
                'wb') as file_pi:
            pickle.dump(optimal_pars, file_pi)

        # Building the final Net with the best choice made by the optimization
        # process. The hp.choice entries of optimal_pars are used as positions
        # in the option lists (mults, n_mid_layers), hence the look-ups below.
        # first and last layer configuration
        base_config_opt = {
            "first_layer": {
                "mult": int(mults[optimal_pars['mult_1']]),
                "dropout_rate": float(optimal_pars['dropout_rate_1'])
            },
            "last_layer": {
                "mult": int(mults[optimal_pars['mult_n']]),
                "dropout_rate": float(optimal_pars['dropout_rate_n'])
            }
        }
        # mid layers configuration
        mid_layers_config_opt = {
            "n_layers": int(n_mid_layers[optimal_pars['n_mid_layers']]),
            "mult": int(mults[optimal_pars['mult_mid']]),
            "dropout_rate": float(optimal_pars['dropout_rate_mid'])
        }
        # actually building the Net with the best architecture
        lstm_nnet_arq = build_lstm_nnet(trainset['X'], base_config_opt,
                                        mid_layers_config_opt, model_loss,
                                        optimizer)
        # Assuming a Keras model: summary() prints and returns None, so
        # formatting its return value logs "None". Collect the text through
        # print_fn instead (extra keyword arguments some Keras versions pass
        # are ignored). The summary now goes to the log rather than to stdout.
        summary_lines = []
        lstm_nnet_arq.summary(
            print_fn=lambda line, **_ignored: summary_lines.append(line))
        self.log_success(
            "Build LSTM NNet with optimal parameters \n {}".format(
                "\n".join(summary_lines)))

        # now we train again with the best Net over the full trainset
        tic = time.time()
        lstm_nnet = fit_model(lstm_nnet_arq, trainset,
                              dataset_splits['valset'], target_sensor,
                              numeric_var, output_models_path,
                              early_stop_patience, epochs, time_stmp_str,
                              out_model_name, history_out_name)
        train_time = time.time() - tic
        self.log_success(
            "Trained LSTM NNet {} took {} seconds for {} datapoints".format(
                lstm_nnet, train_time, trainset['X'].shape[0]))

        tic = time.time()
        # we evaluate MAE of the trained net over trainset
        train_mae = eval_regression_performance(trainset,
                                                lstm_nnet,
                                                scaler,
                                                measure=mean_absolute_error)
        train_eval_time = time.time() - tic
        self.log_success("Train MAE {} and took {} seconds".format(
            train_mae, train_eval_time))

        tic = time.time()
        # we evaluate MAE of the trained net over testset
        test_mae = eval_regression_performance(dataset_splits['testset'],
                                               lstm_nnet,
                                               scaler,
                                               measure=mean_absolute_error)
        test_eval_time = time.time() - tic
        self.log_success("Test MAE {} and took {} seconds".format(
            test_mae, test_eval_time))
        # evaluate R2 as well over train
        train_r2 = eval_regression_performance(trainset,
                                               lstm_nnet,
                                               scaler,
                                               measure=r2_score)
        self.log_success("Train R2 {}".format(train_r2))
        # and test
        test_r2 = eval_regression_performance(dataset_splits['testset'],
                                              lstm_nnet,
                                              scaler,
                                              measure=r2_score)
        self.log_success("Test R2 {}".format(test_r2))

        # also Maximum Error
        train_max_e = eval_regression_performance(trainset,
                                                  lstm_nnet,
                                                  scaler,
                                                  measure=max_error)

        self.log_success("Train MAXE {} ".format(train_max_e))

        test_max_e = eval_regression_performance(dataset_splits['testset'],
                                                 lstm_nnet,
                                                 scaler,
                                                 measure=max_error)

        self.log_success("Test MAXE {}".format(test_max_e))

        # Saving the training results to a csv table
        results = pd.DataFrame({
            'sensor': [target_sensor],
            'target_variable': [numeric_var],
            'hyperopt_pars': [hyperopt_pars],
            'optimal_pars': [optimal_pars],
            'model_loss': [model_loss],
            'optimizer': [optimizer],
            'early_stop_patience': [early_stop_patience],
            'epochs': [epochs],
            'train_mae': [train_mae],
            'test_mae': [test_mae],
            'train_r2': [train_r2],
            'test_r2': [test_r2],
            'train_max_e': [train_max_e],
            'test_max_e': [test_max_e],
            'train_time': [train_time],
            'train_eval_time': [train_eval_time],
            'test_eval_time': [test_eval_time],
            'trainset_size': [trainset['X'].shape[0]]
        })
        results.to_csv('{}{}_hyperopt_results_{}.csv'.format(
            output_results_path, str(script_config), time_stmp_str),
                       index=False)

        # Also we pack the model, scaler, optimal parameters and test MAE for future use in predictions
        model_package = {
            'model': lstm_nnet,
            'scaler': scaler,
            'test_mae': test_mae,
            'optimal_pars': optimal_pars
        }

        with open(
                '{}{}_model_hyperopt_package_{}.model'.format(
                    output_models_path, str(script_config), time_stmp_str),
                'wb') as file_pi:
            pickle.dump(model_package, file_pi)
Beispiel #5
0
    def predict_lstm(self,
                     script_config,
                     model_package_name,
                     future_steps,
                     step_mins=15):
        """Predict future measurements with a packaged, pre-trained LSTM.

        A slice of the target sensor's time series is read through
        ``self.read_time_series_from_db``, scaled with the scaler stored in
        the model package and passed to ``predict_with_model``. The resulting
        predictions are handed to ``self.save_predictions``; timings are
        reported through ``self.log_success``.

        Args:
            script_config: configuration object. Attributes read here:
                ``n_past_steps``, ``date_col``, ``hr_col``, ``numeric_var``,
                ``sensor_var``, ``target_sensor`` and ``output_models_path``.
            model_package_name: path of the pickled model package. When falsy,
                the last ``*_model_hyperopt_package_*.model`` file (in sorted
                name order) found under ``output_models_path`` is used.
            future_steps: number of future steps, forwarded to
                ``predict_with_model``.
            step_mins: minutes between two consecutive predictions; turned
                into a ``numpy.timedelta64`` for ``self.save_predictions``.

        Raises:
            FileNotFoundError: ``model_package_name`` is falsy and no package
                file matches the search pattern.
        """

        # Read configuration data
        n_past_steps = script_config.n_past_steps

        self.log_success("using {} steps in the past".format(n_past_steps))

        date_col = script_config.date_col
        hr_col = script_config.hr_col
        numeric_var = script_config.numeric_var
        sensor_var = script_config.sensor_var
        target_sensor = script_config.target_sensor
        output_models_path = script_config.output_models_path

        # Read dataset slice from n past steps to build the datapoint
        raw_dataset = self.read_time_series_from_db(target_sensor, date_col,
                                                    hr_col, 'minute',
                                                    numeric_var, sensor_var,
                                                    n_past_steps)

        self.log_success("Dataset of shape {} read".format(raw_dataset.shape))

        # Get the variable of the interest meteorological measure
        time_series_dset = get_interest_variable(raw_dataset, sensor_var,
                                                 date_col, hr_col, numeric_var,
                                                 target_sensor)
        self.log_success(
            "Got time series dataset of shape {} with columns {}".format(
                time_series_dset.shape, time_series_dset.columns))

        # getting the values as array to build the datapoint
        datapoint = time_series_dset[numeric_var].values

        self.log_success("Got datapoint {}".format(datapoint))

        if not model_package_name:
            # glob returns matches in arbitrary order, so sort before taking
            # the last one. NOTE(review): this picks the newest package only
            # if file names end in a sortable timestamp - confirm against the
            # code that writes the packages.
            candidates = sorted(
                glob.glob('{}/*_model_hyperopt_package_*.model'.format(
                    output_models_path)))
            if not candidates:
                raise FileNotFoundError(
                    'No *_model_hyperopt_package_*.model file found in {}'.
                    format(output_models_path))
            model_package_name = candidates[-1]

        self.log_success(
            'Using {} packaged model to test'.format(model_package_name))

        # read the package object with the pre-trained model, scaler and extra objects
        # NOTE: pickle.load can execute arbitrary code; only load packages
        # that come from a trusted location.
        with open(model_package_name, 'rb') as file_pi:
            model_package = pickle.load(file_pi)

        # get the actual scaler from the packaged object
        scaler = model_package['scaler']

        # ensure all data is float
        datapoint = datapoint.astype('float32')
        self.log_success("Got datapoint as float32 {}".format(datapoint))
        # scale the datapoint; the scaler is fed a single column, hence the
        # reshape to (n, 1)
        datapoint_scaled = scaler.transform(datapoint.reshape(-1, 1))
        self.log_success("Got datapoint_scaled {}".format(datapoint_scaled))

        tic = time.time()
        # make the prediction for the future steps using the scaled datapoint;
        # the second returned value (named mae by the original code) is not
        # used in this method
        pred, _ = predict_with_model(datapoint_scaled,
                                     model_package_name,
                                     future_steps=future_steps)
        prediction_time = time.time() - tic
        self.log_success('#{},{},prediction_time,{}'.format(
            model_package_name, future_steps, prediction_time))

        # predictions are written starting from the last datetime that was read
        time_delta = np.timedelta64(step_mins, 'm')
        last_datetime = raw_dataset.tail(1).datetime.values[0]

        tic = time.time()
        # writing predictions to the DB
        self.save_predictions(target_sensor, pred, last_datetime, numeric_var,
                              time_delta)
        save_time = time.time() - tic
        self.log_success('#{},{},save_time,{}'.format(model_package_name,
                                                      future_steps, save_time))