def test_return_str():
    """Check that return_str output can be used as an exception message and is non-empty."""
    mylogging.config.COLOR = 1

    try:
        raise Exception(mylogging.return_str("asdas", caption="User"))
    except Exception:
        pass

    assert mylogging.return_str("asdas", caption="User")
def lnu_core(
    data: tuple[np.ndarray, np.ndarray],
    learning_rate: float,
    epochs: int,
    normalize_learning_rate: bool,
    early_stopping: bool = True,
    learning_rate_decay: float = 0.8,
    damping: int | float = 1,
    return_all: bool = False,
) -> np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Train a linear neural unit (adaptive linear filter) with gradient descent.

    Args:
        data (tuple[np.ndarray, np.ndarray]): Tuple (X, y_hat) of input vectors and target values.
        learning_rate (float): Step size of the weight update.
        epochs (int): Number of passes through the data.
        normalize_learning_rate (bool): If True, scale the learning rate by input signal power
            (normalized LMS).
        early_stopping (bool, optional): Stop when weights or epoch error stop changing. Defaults to True.
        learning_rate_decay (float, optional): Multiplicative learning rate decay applied each epoch.
            Defaults to 0.8.
        damping (int | float, optional): Regularization term in the learning rate normalization.
            Defaults to 1.
        return_all (bool, optional): If True, also return weight history, outputs and errors.
            Defaults to False.

    Returns:
        np.ndarray | tuple: Final weights w, or (w, w_all, y, error) if return_all.
    """
    X = data[0]
    y_hat = data[1]

    y = np.zeros(len(y_hat))
    error = np.zeros(len(y_hat)) if return_all else np.zeros(1)
    w = np.zeros(X.shape[1])
    last_running_error = np.inf
    w_all = np.zeros((X.shape[1], X.shape[0])) if return_all else None

    for epoch in range(epochs):
        running_error = np.zeros(1)

        for j in range(X.shape[0]):
            current_index = j if return_all else 0
            y[j] = np.dot(w, X[j])

            # Divergence guard
            if y[j] > y_hat.max() * 10e6:
                raise RuntimeError(mylogging.return_str("Model is unstable"))

            error[current_index] = y_hat[j] - y[j]
            running_error[0] = running_error[0] + abs(error[current_index])
            dydw = X[j]

            if normalize_learning_rate:
                # Normalized LMS - learning rate scaled by input signal power
                minorm = learning_rate / (damping + np.dot(X[j], X[j].T))
                dw = minorm * error[current_index] * dydw
            else:
                dw = learning_rate * error[current_index] * dydw

            w = w + dw

            if return_all:
                w_all[:, j] = w

        # Early stopping on tiny weight updates or stagnating average epoch error
        if (early_stopping and epoch > 1) and (
            sum(np.abs(dw)) / len(w) < 10e-8
            or ((running_error[0] / len(y_hat)) - last_running_error) < 10e-5
        ):
            break

        last_running_error = running_error[0] / len(y_hat)

        if learning_rate_decay:
            learning_rate = learning_rate * learning_rate_decay

    if return_all:
        return w, w_all, y, error
    else:
        return w
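
# Usage sketch (editor addition, illustrative only): recover known weights from
# a noisy linear signal. The (X, y_hat) tuple format matches get_inputs below;
# names and parameter values here are assumptions, not part of the original API.
def _lnu_core_demo():
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 3))  # one input vector per row
    true_w = np.array([0.5, -1.0, 2.0])
    y_hat = X @ true_w + rng.normal(scale=0.01, size=500)

    w = lnu_core((X, y_hat), learning_rate=0.5, epochs=20, normalize_learning_rate=True)
    print(w)  # should be close to true_w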
def get_inputs(
    input: tuple[np.ndarray, np.ndarray] | Sequences,
) -> tuple[np.ndarray, np.ndarray]:
    """Unpack model input into (X, y) regardless of container type."""
    if isinstance(input, Sequences):
        return input[0], input[1]

    if not isinstance(input, tuple):
        raise TypeError(
            mylogging.return_str(
                "Data must be a tuple of length 2 - input vector and output vector."
            )
        )

    if len(input) != 2:
        raise ValueError(
            mylogging.return_str(
                "Data must be a tuple of length 2 - input vector and output vector."
            )
        )

    return input[0], input[1]
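
# Usage sketch (editor addition): a plain (X, y) tuple unpacks unchanged; a
# mydatapreprocessing Sequences object is handled the same way.
def _get_inputs_demo():
    import numpy as np

    X = np.random.randn(10, 3)
    y = np.random.randn(10, 1)
    X_out, y_out = get_inputs((X, y))
    assert X_out is X and y_out is y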
def get_optimizers_loses_activations():
    """Return a list of tensorflow optimizers. It's used by the optimize function.

    Returns:
        list: List of tensorflow optimizers.
    """
    if not importlib.util.find_spec("tensorflow"):
        raise ModuleNotFoundError(
            mylogging.return_str(
                "Tensorflow model configured, but tensorflow library not installed. It's not "
                "in the general requirements, because it is very big and does not work everywhere. If you "
                "want to use the tensorflow model, install it via \n\n`pip install tensorflow`"
            )
        )

    import tensorflow as tf

    # Note: epsilon=None and the decay/schedule_decay arguments follow older
    # tf.keras optimizer signatures.
    sgd = tf.keras.optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    rmsprop = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, epsilon=None, decay=0.0)
    adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.01, epsilon=None, decay=0.0)
    adadelta = tf.keras.optimizers.Adadelta(learning_rate=1.0, rho=0.95, epsilon=None, decay=0.0)
    adam = tf.keras.optimizers.Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=None,
        decay=0.0,
        amsgrad=False,
    )
    adamax = tf.keras.optimizers.Adamax(
        learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0
    )
    nadam = tf.keras.optimizers.Nadam(
        learning_rate=0.002,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=None,
        schedule_decay=0.004,
    )

    return [sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam]
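
# Usage sketch (editor addition): trying each returned optimizer when compiling
# a small model, as an optimize routine might. Assumes a tf.keras version that
# still accepts the legacy optimizer arguments used above (epsilon=None, decay).
def _optimizers_demo():
    import tensorflow as tf

    for optimizer in get_optimizers_loses_activations():
        model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
        model.compile(optimizer=optimizer, loss="mse")
        print(type(optimizer).__name__, "compiled")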
def get_eeg(n=1000):
    """Download real EEG data.

    Args:
        n (int, optional): Length of data. Defaults to 1000.

    Returns:
        np.ndarray: EEG signal data.
    """
    if not importlib.util.find_spec("wfdb"):
        raise ModuleNotFoundError(
            mylogging.return_str(
                "For parsing EEG signal, the wfdb library is necessary. Install it with `pip install wfdb`"
            )
        )

    import wfdb

    return wfdb.rdrecord(
        "a103l", pn_dir="challenge-2015/training/", channels=[1], sampto=n
    ).p_signal
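
# Usage sketch (editor addition): fetch a short EEG segment and plot it.
# Requires the optional wfdb dependency and network access to PhysioNet.
def _eeg_demo():
    import matplotlib.pyplot as plt

    signal = get_eeg(n=500)  # array of shape (500, 1)
    plt.plot(signal)
    plt.title("EEG sample - PhysioNet challenge-2015, record a103l")
    plt.show()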
def train(
    data: tuple[np.ndarray, np.ndarray],
    layers: Literal["lstm", "mlp"] | list[tuple[str, dict]] = "mlp",
    epochs: int = 100,
    load_trained_model: bool = True,
    update_trained_model: bool = True,
    save_model: bool = True,
    saved_model_path_string: str = "stored_models",
    optimizer: str = "adam",
    loss: str = "mse",
    summary: bool = False,
    verbose=0,
    used_metrics="accuracy",
    timedistributed=False,
    batch_size=64,
):
    """Tensorflow model. Neural nets - LSTM or MLP (dense layers). Layers are customizable with arguments.

    Args:
        data (tuple[np.ndarray, np.ndarray]): Tuple (X, y) of input train vectors X and train outputs y.
        layers (Literal["lstm", "mlp"] | list[tuple[str, dict]], optional): List of tuples of layer
            name (e.g. 'lstm') and layer params dict, e.g. (("lstm", {"units": 7, "activation": "relu"})).
            Check the default layers list here for an example. There are also some predefined
            architectures. You can use 'lstm' or 'mlp'. Defaults to 'mlp'.
        epochs (int, optional): Number of epochs to evaluate. Defaults to 100.
        load_trained_model (bool, optional): If True, load model from disk. Most of the time is spent
            on training, so if the model is loaded and not updated, it's very fast. Defaults to True.
        update_trained_model (bool, optional): If load_trained_model is True, the loaded model is
            updated with new input. Defaults to True.
        save_model (bool, optional): If True, save model on disk to saved_model_path_string.
            Defaults to True.
        saved_model_path_string (str, optional): Full path to saved model with name.
            E.g. '/home/dan/mymodel.h5'. If 'stored_models', then it's saved to the library folder
            models/stored_models. Defaults to 'stored_models'.
        optimizer (str, optional): Used optimizer. Defaults to 'adam'.
        loss (str, optional): Loss function. Defaults to 'mse'.
        summary (bool, optional): Display model details table. Defaults to False.
        verbose (int, optional): Whether to display a progress bar. Defaults to 0.
        used_metrics (str, optional): Used metrics. 'accuracy' or 'mape'. Defaults to 'accuracy'.
        timedistributed (bool, optional): Whether to add a time distributed layer. Defaults to False.
        batch_size (int, optional): Used batch size. Defaults to 64.

    Returns:
        model: Trained model object.
    """
    if not importlib.util.find_spec("tensorflow"):
        raise ModuleNotFoundError(
            mylogging.return_str(
                "Tensorflow model configured, but tensorflow library not installed. It's not "
                "in the general requirements, because it is very big and does not work everywhere. If you "
                "want to use the tensorflow model, install it via \n\n`pip install tensorflow`"
            )
        )

    import tensorflow as tf
    from tensorflow.keras import Sequential
    from tensorflow.keras import layers as tf_layers
    from tensorflow.keras import metrics as tf_metrics
    from tensorflow.keras import models as tf_models
    from tensorflow.keras import Model as tf_model_type

    X, y = get_inputs(data)
    X_ndim = X.ndim

    models = {
        "dense": tf_layers.Dense,
        "lstm": tf_layers.LSTM,
        "mlp": tf_layers.Dense,
        "gru": tf_layers.GRU,
        "conv2d": tf_layers.Conv2D,
        "rnn": tf_layers.SimpleRNN,
        "convlstm2d": tf_layers.ConvLSTM2D,
        "dropout": tf_layers.Dropout,
        "batchnormalization": tf_layers.BatchNormalization,
    }

    if used_metrics == "accuracy":
        metrics = [tf_metrics.Accuracy()]
    elif used_metrics == "mape":
        metrics = [tf_metrics.MeanAbsolutePercentageError()]
    else:
        raise ValueError("metrics has to be one from ['accuracy', 'mape']")

    if saved_model_path_string == "stored_models":
        saved_model_path_string = str(Path(__file__).resolve().parent / "stored_models" / "tensorflow.h5")

    if load_trained_model:
        try:
            model = tf_models.load_model(saved_model_path_string)
            model = cast(tf_model_type, model)
            model.load_weights(saved_model_path_string)
        except Exception:
            raise NameError("Model is not saved, run first with save_model = True")

        if update_trained_model:
            model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)

    else:
        if isinstance(layers, str):
            if layers == "lstm":
                layers = [
                    ("lstm", {"units": 32, "activation": "relu", "return_sequences": 1}),
                    ("dropout", {"rate": 0.1}),
                    ("lstm", {"units": 7, "activation": "relu"}),
                ]
            elif layers == "mlp":
                layers = [
                    ("dense", {"units": 32, "activation": "relu"}),
                    ("dropout", {"rate": 0.1}),
                    ("dense", {"units": 7, "activation": "relu"}),
                ]
            else:
                raise ValueError(
                    mylogging.return_str("Only possible predefined layers are 'lstm' and 'mlp'.")
                )

        layers = cast(list[tuple[str, dict[str, Any]]], layers)

        if layers[0][0] == "lstm":
            if X.ndim == 2:
                X = X.reshape(X.shape[0], X.shape[1], 1)
            layers[0][1]["input_shape"] = (X.shape[1], X.shape[2])

        elif layers[0][0] == "dense":
            layers[0][1]["input_shape"] = (X.shape[1],)
            if X.ndim > 2:
                raise ValueError(
                    mylogging.return_str(
                        "For a dense first layer, only univariate data are supported (e.g. "
                        "shape = (n_samples, n_features)). If ndim > 2: serialize first."
                    )
                )

        model = Sequential()

        for i in layers:
            model.add(models[i[0]](**i[1]))

        if timedistributed == 1:
            model.add(tf_layers.TimeDistributed(tf_layers.Dense(y.shape[1])))
        else:
            model.add(tf_layers.Dense(y.shape[1]))

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

        if summary:
            model.summary()

        model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)

    if save_model == 1:
        model.save(saved_model_path_string)

    model.layers_0_0 = layers[0][0]
    model.X_ndim = X_ndim
    model.y_shape_1 = y.shape[1]

    return model
def predict_multiple_columns(
    data=None,
    predicted_columns: list | tuple | str | None = None,
    freqs: list | tuple | str | None = None,
    config: predictit.configuration.Config | dict | None = None,
    **kwargs,
) -> predictit._result_classes.Multiple:
    """Predict multiple columns and multiple frequencies at once. Uses the predict function.

    Only data and predicted_columns can be positional.

    Check README or tests for working examples.

    Args:
        data (np.ndarray, pd.DataFrame): Time series. Can be 2-D - more columns.
            !!! In numpy array use data series as rows, but in dataframe use columns !!!
            Defaults to None.
        predicted_columns (list | tuple | str | None, optional): List of indexes of predicted columns
            or their names (dataframe). Defaults to None.
        freqs (list | tuple | str | None, optional): If date index is available, resample data and
            predict in the defined time frequency. If None, then the value from config will be used.
            Defaults to None.
        config (predictit.configuration.Config | dict | None, optional): Settings as Config instance
            or dictionary. Check the class for what you can use. If None, then default config will be
            used. Defaults to None.
        **kwargs (dict, optional): There are many more parameters of this function. Check
            configuration.py for parameter details.

    Returns:
        predictit._result_classes.Multiple: All the predicted results.
    """
    if config is None or isinstance(config, dict):
        update_config = config
        config = config_default
        config = config.copy()
        if update_config:
            config.update(update_config)

    elif isinstance(config, predictit.configuration.Config):
        config = config.copy()

    # Edit configuration.py default values with arguments values if they exist
    if data is not None:
        config.data = data

    if predicted_columns is not None:
        config.predicted_columns = predicted_columns

    if freqs is not None:
        config.freqs = freqs

    if not config.predicted_columns or not isinstance(config.predicted_columns, list):
        raise TypeError(
            mylogging.return_str(
                "The predict_multiple function needs the predicted_columns config value to be a list."
            )
        )

    config.update(kwargs)

    predictit._helpers.logger_init_from_config(config.output.logger_subconfig)

    if not config.data_input.freqs:
        freqs = ["Default frequency"]
    else:
        freqs = config.data_input.freqs

    if config.predicted_columns in ["*", ["*"]]:

        if isinstance(config.data, str):
            config.data = mdp.load_data.load_data(
                config.data,
                header=config.header,
                csv_style=config.csv_style,
                predicted_table=config.predicted_table,
                max_imported_length=config.max_imported_length,
                request_datatype_suffix=config.request_datatype_suffix,
                data_orientation=config.data_orientation,
            )

        config.predicted_columns = mdp.preprocessing.data_consolidation(config.data).columns

    results = {}
    best_predictions_dataframes = {}

    for f in freqs:

        result_dataframe = pd.DataFrame()

        for c in config.predicted_columns:
            config.predicted_column = c
            config.freq = f

            result_name = f"Column: {c}" if len(freqs) == 1 else f"Column: {c} - Freq: {f}"

            try:
                results[result_name] = predict(config=config)
                result_dataframe[c] = results[result_name].best_prediction
            except Exception:
                mylogging.traceback(
                    f"Error in making predictions on column {c} and freq {f}",
                    level="ERROR",
                )

        best_predictions_dataframes[f"Freq: {f}"] = result_dataframe

    return predictit._result_classes.Multiple(
        best_predictions_dataframes=best_predictions_dataframes, results=results
    )
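
# Usage sketch (editor addition): predict two columns of a random-walk frame.
# `predicts` is one of the config values that can be passed through **kwargs.
def _predict_multiple_demo():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(300, 2).cumsum(axis=0), columns=["a", "b"])
    multiple = predict_multiple_columns(df, predicted_columns=["a", "b"], predicts=4)
    print(multiple.best_predictions_dataframes)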
def predict(
    data=None,
    predicted_column: None | str | int = None,
    config: predictit.configuration.Config | dict | None = None,
    **kwargs,
) -> predictit._result_classes.Result:
    """Make predictions, mostly on time-series data.

    Data input and other config options can be set up in configuration.py or overwritten on the fly.
    Setup can also be done as function input arguments or as command line arguments (which will
    overwrite config values). There are working examples in the main readme and also in the test_it
    module. The function can be configured with a Config object from configuration, with command line
    arguments, as well as with function parameters. There are only two possible positional
    parameters - `data` and `predicted_column`. The rest of the parameters must be named parameters.
    Params are not documented here, because all config params work here when passed as kwargs.

    Args:
        data (np.ndarray, pd.DataFrame, str): Time series. Can be 2-D - more columns. Can be numpy
            array, DataFrame, path to file or url. Examples: "/home/user/my.json", or
            "https://yoururl/your.csv" or np.random.randn(100, 2).
        predicted_column (None | str | int, optional): Index of predicted column or its name
            (dataframe). If a list with more values is passed, only the first one will be evaluated
            (use the predict_multiple_columns function if you need that). Defaults to None.
        config (predictit.configuration.Config | dict | None, optional): Settings as Config instance
            or dictionary. Check the class for what you can use. If None, then default config will be
            used. Defaults to None.
        **kwargs (dict, optional): There are many more parameters of the predict function. Check
            configuration.py for parameter details.

    Returns:
        Depends on the 'return_type' config value - returns best prediction (np.ndarray), all models'
        results (np.ndarray), detailed results (dict), or an interactive plot, or prints tables of
        results.
    """
    from mypythontools.plots import plot

    if config is None or isinstance(config, dict):
        update_config = config
        config = config_default
        config = config.copy()
        if update_config:
            config.update(update_config)

    elif isinstance(config, predictit.configuration.Config):
        config = config.copy()

    if config.use_config_preset and config.use_config_preset != "none":
        updated_config = config.presets[config.use_config_preset]
        config.update(updated_config)

    # Edit configuration.py default values with arguments values if they exist
    if data is not None:
        config.data = data

    if predicted_column is not None:
        config.predicted_column = predicted_column

    config.update(kwargs)

    predictit._helpers.logger_init_from_config(config.output.logger_subconfig)

    # repeatit = 0 actually means evaluate once
    if not config.repeatit:
        config.repeatit = 1

    _GUI = GLOBAL_VARS.GUI

    # Add everything printed + warnings to a variable to be able to print it in GUI
    if _GUI:
        stdout = sys.stdout
        sys.stdout = io.StringIO()

    # Don't want to define in gui condition, so if not gui, do nothing
    if _GUI:

        def update_gui(content, html_id):
            try:
                predictit.gui_start.edit_gui_py(content, html_id)
            except Exception:
                pass

    else:

        def update_gui(content, html_id):
            pass

    # Definition of the table for time spent on code parts
    time_df = []

    def update_time_table(time_last):
        time_df.append([progress_phase, round((time.time() - time_last), 3)])
        return time.time()

    time_point = time_begin = time.time()

    ###############
    ### ANCHOR ### Data
    #############

    progress_phase = "Data loading and preprocessing"
    update_gui(progress_phase, "progress_phase")

    data = mdp.load_data.load_data(
        config.data,
        header=config.header,
        csv_style=config.csv_style,
        predicted_table=config.predicted_table,
        max_imported_length=config.max_imported_length,
        request_datatype_suffix=config.request_datatype_suffix,
        data_orientation=config.data_orientation,
    )

    ###############
    ### ANCHOR ### Data consolidation
    #############

    if not config.predicted_column:
        config.predicted_column = 0

    data_for_predictions_df = mdp.preprocessing.data_consolidation(
        data,
        predicted_column=config.predicted_column,
        other_columns=config.other_columns,
        datalength=config.datalength,
        datetime_column=config.datetime_column,
        unique_threshold=config.unique_threshold,
        embedding=config.embedding,
        freq=config.freq,
        resample_function=config.resample_function,
        remove_nans_threshold=config.remove_nans_threshold,
        remove_nans_or_replace=config.remove_nans_or_replace,
        dtype=config.dtype,
    )

    # In data consolidation, predicted column was moved to index 0 as the first column
    predicted_column_index = 0
    predicted_column_name = data_for_predictions_df.columns[0]

    ###############
    ### ANCHOR ### Analyze original data
    #############

    column_for_predictions_series = data_for_predictions_df.iloc[:, 0:1]
    results = {}
    data_inputs = []

    if config.mode == "validate":
        column_for_predictions_series = column_for_predictions_series.iloc[: -config.output.predicts, :]
        config.repeatit = 1

    for i in config.used_models:
        data_inputs.append(config.models_input[i])
    data_inputs = set(data_inputs)

    if config.analyzeit == 1 or config.analyzeit == 3:
        print("Analyze of unprocessed data")
        try:
            predictit.analyze.analyze_column(data_for_predictions_df.values[:, 0], window=30)
            predictit.analyze.analyze_data(data_for_predictions_df)
            predictit.analyze.decompose(
                data_for_predictions_df.values[:, 0],
                **config.analyze_seasonal_decompose,
            )
        except Exception:
            mylogging.traceback("Analyze failed", level="ERROR")

    semaphor = None

    if config.multiprocessing:
        multiprocessing.freeze_support()

        if not config.processes_limit:
            config.processes_limit = multiprocessing.cpu_count() - 1

        if config.multiprocessing == "process":
            pipes = []
            semaphor = multiprocessing.Semaphore(config.processes_limit)

        elif config.multiprocessing == "pool":
            pool = multiprocessing.Pool(config.processes_limit)

            # It is not easy to share data in multiprocessing, so results are returned via callback function
            def return_result(result):
                for i, j in result.items():
                    results[i] = j

    ### Optimization loop

    if (
        not config.optimization
        or not config.optimization_variable
        or not config.optimization_values
        or len(config.optimization_values) == 1
    ):
        config.variable_optimization.optimization = False
        config.optimization_values = ["Not optimized"]
        config.optimization_variable = None

    time_point = update_time_table(time_point)
    progress_phase = "Predict"
    update_gui(progress_phase, "progress_phase")

    models_indexed = {i: j for i, j in enumerate(config.used_models)}

    ###############
    ### ANCHOR ### Main loop
    #############

    for optimization_index, optimization_value in enumerate(config.optimization_values):

        # TODO check why setattr - may be wrong after config change
        if config.optimization_variable:
            setattr(config, config.optimization_variable, optimization_value)

        ###############
        ### ANCHOR ### Feature extraction
        #############

        if config.add_fft_columns:
            data_for_predictions_df = mdp.feature_engineering.add_frequency_columns(
                data_for_predictions_df,
                window=config.feature_engineering.add_fft_columns,
            )

        if config.data_extension:
            data_for_predictions_df = mdp.feature_engineering.add_derived_columns(
                data_for_predictions_df, **config.feature_engineering.data_extension
            )

        ###############
        ### ANCHOR ### Feature selection
        #############

        # data_for_predictions_df TODO

        ###############
        ### ANCHOR ### Data preprocessing
        #############

        if config.mode == "validate":
            test_unstandardized = mdp.misc.split(data_for_predictions_df, predicts=config.predicts)[1].values
            models_test_outputs_unstandardized = [test_unstandardized]
        else:
            models_test_outputs_unstandardized = mdp.create_model_inputs.create_tests_outputs(
                data_for_predictions_df.values[:, 0],
                predicts=config.predicts,
                repeatit=config.repeatit,
            )

        data_for_predictions, last_undiff_value, final_scaler = mdp.preprocessing.preprocess_data(
            data_for_predictions_df.values,
            remove_outliers=config.remove_outliers,
            smoothit=config.smoothit,
            correlation_threshold=config.correlation_threshold,
            data_transform=config.data_transform,
            standardizeit=config.standardizeit,
            bins=config.bins,
            binning_type=config.binning_type,
        )

        data_for_predictions = cast(np.ndarray, data_for_predictions)

        if config.mode == "validate":
            data_for_predictions, test = mdp.misc.split(data_for_predictions, predicts=config.predicts)
            models_test_outputs = [test]
        else:
            models_test_outputs = mdp.create_model_inputs.create_tests_outputs(
                data_for_predictions[:, 0],
                predicts=config.predicts,
                repeatit=config.repeatit,
            )

        column_for_predictions_processed = data_for_predictions[:, predicted_column_index]

        data_shape = np.shape(data_for_predictions)
        data_length = len(column_for_predictions_processed)

        data_std = np.std(column_for_predictions_processed[-30:])
        data_mean = np.mean(column_for_predictions_processed[-30:])
        data_abs_max = max(
            abs(column_for_predictions_processed.min()),
            abs(column_for_predictions_processed.max()),
        )

        multicolumn = 0 if data_shape[1] == 1 else 1

        if (config.analyzeit == 2 or config.analyzeit == 3) and optimization_index == len(
            config.optimization_values
        ) - 1:
            print("\n\nAnalyze of preprocessed data\n")
            try:
                predictit.analyze.analyze_column(column_for_predictions_processed, window=30)
                predictit.analyze.analyze_data(data_for_predictions)
                predictit.analyze.decompose(
                    column_for_predictions_processed,
                    **config.analyze_seasonal_decompose,
                )
            except Exception:
                mylogging.traceback("Analyze failed", level="ERROR")

        min_data_length = 3 * config.predicts + config.default_n_steps_in

        if (
            data_length < min_data_length
            or data_length < config.repeatit + config.default_n_steps_in + config.predicts
        ):
            config.repeatit = 1
            min_data_length = 3 * config.predicts + config.default_n_steps_in

        assert min_data_length < data_length, mylogging.return_str(
            "Set up less predicted values in settings or add more data",
            caption="Too few data",
        )

        for data_inputs_name in data_inputs:
            try:
                (
                    model_train_input,
                    model_predict_input,
                    model_test_inputs,
                ) = mdp.create_model_inputs.create_inputs(
                    data_for_predictions,
                    input_type_name=data_inputs_name,
                    input_type_params=config.data_inputs[data_inputs_name],
                    mode=config.mode,
                    predicts=config.predicts,
                    repeatit=config.repeatit,
                    predicted_column_index=predicted_column_index,
                )
            except Exception:
                mylogging.traceback(
                    f"Error in creating input type: {data_inputs_name} with option optimization: {optimization_value}",
                    level="WARNING",
                )
                continue

            for iterated_model_index, iterated_model_name in models_indexed.items():
                iterated_model = predictit.models.models_assignment[iterated_model_name]

                if config.models_input[iterated_model_name] == data_inputs_name:

                    predict_parameters = {
                        "config": config.get_dict(),
                        # Functions to not import all modules
                        "preprocess_data_inverse": mdp.preprocessing.preprocess_data_inverse,
                        "fitted_power_transform": mdp.preprocessing.fitted_power_transform,
                        # Other
                        "iterated_model_train": iterated_model.train,
                        "iterated_model_predict": iterated_model.predict,
                        "iterated_model_name": iterated_model_name,
                        "iterated_model_index": iterated_model_index,
                        "optimization_index": optimization_index,
                        "optimization_value": optimization_value,
                        "model_train_input": model_train_input,
                        "model_predict_input": model_predict_input,
                        "model_test_inputs": model_test_inputs,
                        "models_test_outputs": models_test_outputs,
                        "models_test_outputs_unstandardized": models_test_outputs_unstandardized,
                        "data_abs_max": data_abs_max,
                        "data_mean": data_mean,
                        "data_std": data_std,
                        "last_undiff_value": last_undiff_value,
                        "final_scaler": final_scaler,
                        "semaphor": semaphor,
                    }

                    if config.models_input[iterated_model_name] in ["one_step", "one_step_constant"]:
                        if multicolumn and config.predicts > 1:
                            mylogging.warn(
                                f"Warning in model {iterated_model_name} \n\nOne-step prediction on "
                                "multivariate data (more columns). Use multi_step (y length equals to predicts) "
                                "or use some one-column data input in config models_input or predict just one value."
                            )
                            continue

                    if config.multiprocessing == "process":
                        pipes.append(multiprocessing.Pipe(duplex=False))
                        p = multiprocessing.Process(
                            target=predictit._main_loop.train_and_predict,
                            kwargs={**predict_parameters, **{"pipe": pipes[-1][1]}},
                        )
                        p.daemon = True  # Child process will be terminated if parent is killed
                        p.start()

                    elif config.multiprocessing == "pool":
                        pool.apply_async(
                            predictit._main_loop.train_and_predict,
                            (),
                            predict_parameters,
                            callback=return_result,
                        )

                    else:
                        results = {
                            **results,
                            **predictit._main_loop.train_and_predict(**predict_parameters),
                        }

    if config.multiprocessing:
        if config.multiprocessing == "process":
            for i in pipes:
                try:
                    results = {**results, **i[0].recv()}
                except Exception:
                    pass

        if config.multiprocessing == "pool":
            pool.close()
            pool.join()

        for i in results.values():
            mylogging.my_logger.log_and_warn_from_lists(i["logs_list"], i["warnings_list"])

    # Create confidence intervals
    if config.confidence_interval:
        try:
            lower_bound, upper_bound = predictit.misc.confidence_interval(
                column_for_predictions_series.values,
                predicts=config.predicts,
                confidence=config.confidence_interval,
            )
            grey_area = ["Lower bound", "Upper bound"]
            bounds = True
        except Exception:
            bounds = False
            grey_area = ["Lower bound", "Upper bound"]
            mylogging.traceback("Error in computing confidence interval", level="ERROR")
    else:
        bounds = False
        grey_area = False

    ###############
    ### ANCHOR ### Results processing
    #############

    # Criterion is the best of averages from repetitions

    time_point = update_time_table(time_point)
    progress_phase = "Evaluation"
    update_gui(progress_phase, "progress_phase")

    # Two kinds of results will be created, both as dataframes:
    #   - first, all the details around prediction - model errors, time, memory peak etc.
    #   - second, the predicted values.
    # Results such as the trained model etc. that cannot be displayed in a dataframe stay in the original results dict.

    # Convert results from dictionary to dataframe - exclude objects like trained model
    results_df = pd.DataFrame.from_dict(results, orient="index")

    if results_df.empty:
        raise RuntimeError(
            mylogging.return_str(
                "None of the models finished predictions. Set config.logger_level = 'DEBUG' for more info.",
                caption="All models failed for some reason",
            )
        )

    evaluated_matrix = np.zeros((1, len(config.optimization_values), len(config.used_models)))
    evaluated_matrix.fill(np.nan)

    for k in results.values():
        evaluated_matrix[0, k["Index"][0], k["Index"][1]] = k["Model error"]

    (
        _,
        best_models_optimized_values,
        optimized_values_results_df,
        best_model_name,
        best_optimized_value,
    ) = predictit.analyze.analyze_results(
        evaluated_matrix,
        config.optimization_values,
        config.models.used_models,
        config.prediction.error_criterion,
    )

    # Generate date indexes for result predictions
    last_date = column_for_predictions_series.index[-1]

    if isinstance(
        last_date,
        (pd.core.indexes.datetimes.DatetimeIndex, pd._libs.tslibs.timestamps.Timestamp),
    ):
        date_index = pd.date_range(
            start=last_date,
            periods=config.predicts + 1,
            freq=column_for_predictions_series.index.freq,
        )[1:]
        date_index = pd.to_datetime(date_index)
    else:
        date_index = list(range(last_date + 1, last_date + config.predicts + 1))

    predictions_df = pd.DataFrame(index=date_index)

    results_df.sort_values("Model error", inplace=True)

    for i, row in results_df.iterrows():
        predictions_df[i] = row["Results"]

    if predictions_df.empty:
        raise RuntimeError(
            mylogging.return_str(
                "None of the models finished prediction. Set config.logger_level = 'DEBUG' for more info."
            )
        )

    if config.variable_optimization.optimization:
        best_optimized_values_dict = {
            j: best_models_optimized_values[i] for i, j in enumerate(config.used_models)
        }

        best_indexes = []

        for i, row in results_df.iterrows():
            if row["Optimization value"] == best_optimized_values_dict[row["Name"]]:
                best_indexes.append(i)

        optimization_result = predictit._result_classes.Optimization(
            optimized_variable=config.variable_optimization.optimization_variable,
            optimized_options=config.variable_optimization.optimization_values,
            best_value=best_optimized_value,
            values_results_df=optimized_values_results_df,
            best_values_for_models=best_optimized_values_dict,
            all_models_results_df=results_df,
            all_models_predictions_df=predictions_df,
        )

        predictions_df = predictions_df[best_indexes]
        predictions_df.columns = [results_df.loc[i]["Name"] for i in predictions_df.columns]
        results_df = results_df.loc[best_indexes]

        results_df.rename(columns={"A": "Col_1"}, inplace=True)

    else:
        optimization_result = None

    if config.hyperparameter_optimization.optimizeit:
        hyperparameter_optimization_kwargs = results_df["Best optimized parameters"].to_dict()
    else:
        hyperparameter_optimization_kwargs = None

    results_df.set_index("Name", inplace=True)

    results_to_drop = [
        i
        for i in ["Index", "Trained model", "Test errors", "Results", "logs_list", "warnings_list"]
        if i in results_df.columns
    ]
    results_df.drop(columns=results_to_drop, inplace=True)

    best_model_predicts = predictions_df[best_model_name]

    ###############
    ### ANCHOR ### Plot
    #############

    if config.variable_optimization.optimization and config.variable_optimization.plot_all_optimized_models:
        predictions_for_plot = optimization_result.all_models_predictions_df.copy()
    else:
        predictions_for_plot = predictions_df.copy()

    predictions_for_plot.columns = [
        f"{i + 1} - {j}" for i, j in enumerate(predictions_for_plot.columns)
    ]

    if config.mode == "validate":
        best_model_name_plot = "Test"
        predictions_df.insert(0, "Test", test_unstandardized)
        predictions_for_plot.insert(0, "Test", test_unstandardized)
    else:
        best_model_name_plot = predictions_for_plot.columns[0]

    bounds_df = pd.DataFrame(index=date_index)

    if bounds:
        bounds_df["Upper bound"] = upper_bound
        bounds_df["Lower bound"] = lower_bound

    last_value = float(column_for_predictions_series.iloc[-1, 0])

    predictions_for_plot_limited = pd.concat(
        [predictions_for_plot.iloc[:, : config.plot_number_of_models], bounds_df],
        axis=1,
    )

    predictions_with_history = pd.concat(
        [
            column_for_predictions_series[-config.plot_history_length :],
            predictions_for_plot_limited,
        ],
        sort=False,
    )
    predictions_with_history.iloc[-config.predicts - 1, :] = last_value

    if config.sort_results_by == "name":
        results_df.sort_index(key=lambda x: x.str.lower(), inplace=True)
        predictions_df.sort_index(key=lambda x: x.str.lower(), inplace=True)

    if config.general.analyzeit:
        import matplotlib.pyplot as plt

        plt.show()

    time_point = update_time_table(time_point)
    progress_phase = "plot"
    update_gui(progress_phase, "progress_phase")

    if config.output.plot_subconfig.show_plot or config.output.plot_subconfig.save_plot:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)

            return_div = True if _GUI else False

            if config.plot_type == "with_history":
                div = plot(
                    predictions_with_history,
                    plot_library=config.plot_library,
                    plot_name=config.plot_name,
                    legend=config.plot_legend,
                    highlighted_column=predicted_column_name,
                    surrounded_column=best_model_name_plot,
                    grey_area=grey_area,
                    save=config.save_plot,
                    return_div=return_div,
                    show=config.output.plot_subconfig.show_plot,
                )
            elif config.plot_type == "just_results":
                div = plot(
                    predictions_for_plot,
                    plot_library=config.plot_library,
                    legend=config.plot_legend,
                    highlighted_column=best_model_name_plot,
                    save=config.save_plot,
                    show=config.output.plot_subconfig.show_plot,
                )

    update_time_table(time_point)
    progress_phase = "Completed"
    update_gui(progress_phase, "progress_phase")

    ###############
    ### ANCHOR ### Table
    #############

    time_df.append(["Complete time", round((time.time() - time_begin), 3)])
    time_df = pd.DataFrame(time_df, columns=["Part", "Time"])

    simple_table_df = mdp.misc.edit_table_to_printable(
        results_df[["Model error"]].iloc[: config.print_number_of_models, :].reset_index()
    )

    detailed_table_df = results_df.iloc[: config.print_number_of_models, :].reset_index()
    detailed_table_df.drop(["Unstandardized model error"], axis=1, inplace=True)
    detailed_table_df = mdp.misc.edit_table_to_printable(detailed_table_df)

    tables = predictit._result_classes.Tables(
        simple=tabulate(
            simple_table_df.values,
            headers=["Model", f"Average {config.error_criterion} error"],
            **config.table_settings,
        ),
        detailed=tabulate(
            detailed_table_df.values,
            headers=detailed_table_df.columns,
            **config.table_settings,
        ),
        time=tabulate(time_df.values, headers=time_df.columns, **config.table_settings),
        simple_table_df=simple_table_df,
        detailed_table_df=detailed_table_df,
    )

    ###############
    ### ANCHOR ### Results
    #############

    misc_result = predictit._result_classes.Misc(evaluated_matrix=evaluated_matrix)

    result = predictit._result_classes.Result(
        best_prediction=best_model_predicts,
        best_model_name=best_model_name,
        predictions=predictions_df,
        results_df=results_df,
        results=results,
        with_history=predictions_with_history,
        tables=tables,
        config=config,
        misc=misc_result,
        optimization=optimization_result,
        hyperparameter_optimization_kwargs=hyperparameter_optimization_kwargs,
    )

    ###############
    ### ANCHOR ### Print
    #############

    if config.print_result_details:
        print(
            f"\nBest model is {best_model_name} with results \n\n{best_model_predicts}\n\nWith model error {config.error_criterion} = "
            f"{results_df.loc[best_model_name, 'Model error']}"
        )

    if config.print_table == "simple":
        print(f"\n{tables.simple}\n")
    elif config.print_table == "detailed":
        print(f"\n{tables.detailed}\n")

    if config.print_time_table:
        print(f"\n{tables.time}\n")

    ###############
    ### ANCHOR ### Return
    #############

    mylogging.reset_outer_warnings_filter()

    # Return stdout and stop collecting warnings and printed output
    if _GUI:
        output = sys.stdout.getvalue()
        sys.stdout = stdout
        result.output = output
        print(output)

    if _GUI:
        result.plot = div

    if config.return_internal_results:
        return {
            "data_for_predictions (X, y)": data_for_predictions,
            "model_train_input": model_train_input,
            "model_predict_input": model_predict_input,
            "model_test_inputs": model_test_inputs,
            "models_test_outputs": models_test_outputs,
        }

    return result
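
# Usage sketch (editor addition): the two positional parameters plus config
# values as kwargs - here `predicts`, the number of predicted steps.
def _predict_demo():
    import numpy as np

    result = predict(np.random.randn(300).cumsum(), predicts=7)
    print(result.best_model_name)
    print(result.best_prediction)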
def train(
    data: tuple[np.ndarray, np.ndarray],
    model="BayesianRidge",
    n_estimators=100,
    alpha=0.0001,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    n_iter=300,
    epsilon=1.35,
    alphas=[0.1, 0.5, 1],
    gcv_mode="auto",
    solver="auto",
    n_hidden=20,
    rbf_width=0,
    activation_func="selu",
    # load_trained_model=0, update_trained_model=1, save_model=1, saved_model_path_string='stored_models',
) -> Any:
    """Sklearn model. Models as input parameter. Can be linear, ridge, Huber or much more. It also
    contains the extreme learning machine model from sklearn extensions.

    Note:
        There are many parameters in the function, but each model uses just a few of them.
        Usually the default parameters are enough.

    Some of the models are regressors and some are classifiers. If it's a classifier, it's optimal
    to have data sorted into a limited number of bins.

    Args:
        data (tuple[np.ndarray, np.ndarray]): Tuple (X, y) of input train vectors X and train
            outputs y. Insert input with no constant column - it is added by default in sklearn.
            Check `mydatapreprocessing` for how to generate the output.
        model ((str, object), optional): Model that will be used. You can insert the model itself or
            just the name of the used class. All possible options are below in the docs.
            Defaults to 'BayesianRidge'.
        n_estimators (int, optional): Parameter of some model. Defaults to 100.
        alpha (float, optional): Parameter of some model. Defaults to 0.0001.
        alpha_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        alpha_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        n_iter (int, optional): Parameter of some model. Defaults to 300.
        epsilon (float, optional): Parameter of some model. Defaults to 1.35.
        alphas (list, optional): Parameter of some model. Defaults to [0.1, 0.5, 1].
        gcv_mode (str, optional): Parameter of some model. Defaults to 'auto'.
        solver (str, optional): Parameter of some model. Defaults to 'auto'.
        n_hidden (int, optional): Parameter of some model. Defaults to 20.
        rbf_width (int, optional): Parameter of some model. Defaults to 0.
        activation_func (str, optional): Parameter of some model. Defaults to 'selu'.

    Returns:
        np.ndarray: Predictions of input time series.

    Options if string::

        ['PLSRegression', 'RandomForestRegressor', 'ExtraTreesRegressor', 'BaggingRegressor',
        'GradientBoostingRegressor', 'AdaBoostRegressor', 'VotingRegressor', 'StackingRegressor',
        'RandomForestClassifier', 'ExtraTreesClassifier', 'BaggingClassifier',
        'GradientBoostingClassifier', 'AdaBoostClassifier', 'VotingClassifier', 'StackingClassifier',
        'GaussianProcessRegressor', 'GaussianProcessClassifier', 'IsotonicRegression',
        'ARDRegression', 'HuberRegressor', 'LinearRegression', 'LogisticRegression',
        'LogisticRegressionCV', 'PassiveAggressiveRegressor', 'SGDRegressor', 'TheilSenRegressor',
        'RANSACRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor',
        'PassiveAggressiveClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier',
        'OneVsRestClassifier', 'OneVsOneClassifier', 'OutputCodeClassifier', 'MultiOutputRegressor',
        'RegressorChain', 'MultiOutputClassifier', 'ClassifierChain', 'KNeighborsRegressor',
        'RadiusNeighborsRegressor', 'KNeighborsClassifier', 'RadiusNeighborsClassifier',
        'MLPRegressor', 'MLPClassifier', 'SelfTrainingClassifier', 'DecisionTreeRegressor',
        'ExtraTreeRegressor', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
        'TransformedTargetRegressor', 'BayesianRidge', 'ElasticNet', 'Hinge', 'Lars', 'LarsCV',
        'Lasso', 'LassoCV', 'LassoLarsIC', 'Log', 'ModifiedHuber', 'MultiTaskElasticNet',
        'MultiTaskLasso', 'MultiTaskLassoCV', 'OrthogonalMatchingPursuit',
        'OrthogonalMatchingPursuitCV', 'Perceptron', 'Ridge', 'RidgeCV', 'SquaredLoss', 'SVR',

        # Sklearn extensions

        'ELMClassifier', 'ELMRegressor', 'GenELMClassifier', 'GenELMRegressor']
    """
    from sklearn import (
        multioutput,
        linear_model,
        ensemble,
        tree,
        neighbors,
        gaussian_process,
    )

    X, y = get_inputs(data)

    # If string like 'LinearRegression', find a class with such a name
    if isinstance(model, str):
        for i in [linear_model, ensemble, tree, neighbors, gaussian_process]:
            if model in i.__all__:
                model = getattr(i, model)
                break

    # If model is still a string, not an object from sklearn, it means it was not found -
    # maybe it is from the sklearn_extensions library
    if isinstance(model, str):
        import sklearn_extensions.extreme_learning_machines.elm as elm

        model = getattr(elm, model)

    # Model defined by string not found
    if isinstance(model, str):
        raise AttributeError(
            mylogging.return_str(
                "You defined a model that was not found in sklearn. You can use not only a string, "
                "but also an object or the class itself. You can use the function `get_all_models` "
                "to get a list of all possible models and then use one of them."
            )
        )

    # If class, but no object was configured, create an instance
    if callable(model):
        model = model()

    params = {
        "n_estimators": n_estimators,
        "alpha": alpha,
        "alpha_1": alpha_1,
        "alpha_2": alpha_2,
        "lambda_1": lambda_1,
        "lambda_2": lambda_2,
        "n_iter": n_iter,
        "epsilon": epsilon,
        "alphas": alphas,
        "gcv_mode": gcv_mode,
        "solver": solver,
        "n_hidden": n_hidden,
        "rbf_width": rbf_width,
        "activation_func": activation_func,
    }

    # Params that are configured in function params as well as configurable in the model
    used_params = {i: j for (i, j) in params.items() if i in model.get_params()}
    model.set_params(**used_params)

    if y.shape[1] == 1:
        setattr(model, "output_shape", "one_step")
        y = y.ravel()
    else:
        if model._estimator_type == "regressor":
            model = multioutput.MultiOutputRegressor(model)
        elif model._estimator_type == "classifier":
            model = multioutput.MultiOutputClassifier(model)
        setattr(model, "output_shape", "multi_step")

    model.fit(X, y)

    return model
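
# Usage sketch (editor addition): fit a BayesianRidge on hand-made windows and
# predict one step ahead of a linear series.
def _sklearn_train_demo():
    import numpy as np

    series = np.arange(100, dtype=float)
    X = np.array([series[i : i + 5] for i in range(90)])
    y = np.array([[series[i + 5]] for i in range(90)])  # one-step-ahead target

    model = train((X, y), model="BayesianRidge")
    print(model.predict(series[-5:].reshape(1, -1)))  # approximately [100.]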
from . import (
    analyze,
    best_params,
    configuration as _configuration,
    evaluate_predictions,
    gui_start,
    _helpers,
    main,
    _main_loop,
    misc,
    models,
)

# Just shortcuts to avoid importing from main
from .main import (
    predict,
    predict_multiple_columns,
    compare_models,
    find_optimal_input_for_models,
)

from .configuration import config

import sys

import mylogging

if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor < 7):
    raise RuntimeError(mylogging.return_str("Python version >= 3.7 is necessary."))
def compare_predicted_to_test(
    predicted: np.ndarray,
    test: np.ndarray,
    error_criterion: str = "mape",
    plot: bool = False,
    model_name: str = "Model",
    data_name: str = "Data",
):
    """Compare tested model with reality.

    Args:
        predicted (np.ndarray): Model output.
        test (np.ndarray): Correct values or output from data_pre funcs.
        error_criterion (str, optional): 'mape' or 'rmse'. Defaults to 'mape'.
        plot (bool, optional): Whether to create a plot. Defaults to False.
        model_name (str, optional): Model name for plot. Defaults to "Model".
        data_name (str, optional): Data name for plot. Defaults to "Data".

    Returns:
        float: Error criterion value (mape or rmse). If configured, a plot of results as well.
    """
    predicts = len(predicted)

    if predicts != len(test):
        print("Test and predicted length not equal")
        return np.nan

    if predicted is not None:
        if plot:
            if not misc.GLOBAL_VARS.PLOTS_CONFIGURED:
                misc.setup_plots()

            import matplotlib.pyplot as plt

            plt.figure(figsize=(10, 6))
            plt.plot(test, label="Reality")
            plt.plot(predicted, label="Prediction")
            plt.legend(loc="upper right")
            plt.xlabel("t")
            plt.ylabel("Predicted value")
            plt.title("Prediction with \n {} with data {}".format(model_name, data_name))
            plt.show()

    error = np.array(predicted) - np.array(test)

    # Mean absolute error - kept for reference, not used:
    #   abs_error = [abs(i) for i in error]
    #   sum_abs_error = sum(abs_error)
    #   mae = sum_abs_error / predicts

    if error_criterion == "mse" or error_criterion == "mse_sklearn":
        from sklearn.metrics import mean_squared_error

        criterion_value = mean_squared_error(test, predicted)

    elif error_criterion == "max_error":
        from sklearn.metrics import max_error

        criterion_value = max_error(test, predicted)

    elif error_criterion == "rmse":
        rmseerror = error ** 2
        criterion_value = (sum(rmseerror) / predicts) ** (1 / 2)

    elif error_criterion == "mape":
        # Avoid division by (near) zero - values with abs < 1 are replaced by 1
        no_zero_test = np.where(abs(test) >= 1, test, 1)
        criterion_value = np.mean(np.abs((test - predicted) / no_zero_test)) * 100

    elif error_criterion == "dtw":
        if not importlib.util.find_spec("dtaidistance"):
            raise ImportError(
                mylogging.return_str(
                    "The dtaidistance library, necessary for the configured dtw (dynamic time warping) "
                    "error criterion, is not installed! Install it via \n\npip install dtaidistance"
                )
            )

        from dtaidistance import dtw

        criterion_value = dtw.distance_fast(predicted.astype("double"), test.astype("double"))

    else:
        raise KeyError(
            mylogging.return_str(
                f"Bad 'error_criterion' in config - '{error_criterion}'. Use one of the options "
                "from the config comment..."
            )
        )

    return criterion_value
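
# Usage sketch (editor addition): MAPE between a prediction and known truth.
def _compare_demo():
    import numpy as np

    test = np.array([10.0, 11.0, 12.0])
    predicted = np.array([10.5, 10.8, 12.3])
    print(compare_predicted_to_test(predicted, test, error_criterion="mape"))  # ~3.1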
def analyze_column(data: np.ndarray | pd.DataFrame, lags: int = 5, window: int = 5) -> None:
    """Analyze one-dimensional data (the predicted column): plot the data, its distribution and
    details like minimum, maximum, std and mean. It also creates autocorrelation and partial
    autocorrelation plots (good for ARIMA models) and plots rolling mean and rolling std. It also
    tells whether the data are probably stationary or not.

    Args:
        data (np.ndarray | pd.DataFrame): Time series data.
        lags (int, optional): Lags used for autocorrelation. Defaults to 5.
        window (int, optional): Window for rolling average and rolling std. Defaults to 5.
    """
    if not misc.GLOBAL_VARS.PLOTS_CONFIGURED:
        misc.setup_plots()

    import matplotlib.pyplot as plt
    import seaborn as sns
    from statsmodels.graphics.tsaplots import plot_acf
    from statsmodels.graphics.tsaplots import plot_pacf
    from statsmodels.tsa.stattools import adfuller

    import mydatapreprocessing

    data = np.array(data)

    if data.ndim != 1 and 1 not in data.shape:
        raise ValueError(
            mylogging.return_str(
                "Select the column you want to analyze",
                caption="analyze_column function is only for one-dimensional data!",
            )
        )

    data = data.ravel()

    print(
        f"Length: {len(data)}\n"
        f"Minimum: {np.nanmin(data)}\n"
        f"Maximum: {np.nanmax(data)}\n"
        f"Mean: {np.nanmean(data)}\n"
        f"Std: {np.nanstd(data)}\n"
        f"First few values: {data[:5]}\n"
        f"Middle values: {data[int(-len(data)/2): int(-len(data)/2) + 5]}\n"
        f"Last few values: {data[-5:]}\n"
        f"Number of nan (not a number) values: {np.count_nonzero(np.isnan(data))}\n"
    )

    # Data and its distribution
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(data)
    plt.xlabel("t")
    plt.ylabel("f(x)")

    plt.subplot(1, 2, 2)
    sns.histplot(data, bins=100, kde=True, color="skyblue")
    plt.xlabel("f(x)")
    plt.ylabel("Distribution")

    plt.tight_layout()
    plt.suptitle("Data and its distribution", fontsize=20)
    plt.subplots_adjust(top=0.88)
    plt.draw()

    fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(10, 5))
    fig.suptitle("Repeating patterns - autocorrelation")

    try:
        plot_acf(data, lags=lags, ax=ax)
        ax.set_xlabel("Lag")
        plot_pacf(data, lags=lags, ax=ax2)
        ax2.set_xlabel("Lag")
        plt.draw()
    except Exception:
        mylogging.traceback(
            "Error in analyze_column function - in autocorrelation function: Maybe more lags than values"
        )

    # Rolling average and rolling standard deviation
    rolling_mean = np.mean(mydatapreprocessing.misc.rolling_windows(data, window), 1)
    rolling_std = np.std(mydatapreprocessing.misc.rolling_windows(data, window), 1)

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(rolling_mean)
    plt.xlabel("t")
    plt.ylabel("Rolling average x")

    plt.subplot(1, 2, 2)
    plt.plot(rolling_std)
    plt.xlabel("f(x)")
    plt.ylabel("Rolling standard deviation x")

    plt.tight_layout()
    plt.suptitle("Rolling average and rolling standard deviation", fontsize=20)
    plt.subplots_adjust(top=0.88)
    plt.draw()

    # Augmented Dickey-Fuller test for stationarity
    pvalue = adfuller(data)[1]
    cutoff = 0.05

    if pvalue < cutoff:
        print(f"\np-value = {pvalue} : Analyzed column is probably stationary.\n")
    else:
        print(f"\np-value = {pvalue} : Analyzed column is probably not stationary.\n")
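
# Usage sketch (editor addition): analyze a noisy sine wave.
def _analyze_column_demo():
    import numpy as np

    data = np.sin(np.arange(300) / 10) + np.random.randn(300) * 0.1
    analyze_column(data, lags=10, window=20)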