def as_data_frame(self):
    """Convert to a python 'data frame'."""
    if can_use_pandas():
        import pandas
        pandas.options.display.max_colwidth = 70
        return pandas.DataFrame(self._cell_values, columns=self._col_header)
    return self
def show(self, header=True):
    """Print the contents of this table."""
    if header and self._table_header:
        print(self._table_header + ":", end=' ')
        if self._table_description:
            print(self._table_description)
    print()
    table = copy.deepcopy(self._cell_values)
    nr = 0
    if _is_list_of_lists(table):
        nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
    if nr > 20:  # create a truncated view of the table, first/last 5 rows
        trunc_table = []
        trunc_table += [v for v in table[:5]]
        trunc_table.append(["---"] * len(table[0]))
        trunc_table += [v for v in table[(nr - 5):]]
        table = trunc_table
    H2ODisplay(table, self._col_header, numalign="left", stralign="left")
    if nr > 20 and can_use_pandas():
        print('\nSee the whole table with table.as_data_frame()')
def varsplits(self, use_pandas=False):
    """
    Retrieve per-variable split information for a given Isolation Forest model.

    Output will include:

    - count - The number of times a variable was used to make a split.
    - aggregated_split_ratios - The split ratio is defined as
      "abs(#left_observations - #right_observations) / #before_split".
      Even splits (#left_observations approx. the same as #right_observations) contribute
      less to the total aggregated split ratio value for the given feature; highly
      imbalanced splits (e.g. #left_observations >> #right_observations) contribute more.
    - aggregated_split_depths - The sum of all depths of a variable used to make a split.
      (If a variable is used on level N of a tree, then it contributes N to the total aggregate.)

    :param use_pandas: If True, then the variable splits will be returned as a Pandas data frame.
    :returns: A list or Pandas DataFrame.
    """
    model = self._model_json["output"]
    if "variable_splits" in list(model.keys()) and model["variable_splits"]:
        vals = model["variable_splits"].cell_values
        header = model["variable_splits"].col_header
        if use_pandas and can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        else:
            return vals
    else:
        print("Warning: This model doesn't provide variable split information")
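# To make the split-ratio definition in the docstring above concrete, here is a
# minimal, self-contained sketch. `split_ratio` is a hypothetical helper (not
# part of h2o), the observation counts are invented, and it assumes that
# #before_split equals #left_observations + #right_observations:
def split_ratio(n_left, n_right):
    # abs(#left_observations - #right_observations) / #before_split
    return abs(n_left - n_right) / float(n_left + n_right)

print(split_ratio(50, 50))  # 0.0 -> an even split contributes little to the aggregate
print(split_ratio(95, 5))   # 0.9 -> a highly imbalanced split contributes a lot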
def scoring_history_plot(self, timestep, metric, server=False, save_plot_path=None):
    """Plot the model's scoring history for the given timestep and metric."""
    plt = get_matplotlib_pyplot(server)
    if plt is None:
        return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS)
    scoring_history = self._get_scoring_history_to_plot()
    timestep = self._validate_timestep(timestep)
    training_metric = "training_{}".format(metric)
    validation_metric = "validation_{}".format(metric)
    if timestep == "duration":
        dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
        scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
        timestep = dur_colname
    if can_use_pandas():
        valid = validation_metric in list(scoring_history)
        ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                scoring_history[[training_metric, validation_metric]].max().max()) if valid \
            else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
    else:
        valid = validation_metric in scoring_history.col_header
        ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
            else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
    if ylim[0] == ylim[1]:
        ylim = (0, 1)
    fig = plt.figure()
    if valid:  # Training and validation scoring history
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Scoring History")
        plt.ylim(ylim)
        plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
        plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange", label="Validation")
        plt.legend()
    else:  # Training scoring history only
        plt.xlabel(timestep)
        plt.ylabel(training_metric)
        plt.title("Training Scoring History")
        plt.ylim(ylim)
        plt.plot(scoring_history[timestep], scoring_history[training_metric])
    if save_plot_path is not None:
        plt.savefig(fname=save_plot_path)
    if not server:
        plt.show()
    return decorate_plot_result(figure=fig)
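# A hedged usage sketch for the plotting helper above: `model` is assumed to be
# a trained H2O estimator whose class provides scoring_history_plot, and the
# timestep/metric values are assumptions that must match the columns of the
# model's scoring history. The returned object wraps the matplotlib figure.
result = model.scoring_history_plot(timestep="number_of_trees", metric="logloss",
                                    save_plot_path="scoring_history.png")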
def test_external_libraries_detection():
    assert can_use_pandas(), "pandas should be detected in test environment"
    assert can_use_numpy(), "numpy should be detected in test environment"
    assert is_module_available('matplotlib'), "matplotlib should be detected in test environment"
    assert is_module_available('sklearn'), "sklearn should be detected in test environment"
    assert not is_module_available('foobar'), "the nonexistent module 'foobar' should not be detected"
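# The detection helpers exercised by this test are not defined in this section.
# A minimal sketch of how such helpers can be written (an assumption, not the
# library's actual implementation) probes for a module without importing it:
import importlib.util

def is_module_available(mod):
    # True if `mod` can be imported in the current environment.
    return importlib.util.find_spec(mod) is not None

def can_use_pandas():
    return is_module_available('pandas')

def can_use_numpy():
    return is_module_available('numpy')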
def show(self, header=True):
    """Print the contents of this table."""
    print()
    if header and self._table_header:
        print(self._table_header + ":", end=' ')
        if self._table_description:
            print(self._table_description)
    (table, nr, is_pandas) = self._as_show_table()
    H2ODisplay(table, is_pandas=is_pandas, header=self._col_header, numalign="left", stralign="left")
    if nr > 20 and can_use_pandas():
        print('\nSee the whole table with table.as_data_frame()')
def _as_show_table(self):
    if H2ODisplay.prefer_pandas() and can_use_pandas():
        pd = self.as_data_frame()
        return pd.head(20), pd.shape[0], True
    else:
        table = copy.deepcopy(self._cell_values)
        nr = 0
        if _is_list_of_lists(table):
            nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
        if nr > 20:  # create a truncated view of the table, first/last 5 rows
            trunc_table = []
            trunc_table += [v for v in table[:5]]
            trunc_table.append(["---"] * len(table[0]))
            trunc_table += [v for v in table[(nr - 5):]]
            table = trunc_table
        return table, nr, False
def varimp(self, use_pandas=False):
    """
    Return the importance of components associated with a PCA model.

    :param bool use_pandas: If True, then the variable importances will be returned as a pandas data frame. (Default is False.)
    """
    model = self._model_json["output"]
    if "importance" in list(model.keys()) and model["importance"]:
        vals = model["importance"].cell_values
        header = model["importance"].col_header
        if use_pandas and can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        else:
            return vals
    else:
        print("Warning: This model doesn't have importances of components.")
def varimp(self, use_pandas=False):
    """
    Return the importance of components associated with a PCA model.

    :param use_pandas: ``bool`` (default: ``False``).
    """
    model = self._model_json["output"]
    if "importance" in list(model.keys()) and model["importance"]:
        vals = model["importance"].cell_values
        header = model["importance"].col_header
        if use_pandas and can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        else:
            return vals
    else:
        print("Warning: This model doesn't have importances of components.")
def varimp(self, use_pandas=False):
    """
    Pretty print the variable importances, or return them in a list.

    :param use_pandas: If True, then the variable importances will be returned as a pandas data frame.
    :returns: A list or Pandas DataFrame.
    """
    model = self._model_json["output"]
    if "variable_importances" in list(model.keys()) and model["variable_importances"]:
        vals = model["variable_importances"].cell_values
        header = model["variable_importances"].col_header
        if use_pandas and can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        else:
            return vals
    else:
        print("Warning: This model doesn't have variable importances")
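# A usage sketch for varimp. It assumes a running H2O cluster and a recent
# h2o-3 client; the dataset URL and column names come from H2O's public test
# data, but the exact ntrees value is arbitrary:
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()
train = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
model = H2OGradientBoostingEstimator(ntrees=10)
model.train(x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"], y="class", training_frame=train)

print(model.varimp())                 # list of (variable, relative, scaled, percentage) tuples
print(model.varimp(use_pandas=True))  # the same table as a pandas DataFrame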
def varsplits(self, use_pandas=False):
    """
    Retrieve per-variable split information for a given Isolation Forest model.

    :param use_pandas: If True, then the variable splits will be returned as a pandas data frame.
    :returns: A list or Pandas DataFrame.
    """
    model = self._model_json["output"]
    if "variable_splits" in list(model.keys()) and model["variable_splits"]:
        vals = model["variable_splits"].cell_values
        header = model["variable_splits"].col_header
        if use_pandas and can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        else:
            return vals
    else:
        print("Warning: This model doesn't provide variable split information")
def show(self, header=True):
    """Print the contents of this table."""
    print()
    if header:
        print(self.table_header + ":", end=' ')
        if self.table_description:
            print(self.table_description)
    print()
    table = copy.deepcopy(self.cell_values)
    nr = 0
    if _is_list_of_lists(table):
        nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
    if nr > 20:  # create a truncated view of the table, first/last 5 rows
        trunc_table = []
        trunc_table += [v for v in table[:5]]
        trunc_table.append(["---"] * len(table[0]))
        trunc_table += [v for v in table[(nr - 5):]]
        table = trunc_table
    H2ODisplay(table, self.col_header, numalign="left", stralign="left")
    if nr > 20 and can_use_pandas():
        print('\nSee the whole table with table.as_data_frame()')
def as_data_frame(self):
    """Convert to a python 'data frame'."""
    if can_use_pandas():
        import pandas
        pandas.options.display.max_colwidth = 70
        return pandas.DataFrame(self.cell_values, columns=self.col_header)
    return self
def _plot(self, timestep, metric, **kwargs):
    # check for matplotlib. exit if absent
    try:
        imp.find_module('matplotlib')
        import matplotlib
        if 'server' in kwargs and kwargs['server']:
            matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required for this function!")
        return

    scoring_history = self.scoring_history()
    # Separate functionality for GLM since its output is different from other algos
    if self._model_json["algo"] == "glm":
        # GLM has only one timestep option, which is `iteration`
        timestep = "iteration"
        if metric == "AUTO":
            metric = "log_likelihood"
        elif metric not in ("log_likelihood", "objective"):
            raise ValueError("for GLM, metric must be one of: log_likelihood, objective")
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Validation Scoring History")
        plt.plot(scoring_history[timestep], scoring_history[metric])
    elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
        # Set timestep
        if self._model_json["algo"] in ("gbm", "drf"):
            if timestep == "AUTO":
                timestep = "number_of_trees"
            elif timestep not in ("duration", "number_of_trees"):
                raise ValueError("timestep for gbm or drf must be one of: duration, number_of_trees")
        else:  # self._model_json["algo"] == "deeplearning"
            # Delete first row of DL scoring history since it contains NAs & NaNs
            if scoring_history["samples"][0] == 0:
                scoring_history = scoring_history[1:]
            if timestep == "AUTO":
                timestep = "epochs"
            elif timestep not in ("epochs", "samples", "duration"):
                raise ValueError("timestep for deeplearning must be one of: epochs, samples, duration")
        training_metric = "training_{}".format(metric)
        validation_metric = "validation_{}".format(metric)
        if timestep == "duration":
            dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
            scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
            timestep = dur_colname
        if can_use_pandas():
            valid = validation_metric in list(scoring_history)
            ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                    scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
        else:
            valid = validation_metric in scoring_history.col_header
            ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                    max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
        if ylim[0] == ylim[1]:
            ylim = (0, 1)
        if valid:  # Training and validation scoring history
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
            plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange", label="Validation")
            plt.legend()
        else:  # Training scoring history only
            plt.xlabel(timestep)
            plt.ylabel(training_metric)
            plt.title("Training Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric])
    else:  # algo is not glm, deeplearning, drf, gbm
        raise ValueError("Plotting not implemented for this type of model")
    if "server" not in list(kwargs.keys()) or not kwargs["server"]:
        plt.show()
def _plot(self, timestep, metric, server=False):
    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return

    scoring_history = self.scoring_history()
    # Separate functionality for GLM since its output is different from other algos
    if self._model_json["algo"] == "glm":
        # GLM has only one timestep option, which is `iteration`
        timestep = "iteration"
        if metric == "AUTO":
            metric = "log_likelihood"
        elif metric not in ("log_likelihood", "objective"):
            raise H2OValueError("for GLM, metric must be one of: log_likelihood, objective")
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Validation Scoring History")
        plt.plot(scoring_history[timestep], scoring_history[metric])
    elif self._model_json["algo"] in ("deeplearning", "deepwater", "drf", "gbm"):
        # Set timestep
        if self._model_json["algo"] in ("gbm", "drf"):
            assert_is_type(timestep, "AUTO", "duration", "number_of_trees")
            if timestep == "AUTO":
                timestep = "number_of_trees"
        else:  # self._model_json["algo"] == "deeplearning"
            # Delete first row of DL scoring history since it contains NAs & NaNs
            if scoring_history["samples"][0] == 0:
                scoring_history = scoring_history[1:]
            assert_is_type(timestep, "AUTO", "epochs", "samples", "duration")
            if timestep == "AUTO":
                timestep = "epochs"
        training_metric = "training_{}".format(metric)
        validation_metric = "validation_{}".format(metric)
        if timestep == "duration":
            dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
            scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
            timestep = dur_colname
        if can_use_pandas():
            valid = validation_metric in list(scoring_history)
            ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                    scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
        else:
            valid = validation_metric in scoring_history.col_header
            ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                    max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
        if ylim[0] == ylim[1]:
            ylim = (0, 1)
        if valid:  # Training and validation scoring history
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
            plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange", label="Validation")
            plt.legend()
        else:  # Training scoring history only
            plt.xlabel(timestep)
            plt.ylabel(training_metric)
            plt.title("Training Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric])
    else:  # algo is not glm, deeplearning, deepwater, drf, gbm
        raise H2OValueError("Plotting not implemented for this type of model")
    if not server:
        plt.show()
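# For context, a hedged sketch of driving this scoring-history plotting from
# user code through the public `plot` method, which delegates to `_plot`. It
# reuses the GBM model trained in the varimp example above; the timestep and
# metric values are assumptions that must match the model's scoring history:
model.plot(timestep="number_of_trees", metric="logloss")
# Deep Learning models accept "epochs", "samples", or "duration" as the timestep instead.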