def varimp(self, use_pandas=False):
    """
    Pretty print the variable importances, or return them in a list.

    Parameters
    ----------
    use_pandas: boolean, optional
      If True, then the variable importances will be returned as a pandas data frame.

    Returns
    -------
    A list or Pandas DataFrame.
    """
    model = self._model_json["output"]
    if "variable_importances" in model and model["variable_importances"]:
        vals = model["variable_importances"].cell_values
        header = model["variable_importances"].col_header
        if use_pandas and h2o.can_use_pandas():
            import pandas
            return pandas.DataFrame(vals, columns=header)
        return vals
    else:
        print "Warning: This model doesn't have variable importances"
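
# --- Usage sketch (an assumption, not part of the original source) -----------
# Assumes `model` is an already-trained H2O model whose output includes
# variable importances (e.g. a GBM or DRF model).
vi_rows = model.varimp()                  # list of rows, typically [variable, relative, scaled, percentage]
vi_frame = model.varimp(use_pandas=True)  # same data as a pandas DataFrame, when pandas is installed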
def show(self, header=True):
    if h2o.can_use_pandas():
        import pandas
        pandas.options.display.max_rows = 20
        print pandas.DataFrame(self.cell_values, columns=self.col_header)
        return
    print
    if header:
        print self.table_header + ":"
    print
    table = copy.deepcopy(self.cell_values)
    nr = 0
    if _is_list_of_lists(table):
        nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
    if nr > 20:  # create a truncated view of the table, first/last 5 rows
        trunc_table = []
        trunc_table += [v for v in table[:5]]
        trunc_table.append(["---"] * len(table[0]))
        trunc_table += [v for v in table[(nr - 5):]]
        table = trunc_table
    h2o.H2ODisplay(table, self.col_header, numalign="left", stralign="left")
def score_history(self):
    """
    Retrieve the model's scoring history.

    :return: a pandas DataFrame of the scoring history if pandas is available,
             otherwise the scoring history as an H2OTwoDimTable
    """
    model = self._model_json["output"]
    if "scoring_history" in model and model["scoring_history"] is not None:
        s = model["scoring_history"]
        if h2o.can_use_pandas():
            import pandas
            pandas.options.display.max_rows = 20
            return pandas.DataFrame(s.cell_values, columns=s.col_header)
        return s
    else:
        print "No score history for this model"
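
# --- Usage sketch (an assumption, not part of the original source) -----------
# Assumes `model` is an already-trained H2O model. With pandas installed the
# history comes back as a DataFrame, so individual columns can be inspected.
sh = model.score_history()
if h2o.can_use_pandas():
    print sh.columns.tolist()  # e.g. timestamp, duration, and per-iteration metric columns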
def show(self, header=True):
    if h2o.can_use_pandas():
        import pandas
        pandas.options.display.max_rows = 20
        print pandas.DataFrame(self.cell_values, columns=self.col_header)
        return
    print
    if header:
        print self.table_header + ":",
        if self.table_description:
            print self.table_description
    print
    table = copy.deepcopy(self.cell_values)
    nr = 0
    if _is_list_of_lists(table):
        nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
    if nr > 20:  # create a truncated view of the table, first/last 5 rows
        trunc_table = []
        trunc_table += [v for v in table[:5]]
        trunc_table.append(["---"] * len(table[0]))
        trunc_table += [v for v in table[(nr - 5):]]
        table = trunc_table
    h2o.H2ODisplay(table, self.col_header, numalign="left", stralign="left")
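
# --- Usage sketch (an assumption, not part of the original source) -----------
# Assumes `model` is an already-trained H2O model; the two-dimensional tables
# stored in its output (e.g. the variable-importance table) expose show().
vi_table = model._model_json["output"]["variable_importances"]
vi_table.show()  # pandas-backed pretty print, or a truncated plain-text view without pandas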
def as_data_frame(self, use_pandas=True):
    """Obtain the dataset as a python-local object.

    :param use_pandas: A flag specifying whether or not to return a pandas DataFrame.
    :return: A local python object containing this H2OFrame's data: a pandas DataFrame
             if use_pandas=True, otherwise a list of lists of strings, where each list is a row.
    """
    url = ('http://' + h2o.H2OConnection.ip() + ':' + str(h2o.H2OConnection.port()) +
           "/3/DownloadDataset?frame_id=" + urllib.quote(self.frame_id) + "&hex_string=false")
    response = urllib2.urlopen(url)
    if h2o.can_use_pandas() and use_pandas:
        import pandas
        df = pandas.read_csv(response, low_memory=False)
        time_cols = []
        category_cols = []
        if self.types is not None:
            for col_name in self.names:
                xtype = self.type(col_name)
                if xtype.lower() == 'time':
                    time_cols.append(col_name)
                elif xtype.lower() == 'enum':
                    category_cols.append(col_name)
        # change Time columns to pandas datetime
        if time_cols:
            # hacky way to get the utc offset
            from datetime import datetime
            sample_timestamp = 1380610868
            utc_offset = 1000 * ((datetime.utcfromtimestamp(sample_timestamp) -
                                  datetime.fromtimestamp(sample_timestamp)).total_seconds())
            try:
                df[time_cols] = (df[time_cols] - utc_offset).astype('datetime64[ms]')
            except pandas.tslib.OutOfBoundsDatetime:
                pass
        # change Enum columns to pandas category
        for cat_col in category_cols:  # for loop is required
            df[cat_col] = df[cat_col].astype('category')
        return df
    else:
        cr = csv.reader(response)
        t_col_list = [[''] if row == [] else row for row in cr]
        return [list(x) for x in zip(*t_col_list)]
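
# --- Usage sketch (an assumption, not part of the original source) -----------
# Assumes `frame` is an existing H2OFrame on a running cluster. With pandas,
# time columns become pandas datetimes and enum columns become categories.
df = frame.as_data_frame()                    # pandas DataFrame (the default)
rows = frame.as_data_frame(use_pandas=False)  # plain python lists of strings instead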
def _plot(self, timestep, metric, **kwargs):
    # check for matplotlib. exit if absent
    try:
        imp.find_module('matplotlib')
        import matplotlib
        if 'server' in kwargs.keys() and kwargs['server']:
            matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
    except ImportError:
        print "matplotlib is required for this function!"
        return

    scoring_history = self.score_history()
    # Separate functionality for GLM since its output is different from other algos
    if self._model_json["algo"] == "glm":
        # GLM has only one timestep option, which is `iteration`
        timestep = "iteration"
        if metric == "AUTO":
            metric = "log_likelihood"
        elif metric not in ("log_likelihood", "objective"):
            raise ValueError("for GLM, metric must be one of: log_likelihood, objective")
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Validation Scoring History")
        plt.plot(scoring_history[timestep], scoring_history[metric])

    elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
        # Set timestep
        if self._model_json["algo"] in ("gbm", "drf"):
            if timestep == "AUTO":
                timestep = "number_of_trees"
            elif timestep not in ("duration", "number_of_trees"):
                raise ValueError("timestep for gbm or drf must be one of: duration, number_of_trees")
        else:  # self._model_json["algo"] == "deeplearning"
            # Delete first row of DL scoring history since it contains NAs & NaNs
            if scoring_history["samples"][0] == 0:
                scoring_history = scoring_history[1:]
            if timestep == "AUTO":
                timestep = "epochs"
            elif timestep not in ("epochs", "samples", "duration"):
                raise ValueError("timestep for deeplearning must be one of: epochs, samples, duration")

        training_metric = "training_{}".format(metric)
        validation_metric = "validation_{}".format(metric)
        if timestep == "duration":
            dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
            scoring_history[dur_colname] = map(lambda x: str(x).split()[0], scoring_history["duration"])
            timestep = dur_colname

        if h2o.can_use_pandas():
            valid = validation_metric in list(scoring_history)
            ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                    scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
        else:
            valid = validation_metric in scoring_history.col_header
            ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                    max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))

        if valid:  # Training and validation scoring history
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
            plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange", label="Validation")
            plt.legend()
        else:  # Training scoring history only
            plt.xlabel(timestep)
            plt.ylabel(training_metric)
            plt.title("Training Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep], scoring_history[training_metric])

    else:  # algo is not glm, deeplearning, drf, gbm
        raise ValueError("Plotting not implemented for this type of model")

    if "server" not in kwargs.keys() or not kwargs["server"]:
        plt.show()
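
# --- Usage sketch (an assumption, not part of the original source) -----------
# _plot() is an internal helper; it is assumed here that a public plot()
# wrapper on the model forwards its arguments to it. `model` is taken to be an
# already-trained GBM/DRF binomial model whose scoring history has logloss columns.
model.plot(timestep="number_of_trees", metric="logloss")        # on-screen scoring-history plot
model.plot(timestep="duration", metric="logloss", server=True)  # off-screen rendering (Agg backend), skips plt.show()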