Example #1
    def varimp(self, use_pandas=False):
        """
    Pretty print the variable importances, or return them in a list

    Parameters
    ----------
    use_pandas: boolean, optional
      If True, then the variable importances will be returned as a pandas data frame.
    
    Returns
    -------
      A list or Pandas DataFrame.
    """
        model = self._model_json["output"]
        if "variable_importances" in model.keys(
        ) and model["variable_importances"]:
            vals = model["variable_importances"].cell_values
            header = model["variable_importances"].col_header
            if use_pandas and h2o.can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print "Warning: This model doesn't have variable importances"
Example #2
    def show(self, header=True):
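        """Pretty-print this table.

        Uses pandas for display when available; otherwise falls back to h2o.H2ODisplay,
        truncating tables longer than 20 rows to their first and last 5 rows.
        """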
        if h2o.can_use_pandas():
            import pandas
            pandas.options.display.max_rows = 20
            print pandas.DataFrame(self.cell_values, columns=self.col_header)
            return
        print
        if header: print self.table_header + ":"
        print
        table = copy.deepcopy(self.cell_values)
        nr = 0
        if _is_list_of_lists(table):
            # only set if we truly have multiple rows... not just one long row :)
            nr = len(table)
        if nr > 20:  # create a truncated view of the table, first/last 5 rows
            trunc_table = []
            trunc_table += [v for v in table[:5]]
            trunc_table.append(["---"] * len(table[0]))
            trunc_table += [v for v in table[(nr - 5):]]
            table = trunc_table

        h2o.H2ODisplay(table,
                       self.col_header,
                       numalign="left",
                       stralign="left")
Example #3
 def score_history(self):
   """
   Retrieve Model Score History
   :return: the score history (H2OTwoDimTable)
   """
   model = self._model_json["output"]
   if "scoring_history" in model and model["scoring_history"] is not None:
     s = model["scoring_history"]
     if h2o.can_use_pandas():
       import pandas
       pandas.options.display.max_rows = 20
       return pandas.DataFrame(s.cell_values, columns=s.col_header)
     return s
   else: print "No score history for this model"
Example #4
 def score_history(self):
   """
   Retrieve Model Score History
   :return: the score history (H2OTwoDimTable)
   """
   model = self._model_json["output"]
   if "scoring_history" in model and model["scoring_history"] is not None:
     s = model["scoring_history"]
     if h2o.can_use_pandas():
       import pandas
       pandas.options.display.max_rows = 20
       return pandas.DataFrame(s.cell_values, columns=s.col_header)
     return model["scoring_history"]
   else: print "No score history for this model"
Example #5
  def show(self, header=True):
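    """Pretty-print this table, including its header and optional description.

    Uses pandas for display when available; otherwise falls back to h2o.H2ODisplay,
    truncating tables longer than 20 rows to their first and last 5 rows.
    """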
    if h2o.can_use_pandas():
      import pandas
      pandas.options.display.max_rows = 20
      print pandas.DataFrame(self.cell_values, columns=self.col_header)
      return
    print
    if header:
      print self.table_header + ":",
      if self.table_description: print self.table_description
    print
    table = copy.deepcopy(self.cell_values)
    nr = 0
    if _is_list_of_lists(table): nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
    if nr > 20:    # create a truncated view of the table, first/last 5 rows
      trunc_table = []
      trunc_table += [v for v in table[:5]]
      trunc_table.append(["---"] * len(table[0]))
      trunc_table += [v for v in table[(nr - 5):]]
      table = trunc_table

    h2o.H2ODisplay(table, self.col_header, numalign="left", stralign="left")
Example #6
  def as_data_frame(self, use_pandas=True):
    """Obtain the dataset as a python-local object.

    :param use_pandas: A flag specifying whether or not to return a pandas DataFrame.
    :return: A pandas DataFrame if use_pandas=True and pandas is available; otherwise the frame's data as nested
      Python lists of strings.
    """
    url = 'http://' + h2o.H2OConnection.ip() + ':' + str(h2o.H2OConnection.port()) + "/3/DownloadDataset?frame_id=" + urllib.quote(self.frame_id) + "&hex_string=false"
    response = urllib2.urlopen(url)
    if h2o.can_use_pandas() and use_pandas:
      import pandas
      df = pandas.read_csv(response, low_memory=False)
      time_cols = []
      category_cols = []
      if self.types is not None:
        for col_name in self.names:
          xtype = self.type(col_name)
          if xtype.lower() == 'time': time_cols.append(col_name)
          elif xtype.lower() == 'enum': category_cols.append(col_name)
        #change Time to pandas datetime
        if time_cols:
          #hacky way to get the utc offset
          from datetime import datetime
          sample_timestamp = 1380610868
          utc_offset = 1000 * ((datetime.utcfromtimestamp(sample_timestamp) - datetime.fromtimestamp(sample_timestamp)).total_seconds())
          try:
            df[time_cols] = (df[time_cols] - utc_offset).astype('datetime64[ms]')
          except pandas.tslib.OutOfBoundsDatetime:
            pass
        #change Enum to pandas category
        for cat_col in category_cols: #for loop is required
          df[cat_col] = df[cat_col].astype('category')
      return df
    else:
      cr = csv.reader(response)
      t_col_list = [[''] if row == [] else row for row in cr]
      return [list(x) for x in zip(*t_col_list)]
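A usage sketch, assuming an H2OFrame is already bound to the name `fr` (how it was created is not part of the example):

  # `fr` stands for an existing H2OFrame.
  df = fr.as_data_frame()                   # pandas DataFrame; time columns become datetime64[ms],
                                            # enum columns become pandas categories
  raw = fr.as_data_frame(use_pandas=False)  # nested Python lists of strings when pandas is skipped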
Example #7
  def _plot(self, timestep, metric, **kwargs):
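    """Plot the model's scoring history (the chosen metric against the chosen timestep).

    Private helper; requires matplotlib. With server=True in kwargs the plot is rendered
    off-screen (Agg backend) and plt.show() is skipped.
    """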

    # check for matplotlib. exit if absent
    try:
      imp.find_module('matplotlib')
      import matplotlib
      if 'server' in kwargs.keys() and kwargs['server']: matplotlib.use('Agg', warn=False)
      import matplotlib.pyplot as plt
    except ImportError:
      print "matplotlib is required for this function!"
      return

    scoring_history = self.score_history()
    # Separate functionality for GLM since its output is different from other algos
    if self._model_json["algo"] == "glm":
      # GLM has only one timestep option, which is `iteration`
      timestep = "iteration"
      if metric == "AUTO": metric = "log_likelihood"
      elif metric not in ("log_likelihood", "objective"):
        raise ValueError("for GLM, metric must be one of: log_likelihood, objective")
      plt.xlabel(timestep)
      plt.ylabel(metric)
      plt.title("Validation Scoring History")
      plt.plot(scoring_history[timestep], scoring_history[metric])

    elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
      # Set timestep
      if self._model_json["algo"] in ("gbm", "drf"):
        if timestep == "AUTO": timestep = "number_of_trees"
        elif timestep not in ("duration","number_of_trees"):
          raise ValueError("timestep for gbm or drf must be one of: duration, number_of_trees")
      else:  #self._model_json["algo"] == "deeplearning":
        # Delete first row of DL scoring history since it contains NAs & NaNs
        if scoring_history["samples"][0] == 0:
          scoring_history = scoring_history[1:]
        if timestep == "AUTO": timestep = "epochs"
        elif timestep not in ("epochs","samples","duration"):
          raise ValueError("timestep for deeplearning must be one of: epochs, samples, duration")

      training_metric = "training_{}".format(metric)
      validation_metric = "validation_{}".format(metric)
      if timestep == "duration":
        dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
        scoring_history[dur_colname] = map(lambda x: str(x).split()[0], scoring_history["duration"])
        timestep = dur_colname

      if h2o.can_use_pandas():
        valid = validation_metric in list(scoring_history)
        ylim = (scoring_history[[training_metric, validation_metric]].min().min(), scoring_history[[training_metric, validation_metric]].max().max()) if valid \
          else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
      else:
        valid = validation_metric in scoring_history.col_header
        ylim = (min(min(scoring_history[[training_metric, validation_metric]])), max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
          else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))

      if valid: #Training and validation scoring history
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Scoring History")
        plt.ylim(ylim)
        plt.plot(scoring_history[timestep], scoring_history[training_metric], label = "Training")
        plt.plot(scoring_history[timestep], scoring_history[validation_metric], color = "orange", label = "Validation")
        plt.legend()
      else:  #Training scoring history only
        plt.xlabel(timestep)
        plt.ylabel(training_metric)
        plt.title("Training Scoring History")
        plt.ylim(ylim)
        plt.plot(scoring_history[timestep], scoring_history[training_metric])

    else: # algo is not glm, deeplearning, drf, gbm
      raise ValueError("Plotting not implemented for this type of model")
    if "server" not in kwargs.keys() or not kwargs["server"]: plt.show()