Beispiel #1
0
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a model-build job to the backend and, unless running asynchronously, attach the result.

    Parameters
    ----------
    x : str, int, list or tuple
      Predictor column(s), given as names or as integer indices into ``tframe.names``.
    y : str, int or None
      Response column name or index; ``None`` leaves "response_column" unset.
    tframe
      Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
    vframe
      Validation frame, or ``None``.
    kwargs : dict
      Model-builder parameters. Updated in place with the frame/response entries; must contain
      "offset_column", "fold_column" and "weights_column".

    Returns
    -------
      None. When ``self._future`` is true the job is stored in ``self._job`` and the method returns
      without polling; otherwise the job is polled and the fetched model JSON is handed to
      ``self._resolve_model``.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    # Copy into a list: a tuple of names cannot be concatenated with the list built below.
    x = list(x) if isinstance(x, (list,tuple)) else [x]
    # An empty predictor list has no first element to inspect.
    if x and isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    # Every frame column that is neither a predictor nor one of the special columns is ignored.
    ignored_columns = list(set(tframe.names) - set(x + [y,offset,folds,weights]))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    kwargs = {k: H2OEstimator._keyify_if_H2OFrame(v) for k, v in kwargs.items()}
    algo = self._compute_algo()

    model = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo, **kwargs), job_type=(algo+" Model Build"))

    if self._future:
      self._job = model
      return

    model.poll()
    if '_rest_version' in kwargs:
      model_json = H2OConnection.get_json("Models/"+model.dest_key, _rest_version=kwargs['_rest_version'])["models"][0]
    else:
      model_json = H2OConnection.get_json("Models/"+model.dest_key)["models"][0]
    self._resolve_model(model.dest_key,model_json)
Beispiel #2
0
  def get_grid(self, sort_by=None, decreasing=None):
    """
    Return this grid search, optionally re-fetched with its models ordered by a metric.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse", "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

    Returns
    -------
      ``self`` when neither argument is given; otherwise a new H2OGridSearch instance whose
      models are listed in the order returned by the backend.
    """
    if sort_by is None and decreasing is None:
      return self

    sorted_json = H2OConnection.get_json("Grids/"+self._id, sort_by=sort_by, decreasing=decreasing, _rest_version=99)
    sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    model_ids = sorted_json['model_ids']
    sorted_grid.models = [h2o.get_model(entry['name']) for entry in model_ids]  # backend order

    # The first model decides which metrics class gets mixed into the grid.
    leading_model_json = H2OConnection.get_json("Models/"+model_ids[0]['name'], _rest_version=99)['models'][0]
    metrics_cls = H2OGridSearch._metrics_class(leading_model_json)

    # Build a throwaway instance of that class and copy its state onto the grid.
    template = metrics_cls()
    template._id = self._id
    template._grid_json = sorted_json
    template._parms = sorted_grid._parms
    H2OEstimator.mixin(sorted_grid, metrics_cls)
    sorted_grid.__dict__.update(template.__dict__.copy())
    return sorted_grid
Beispiel #3
0
  def get_grid(self, sort_by=None, decreasing=None):
    """
    Fetch a copy of this grid search with its models sorted on a metric.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse", "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

    Returns
    -------
      This very instance if both arguments are ``None``; otherwise a new H2OGridSearch
      instance holding the models in the order reported by the backend.
    """
    nothing_requested = sort_by is None and decreasing is None
    if nothing_requested:
      return self

    ordered_json = H2OConnection.get_json("Grids/"+self._id, sort_by=sort_by, decreasing=decreasing, _rest_version=99)
    result = H2OGridSearch(self.model, self.hyper_params, self._id)
    result.models = [h2o.get_model(ref['name']) for ref in ordered_json['model_ids']]

    # Use the top model of the ordering to pick the metrics class for the grid.
    top_model_name = ordered_json['model_ids'][0]['name']
    top_model_json = H2OConnection.get_json("Models/"+top_model_name, _rest_version=99)['models'][0]
    metrics_type = H2OGridSearch._metrics_class(top_model_json)

    # State is staged on a fresh instance of that class, then merged into the grid.
    staged = metrics_type()
    staged._id = self._id
    staged._grid_json = ordered_json
    staged._parms = result._parms
    H2OEstimator.mixin(result, metrics_type)
    result.__dict__.update(staged.__dict__.copy())
    return result
    def init_h2o_context_locally():
        """
        Launch a local Sparkling Water driver via spark-submit and wrap it in an H2OContext.

        The jar ``sparkling-water-all.jar`` is looked up in several candidate directories; the
        first existing one wins, and the ``site.USER_BASE`` location is used unchecked when
        none of the others exist. The child's stdout and stderr are redirected to files named
        by ``H2OConnection._tmp_file``.

        Returns
        -------
          An H2OContext built from the first two items returned by
          ``H2OContext.__get_local_ip_port``.

        Raises
        ------
          KeyError if the SPARK_HOME environment variable is not set.
        """
        jarpaths = [os.path.join(sys.prefix, "sparkling_water_jar", "sparkling-water-all.jar"),
                    os.path.join(os.path.sep,"usr","local","sparkling_water_jar","sparkling-water-all.jar"),
                    os.path.join(sys.prefix, "local", "sparkling_water_jar", "sparkling-water-all.jar"),
                    os.path.join(site.USER_BASE, "sparkling_water_jar", "sparkling-water-all.jar")
                    ]
        # The last candidate is the unconditional fallback, so only the others are probed.
        jar_path = next((path for path in jarpaths[:-1] if os.path.exists(path)), jarpaths[-1])

        cmd = [os.environ['SPARK_HOME']+"/bin/spark-submit",
        "--class", "water.SparklingWaterDriver",
        "--master", "local-cluster[1,1,512]",
        "--driver-class-path", jar_path,
        "--conf", "spark.driver.extraJavaOptions=\"-XX:MaxPermSize=384m\"",
        jar_path]

        cwd = os.path.abspath(os.getcwd())

        stdout_name = H2OConnection._tmp_file("stdout")
        stderr_name = H2OConnection._tmp_file("stderr")

        # The only platform difference is how the child is detached into its own group/session.
        if sys.platform == "win32":
            platform_args = {"creationflags": subprocess.CREATE_NEW_PROCESS_GROUP}
        else:
            platform_args = {"preexec_fn": os.setsid}

        # The child receives its own copies of the redirected handles, so the parent's file
        # objects are closed as soon as the process has been spawned instead of being leaked.
        # The helper below is given the stderr file *name*, not the handle.
        with open(stdout_name, "w+b") as stdout_file, open(stderr_name, "w+b") as stderr_file:
            p = subprocess.Popen(args=cmd, stdout=stdout_file, stderr=stderr_file, cwd=cwd, **platform_args)

        ip_port = H2OContext.__get_local_ip_port(p, stderr_name)
        return H2OContext(ip_port[0], ip_port[1])
Beispiel #5
0
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a model-build job to the backend and, unless running asynchronously, attach the result.

    Parameters
    ----------
    x : str, int, list or tuple
      Predictor column(s), given as names or as integer indices into ``tframe.names``.
    y : str, int or None
      Response column name or index; ``None`` leaves "response_column" unset.
    tframe
      Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
    vframe
      Validation frame, or ``None``.
    kwargs : dict
      Model-builder parameters. Updated in place with the frame/response entries; must contain
      "offset_column", "fold_column" and "weights_column". An optional "interactions" entry
      is passed on with each column name quoted.

    Returns
    -------
      None. When ``self._future`` is true the job is stored in ``self._job`` and the method returns
      without polling; otherwise the job is polled and the fetched model JSON is handed to
      ``self._resolve_model``.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    # Copy into a list: a tuple of names cannot be concatenated with the list built below.
    x = list(x) if isinstance(x, (list,tuple)) else [x]
    # An empty predictor list has no first element to inspect.
    if x and isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    # Every frame column that is neither a predictor nor one of the special columns is ignored.
    ignored_columns = list(set(tframe.names) - set(x + [y,offset,folds,weights]))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    # A missing "interactions" entry is treated the same as an explicit None.
    interactions = kwargs.get("interactions")
    kwargs["interactions"] = None if interactions is None else [h2o.h2o._quoted(col) for col in interactions]
    kwargs = {k: H2OEstimator._keyify_if_H2OFrame(v) for k, v in kwargs.items()}
    algo = self._compute_algo()

    model = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo, **kwargs), job_type=(algo+" Model Build"))

    if self._future:
      self._job = model
      return

    model.poll()
    if '_rest_version' in kwargs:
      model_json = H2OConnection.get_json("Models/"+model.dest_key, _rest_version=kwargs['_rest_version'])["models"][0]
    else:
      model_json = H2OConnection.get_json("Models/"+model.dest_key)["models"][0]
    self._resolve_model(model.dest_key,model_json)
Beispiel #6
0
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a grid-search build job and, unless running asynchronously, load the resulting models.

    Parameters
    ----------
    x : str, int, list or tuple
      Predictor column(s), given as names or as integer indices into ``tframe.names``.
    y : str, int or None
      Response column name or index; ``None`` leaves "response_column" unset.
    tframe
      Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
    vframe
      Validation frame, or ``None``.
    kwargs : dict
      Builder parameters. Updated in place with the frame/response entries; must contain
      "offset_column", "fold_column" and "weights_column".

    Returns
    -------
      None. When ``self._future`` is true the job is stored in ``self._job`` and the method returns
      without polling; otherwise failures reported by the grid are printed, ``self.models`` is
      filled and ``self._resolve_grid`` is called.

    Raises
    ------
      ValueError if the finished grid contains no models.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    # Copy into a list: a tuple of names cannot be concatenated with the list built below.
    x = list(x) if isinstance(x, (list,tuple)) else [x]
    # An empty predictor list has no first element to inspect.
    if x and isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    # Every frame column that is neither a predictor nor one of the special columns is ignored.
    ignored_columns = list(set(tframe.names) - set(x + [y,offset,folds,weights]))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    # Unset parameters are dropped and frames are sent by their id.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  #unique to grid search
    rest_version = 99  #unique to grid search
    kwargs["_rest_version"] = rest_version
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
      self._job = grid
      return

    grid.poll()
    # "_rest_version" is always set above, so the grid is always fetched with it.
    grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=rest_version)

    failure_details = grid_json["failure_details"]
    if failure_details:
      print("Errors/Warnings building gridsearch model\n")

      for error_index, error_message in enumerate(failure_details):
        failed_params = grid_json["failed_params"][error_index]
        if isinstance(failed_params, dict):
          for h_name in grid_json['hyper_names']:
            print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

        # A failure is only reported when a matching stack trace entry exists.
        if len(grid_json["failure_stack_traces"]) > error_index:
          print("failure_details: {0}\nfailure_stack_traces: "
                "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # The first model returned by the grid search determines the model class (binomial, multinomial, etc).
    # Sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
      first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                                _rest_version=rest_version)['models'][0]
      self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
      raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Beispiel #7
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a grid-search build job and, unless running asynchronously, load the resulting models.

        Parameters
        ----------
        x : str, int, list or tuple
          Predictor column(s), given as names or as integer indices into ``tframe.names``.
        y : str, int or None
          Response column name or index; ``None`` leaves "response_column" unset.
        tframe
          Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
        vframe
          Validation frame, or ``None``.
        kwargs : dict
          Builder parameters. Updated in place with the frame/response entries; must contain
          "offset_column", "fold_column" and "weights_column".

        Returns
        -------
          None. When ``self._future`` is true the job is stored in ``self._job`` and the method
          returns without polling; otherwise failure details are printed, ``self.models`` is
          filled and ``self._resolve_grid`` is called.

        Raises
        ------
          ValueError if the finished grid contains no models.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if isinstance(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Copy into a list: a tuple of names cannot be concatenated with the list built below.
        x = list(x) if isinstance(x, (list, tuple)) else [x]
        # An empty predictor list has no first element to inspect.
        if x and isinstance(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Every frame column that is neither a predictor nor one of the special columns is ignored.
        ignored_columns = list(
            set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = [
            h2o.h2o._quoted(col) for col in ignored_columns
        ] if ignored_columns else None
        # Unset parameters are dropped and frames are sent by their id.
        kwargs = {
            k: (v.frame_id if isinstance(v, H2OFrame) else v)
            for k, v in kwargs.items() if v is not None
        }
        algo = self.model._compute_algo()  #unique to grid search
        rest_version = 99  #unique to grid search
        kwargs["_rest_version"] = rest_version
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

        grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs),
                      job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()
        # "_rest_version" is always set above, so the grid is always fetched with it.
        grid_json = H2OConnection.get_json("Grids/" + grid.dest_key,
                                           _rest_version=rest_version)
        for error_message in grid_json["failure_details"]:
            print(error_message)

        model_ids = grid_json['model_ids']
        self.models = [h2o.get_model(key['name']) for key in model_ids]

        # A grid can finish without any model (e.g. bad parameter values); report that
        # explicitly instead of failing with an IndexError on the lookup below.
        if not model_ids:
            raise ValueError(
                "Gridsearch returns no model due to bad parameter values or other reasons...."
            )

        # The first model returned by the grid search determines the model class (binomial, multinomial, etc).
        first_model_json = H2OConnection.get_json(
            "Models/" + model_ids[0]['name'],
            _rest_version=rest_version)['models'][0]

        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
Beispiel #8
0
 def getGLMRegularizationPath(model):
   """
   Fetch the regularization path of a GLM model from the "GetGLMRegPath" endpoint.

   @param model - model whose _model_json carries the model id to query
   @return dict with 'lambdas', 'explained_deviance_train', 'explained_deviance_valid',
           'coefficients' and, when the response has it, 'coefficients_std'; each
           coefficient entry is a dict keyed by coefficient name
   """
   reg_path = H2OConnection.get_json("GetGLMRegPath",model=model._model_json['model_id']['name'])
   coef_names = reg_path.pop('coefficient_names')

   def _named(rows):
     # one {coefficient name: value} dict per row
     return [dict(zip(coef_names, row)) for row in rows]

   res = {key: reg_path[key] for key in ('lambdas', 'explained_deviance_train', 'explained_deviance_valid')}
   res['coefficients'] = _named(reg_path['coefficients'])
   if 'coefficients_std' in reg_path:
     res['coefficients_std'] = _named(reg_path['coefficients_std'])
   return res
Beispiel #9
0
 def getGLMRegularizationPath(model):
   """
   Query "GetGLMRegPath" for a GLM model and return its regularization path.

   @param model - model whose _model_json carries the model id to query
   @return dict holding the response's 'lambdas', 'explained_deviance_train' and
           'explained_deviance_valid' unchanged, plus 'coefficients' (and
           'coefficients_std' when present) as lists of name -> value dicts
   """
   payload = H2OConnection.get_json("GetGLMRegPath",model=model._model_json['model_id']['name'])
   names = payload.pop('coefficient_names')
   res = {}
   for field in ('lambdas', 'explained_deviance_train', 'explained_deviance_valid'):
     res[field] = payload[field]
   for field in ('coefficients', 'coefficients_std'):
     # 'coefficients' is always read; 'coefficients_std' only when the response has it
     if field == 'coefficients' or field in payload:
       res[field] = [dict(zip(names, values)) for values in payload[field]]
   return res
Beispiel #10
0
 def makeGLMModel(model, coefs, threshold=.5):
     """
     Ask the backend to build a GLM model from explicitly supplied coefficients.

     @param model - existing model whose id is sent as the source model
     @param coefs - dict mapping coefficient names to their values
     @param threshold - value forwarded as "threshold" (default .5)
     @return a new H2OGeneralizedLinearEstimator resolved from the backend response
     """
     request = dict(
         model=model._model_json['model_id']['name'],
         names=list(coefs.keys()),
         beta=list(coefs.values()),
         threshold=threshold,
     )
     response = H2OConnection.post_json("MakeGLMModel", **request)
     glm = H2OGeneralizedLinearEstimator()
     glm._resolve_model(response['model_id']['name'], response)
     return glm
Beispiel #11
0
 def makeGLMModel(model, coefs, threshold=.5):
     """
     Build a custom GLM model on the backend from the given coefficients.

     The source model must have been trained on the dataset, since the dataset
     information is extracted from it.
       @param model - source model, used for extracting dataset information
       @param coefs - dictionary containing model coefficients
       @param threshold - (optional, only for binomial) decision threshold used for classification
       @return a new H2OGeneralizedLinearEstimator resolved from the backend response
     """
     source_id = model._model_json["model_id"]["name"]
     coef_names = list(coefs.keys())
     coef_values = list(coefs.values())
     reply = H2OConnection.post_json("MakeGLMModel", model=source_id, names=coef_names,
                                     beta=coef_values, threshold=threshold)
     estimator = H2OGeneralizedLinearEstimator()
     estimator._resolve_model(reply["model_id"]["name"], reply)
     return estimator
Beispiel #12
0
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a grid-search build job and, unless running asynchronously, load the resulting models.

    Parameters
    ----------
    x : str, int, list or tuple
      Predictor column(s), given as names or as integer indices into ``tframe.names``.
    y : str, int or None
      Response column name or index; ``None`` leaves "response_column" unset.
    tframe
      Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
    vframe
      Validation frame, or ``None``.
    kwargs : dict
      Builder parameters. Updated in place with the frame/response entries; must contain
      "offset_column", "fold_column" and "weights_column".

    Returns
    -------
      None. When ``self._future`` is true the job is stored in ``self._job`` and the method returns
      without polling; otherwise failure details are printed, ``self.models`` is filled and
      ``self._resolve_grid`` is called.

    Raises
    ------
      ValueError if the finished grid contains no models.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    # Copy into a list: a tuple of names cannot be concatenated with the list built below.
    x = list(x) if isinstance(x, (list,tuple)) else [x]
    # An empty predictor list has no first element to inspect.
    if x and isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    # Every frame column that is neither a predictor nor one of the special columns is ignored.
    ignored_columns = list(set(tframe.names) - set(x + [y,offset,folds,weights]))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(col) for col in ignored_columns] if ignored_columns else None
    # Unset parameters are dropped and frames are sent by their id.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  #unique to grid search
    rest_version = 99  #unique to grid search
    kwargs["_rest_version"] = rest_version
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
      self._job = grid
      return

    grid.poll()
    # "_rest_version" is always set above, so the grid is always fetched with it.
    grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=rest_version)
    for error_message in grid_json["failure_details"]:
      print(error_message)

    model_ids = grid_json['model_ids']
    self.models = [h2o.get_model(key['name']) for key in model_ids]

    # A grid can finish without any model (e.g. bad parameter values); report that
    # explicitly instead of failing with an IndexError on the lookup below.
    if not model_ids:
      raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

    # The first model returned by the grid search determines the model class (binomial, multinomial, etc).
    first_model_json = H2OConnection.get_json("Models/"+model_ids[0]['name'], _rest_version=rest_version)['models'][0]

    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
Beispiel #13
0
 def getGLMRegularizationPath(model):
     """
     Return the full regularization path explored during lambda search for a GLM model.

     @param model - source lambda search model
     @return dict with "lambdas", "explained_deviance_train", "explained_deviance_valid",
             "coefficients" and, when the response has it, "coefficients_std"; each
             coefficient entry is a dict keyed by coefficient name
     """
     reg_path = H2OConnection.get_json("GetGLMRegPath", model=model._model_json["model_id"]["name"])
     coef_names = reg_path.pop("coefficient_names")

     def _named(rows):
         # one {coefficient name: value} dict per row
         return [dict(zip(coef_names, row)) for row in rows]

     res = {}
     for key in ("lambdas", "explained_deviance_train", "explained_deviance_valid"):
         res[key] = reg_path[key]
     res["coefficients"] = _named(reg_path["coefficients"])
     if "coefficients_std" in reg_path:
         res["coefficients_std"] = _named(reg_path["coefficients_std"])
     return res
Beispiel #14
0
 def makeGLMModel(model, coefs, threshold=.5):
     """
     Build a custom GLM model on the backend from the given coefficients.

     The source model must have been trained on the dataset, since the dataset
     information is extracted from it.
       @param model - source model, used for extracting dataset information
       @param coefs - dictionary containing model coefficients
       @param threshold - (optional, only for binomial) decision threshold used for classification
       @return a new H2OGeneralizedLinearEstimator resolved from the backend response
     """
     request = dict(
         model=model._model_json["model_id"]["name"],
         names=list(coefs.keys()),
         beta=list(coefs.values()),
         threshold=threshold,
     )
     response = H2OConnection.post_json("MakeGLMModel", **request)
     glm = H2OGeneralizedLinearEstimator()
     glm._resolve_model(response["model_id"]["name"], response)
     return glm
Beispiel #15
0
 def getGLMRegularizationPath(model):
     """
     Return the full regularization path explored during lambda search for a GLM model.

     @param model - source lambda search model
     @return dict holding the response's "lambdas", "explained_deviance_train" and
             "explained_deviance_valid" unchanged, plus "coefficients" (and
             "coefficients_std" when present) as lists of name -> value dicts
     """
     model_id = model._model_json["model_id"]["name"]
     payload = H2OConnection.get_json("GetGLMRegPath", model=model_id)
     names = payload.pop("coefficient_names")
     res = {
         field: payload[field]
         for field in ("lambdas", "explained_deviance_train",
                       "explained_deviance_valid")
     }
     for field in ("coefficients", "coefficients_std"):
         # "coefficients" is always read; "coefficients_std" only when the response has it
         if field == "coefficients" or field in payload:
             res[field] = [dict(zip(names, values)) for values in payload[field]]
     return res
Beispiel #16
0
 def makeGLMModel(model, coefs, threshold=.5):
   """
   Ask the backend to build a GLM model from explicitly supplied coefficients.

   @param model - existing model whose id is sent as the source model
   @param coefs - dict mapping coefficient names to their values
   @param threshold - value forwarded as "threshold" (default .5)
   @return a new H2OGeneralizedLinearEstimator resolved from the backend response
   """
   source_id = model._model_json['model_id']['name']
   coef_names = list(coefs.keys())
   coef_values = list(coefs.values())
   reply = H2OConnection.post_json("MakeGLMModel", model=source_id, names=coef_names,
                                   beta=coef_values, threshold=threshold)
   estimator = H2OGeneralizedLinearEstimator()
   estimator._resolve_model(reply['model_id']['name'], reply)
   return estimator
Beispiel #17
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a grid-search build job and, unless running asynchronously, load the resulting models.

        Parameters
        ----------
        x : str, int, list or tuple
          Predictor column(s), given as names or as integer indices into ``tframe.names``.
        y : str, int or None
          Response column name or index; ``None`` leaves "response_column" unset.
        tframe
          Training frame; its ``names`` attribute is used to resolve indices and to derive the ignored columns.
        vframe
          Validation frame, or ``None``.
        kwargs : dict
          Builder parameters. Updated in place with the frame/response entries; must contain
          "offset_column", "fold_column" and "weights_column".

        Returns
        -------
          None. When ``self._future`` is true the job is stored in ``self._job`` and the method
          returns without polling; otherwise failures reported by the grid are printed,
          ``self.models`` is filled and ``self._resolve_grid`` is called.

        Raises
        ------
          ValueError if the finished grid contains no models.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if isinstance(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Copy into a list: a tuple of names cannot be concatenated with the list built below.
        x = list(x) if isinstance(x, (list, tuple)) else [x]
        # An empty predictor list has no first element to inspect.
        if x and isinstance(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # Every frame column that is neither a predictor nor one of the special columns is ignored.
        ignored_columns = list(
            set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = [
            h2o.h2o._quoted(col) for col in ignored_columns
        ] if ignored_columns else None
        # Unset parameters are dropped and frames are sent by their id.
        kwargs = {
            k: (v.frame_id if isinstance(v, H2OFrame) else v)
            for k, v in kwargs.items() if v is not None
        }
        algo = self.model._compute_algo()  #unique to grid search
        rest_version = 99  #unique to grid search
        kwargs["_rest_version"] = rest_version
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

        grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs),
                      job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()
        # "_rest_version" is always set above, so the grid is always fetched with it.
        grid_json = H2OConnection.get_json("Grids/" + grid.dest_key,
                                           _rest_version=rest_version)

        failure_details = grid_json["failure_details"]
        if failure_details:
            print("Errors/Warnings building gridsearch model\n")

            for error_index, error_message in enumerate(failure_details):
                failed_params = grid_json["failed_params"][error_index]
                if isinstance(failed_params, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(
                            h_name, failed_params[h_name]))

                # Only report when a matching stack trace entry exists; indexing a
                # shorter list would abort the whole build with an IndexError.
                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(
                              error_message,
                              grid_json['failure_stack_traces'][error_index]))

        self.models = [
            h2o.get_model(key['name']) for key in grid_json['model_ids']
        ]

        # The first model returned by the grid search determines the model class (binomial, multinomial, etc).
        # Sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = H2OConnection.get_json(
                "Models/" + grid_json['model_ids'][0]['name'],
                _rest_version=rest_version)['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            raise ValueError(
                "Gridsearch returns no model due to bad parameter values or other reasons...."
            )
Beispiel #18
0
 def init_scala_int_session():
     """
     Request a session id from the "scalaint" endpoint.

     POSTs to "scalaint" and returns the "session_id" field of the JSON reply.
     """
     return H2OConnection.post("scalaint").json()["session_id"]