def pipeline(x_train,
             y_train,
             x_test,
             y_test,
             param_dict=None,
             problem='classification'):
  """Builds a random forest for the requested problem type and runs it.

  A RandomForestRegressor is created for 'regression' and a
  RandomForestClassifier otherwise; the estimator and the data splits are then
  handed to `generic_pipeline`, whose result is returned unchanged.

  Args:
    x_train: np.array or scipy.sparse.*matrix array of features of training data
    y_train: np.array 1-D array of targets (class labels) of training data
    x_test: np.array or scipy.sparse.*matrix array of features of test data
    y_test: np.array 1-D array of targets (class labels) of the test data
    param_dict: {string: ?} keyword arguments forwarded to the estimator
      constructor; None means "no arguments"
    problem: string type of learning problem; values = 'classification',
      'regression'

  Returns:
    Whatever `generic_pipeline` returns -- presumably the pair
      model: trained random forest model
      metrics: {str: float} dictionary of metric scores
    (confirm against `generic_pipeline`).

  Raises:
    AssertionError: if `problem` is not one of the two supported values.
  """
  assert problem in ('classification', 'regression')

  forest_kwargs = {} if param_dict is None else param_dict

  # Conditional expression keeps the lookup lazy: only the estimator class
  # that is actually needed gets resolved on `ensemble`.
  forest_cls = (ensemble.RandomForestRegressor if problem == 'regression'
                else ensemble.RandomForestClassifier)
  forest = forest_cls(**forest_kwargs)

  return generic_pipeline(
      forest, x_train, y_train, x_test, y_test, problem=problem)
Esempio n. 2
0
def pipeline(x_train,
             y_train,
             x_test,
             y_test,
             param_dict=None,
             problem='classification'):
    """Trains and evaluates a linear model.

    For classification the features are rescaled with a MaxAbsScaler that is
    fitted on the training split and re-applied to the test split; regression
    features are used as given. The estimator class is selected by
    `choose_linear_model` from the problem type and the penalty.

    The caller's `param_dict` is never modified.

    Args:
      x_train: np.array or scipy.sparse.*matrix array of features of training
        data
      y_train: np.array 1-D array of class labels of training data
      x_test: np.array or scipy.sparse.*matrix array of features of test data
      y_test: np.array 1-D array of class labels of the test data
      param_dict: {string: ?} dictionary of parameters and their values. An
        optional 'penalty' entry selects the model (default 'l2'); for
        regression it is removed before the remaining entries are passed to
        the estimator constructor, for classification it is passed through.
      problem: string type of learning problem; values = 'classification',
        'regression'

    Returns:
      Whatever `generic_pipeline` returns -- presumably the pair
        model: sklearn.linear_model.* trained linear model
        metrics: {str: float} dictionary of metric scores
      (confirm against `generic_pipeline`).

    Raises:
      AssertionError: if `problem` is not one of the two supported values.
    """
    assert problem in ['classification', 'regression']

    # Work on a private copy: the regression branch below pops 'penalty', and
    # that removal must not leak into the dictionary owned by the caller
    # (which may be reused for further runs).
    param_dict = {} if param_dict is None else dict(param_dict)

    if problem == 'classification':
        # Fit the scaler on training data only, then reuse it for the test set.
        scaler = preprocessing.MaxAbsScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

    if 'penalty' in param_dict and problem == 'regression':
        # For regression the penalty only picks the model class; it is not
        # forwarded as a constructor argument.
        penalty = param_dict.pop('penalty')
    elif 'penalty' in param_dict:
        # For classification the penalty stays in the constructor arguments.
        penalty = param_dict['penalty']
    else:
        penalty = 'l2'  # default to l2

    model_init = choose_linear_model(problem, penalty)
    model = model_init(**param_dict)

    return generic_pipeline(model,
                            x_train,
                            y_train,
                            x_test,
                            y_test,
                            problem=problem)
Esempio n. 3
0
def pipeline(x_train,
             y_train,
             x_test,
             y_test,
             param_dict=None,
             problem='classification'):
    """Runs a pipeline to train and evaluate GBDT models.

    Builds an xgb.XGBRegressor for regression or an xgb.XGBClassifier for
    classification and hands it, with the data splits, to `generic_pipeline`.
    When no 'objective' is supplied for classification, one is obtained from
    `get_objective` based on whether the labels look binary.

    The caller's `param_dict` is never modified.

    Args:
      x_train: np.array or scipy.sparse.*matrix array of features of training
        data
      y_train: np.array 1-D array of class labels of training data
      x_test: np.array or scipy.sparse.*matrix array of features of test data
      y_test: np.array 1-D array of class labels of the test data
      param_dict: {string: ?} dictionary of parameters and their values,
        forwarded to the estimator constructor
      problem: string type of learning problem; values = 'classification',
        'regression'

    Returns:
      Whatever `generic_pipeline` returns -- presumably the pair
        model: trained XGBoost gradient boosted trees model
        metrics: {str: float} dictionary of metric scores
      (confirm against `generic_pipeline`).

    Raises:
      AssertionError: if `problem` is not one of the two supported values.
    """
    assert problem in ['classification', 'regression']

    # Work on a private copy: the classification branch may add an inferred
    # 'objective', which must not stick to a dictionary the caller reuses
    # with a different label set.
    param_dict = {} if param_dict is None else dict(param_dict)

    if problem == 'regression':
        model = xgb.XGBRegressor(**param_dict)
    else:
        # Binary iff the largest label is 1.
        # NOTE(review): assumes labels are encoded as 0..k-1 -- confirm.
        is_binary = max(y_train) + 1 == 2
        if 'objective' not in param_dict:
            param_dict['objective'] = get_objective(is_binary)
        model = xgb.XGBClassifier(**param_dict)

    return generic_pipeline(model,
                            x_train,
                            y_train,
                            x_test,
                            y_test,
                            problem=problem)
Esempio n. 4
0
def pipeline(x_train,
             y_train,
             x_test,
             y_test,
             param_dict=None,
             problem='classification'):
    """Trains and evaluates a DNN model.

    'epochs' and 'batch_size' are taken out of `param_dict` and given to the
    Keras wrapper; every remaining entry is forwarded to `keras_build_fn`
    together with the feature count, output count, sparsity flag and loss.

    The caller's `param_dict` is never modified.

    Args:
      x_train: np.array or scipy.sparse.*matrix array of features of training
        data
      y_train: np.array 1-D array of class labels of training data
      x_test: np.array or scipy.sparse.*matrix array of features of test data
      y_test: np.array 1-D array of class labels of the test data
      param_dict: {string: ?} dictionary of parameters and their values.
        'epochs' defaults to 10 and 'batch_size' to 256 when absent (or when
        `param_dict` is None).
      problem: string type of learning problem; values = 'classification',
        'regression'

    Returns:
      Whatever `generic_pipeline` returns -- presumably the pair
        model: trained Keras model
        metrics: {str: float} dictionary of metric scores
      (confirm against `generic_pipeline`).

    Raises:
      AssertionError: if `problem` is not one of the two supported values.
    """
    assert problem in ['classification', 'regression']

    num_feature = x_train.shape[1]
    is_sparse = sparse.issparse(x_train)

    # Private copy so the pops below never touch the caller's dictionary.
    param_dict = {} if param_dict is None else dict(param_dict)

    # Fall back to the training defaults per key, so a dictionary that only
    # sets architecture options does not fail with a KeyError.
    num_epoch = param_dict.pop('epochs', 10)
    batch_size = param_dict.pop('batch_size', 256)

    if problem == 'regression':
        num_output = 1
        loss = 'mean_squared_error'
        model_init = KerasRegressor
    else:
        # One output unit per distinct label seen in the training targets.
        num_output = len(set(y_train))
        loss = 'categorical_crossentropy'
        model_init = FunctionalKerasClassifier

    # Whatever is left in param_dict is treated as model-building options.
    build_fn = pseudo_partial(keras_build_fn,
                              num_feature=num_feature,
                              num_output=num_output,
                              is_sparse=is_sparse,
                              loss=loss,
                              **param_dict)
    model = model_init(build_fn=build_fn,
                       epochs=num_epoch,
                       batch_size=batch_size,
                       shuffle=True,
                       verbose=False)

    return generic_pipeline(model,
                            x_train,
                            y_train,
                            x_test,
                            y_test,
                            problem=problem)