Esempio n. 1
0
def test_preprocess_allnumeric():
    """Check preprocess_for_sklearn returns df with only numeric columns.

    The input is the second element returned by prep_table(), i.e. the
    input that is usually fed to the preprocess_for_sklearn function.
    """
    data = prep_table()[1]
    # Preprocess once and reuse the result instead of running the whole
    # preprocessing a second time just to read the number of columns.
    processed = preprocess_for_sklearn(data)
    # Every column dtype must be a sub-dtype of np.number; this is the same
    # condition as "number of numeric columns == total number of columns".
    assert all(np.issubdtype(dtype, np.number) for dtype in processed.dtypes)
Esempio n. 2
0
def fit_model(data, depth, seed, ntrees):
    """Train a random forest classifier on the training data.

    The column "left" of ``data`` is the response; all remaining columns
    are passed through preprocess_for_sklearn to build the predictors.
    The output of this function should be fed to the pickle_model function
    so that the model can be pickled and stored for future use.

    Args:
        data (pd.DataFrame): The training data read from database. It
            contains both the response ("left") and the predictors.
        depth: Forwarded to the classifier as ``max_depth``.
        seed: Forwarded to the classifier as ``random_state``.
        ntrees: Forwarded to the classifier as ``n_estimators``.

    Returns:
        The fitted RandomForestClassifier.
    """
    log = logging.getLogger(__name__)

    # predictors: everything except the response column
    predictors = data.drop(["left"], axis=1)
    features = preprocess_for_sklearn(predictors)
    target = data.left
    log.info('Data has been preprocessed for sklearn')

    # build and train the forest with the caller-supplied hyperparameters
    forest_params = {
        "max_depth": depth,
        "random_state": seed,
        "n_estimators": ntrees,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(features, target)
    log.info('Model has been fit')
    return forest
Esempio n. 3
0
def make_predictions(dbtable, model, n):
    """Return the n employees with the highest predicted chance of quitting.

    The whole table is bulk loaded from the database, preprocessed, and
    scored with the given model. The predicted probability, expressed as a
    percentage rounded to two decimals, is stored in a new "phat" column of
    the loaded data, which is then sorted in decreasing order of "phat".

    The scored data is the test data that was held out from the original
    data; it is meant to simulate in-production data coming from the annual
    or biannual company-wide employee evaluation.

    Args:
        dbtable (str): name of the table to be queried for the bulk load.
        model: model to be used for prediction; its ``predict_proba``
            method is called on the preprocessed data.
        n (int): number of results to be shown.

    Returns:
        pd.DataFrame: The n employees who are most likely to quit.
    """
    employees = read_data(dbtable)
    features = preprocess_for_sklearn(employees)

    # second column of predict_proba is used as the probability of quitting
    quit_proba = model.predict_proba(features)[:, 1]
    # store it as a percentage with two decimals next to the original data
    employees['phat'] = [round(p * 100, 2) for p in quit_proba]

    # most likely quitters first, keep only the top n
    ranked = employees.sort_values(by='phat', ascending=False)
    return ranked.head(n)
Esempio n. 4
0
def test_preprocess_size():
    """Check preprocess_for_sklearn returns a dataframe with 18 columns."""
    expected_columns = 18
    # use the input that is usually fed to preprocess_for_sklearn:
    # the second element returned by prep_table()
    raw = prep_table()[1]
    processed = preprocess_for_sklearn(raw)
    assert processed.shape[1] == expected_columns
Esempio n. 5
0
def read_test_data():
    """Return the first preprocessed test row as a 2D numpy array.

    Reads one line from the test set (the second element returned by
    prep_table()) into the right format, so it can be used to check that
    a model can be used for prediction.

    Returns:
        np.ndarray: 2D array with a single row, the first row of the data
        preprocessed for sklearn.
    """
    test_data = prep_table()[1]
    processed = preprocess_for_sklearn(test_data)
    # Take the first row by position rather than by index label: a label
    # lookup (.loc[0, :]) fails when no row is labelled 0 and returns
    # several rows when the label is duplicated.
    first_row = processed.iloc[0, :]
    return np.array([first_row])
Esempio n. 6
0
def test_preprocess_type():
    """Check that the output of preprocess_for_sklearn is a pd.DataFrame."""
    # use the input that is usually fed to preprocess_for_sklearn:
    # the second element returned by prep_table()
    raw = prep_table()[1]
    result = preprocess_for_sklearn(raw)
    assert isinstance(result, pd.DataFrame)