def test_preprocess_allnumeric():
    """Check preprocess_for_sklearn returns df with only numeric columns."""
    # we must have the right input, ie the input that is usually fed to
    # the preprocess_for_sklearn function
    data = prep_table()[1]

    # preprocess once and reuse the result, so the check is made against a
    # single output instead of two separate calls
    processed = preprocess_for_sklearn(data)

    # collect the offending columns so a failure says which ones are wrong
    non_numeric = [
        column
        for column, dtype in zip(processed.columns, processed.dtypes)
        if not np.issubdtype(dtype, np.number)
    ]
    assert not non_numeric, "non-numeric columns: %s" % (non_numeric,)
def fit_model(data, depth, seed, ntrees):
    """Fit a random forest classifier on the training data.

    The response is the "left" column of ``data``; every other column is
    passed through preprocess_for_sklearn and used as a predictor. The
    fitted model is intended to be handed to the pickle_model function so
    that it can be pickled and stored for future use.

    Args:
        data (pd.DataFrame): The training data, holding both the response
            column "left" and the predictors.
        depth: Forwarded to RandomForestClassifier as ``max_depth``.
        seed: Forwarded to RandomForestClassifier as ``random_state``.
        ntrees: Forwarded to RandomForestClassifier as ``n_estimators``.

    Returns:
        The fitted RandomForestClassifier.

    """
    log = logging.getLogger(__name__)

    # split the table: everything except "left" is a predictor,
    # "left" itself is the response
    predictors = data.drop(["left"], axis=1)
    features = preprocess_for_sklearn(predictors)
    target = data["left"]
    log.info('Data has been preprocessed for sklearn')

    forest = RandomForestClassifier(
        n_estimators=ntrees,
        max_depth=depth,
        random_state=seed,
    )
    forest.fit(features, target)
    log.info('Model has been fit')

    return forest
def make_predictions(dbtable, model, n):
    """Rank employees by their predicted probability of quitting.

    Bulk load a table through read_data, run it through
    preprocess_for_sklearn and score every row with the model. The
    predicted probability is expressed as a percentage rounded to two
    decimals and stored in a new 'phat' column of the loaded data. The
    table is meant to stand in for in-production data coming from the
    periodic company-wide employee evaluation.

    Args:
        dbtable (str): name of the table to be queried for the bulk load.
        model: fitted model exposing ``predict_proba``.
        n (int): number of results to be shown.

    Returns:
        pd.DataFrame: The n rows with the highest 'phat', in descending
        order.

    """
    employees = read_data(dbtable)
    features = preprocess_for_sklearn(employees)

    # column 1 of predict_proba is used as the probability of quitting
    quit_proba = model.predict_proba(features)[:, 1]

    # store the score as a percentage with two decimals
    employees['phat'] = [round(p * 100, 2) for p in quit_proba]

    # most likely to quit first, then keep the top n
    ranked = employees.sort_values(by='phat', ascending=False)
    return ranked.head(n)
def test_preprocess_size():
    """Check preprocess_for_sklearn output has the expected column count."""
    # use the same kind of input that preprocess_for_sklearn normally gets
    raw = prep_table()[1]
    n_columns = preprocess_for_sklearn(raw).shape[1]
    # the preprocessed frame is expected to have exactly 18 columns
    assert n_columns == 18
def read_test_data():
    """Return the first preprocessed row as a 2D numpy array.

    The data is used to check that a model can be used for prediction,
    which needs a 2D array holding a single row of the output of
    preprocess_for_sklearn.

    Returns:
        np.ndarray: array with one row, the first row of the preprocessed
        data.

    """
    test_data = prep_table()[1]
    processed = preprocess_for_sklearn(test_data)
    # select by position, not by label: .loc[0] only yields the first row
    # when the index happens to be the default 0..n-1 range
    first_row = processed.iloc[0]
    # wrap the row in a list so the result is 2D (one row)
    return np.array([first_row])
def test_preprocess_type():
    """Check the output of preprocess_for_sklearn is a pandas dataframe."""
    # use the same kind of input that preprocess_for_sklearn normally gets
    raw = prep_table()[1]
    result = preprocess_for_sklearn(raw)
    assert isinstance(result, pd.DataFrame)