Example #1
                " not `{}`").format(type(df),)
        expected_shape = (1460, 81)
        assert df.shape == expected_shape, ("Expected {} rows and {} columns, but"
                " got shape {}").format(expected_shape[0], expected_shape[1], df.shape)

class HomeDescription(EqualityCheckProblem):
    _vars = ['avg_lot_size', 'newest_home_age']
    max_year_built = 2010
    min_home_age = datetime.datetime.now().year - max_year_built
    _expected = [10517, min_home_age]
    _hint = 'Run the describe command. Lot size is in the column called LotArea. Also look at YearBuilt'
    _solution = CS(
"""# using data read from home_data.describe()
avg_lot_size = 10517
newest_home_age = 8
""")

# Shorter syntax equivalent to above
# from learntools.core.problem_factories import simple_problem
# hd = simple_problem('HomeDescription', solution='idk', hint='This is a hint')\
#        .with_expected(avg_lot_size=10516.828, newest_home_age=8)

qvars = bind_exercises(globals(), [
    LoadHomeData,
    HomeDescription,
    ],
    tutorial_id=118,
    var_format='step_{n}',
    )
__all__ = list(qvars)
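For context, here is a minimal sketch of how the two expected values above would be read off `home_data.describe()`. It assumes `home_data` is the 1460-row housing DataFrame loaded in the previous step, with `LotArea` and `YearBuilt` columns; it is an illustration, not the exercise's own code.

```python
import datetime

# Rounded mean of LotArea, as reported by home_data.describe() (10516.828 -> 10517)
avg_lot_size = round(home_data['LotArea'].mean())

# Age of the newest home, measured from the current year (matches min_home_age above)
newest_home_age = datetime.datetime.now().year - home_data['YearBuilt'].max()
```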

class L2Predictions(CodingProblem):
    _var = 'l2_reccs'
    _default_values = [[]]

    _solution = CS('l2_reccs = recommend(l2_model, 26556)')

    def check_whether_attempted(self, *args):
        # This seems iffy.
        EqualityCheckProblem.check_whether_attempted(self, *args)

    def check(self, reccs):
        assert (reccs.iloc[:2].index == [228, 340]).all()


qvars = bind_exercises(
    globals(),
    [
        RecommendFunction,
        PredictionSanityCheck,
        FixingObscurity,
        RecommendNonObscure,
        L2Intro,
        L2Predictions,
    ],
    tutorial_id=-1,
    var_format='part{n}',
)
__all__ = list(qvars)
Example #3
    _solution = """
    The scale of features does not affect permutation importance per se. The only reason that rescaling a feature would affect PI is indirectly, if rescaling helped or hurt the ability of the particular learning method we're using to make use of that feature.
    That won't happen with tree based models, like the Random Forest used here.
    If you are familiar with Ridge Regression, you might be able to think of how that would be affected.
    That said, the absolute change features have high importance because they capture total distance traveled, which is the primary determinant of taxi fares. It is not an artifact of the feature magnitude.
    """


class FromPermImportanceToMarginalEffect(ThoughtExperiment):
    _solution = """
    We cannot tell from the permutation importance results whether traveling a fixed latitudinal distance is more or less expensive than traveling the same longitudinal distance.
    Possible reasons the latitude features are more important than the longitude features:
    1. Latitudinal distances in the dataset tend to be larger.
    2. It is more expensive to travel a fixed latitudinal distance.
    3. Both of the above.
    If abs_lon_change values were very small, longitudes could be less important to the model even if the cost per mile of travel in that direction were high.
    """


qvars = bind_exercises(
    globals(),
    [
        WhichFeaturesAreUseful, FirstPermImportance, WhyLatitude,
        ImportanceWithAbsFeatures, ScaleUpFeatureMagnitude,
        FromPermImportanceToMarginalEffect
    ],
    tutorial_id=131,
    var_format='q_{n}',
)
__all__ = list(qvars)
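The first solution above argues that rescaling a feature does not change permutation importance for tree-based models. A small sketch of how one could check that claim with scikit-learn's `permutation_importance` (the synthetic data here is a placeholder, not the exercise's taxi-fare dataset, and the exercise itself may use a different permutation-importance tool):

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

rng = np.random.RandomState(0)
X = rng.uniform(size=(500, 2))
y = 3 * X[:, 0] + X[:, 1] + rng.normal(scale=0.1, size=500)

def perm_importance(features):
    model = RandomForestRegressor(random_state=0).fit(features, y)
    result = permutation_importance(model, features, y, n_repeats=10, random_state=0)
    return result.importances_mean

# Multiply the first feature by 100; a tree-based model splits on the same
# points, so the permutation importances are essentially unchanged.
X_scaled = X.copy()
X_scaled[:, 0] *= 100
print(perm_importance(X), perm_importance(X_scaled))
```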
Example #4
        if spins_left == 0:
            successes += 1
    return successes / n_simulations""")

    def check(self, fn):
        actual = fn(10, 10, 1000)
        assert actual == 1.0, "Expected slots_survival_probability(10, 10, 1000) to be 1.0, but was actually {}".format(
            repr(actual))

        actual = fn(1, 2, 10000)
        assert .24 <= actual <= .26, "Expected slots_survival_probability(1, 2, 10000) to be around .25, but was actually {}".format(
            repr(actual))

        actual = fn(25, 150, 10000)
        assert .22 <= actual <= .235, "Expected slots_survival_probability(25, 150, 10000) to be around .228, but was actually {}".format(
            repr(actual))


qvars = bind_exercises(
    globals(),
    [
        EarlyExitDebugging,
        ElementWiseComparison,
        BoringMenu,
        ExpectedSlotsPayout,
        SlotsSurvival,
    ],
    tutorial_id=110,
)
__all__ = list(qvars) + ['play_slot_machine']
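The solution tail and the checks above imply a Monte Carlo simulation in which each spin costs $1 and `play_slot_machine()` (provided by the exercise environment, per the `__all__` export above) returns that spin's payout. A hedged sketch of the full function, consistent with the fragment shown:

```python
def slots_survival_probability(start_balance, n_spins, n_simulations):
    # Estimate the probability of being able to afford all n_spins,
    # given that each spin costs $1 and pays back play_slot_machine().
    successes = 0
    for _ in range(n_simulations):
        balance = start_balance
        spins_left = n_spins
        while balance >= 1 and spins_left > 0:
            balance = balance - 1 + play_slot_machine()
            spins_left -= 1
        if spins_left == 0:
            successes += 1
    return successes / n_simulations
```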
    _solution = (
        '''Directionally, these biases make sense. Highly rated movies have high biases, and poorly rated movies
have low biases. This agrees with the intuition discussed earlier about biases corresponding to goodness/badness.

But a problem sticks out. We're naively assigning biases which are approximately proportional to movies' 
average ratings - even for movies with few reviews. I'm not convinced that *Gray Lady Down* is the worst
movie ever based on *one* bad review. 

If you're shopping for a can opener, would you rather buy the one with a single 5-star review, or the one
with an average rating of 4.95 over 3,000 reviews? 

This is an especially important problem when dealing with sparse categorical data which can often have long tails
of rare values. We'll talk about an elegant solution to this problem - L2 regularization - in the next lesson.'''
    )


qvars = bind_exercises(
    globals(),
    [
        BiasIntro,
        CodingBiases,
        None,
        LoadingBiases,
        ExploringBiases,
        None,  # user biases
    ],
    tutorial_id=-1,
    var_format='part{n}',
)
__all__ = list(qvars)
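To make the "naive bias" problem described in the solution text concrete, here is a tiny illustrative sketch (the ratings DataFrame and column names are assumptions, not the lesson's actual data): assigning each movie a bias equal to its mean rating minus the global mean pulls a movie with a single bad review to an extreme bias.

```python
import pandas as pd

ratings = pd.DataFrame({
    'movieId': [1, 1, 1, 2],        # movie 2 has only one rating
    'rating':  [4.5, 4.0, 5.0, 1.0],
})

global_mean = ratings['rating'].mean()
movie_bias = ratings.groupby('movieId')['rating'].mean() - global_mean
print(movie_bias)  # movie 2's bias is pulled far below zero by one review
```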
Example #6

class ValPreds(CodingProblem):
    _vars = ['val_predictions', 'iowa_model', 'val_X']
    _hint = 'Run predict on the right validation data object.'
    _solution = CS("""val_predictions = iowa_model.predict(val_X)""")

    def check(self, val_predictions, iowa_model, val_X):
        assert val_predictions.size == 365, "`val_predictions` is the wrong size. Did you predict with the wrong data?"
        comparison_val_preds = iowa_model.predict(val_X)
        assert all(comparison_val_preds == val_predictions), (
            "Predictions do not match expectations. "
            "Did you supply the right data")


class MAE(EqualityCheckProblem):
    _var = 'val_mae'
    _expected = 29652.931506849316
    _hint = (
        "The order of arguments to mean_absolute_error doesn't matter. Make sure you fit to only the training data in step 2."
    )
    _solution = CS("""val_mae = mean_absolute_error(val_predictions, val_y)""")


qvars = bind_exercises(
    globals(),
    [SplitData, FitModelWithTrain, ValPreds, MAE],
    var_format='step_{n}',
)
__all__ = list(qvars)
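For reference, a minimal sketch of the full sequence these checkers validate: split the data, fit on the training portion only, predict on `val_X`, then compute MAE. Variable names follow the hints and solutions above; the split parameters are assumptions, though scikit-learn's default 25% validation split of the 1,460-row dataset does yield the 365 validation rows the size check expects.

```python
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Split, fit on the training portion only, then score on the held-out rows.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
```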
average ratings - even for movies with few reviews. I'm not convinced that *Gray Lady Down* is the worst
movie ever based on *one* bad review. 

If you're shopping for a can opener, would you rather buy the one with a single 5-star review, or the one
with an average rating of 4.95 over 3,000 reviews? 

This is an especially important problem when dealing with sparse categorical data which can often have long tails
of rare values. We'll talk about an elegant solution to this problem - L2 regularization - in the next lesson.'''
    )


BigBiasProblem = MultipartProblem(
    WhyBiases,
    WhatBiases,
    CodingBiases,
    LoadingBiases,
    ExploringBiases,
)

qvars = bind_exercises(
    globals(),
    [
        ChooseEmbeddingVars,
        EmbeddingSizeInvestigation,
        BigBiasProblem,
    ],
    tutorial_id=149,
    var_format='part{n}',
)
__all__ = list(qvars)
Example #8
result = first_term + second_term
""")

    def check(self, result_obj):
        assert result_obj == 71


class FromPermImportanceToMarginalEffect(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
data = ["John", "Doe", 53.44]
result = f"Hello {data[0]} {data[1]}. Your current balance is ${data[2]}."
""")

    def check(self, result_obj):
        assert result_obj == "Hello John Doe. Your current balance is $53.44."


qvars = bind_exercises(
    globals(),
    [
        SumStringsWithNumbers, FirstPermImportance, WhyLatitude,
        ImportanceWithAbsFeatures, ScaleUpFeatureMagnitude,
        FromPermImportanceToMarginalEffect
    ],
    tutorial_id=131,
    var_format='q_{n}',
)
__all__ = list(qvars)
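A quick usage sketch of the f-string formatting the second solution relies on. The check expects the float `53.44` to render as `53.44`, which plain interpolation happens to produce here; an explicit format spec such as `:.2f` is the more robust way to control the number of decimals.

```python
data = ["John", "Doe", 53.44]

# Plain interpolation, as in the solution above
print(f"Hello {data[0]} {data[1]}. Your current balance is ${data[2]}.")

# Equivalent output with an explicit two-decimal format spec
print(f"Hello {data[0]} {data[1]}. Your current balance is ${data[2]:.2f}.")
```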
Example #9
    _hint = ("You will call get_mae in the loop. You'll need to map "
             "the names of your data structure to the names in get_mae")
    _solution = CS("""# Here is a short solution with a dict comprehension.
# The lesson gives an example of how to do this with an explicit loop.
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
""")


class FitModelWithAllData(CodingProblem):
    _vars = ['final_model', 'X', 'y']
    _hint = 'Fit with the ideal value of max_leaf_nodes. In the fit step, use all of the data in the dataset'
    _solution = CS("""# Fit the model with best_tree_size. Fill in argument to make optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model
final_model.fit(X, y)""")

    def check(self, final_model, X, y):
        assert final_model.max_leaf_nodes == 100, "Didn't set max_leaf_nodes to the right value when building the tree"
        # Model has in-sample R^2 of 0.92 when run on all data, independent of seed.
        # score(X,y) is 0.88 if model was trained on train_X and train_y
        assert final_model.score(X, y) > 0.9, "Your model isn't quite as accurate as expected. Did you fit it on all the data?"
qvars = bind_exercises(globals(), [
    BestTreeSize,
    FitModelWithAllData
    ],
    var_format='step_{n}',
    )
__all__ = list(qvars)
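The hint above refers to a `get_mae` helper. A hedged sketch of what such a helper typically looks like in this exercise, following the argument order in the solution's dict comprehension; treat the `random_state` and exact body as assumptions rather than the course's verbatim code.

```python
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    # Fit a tree limited to max_leaf_nodes and report its validation MAE.
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)
```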
Example #10
    _hint = 'Review the code above with a DecisionTreeRegressor. Use the RandomForestRegressor instead'
    _solution = CS("""rf_model = RandomForestRegressor()

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
""")

    def check(self, rf_val_mae):
        assert type(rf_val_mae) in [
            float, np.float64
        ], "Expected rf_val_mae to be a number with a decimal type. Observed type {}".format(
            type(rf_val_mae))
        # rf_val_mae should be 22,883. Giving wiggle room to handle version differences, etc.
        assert rf_val_mae > 20000, "Your validation score of {} is implausibly low.".format(
            rf_val_mae)
        assert rf_val_mae < 25000, "Your validation score of {} is higher than it should be.".format(
            rf_val_mae)


qvars = bind_exercises(
    globals(),
    [CheckRfScore],
    tutorial_id=121,
    var_format='step_{n}',
)
__all__ = list(qvars)
Example #11
The above implementation relies on the fact that `list.index` returns the index of the *first* occurrence of a value. (You can verify this by calling `help(list.index)`.) So if, after sorting the list in ascending order, the value 0 is at index 0, then the number of negatives is 0. If 0 is at index 2 (i.e. the third element), then there are two elements smaller than 0. And so on.

*Note*: it's usually considered "impolite" to modify a list that someone passes to your function without giving them some warning (i.e. unless the docstring says that it modifies its input). So, if we wanted to be nice, we could have started by making a copy of nums using the `list.copy()` method (e.g. `our_nums = nums.copy()`), and then worked with that copy rather than the original.

If you're a big Lisp fan, you might have written this technically compliant solution (we haven't talked about recursion, but I guess this doesn't use any syntax or functions we haven't seen yet...):

```python
def count_negatives(nums):
    # Equivalent to "if len(nums) == 0". An empty list is 'falsey'.
    if not nums:
        return 0
    else:
        # Implicitly converting a boolean to an int! See question 6 of the
        # exercise on booleans and conditionals
        return (nums[0] < 0) + count_negatives(nums[1:])
```"""


qvars = bind_exercises(
    globals(),
    [
        SelectSecondItem,
        LosingTeamCaptain,
        PurpleShell,
        UnderstandLen,
        FashionablyLate,
        CountNegativesRiddle,
    ],
)
__all__ = list(qvars)
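The explanation above refers to an implementation based on sorting and `list.index`, but that code itself is cut off in this excerpt. A sketch of the described approach, including the "polite" copy the note recommends:

```python
def count_negatives(nums):
    # Work on a copy so the caller's list is not modified.
    our_nums = nums.copy()
    our_nums.append(0)
    our_nums.sort()
    # After sorting, the appended 0 sits just after all negative values, so
    # its index (the first occurrence of 0) equals the count of negatives.
    return our_nums.index(0)
```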
Example #12
    _var = 'rf_val_mae'
    _hint = 'Review the code above with a DecisionTreeRegressor. Use the RandomForestRegressor instead'
    _solution = CS("""rf_model = RandomForestRegressor()

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
""")

    def check(self, rf_val_mae):
        assert type(rf_val_mae) in [
            float, np.float64
        ], "Expected rf_val_mae to be a number with a decimal type. Observed type {}".format(
            type(rf_val_mae))
        # rf_val_mae should be 22,883. Giving wiggle room to handle version differences, etc.
        assert rf_val_mae > 20000, "Your validation score of {} is implausibly low.".format(
            rf_val_mae)
        assert rf_val_mae < 25000, "Your validation score of {} is higher than it should be.".format(
            rf_val_mae)


qvars = bind_exercises(
    globals(),
    [CheckRfScore],
    var_format='step_{n}',
)
__all__ = list(qvars)
Example #13
c = df.mean_rating
pts = ax.scatter(df.x, df.y, c=c)
cbar = fig.colorbar(pts)
```

Unlike with year of release, there seems to be a clear global pattern here: average rating tends to increase moving from left to right.
""")


class NRatingsPlot(ThoughtExperiment):

    # TODO: Mention alternatives like PowerNorm?
    _solution = CS("""fig, ax = plt.subplots(figsize=FS)
c = df.n_ratings
pts = ax.scatter(df.x, df.y, c=c, norm=mpl.colors.LogNorm())
cbar = fig.colorbar(pts)
""")


qvars = bind_exercises(
    globals(),
    [
        YearPlot,
        MeanRatingPlot,
        NRatingsPlot,
    ],
    tutorial_id=-1,
    var_format='part{n}',
)
__all__ = list(qvars)
Example #14
Now focus your eye on the blue dots, and imagine a best fit line through those dots.  It is generally pretty flat, possibly even curving up on the right side of the graph. So increasing `feature_of_interest` has a more positive impact on predictions when `other_feature` is high.
"""


class CompareSHAPDepPlots(ThoughtExperiment):
    _solution = \
"""
Here is the code:

    shap.dependence_plot('num_lab_procedures', shap_values[1], small_val_X)
    shap.dependence_plot('num_medications', shap_values[1], small_val_X)

.
Loosely speaking, **num_lab_procedures** looks like a cloud with little discernible pattern. It does not slope steeply up or down at any point. It's hard to say we've learned much from that plot. At the same time, the values are not all very close to 0. So the model seems to think this is a relevant feature. One potential next step would be to explore more by coloring the plot by other features to search for an interaction.

On the other hand, **num_medications** clearly slopes up until a value of about 20, and then it turns back down. Without more medical background, this seems a surprising phenomenon... You could do some exploration to see whether these patients have unusual values for other features too. But a good next step would be to discuss this phenomenon with domain experts (in this case, the doctors).
"""


qvars = bind_exercises(
    globals(),
    [
        WhichEffectLargerRange, IsEffectRangeImportance,
        CompareEffectSizeWhenChanged, WhyAreShapsValuesJumbled,
        WhichWayInteraction, CompareSHAPDepPlots
    ],
    tutorial_id=141,
    var_format='q_{n}',
)
__all__ = list(qvars)
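For completeness, a hedged sketch of how the `shap_values` and `small_val_X` objects used in that solution are typically produced for a tree classifier. It mirrors the `TreeExplainer` pattern that appears in the later UseShap example; `my_model`, `val_X`, and the 150-row slice are placeholders, and in the shap versions this course targets `shap_values` is a list with one array per class (index 1 being the positive class used in the plots).

```python
import shap

small_val_X = val_X.iloc[:150]
explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(small_val_X)

shap.dependence_plot('num_lab_procedures', shap_values[1], small_val_X)
```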
Example #15
class NormColumn(CodingProblem):
    _var = 'all_movies_df'

    _solution = CS("all_movies_df['norm'] = norms")

    def check(self, df):
        assert_has_columns(df, ['norm'], 'all_movies_df')
        exp = 1.623779
        jum = df.loc[1, 'norm']
        assert math.isclose(
            exp, df.loc[1, 'norm'],
            rel_tol=1e-3), ("Expected norm column for movie 'Jumanji' to be {}"
                            ". Was actually {}").format(exp, jum)


class NormPatterns(ThoughtExperiment):

    _solution = 'TODO'


VectorLengths = MultipartProblem(CalculateNorms, NormColumn, NormPatterns)

qvars = bind_exercises(
    globals(),
    [None, VectorAddition, VectorLengths],
    tutorial_id=-1,
    var_format='part{n}',
)
__all__ = list(qvars)
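A minimal sketch of how the `norms` assigned in the solution could be computed, assuming `movie_embeddings` is a 2-D array of embedding vectors aligned with the rows of `all_movies_df` (the names are placeholders, not the lesson's exact variables):

```python
import numpy as np

# L2 norm (vector length) of each movie's embedding vector, row by row
norms = np.linalg.norm(movie_embeddings, axis=1)
all_movies_df['norm'] = norms
```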
Example #16

class UseShap(ThoughtExperiment):
    _hint = "Here's the time to use SHAP values"
    _solution = CS("""
# Use SHAP values to show the effect of each feature of a given patient

import shap  # package used to calculate Shap values

sample_data_for_prediction = val_X.iloc[0].astype(float)  # to test function

def patient_risk_factors(model, patient_data):
    # Create object that can calculate shap values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient_data)
    shap.initjs()
    shap.force_plot(explainer.expected_value[1], shap_values[1], patient_data)
""")


qvars = bind_exercises(
    globals(),
    [
        SummarizeModel, EffectNumInpatient, EffectTimeInHospital,
        RawActualsInsteadOfPDP, UseShap
    ],
    tutorial_id=135,
    var_format='q_{n}',
)
__all__ = list(qvars)
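The solution defines `patient_risk_factors` but never calls it; a brief usage sketch, with `my_model` standing in for the fitted model from the earlier questions:

```python
# Hypothetical call; my_model is the model fit earlier in the exercise.
patient_risk_factors(my_model, sample_data_for_prediction)
```

Note that the force plot is created inside the function but not returned; in a notebook it typically only renders when it is a cell's final expression, so returning the plot from the function can help.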
Example #17
                    ("Expected `dtree` to be of type DecisionTreeRegressor but got an "
                     "object of type `{}`").format(type(dtree))
        assert dtree.random_state is not None, "You forgot to set the random_state."
        assert getattr(dtree, 'tree_',
                       None) is not None, "You have not fit the model."


class MakePredictions(CodingProblem):
    _vars = ['predictions', 'iowa_model', 'X']
    _hint = """Use `iowa_model.predict` with an argument holding the data to predict with."""
    _solution = CS('iowa_model.predict(X)')

    def check(self, predictions, iowa_model, X):
        # This step is just checking that they can make predictions.
        # If we want to check model is correct, do it in fitting step.
        ground_truth = iowa_model.predict(X)
        assert ground_truth.shape == predictions.shape, (
            "Your predictions are "
            "shape {}. Expected shape {}").format(ground_truth.shape,
                                                  predictions.shape)
        assert all(predictions == ground_truth), (
            "Expected {} but got predictions {}").format(ground_truth, preds)


qvars = bind_exercises(
    globals(),
    [SetTarget, SelectPredictionData, CreateModel, MakePredictions],
    var_format='step_{n}',
)
__all__ = list(qvars)
Example #18

def gen_bj_inputs(n):
    random.seed(1)
    return [(gen_bj_hand(), gen_bj_hand()) for _ in range(n)]


class BlackjackCmp(FunctionProblem):
    _var = 'blackjack_hand_greater_than'
    _hint = (
        "This problem is a lot easier to solve if you define at least one 'helper' function."
        " The logic for calculating a hand's total points is a good candidate for extracting into a helper function."
    )
    _solution = CS.load(bj_module.__file__)

    # TODO: explicitly make sure to test multi-ace cases. e.g. [K, A, A]
    _test_cases = [(args, hand_gt_soln(*args)) for args in gen_bj_inputs(100)]


qvars = bind_exercises(
    globals(),
    [
        JimmySlots,
        LuigiAnalysis,
        BlackjackCmp,
        None,
    ],
    start=1,
)
__all__ = list(qvars)
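The hint suggests extracting the hand-total logic into a helper, and the TODO mentions multi-ace cases. A hedged sketch of such a helper and the comparison function, assuming standard blackjack scoring (face cards count 10, aces count 11 unless that would bust, in which case they drop to 1) and the usual exercise semantics that hand 1 "wins" if it totals at most 21 and either beats hand 2's total or hand 2 busts:

```python
def hand_total(hand):
    # Total a blackjack hand given as a list of strings like ['K', 'A', '3'].
    total = 0
    aces = 0
    for card in hand:
        if card in ('J', 'Q', 'K'):
            total += 10
        elif card == 'A':
            aces += 1
            total += 11
        else:
            total += int(card)
    # Demote aces from 11 to 1 while the hand is bust.
    while total > 21 and aces > 0:
        total -= 10
        aces -= 1
    return total

def blackjack_hand_greater_than(hand_1, hand_2):
    total_1, total_2 = hand_total(hand_1), hand_total(hand_2)
    return total_1 <= 21 and (total_2 > 21 or total_1 > total_2)
```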
Example #19
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2
# Create y. You should have X1 and X2 in the expression for y
y = X1 * X2

# Aside from these lines, use the code provided
""")

    def check(self, importance, pdpResult):
        X1_imp = importance.feature_importances_[0]
        pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
        assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. "
                                "Actual importance was {}").format(X1_imp)
        assert (pdpRange <
                0.5), ("Tested that the highest point on the Partial "
                       "Dependence Plot is within 0.5 of the lowest point. "
                       "Actual difference was {}").format(pdpRange)


qvars = bind_exercises(
    globals(),
    [
        WhyThatUShape, PonderPDPContour, ReadPDPContour,
        MakePDPWithAbsFeatures, DoesSteepnessImplyImportance,
        DesignDatasetUShapedPdp, DesignFlatPDPWithHighImportance
    ],
    tutorial_id=135,
    var_format='q_{n}',
)
__all__ = list(qvars)
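To see why `y = X1 * X2` satisfies both checks (X1 highly important, yet its partial dependence nearly flat): averaged over X2, which is centred on 0, predictions barely move as X1 is varied, even though the model relies on X1 heavily. A small sketch under those assumptions, computing the partial dependence by hand rather than with the plotting library the exercise itself uses; `n_samples`, the forest settings, and the grid are illustrative choices.

```python
import numpy as np
from numpy.random import rand
from sklearn.ensemble import RandomForestRegressor

n_samples = 20000
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2
y = X1 * X2

X = np.column_stack([X1, X2])
model = RandomForestRegressor(n_estimators=30, random_state=1).fit(X, y)
print(model.feature_importances_)   # importance split roughly evenly between X1 and X2

# Manual partial dependence for X1: fix X1 on a grid, average predictions
# over the observed X2 values.
grid = np.linspace(-2, 2, 9)
pdp = [model.predict(np.column_stack([np.full(n_samples, v), X2])).mean()
       for v in grid]
print(max(pdp) - min(pdp))          # small range -> essentially flat PDP
```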