Beispiel #1
0
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Sanity-check the testing helper with one plain equality assertion and one
# hashed assertion. This should print '1 test passed.' on two lines.
from databricks_test_helper import Test

twelve = 12

# Direct comparison of the value with the literal 12.
Test.assertEquals(twelve, 12, 'twelve should equal 12')

# Same value, checked through assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(twelve,
                        '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC **(3b) Compare lists**

# COMMAND ----------

# TEST Compare lists (2b)
# Sorting the tuples must yield the expected order; note the two (5, ...)
# entries end up with 'a' before 'b'.
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(
    sorted(unsortedList),
    [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
    'unsortedList does not sort properly')
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integers.
# For example, [2, 4, 8].
# NOTE: <FILL IN> is a template placeholder, not valid Python; this cell will
# not run until both assignments below are given list values.
vectorX = <FILL IN>
vectorY = <FILL IN>

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import the Test helper used by the checks below.
from databricks_test_helper import Test

# Each vector is checked with assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# MAGIC 
# MAGIC In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators. It is also a common operation in NumPy, as we will discuss in Part (2b).
# MAGIC 
# MAGIC The element-wise calculation is as follows: \\[ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} \\]
# MAGIC 
# MAGIC Calculate the value of \\( \mathbf{z} \\): \\[ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} \\]

# COMMAND ----------
# COMMAND ----------

# ANSWER
# Hand-calculated answers, each represented as a list of integers
# (for example, [2, 4, 8]).
vectorX, vectorY = [3, -6, 0], [4, 8, 16]

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import the Test helper used by the checks below.
from databricks_test_helper import Test

# Each vector is checked with assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# PRIVATE_TEST Scalar multiplication: vectors (1a)
# Hash checks for the vectorX and vectorY answers.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# Report the rank of the best model and keep the matching entry of `models`.
# print(...) with a single pre-formatted argument produces the same output on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

# COMMAND ----------

# TEST
# Check the model-selection results: the best RMSE (rounded to two decimals)
# and the rank selected by `best_rank`.
Test.assertEquals(
    round(min_error, 2),
    0.81,
    "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(
        round(min_error, 2)))
Test.assertEquals(
    ranks[best_rank],
    12,
    "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank]))

# Check the item, user and rating column names configured on `als`, each
# compared through assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    als.getItemCol(),
    "18f0e2357f8829fe809b2d95bc1753000dd925a6",
    "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(
    als.getUserCol(),
    "db36668fa9a19fde5c9676518f9e86c17cabf65a",
    "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(
    als.getRatingCol(),
    "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
    "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model.  Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting).  To decide how good our model is, we need to use the `test_df` dataset.  We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
Beispiel #5
0
# MAGIC **(3a) Compare with hash**
# MAGIC 
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC 
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Sanity-check the testing helper with one plain equality assertion and one
# hashed assertion. This should print '1 test passed.' on two lines.
from databricks_test_helper import Test

twelve = 12

# Direct comparison of the value with the literal 12.
Test.assertEquals(twelve, 12, 'twelve should equal 12')

# Same value, checked through assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    twelve,
    '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC **(3b) Compare lists**

# COMMAND ----------

# TEST Compare lists (2b)
# Sorting the tuples must yield the expected order; note the two (5, ...)
# entries end up with 'a' before 'b'.
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(
    sorted(unsortedList),
    [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
    'unsortedList does not sort properly')

# COMMAND ----------
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integers.
# For example, [2, 4, 8].
# NOTE: <FILL IN> is a template placeholder, not valid Python; this cell will
# not run until both assignments below are given list values.
vectorX = <FILL IN>
vectorY = <FILL IN>

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import the Test helper used by the checks below.
from databricks_test_helper import Test

# Each vector is checked with assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# MAGIC 
# MAGIC In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators. It is also a common operation in NumPy, as we will discuss in Part (2b).
# MAGIC 
# MAGIC The element-wise calculation is as follows: \\[ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} \\]
# MAGIC 
# MAGIC Calculate the value of \\( \mathbf{z} \\): \\[ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} \\]

# COMMAND ----------
  print 'For rank %s the RMSE is %s' % (rank, error)
  if error < min_error:
    min_error = error
    best_rank = err
  err += 1

# Set the best rank on `als`, report it, and keep the matching entry of
# `models`.
als.setRank(ranks[best_rank])
# print(...) with a single pre-formatted argument produces the same output on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

# COMMAND ----------

# TEST
# Check the model-selection results: the best RMSE (rounded to two decimals)
# and the rank selected by `best_rank`.
Test.assertEquals(
    round(min_error, 2),
    0.81,
    "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(
        round(min_error, 2)))
Test.assertEquals(
    ranks[best_rank],
    12,
    "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank]))

# Check the item, user and rating column names configured on `als`, each
# compared through assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    als.getItemCol(),
    "18f0e2357f8829fe809b2d95bc1753000dd925a6",
    "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(
    als.getUserCol(),
    "db36668fa9a19fde5c9676518f9e86c17cabf65a",
    "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(
    als.getRatingCol(),
    "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
    "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC 
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model.  Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting).  To decide how good our model is, we need to use the `test_df` dataset.  We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
# MAGIC 
# MAGIC The steps you should perform are:
# MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame.
# MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). We've supplied this piece of code for you.
# MAGIC * Use the previously created RMSE evaluator, `reg_eval` to evaluate the filtered DataFrame.
# COMMAND ----------

# Answers filled in for the exercise: each vector is represented as a list of
# integers (for example, [2, 4, 8]).
vectorX, vectorY = [3, -6, 0], [4, 8, 16]

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import the Test helper used by the checks below.
from databricks_test_helper import Test

# Each vector is checked with assertEqualsHashed against a fixed hash string.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# MAGIC
# MAGIC In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators. It is also a common operation in NumPy, as we will discuss in Part (2b).
# MAGIC
# MAGIC The element-wise calculation is as follows: \\[ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} \\]
# MAGIC
# MAGIC Calculate the value of \\( \mathbf{z} \\): \\[ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} \\]

# COMMAND ----------