# Databricks notebook cells: test-library sanity checks (hash compare, list compare).
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing" style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
# NOTE(review): this chunk was truncated mid-call; the closing arguments are
# restored from the identical cell that appears later in this file.
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')
# Databricks notebook cells: pluralize words with DataFrame functions, then word lengths.
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

# Append a literal 's' to every word. The result must be a single column
# named 'word' — the test below asserts pluralDF.columns == ['word'].
pluralDF = wordsDF.select(concat(wordsDF.word, lit('s')).alias('word'))
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test

# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC
# MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length

# Character count of each (pluralized) word, one row per word.
pluralLengthsDF = pluralDF.select(length(pluralDF.word).alias('length'))
pluralLengthsDF.show()
# Databricks notebook cells: top-ten error endpoints test, then unique-host count.
# TEST Top ten error endpoints (4a)
top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)]
top_10_err_expected = [
    (u'/images/NASA-logosmall.gif', 8761),
    (u'/images/KSC-logosmall.gif', 7236),
    (u'/images/MOSAIC-logosmall.gif', 5197),
    (u'/images/USA-logosmall.gif', 5157),
    (u'/images/WORLD-logosmall.gif', 5020),
    (u'/images/ksclogo-medium.gif', 4728),
    (u'/history/apollo/images/apollo-logo1.gif', 2907),
    (u'/images/launch-logo.gif', 2811),
    (u'/', 2199),
    (u'/images/ksclogosmall.gif', 1622)
]
Test.assertEquals(logs_sum_df.count(), 7675, 'incorrect count for logs_sum_df')
Test.assertEquals(top_10_err_urls, top_10_err_expected, 'incorrect Top Ten failed URLs')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (4b) Exercise: Number of Unique Hosts
# MAGIC
# MAGIC How many unique hosts are there in the entire log?
# MAGIC
# MAGIC There are multiple ways to find this. Try to find a more optimal way than grouping by 'host'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# distinct() on the single 'host' column avoids the shuffle-heavy groupBy.
# NOTE(review): assumes the parsed log DataFrame from earlier cells is named
# logs_df — confirm against the cell that parsed the log file.
unique_host_count = logs_df.select('host').distinct().count()
# Databricks notebook cells: NumPy element-wise multiply / dot product (solution) + tests.
u = np.arange(0, 5, .5)
v = np.arange(5, 10, .5)
elementWise = u * v       # element-wise product, same shape as u and v
dotProduct = u.dot(v)     # scalar inner product
# Single-argument print() is valid in both Python 2 and Python 3.
print('u: {0}'.format(u))
print('v: {0}'.format(v))
print('\nelementWise\n{0}'.format(elementWise))
print('\ndotProduct\n{0}'.format(dotProduct))

# COMMAND ----------

# TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [0., 2.75, 6., 9.75, 14.,
                                       18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# PRIVATE_TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [0., 2.75, 6., 9.75, 14.,
                                       18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Matrix math
# MAGIC With NumPy it is very easy to perform matrix math. You can use [np.matrix()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html) to generate a NumPy matrix. Just pass a two-dimensional `ndarray` or a list of lists to the function. You can perform matrix math on NumPy matrices using `*`.
# MAGIC
# MAGIC You can transpose a matrix by calling [numpy.matrix.transpose()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.transpose.html) or by using `.T` on the matrix object (e.g. `myMatrix.T`). Transposing a matrix produces a matrix where the new rows are the columns from the old matrix. For example: \\[ \begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix}^\top = \begin{bmatrix} 1 & 4 \\\ 2 & 5 \\\ 3 & 6 \end{bmatrix} \\]
# Databricks notebook cells: makePlural solution, its test, and the RDD map exercise.
# COMMAND ----------

# One way of completing the function
def makePlural(word):
    """Return *word* with an 's' appended."""
    return word + 's'

print(makePlural('cat'))

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test

# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats',
                  'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Filled per the solved duplicate of this cell elsewhere in this file:
# map the named function over every element of the base RDD.
pluralRDD = wordsRDD.map(makePlural)
print(pluralRDD.collect())

# COMMAND ----------
# Databricks notebook cells: makePlural solution, its test, and the solved RDD map.
# One way of completing the function
def makePlural(word):
    """Return *word* with an 's' appended."""
    return word + 's'

print(makePlural('cat'))

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test

# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats',
                  'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print(pluralRDD.collect())

# COMMAND ----------
# Databricks notebook cells: NumPy element-wise multiply / dot product exercise + tests.
u = np.arange(0, 5, .5)
v = np.arange(5, 10, .5)
# Filled per the solved duplicate of this cell elsewhere in this file.
elementWise = u * v       # element-wise product, same shape as u and v
dotProduct = u.dot(v)     # scalar inner product
# Single-argument print() is valid in both Python 2 and Python 3.
print('u: {0}'.format(u))
print('v: {0}'.format(v))
print('\nelementWise\n{0}'.format(elementWise))
print('\ndotProduct\n{0}'.format(dotProduct))

# COMMAND ----------

# TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [0., 2.75, 6., 9.75, 14.,
                                       18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Matrix math
# MAGIC With NumPy it is very easy to perform matrix math. You can use [np.matrix()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html) to generate a NumPy matrix. Just pass a two-dimensional `ndarray` or a list of lists to the function. You can perform matrix math on NumPy matrices using `*`.
# MAGIC
# MAGIC You can transpose a matrix by calling [numpy.matrix.transpose()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.transpose.html) or by using `.T` on the matrix object (e.g. `myMatrix.T`). Transposing a matrix produces a matrix where the new rows are the columns from the old matrix. For example: \\[ \begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix}^\top = \begin{bmatrix} 1 & 4 \\\ 2 & 5 \\\ 3 & 6 \end{bmatrix} \\]
# MAGIC
# MAGIC Inverting a matrix can be done using [numpy.linalg.inv()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.inv.html). Note that only square matrices can be inverted, and square matrices are not guaranteed to have an inverse. If the inverse exists, then multiplying a matrix by its inverse will produce the identity matrix. \\( \scriptsize ( A^{-1} A = I_n ) \\) The identity matrix \\( \scriptsize I_n \\) has ones along its diagonal and zeros elsewhere. \\[ I_n = \begin{bmatrix} 1 & 0 & 0 & ... & 0 \\\ 0 & 1 & 0 & ... & 0 \\\ 0 & 0 & 1 & ... & 0 \\\ ... & ... & ... & ... & ... \\\ 0 & 0 & 0 & ... & 1 \end{bmatrix} \\]
# MAGIC
# MAGIC For this exercise, multiply \\( A \\) times its transpose \\( ( A^\top ) \\) and then calculate the inverse of the result \\( ( [ A A^\top ]^{-1} ) \\).

# COMMAND ----------
# Databricks notebook cells: join movie averages with titles, then the test cell.
# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df.movieId == movies_df.ID)
movie_names_with_avg_ratings_df = movie_names_df.select(
    'average', 'title', 'count', 'movieId')

print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))
# NOTE(review): this chunk was truncated at the next call; its arguments and the
# remaining _take_1/_take_2 checks are restored from the identical test cell
# that appears later in this file.
Test.assertTrue(
    _take_1[0] == 2 and _take_1[1] == 22243,
    'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(
        _take_1[0]))
Test.assertEquals(
    round(_take_1[2], 2), 3.21,
    "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))
Test.assertTrue(
    _take_2[0] == 3 and _take_2[1] == 12735,
    'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(
        _take_2[0]))
Test.assertEquals(
    round(_take_2[2], 2), 3.15,
    "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))
# Databricks notebook cells: test-library sanity checks (hash compare, list compare).
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing" style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')
# Databricks notebook cell: sanity-check the testing library via a hashed compare.
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')
# Databricks notebook cells: per-movie rating count/average, join with titles, tests.
# Group ratings by movieId; 'count' and 'average' column names are required by
# the tests below.
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(
    F.count(ratings_df.rating).alias("count"),
    F.avg(ratings_df.rating).alias("average"))
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
# Drop the duplicate join key from the movies side.
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))
Test.assertTrue(
    _take_1[0] == 2 and _take_1[1] == 22243,
    'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(
        _take_1[0]))
Test.assertEquals(
    round(_take_1[2], 2), 3.21,
    "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))
Test.assertTrue(
    _take_2[0] == 3 and _take_2[1] == 12735,
    'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(
        _take_2[0]))
Test.assertEquals(
    round(_take_2[2], 2), 3.15,
    "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))
# Databricks notebook cell: sanity-check the testing library via a hashed compare.
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')