# MAGIC %md # MAGIC ** (3a) Compare with hash ** # MAGIC # MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a). # MAGIC # MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing" style="width: 600px;"/> # COMMAND ---------- # TEST Compare with hash (2a) # Check our testing library/package # This should print '1 test passed.' on two lines from databricks_test_helper import Test twelve = 12 Test.assertEquals(twelve, 12, 'twelve should equal 12') Test.assertEqualsHashed( twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554', 'twelve, once hashed, should equal the hashed value of 12') # COMMAND ---------- # MAGIC %md # MAGIC ** (3b) Compare lists ** # COMMAND ---------- # TEST Compare lists (2b) # This should print '1 test passed.' unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')] Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'),
# COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import lit, concat pluralDF = wordsDF.<FILL IN> pluralDF.show() # COMMAND ---------- # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from databricks_test_helper import Test # TEST Using DataFrame functions to add an 's' (1b) Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s') Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'") # COMMAND ---------- # MAGIC %md # MAGIC ** (1c) Length of each word ** # MAGIC # MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import length pluralLengthsDF = pluralDF.<FILL IN> pluralLengthsDF.show()
# TEST Top ten error endpoints (4a) top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)] top_10_err_expected = [ (u'/images/NASA-logosmall.gif', 8761), (u'/images/KSC-logosmall.gif', 7236), (u'/images/MOSAIC-logosmall.gif', 5197), (u'/images/USA-logosmall.gif', 5157), (u'/images/WORLD-logosmall.gif', 5020), (u'/images/ksclogo-medium.gif', 4728), (u'/history/apollo/images/apollo-logo1.gif', 2907), (u'/images/launch-logo.gif', 2811), (u'/', 2199), (u'/images/ksclogosmall.gif', 1622) ] Test.assertEquals(logs_sum_df.count(), 7675, 'incorrect count for logs_sum_df') Test.assertEquals(top_10_err_urls, top_10_err_expected, 'incorrect Top Ten failed URLs') # COMMAND ---------- # MAGIC %md # MAGIC ### (4b) Exercise: Number of Unique Hosts # MAGIC # MAGIC How many unique hosts are there in the entire log? # MAGIC # MAGIC There are multiple ways to find this. Try to find a more optimal way than grouping by 'host'. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code unique_host_count = <FILL IN>
# COMMAND ----------

# ANSWER
# Work the result out by hand and enter each vector as a plain list of
# integers, for example [2, 4, 8].
vectorX = [3, -6, 0]
vectorY = [4, 8, 16]

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Bring in the course test helper used by the checks below.
from databricks_test_helper import Test

# Each vector is compared against a stored hash instead of a literal value.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# PRIVATE_TEST Scalar multiplication: vectors (1a)
# Same two hashed comparisons as the public test cell above.
Test.assertEqualsHashed(
    vectorX,
    'e460f5b87531a2b60e0f55c31b2e49914f779981',
    'incorrect value for vectorX')
Test.assertEqualsHashed(
    vectorY,
    'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
    'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# COMMAND ---------- # One way of completing the function def makePlural(word): return word + 's' print makePlural('cat') # COMMAND ---------- # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from databricks_test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s') # COMMAND ---------- # MAGIC %md # MAGIC ### (1c) Apply `makePlural` to the base RDD # MAGIC # MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code pluralRDD = wordsRDD.map(<FILL IN>) print pluralRDD.collect() # COMMAND ----------
# One way of completing the function
def makePlural(word):
    """Return ``word`` with the letter 's' appended.

    Args:
        word: The string to pluralize.

    Returns:
        The result of ``word + 's'``.
    """
    return word + 's'

# A parenthesized single-argument print produces the same output whether it is
# parsed as a Python 2 print statement or as the Python 3 print() function.
print(makePlural('cat'))

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Apply makePlural to every element of wordsRDD, then collect the results so
# they can be printed.
pluralRDD = wordsRDD.map(makePlural)
print(pluralRDD.collect())

# COMMAND ----------
# COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code # Manually calculate your answer and represent the vector as a list of integers. # For example, [2, 4, 8]. vectorX = <FILL IN> vectorY = <FILL IN> # COMMAND ---------- # TEST Scalar multiplication: vectors (1a) # Import test library from databricks_test_helper import Test Test.assertEqualsHashed(vectorX, 'e460f5b87531a2b60e0f55c31b2e49914f779981', 'incorrect value for vectorX') Test.assertEqualsHashed(vectorY, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e', 'incorrect value for vectorY') # COMMAND ---------- # MAGIC %md # MAGIC ### (1b) Element-wise multiplication: vectors # MAGIC # MAGIC In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below. You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together. Note we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators. It is also a common operation in NumPy, as we will discuss in Part (2b). # MAGIC # MAGIC The element-wise calculation is as follows: \\[ \mathbf{x} \odot \mathbf{y} = \begin{bmatrix} x_1 y_1 \\\ x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} \\] # MAGIC # MAGIC Calculate the value of \\( \mathbf{z} \\): \\[ \mathbf{z} = \begin{bmatrix} 1 \\\ 2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\ 5 \\\ 6 \end{bmatrix} \\] # COMMAND ----------
# Note: movie_names_df is a temporary variable, used only to separate the steps necessary # to create the movie_names_with_avg_ratings_df DataFrame. movie_names_df = movie_ids_with_avg_ratings_df.join( movies_df, movie_ids_with_avg_ratings_df.movieId == movies_df.ID) movie_names_with_avg_ratings_df = movie_names_df.select( 'average', 'title', 'count', 'movieId') print 'movie_names_with_avg_ratings_df:' movie_names_with_avg_ratings_df.show(3, truncate=False) # COMMAND ---------- # TEST Movies with Highest Average Ratings (1a) Test.assertEquals( movie_ids_with_avg_ratings_df.count(), 26744, 'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)') movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy( 'MovieID').take(3) _take_0 = movie_ids_with_ratings_take_ordered[0] _take_1 = movie_ids_with_ratings_take_ordered[1] _take_2 = movie_ids_with_ratings_take_ordered[2] Test.assertTrue( _take_0[0] == 1 and _take_0[1] == 49695, 'incorrect count of ratings for movie with ID {0} (expected 49695)'.format( _take_0[0])) Test.assertEquals( round(_take_0[2], 2), 3.92, "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0])) Test.assertTrue(
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing" style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Sanity check of the testing library/package.
# Expected output: '1 test passed.' printed on two lines, one per assertion.
from databricks_test_helper import Test

twelve = 12
# Plain equality first, then the same value checked through the hashed variant.
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve,
    '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# Expected output: '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
# sorted() returns a new list; unsortedList itself is left as written above.
Test.assertEquals(
    sorted(unsortedList),
    [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
    'unsortedList does not sort properly')
# TEST Compare with hash (2a)
# Sanity check of the testing library/package.
# Expected output: '1 test passed.' printed on two lines, one per assertion.
from databricks_test_helper import Test

twelve = 12

# Direct comparison against the literal value.
Test.assertEquals(twelve, 12, 'twelve should equal 12')

# Same value, compared through the helper's hashed check.
Test.assertEqualsHashed(twelve,
                        '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')
# Aggregate ratings per movie: one row per movieId carrying the number of
# ratings ("count") and their mean ("average"), in that column order.
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(
    F.count(ratings_df.rating).alias("count"),
    F.avg(ratings_df.rating).alias("average"))
# Parenthesized single-argument print works as a Python 2 print statement and
# as the Python 3 print() function.
print('movie_ids_with_avg_ratings_df:')
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df,
    movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
# Drop the join key contributed by movies_df; all other columns are kept.
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
# NOTE(review): the sort column is spelled 'MovieID' while the DataFrame was
# grouped on 'movieId'; this presumably relies on case-insensitive column
# resolution -- confirm against the cluster configuration.
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
# Rows are read positionally: [0] movie ID, [1] rating count, [2] average.
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))
Test.assertTrue(
    _take_1[0] == 2 and _take_1[1] == 22243,
    'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(
        _take_1[0]))
Test.assertEquals(
    round(_take_1[2], 2), 3.21,
    "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))
Test.assertTrue(
    _take_2[0] == 3 and _take_2[1] == 12735,
    'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(
        _take_2[0]))
Test.assertEquals(
    round(_take_2[2], 2), 3.15,
    "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))
# TEST Compare with hash (2a)
# Exercises the testing library/package with one plain and one hashed check.
# Expected output: '1 test passed.' printed on two lines.
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(
    twelve,
    12,
    'twelve should equal 12')
Test.assertEqualsHashed(
    twelve,
    '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')