# MAGIC > * Do not use the default implementation of `split()`, but pass in a separator value. For example, to split `line` by commas you would use `line.split(',')`. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code shakespeareWordsRDD = shakespeareRDD.<FILL_IN> shakespeareWordCount = shakespeareWordsRDD.count() print shakespeareWordsRDD.top(5) print shakespeareWordCount # COMMAND ---------- # TEST Words from lines (4d) # This test allows for leading spaces to be removed either before or after # punctuation is removed. Test.assertTrue(shakespeareWordCount == 927631 or shakespeareWordCount == 928908, 'incorrect value for shakespeareWordCount') Test.assertEquals(shakespeareWordsRDD.top(5), [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'], 'incorrect value for shakespeareWordsRDD') # COMMAND ---------- # MAGIC %md # MAGIC ** (4e) Remove empty elements ** # MAGIC # MAGIC The next step is to filter out the empty elements. Remove all entries where the word is `''`. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code shakeWordsRDD = shakespeareWordsRDD.<FILL_IN>
daily_hosts_df = day_group_hosts_df.<FILL IN> print 'Unique hosts per day:' daily_hosts_df.show(30, False) # COMMAND ---------- # TEST Number of unique daily hosts (4c) daily_hosts_list = (daily_hosts_df .map(lambda r: (r[0], r[1])) .take(30)) Test.assertEquals(day_to_host_pair_df.count(), total_log_entries, 'incorrect row count for day_to_host_pair_df') Test.assertEquals(daily_hosts_df.count(), 21, 'incorrect daily_hosts_df.count()') Test.assertEquals(daily_hosts_list, [(1, 2582), (3, 3222), (4, 4190), (5, 2502), (6, 2537), (7, 4106), (8, 4406), (9, 4317), (10, 4523), (11, 4346), (12, 2864), (13, 2650), (14, 4454), (15, 4214), (16, 4340), (17, 4385), (18, 4168), (19, 2550), (20, 2560), (21, 4134), (22, 4456)], 'incorrect daily_hosts_df') Test.assertTrue(daily_hosts_df.is_cached, 'incorrect daily_hosts_df.is_cached') # COMMAND ---------- # MAGIC %md # MAGIC ### (4d) Exercise: Visualizing the Number of Unique Daily Hosts # MAGIC # MAGIC Using the results from the previous exercise, we will use `matplotlib` to plot a line graph of the number of unique hosts making requests each day. We need a list of days called `days_with_hosts` and a list of the number of unique hosts for each corresponding day called `hosts`. # MAGIC # MAGIC **WARNING**: Simply calling `collect()` on your transformed DataFrame won't work, because `collect()` returns a list of Spark SQL `Row` objects. You must _extract_ the appropriate column values from the `Row` objects. Hint: A loop will help. # COMMAND ---------- # TODO: Your solution goes here days_with_hosts =
# NOTE(review): timesFive is presumably assigned by the preceding cell, which
# is not visible here. The parenthesized single-argument print behaves the
# same under Python 2 and Python 3.
print('\ntimesFive\n{0}'.format(timesFive))

# COMMAND ----------

# ANSWER
# Create a numpy array with the values 1, 2, 3
simpleArray = np.array([1, 2, 3])
# Perform the scalar product of 5 and the numpy array
timesFive = 5 * simpleArray
print('simpleArray\n{0}'.format(simpleArray))
print('\ntimesFive\n{0}'.format(timesFive))

# COMMAND ----------

# TEST Scalar multiplication (2a)
# The comparison against the list is element-wise; np.all collapses it to the
# single truth value that assertTrue expects.
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')

# COMMAND ----------

# PRIVATE_TEST Scalar multiplication (2a)
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2b) Element-wise multiplication and dot product
# MAGIC
# MAGIC NumPy arrays support both element-wise multiplication and dot product. Element-wise multiplication occurs automatically when you use the `*` operator to multiply two `ndarray` objects of the same length.
# MAGIC
# MAGIC To perform the dot product you can use either [np.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html#numpy.dot) or [np.ndarray.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.dot.html). For example, if you had NumPy arrays `x` and `y`, you could compute their dot product four ways: `np.dot(x, y)`, `np.dot(y, x)`, `x.dot(y)`, or `y.dot(x)`.
# MAGIC
import numpy as np # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code # Create a numpy array with the values 1, 2, 3 simpleArray = <FILL IN> # Perform the scalar product of 5 and the numpy array timesFive = <FILL IN> print 'simpleArray\n{0}'.format(simpleArray) print '\ntimesFive\n{0}'.format(timesFive) # COMMAND ---------- # TEST Scalar multiplication (2a) Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive') # COMMAND ---------- # MAGIC %md # MAGIC ### (2b) Element-wise multiplication and dot product # MAGIC # MAGIC NumPy arrays support both element-wise multiplication and dot product. Element-wise multiplication occurs automatically when you use the `*` operator to multiply two `ndarray` objects of the same length. # MAGIC # MAGIC To perform the dot product you can use either [np.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html#numpy.dot) or [np.ndarray.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.dot.html). For example, if you had NumPy arrays `x` and `y`, you could compute their dot product four ways: `np.dot(x, y)`, `np.dot(y, x)`, `x.dot(y)`, or `y.dot(x)`. # MAGIC # MAGIC For this exercise, multiply the arrays `u` and `v` element-wise and compute their dot product. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Split each line on a single space (an explicit separator rather than the
# default whitespace split) and let flatMap merge the per-line word lists
# into one collection of words. Splitting on ' ' can yield empty strings.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda ln: ln.split(' '))
shakespeareWordCount = shakespeareWordsRDD.count()
# The parenthesized single-argument print works on both Python 2 and Python 3.
print(shakespeareWordsRDD.top(5))
print(shakespeareWordCount)

# COMMAND ----------

# TEST Words from lines (4d)
# This test allows for leading spaces to be removed either before or after
# punctuation is removed.
Test.assertTrue(
    shakespeareWordCount == 927631 or shakespeareWordCount == 928908,
    'incorrect value for shakespeareWordCount')
Test.assertEquals(
    shakespeareWordsRDD.top(5),
    [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
    'incorrect value for shakespeareWordsRDD')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Remove empty elements **
# MAGIC
# MAGIC The next step is to filter out the empty elements. Remove all entries where the word is `''`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
print 'movie_names_with_avg_ratings_df:' movie_names_with_avg_ratings_df.show(3, truncate=False) # COMMAND ---------- # TEST Movies with Highest Average Ratings (1a) Test.assertEquals( movie_ids_with_avg_ratings_df.count(), 26744, 'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)') movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy( 'MovieID').take(3) _take_0 = movie_ids_with_ratings_take_ordered[0] _take_1 = movie_ids_with_ratings_take_ordered[1] _take_2 = movie_ids_with_ratings_take_ordered[2] Test.assertTrue( _take_0[0] == 1 and _take_0[1] == 49695, 'incorrect count of ratings for movie with ID {0} (expected 49695)'.format( _take_0[0])) Test.assertEquals( round(_take_0[2], 2), 3.92, "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0])) Test.assertTrue( _take_1[0] == 2 and _take_1[1] == 22243, 'incorrect count of ratings for movie with ID {0} (expected 22243)'.format( _take_1[0])) Test.assertEquals( round(_take_1[2], 2), 3.21, "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0])) Test.assertTrue( _take_2[0] == 3 and _take_2[1] == 12735,
# Join the rating aggregates with movies_df where movieId equals Id, then
# drop the "Id" column that the join brings in from movies_df.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df,
    movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

# The parenthesized single-argument print works on both Python 2 and Python 3.
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')

# Check the first three rows by movie ID. Positional access below treats
# index 0 as the movie ID, 1 as the rating count and 2 as the average.
# NOTE(review): orderBy uses 'MovieID' while the join uses "movieId"; this
# relies on the engine resolving column names case-insensitively -- confirm.
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]

Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
# Averages are rounded to two decimals before comparison.
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))
Test.assertTrue(
    _take_1[0] == 2 and _take_1[1] == 22243,
    'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(
        _take_1[0]))
Test.assertEquals(
    round(_take_1[2], 2), 3.21,
    "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))
Test.assertTrue(
    _take_2[0] == 3 and _take_2[1] == 12735,
    'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(
        _take_2[0]))
Test.assertEquals(
    round(_take_2[2], 2), 3.15,
    "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))

Test.assertEquals(
    movie_names_with_avg_ratings_df.count(), 26744,
    'incorrect movie_names_with_avg_ratings_df.count() (expected 26744)')
# Order by average, then title, and keep the first three rows as plain tuples.
movie_names_with_ratings_take_ordered = (
    movie_names_with_avg_ratings_df.orderBy(['average', 'title']).take(3))
result = [(r['average'], r['title'], r['count'], r['movieId'])
          for r in movie_names_with_ratings_take_ordered]