# MAGIC > * Do not use the default implementation of `split()`, but pass in a separator value.  For example, to split `line` by commas you would use `line.split(',')`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# NOTE: `<FILL_IN>` below is an exercise placeholder, not valid Python; this
# cell cannot run until it is replaced with a call on shakespeareRDD.
shakespeareWordsRDD = shakespeareRDD.<FILL_IN>
# Total number of elements in the resulting RDD.
shakespeareWordCount = shakespeareWordsRDD.count()
# Print the result of top(5), then the element count, for a quick visual check.
print shakespeareWordsRDD.top(5)
print shakespeareWordCount

# COMMAND ----------

# TEST Words from lines (4d)
# Two totals are accepted because leading spaces may have been removed either
# before or after punctuation was removed.
Test.assertTrue(
    shakespeareWordCount in (927631, 928908),
    'incorrect value for shakespeareWordCount')
# The top five entries are one 'zwaggerd' followed by four 'zounds'.
Test.assertEquals(
    shakespeareWordsRDD.top(5),
    [u'zwaggerd'] + [u'zounds'] * 4,
    'incorrect value for shakespeareWordsRDD')

# COMMAND ----------

# MAGIC %md
# MAGIC **(4e) Remove empty elements**
# MAGIC 
# MAGIC The next step is to filter out the empty elements.  Remove all entries where the word is `''`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# NOTE: `<FILL_IN>` below is an exercise placeholder, not valid Python; it must
# be replaced with a call on shakespeareWordsRDD before this cell can run.
shakeWordsRDD = shakespeareWordsRDD.<FILL_IN>
Esempio n. 2
0
# NOTE: `<FILL IN>` below is an exercise placeholder, not valid Python; it must
# be replaced with a call on day_group_hosts_df before this cell can run.
daily_hosts_df = day_group_hosts_df.<FILL IN>

print 'Unique hosts per day:'
# Display up to 30 rows; the second argument (False) turns off truncation of
# long cell values.
daily_hosts_df.show(30, False)

# COMMAND ----------

# TEST Number of unique daily hosts (4c)
# Turn the first 30 rows into plain two-element tuples (first column, second
# column) so they can be compared against the expected list below.  The rows
# are fetched with DataFrame.take() and unpacked locally rather than through
# DataFrame.map(), which exists only in Spark 1.x; the resulting list is the
# same.
daily_hosts_list = [(r[0], r[1]) for r in daily_hosts_df.take(30)]

Test.assertEquals(day_to_host_pair_df.count(), total_log_entries,
                  'incorrect row count for day_to_host_pair_df')
Test.assertEquals(daily_hosts_df.count(), 21,
                  'incorrect daily_hosts_df.count()')
Test.assertEquals(
    daily_hosts_list,
    [(1, 2582), (3, 3222), (4, 4190), (5, 2502), (6, 2537), (7, 4106),
     (8, 4406), (9, 4317), (10, 4523), (11, 4346), (12, 2864), (13, 2650),
     (14, 4454), (15, 4214), (16, 4340), (17, 4385), (18, 4168), (19, 2550),
     (20, 2560), (21, 4134), (22, 4456)],
    'incorrect daily_hosts_df')
# The exercise also expects the DataFrame to have been cached.
Test.assertTrue(daily_hosts_df.is_cached,
                'incorrect daily_hosts_df.is_cached')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (4d) Exercise: Visualizing the Number of Unique Daily Hosts
# MAGIC 
# MAGIC Using the results from the previous exercise, we will use `matplotlib` to plot a line graph of the unique hosts requests by day.  We need a list of days called `days_with_hosts` and a list of the number of unique hosts for each corresponding day called `hosts`.
# MAGIC 
# MAGIC **WARNING**: Simply calling `collect()` on your transformed DataFrame won't work, because `collect()` returns a list of Spark SQL `Row` objects. You must _extract_ the appropriate column values from the `Row` objects. Hint: A loop will help.

# COMMAND ----------

# TODO: Your solution goes here

days_with_hosts =
print '\ntimesFive\n{0}'.format(timesFive)

# COMMAND ----------

# ANSWER
# Create a numpy array with the values 1, 2, 3
simpleArray = np.array([1, 2, 3])
# Perform the scalar product of 5 and the numpy array
timesFive = 5 * simpleArray
# print(...) with a single parenthesized argument produces the same output
# under the Python 2 print statement and the Python 3 print function, so this
# cell runs unchanged on either interpreter.
print('simpleArray\n{0}'.format(simpleArray))
print('\ntimesFive\n{0}'.format(timesFive))

# COMMAND ----------

# TEST Scalar multiplication (2a)
# Compare timesFive with the expected values (element-wise, assuming timesFive
# is a NumPy array); np.all() passes only if every comparison is true.
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')

# COMMAND ----------

# PRIVATE_TEST Scalar multiplication (2a)
# Same check as the public test: compare timesFive with the expected values
# (element-wise, assuming timesFive is a NumPy array) and require all to match.
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2b) Element-wise multiplication and dot product
# MAGIC 
# MAGIC NumPy arrays support both element-wise multiplication and dot product.  Element-wise multiplication occurs automatically when you use the `*` operator to multiply two `ndarray` objects of the same length.
# MAGIC 
# MAGIC To perform the dot product you can use either [np.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html#numpy.dot) or [np.ndarray.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.dot.html).  For example, if you had NumPy arrays `x` and `y`, you could compute their dot product four ways: `np.dot(x, y)`, `np.dot(y, x)`, `x.dot(y)`, or `y.dot(x)`.
# MAGIC 
import numpy as np

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# NOTE: each `<FILL IN>` below is an exercise placeholder, not valid Python;
# this cell cannot run until both have been replaced.
# Create a numpy array with the values 1, 2, 3
simpleArray = <FILL IN>
# Perform the scalar product of 5 and the numpy array
timesFive = <FILL IN>
# Print each value under a label naming its variable.
print 'simpleArray\n{0}'.format(simpleArray)
print '\ntimesFive\n{0}'.format(timesFive)

# COMMAND ----------

# TEST Scalar multiplication (2a)
# Compare timesFive with the expected values (element-wise, assuming timesFive
# is a NumPy array); np.all() passes only if every comparison is true.
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2b) Element-wise multiplication and dot product
# MAGIC 
# MAGIC NumPy arrays support both element-wise multiplication and dot product.  Element-wise multiplication occurs automatically when you use the `*` operator to multiply two `ndarray` objects of the same length.
# MAGIC 
# MAGIC To perform the dot product you can use either [np.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html#numpy.dot) or [np.ndarray.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.dot.html).  For example, if you had NumPy arrays `x` and `y`, you could compute their dot product four ways: `np.dot(x, y)`, `np.dot(y, x)`, `x.dot(y)`, or `y.dot(x)`.
# MAGIC 
# MAGIC For this exercise, multiply the arrays `u` and `v` element-wise and compute their dot product.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# COMMAND ----------

# Split every line on a single space and flatten the pieces into one RDD of
# words.  An explicit separator is passed to split(), so consecutive spaces
# produce empty-string elements.
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda ln: ln.split(' '))
# Total number of elements, including any empty strings.
shakespeareWordCount = shakespeareWordsRDD.count()
# print(...) with a single parenthesized argument produces the same output
# under the Python 2 print statement and the Python 3 print function.
print(shakespeareWordsRDD.top(5))
print(shakespeareWordCount)

# COMMAND ----------

# TEST Words from lines (4d)
# Either total is valid, depending on whether leading spaces were removed
# before or after punctuation was removed.
Test.assertTrue(shakespeareWordCount in (927631, 928908),
                'incorrect value for shakespeareWordCount')
# Expect one 'zwaggerd' followed by four 'zounds' from top(5).
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwaggerd'] + 4 * [u'zounds'],
                  'incorrect value for shakespeareWordsRDD')

# COMMAND ----------

# MAGIC %md
# MAGIC **(4e) Remove empty elements**
# MAGIC
# MAGIC The next step is to filter out the empty elements.  Remove all entries where the word is `''`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Label the output, then display the first three rows without truncating long
# cell values.  print(...) with a single parenthesized argument produces the
# same output under the Python 2 print statement and the Python 3 print
# function.
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
# The DataFrame is expected to contain 26744 rows.
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
# Take the first three rows when ordered by MovieID and spot-check each one.
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
# Rows are read positionally: per the messages below, [0] is checked as the
# movie ID, [1] as the ratings count, and [2] as the average (rounded to two
# decimals before comparison).
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(
    _take_1[0] == 2 and _take_1[1] == 22243,
    'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(
        _take_1[0]))
Test.assertEquals(
    round(_take_1[2], 2), 3.21,
    "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))

Test.assertTrue(
    _take_2[0] == 3 and _take_2[1] == 12735,
# Attach the movies_df columns to each average-rating row by matching movieId
# against movies_df's Id column, then drop Id, which is redundant with movieId
# after the join.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df,
    movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

# print(...) with a single parenthesized argument produces the same output
# under the Python 2 print statement and the Python 3 print function.
print('movie_names_with_avg_ratings_df:')
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
# The DataFrame is expected to contain 26744 rows.
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
                'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
# Take the first three rows when ordered by MovieID and spot-check each one.
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy('MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
# Rows are read positionally: per the messages below, [0] is checked as the
# movie ID, [1] as the ratings count, and [2] as the average (rounded to two
# decimals before comparison).
Test.assertTrue(_take_0[0] == 1 and _take_0[1] == 49695,
                'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(_take_0[0]))
Test.assertEquals(round(_take_0[2], 2), 3.92, "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(_take_1[0] == 2 and _take_1[1] == 22243,
                'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(_take_1[0]))
Test.assertEquals(round(_take_1[2], 2), 3.21, "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))

Test.assertTrue(_take_2[0] == 3 and _take_2[1] == 12735,
                'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(_take_2[0]))
Test.assertEquals(round(_take_2[2], 2), 3.15, "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))


# The joined DataFrame must keep the same row count as the ratings DataFrame.
Test.assertEquals(movie_names_with_avg_ratings_df.count(), 26744,
                  'incorrect movie_names_with_avg_ratings_df.count() (expected 26744)')
# Order by average, then title, and keep the first three rows.
movie_names_with_ratings_take_ordered = movie_names_with_avg_ratings_df.orderBy(['average', 'title']).take(3)
# Flatten each row into an (average, title, count, movieId) tuple.
result = [(r['average'], r['title'], r['count'], r['movieId']) for r in movie_names_with_ratings_take_ordered]