Example #1
0
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'),
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

pluralDF = wordsDF.<FILL IN>
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.<FILL IN>
pluralLengthsDF.show()
Example #3
0
# TEST Top ten error endpoints (4a)
top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)]
top_10_err_expected = [
  (u'/images/NASA-logosmall.gif', 8761),
  (u'/images/KSC-logosmall.gif', 7236),
  (u'/images/MOSAIC-logosmall.gif', 5197),
  (u'/images/USA-logosmall.gif', 5157),
  (u'/images/WORLD-logosmall.gif', 5020),
  (u'/images/ksclogo-medium.gif', 4728),
  (u'/history/apollo/images/apollo-logo1.gif', 2907),
  (u'/images/launch-logo.gif', 2811),
  (u'/', 2199),
  (u'/images/ksclogosmall.gif', 1622)
]
Test.assertEquals(logs_sum_df.count(), 7675, 'incorrect count for logs_sum_df')
Test.assertEquals(top_10_err_urls, top_10_err_expected, 'incorrect Top Ten failed URLs')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (4b) Exercise: Number of Unique Hosts
# MAGIC 
# MAGIC How many unique hosts are there in the entire log?
# MAGIC 
# MAGIC There are multiple ways to find this.  Try to find a more optimal way than grouping by 'host'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
unique_host_count = <FILL IN>
u = np.arange(0, 5, .5)
v = np.arange(5, 10, .5)

elementWise = u * v
dotProduct = u.dot(v)
print 'u: {0}'.format(u)
print 'v: {0}'.format(v)
print '\nelementWise\n{0}'.format(elementWise)
print '\ndotProduct\n{0}'.format(dotProduct)

# COMMAND ----------

# TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [ 0., 2.75, 6., 9.75, 14., 18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# PRIVATE_TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [ 0., 2.75, 6., 9.75, 14., 18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Matrix math
# MAGIC With NumPy it is very easy to perform matrix math.  You can use [np.matrix()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html) to generate a NumPy matrix.  Just pass a two-dimensional `ndarray` or a list of lists to the function.  You can perform matrix math on NumPy matrices using `*`.
# MAGIC 
# MAGIC You can transpose a matrix by calling [numpy.matrix.transpose()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.transpose.html) or by using `.T` on the matrix object (e.g. `myMatrix.T`).  Transposing a matrix produces a matrix where the new rows are the columns from the old matrix. For example: \\[  \begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix}^\top = \begin{bmatrix} 1 & 4 \\\ 2 & 5 \\\ 3 & 6 \end{bmatrix} \\]
# COMMAND ----------

# One way of completing the function
def makePlural(word):
    return word + 's'

print makePlural('cat')

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC 
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(<FILL IN>)
print pluralRDD.collect()

# COMMAND ----------
# One way of completing the function
def makePlural(word):
    return word + 's'


print makePlural('cat')

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats',
                  'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()

# COMMAND ----------
u = np.arange(0, 5, .5)
v = np.arange(5, 10, .5)

elementWise = <FILL IN>
dotProduct = <FILL IN>
print 'u: {0}'.format(u)
print 'v: {0}'.format(v)
print '\nelementWise\n{0}'.format(elementWise)
print '\ndotProduct\n{0}'.format(dotProduct)

# COMMAND ----------

# TEST Element-wise multiplication and dot product (2b)
Test.assertTrue(np.all(elementWise == [ 0., 2.75, 6., 9.75, 14., 18.75, 24., 29.75, 36., 42.75]),
                'incorrect value for elementWise')
Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Matrix math
# MAGIC With NumPy it is very easy to perform matrix math.  You can use [np.matrix()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html) to generate a NumPy matrix.  Just pass a two-dimensional `ndarray` or a list of lists to the function.  You can perform matrix math on NumPy matrices using `*`.
# MAGIC 
# MAGIC You can transpose a matrix by calling [numpy.matrix.transpose()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.transpose.html) or by using `.T` on the matrix object (e.g. `myMatrix.T`).  Transposing a matrix produces a matrix where the new rows are the columns from the old matrix. For example: \\[  \begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix}^\top = \begin{bmatrix} 1 & 4 \\\ 2 & 5 \\\ 3 & 6 \end{bmatrix} \\]
# MAGIC 
# MAGIC Inverting a matrix can be done using [numpy.linalg.inv()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.inv.html).  Note that only square matrices can be inverted, and square matrices are not guaranteed to have an inverse.  If the inverse exists, then multiplying a matrix by its inverse will produce the identity matrix.  \\( \scriptsize ( A^{-1} A = I_n ) \\)  The identity matrix \\( \scriptsize I_n \\) has ones along its diagonal and zeros elsewhere. \\[ I_n = \begin{bmatrix} 1 & 0 & 0 & ... & 0 \\\ 0 & 1 & 0 & ... & 0 \\\ 0 & 0 & 1 & ... & 0 \\\ ... & ... & ... & ... & ... \\\ 0 & 0 & 0 & ... & 1 \end{bmatrix} \\]
# MAGIC 
# MAGIC For this exercise, multiply \\( A \\) times its transpose \\( ( A^\top ) \\) and then calculate the inverse of the result \\( (  [ A A^\top ]^{-1}  ) \\).

# COMMAND ----------
# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df.movieId == movies_df.ID)
movie_names_with_avg_ratings_df = movie_names_df.select(
    'average', 'title', 'count', 'movieId')

print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(
Example #9
0
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC 
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC 
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines

from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(F.count(ratings_df.rating).alias("count"), F.avg(ratings_df.rating).alias("average"))
print 'movie_ids_with_avg_ratings_df:'
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(movies_df,movie_ids_with_avg_ratings_df["movieId"]==movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
                'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy('MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(_take_0[0] == 1 and _take_0[1] == 49695,
                'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(_take_0[0]))
Test.assertEquals(round(_take_0[2], 2), 3.92, "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(_take_1[0] == 2 and _take_1[1] == 22243,
                'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(_take_1[0]))
Test.assertEquals(round(_take_1[2], 2), 3.21, "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))

Test.assertTrue(_take_2[0] == 3 and _take_2[1] == 12735,
                'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(_take_2[0]))
Test.assertEquals(round(_take_2[2], 2), 3.15, "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines

from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')