Example #1
0
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'),
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

pluralDF = wordsDF.<FILL IN>
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (1c) Length of each word **
# MAGIC 
# MAGIC Now use the SQL `length` function to find the number of characters in each word.  The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import length
pluralLengthsDF = pluralDF.<FILL IN>
pluralLengthsDF.show()
Example #3
0
# TEST Top ten error endpoints (4a)
top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)]
top_10_err_expected = [
  (u'/images/NASA-logosmall.gif', 8761),
  (u'/images/KSC-logosmall.gif', 7236),
  (u'/images/MOSAIC-logosmall.gif', 5197),
  (u'/images/USA-logosmall.gif', 5157),
  (u'/images/WORLD-logosmall.gif', 5020),
  (u'/images/ksclogo-medium.gif', 4728),
  (u'/history/apollo/images/apollo-logo1.gif', 2907),
  (u'/images/launch-logo.gif', 2811),
  (u'/', 2199),
  (u'/images/ksclogosmall.gif', 1622)
]
Test.assertEquals(logs_sum_df.count(), 7675, 'incorrect count for logs_sum_df')
Test.assertEquals(top_10_err_urls, top_10_err_expected, 'incorrect Top Ten failed URLs')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (4b) Exercise: Number of Unique Hosts
# MAGIC 
# MAGIC How many unique hosts are there in the entire log?
# MAGIC 
# MAGIC There are multiple ways to find this.  Try to find a more optimal way than grouping by 'host'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
unique_host_count = <FILL IN>
# COMMAND ----------

# ANSWER
# Manually calculate your answer and represent the vector as a list of integers.
# For example, [2, 4, 8].
vectorX = [3, -6, 0]
vectorY = [4, 8, 16]

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import test library
from databricks_test_helper import Test

Test.assertEqualsHashed(vectorX, 'e460f5b87531a2b60e0f55c31b2e49914f779981',
                        'incorrect value for vectorX')
Test.assertEqualsHashed(vectorY, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
                        'incorrect value for vectorY')

# COMMAND ----------

# PRIVATE_TEST Scalar multiplication: vectors (1a)
Test.assertEqualsHashed(vectorX, 'e460f5b87531a2b60e0f55c31b2e49914f779981',
                        'incorrect value for vectorX')
Test.assertEqualsHashed(vectorY, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
                        'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# COMMAND ----------

# One way of completing the function
def makePlural(word):
    return word + 's'

print makePlural('cat')

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC 
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(<FILL IN>)
print pluralRDD.collect()

# COMMAND ----------
# One way of completing the function
def makePlural(word):
    return word + 's'


print makePlural('cat')

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats',
                  'incorrect result: makePlural does not add an s')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1c) Apply `makePlural` to the base RDD
# MAGIC
# MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()

# COMMAND ----------
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integers.
# For example, [2, 4, 8].
vectorX = <FILL IN>
vectorY = <FILL IN>

# COMMAND ----------

# TEST Scalar multiplication: vectors (1a)
# Import test library
from databricks_test_helper import Test

Test.assertEqualsHashed(vectorX, 'e460f5b87531a2b60e0f55c31b2e49914f779981',
                        'incorrect value for vectorX')
Test.assertEqualsHashed(vectorY, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
                        'incorrect value for vectorY')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (1b) Element-wise multiplication: vectors
# MAGIC 
# MAGIC In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators. It is also a common operation in NumPy, as we will discuss in Part (2b).
# MAGIC 
# MAGIC The element-wise calculation is as follows: \\[ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} \\]
# MAGIC 
# MAGIC Calculate the value of \\( \mathbf{z} \\): \\[ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} \\]

# COMMAND ----------
# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df.movieId == movies_df.ID)
movie_names_with_avg_ratings_df = movie_names_df.select(
    'average', 'title', 'count', 'movieId')

print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(
    movie_ids_with_avg_ratings_df.count(), 26744,
    'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy(
    'MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(
    _take_0[0] == 1 and _take_0[1] == 49695,
    'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(
        _take_0[0]))
Test.assertEquals(
    round(_take_0[2], 2), 3.92,
    "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(
Example #9
0
# MAGIC %md
# MAGIC ** (3a) Compare with hash **
# MAGIC 
# MAGIC Run the following cell. If you see an **ImportError**, you should verify that you added the spark_mooc_meta library to your cluster and, if necessary, repeat step (1a).
# MAGIC 
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/Lab0_LibraryError.png" alt="Drawing"  style="width: 600px;"/>

# COMMAND ----------

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')

# COMMAND ----------

# MAGIC %md
# MAGIC ** (3b) Compare lists **

# COMMAND ----------

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines

from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(
    twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12')
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(F.count(ratings_df.rating).alias("count"), F.avg(ratings_df.rating).alias("average"))
print 'movie_ids_with_avg_ratings_df:'
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(movies_df,movie_ids_with_avg_ratings_df["movieId"]==movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
                'incorrect movie_ids_with_avg_ratings_df.count() (expected 26744)')
movie_ids_with_ratings_take_ordered = movie_ids_with_avg_ratings_df.orderBy('MovieID').take(3)
_take_0 = movie_ids_with_ratings_take_ordered[0]
_take_1 = movie_ids_with_ratings_take_ordered[1]
_take_2 = movie_ids_with_ratings_take_ordered[2]
Test.assertTrue(_take_0[0] == 1 and _take_0[1] == 49695,
                'incorrect count of ratings for movie with ID {0} (expected 49695)'.format(_take_0[0]))
Test.assertEquals(round(_take_0[2], 2), 3.92, "Incorrect average for movie ID {0}. Expected 3.92".format(_take_0[0]))

Test.assertTrue(_take_1[0] == 2 and _take_1[1] == 22243,
                'incorrect count of ratings for movie with ID {0} (expected 22243)'.format(_take_1[0]))
Test.assertEquals(round(_take_1[2], 2), 3.21, "Incorrect average for movie ID {0}. Expected 3.21".format(_take_1[0]))

Test.assertTrue(_take_2[0] == 3 and _take_2[1] == 12735,
                'incorrect count of ratings for movie with ID {0} (expected 12735)'.format(_take_2[0]))
Test.assertEquals(round(_take_2[2], 2), 3.15, "Incorrect average for movie ID {0}. Expected 3.15".format(_take_2[0]))
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines

from databricks_test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')