def run_tests():
    Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'],
                      'incorrect top 5 names for 1945')
    Test.assertEquals(test_year(1970, df), [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'],
                      'incorrect top 5 names for 1970')
    Test.assertEquals(test_year(1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'],
                      'incorrect top 5 names for 1987')
    Test.assertTrue(len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
    Test.assertTrue(u'James' not in test_year(1945, df), 'male names not filtered')
    Test.assertTrue(test_year(1945, df) != [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'],
                    'year not filtered')
    Test.assertEqualsHashed(test_year(1880, df), "2038e2c0bb0b741797a47837c0f94dbf24123447",
                            "incorrect top 5 names for 1880")
else:
    print("Duh...")

# COMMAND ----------

# Now, thank each new friend iteratively, i.e.
# print "Thanks <name of the friend>!" using loops and string formatting (cf. section 1)
for name in new_friends:
    print "Thanks", name  # Note: this Python 2 print statement does not work in Python 3

# COMMAND ----------

# Sum all the numbers from 0 to 15 (inclusive) using what we've seen so far (i.e. without the function sum())
sum_to_fifteen = 0
for i in range(15):
    sum_to_fifteen += i + 1
Test.assertEquals(sum_to_fifteen, 120)

# COMMAND ----------

# Note: you can break out of a loop with the break statement
for i in range(136):
    print(i)
    if i >= 2:
        break

# COMMAND ----------

# The enumerate function can be very useful when dealing with iterables:
for i, value in enumerate(["a", "b", "c"]):
    print(value, i)
from test_helper import Test

x = 1
y = 2
Test.assertEquals(x, 1)
Test.assertEquals(x + 1, 2)
Test.assertEquals(y, 1, "y is incorrect")
Test.assertEqualsHashed(x, '356a192b7913b04c54574d18c28d46e6395428ab', 'this is a test')
Test.printStats()
Test.setFailFast()
Test.setPrivateMode()
Test.assertEquals(y, 1, "y is incorrect")
# MAGIC %md # MAGIC First, create a `DataFrame` named sized that has a `size` column with the size of each array of words. Here you can use `func.size`. # COMMAND ---------- # ANSWER sized = noStopWords.withColumn('size', func.size('words')) sizedFirst = sized.select('size', 'words').first() print sizedFirst[0] # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(sizedFirst[0], len(sizedFirst[1]), 'incorrect implementation for sized') # COMMAND ---------- # MAGIC %md # MAGIC Next, you'll need to aggregate the counts. You can do this using `func.sum` in either a `.select` or `.agg` method call on the `DataFrame`. Make sure to give your `Column` the alias `numberOfWords`. There are some examples in [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.GroupedData.agg) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrame) in the APIs. # COMMAND ---------- # ANSWER numberOfWords = sized.agg(func.sum('size').alias('numberOfWords')) wordCount = numberOfWords.first()[0] print wordCount # COMMAND ----------
def run_tests():
    Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'],
                      'incorrect top 5 names for 1945')
# If the text file didn't load properly an AssertionError will be raised assert shakespeareCount == 122395 # ### ** Part 2: Check class testing library ** # #### ** (2a) Compare with hash ** # In[4]: # TEST Compare with hash (2a) # Check our testing library/package # This should print '1 test passed.' on two lines from test_helper import Test twelve = 12 Test.assertEquals(twelve, 12, 'twelve should equal 12') Test.assertEqualsHashed( twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554', 'twelve, once hashed, should equal the hashed value of 12') # #### ** (2b) Compare lists ** # In[5]: # TEST Compare lists (2b) # This should print '1 test passed.' unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')] Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')], 'unsortedList does not sort properly')
Args: IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...)) Returns: tuple: a tuple of (MovieID, (number of ratings, averageRating)) """ movieId = IDandRatingsTuple[0] ratings = IDandRatingsTuple[1] return (movieId, (len(ratings), float(sum(ratings)) / len(ratings))) # In[10]: # TEST Number of Ratings and Average Ratings for a Movie (1a) Test.assertEquals( getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)), "incorrect getCountsAndAverages() with integer list" ) Test.assertEquals( getCountsAndAverages((100, (10.0, 20.0, 30.0))), (100, (3, 20.0)), "incorrect getCountsAndAverages() with float list", ) Test.assertEquals( getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)), "incorrect getCountsAndAverages() with xrange" ) # #### **(1b) Movies with Highest Average Ratings** # #### Now that we have a way to calculate the average ratings, we will use the `getCountsAndAverages()` helper function with Spark to determine movies with highest average ratings. # #### The steps you should perform are: # * #### Recall that the `ratingsRDD` contains tuples of the form (UserID, MovieID, Rating). From `ratingsRDD` create an RDD with tuples of the form (MovieID, Python iterable of Ratings for that MovieID). This transformation will yield an RDD of the form: `[(1, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7c90>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e79d0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7610>)]`. Note that you will only need to perform two Spark transformations to do this step.
dfcrashes.createOrReplaceTempView(temp_table)

# COMMAND ----------

def checkanzrecord():
    return dfcrashes.count()

print(checkanzrecord())

# COMMAND ----------

# TEST whether the number of records read matches the expected record count
from test_helper import Test
Test.assertEquals(checkanzrecord(), 5784, 'incorrect Total Records')

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC select * from crashestable
# MAGIC where date is not null
# MAGIC order by date asc, time asc
# MAGIC limit 1

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC select * from crashestable
Test.assertEqualsHashed(sampleOHEDictManual[(0,'mouse')], 'da4b9237bacccdf19c0760cab7aec4a8359010b0', "incorrect value for sampleOHEDictManual[(0,'mouse')]") Test.assertEqualsHashed(sampleOHEDictManual[(1,'black')], '77de68daecd823babbb58edb1c8e14d7106e83bb', "incorrect value for sampleOHEDictManual[(1,'black')]") Test.assertEqualsHashed(sampleOHEDictManual[(1,'tabby')], '1b6453892473a467d07372d45eb05abc2031647a', "incorrect value for sampleOHEDictManual[(1,'tabby')]") Test.assertEqualsHashed(sampleOHEDictManual[(2,'mouse')], 'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4', "incorrect value for sampleOHEDictManual[(2,'mouse')]") Test.assertEqualsHashed(sampleOHEDictManual[(2,'salmon')], 'c1dfd96eea8cc2b62785275bca38ac261256e278', "incorrect value for sampleOHEDictManual[(2,'salmon')]") Test.assertEquals(len(sampleOHEDictManual.keys()), 7, 'incorrect number of keys in sampleOHEDictManual') # ** Sparse vectors ** import numpy as np from pyspark.mllib.linalg import SparseVector aDense = np.array([0., 3., 0., 4.]) aSparse = SparseVector(4, [[0,0.], [1,3.], [2,0.], [3,4.]]) bDense = np.array([0., 0., 0., 1.]) bSparse = SparseVector(4, [[0,0.], [1,0.], [2,0.], [3,1.]]) w = np.array([0.4, 3.1, -1.4, -.5]) print aDense.dot(w) print aSparse.dot(w)
averageSparsityOHE = computeSparsity(OHETrainData, numCtrOHEFeats, nTrain) print 'Average OHE Sparsity: {0:.10e}'.format(averageSparsityOHE) print 'Average Hash Sparsity: {0:.10e}'.format(averageSparsityHash) #------------------ # Test Code #------------------ # TEST Loading and splitting the data (3a) Test.assertTrue( all([ rawTrainData.is_cached, rawValidationData.is_cached, rawTestData.is_cached ]), 'you must cache the split data') Test.assertEquals(nTrain, 79911, 'incorrect value for nTrain') Test.assertEquals(nVal, 10075, 'incorrect value for nVal') Test.assertEquals(nTest, 10014, 'incorrect value for nTest') # TEST Extract features (3b) Test.assertEquals(numCategories[2][1], 855, 'incorrect implementation of parsePoint') Test.assertEquals(numCategories[32][1], 4, 'incorrect implementation of parsePoint') # TEST Create an OHE dictionary from the dataset (3c) Test.assertEquals(numCtrOHEFeats, 233286, 'incorrect number of features in ctrOHEDict') Test.assertTrue((0, '') in ctrOHEDict, 'incorrect features in ctrOHEDict') # TEST Apply OHE to the dataset (3d)
print '\n'.join( shakespeareRDD.zipWithIndex() # to (line, lineNum) .map(lambda (l, num): '{0}: {1}'.format(num, l)) # to 'lineNum: line' .take(15)) shakespeareWordsRDD = shakespeareRDD.flatMap(lambda s: s.split(' ')) shakespeareWordCount = shakespeareWordsRDD.count() print shakespeareWordsRDD.top(5) print shakespeareWordCount Test.assertTrue( shakespeareWordCount == 927631 or shakespeareWordCount == 928908, 'incorrect value for shakespeareWordCount') Test.assertEquals( shakespeareWordsRDD.top(5), [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'], 'incorrect value for shakespeareWordsRDD') shakeWordsRDD = shakespeareWordsRDD.filter(lambda s: s != '') shakeWordCount = shakeWordsRDD.count() print shakeWordCount Test.assertEquals(shakeWordCount, 882996, 'incorrect value for shakeWordCount') top15WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered( 15, lambda (w, c): -c) print '\n'.join( map(lambda (w, c): '{0}: {1}'.format(w, c), top15WordsAndCounts)) Test.assertEquals(top15WordsAndCounts, [(u'the', 27361), (u'and', 26028),
item_item_matrix = (item_item_matrix_haveRatings.map( lambda x: parser_iiMatrix_and_compute_SparseMatrix( x, broadcast_Mapping_item.value, broadcast_Mapping_user.value))) def sparseAdd(sv1, sv2, length): from pyspark.mllib.linalg import Vectors combinedV = sv1.toArray() + sv2.toArray() nonzeroes = combinedV.nonzero()[0] return Vectors.sparse(length, nonzeroes, combinedV[nonzeroes]) # Test sparseAdd function o = SparseVector(2241, [771, 806, 1209, 1574], [1.0, 1.0, 1.0, 1.0]) k = SparseVector(2241, [305, 1253, 1254], [1.0, 1.0, 1.0]) Test.assertEquals(SparseVector(2241, [305, 1253, 1254], [2.0, 2.0, 2.0]), sparseAdd(k, k, k.size), 'sparseAdd function malfunc') Test.assertEquals( SparseVector(2241, [771, 806, 1209, 1574], [2.0, 2.0, 2.0, 2.0]), sparseAdd(o, o, o.size), 'sparseAdd function malfunc') Test.assertEquals( SparseVector(2241, [305, 771, 806, 1209, 1253, 1254, 1574], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), sparseAdd(o, k, o.size), 'sparseAdd function malfunc') def matrixMultiplication(sv, item_item_matrix): indices = sv.indices # return item_item_matrix.filter(lambda (x,y):x in indices).map(lambda (x,y):y).reduce(lambda x,y:sparseAdd(x,y,y.size)) current_sv = None result_sv = None for i in indices:
pass # TODO: Replace <FILL IN> with appropriate code threeDData = sc.parallelize(dataThreeD) componentsThreeD, threeDScores, eigenvaluesThreeD = pca(threeDData, 2) print 'componentsThreeD: \n{0}'.format(componentsThreeD) print('\nthreeDScores (first three): \n{0}'.format('\n'.join( map(str, threeDScores.take(3))))) print '\neigenvaluesThreeD: \n{0}'.format(eigenvaluesThreeD) print format(np.sum(threeDScores.take(3))) print format(np.sum(eigenvaluesThreeD)) print format(np.sum(componentsThreeD)) # TEST 3D to 2D (2c) Test.assertEquals(componentsThreeD.shape, (3, 2), 'incorrect shape for componentsThreeD') Test.assertTrue(np.allclose(np.sum(eigenvaluesThreeD), 969.796443367), 'incorrect value for eigenvaluesThreeD') Test.assertTrue(np.allclose(np.abs(np.sum(componentsThreeD)), 1.77238943258), 'incorrect value for componentsThreeD') Test.assertTrue( np.allclose(np.abs(np.sum(threeDScores.take(3))), 237.782834092), 'incorrect value for threeDScores') scoresThreeD = np.asarray(threeDScores.collect()) # generate layout and plot data fig, ax = preparePlot(np.arange(20, 150, 20), np.arange(-40, 110, 20)) ax.set_xlabel(r'New $x_1$ values'), ax.set_ylabel(r'New $x_2$ values') ax.set_xlim(5, 150), ax.set_ylim(-45, 50) plt.scatter(scoresThreeD[:, 0],
# One way of completing the function def makePlural(word): return word + 's' print makePlural('cat') # In[8]: # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s') # #### ** (1c) Apply `makePlural` to the base RDD ** # #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # In[9]: # TODO: Replace <FILL IN> with appropriate code wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat'] wordsRDD = sc.parallelize(wordsList, 4) def makePlural(word): return word + 's' pluralRDD = wordsRDD.map(makePlural) print pluralRDD.collect()
def makePlural(word): return word + "s" print makePlural("cat") # In[5]: # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(makePlural("rat"), "rats", "incorrect result: makePlural does not add an s") # #### ** (1c) Apply `makePlural` to the base RDD ** # #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # In[6]: # TODO: Replace <FILL IN> with appropriate code pluralRDD = wordsRDD.map(makePlural) print pluralRDD.collect() # In[7]: # TEST Apply makePlural to the base RDD(1c)
def capitalize(word):
    """Capitalize a lowercase `word`.

    Args:
        word (str): A lowercase string.

    Returns:
        str: A string whose first letter is uppercase.
    """
    return word.capitalize()

print(capitalize('we'))
Test.assertEquals(capitalize('we'), 'We', "Capitalize")

# COMMAND ----------

# MAGIC %md Apply `capitalize` to the base RDD, using a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `capitalize()` function to each element. Then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to retrieve the values of the transformed RDD, and print them.

# COMMAND ----------

capital_RDD = words_RDD.map(capitalize)
local_result = capital_RDD.collect()
print(local_result)
Test.assertEqualsHashed(local_result, 'bd73c54004cc9655159aceb703bc14fe93369fb1',
                        'incorrect value for local_data')
# One way of completing the function def makePlural(word): return word + 's' print makePlural('cat') # In[4]: # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s') # #### ** (1c) Apply `makePlural` to the base RDD ** # #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # In[7]: # TODO: Replace <FILL IN> with appropriate code pluralRDD = wordsRDD.map(makePlural) print pluralRDD.collect() # In[ ]: # TEST Apply makePlural to the base RDD(1c)
Args: IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...)) Returns: tuple: a tuple of (MovieID, (number of ratings, averageRating)) """ movieID = IDandRatingsTuple[0] numRating = len(IDandRatingsTuple[1]) avgRating = sum(IDandRatingsTuple[1])/float(numRating) return (movieID, (numRating, avgRating)) # In[14]: # TEST Number of Ratings and Average Ratings for a Movie (1a) Test.assertEquals(getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)), 'incorrect getCountsAndAverages() with integer list') Test.assertEquals(getCountsAndAverages((100, (10.0, 20.0, 30.0))), (100, (3, 20.0)), 'incorrect getCountsAndAverages() with float list') Test.assertEquals(getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)), 'incorrect getCountsAndAverages() with xrange') # #### **(1b) Movies with Highest Average Ratings** # #### Now that we have a way to calculate the average ratings, we will use the `getCountsAndAverages()` helper function with Spark to determine movies with highest average ratings. # #### The steps you should perform are: # * #### Recall that the `ratingsRDD` contains tuples of the form (UserID, MovieID, Rating). From `ratingsRDD` create an RDD with tuples of the form (MovieID, Python iterable of Ratings for that MovieID). This transformation will yield an RDD of the form: `[(1, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7c90>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e79d0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7610>)]`. Note that you will only need to perform two Spark transformations to do this step. # * #### Using `movieIDsWithRatingsRDD` and your `getCountsAndAverages()` helper function, compute the number of ratings and average rating for each movie to yield tuples of the form (MovieID, (number of ratings, average rating)). This transformation will yield an RDD of the form: `[(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)), (3, (299, 3.0468227424749164))]`. You can do this step with one Spark transformation # * #### We want to see movie names, instead of movie IDs. To `moviesRDD`, apply RDD transformations that use `movieIDsWithAvgRatingsRDD` to get the movie names for `movieIDsWithAvgRatingsRDD`, yielding tuples of the form (average rating, movie name, number of ratings). This set of transformations will yield an RDD of the form: `[(1.0, u'Autopsy (Macchie Solari) (1975)', 1), (1.0, u'Better Living (1998)', 1), (1.0, u'Big Squeeze, The (1996)', 3)]`. You will need to do two Spark transformations to complete this step: first use the `moviesRDD` with `movieIDsWithAvgRatingsRDD` to create a new RDD with Movie names matched to Movie IDs, then convert that RDD into the form of (average rating, movie name, number of ratings). These transformations will yield an RDD that looks like: `[(3.6818181818181817, u'Happiest Millionaire, The (1967)', 22), (3.0468227424749164, u'Grumpier Old Men (1995)', 299), (2.882978723404255, u'Hocus Pocus (1993)', 94)]` # In[25]:
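# A minimal sketch of the three steps described above (not the official answer). It assumes the
# `ratingsRDD` of (UserID, MovieID, Rating) tuples and a `moviesRDD` of (MovieID, movie name)
# pairs as described in the text, reuses getCountsAndAverages(), and uses hypothetical names
# for the intermediate RDDs mentioned in the instructions.
movieIDsWithRatingsRDD = (ratingsRDD
                          .map(lambda (userID, movieID, rating): (movieID, rating))
                          .groupByKey())
movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
movieNameWithAvgRatingsRDD = (moviesRDD
                              .join(movieIDsWithAvgRatingsRDD)
                              .map(lambda (movieID, (name, (numRatings, avg))): (avg, name, numRatings)))
print movieNameWithAvgRatingsRDD.take(3)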
def simpleTokenize(string):
    """ A simple implementation of input string tokenization

    Args:
        string (str): input string
    Returns:
        list: a list of tokens
    """
    return re.sub('[^a-zA-Z0-9\s_]+', ' ', string).lower().split()

print simpleTokenize(quickbrownfox)  # Should give ['a', 'quick', 'brown', ... ]

# TEST Tokenize a String (1a)
Test.assertEquals(simpleTokenize(quickbrownfox),
                  ['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                  'simpleTokenize should handle sample text')
Test.assertEquals(simpleTokenize(' '), [], 'simpleTokenize should handle empty string')
Test.assertEquals(simpleTokenize('!!!!123A/456_B/789C.123A'),
                  ['123a', '456_b', '789c', '123a'],
                  'simpleTokenize should handle punctuation and lowercase the result')
Test.assertEquals(simpleTokenize('fox fox'), ['fox', 'fox'],
                  'simpleTokenize should not remove duplicates')

# TODO: Replace <FILL IN> with appropriate code
stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())
print 'These are the stopwords: %s' % stopwords

def tokenize(string):
    """ An implementation of input string tokenization that excludes stopwords

    Args:
rawData = sc.textFile(fileName, numPartitions)


# In[3]:

# TODO: Replace <FILL IN> with appropriate code
numPoints = rawData.count()
print numPoints
samplePoints = rawData.take(5)
print samplePoints


# In[4]:

# TEST Load and check the data (1a)
Test.assertEquals(numPoints, 6724, 'incorrect value for numPoints')
Test.assertEquals(len(samplePoints), 5, 'incorrect length for samplePoints')


# #### ** (1b) Using `LabeledPoint` **
# #### In MLlib, labeled training instances are stored using the [LabeledPoint](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) object. Write the parsePoint function that takes as input a raw data point, parses it using Python's [unicode.split](https://docs.python.org/2/library/string.html#string.split) method, and returns a `LabeledPoint`. Use this function to parse samplePoints (from the previous question). Then print out the features and label for the first training point, using the `LabeledPoint.features` and `LabeledPoint.label` attributes. Finally, calculate the number of features for this dataset.
# #### Note that `split()` can be called directly on a `unicode` or `str` object. For example, `u'split,me'.split(',')` returns `[u'split', u'me']`.

# In[5]:

from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Here is a sample raw data point:
# '2001.0,0.884,0.610,0.600,0.474,0.247,0.357,0.344,0.33,0.600,0.425,0.60,0.419'
# In this raw data point, 2001.0 is the label, and the remaining values are features
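# A minimal sketch of parsePoint (not the graded answer), assuming each raw line is a
# comma-separated string with the label first and the features after it, as in the sample above.
# `firstPointParsed` and `numFeatures` are illustrative names.
def parsePoint(line):
    """Parse a comma-separated string into a LabeledPoint (label first, then features)."""
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

firstPointParsed = parsePoint(samplePoints[0])
print firstPointParsed.label
print firstPointParsed.features
numFeatures = len(firstPointParsed.features)
print numFeatures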
    splt = re.split(split_regex, string)
    fltr = filter(None, splt)
    return fltr

# print simpleTokenize(quickbrownfox)
print simpleTokenize(quickbrownfox)  # Should give ['a', 'quick', 'brown', ... ]


# In[5]:

# TEST Tokenize a String (1a)
Test.assertEquals(simpleTokenize(quickbrownfox),
                  ['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                  'simpleTokenize should handle sample text')
Test.assertEquals(simpleTokenize(' '), [], 'simpleTokenize should handle empty string')
Test.assertEquals(simpleTokenize('!!!!123A/456_B/789C.123A'),
                  ['123a', '456_b', '789c', '123a'],
                  'simpleTokenize should handle punctuation and lowercase the result')
Test.assertEquals(simpleTokenize('fox fox'), ['fox', 'fox'],
                  'simpleTokenize should not remove duplicates')


# ### **(1b) Removing stopwords**
# #### *[Stopwords][stopwords]* are common (English) words that do not contribute much to the content or meaning of a document (e.g., "the", "a", "is", "to", etc.). Stopwords add noise to bag-of-words comparisons, so they are usually excluded.
# #### Using the included file "stopwords.txt", implement `tokenize`, an improved tokenizer that does not emit stopwords. A minimal sketch follows below.
# [stopwords]: https://en.wikipedia.org/wiki/Stop_words

# In[6]:
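# A minimal sketch of `tokenize` (not the official answer), assuming `stopwords` is the set of
# words loaded from "stopwords.txt" earlier in the lab:
def tokenize(string):
    """ Tokenize a string like simpleTokenize, but drop any token that is a stopword. """
    return [token for token in simpleTokenize(string) if token not in stopwords]

print tokenize(quickbrownfox)  # Should give ['quick', 'brown', ... ] with the stopwords removed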
# #### If you are not familiar with the Python regular expression [`search` function](https://docs.python.org/2/library/re.html#regular-expression-objects), now would be a good time to check up on the [documentation](https://developers.google.com/edu/python/regular-expressions). One tip that might be useful is to use an online tester like http://pythex.org or http://www.pythonregex.com. To use it, copy and paste the regular expression string below (located between the single quotes ') and test it against one of the 'Invalid logline' entries above.

# In[4]:

# TODO: Replace <FILL IN> with appropriate code
# This was originally '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s*" (\d{3}) (\S+)'

parsed_logs, access_logs, failed_logs = parseLogs()


# In[5]:

# TEST Data cleaning (1c)
Test.assertEquals(failed_logs.count(), 0, 'incorrect failed_logs.count()')
Test.assertEquals(parsed_logs.count(), 1043177, 'incorrect parsed_logs.count()')
Test.assertEquals(access_logs.count(), parsed_logs.count(), 'incorrect access_logs.count()')


# ### **Part 2: Sample Analyses on the Web Server Log File**
#
# #### Now that we have an RDD containing the log file as a set of Row objects, we can perform various analyses.
#
# #### **(2a) Example: Content Size Statistics**
#
# #### Let's compute some statistics about the sizes of content being returned by the web server. In particular, we'd like to know the average, minimum, and maximum content sizes.
#
# #### We can compute these statistics by applying a `map` to the `access_logs` RDD. The `lambda` function we want for the map extracts the `content_size` field from each Row. The map produces a new RDD containing only the `content_sizes` (one element for each Row object in the `access_logs` RDD). To compute the minimum and maximum statistics, we can use the [`min()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.min) and [`max()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.max) functions on the new RDD. We can compute the average by using the [`reduce`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduce) function with a `lambda` function that sums the two inputs, which represent two elements from the new RDD that are being reduced together. The result of the `reduce()` is the total content size from the log, which is then divided by the number of requests as determined using the [`count()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.count) function on the new RDD. A minimal sketch is shown below.

# In[6]:
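# A minimal sketch of (2a), assuming the parsed Row objects expose a `content_size` field as
# described above; `content_sizes` and `average_size` are illustrative names, not the notebook's
# stored solution.
content_sizes = access_logs.map(lambda log: log.content_size).cache()
average_size = content_sizes.reduce(lambda a, b: a + b) / content_sizes.count()
print 'Content Size Avg: %i, Min: %i, Max: %s' % (average_size,
                                                  content_sizes.min(),
                                                  content_sizes.max())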
# COMMAND ---------- # MAGIC %md # MAGIC Create a `DenseVector` with the values 1.5, 2.5, 3.0 (in that order). # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code denseVec = <FILL IN> # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(denseVec, DenseVector([1.5, 2.5, 3.0]), 'incorrect value for denseVec') # COMMAND ---------- # MAGIC %md # MAGIC Create a `LabeledPoint` with a label equal to 10.0 and features equal to `denseVec` # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code labeledP = <FILL IN> # COMMAND ---------- # TEST Test.assertEquals(str(labeledP), '(10.0,[1.5,2.5,3.0])', 'incorrect value for labeledP')
# MAGIC # MAGIC The resulting `DataFrame` should have two columns: one named `features` and another named `label`. # COMMAND ---------- # ANSWER from pyspark.sql.functions import col irisDFZeroIndex = irisDF.select('features', (col('label') - 1).alias('label')) display(irisDFZeroIndex) # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0], 'incorrect value for irisDFZeroIndex') # COMMAND ---------- # MAGIC %md # MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`. We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`. To do that we'll need to create a `udf` and apply it to our dataset. Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction). # MAGIC # MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method. # COMMAND ---------- # ANSWER from pyspark.sql.functions import udf # Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types # VectorUDT should be the return type of the udf from pyspark.mllib.linalg import Vectors, VectorUDT
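# A possible continuation (a sketch under assumptions, not necessarily this notebook's stored
# answer): a udf that keeps only the first two values of each SparseVector and returns them as a
# DenseVector, with VectorUDT as the return type noted in the comment above. The names
# `firstTwoFeatures` and `irisTwoFeatures` are assumptions.
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label')
display(irisTwoFeatures)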
assert oneSorted1 == twoSorted1 def getCountsAndAverages(IDandRatingsTuple): """ Calculate average rating Args: IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...)) Returns: tuple: a tuple of (MovieID, (number of ratings, averageRating)) """ tup = len(IDandRatingsTuple[1]) return (IDandRatingsTuple[0], (tup,sum(IDandRatingsTuple[1])/float((tup)))) Test.assertEquals(getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)), 'incorrect getCountsAndAverages() with integer list') Test.assertEquals(getCountsAndAverages((100, (10.0, 20.0, 30.0))), (100, (3, 20.0)), 'incorrect getCountsAndAverages() with float list') Test.assertEquals(getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)), 'incorrect getCountsAndAverages() with xrange') def sortFunction(tuple): """ Construct the sort string (does not perform actual sorting) Args: tuple: (rating, MovieName) Returns: sortString: the value to sort with, 'rating MovieName' """
#return df.limit(10) # THIS IS NOT CORRECT! FIX IT. # COMMAND ---------- # Transparent Tests from test_helper import Test def test_year(year, df): return [row.firstName for row in top_female_names_for_year(year, 5, df).collect()] # COMMAND ---------- def run_tests(): Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'], 'incorrect top 5 names for 1945') Test.assertEquals(test_year(1970, df), [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'], 'incorrect top 5 names for 1970') Test.assertEquals(test_year(1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'], 'incorrect top 5 names for 1987') Test.assertTrue(len(test_year(1945, df)) <= 5, 'list not limited to 5 names') Test.assertTrue(u'James' not in test_year(1945, df), 'male names not filtered') Test.assertTrue(test_year(1945, df) != [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered') Test.assertEqualsHashed(test_year(1880, df), "2038e2c0bb0b741797a47837c0f94dbf24123447", "incorrect top 5 names for 1880") run_tests() # COMMAND ---------- # MAGIC %md # COMMAND ---------- # MAGIC %md
# MAGIC
# MAGIC If you are not familiar with the Python regular expression [`search` function](https://docs.python.org/2/library/re.html#regular-expression-objects), now would be a good time to check up on the [documentation](https://developers.google.com/edu/python/regular-expressions). One tip that might be useful is to use an online tester like http://pythex.org or http://www.pythonregex.com. To use it, copy and paste the regular expression string below (located between the single quotes ') and test it against one of the 'Invalid logline' entries above.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# This was originally '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s*(\S+)\s*[\S*\s*]*(\S*)" (\d{3}) (\S+)'

parsed_logs, access_logs, failed_logs = parseLogs()

# COMMAND ----------

# TEST Data cleaning (1c)
Test.assertEquals(failed_logs.count(), 0, 'incorrect failed_logs.count()')
Test.assertEquals(parsed_logs.count(), 43177, 'incorrect parsed_logs.count()')
Test.assertEquals(access_logs.count(), parsed_logs.count(), 'incorrect access_logs.count()')

# COMMAND ----------

# MAGIC %md #### **Part 2: Sample Analyses on the Web Server Log File**
# MAGIC
# MAGIC Now that we have an RDD containing the log file as a set of Row objects, we can perform various analyses.
# MAGIC
# MAGIC **(2a) Example: Content Size Statistics**
# MAGIC
# MAGIC Let's compute some statistics about the sizes of content being returned by the web server. In particular, we'd like to know the average, minimum, and maximum content sizes.
# MAGIC
# MAGIC We can compute these statistics by applying a `map` to the `access_logs` RDD. The `lambda` function we want for the map extracts the `content_size` field from each Row. The map produces a new RDD containing only the `content_sizes` (one element for each Row object in the `access_logs` RDD). To compute the minimum and maximum statistics, we can use the [`min()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.min) and [`max()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.max) functions on the new RDD. We can compute the average by using the [`reduce`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduce) function with a `lambda` function that sums the two inputs, which represent two elements from the new RDD that are being reduced together. The result of the `reduce()` is the total content size from the log, which is then divided by the number of requests as determined using the [`count()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.count) function on the new RDD.
# COMMAND ---------- # ANSWER from pyspark.sql.functions import lit, concat pluralDF = wordsDF.select(concat('word', lit('s')).alias('word')) pluralDF.show() # COMMAND ---------- # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Using DataFrame functions to add an 's' (1b) Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s') Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'") # COMMAND ---------- # PRIVATE_TEST Using DataFrame functions to add an 's' (1b) Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s') Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'") # COMMAND ---------- # MAGIC %md # MAGIC ** (1c) Length of each word ** # MAGIC # MAGIC Now use the SQL `length` function to find the number of characters in each word. The [`length` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.length) is found in the `pyspark.sql.functions` module.
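# COMMAND ----------

# A minimal sketch for (1c) (an assumption, not the notebook's stored answer): select the length
# of each word into a column aliased 'length'. `wordLengthsDF` is an illustrative name.
from pyspark.sql.functions import length
wordLengthsDF = wordsDF.select(length('word').alias('length'))
wordLengthsDF.show()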
# Remember to cast the value you extract from the Vector using float() getElement = udf(lambda v, i: float(v[i]), DoubleType()) irisSeparateFeatures = (irisTwoFeatures .withColumn('sepalLength', getElement('features', lit(0))) .withColumn('sepalWidth', getElement('features', lit(1)))) display(irisSeparateFeatures) # COMMAND ---------- # TEST from test_helper import Test firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first() Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement') # COMMAND ---------- # MAGIC %md # MAGIC What about using `Column`'s `getItem` method? # COMMAND ---------- from pyspark.sql.functions import col display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0))) # COMMAND ---------- # MAGIC %md
assert shakespeareCount == 122395 # ### ** Part 2: Check class testing library ** # #### ** (2a) Compare with hash ** # In[ ]: # TEST Compare with hash (2a) # Check our testing library/package # This should print '1 test passed.' on two lines from test_helper import Test twelve = 12 Test.assertEquals(twelve, 12, "twelve should equal 12") Test.assertEqualsHashed( twelve, "7b52009b64fd0a2a49e6d8a939753077792b0554", "twelve, once hashed, should equal the hashed value of 12" ) # #### ** (2b) Compare lists ** # In[ ]: # TEST Compare lists (2b) # This should print '1 test passed.' unsortedList = [(5, "b"), (5, "a"), (4, "c"), (3, "a")] Test.assertEquals(sorted(unsortedList), [(3, "a"), (4, "c"), (5, "a"), (5, "b")], "unsortedList does not sort properly")
ax.set_zlim((-20, 120)), ax.set_ylim((-20, 100)), ax.set_xlim((30, 75)) ax.plot_surface(xx, yy, z, alpha=.1) plt.tight_layout() pass threeDData = sc.parallelize(dataThreeD) componentsThreeD, threeDScores, eigenvaluesThreeD = pca(threeDData) print 'componentsThreeD: \n{0}'.format(componentsThreeD) print ('\nthreeDScores (first three): \n{0}' .format('\n'.join(map(str, threeDScores.take(3))))) print '\neigenvaluesThreeD: \n{0}'.format(eigenvaluesThreeD) Test.assertEquals(componentsThreeD.shape, (3, 2), 'incorrect shape for componentsThreeD') Test.assertTrue(np.allclose(np.sum(eigenvaluesThreeD), 969.796443367), 'incorrect value for eigenvaluesThreeD') Test.assertTrue(np.allclose(np.abs(np.sum(componentsThreeD)), 1.77238943258), 'incorrect value for componentsThreeD') Test.assertTrue(np.allclose(np.abs(np.sum(threeDScores.take(3))), 237.782834092), 'incorrect value for threeDScores') scoresThreeD = np.asarray(threeDScores.collect()) fig, ax = preparePlot(np.arange(20, 150, 20), np.arange(-40, 110, 20)) ax.set_xlabel(r'New $x_1$ values'), ax.set_ylabel(r'New $x_2$ values') ax.set_xlim(5, 150), ax.set_ylim(-45, 50) plt.scatter(scoresThreeD[:,0], scoresThreeD[:,1], s=14**2, c=clrs, edgecolors='#8cbfd0', alpha=0.75) pass
# One way of completing the function def capitalize(word): return word.upper() print capitalize('cat') # COMMAND ---------- # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(capitalize('rat'), 'RAT', 'incorrect result: capitalize does not work properly') # COMMAND ---------- # MAGIC %md ** (1c) Apply `capitalize` to the base RDD ** # MAGIC # MAGIC Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `capitalize()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code capitalRDD = wordsRDD.map(capitalize) print capitalRDD.collect() # COMMAND ----------
# One way of completing the function def makePlural(word): return word + 's' print makePlural('cat') # In[4]: # Load in the testing code and check to see if your answer is correct # If incorrect it will report back '1 test failed' for each failed test # Make sure to rerun any cell you change before trying the test again from test_helper import Test # TEST Pluralize and test (1b) Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s') # #### ** (1c) Apply `makePlural` to the base RDD ** # #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD. # In[5]: # TODO: Replace <FILL IN> with appropriate code pluralRDD = wordsRDD.map(makePlural) print pluralRDD.collect() # In[6]: # TEST Apply makePlural to the base RDD(1c) Test.assertEquals(pluralRDD.collect(), ['cats', 'elephants', 'rats', 'rats', 'cats'],
# Remember to cast the value you extract from the Vector using float() getElement = udf(lambda v, i: float(v[i]), DoubleType()) irisSeparateFeatures = (irisTwoFeatures .withColumn('sepalLength', getElement('features', lit(0))) .withColumn('sepalWidth', getElement('features', lit(1)))) display(irisSeparateFeatures) # COMMAND ---------- # TEST from test_helper import Test firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first() Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement') # COMMAND ---------- # MAGIC %md # MAGIC What about using `Column`'s `getItem` method? # COMMAND ---------- from pyspark.sql.functions import col from pyspark.sql.utils import AnalysisException try: display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0))) except AnalysisException as e: print e
assert shakespeareCount == 122395 # ### ** Part 2: Check class testing library ** # #### ** (2a) Compare with hash ** # In[ ]: # TEST Compare with hash (2a) # Check our testing library/package # This should print '1 test passed.' on two lines from test_helper import Test twelve = 12 Test.assertEquals(twelve, 12, 'twelve should equal 12') Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554', 'twelve, once hashed, should equal the hashed value of 12') # #### ** (2b) Compare lists ** # In[ ]: # TEST Compare lists (2b) # This should print '1 test passed.' unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')] Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')], 'unsortedList does not sort properly')
# #### If you are not familiar with the Python regular expression [`search` function](https://docs.python.org/2/library/re.html#regular-expression-objects), now would be a good time to check up on the [documentation](https://developers.google.com/edu/python/regular-expressions). One tip that might be useful is to use an online tester like http://pythex.org or http://www.pythonregex.com. To use it, copy and paste the regular expression string below (located between the single quotes ') and test it against one of the 'Invalid logline' entries above.

# In[6]:

# TODO: Replace <FILL IN> with appropriate code
# This was originally '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'

parsed_logs, access_logs, failed_logs = parseLogs()


# In[7]:

# TEST Data cleaning (1c)
Test.assertEquals(failed_logs.count(), 0, 'incorrect failed_logs.count()')
Test.assertEquals(parsed_logs.count(), 1043177, 'incorrect parsed_logs.count()')
Test.assertEquals(access_logs.count(), parsed_logs.count(), 'incorrect access_logs.count()')


# ### **Part 2: Sample Analyses on the Web Server Log File**
#
# #### Now that we have an RDD containing the log file as a set of Row objects, we can perform various analyses.
#
# #### **(2a) Example: Content Size Statistics**
#
# #### Let's compute some statistics about the sizes of content being returned by the web server. In particular, we'd like to know the average, minimum, and maximum content sizes.
#
# #### We can compute these statistics by applying a `map` to the `access_logs` RDD. The `lambda` function we want for the map extracts the `content_size` field from each Row. The map produces a new RDD containing only the `content_sizes` (one element for each Row object in the `access_logs` RDD). To compute the minimum and maximum statistics, we can use the [`min()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.min) and [`max()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.max) functions on the new RDD. We can compute the average by using the [`reduce`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduce) function with a `lambda` function that sums the two inputs, which represent two elements from the new RDD that are being reduced together. The result of the `reduce()` is the total content size from the log, which is then divided by the number of requests as determined using the [`count()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.count) function on the new RDD.

# In[8]:
n_art = len(corpus_titles)
print "\nLoaded " + str(n_art) + " articles from category " + cat

corpus_tokens = []
corpus_filtered = []
for n, art in enumerate(corpus_text):
    print "\rTokenizing article {0} out of {1}".format(n + 1, n_art),
    # This is to make sure that all characters have the appropriate encoding.
    art = art.decode('utf-8')
    tokens = word_tokenize(art)
    corpus_tokens.append(tokens)

print "\n The corpus has been tokenized. Let's check some portion of the first article:"
print corpus_tokens[0][0:30]

Test.assertEquals(len(corpus_tokens), n_art,
                  "The number of articles has changed unexpectedly")
Test.assertTrue(len(corpus_tokens) >= 100,
                "Your corpus_tokens has fewer than 100 articles. Consider using a larger dataset")

# Select stemmer.
stemmer = nltk.stem.SnowballStemmer('english')

corpus_filtered = [[] for i in range(n_art)]
for n, token_list in enumerate(corpus_tokens):
    print "\rFiltering article {0} out of {1}".format(n + 1, n_art),
    for token in token_list:
        if token.isalnum():
            corpus_filtered[n].append(token.lower())

print "\nLet's check the first tokens from document 0 after stemming:"
# ANSWER from pyspark.ml.feature import StringIndexer stringIndexer = (StringIndexer() .setInputCol('label') .setOutputCol('indexed')) indexerModel = stringIndexer.fit(irisTrain) irisTrainIndexed = indexerModel.transform(irisTrain) display(irisTrainIndexed) # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column') Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata') # COMMAND ---------- # MAGIC %md # MAGIC We've updated the metadata for the field. Now we know that the field takes on three values and is nominal. # COMMAND ---------- print irisTrainIndexed.schema.fields[1].metadata print irisTrainIndexed.schema.fields[2].metadata # COMMAND ---------- # MAGIC %md
# TODO: Replace <FILL IN> with appropriate code from pyspark.ml.feature import StringIndexer stringIndexer = (<FILL IN> .<FILL IN> .<FILL IN>) indexerModel = stringIndexer.<FILL IN> irisTrainIndexed = indexerModel.<FILL IN> display(irisTrainIndexed) # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column') Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata') # COMMAND ---------- # MAGIC %md # MAGIC We've updated the metadata for the field. Now we know that the field takes on three values and is nominal. # COMMAND ---------- print irisTrainIndexed.schema.fields[1].metadata print irisTrainIndexed.schema.fields[2].metadata # COMMAND ---------- # MAGIC %md
IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...)) Returns: tuple: a tuple of (MovieID, (number of ratings, averageRating)) """ cnt = len(IDandRatingsTuple[1]) tot_rating = 0.0 for item in IDandRatingsTuple[1]: tot_rating += item return (IDandRatingsTuple[0], (cnt, tot_rating / cnt)) # In[11]: # TEST Number of Ratings and Average Ratings for a Movie (1a) Test.assertEquals(getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)), 'incorrect getCountsAndAverages() with integer list') Test.assertEquals(getCountsAndAverages( (100, (10.0, 20.0, 30.0))), (100, (3, 20.0)), 'incorrect getCountsAndAverages() with float list') Test.assertEquals(getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)), 'incorrect getCountsAndAverages() with xrange') # #### **(1b) Movies with Highest Average Ratings** # #### Now that we have a way to calculate the average ratings, we will use the `getCountsAndAverages()` helper function with Spark to determine movies with highest average ratings. # #### The steps you should perform are: # * #### Recall that the `ratingsRDD` contains tuples of the form (UserID, MovieID, Rating). From `ratingsRDD` create an RDD with tuples of the form (MovieID, Python iterable of Ratings for that MovieID). This transformation will yield an RDD of the form: `[(1, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7c90>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e79d0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7610>)]`. Note that you will only need to perform two Spark transformations to do this step. # * #### Using `movieIDsWithRatingsRDD` and your `getCountsAndAverages()` helper function, compute the number of ratings and average rating for each movie to yield tuples of the form (MovieID, (number of ratings, average rating)). This transformation will yield an RDD of the form: `[(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)), (3, (299, 3.0468227424749164))]`. You can do this step with one Spark transformation # * #### We want to see movie names, instead of movie IDs. To `moviesRDD`, apply RDD transformations that use `movieIDsWithAvgRatingsRDD` to get the movie names for `movieIDsWithAvgRatingsRDD`, yielding tuples of the form (average rating, movie name, number of ratings). This set of transformations will yield an RDD of the form: `[(1.0, u'Autopsy (Macchie Solari) (1975)', 1), (1.0, u'Better Living (1998)', 1), (1.0, u'Big Squeeze, The (1996)', 3)]`. You will need to do two Spark transformations to complete this step: first use the `moviesRDD` with `movieIDsWithAvgRatingsRDD` to create a new RDD with Movie names matched to Movie IDs, then convert that RDD into the form of (average rating, movie name, number of ratings). These transformations will yield an RDD that looks like: `[(3.6818181818181817, u'Happiest Millionaire, The (1967)', 22), (3.0468227424749164, u'Grumpier Old Men (1995)', 299), (2.882978723404255, u'Hocus Pocus (1993)', 94)]` # In[18]:
rdd = sc.parallelize(xrange(-5, 5))           # Range [-5, 5)
filtered_rdd = rdd.filter(lambda x: x >= 0)   # Keep the non-negative elements

'''
`map(func)` applies a function to the elements of an RDD
'''
# Example in PySpark
# Add 1 to each element of the RDD
# For each element, build a tuple (x, x**2)
def add1(x):
    return (x + 1)

squared_rdd = (filtered_rdd
               .map(add1)                  # Add 1 to each element of the RDD
               .map(lambda x: (x, x*x)))   # For each element, build a tuple (x, x**2)

Test.assertEquals(squared_rdd.collect(), [(1, 1), (2, 4), (3, 9), (4, 16), (5, 25)])

'''
flatMap(func) is the same as map, but it "flattens" the output
'''
%pyspark
squaredflat_rdd = (filtered_rdd
                   .map(add1)
                   .flatMap(lambda x: (x, x*x)))  # Produces the output as a flat list

Test.assertEquals(squaredflat_rdd.collect(), [1, 1, 2, 4, 3, 9, 4, 16, 5, 25])

'''
sample(withReplacement, fraction, seed=None) returns a sample of the RDD
  withReplacement - if True, each element can appear several times in the sample
  fraction - expected size of the sample as a fraction of the RDD's size
    without replacement: probability of selecting an element; its value must be in [0, 1]
    with replacement: expected number of times an element is chosen; its value must be >= 0
v = np.arange(5, 10, .5) elementWise = u*v dotProduct = u.dot(v) print 'u: {0}'.format(u) print 'v: {0}'.format(v) print '\nelementWise\n{0}'.format(elementWise) print '\ndotProduct\n{0}'.format(dotProduct) # In[14]: # TEST Element-wise multiplication and dot product (2b) Test.assertTrue(np.all(elementWise == [ 0., 2.75, 6., 9.75, 14., 18.75, 24., 29.75, 36., 42.75]), 'incorrect value for elementWise') Test.assertEquals(dotProduct, 183.75, 'incorrect value for dotProduct') # #### ** (2c) Matrix math ** # #### With NumPy it is very easy to perform matrix math. You can use [np.matrix()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html) to generate a NumPy matrix. Just pass a two-dimensional `ndarray` or a list of lists to the function. You can perform matrix math on NumPy matrices using `*`. # #### You can transpose a matrix by calling [numpy.matrix.transpose()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.transpose.html) or by using `.T` on the matrix object (e.g. `myMatrix.T`). Transposing a matrix produces a matrix where the new rows are the columns from the old matrix. For example: $$ \begin{bmatrix} 1 & 2 & 3 \\\ 4 & 5 & 6 \end{bmatrix}^\mathbf{\top} = \begin{bmatrix} 1 & 4 \\\ 2 & 5 \\\ 3 & 6 \end{bmatrix} $$ # # #### Inverting a matrix can be done using [numpy.linalg.inv()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.inv.html). Note that only square matrices can be inverted, and square matrices are not guaranteed to have an inverse. If the inverse exists, then multiplying a matrix by its inverse will produce the identity matrix. $ \scriptsize ( \mathbf{A}^{-1} \mathbf{A} = \mathbf{I_n} ) $ The identity matrix $ \scriptsize \mathbf{I_n} $ has ones along its diagonal and zero elsewhere. $$ \mathbf{I_n} = \begin{bmatrix} 1 & 0 & 0 & \dots & 0 \\\ 0 & 1 & 0 & \dots & 0 \\\ 0 & 0 & 1 & \dots & 0 \\\ \vdots & \vdots & \vdots & \ddots & \vdots \\\ 0 & 0 & 0 & \dots & 1 \end{bmatrix} $$ # #### For this exercise, multiply $ \mathbf{A} $ times its transpose $ ( \mathbf{A}^\top ) $ and then calculate the inverse of the result $ ( [ \mathbf{A} \mathbf{A}^\top ]^{-1} ) $. # In[15]: # TODO: Replace <FILL IN> with appropriate code from numpy.linalg import inv A = np.matrix([[1,2,3,4],[5,6,7,8]])
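# A sketch of the remaining steps of (2c) (variable names here are illustrative, not the graded
# answer): multiply A by its transpose, invert the result, and check that the product with the
# inverse recovers the identity.
AAt = A * A.T          # A (2x4) times its transpose (4x2) gives a 2x2 matrix
AAtInv = inv(AAt)      # inverse of A * A^T
print AAtInv
print AAtInv * AAt     # should be (numerically) the 2x2 identity matrix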
# COMMAND ---------- # MAGIC %md # MAGIC Create a `DenseVector` with the values 1.5, 2.5, 3.0 (in that order). # COMMAND ---------- # ANSWER denseVec = Vectors.dense([1.5, 2.5, 3.0]) # COMMAND ---------- # TEST from test_helper import Test Test.assertEquals(denseVec, DenseVector([1.5, 2.5, 3.0]), 'incorrect value for denseVec') # COMMAND ---------- # MAGIC %md # MAGIC Create a `LabeledPoint` with a label equal to 10.0 and features equal to `denseVec` # COMMAND ---------- # ANSWER labeledP = LabeledPoint(10.0, denseVec) # COMMAND ---------- # TEST Test.assertEquals(str(labeledP), '(10.0,[1.5,2.5,3.0])', 'incorrect value for labeledP')
# In[7]: def calcUserMeanRating(userRatingGroup): """ Calculate the average rating of a user """ userID = userRatingGroup[0] ratingSum = 0.0 ratingCnt = len(userRatingGroup[1]) if ratingCnt == 0: return (userID, 0.0) for item in userRatingGroup[1]: ratingSum += item[1] return (userID, 1.0 * ratingSum / ratingCnt) Test.assertEquals(calcUserMeanRating((123, [(1, 1), (2, 2), (3, 3)])), (123, 2.0), 'incorrect calcUserMeanRating()') # In[8]: def broadcastUserRatingAvg(sContext, uRRDDTrain): """ Broadcast the user average rating RDD """ userRatingAvgList = uRRDDTrain.map(lambda x: calcUserMeanRating(x)).collect() userRatingAvgDict = {} for (user, avgscore) in userRatingAvgList: userRatingAvgDict[user] = avgscore uRatingAvgBC = sContext.broadcast(userRatingAvgDict)# broadcast return uRatingAvgBC def predictUsingAvg(tup, avgDict):
numPartitions = 2
rawData = sc.textFile(fileName, numPartitions)


# In[8]:

# TODO: Replace <FILL IN> with appropriate code
numPoints = rawData.count()
print numPoints
samplePoints = rawData.take(5)
print samplePoints


# In[9]:

# TEST Load and check the data (1a)
Test.assertEquals(numPoints, 6724, 'incorrect value for numPoints')
Test.assertEquals(len(samplePoints), 5, 'incorrect length for samplePoints')


# #### ** (1b) Using `LabeledPoint` **
# #### In MLlib, labeled training instances are stored using the [LabeledPoint](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) object. Write the parsePoint function that takes as input a raw data point, parses it using Python's [unicode.split](https://docs.python.org/2/library/string.html#string.split) method, and returns a `LabeledPoint`. Use this function to parse samplePoints (from the previous question). Then print out the features and label for the first training point, using the `LabeledPoint.features` and `LabeledPoint.label` attributes. Finally, calculate the number of features for this dataset.
# #### Note that `split()` can be called directly on a `unicode` or `str` object. For example, `u'split,me'.split(',')` returns `[u'split', u'me']`.

# In[10]:

from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Here is a sample raw data point:
# '2001.0,0.884,0.610,0.600,0.474,0.247,0.357,0.344,0.33,0.600,0.425,0.60,0.419'
# In this raw data point, 2001.0 is the label, and the remaining values are features
    sampleOHEDictManual[(1, 'black')],
    '77de68daecd823babbb58edb1c8e14d7106e83bb',
    "incorrect value for sampleOHEDictManual[(1,'black')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(1, 'tabby')],
    '1b6453892473a467d07372d45eb05abc2031647a',
    "incorrect value for sampleOHEDictManual[(1,'tabby')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'mouse')],
    'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
    "incorrect value for sampleOHEDictManual[(2,'mouse')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'salmon')],
    'c1dfd96eea8cc2b62785275bca38ac261256e278',
    "incorrect value for sampleOHEDictManual[(2,'salmon')]")
Test.assertEquals(len(sampleOHEDictManual.keys()), 7,
                  'incorrect number of keys in sampleOHEDictManual')

# TEST Sparse Vectors (1b)
Test.assertTrue(isinstance(aSparse, SparseVector),
                'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector),
                'bSparse needs to be an instance of SparseVector')
Test.assertTrue(
    aDense.dot(w) == aSparse.dot(w),
    'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(
    bDense.dot(w) == bSparse.dot(w),
    'dot product of bDense and w should equal dot product of bSparse and w')

# TEST OHE Features as sparse vectors (1c)
Test.assertTrue(isinstance(sampleOneOHEFeatManual, SparseVector),
                'sampleOneOHEFeatManual needs to be a SparseVector')
Test.assertTrue(isinstance(sampleTwoOHEFeatManual, SparseVector),