Example #1
def lab4_test_ex_7(RMSE, predictions):
    Test.assertTrue(
        abs(RMSE - 4.72051562103) < 0.5,
        'incorrect RMSE value (expected about 4.721)', 'Test 7.1 - success')
    Test.assertEqualsHashed(
        predictions.take(3)[1], '11ec7603d09fbc0af0517cacd4d162f4a9e18196',
        'incorrect first 3 elements of the predictions', 'Test 7.2 - success')
    Test.assertEqualsHashed(
        abs(predictions.take(5)[4][2] - 5.048388885082981) < 0.2,
        '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
        'incorrect prediction for the element in predictions',
        'Test 7.3 - success')
Example #2
def lab4_test_ex_6(errors_list):
    Test.assertTrue(
        abs(errors_list[0] - 6.3120935921) < 0.5,
        'incorrect RMSE[0] value (expected about 6.312)', 'Test 6.1 - success')
    Test.assertTrue(
        abs(errors_list[4] - 3.97614773441) < 0.5,
        'incorrect RMSE[4] value (expected about 3.976)', 'Test 6.2 - success')
    Test.assertTrue(
        abs(errors_list[7] - 4.07374957224) < 0.5,
        'incorrect RMSE[7] value (expected about 4.074)', 'Test 6.3 - success')
    Test.assertTrue(
        abs(errors_list[9] - 4.27126330603) < 0.5,
        'incorrect RMSE[9] value (expected about 4.272)', 'Test 6.4 - success')
    Test.assertTrue(
        abs(errors_list[14] - 4.02682737231) < 0.5,
        'incorrect RMSE[14] value (expected about 4.027)',
        'Test 6.5 - success')
Example #3
File: Lab.py Project: smoltis/spark
def run_tests():
  Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'], 'incorrect top 5 names for 1945')
  Test.assertEquals(test_year(1970, df), [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'], 'incorrect top 5 names for 1970')
  Test.assertEquals(test_year(1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'], 'incorrect top 5 names for 1987')
  Test.assertTrue(len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
  Test.assertTrue(u'James' not in test_year(1945, df), 'male names not filtered')
  Test.assertTrue(test_year(1945, df) != [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered')
  Test.assertEqualsHashed(test_year(1880, df), "2038e2c0bb0b741797a47837c0f94dbf24123447", "incorrect top 5 names for 1880")
Example #4
def lab4_test_ex_5(error_list):

    RMSE1 = error_list[0]
    RMSE2 = error_list[1]
    RMSE3 = error_list[2]

    Test.assertTrue(
        abs(RMSE1 - 1.26491106407) < 0.00000001,
        'incorrect RMSE1 value (expected 1.26491106407)', 'test 5.1 - success')
    Test.assertTrue(
        abs(RMSE2 - 2.70801280155) < 0.00000001,
        'incorrect RMSE2 value (expected 2.70801280155)', 'test 5.2 - success')
    Test.assertTrue(
        abs(RMSE3 - 0.0) < 0.00000001, 'incorrect RMSE3 value (expected 0.0)',
        'test 5.3 - success')
Example #5
def run_tests():
    Test.assertEquals(test_year(1945, df),
                      [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'],
                      'incorrect top 5 names for 1945')
    Test.assertEquals(test_year(1970, df),
                      [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'],
                      'incorrect top 5 names for 1970')
    Test.assertEquals(test_year(
        1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'],
                      'incorrect top 5 names for 1987')
    Test.assertTrue(
        len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
    Test.assertTrue(u'James' not in test_year(1945, df),
                    'male names not filtered')
    Test.assertTrue(
        test_year(1945, df) !=
        [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered')
    Test.assertEqualsHashed(test_year(1880, df),
                            "2038e2c0bb0b741797a47837c0f94dbf24123447",
                            "incorrect top 5 names for 1880")
# names for `movieIDsWithAvgRatingsRDD`, yielding tuples of the form
# (average rating, movie name, number of ratings)
movieNameWithAvgRatingsRDD = (moviesRDD.join(movieIDsWithAvgRatingsRDD).map(
    lambda x: (x[1][1][1], x[1][0], x[1][1][0])))
print 'movieNameWithAvgRatingsRDD: %s\n' % movieNameWithAvgRatingsRDD.take(3)

# In[19]:

# TEST Movies with Highest Average Ratings (1b)

Test.assertEquals(movieIDsWithRatingsRDD.count(), 3615,
                  'incorrect movieIDsWithRatingsRDD.count() (expected 3615)')
movieIDsWithRatingsTakeOrdered = movieIDsWithRatingsRDD.takeOrdered(3)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[0][0] == 1
    and len(list(movieIDsWithRatingsTakeOrdered[0][1])) == 993,
    'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[0] (expected 993)'
)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[1][0] == 2
    and len(list(movieIDsWithRatingsTakeOrdered[1][1])) == 332,
    'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[1] (expected 332)'
)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[2][0] == 3
    and len(list(movieIDsWithRatingsTakeOrdered[2][1])) == 299,
    'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[2] (expected 299)'
)

Test.assertEquals(
    movieIDsWithAvgRatingsRDD.count(), 3615,
Example #7
# In[11]:

# TODO: Replace <FILL IN> with appropriate code
# Create a numpy array with the values 1, 2, 3
simpleArray = np.array([1, 2, 3])
# Perform the scalar product of 5 and the numpy array
timesFive = simpleArray*5
print simpleArray
print timesFive


# In[12]:

# TEST Scalar multiplication (2a)
Test.assertTrue(np.all(timesFive == [5, 10, 15]), 'incorrect value for timesFive')


# #### ** (2b) Element-wise multiplication and dot product **
# #### NumPy arrays support both element-wise multiplication and dot product.  Element-wise multiplication occurs automatically when you use the `*` operator to multiply two `ndarray` objects of the same length.
# #### To perform the dot product you can use either [np.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html#numpy.dot) or [np.ndarray.dot()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.dot.html).  For example, if you had NumPy arrays `x` and `y`, you could compute their dot product four ways: `np.dot(x, y)`, `np.dot(y, x)`, `x.dot(y)`, or `y.dot(x)`.
# #### For this exercise, multiply the arrays `u` and `v` element-wise and compute their dot product.

# In[13]:

# TODO: Replace <FILL IN> with appropriate code
# Create a ndarray based on a range and step size.
u = np.arange(0, 5, .5)
v = np.arange(5, 10, .5)

elementWise = u*v
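
# A hedged sketch (not the graded answer) of the remaining step in (2b):
# computing the dot product of u and v.  For 1-D arrays, np.dot(u, v) and
# u.dot(v) return the same scalar.
dotProduct = u.dot(v)
print elementWise
print dotProduct  # 183.75 for the u and v defined above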
Example #8
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])  # store only the non-zero entries

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])  # store only the non-zero entry

w = np.array([0.4, 3.1, -1.4, -.5])
print aDense.dot(w)
print aSparse.dot(w)
print bDense.dot(w)
print bSparse.dot(w)


# TEST Sparse Vectors
Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector), 'bSparse needs to be an instance of SparseVector')
Test.assertTrue(aDense.dot(w) == aSparse.dot(w),
                'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(bDense.dot(w) == bSparse.dot(w),
                'dot product of bDense and w should equal dot product of bSparse and w')


# ** OHE features as sparse vectors **
sampleOneOHEFeatManual = SparseVector(7,[2,3],[1.0,1.0])
sampleTwoOHEFeatManual = SparseVector(7,[1,4,5],[1.0,1.0,1.0])
sampleThreeOHEFeatManual = SparseVector(7,[0,3,6],[1.0,1.0,1.0])


# TEST OHE Features as sparse vectors
Test.assertTrue(isinstance(sampleOneOHEFeatManual, SparseVector),
Example #9
    tokenCountPairTuple = uniqueTokens.map(lambda a: (a,1.0))
    tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a,b : (a+b))
    return (tokenSumPairTuple.map(lambda (a, b): (a, N/b)))
    #return tokenSumPairTuple
    
idfsSmall = idfs(amazonRecToToken.union(googleRecToToken))
uniqueTokenCount = idfsSmall.count()
#print amazonRecToToken.union(googleRecToToken).take(1)
#print idfsSmall.sortBy(lambda x: x[1],ascending = False).collect()
print 'There are %s unique tokens in the small datasets.' % uniqueTokenCount

# TEST Implement an IDFs function (2c)
Test.assertEquals(uniqueTokenCount, 4772, 'incorrect uniqueTokenCount')
tokenSmallestIdf = idfsSmall.takeOrdered(1, lambda s: s[1])[0]
Test.assertEquals(tokenSmallestIdf[0], 'software', 'incorrect smallest IDF token')
Test.assertTrue(abs(tokenSmallestIdf[1] - 4.25531914894) < 0.0000000001,
                'incorrect smallest IDF value')

smallIDFTokens = idfsSmall.takeOrdered(11, lambda s: s[1])
print smallIDFTokens

import matplotlib.pyplot as plt

small_idf_values = idfsSmall.map(lambda s: s[1]).collect()
fig = plt.figure(figsize=(8,3))
plt.hist(small_idf_values, 50, log=True)
pass

# TODO: Replace <FILL IN> with appropriate code
def tfidf(tokens, idfs):
    """ Compute TF-IDF
    Args:
Example #10
#  Call fit on the estimator and pass in our DataFrame
model = <FILL IN>

# Obtain the clusterCenters from the KMeansModel
centers = <FILL IN>

# Use the model to transform the DataFrame by adding cluster predictions
transformed = <FILL IN>

print centers

# COMMAND ----------

# TEST
import numpy as np
Test.assertTrue(np.allclose([ 0.35115296, -0.10691828], centers[0]),
                'incorrect centers.  check your params.')
Test.assertEquals(transformed.select('prediction').map(lambda r: r[0]).take(4), [1,1,1,1],
                  'incorrect predictions')

# COMMAND ----------

# MAGIC %md
# MAGIC ## PART 3

# COMMAND ----------

# MAGIC %md
# MAGIC From the class hierarchy it is clear that `KMeans` is an `Estimator` while `KMeansModel` is a `Transformer`.

# COMMAND ----------
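
# A hedged sketch (not the graded solution) of the Estimator/Transformer
# pattern mentioned above: KMeans.fit() returns a KMeansModel, whose
# transform() appends a 'prediction' column.  The DataFrame name
# irisTwoFeatures is an assumption borrowed from other cells in this lab.
from pyspark.ml.clustering import KMeans

kmeansSketch = (KMeans()
                .setK(3)
                .setSeed(5)
                .setMaxIter(20)
                .setInitSteps(1))
modelSketch = kmeansSketch.fit(irisTwoFeatures)             # Estimator -> Model
centersSketch = modelSketch.clusterCenters()                # list of cluster centers
transformedSketch = modelSketch.transform(irisTwoFeatures)  # Transformer adds predictions
print centersSketch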
Example #11
# In[30]:

# TODO: Replace <FILL IN> with appropriate code
print shakespeareRDD.top(5)
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda x: x.split(' '))
shakespeareWordCount = shakespeareWordsRDD.count()
print shakespeareWordsRDD.top(5)
print shakespeareWordCount

# In[31]:

# TEST Words from lines (4d)
# This test allows for leading spaces to be removed either before or after
# punctuation is removed.
Test.assertTrue(
    shakespeareWordCount == 927631 or shakespeareWordCount == 928908,
    'incorrect value for shakespeareWordCount')
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
                  'incorrect value for shakespeareWordsRDD')

# #### ** (4e) Remove empty elements **
# #### The next step is to filter out the empty elements.  Remove all entries where the word is `''`.

# In[32]:

# TODO: Replace <FILL IN> with appropriate code
shakeWordsRDD = shakespeareWordsRDD.filter(lambda x: x != '')
shakeWordCount = shakeWordsRDD.count()
print shakeWordCount
# ** Interpreting PCA **

correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = correlatedData.mean()
correlatedDataZeroMean = correlatedData.map(lambda x: x - meanCorrelated)

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# TEST Interpreting PCA
from test_helper import Test

Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]), "incorrect value for meanCorrelated")
Test.assertTrue(
    np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
    "incorrect value for correlatedDataZeroMean",
)


# **Sample covariance matrix**

correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x, x)).mean()
print correlatedCov

# TEST Sample covariance matrix
covResult = [[0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov), "incorrect value for correlatedCov")
Example #13
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.ml.clustering import KMeans

# Create a KMeans Estimator and set k=3, seed=5, maxIter=20, initSteps=1
kmeans = (<FILL IN>  # create KMeans
          <FILL IN>  # set K
          <FILL IN>  # seed
          <FILL IN>  # maxIter
          <FILL IN>)  # initSteps

#  Call fit on the estimator and pass in our DataFrame
model = <FILL IN>

# Obtain the clusterCenters from the KMeansModel
centers = <FILL IN>

# Use the model to transform the DataFrame by adding cluster predictions
transformed = <FILL IN>

print centers

# COMMAND ----------

# TEST
import numpy as np
Test.assertTrue(np.allclose([ 0.35115296, -0.10691828], centers[0]),
                'incorrect centers.  check your params.')
Test.assertEquals(transformed.select('prediction').map(lambda r: r[0]).take(4), [1,1,1,1],
                  'incorrect predictions')
print('Test RMSE: Baseline = {0:.3f}\tLRInteract = {1:.3f}'.format(
    rmseTestBase, rmseTestInteract))

#---------------------------------------
#---------------------------------------
#Test Suites
#---------------------------------------
#---------------------------------------

# TEST Shift labels (1d)
#oldSampleFeatures = parsedDataInit.take(1)[0].features
#newSampleFeatures = parsedData.take(1)[0].features
#Test.assertTrue(np.allclose(oldSampleFeatures, newSampleFeatures),'new features do not match old features')
sumFeatTwo = parsedData.map(lambda lp: lp.features[2]).sum()
Test.assertTrue(np.allclose(sumFeatTwo, 3158.96224351),
                'parsedData has unexpected values')
minYearNew = parsedData.map(lambda lp: lp.label).min()
maxYearNew = parsedData.map(lambda lp: lp.label).max()
Test.assertTrue(minYearNew == 0, 'incorrect min year in shifted data')
Test.assertTrue(maxYearNew == 89, 'incorrect max year in shifted data')
# TEST Training, validation, and test sets (1e)
Test.assertEquals(parsedTrainData.getNumPartitions(), numPartitions,
                  'parsedTrainData has wrong number of partitions')
Test.assertEquals(parsedValData.getNumPartitions(), numPartitions,
                  'parsedValData has wrong number of partitions')
Test.assertEquals(parsedTestData.getNumPartitions(), numPartitions,
                  'parsedTestData has wrong number of partitions')
Test.assertEquals(len(parsedTrainData.take(1)[0].features), 12,
                  'parsedTrainData has wrong number of features')
sumFeatTwo = (parsedTrainData.map(lambda lp: lp.features[2]).sum())
sumFeatThree = (
Example #15
File: 3.py Project: Mvrm/Spark
 
idfsSmall = idfs(amazonRecToToken.union(googleRecToToken))
uniqueTokenCount = idfsSmall.count()

print idfsSmall.takeOrdered(1, lambda s: s[1])[0]

print 'There are %s unique tokens in the small datasets.' % uniqueTokenCount


# In[16]:

# TEST Implement an IDFs function (2c)
Test.assertEquals(uniqueTokenCount, 4772, 'incorrect uniqueTokenCount')
tokenSmallestIdf = idfsSmall.takeOrdered(1, lambda s: s[1])[0]
Test.assertEquals(tokenSmallestIdf[0], 'software', 'incorrect smallest IDF token')
Test.assertTrue(abs(tokenSmallestIdf[1] - 4.25531914894) < 0.0000000001,
                'incorrect smallest IDF value')


# ### **(2d) Tokens with the smallest IDF**
# #### Print out the 11 tokens with the smallest IDF in the combined small dataset.

# In[17]:

smallIDFTokens = idfsSmall.takeOrdered(11, lambda s: s[1])
print smallIDFTokens


# ### **(2e) IDF Histogram**
# #### Plot a histogram of IDF values.  Be sure to use appropriate scaling and bucketing for the data.
# #### First plot the histogram using `matplotlib`
        titleAndRatingsTuple[1][1],
        titleAndRatingsTuple[0],
        titleAndRatingsTuple[1][0],
    )
)
print "movieNameWithAvgRatingsRDD: %s\n" % movieNameWithAvgRatingsRDD.take(3)


# In[25]:

# TEST Movies with Highest Average Ratings (1b)

Test.assertEquals(movieIDsWithRatingsRDD.count(), 3615, "incorrect movieIDsWithRatingsRDD.count() (expected 3615)")
movieIDsWithRatingsTakeOrdered = movieIDsWithRatingsRDD.takeOrdered(3)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[0][0] == 1 and len(list(movieIDsWithRatingsTakeOrdered[0][1])) == 993,
    "incorrect count of ratings for movieIDsWithRatingsTakeOrdered[0] (expected 993)",
)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[1][0] == 2 and len(list(movieIDsWithRatingsTakeOrdered[1][1])) == 332,
    "incorrect count of ratings for movieIDsWithRatingsTakeOrdered[1] (expected 332)",
)
Test.assertTrue(
    movieIDsWithRatingsTakeOrdered[2][0] == 3 and len(list(movieIDsWithRatingsTakeOrdered[2][1])) == 299,
    "incorrect count of ratings for movieIDsWithRatingsTakeOrdered[2] (expected 299)",
)

Test.assertEquals(
    movieIDsWithAvgRatingsRDD.count(), 3615, "incorrect movieIDsWithAvgRatingsRDD.count() (expected 3615)"
)
Test.assertEquals(
    movieIDsWithAvgRatingsRDD.takeOrdered(3),
Example #17
# MAGIC
# MAGIC Apply a transformation that will split each element of the RDD by its spaces. For each element of the RDD, you should apply Python's string [split()](https://docs.python.org/2/library/string.html#string.split) function. You might think that a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation is the way to do this, but think about what the result of the `split()` function will be: there is a better option.
# MAGIC
# MAGIC Hint: remember the problem we had with `GroupByKey()`

# COMMAND ----------
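
# A small illustration (not part of the lab) of why flatMap() is preferred
# over map() here: map() keeps one list of words per line, while flatMap()
# flattens those lists into a single RDD of words.
exampleLinesRDD = sc.parallelize(['to be or', 'not to be'])
print exampleLinesRDD.map(lambda line: line.split(' ')).collect()
# [['to', 'be', 'or'], ['not', 'to', 'be']]
print exampleLinesRDD.flatMap(lambda line: line.split(' ')).collect()
# ['to', 'be', 'or', 'not', 'to', 'be']

# COMMAND ----------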

shakespeare_words_RDD = shakespeare_RDD.flatMap(lambda x: x.split(' '))
shakespeare_word_count_elem = shakespeare_words_RDD.count()
print shakespeare_words_RDD.top(5)
print shakespeare_word_count_elem

# This test allows for leading spaces to be removed either before or after
# punctuation is removed.
Test.assertTrue(
    shakespeare_word_count_elem == 927631
    or shakespeare_word_count_elem == 928908,
    'incorrect value for shakespeare_word_count_elem')

Test.assertEqualsHashed(shakespeare_words_RDD.top(5),
                        'f177c26ee0bc3a48368d7a92c08dd754237c3558',
                        'incorrect value for shakespeare_words_RDD')

# COMMAND ----------

# MAGIC %md The next step is to filter out the empty elements.  Remove all entries where the word is `''`.

# COMMAND ----------

shakespeare_nonempty_words_RDD = shakespeare_words_RDD.filter(
    lambda x: x != '')
shakespeare_nonempty_word_elem_count = shakespeare_nonempty_words_RDD.count()
print hashTrainData.take(1)

averageSparsityHash = computeSparsity(hashTrainData, numBucketsCTR, nTrain)
averageSparsityOHE = computeSparsity(OHETrainData, numCtrOHEFeats, nTrain)

print 'Average OHE Sparsity: {0:.10e}'.format(averageSparsityOHE)
print 'Average Hash Sparsity: {0:.10e}'.format(averageSparsityHash)

#------------------
# Test Code
#------------------

# TEST Loading and splitting the data (3a)
Test.assertTrue(
    all([
        rawTrainData.is_cached, rawValidationData.is_cached,
        rawTestData.is_cached
    ]), 'you must cache the split data')
Test.assertEquals(nTrain, 79911, 'incorrect value for nTrain')
Test.assertEquals(nVal, 10075, 'incorrect value for nVal')
Test.assertEquals(nTest, 10014, 'incorrect value for nTest')

# TEST Extract features (3b)
Test.assertEquals(numCategories[2][1], 855,
                  'incorrect implementation of parsePoint')
Test.assertEquals(numCategories[32][1], 4,
                  'incorrect implementation of parsePoint')

# TEST Create an OHE dictionary from the dataset (3c)
Test.assertEquals(numCtrOHEFeats, 233286,
                  'incorrect number of features in ctrOHEDict')
dayGroupedHosts = dayToHostPairTuple.distinct().map(lambda (k,v):(k[0],1))

dayHostCount = dayGroupedHosts.reduceByKey(lambda a,b:a+b)

dailyHosts = (dayHostCount
              .sortByKey()
              .cache())
dailyHostsList = dailyHosts.take(30)
print 'Unique hosts per day: %s' % dailyHostsList


# In[42]:

# TEST Number of unique daily hosts (3c)
Test.assertEquals(dailyHosts.count(), 21, 'incorrect dailyHosts.count()')
Test.assertEquals(dailyHostsList, [(1, 2582), (3, 3222), (4, 4190), (5, 2502), (6, 2537), (7, 4106), (8, 4406), (9, 4317), (10, 4523), (11, 4346), (12, 2864), (13, 2650), (14, 4454), (15, 4214), (16, 4340), (17, 4385), (18, 4168), (19, 2550), (20, 2560), (21, 4134), (22, 4456)], 'incorrect dailyHostsList')
Test.assertTrue(dailyHosts.is_cached, 'incorrect dailyHosts.is_cached')


# #### **(3d) Exercise: Visualizing the Number of Unique Daily Hosts**
# #### Using the results from the previous exercise, use `matplotlib` to plot a "Line" graph of the unique host requests by day.
# #### `daysWithHosts` should be a list of days and `hosts` should be a list of the number of unique hosts for each corresponding day.
# #### *How could you convert an RDD into a list? See the [`collect()` method](http://spark.apache.org/docs/latest/api/python/pyspark.html?highlight=collect#pyspark.RDD.collect)*

# In[43]:

# TODO: Replace <FILL IN> with appropriate code

daysWithHosts = dailyHosts.map(lambda (k,v):k).collect()
hosts = dailyHosts.map(lambda (k,v):v).collect()
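
# A hedged sketch of the plot described in (3d), assuming the daysWithHosts
# and hosts lists collected above (the lab may use its own plotting helper).
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 4))
plt.plot(daysWithHosts, hosts)
plt.xlabel('Day')
plt.ylabel('Unique hosts')
pass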

  #return df.limit(10) # THIS IS NOT CORRECT! FIX IT.

# COMMAND ----------

# Transparent Tests
from test_helper import Test
def test_year(year, df):
    return [row.firstName for row in top_female_names_for_year(year, 5, df).collect()]

# COMMAND ----------

def run_tests():
  Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'], 'incorrect top 5 names for 1945')
  Test.assertEquals(test_year(1970, df), [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'], 'incorrect top 5 names for 1970')
  Test.assertEquals(test_year(1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'], 'incorrect top 5 names for 1987')
  Test.assertTrue(len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
  Test.assertTrue(u'James' not in test_year(1945, df), 'male names not filtered')
  Test.assertTrue(test_year(1945, df) != [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered')
  Test.assertEqualsHashed(test_year(1880, df), "2038e2c0bb0b741797a47837c0f94dbf24123447", "incorrect top 5 names for 1880")
  
run_tests()

# COMMAND ----------

# MAGIC %md

# COMMAND ----------

# MAGIC %md 
# MAGIC ## Solution
# MAGIC 
Example #21
Test.assertEqualsHashed(
    sampleOHEDictManual[(1, 'tabby')],
    '1b6453892473a467d07372d45eb05abc2031647a',
    "incorrect value for sampleOHEDictManual[(1,'tabby')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'mouse')],
    'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
    "incorrect value for sampleOHEDictManual[(2,'mouse')]")
Test.assertEqualsHashed(
    sampleOHEDictManual[(2, 'salmon')],
    'c1dfd96eea8cc2b62785275bca38ac261256e278',
    "incorrect value for sampleOHEDictManual[(2,'salmon')]")
Test.assertEquals(len(sampleOHEDictManual.keys()), 7,
                  'incorrect number of keys in sampleOHEDictManual')
# TEST Sparse Vectors (1b)
Test.assertTrue(isinstance(aSparse, SparseVector),
                'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector),
                'bSparse needs to be an instance of SparseVector')
Test.assertTrue(
    aDense.dot(w) == aSparse.dot(w),
    'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(
    bDense.dot(w) == bSparse.dot(w),
    'dot product of bDense and w should equal dot product of bSparse and w')
# TEST OHE Features as sparse vectors (1c)
Test.assertTrue(isinstance(sampleOneOHEFeatManual, SparseVector),
                'sampleOneOHEFeatManual needs to be a SparseVector')
Test.assertTrue(isinstance(sampleTwoOHEFeatManual, SparseVector),
                'sampleTwoOHEFeatManual needs to be a SparseVector')
Test.assertTrue(isinstance(sampleThreeOHEFeatManual, SparseVector),
                'sampleThreeOHEFeatManual needs to be a SparseVector')
Example #22
# ANSWER
from pyspark.ml.feature import Normalizer
normalizer = (Normalizer()
              .setInputCol('features')
              .setOutputCol('featureNorm')
              .setP(2.0))

irisNormalized = normalizer.transform(irisTwoFeatures)  # Note that we're calling transform here
display(irisNormalized)

# COMMAND ----------

# TEST
import numpy as np
firstVector = irisNormalized.select('featureNorm').map(lambda r: r[0]).first()
Test.assertTrue(np.allclose(firstVector.norm(2.0), 1.0), 'incorrect setup of normalizer')

# COMMAND ----------

# MAGIC %md
# MAGIC ## Part 3

# COMMAND ----------

# MAGIC %md
# MAGIC Let's just check and see that our norms are equal to 1.0

# COMMAND ----------

l2Norm = udf(lambda v: float(v.norm(2.0)), DoubleType())
Example #23
pass

# ** Interpreting PCA **

correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = correlatedData.mean()
correlatedDataZeroMean = correlatedData.map(lambda x: x - meanCorrelated)

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# TEST Interpreting PCA
from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(
    np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
    'incorrect value for correlatedDataZeroMean')

# **Sample covariance matrix**

correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x, x)).mean()
print correlatedCov

# TEST Sample covariance matrix
covResult = [[0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov),
                'incorrect value for correlatedCov')

# ** Covariance Function **
# TODO: Replace <FILL IN> with appropriate code
correlatedData = sc.parallelize(dataCorrelated)
#print format(correlatedData)
meanCorrelated = correlatedData.mean()
print format(meanCorrelated)
correlatedDataZeroMean = correlatedData.map(lambda x:
                                            (np.subtract(x, meanCorrelated)))

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# TEST Interpreting PCA (1a)
from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(
    np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
    'incorrect value for correlatedDataZeroMean')

# TODO: Replace <FILL IN> with appropriate code
# Compute the covariance matrix using outer products and correlatedDataZeroMean
n = correlatedDataZeroMean.count()
#print format(correlatedDataZeroMean.map(lambda x: np.outer(x, x)).take(1))
correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x, x)).sum() / n
print format(correlatedCov)

# TEST Sample covariance matrix (1b)
covResult = [[0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov),
                'incorrect value for correlatedCov')
Example #25
# TODO: Replace <FILL IN> with appropriate code
correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = <FILL IN>
correlatedDataZeroMean = correlatedData.<FILL IN>

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# COMMAND ----------

# TEST Interpreting PCA (1a)
from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
                'incorrect value for correlatedDataZeroMean')

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC **(1b) Sample covariance matrix**
# MAGIC 
# MAGIC We are now ready to compute the sample covariance matrix. If we define \\(\scriptsize \mathbf{X} \in \mathbb{R}^{n \times d}\\) as the zero mean data matrix, then the sample covariance matrix is defined as: \\[ \mathbf{C}_{\mathbf X} = \frac{1}{n} \mathbf{X}^\top \mathbf{X} \,.\\]  To compute this matrix, compute the outer product of each data point, add together these outer products, and divide by the number of data points. The data are two dimensional, so the resulting covariance matrix should be a 2x2 matrix.
# MAGIC  
# MAGIC 
# MAGIC Note that [np.outer()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.outer.html) can be used to calculate the outer product of two NumPy arrays.

# COMMAND ----------
Example #26
movieIDsWithRatingsRDD = (ratingsRDD.map(lambda x: (x[1],x[2]))).groupByKey()
print 'movieIDsWithRatingsRDD: {}\n'.format(movieIDsWithRatingsRDD.take(3))

movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
print 'movieIDsWithAvgRatingsRDD: {}\n'.format(movieIDsWithAvgRatingsRDD.take(3))

movieNameWithAvgRatingsRDD = (moviesRDD.join(movieIDsWithAvgRatingsRDD)).map(lambda x: (x[1][1][1],x[1][0],x[1][1][0])).sortBy(lambda x: x[1])
print 'movieNameWithAvgRatingsRDD: {}\n'.format(movieNameWithAvgRatingsRDD.take(3))


print movieNameWithAvgRatingsRDD.takeOrdered(3)
Test.assertEquals(movieIDsWithRatingsRDD.count(), 3615,
                'incorrect movieIDsWithRatingsRDD.count() (expected 3615)')
movieIDsWithRatingsTakeOrdered = movieIDsWithRatingsRDD.takeOrdered(3)
Test.assertTrue(movieIDsWithRatingsTakeOrdered[0][0] == 1 and
                len(list(movieIDsWithRatingsTakeOrdered[0][1])) == 993,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[0] (expected 993)')
Test.assertTrue(movieIDsWithRatingsTakeOrdered[1][0] == 2 and
                len(list(movieIDsWithRatingsTakeOrdered[1][1])) == 332,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[1] (expected 332)')
Test.assertTrue(movieIDsWithRatingsTakeOrdered[2][0] == 3 and
                len(list(movieIDsWithRatingsTakeOrdered[2][1])) == 299,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[2] (expected 299)')

Test.assertEquals(movieIDsWithAvgRatingsRDD.count(), 3615,
                'incorrect movieIDsWithAvgRatingsRDD.count() (expected 3615)')
Test.assertEquals(movieIDsWithAvgRatingsRDD.takeOrdered(3),
                [(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)),
                 (3, (299, 3.0468227424749164))],
                'incorrect movieIDsWithAvgRatingsRDD.takeOrdered(3)')
Example #27
                    .map(lambda iterator: list(iterator))
                    .sortByKey())

hourlyHosts = (hourGroupedHosts
               .map(lambda (k, v_it): (k, len(v_it)))
               .cache())
              
hourlyHostsList = hourlyHosts.collect()
print 'Unique hosts per hour: %s' % hourlyHostsList

# COMMAND ----------

# TEST Number of unique hourly hosts (3c)
Test.assertEquals(hourlyHosts.count(), 18, 'incorrect hourlyHosts.count()')
Test.assertEquals(hourlyHostsList, [(0, 378), (1, 329), (2, 263), (3, 194), (4, 179), (5, 156), (6, 165), (7, 170), (8, 211), (9, 245), (10, 328), (11, 323), (12, 280), (13, 306), (14, 317), (15, 351), (16, 362), (17, 112)], 'incorrect hourlyHostsList')
Test.assertTrue(hourlyHosts.is_cached, 'incorrect hourlyHosts.is_cached')

# COMMAND ----------

# MAGIC %md **(3d) Exercise: Visualizing the Number of Unique Hourly Hosts**
# MAGIC 
# MAGIC Using the results from the previous exercise, use `matplotlib` to plot a "Line" graph of the unique host requests by hour.
# MAGIC 
# MAGIC `hoursWithHosts` should be a list of hours and `hosts` should be a list of the number of unique hosts for each corresponding hour.
# MAGIC 
# MAGIC *How could you convert an RDD into a list? See the [`collect()` method](http://spark.apache.org/docs/latest/api/python/pyspark.html?highlight=collect#pyspark.RDD.collect)*

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
            features.
    """
    <FILL IN>

parsedSamplePoints = <FILL IN>
firstPointFeatures = <FILL IN>
firstPointLabel = <FILL IN>
print firstPointFeatures, firstPointLabel

d = len(firstPointFeatures)
print d

# COMMAND ----------

# TEST Using LabeledPoint (1b)
Test.assertTrue(isinstance(firstPointLabel, float), 'label must be a float')
expectedX0 = [0.8841,0.6105,0.6005,0.4747,0.2472,0.3573,0.3441,0.3396,0.6009,0.4257,0.6049,0.4192]
Test.assertTrue(np.allclose(expectedX0, firstPointFeatures, 1e-4, 1e-4),
                'incorrect features for firstPointFeatures')
Test.assertTrue(np.allclose(2001.0, firstPointLabel), 'incorrect label for firstPointLabel')
Test.assertTrue(d == 12, 'incorrect number of features')

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC **Visualization 1: Features**
# MAGIC 
# MAGIC First we will load and set up the visualization library.  Then we will look at the raw features for 50 data points by generating a heatmap that visualizes each feature on a grey-scale and shows the variation of each feature across the 50 sample data points.  The features are all between 0 and 1, with values closer to 1 represented via darker shades of grey.

# COMMAND ----------
Example #29
plt.scatter(dataCorrelated[:,0], dataCorrelated[:,1], s=14**2, c='#d6ebf2',
            edgecolors='#8cbfd0', alpha=0.75)
pass

correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = correlatedData.mean()
correlatedDataZeroMean = correlatedData.map(lambda x:np.subtract(x,meanCorrelated))

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)


from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
                'incorrect value for correlatedDataZeroMean')

correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x,x)).reduce(lambda x,y:x+y)/correlatedDataZeroMean.count()
print correlatedCov

covResult = [[ 0.99558386,  0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov), 'incorrect value for correlatedCov')


def estimateCovariance(data):
    meanData = data.mean()
    zeroMeanData = data.map(lambda x:np.subtract(x,meanData))
    correlatedMatrix = zeroMeanData.map(lambda x: np.outer(x,x)).reduce(lambda x,y:x+y)/zeroMeanData.count()
    return correlatedMatrix
Example #30
#  Call fit on the estimator and pass in our DataFrame
model = <FILL IN>

# Obtain the clusterCenters from the KMeansModel
centers = <FILL IN>

# Use the model to transform the DataFrame by adding cluster predictions
transformed = <FILL IN>

print centers

# COMMAND ----------

# TEST
import numpy as np
Test.assertTrue(np.allclose([ 0.35115296, -0.10691828], centers[0]),
                'incorrect centers.  check your params.')
Test.assertEquals(transformed.select('prediction').map(lambda r: r[0]).take(4), [1,1,1,1],
                  'incorrect predictions')

# COMMAND ----------

# MAGIC %md
# MAGIC ## PART 3

# COMMAND ----------

# MAGIC %md
# MAGIC From the class hierarchy it is clear that `KMeans` is an `Estimator` while `KMeansModel` is a `Transformer`.

# COMMAND ----------
# In[80]:

# TODO: Replace <FILL IN> with appropriate code
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda x: x.split(" "))
shakespeareWordCount = shakespeareWordsRDD.count()
print shakespeareWordsRDD.top(5)
print shakespeareWordCount


# In[81]:

# TEST Words from lines (4d)
# This test allows for leading spaces to be removed either before or after
# punctuation is removed.
Test.assertTrue(shakespeareWordCount == 927631 or shakespeareWordCount == 928908,
                'incorrect value for shakespeareWordCount')
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
                  'incorrect value for shakespeareWordsRDD')


# #### ** (4e) Remove empty elements **
# #### The next step is to filter out the empty elements.  Remove all entries where the word is `''`.

# In[82]:

# TODO: Replace <FILL IN> with appropriate code
shakeWordsRDD = shakespeareWordsRDD.filter(lambda x:len(x) > 0)
shakeWordCount = shakeWordsRDD.count()
print shakeWordCount
Example #32
# MAGIC %md ** (4d) Words from lines **
# MAGIC
# MAGIC Before we can use the `wordCount()` function, we have to split each line by its spaces. Apply a transformation that will split each element of the RDD by its spaces. For each element of the RDD, you should apply Python's string [split()](https://docs.python.org/2/library/string.html#string.split) function. You might think that a `map()` transformation is the way to do this, but think about what the result of the `split()` function will be.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda x: x.split())
shakespeareWordCount = shakespeareWordsRDD.count()
print shakespeareWordsRDD.top(5)
print shakespeareWordCount

# COMMAND ----------

# TEST Words from lines (4d)
Test.assertTrue(shakespeareWordCount == 959524,
                'incorrect value for shakespeareWordCount')
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwounds', u'zwounds', u'zwounds', u'zwounds', u'zwounds'],
                  'incorrect value for shakespeareWordsRDD')

# COMMAND ----------

# MAGIC %md ** (4e) Count the words **
# MAGIC
# MAGIC We now have an RDD that is only words.  Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the top 10 words by using the `takeOrdered()` action; however, since the elements of the RDD are pairs, we need a custom sort function that sorts using the value part of the pair.
# MAGIC
# MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results.
# MAGIC
# MAGIC Use the `wordCount()` function and `takeOrdered()` to obtain the fifteen most common words and their counts.

# COMMAND ----------
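
# A hedged sketch of the step described above (not the graded solution).
# It assumes the wordCount() helper defined earlier in the lab and the
# shakespeareWordsRDD built in (4d); takeOrdered's key sorts by descending count.
top15WordsAndCounts = wordCount(shakespeareWordsRDD).takeOrdered(
    15, key=lambda (word, count): -count)
print '\n'.join(map(lambda (w, c): '{0}: {1}'.format(w, c), top15WordsAndCounts))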
dayToHostPairTuple = access_logs.map(lambda log:(log.date_time.day, log.host))

dayGroupedHosts = dayToHostPairTuple.groupByKey()
dayHostCount = dayGroupedHosts.map(lambda (day, hosts): (day, len(set(hosts)) ) )

dailyHosts = dayHostCount.sortByKey().cache()
dailyHostsList = dailyHosts.take(30)
print 'Unique hosts per day: %s' % dailyHostsList


# In[46]:

# TEST Number of unique daily hosts (3c)
Test.assertEquals(dailyHosts.count(), 21, 'incorrect dailyHosts.count()')
Test.assertEquals(dailyHostsList, [(1, 2582), (3, 3222), (4, 4190), (5, 2502), (6, 2537), (7, 4106), (8, 4406), (9, 4317), (10, 4523), (11, 4346), (12, 2864), (13, 2650), (14, 4454), (15, 4214), (16, 4340), (17, 4385), (18, 4168), (19, 2550), (20, 2560), (21, 4134), (22, 4456)], 'incorrect dailyHostsList')
Test.assertTrue(dailyHosts.is_cached, 'incorrect dailyHosts.is_cached')


# #### **(3d) Exercise: Visualizing the Number of Unique Daily Hosts**
# #### Using the results from the previous exercise, use `matplotlib` to plot a "Line" graph of the unique host requests by day.
# #### `daysWithHosts` should be a list of days and `hosts` should be a list of the number of unique hosts for each corresponding day.
# #### *How could you convert an RDD into a list? See the [`collect()` method](http://spark.apache.org/docs/latest/api/python/pyspark.html?highlight=collect#pyspark.RDD.collect)*

# In[53]:

# TODO: Replace <FILL IN> with appropriate code

daysWithHosts = dailyHosts.keys().collect()
hosts = dailyHosts.values().collect()

Example #34
# ANSWER
from pyspark.ml.feature import Normalizer
normalizer = (Normalizer()
              .setInputCol('features')
              .setOutputCol('featureNorm')
              .setP(2.0))

irisNormalized = normalizer.transform(irisTwoFeatures)  # Note that we're calling transform here
display(irisNormalized)

# COMMAND ----------

# TEST
import numpy as np
firstVector = irisNormalized.select('featureNorm').map(lambda r: r[0]).first()
Test.assertTrue(np.allclose(firstVector.norm(2.0), 1.0), 'incorrect setup of normalizer')

# COMMAND ----------

# MAGIC %md
# MAGIC ## Part 3

# COMMAND ----------

# MAGIC %md
# MAGIC Let's just check and see that our norms are equal to 1.0

# COMMAND ----------

l2Norm = udf(lambda v: float(v.norm(2.0)), DoubleType())
Example #35
display(irisStandardizedLength)

# COMMAND ----------

display(irisStandardizedLength.describe('sepalLength', 'standardizedLength'))

# COMMAND ----------

# MAGIC %md
# MAGIC What if instead we wanted to normalize the data?  For example, we might want to normalize each set of features (per row) to have length one using an \\( l^2 \\) norm.  That would cause the sum of the features squared to be one: \\( \sum_{i=1}^d x_i^2 = 1 \\).  This could be useful if we wanted to compare observations based on a distance metric like in k-means clustering.
# MAGIC  
# MAGIC Normalizer can be found in [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer) for Python and the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.Normalizer) package for Scala.
# MAGIC  
# MAGIC Let's implement `Normalizer` and transform our features.  Make sure to use a `P` of 2.0 and to name the output column "featureNorm".  Remember that we're working with the `irisTwoFeatures` dataset.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.ml.feature import Normalizer
normalizer = (<FILL IN>)

irisNormalized = normalizer.transform(irisTwoFeatures)  # Note that we're calling transform here
display(irisNormalized)

# COMMAND ----------

# TEST
import numpy as np
firstVector = irisNormalized.select('featureNorm').map(lambda r: r[0]).first()
Test.assertTrue(np.allclose(firstVector.norm(2.0), 1.0), 'incorrect setup of normalizer')
Example #36
corpus_tokens = []
corpus_filtered = []
for n, art in enumerate(corpus_text):
    print "\rTokenizing article {0} out of {1}".format(n + 1, n_art),
    # This is to make sure that all characters have the appropriate encoding.
    art = art.decode('utf-8')
    tokens = word_tokenize(art)
    corpus_tokens.append(tokens)

    print "\n The corpus has been tokenized. Let's check some portion of the first article:"
print corpus_tokens[0][0:30]

Test.assertEquals(len(corpus_tokens), n_art,
                  "The number of articles has changed unexpectedly")
Test.assertTrue(
    len(corpus_tokens) >= 100,
    "Your corpus_tokens has less than 100 articles. Consider using a larger dataset"
)

# Select stemmer.
stemmer = nltk.stem.SnowballStemmer('english')
corpus_filtered = [[] for i in range(n_art)]
for n, token_list in enumerate(corpus_tokens):
    print "\rFiltering article {0} out of {1}".format(n + 1, n_art),
    for token in token_list:
        if token.isalnum():
            corpus_filtered[n].append(stemmer.stem(token.lower()))

print "\nLet's check the first tokens from document 0 after stemming:"
print corpus_filtered[0][0:30]

Test.assertTrue(all([c == c.lower() for c in corpus_filtered[23]]),
    label = line_array.pop(0)
    return LabeledPoint(label, np.array(line_array))

parsedSamplePoints = map(parsePoint, samplePoints)
firstPointFeatures = parsedSamplePoints[0].features
firstPointLabel = parsedSamplePoints[0].label
print firstPointFeatures, firstPointLabel

d = len(firstPointFeatures)
print d


# In[16]:

# TEST Using LabeledPoint (1b)
Test.assertTrue(isinstance(firstPointLabel, float), 'label must be a float')
expectedX0 = [0.8841,0.6105,0.6005,0.4747,0.2472,0.3573,0.3441,0.3396,0.6009,0.4257,0.6049,0.4192]
Test.assertTrue(np.allclose(expectedX0, firstPointFeatures, 1e-4, 1e-4),
                'incorrect features for firstPointFeatures')
Test.assertTrue(np.allclose(2001.0, firstPointLabel), 'incorrect label for firstPointLabel')
Test.assertTrue(d == 12, 'incorrect number of features')


# #### **Visualization 1: Features**
# #### First we will load and set up the visualization library.  Then we will look at the raw features for 50 data points by generating a heatmap that visualizes each feature on a grey-scale and shows the variation of each feature across the 50 sample data points.  The features are all between 0 and 1, with values closer to 1 represented via darker shades of grey.

# In[17]:

import matplotlib.pyplot as plt
import matplotlib.cm as cm
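
# A hedged sketch of the heatmap described above (not the lab's exact
# visualization code).  rawData.take(50) is an assumption standing in for the
# "50 data points"; lp.features.toArray() turns each LabeledPoint's features
# into a NumPy row so imshow can render the grey-scale grid.
sample50 = map(parsePoint, rawData.take(50))
dataValues = np.array([lp.features.toArray() for lp in sample50])

fig = plt.figure(figsize=(8, 6))
plt.imshow(dataValues, interpolation='nearest', aspect='auto', cmap=cm.Greys)
plt.xlabel('Feature')
plt.ylabel('Observation')
pass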
Example #38
from pyspark.ml.feature import StringIndexer

stringIndexer = (StringIndexer()
                 .setInputCol('label')
                 .setOutputCol('indexed'))

indexerModel = stringIndexer.fit(irisTrain)
irisTrainIndexed = indexerModel.transform(irisTrain)
display(irisTrainIndexed)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column')
Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata')

# COMMAND ----------

# MAGIC %md
# MAGIC We've updated the metadata for the field.  Now we know that the field takes on three values and is nominal.

# COMMAND ----------

print irisTrainIndexed.schema.fields[1].metadata
print irisTrainIndexed.schema.fields[2].metadata

# COMMAND ----------

# MAGIC %md
# MAGIC Let's build a decision tree to classify our data.
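
# COMMAND ----------

# A hedged sketch (not the lab's exact cell) of the decision tree announced
# above: DecisionTreeClassifier is an Estimator, so fit() on the indexed
# training data returns a DecisionTreeClassificationModel.
from pyspark.ml.classification import DecisionTreeClassifier

dtSketch = (DecisionTreeClassifier()
            .setLabelCol('indexed')
            .setFeaturesCol('features')
            .setMaxDepth(5))
dtModelSketch = dtSketch.fit(irisTrainIndexed)
print dtModelSketch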
# (average rating, movie name, number of ratings)
movieNameWithAvgRatingsRDD = (moviesRDD
                              .join(movieIDsWithAvgRatingsRDD)
                              .map(lambda log: (log[1][1][1], log[1][0], log[1][1][0])))
print 'movieNameWithAvgRatingsRDD: %s\n' % movieNameWithAvgRatingsRDD.take(3)


# In[26]:

# TEST Movies with Highest Average Ratings (1b)

Test.assertEquals(movieIDsWithRatingsRDD.count(), 3615,
                'incorrect movieIDsWithRatingsRDD.count() (expected 3615)')
movieIDsWithRatingsTakeOrdered = movieIDsWithRatingsRDD.takeOrdered(3)
Test.assertTrue(movieIDsWithRatingsTakeOrdered[0][0] == 1 and
                len(list(movieIDsWithRatingsTakeOrdered[0][1])) == 993,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[0] (expected 993)')
Test.assertTrue(movieIDsWithRatingsTakeOrdered[1][0] == 2 and
                len(list(movieIDsWithRatingsTakeOrdered[1][1])) == 332,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[1] (expected 332)')
Test.assertTrue(movieIDsWithRatingsTakeOrdered[2][0] == 3 and
                len(list(movieIDsWithRatingsTakeOrdered[2][1])) == 299,
                'incorrect count of ratings for movieIDsWithRatingsTakeOrdered[2] (expected 299)')

Test.assertEquals(movieIDsWithAvgRatingsRDD.count(), 3615,
                'incorrect movieIDsWithAvgRatingsRDD.count() (expected 3615)')
Test.assertEquals(movieIDsWithAvgRatingsRDD.takeOrdered(3),
                [(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)),
                 (3, (299, 3.0468227424749164))],
                'incorrect movieIDsWithAvgRatingsRDD.takeOrdered(3)')
Example #40
from pyspark.ml.feature import StringIndexer

stringIndexer = (<FILL IN>
                 .<FILL IN>
                 .<FILL IN>)

indexerModel = stringIndexer.<FILL IN>
irisTrainIndexed = indexerModel.<FILL IN>
display(irisTrainIndexed)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column')
Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata')

# COMMAND ----------

# MAGIC %md
# MAGIC We've updated the metadata for the field.  Now we know that the field takes on three values and is nominal.

# COMMAND ----------

print irisTrainIndexed.schema.fields[1].metadata
print irisTrainIndexed.schema.fields[2].metadata

# COMMAND ----------

# MAGIC %md
# MAGIC Let's build a decision tree to classify our data.
Example #41
    parts = line.split(",")
    return LabeledPoint(parts[0], parts[1:len(parts)])


parsedSamplePoints = map(parsePoint, samplePoints)
firstPointFeatures = parsedSamplePoints[0].features
firstPointLabel = parsedSamplePoints[0].label
print firstPointFeatures, firstPointLabel

d = len(firstPointFeatures)
print d

# In[12]:

# TEST Using LabeledPoint (1b)
Test.assertTrue(isinstance(firstPointLabel, float), 'label must be a float')
expectedX0 = [
    0.8841, 0.6105, 0.6005, 0.4747, 0.2472, 0.3573, 0.3441, 0.3396, 0.6009,
    0.4257, 0.6049, 0.4192
]
Test.assertTrue(np.allclose(expectedX0, firstPointFeatures, 1e-4, 1e-4),
                'incorrect features for firstPointFeatures')
Test.assertTrue(np.allclose(2001.0, firstPointLabel),
                'incorrect label for firstPointLabel')
Test.assertTrue(d == 12, 'incorrect number of features')

# #### **Visualization 1: Features**
# #### First we will load and set up the visualization library.  Then we will look at the raw features for 50 data points by generating a heatmap that visualizes each feature on a grey-scale and shows the variation of each feature across the 50 sample data points.  The features are all between 0 and 1, with values closer to 1 represented via darker shades of grey.

# In[13]:
Example #42
# ANSWER
from pyspark.ml.feature import Normalizer
normalizer = (Normalizer()
              .setInputCol('features')
              .setOutputCol('featureNorm')
              .setP(2.0))

irisNormalized = normalizer.transform(irisTwoFeatures)  # Note that we're calling transform here
display(irisNormalized)

# COMMAND ----------

# TEST
import numpy as np
firstVector = irisNormalized.select('featureNorm').map(lambda r: r[0]).first()
Test.assertTrue(np.allclose(firstVector.norm(2.0), 1.0), 'incorrect setup of normalizer')

# COMMAND ----------

# MAGIC %md
# MAGIC ## Part 3

# COMMAND ----------

# MAGIC %md
# MAGIC Let's just check and see that our norms are equal to 1.0

# COMMAND ----------

l2Norm = udf(lambda v: float(v.norm(2.0)), DoubleType())