/
Qn8.py
101 lines (76 loc) · 3.62 KB
/
Qn8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.feature import Word2VecModel
import datetime
import numpy as np
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from collections import defaultdict
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.tree import RandomForest, RandomForestModel
# Directory that holds the input data sets read by this script.
path = 'Lab2/Assignment3'

# Spark entry points.  The master is set to 'yarn-client' (YARN, client mode).
conf = SparkConf().setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Load the JSON review dump and keep only the three columns used downstream.
df = sqlContext.read.json(path + '/reviews_Pet_Supplies_p1.json')
reviewDF = df.select(
    "overall",
    "reviewText",
    "reviewTime",
)
def removePunctuation(text):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, this strips more than punctuation: digits, whitespace
    other than the plain space, and any non-ASCII characters are replaced
    as well.  Each replaced character yields exactly one space, so the
    length of the string is preserved.

    Args:
        text: The raw string to clean, or ``None`` (e.g. a review whose
            text field is null or missing in the JSON input).

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        # A null review text would otherwise make re.sub raise TypeError
        # and abort the whole job; treat it as an empty review.
        return ""
    return re.sub(r"[^a-zA-Z]", " ", text)
# Tokenise every review into (score, [lower-cased words], reviewTime).
# ``.rdd`` is used explicitly because ``DataFrame.map`` only exists in
# Spark 1.x; ``DataFrame.rdd`` behaves the same there and also works later.
cleanedReviewRDD = reviewDF.rdd.map(
    lambda row: (
        row.overall,
        removePunctuation(row.reviewText).lower().split(),
        row.reviewTime,
    )
)

# Previously pickled RDD, loaded with at least 10 partitions.
# NOTE(review): presumably (word, clusterIndex) pairs, judging by how it is
# joined on the word further down -- confirm against the job that wrote it.
wordsClustersRDD = sc.pickleFile(path + '/WordClustersRDD', 10)

# Attach a numeric row id to each review and make it the key, so that the
# text and the (score, time) parts can be split apart and re-joined later.
# Plain index access replaces the Python-2-only tuple-parameter lambdas.
reviewRDDWithIndex = cleanedReviewRDD.zipWithIndex()\
    .map(lambda pair: (pair[1], pair[0]))\
    .cache()

# (index, words)
reviewTextWithIndex = reviewRDDWithIndex\
    .map(lambda kv: (kv[0], kv[1][1]))

# (index, (score, time))
reviewScoreTimeWithIndex = reviewRDDWithIndex\
    .map(lambda kv: (kv[0], (kv[1][0], kv[1][2])))
def getKey(item):
    """Return the first element of ``item``; used as a sort key for pairs."""
    first = item[0]
    return first
def createSparseVector(histogram, size=2000):
    """Build an MLlib sparse vector from (index, count) pairs.

    Args:
        histogram: Iterable of ``(index, count)`` pairs.  Any iterable is
            accepted, and an empty one yields an all-zero vector.
        size: Dimensionality of the resulting vector.  Defaults to 2000,
            the value that used to be hard-coded here.

    Returns:
        The vector produced by ``Vectors.sparse(size, indices, counts)``.
    """
    # Vectors.sparse expects the index array in ascending order, so sort the
    # pairs by index before splitting them into two parallel lists.
    orderedPairs = sorted(histogram, key=getKey)
    indexList = [histogramIndex for histogramIndex, _ in orderedPairs]
    countList = [count for _, count in orderedPairs]
    return Vectors.sparse(size, indexList, countList)
# Small named helpers replace the Python-2-only tuple-parameter lambdas.

def _wordToRow(record):
    """(rowNum, word) -> (word, rowNum), making the word the join key."""
    rowNum, word = record
    return (word, rowNum)


def _clusterCountToRow(record):
    """((rowNum, clusterIndex), count) -> (rowNum, (clusterIndex, count))."""
    (rowNum, clusterIndex), count = record
    return (rowNum, (clusterIndex, count))


def _toScoreFeaturesYear(record):
    """(rowNum, (features, (score, time))) -> (score, features, year)."""
    rowNum, (features, (score, time)) = record
    # The last four characters of the review time string are parsed as the year.
    year = datetime.datetime.strptime(time[-4:], "%Y").year
    return (score, features, year)


def _toLabeledPoint(record):
    """(score, features, year) -> LabeledPoint(score, features)."""
    score, features, year = record
    return LabeledPoint(float(score), features)


# Per-review histogram of word clusters: count how often each cluster index
# occurs in each review, then pack the counts into one sparse vector per row.
reviewFeaturesRDD = (
    reviewTextWithIndex
    .flatMapValues(lambda word: word)        # one (rowNum, word) per token
    .map(_wordToRow)                         # (word, rowNum)
    .join(wordsClustersRDD)                  # (word, (rowNum, clusterIndex))
    .map(lambda joined: (joined[1], 1))      # ((rowNum, clusterIndex), 1)
    .reduceByKey(lambda x, y: x + y)         # ((rowNum, clusterIndex), count)
    .map(_clusterCountToRow)                 # (rowNum, (clusterIndex, count))
    .groupByKey()
    .mapValues(createSparseVector)
    .cache()
)

# Normalise each feature vector with p = 1.  The transform is applied to the
# RDD of vectors as a whole and the row numbers are re-attached with zip();
# both sides are plain maps over the same cached parent RDD, which is what
# zip() needs (same partitioning, same number of elements per partition).
norm = Normalizer(1)
normalisedReviewFeaturesRDD = reviewFeaturesRDD\
    .map(lambda kv: kv[0])\
    .zip(norm.transform(reviewFeaturesRDD.map(lambda kv: kv[1])))

# NOTE(review): not referenced anywhere in the visible code.
formatter_string = "%m %d %Y"

# (score, features, year) for every review that produced a feature vector.
allDataRDD = normalisedReviewFeaturesRDD\
    .join(reviewScoreTimeWithIndex)\
    .map(_toScoreFeaturesYear)\
    .cache()

# Time-based split: reviews before 2014 train the model, the rest validate it.
train_featureScoreTimeRDD = allDataRDD\
    .filter(lambda record: record[2] < 2014)\
    .map(_toLabeledPoint)\
    .repartition(10)\
    .cache()
val_featureScoreTimeRDD = allDataRDD\
    .filter(lambda record: record[2] >= 2014)\
    .map(_toLabeledPoint)\
    .repartition(10)\
    .cache()

# RDDs are lazy: nothing above has been computed yet.  Run an action on both
# splits so they are actually cached *before* their parents are unpersisted;
# unpersisting first would throw the parent caches away unused and force the
# whole join pipeline to be recomputed for each split.
train_featureScoreTimeRDD.count()
val_featureScoreTimeRDD.count()

allDataRDD.unpersist()
reviewFeaturesRDD.unpersist()
def getRandomForestRMSE(trees_array):
    """Train one random-forest regressor per forest size and score each one.

    Every model is trained on the module-level ``train_featureScoreTimeRDD``
    and evaluated on the module-level ``val_featureScoreTimeRDD``.

    Args:
        trees_array: Iterable of forest sizes (``numTrees`` values) to try.

    Returns:
        A list of ``(trees, valRMSE)`` tuples, one per entry of
        ``trees_array`` and in the same order, where ``valRMSE`` is the
        root-mean-squared error on the validation set.

    Raises:
        ValueError: If the validation set is empty, since the RMSE would
            be undefined.  Raised before any model is trained.
    """
    def squaredError(labelAndPrediction):
        # Index access instead of a Python-2-only tuple-parameter lambda.
        diff = labelAndPrediction[0] - labelAndPrediction[1]
        return diff * diff

    # None of these depend on the forest size, so derive them once rather
    # than once per iteration.
    valFeatures = val_featureScoreTimeRDD.map(lambda lp: lp.features)
    valLabels = val_featureScoreTimeRDD.map(lambda lp: lp.label)
    valCount = val_featureScoreTimeRDD.count()
    if valCount == 0:
        raise ValueError("validation set is empty; cannot compute RMSE")

    valRMSE_list = []
    for trees in trees_array:
        model = RandomForest.trainRegressor(
            train_featureScoreTimeRDD, categoricalFeaturesInfo={},
            numTrees=trees, featureSubsetStrategy="auto",
            impurity='variance', maxDepth=4, maxBins=32)
        # Predict on the RDD of feature vectors as a whole, then pair the
        # predictions back up with the true labels.
        predictions = model.predict(valFeatures)
        labelsAndPreds = valLabels.zip(predictions)
        valMSE = labelsAndPreds.map(squaredError).sum() / float(valCount)
        valRMSE = valMSE ** 0.5
        valRMSE_list.append((trees, valRMSE))
    return valRMSE_list
# Forest sizes to compare on the validation set.
trees_array = [10, 50, 100, 500, 1000]
valRMSE_List = getRandomForestRMSE(trees_array)

# Call-form print with a single argument produces the same output on
# Python 2 and is also valid Python 3, unlike the print statement.
print("RMSE")
print(valRMSE_List)