コード例 #1
0
ファイル: index.py プロジェクト: diogovazc/mbigdata17ut
def combine(sqlc):
	"""Join average review ratings with stock closing prices for one company.

	Loads the stock CSV for consts.company, the product metadata JSON and the
	reviews JSON, averages the ratings per consts.timeframe, joins them with
	the stock data on "date", prints the joined result through
	printR.printClusterRDD, and finally pulls the date / avgRating / close
	columns to the driver to compute percentage changes between consecutive
	rows.

	Side effect: consts.stockFile is overwritten with the path returned by
	consts.setStockFile.

	Args:
		sqlc: object exposing ``read`` as used below (``read.format(...).load``
			and ``read.json``) -- presumably a Spark SQLContext; confirm with
			the caller.

	Raises:
		ZeroDivisionError: if any value used as the base of a percentage
			change (every rating / close except the last) equals 0.
	"""
	companyName = consts.company
	# Stores the resolved path back on the consts module, so later readers of
	# consts.stockFile see this company's file.
	consts.stockFile = consts.setStockFile(companyName, consts.user)

	"""Read stock file"""
	stockData = sqlc.read.format('com.databricks.spark.csv') \
	    .options(header='true') \
	    .option("inferschema", 'true') \
	    .option("encoding", "UTF-8") \
	    .load(consts.stockFile)

	# beginTime / endTime are handed to fromtimestamp, i.e. treated as epoch
	# seconds; fromtimestamp converts them using the machine's local timezone.
	a = datetime.datetime.fromtimestamp(consts.beginTime).strftime('%Y/%m/%d')
	b = datetime.datetime.fromtimestamp(consts.endTime).strftime('%Y/%m/%d')

	# presumably keeps the date/close columns for rows between a and b --
	# confirm in operation.selectStock.
	stockDataYear = operation.selectStock(stockData, ["date", "close"], a, b)

	"""Change Date Format from Y/M/d to Y-M-d"""
	# NOTE(review): udf() is called without a return type; if this is
	# pyspark.sql.functions.udf the resulting "date" column is a string.
	my_udf = udf(operation.formatDate)
	stockData = stockDataYear.withColumn("date", my_udf(stockDataYear.date))
	# For any timeframe other than 'day' the stock rows are aggregated by
	# operation.averageStock; the two prints are debug output.
	if consts.timeframe != 'day': 
		stockData = operation.averageStock(stockData, consts.timeframe)
		print 'stockData != day'
		print stockData.take(3)

	"""Read Meta and Reviews Files"""
	df = sqlc.read.json(consts.filename)
	df2 = sqlc.read.json(consts.reviewsfile)
	# NOTE(review): the meaning of the literal 50 is defined inside
	# operation.selectProducts and is not visible from here.
	meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
	reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"], consts.beginTime, consts.endTime)

	"""Join Reviews asin"""
	# Keep only reviews whose asin appears in the selected products.
	reviews = reviews.join(meta, "asin")
	rating = operation.averageRating(reviews, consts.timeframe)

	"""Join ratings with stock"""
	# NOTE: this local shadows the enclosing function's own name.
	combine = rating.join(stockData, "date")
	combine = combine.orderBy("date", ascending=True)

	"""combine.write.format("com.databricks.spark.csv").save("/user/" + consts.user + "/project/data/" + consts.folder, header="true")"""

	printR.printClusterRDD(combine.rdd, consts.user, consts.folder)
	"""printR.saveClusterCSV(combine, consts.user, consts.folder)"""

	# Each collect() below is a separate action that pulls one column of the
	# ordered result to the driver.
	dates = [rat.date for rat in combine.select('date').collect()]
	ratings = [float(rat.avgRating) for rat in combine.select('avgRating').collect()]
	stocks = [float(stock.close) for stock in combine.select('close').collect()]
	# Percentage change from each row to the next; the lists are one element
	# shorter than ratings / stocks. A base value of 0 raises ZeroDivisionError.
	diffRatings = [(((j-i)*100.0)/i) for i, j in zip(ratings[:-1], ratings[1:])]
	diffStocks = [(((j-i)*100.0)/i) for i, j in zip(stocks[:-1], stocks[1:])]

	'''rows = zip(dates, ratings, stocks, diffRatings, diffStocks)
コード例 #2
0
def getRatingAvg(sqlc):
	"""Print the overall average rating of the configured company's products.

	Reads the product metadata (consts.filename) and the reviews
	(consts.reviewsfilefarm), keeps the reviews whose asin appears in the
	selected products, averages their "overall" column and hands the result
	to printR.printFarm.

	Args:
		sqlc: object exposing ``read.json`` -- presumably a Spark SQLContext;
			confirm with the caller.
	"""
	# Load the raw inputs.
	df = sqlc.read.json(consts.filename)
	df2 = sqlc.read.json(consts.reviewsfilefarm)

	# NOTE(review): the meaning of the literal 50 is defined inside
	# operation.selectProducts and is not visible from here.
	meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
	reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime", "reviewTime"], consts.beginTime, consts.endTime)

	# Join with the products exactly once. The previous code joined the
	# already-joined frame with meta a second time, which duplicated the
	# title/price columns, added a needless shuffle and would multiply rows
	# (skewing the average) whenever an asin occurs more than once in meta.
	reviews = reviews.join(meta, "asin")
	rating = reviews.agg({"overall": "avg"})

	printR.printFarm(rating)
コード例 #3
0
ファイル: index.py プロジェクト: diogovazc/mbigdata17ut
def getRatingGroupAvg(sqlc):
	"""Print the average rating per timeframe for the configured company.

	Reads the product metadata (consts.filename) and the reviews
	(consts.reviewsfile), keeps the reviews whose asin appears in the
	selected products, averages them with operation.averageRating for
	consts.timeframe and passes the result's RDD to printR.printClusterRDD.

	Args:
		sqlc: object exposing ``read.json`` -- presumably a Spark SQLContext;
			confirm with the caller.
	"""
	# Raw inputs.
	products_raw = sqlc.read.json(consts.filename)
	reviews_raw = sqlc.read.json(consts.reviewsfile)

	# Project-specific selections (semantics live in the operation module).
	products = operation.selectProducts(
		products_raw, ["asin", "title", "price"], consts.company, 50)
	selected_reviews = operation.selectReviews(
		reviews_raw, ['asin', 'overall', "unixReviewTime"],
		consts.beginTime, consts.endTime)

	# Restrict the reviews to the selected products, then average.
	product_reviews = selected_reviews.join(products, "asin")
	averaged = operation.averageRating(product_reviews, consts.timeframe)

	printR.printClusterRDD(averaged.rdd, consts.user, consts.folder)
コード例 #4
0
def countRatings(sqlc):
	"""Print an approximate count of the configured company's reviews.

	Reads the product metadata (consts.filename) and the reviews
	(consts.reviewsfilefarm), keeps the reviews whose asin appears in the
	selected products and prints whatever operation.countApprox returns for
	the joined RDD.

	Args:
		sqlc: object exposing ``read.json`` -- presumably a Spark SQLContext;
			confirm with the caller.
	"""
	# Raw inputs.
	products_raw = sqlc.read.json(consts.filename)
	reviews_raw = sqlc.read.json(consts.reviewsfilefarm)

	# Project-specific selections (semantics live in the operation module).
	products = operation.selectProducts(
		products_raw, ["asin", "title", "price"], consts.company, 25)
	selected_reviews = operation.selectReviews(
		reviews_raw, ['asin', "unixReviewTime"],
		consts.beginTime, consts.endTime)

	# Restrict the reviews to the selected products before counting.
	product_reviews = selected_reviews.join(products, "asin")
	approx_count = operation.countApprox(product_reviews.rdd)

	# Single-argument parenthesised form prints identically on Python 2.
	print(approx_count)
コード例 #5
0
def multipleCompanies(sqlc):
	"""Join average ratings with stock closes for six companies and print them.

	For apple, hp, microsoft, dell, sony and samsung (in that order) the
	function reads the company's stock values, reformats the "date" column
	with operation.formatDate, renames "close" to "stock <company>", computes
	the per-timeframe rating via operation.averageRatingAlias and joins both
	on "date". Samples of every intermediate frame are printed; nothing is
	returned and the per-company results are not merged.

	Args:
		sqlc: object exposing ``read.json`` and accepted by
			operation.readStockValue -- presumably a Spark SQLContext;
			confirm with the caller.
	"""
	companyList = ['apple', 'hp', 'microsoft', 'dell', 'sony', 'samsung']
	# Same order as companyList: the two lists are paired with zip() below.
	stockFiles = [
		consts.appleStockFile,
		consts.hpStockFile,
		consts.microsoftStockFile,
		consts.dellStockFile,
		consts.sonyStockFile,
		consts.samsungStockFile,
	]
	stockDataList = [
		operation.readStockValue(stockFile, sqlc, ["date", "close"], consts.beginTime, consts.endTime)
		for stockFile in stockFiles
	]

	# Change the date format from Y/M/d to Y-M-d.
	my_udf = udf(operation.formatDate)
	formattedStocks = []
	for stock in stockDataList:
		stock = stock.withColumn("date", my_udf("date"))
		print(stock.take(2))
		formattedStocks.append(stock)

	# Read the metadata and reviews files.
	df = sqlc.read.json(consts.filename)
	df2 = sqlc.read.json(consts.reviewsfilefarm)

	for company, stock in zip(companyList, formattedStocks):
		# Give the close column a company-specific name so frames of different
		# companies could later sit side by side.
		stock = stock.withColumnRenamed('close', 'stock ' + company)
		meta = operation.selectProducts(df, ["asin", "title", "price"], company, 50)
		# NOTE(review): the arguments do not depend on `company`; if
		# operation.selectReviews is a pure transformation this could be
		# built once before the loop.
		reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"], consts.beginTime, consts.endTime)
		amazonjoin = reviews.join(meta, "asin")
		print("amazonjoin " + company)
		print(amazonjoin.take(5))
		rating = operation.averageRatingAlias(amazonjoin, consts.timeframe, 'rating ' + company)
		print("rating and stock " + company)
		print(rating.take(5))
		print(stock.take(5))
		combined = rating.join(stock, "date")
		combined = combined.orderBy("date", ascending=True)
		print("combine " + company)
		print(combined.take(5))
		# TODO: the per-company frames are only printed. An earlier sketch
		# merged them by joining each `combined` onto an accumulated result
		# on "date"; that merge was disabled and is not performed here.
コード例 #6
0
def combine(sqlc):
	"""Join average review ratings with stock closing prices for one company.

	Loads the stock CSV for consts.company, the product metadata JSON and the
	reviews JSON (consts.reviewsfilefarm), averages the ratings per
	consts.timeframe, joins them with the stock data on "date", and prints
	both the ratings (printR.printFarmExample) and the joined, date-ordered
	result (printR.printFarm).

	Side effect: consts.stockFile is overwritten with the path returned by
	consts.setStockFile.

	Args:
		sqlc: object exposing ``read`` as used below (``read.format(...).load``
			and ``read.json``) -- presumably a Spark SQLContext; confirm with
			the caller.
	"""
	companyName = consts.company
	# Stores the resolved path back on the consts module, so later readers of
	# consts.stockFile see this company's file.
	consts.stockFile = consts.setStockFile(companyName, consts.user)

	"""Read stock file"""
	stockData = sqlc.read.format('com.databricks.spark.csv') \
	    .options(header='true') \
	    .option("inferschema", 'true') \
	    .option("encoding", "UTF-8") \
	    .load(consts.stockFile)

	# beginTime / endTime are handed to fromtimestamp, i.e. treated as epoch
	# seconds; fromtimestamp converts them using the machine's local timezone.
	a = datetime.datetime.fromtimestamp(consts.beginTime).strftime('%Y/%m/%d')
	b = datetime.datetime.fromtimestamp(consts.endTime).strftime('%Y/%m/%d')

	# presumably keeps the date/close columns for rows between a and b --
	# confirm in operation.selectStock.
	stockDataYear = operation.selectStock(stockData, ["date", "close"], a, b)

	"""Change Date Format from Y/M/d to Y-M-d"""
	# NOTE(review): udf() is called without a return type; if this is
	# pyspark.sql.functions.udf the resulting "date" column is a string.
	my_udf = udf(operation.formatDate)
	stockData = stockDataYear.withColumn("date", my_udf(stockDataYear.date))
	# For any timeframe other than 'day' the stock rows are aggregated by
	# operation.averageStock.
	if consts.timeframe != 'day': 
		stockData = operation.averageStock(stockData, consts.timeframe)

	"""Read Meta and Reviews Files"""
	df = sqlc.read.json(consts.filename)
	df2 = sqlc.read.json(consts.reviewsfilefarm)
	# NOTE(review): the meaning of the literal 50 is defined inside
	# operation.selectProducts and is not visible from here.
	meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
	reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"], consts.beginTime, consts.endTime)

	"""Join Reviews asin"""
	# Keep only reviews whose asin appears in the selected products.
	reviews = reviews.join(meta, "asin")
	rating = operation.averageRating(reviews, consts.timeframe)
	# NOTE(review): the meaning of the literal 2 is defined inside
	# printR.printFarmExample and is not visible from here.
	printR.printFarmExample(rating, 2)

	"""Join ratings with stock"""
	# NOTE: this local shadows the enclosing function's own name.
	combine = rating.join(stockData, "date")
	combine = combine.orderBy("date", ascending=True)

	printR.printFarm(combine)

	"""# Generate CSV with output data
コード例 #7
0
def countReviews(sqlc):
	"""Print the number of reviews per month, week of year or date.

	Reads the product metadata (consts.filename) and the reviews
	(consts.reviewsfilefarm), keeps the reviews whose asin appears in the
	selected products, groups them according to consts.timeframe ('month',
	'week', anything else groups by the "date" column), sorts the counts
	ascending by the grouping column and prints the collected rows.

	Args:
		sqlc: object exposing ``read.json`` -- presumably a Spark SQLContext;
			confirm with the caller.
	"""
	# Raw inputs.
	products_raw = sqlc.read.json(consts.filename)
	reviews_raw = sqlc.read.json(consts.reviewsfilefarm)

	# Project-specific selections (semantics live in the operation module).
	products = operation.selectProducts(
		products_raw, ["asin", "title", "price"], consts.company, 25)
	selected_reviews = operation.selectReviews(
		reviews_raw, ['asin', "unixReviewTime"],
		consts.beginTime, consts.endTime)

	granularity = consts.timeframe

	# Restrict the reviews to the selected products.
	product_reviews = selected_reviews.join(products, "asin")

	# Choose the grouping key and the name of the column to sort by; the
	# sort names are the literal strings the grouped frame is ordered on.
	if granularity == 'month':
		group_key = month(product_reviews.date)
		sort_column = 'month(date)'
	elif granularity == 'week':
		group_key = weekofyear(product_reviews.date)
		sort_column = 'weekofyear(date)'
	else:
		group_key = "date"
		sort_column = 'date'

	counts = product_reviews.groupBy(group_key).count()
	ordered_counts = counts.orderBy(sort_column, ascending=True)

	# Single-argument parenthesised form prints identically on Python 2.
	print(ordered_counts.collect())