import datetime

from pyspark.sql.functions import udf, month, weekofyear

# Project-local modules: configuration constants, DataFrame helpers, and
# cluster print/output helpers.
import consts
import operation
import printR


def combine(sqlc):
    companyName = consts.company
    consts.stockFile = consts.setStockFile(companyName, consts.user)

    # Read stock file
    stockData = sqlc.read.format('com.databricks.spark.csv') \
        .options(header='true') \
        .option("inferschema", 'true') \
        .option("encoding", "UTF-8") \
        .load(consts.stockFile)

    begin = datetime.datetime.fromtimestamp(consts.beginTime).strftime('%Y/%m/%d')
    end = datetime.datetime.fromtimestamp(consts.endTime).strftime('%Y/%m/%d')
    stockDataYear = operation.selectStock(stockData, ["date", "close"], begin, end)

    # Change date format from Y/M/d to Y-M-d
    my_udf = udf(operation.formatDate)
    stockData = stockDataYear.withColumn("date", my_udf(stockDataYear.date))
    if consts.timeframe != 'day':
        stockData = operation.averageStock(stockData, consts.timeframe)
        print('stockData != day')
        print(stockData.take(3))

    # Read meta and reviews files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfile)
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
    reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"],
                                      consts.beginTime, consts.endTime)

    # Join reviews with product metadata on asin
    reviews = reviews.join(meta, "asin")
    rating = operation.averageRating(reviews, consts.timeframe)

    # Join ratings with stock values on date
    combine = rating.join(stockData, "date")
    combine = combine.orderBy("date", ascending=True)
    # combine.write.format("com.databricks.spark.csv").save(
    #     "/user/" + consts.user + "/project/data/" + consts.folder, header="true")
    printR.printClusterRDD(combine.rdd, consts.user, consts.folder)
    # printR.saveClusterCSV(combine, consts.user, consts.folder)

    dates = [rat.date for rat in combine.select('date').collect()]
    ratings = [float(rat.avgRating) for rat in combine.select('avgRating').collect()]
    stocks = [float(stock.close) for stock in combine.select('close').collect()]
    # Period-over-period percentage change of ratings and stock prices
    diffRatings = [((j - i) * 100.0) / i for i, j in zip(ratings[:-1], ratings[1:])]
    diffStocks = [((j - i) * 100.0) / i for i, j in zip(stocks[:-1], stocks[1:])]
    # rows = zip(dates, ratings, stocks, diffRatings, diffStocks)
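
# The helpers in the operation module are not shown in this file. As
# illustration, a minimal sketch of what operation.formatDate would need to do,
# given that the UDF above only converts the stock file's 'Y/M/d' dates to
# 'Y-M-d' (the name formatDateSketch and the exact format are assumptions):
def formatDateSketch(dateStr):
    # '2014/01/31' -> '2014-01-31'
    return dateStr.replace('/', '-')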
def getRatingAvg(sqlc):
    # Read files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfilefarm)

    # Select data
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
    reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime", "reviewTime"],
                                      consts.beginTime, consts.endTime)

    # Join reviews with product metadata on asin, then average the ratings
    reviews = reviews.join(meta, "asin")
    rating = reviews.agg({"overall": "avg"})

    # Print
    printR.printFarm(rating)
def getRatingGroupAvg(sqlc):
    # Read files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfile)

    # Select data
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
    reviews = operation.selectReviews(df2, ['asin', 'overall', "unixReviewTime"],
                                      consts.beginTime, consts.endTime)

    # Join reviews with product metadata on asin
    reviews = reviews.join(meta, "asin")
    rating = operation.averageRating(reviews, consts.timeframe)

    # Print
    printR.printClusterRDD(rating.rdd, consts.user, consts.folder)
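
# A hedged sketch of operation.averageRating, inferred from how its result is
# consumed in combine() above (a 'date' column plus the mean of 'overall'
# exposed as 'avgRating'); any per-week or per-month date bucketing the real
# helper performs for non-daily timeframes is omitted here:
from pyspark.sql.functions import avg

def averageRatingSketch(reviews, timeframe):
    # Average the 'overall' rating per date; in the real helper, 'timeframe'
    # presumably controls whether dates are grouped by day, week, or month.
    return reviews.groupBy("date").agg(avg("overall").alias("avgRating"))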
def countRatings(sqlc):
    # Read files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfilefarm)

    # Select data
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 25)
    reviews = operation.selectReviews(df2, ['asin', "unixReviewTime"],
                                      consts.beginTime, consts.endTime)

    # Join reviews with product metadata on asin
    reviews = reviews.join(meta, "asin")

    # Approximate count of the joined reviews
    contagem = operation.countApprox(reviews.rdd)
    print(contagem)
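
# Sketch of operation.countApprox, assuming it wraps PySpark's built-in
# RDD.countApprox (an approximate count that returns within a timeout); the
# timeout and confidence values below are placeholders:
def countApproxSketch(rdd, timeout_ms=1000, confidence=0.95):
    return rdd.countApprox(timeout_ms, confidence)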
def multipleCompanies(sqlc):
    stockDataYearApple = operation.readStockValue(
        consts.appleStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)
    stockDataYearHp = operation.readStockValue(
        consts.hpStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)
    stockDataYearMicrosoft = operation.readStockValue(
        consts.microsoftStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)
    stockDataYearDell = operation.readStockValue(
        consts.dellStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)
    stockDataYearSony = operation.readStockValue(
        consts.sonyStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)
    stockDataYearSamsung = operation.readStockValue(
        consts.samsungStockFile, sqlc, ["date", "close"],
        consts.beginTime, consts.endTime)

    stockDataList = [stockDataYearApple, stockDataYearHp, stockDataYearMicrosoft,
                     stockDataYearDell, stockDataYearSony, stockDataYearSamsung]
    companyList = ['apple', 'hp', 'microsoft', 'dell', 'sony', 'samsung']

    # Change date format from Y/M/d to Y-M-d
    my_udf = udf(operation.formatDate)
    for index, stock in enumerate(stockDataList):
        stockDataList[index] = stock.withColumn("date", my_udf("date"))
        print(stockDataList[index].take(2))

    # Read meta and reviews files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfilefarm)

    results = None
    for index, company in enumerate(companyList):
        stockDataList[index] = stockDataList[index].withColumnRenamed('close', 'stock ' + company)
        meta = operation.selectProducts(df, ["asin", "title", "price"], company, 50)
        reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"],
                                          consts.beginTime, consts.endTime)
        amazonjoin = reviews.join(meta, "asin")
        print("amazonjoin " + company)
        print(amazonjoin.take(5))
        rating = operation.averageRatingAlias(amazonjoin, consts.timeframe, 'rating ' + company)
        print("rating and stock " + company)
        print(rating.take(5))
        print(stockDataList[index].take(5))
        combine = rating.join(stockDataList[index], "date")
        combine = combine.orderBy("date", ascending=True)
        print("combine " + company)
        print(combine.take(5))
        # if index == 0:
        #     results = combine
        # else:
        #     results = results.join(combine, "date")
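
# Sketch of operation.readStockValue, assuming it factors out the inline
# stock-reading code from combine() above (CSV read, then a date-range
# selection via operation.selectStock); the helper's actual body may differ:
def readStockValueSketch(stockFile, sqlc, columns, beginTime, endTime):
    stockData = sqlc.read.format('com.databricks.spark.csv') \
        .options(header='true') \
        .option("inferschema", 'true') \
        .option("encoding", "UTF-8") \
        .load(stockFile)
    begin = datetime.datetime.fromtimestamp(beginTime).strftime('%Y/%m/%d')
    end = datetime.datetime.fromtimestamp(endTime).strftime('%Y/%m/%d')
    return operation.selectStock(stockData, columns, begin, end)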
def combine(sqlc):
    companyName = consts.company
    consts.stockFile = consts.setStockFile(companyName, consts.user)

    # Read stock file
    stockData = sqlc.read.format('com.databricks.spark.csv') \
        .options(header='true') \
        .option("inferschema", 'true') \
        .option("encoding", "UTF-8") \
        .load(consts.stockFile)

    begin = datetime.datetime.fromtimestamp(consts.beginTime).strftime('%Y/%m/%d')
    end = datetime.datetime.fromtimestamp(consts.endTime).strftime('%Y/%m/%d')
    stockDataYear = operation.selectStock(stockData, ["date", "close"], begin, end)

    # Change date format from Y/M/d to Y-M-d
    my_udf = udf(operation.formatDate)
    stockData = stockDataYear.withColumn("date", my_udf(stockDataYear.date))
    if consts.timeframe != 'day':
        stockData = operation.averageStock(stockData, consts.timeframe)

    # Read meta and reviews files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfilefarm)
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 50)
    reviews = operation.selectReviews(df2, ['asin', "overall", "unixReviewTime"],
                                      consts.beginTime, consts.endTime)

    # Join reviews with product metadata on asin
    reviews = reviews.join(meta, "asin")
    rating = operation.averageRating(reviews, consts.timeframe)
    printR.printFarmExample(rating, 2)

    # Join ratings with stock values on date
    combine = rating.join(stockData, "date")
    combine = combine.orderBy("date", ascending=True)
    printR.printFarm(combine)
    # Generate CSV with output data
def countReviews(sqlc):
    # Read files
    df = sqlc.read.json(consts.filename)
    df2 = sqlc.read.json(consts.reviewsfilefarm)

    # Select data
    meta = operation.selectProducts(df, ["asin", "title", "price"], consts.company, 25)
    reviews = operation.selectReviews(df2, ['asin', "unixReviewTime"],
                                      consts.beginTime, consts.endTime)
    timeframe = consts.timeframe

    # Join reviews with product metadata on asin
    reviews = reviews.join(meta, "asin")

    # Count reviews per month, week, or day
    if timeframe == 'month':
        res = reviews.groupBy(month(reviews.date)).count() \
            .orderBy('month(date)', ascending=True)
    elif timeframe == 'week':
        res = reviews.groupBy(weekofyear(reviews.date)).count() \
            .orderBy('weekofyear(date)', ascending=True)
    else:
        res = reviews.groupBy("date").count().orderBy('date', ascending=True)
    print(res.collect())
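
# Hedged sketch of operation.selectReviews: countReviews() above groups on
# reviews.date even though it only selected 'asin' and 'unixReviewTime', so
# the helper presumably derives a 'date' column from the unix timestamp while
# filtering to [beginTime, endTime]; the from_unixtime format is an assumption:
from pyspark.sql.functions import col, from_unixtime

def selectReviewsSketch(df, columns, beginTime, endTime):
    reviews = df.select(columns) \
        .filter((col('unixReviewTime') >= beginTime) &
                (col('unixReviewTime') <= endTime))
    return reviews.withColumn('date',
                              from_unixtime(col('unixReviewTime'), 'yyyy-MM-dd'))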