# Flat driver script: load the retail_db "orders" and "order_items" CSV
# exports, register them as temp tables, and run a few exploratory queries.
# Assumes `conf` (SparkConf) and the pyspark names (SparkContext, SQLContext,
# Row) are already in scope — defined earlier in the file.
sc = SparkContext(conf=conf)
sqlsc = SQLContext(sc)

ordersFile = "/user/cloudera/cca175/retail_db/orders"
order_itemsFile = "/user/cloudera/cca175/retail_db/order_items"

# Each input line is one CSV record; split into raw string fields.
ordersRdd = sc.textFile(ordersFile).map(lambda l: l.split(","))
order_itemsRdd = sc.textFile(order_itemsFile).map(lambda x: x.split(","))

# orders columns: order_id, order_date, customer_id, order_status
ordersKvRdd = ordersRdd.map(lambda x: Row(orderid=int(x[0]),
                                          odate=str(x[1]),
                                          custid=int(x[2]),
                                          status=str(x[3])))

# order_items columns: order_item_id (col 0), order_item_order_id (col 1), ...
# BUG FIX: the original read x[0] for BOTH fields, so `orderid` held the item
# id and the join below matched item ids against order ids. The foreign key is
# column 1 of the order_items export.
orderItemsKvRdd = order_itemsRdd.map(lambda x: Row(itemid=int(x[0]),
                                                   orderid=int(x[1])))

# Spark 1.x API: infer a schema from the Row RDDs and expose them to SQL.
schemaOrders = sqlsc.inferSchema(ordersKvRdd)
schemaOrderItems = sqlsc.inferSchema(orderItemsKvRdd)
schemaOrders.registerTempTable("orders")
schemaOrderItems.registerTempTable("order_items")

ordersCount = sqlsc.sql("select count(*) as x from orders")
orderItemsCount = sqlsc.sql("select count(*) as x from order_items")
orderStatus = sqlsc.sql("select distinct status from orders")

# Count of distinct orders that have at least one matching order_item.
ordersJoin = sqlsc.sql(
    "select count(distinct x.orderid) as z from order_items x join orders y on x.orderid = y.orderid"
)
orderMatchCount = ordersJoin.map(lambda obj: obj.z).collect()
def main():
    """Join BC housing-related datasets into one monthly table and display it.

    Reads four CSV/text datasets from the `inputs` path prefix (tax report,
    CAD/USD conversion rates, crude-oil prices, interest rates), normalizes
    each to a "YYYY/M" date key, joins them, splits the key into separate
    year/month columns, and calls `.show()` on the result.

    Side effects only (Spark jobs + console output); returns None.
    NOTE(review): `inputs` is not defined in this function — presumably a
    module-level path prefix; confirm it is set before calling main().
    """
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Property tax report: every column read as a raw string.
    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])

    # CAD -> USD conversion-rate export.
    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])

    # Crude-oil price export (declared but unused below; kept for parity).
    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])

    def fixdate(convVal):
        """Turn 'M/D/YY rate' into ('20YY/M', rate).

        Input is "<date> <rate>" where the date looks like M/D/YY;
        the output key matches the 'Y/M' format used by the other tables.
        """
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20" + dates[2] + '/' + dates[0]
        return (alldate, a[1])

    def filterYear(dates):
        """Keep every 'YYYY/M' key except those in year 2016."""
        a = dates.split('/')
        # BUG-ADJACENT cleanup: the original if/else returned the booleans
        # directly; this is the same predicate stated in one expression.
        return a[1] != '2016'

    def processDate(df):
        """Split df's 'year' column ('YYYY/M') into 'year' and 'month' columns."""
        def splitMonth(cols):
            # Month is the part after the slash.
            return cols.split('/')[1]

        def splitYear(cols):
            # Year is the part before the slash.
            return cols.split('/')[0]

        fUDF = udf(splitMonth, StringType())
        df1 = df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))

    # Reading the Tax Report Dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(taxreportSchema).load(inputs + "taxreport/test")
    taxreportinfo.registerTempTable("taxreport")

    # Selecting the price, TaxAssessment Year and Postalcode of each property
    propertyVal = sqlContext.sql("SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")

    # Reading the CAN to USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(conversionSchema).load(inputs + "conversion")
    conversion.registerTempTable("Conversion")

    # Selecting only the date and rate (rate must start with a digit).
    conversionrate = sqlContext.sql("SELECT date,rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(lambda w: (w.date + " " + w.rate))

    # FIX: the original used `lambda (w, x): filterYear(w)` — Python 2 tuple
    # parameter unpacking, a SyntaxError on Python 3 (PEP 3113). Index into
    # the (date, rate) tuple instead.
    conversiondates = conversionRDD.map(fixdate) \
        .filter(lambda wx: filterYear(wx[0])) \
        .map(lambda l: Row(date=l[0], rate=l[1]))
    schemaConv = sqlContext.inferSchema(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql(" SELECT date,CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()  # reused in the final join

    # Reading the Canada Crude oil price dataset (whitespace-separated text).
    crudeoil = sc.textFile(inputs + "crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.inferSchema(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")

    # Selecting the date on Y/M format and oilprice
    oilprice = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')

    # Reading the interest rate of BC dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs + "interestrate")
    interestRate.registerTempTable("interest")

    # Selecting the date and 5-year fixed mortgage rate, limited to 2006-2015.
    interestDF = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate FROM interest WHERE date >='2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")

    # Monthly average over the days whose rate is not null.
    avgInterest = sqlContext.sql(" SELECT date,AVG(interestrate) as interestrates FROM allrates WHERE interestrate IS NOT NULL GROUP BY date")
    avgInterest.cache()

    # Join interest rates with oil prices on the Y/M key...
    joinedTable = avgInterest.join(oilprice, (avgInterest['date'] == oilprice['date'])) \
        .select(avgInterest['date'], avgInterest['interestrates'], oilprice['oilprice'])
    # ...then add the conversion rate; the date key is renamed to 'year' so
    # processDate() can split it into year/month columns.
    JoinedConversion = joinedTable.join(ConverDF, (joinedTable['date'] == ConverDF['date'])) \
        .select(joinedTable['date'].alias('year'), joinedTable['interestrates'],
                joinedTable['oilprice'], ConverDF['conversionrate'])
    JoinedConversion.registerTempTable("joinedConversion")

    ls = processDate(JoinedConversion)
    ls.show()