import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext

# ScanSelect, Join, Timeliness, Accuracy, AccuracyExpression, RowCompleteness,
# Project and Timing are the project's own operator classes; their imports are
# assumed to be available alongside this listing.


class QueryT():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Select columns from the dataframe
    attrList = ["statusTimeliness_qid", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

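# Example invocation of these query scripts (a sketch; the script name and paths
# below are illustrative, not taken from the repository):
#
#   spark-submit QueryT_business.py /data/input /data/output/results.txt csv
#
# i.e. sys.argv[1] is the directory holding the input tables, sys.argv[2] is the
# file that the timing results are appended to, and sys.argv[3] is the dataset
# type expected by ScanSelect.
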
class QueryA():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputScan_left, "ship_date", ">", "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryT():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryT():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryA():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    if typeOfData != 'Categorical':
        accInputExpr_6 = AccuracyExpression(outputScan_left, "Class5Volume", "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(outputScan_left, "Class4Volume", "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(outputScan_left, "Class3Volume", "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(outputScan_left, "Class2Volume", "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(outputScan_left, "Class1Volume", "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

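# The nested AccuracyExpression chain above composes the row-level check
# Volume = Class1Volume + Class2Volume + ... + Class6Volume.  The snippet below is a
# minimal, stand-alone sketch of that check on a pandas DataFrame.  It is only an
# illustration of the rule being expressed; it assumes (rather than reproduces) the
# operator's behaviour of writing a 1/0 score per row into an "Accuracy_score" column,
# and the data values are made up.
import pandas as pd

df = pd.DataFrame({
    "Volume":       [10, 12],
    "Class1Volume": [4, 4],
    "Class2Volume": [3, 3],
    "Class3Volume": [1, 1],
    "Class4Volume": [1, 1],
    "Class5Volume": [1, 1],
    "Class6Volume": [0, 1],
})
class_cols = ["Class%dVolume" % i for i in range(1, 7)]
# 1 where the total volume equals the sum of the per-class volumes, else 0
df["Accuracy_score"] = (df["Volume"] == df[class_cols].sum(axis=1)).astype(int)
print(df[["Volume", "Accuracy_score"]])
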
class QueryA():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    if typeOfData != 'Categorical':
        accInputExpr_6 = AccuracyExpression(outputScan_left, "Class5Volume", "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(outputScan_left, "Class4Volume", "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(outputScan_left, "Class3Volume", "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(outputScan_left, "Class2Volume", "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(outputScan_left, "Class1Volume", "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryC():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryC_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["Class1Volume", "Class2Volume"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Completeness_score"]
    else:
        attrList = ["Sdate", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryC():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

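# RowCompleteness above is driven by a list of column names and a matching list of
# "empty" symbols.  The sketch below illustrates one way such a row-level score can
# be computed in pandas: the fraction of the listed columns whose value is neither
# null nor the designated empty symbol.  This is an assumption about the operator's
# semantics, not its actual implementation, and the sample data are made up.
import pandas as pd

df = pd.DataFrame({
    "order_no":    [1, 2, 3],
    "ship_date":   ["2016-01-02", None, "2016-01-05"],
    "statusOrder": ["shipped", "empty", None],
})
cols, empty_symbol = ["ship_date", "statusOrder"], "empty"
present = df[cols].notna() & (df[cols] != empty_symbol)
df["Completeness_score"] = present.sum(axis=1) / len(cols)  # fraction of non-empty columns
print(df[["order_no", "Completeness_score"]])
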
class QueryC():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = [' ', ' ']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryA():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputScan_left, "ship_date", ">", "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryTA():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46", "M26", "M50"}
    accInputExpr = AccuracyExpression(outputJoin_2, "postcode", "in", setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "customer_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

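# Unlike the arithmetic check used in the traffic queries, the accuracy rule above is
# a set-membership test ("postcode" in {"M46", "M26", "M50"}).  A minimal pandas
# sketch of that kind of row-level check follows; again, the 1/0 scoring and the
# sample data are assumptions for illustration, not the operator's actual code.
import pandas as pd

df = pd.DataFrame({"order_no": [1, 2, 3], "postcode": ["M46", "M99", "M26"]})
valid_postcodes = {"M46", "M26", "M50"}
# 1 where the postcode is one of the accepted values, else 0
df["Accuracy_score"] = df["postcode"].isin(valid_postcodes).astype(int)
print(df)
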
class QueryTA():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputJoin_2, "Latitude", ">", "Longitude")
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryTC():
    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+C_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["postcode", "email"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputJoin_2, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "customer_id", "timeliness_score", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryTC():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+C_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["Latitude", "Longitude"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputJoin_2, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    attrList = ["Cosit", "VolumeTimeliness_id", "timeliness_score", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)

class QueryTA():
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]   # Type of dataset

    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, sqlContext, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46", "M26", "M50"}
    accInputExpr = AccuracyExpression(outputJoin_2, "postcode", "in", setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "customer_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)