############# ############# ############# ############# #############
# (Imports assumed from the surrounding module, shown here for completeness.)
import time
from pyspark.sql.types import BooleanType
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import ChiSqSelector
import fspLib
import aggregatedComparison


def locationTest(sc, sqlContext, lPolygon, lStop, modelName='random forest', num_features=-1):
    # Partition data into 4 parts: train (positive examples), train (negative
    # examples), test (positive), test (negative)
    t1 = time.time()
    # Copy before extending: lPolygon[0] must remain the positive-training
    # polygon set, so don't mutate it in place when building the combined list.
    lAllPoly = list(lPolygon[0])
    lAllPoly.extend(lPolygon[1])
    lAllPoly.extend(lPolygon[2])
    bc_AllPoly = sc.broadcast(lAllPoly)
    bc_PosTrainPoly = sc.broadcast(lPolygon[0])
    bc_PosTestPoly = sc.broadcast(lPolygon[1])
    bc_NegTestPoly = sc.broadcast(lPolygon[2])
    sqlContext.registerFunction("posTrain", lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTrainPoly), returnType=BooleanType())
    sqlContext.registerFunction("negTrain", lambda lat, lon: fspLib.inROI(lat, lon, bc_AllPoly), returnType=BooleanType())
    sqlContext.registerFunction("posTest", lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTestPoly), returnType=BooleanType())
    sqlContext.registerFunction("negTest", lambda lat, lon: fspLib.inROI(lat, lon, bc_NegTestPoly), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * FROM records WHERE posTrain(records.lat, records.lon)").cache()
    # Negative training points are those outside every region of interest.
    dfn1 = sqlContext.sql("SELECT * FROM records WHERE NOT negTrain(records.lat, records.lon)").cache()
    dap = sqlContext.sql("SELECT * FROM records WHERE posTest(records.lat, records.lon)").cache()
    dan = sqlContext.sql("SELECT * FROM records WHERE negTest(records.lat, records.lon)").cache()
    nInTrain = df1.count()
    nOutTrain = dfn1.count()
    nInApply = dap.count()
    nOutApply = dan.count()
    diff = time.time() - t1
    print "GEQE: Time to partition data by region", diff
    print "GEQE: Positive training points:", nInTrain, ". Negative training points:", nOutTrain
    print "GEQE: Positive test points:", nInApply, ". Negative test points:", nOutApply

    # Map data for training
    t1 = time.time()
    trainIn = df1.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    trainOut = dfn1.map(lambda x: (x.key, [LabeledPoint(0.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    # Downsample the (much larger) negative set to roughly 10x the positive count.
    scaleFactor = (10. * nInTrain) / float(nOutTrain)
    mlTrain = trainIn.union(trainOut.sample(False, scaleFactor))
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    applyPos = dap.map(lambda x: LabeledPoint(1.0, x.vector)).cache()
    applyNeg = dan.map(lambda x: LabeledPoint(0.0, x.vector)).cache()
    diff = time.time() - t1
    print "GEQE: Time to prepare training data", diff

    # Feature selection: if num_features > 0, use a chi-squared test to keep
    # only the most relevant features.
    trainingData, applyData = None, None
    if num_features < 1:
        trainingData = mlTrain.map(lambda x: x[1][0])
        applyData = applyPos.union(applyNeg)
    else:
        print 'Selecting top', num_features, 'features...'
        featureSelectionModel = ChiSqSelector(num_features).fit(mlTrain.map(lambda x: x[1][0]))
        print 'Features selected. Transforming training data'
        posTrain = mlTrain.filter(lambda x: x[1][0].label == 1.0).map(lambda x: x[1][0].features)
        posTrain = featureSelectionModel.transform(posTrain).map(lambda x: LabeledPoint(1.0, x))
        negTrain = mlTrain.filter(lambda x: x[1][0].label == 0.0).map(lambda x: x[1][0].features)
        negTrain = featureSelectionModel.transform(negTrain).map(lambda x: LabeledPoint(0.0, x))
        trainingData = posTrain.union(negTrain)
        # Transform the apply data with the same selector.
        print 'Transforming apply data'
        applyPos = featureSelectionModel.transform(applyPos.map(lambda x: x.features)).map(lambda x: LabeledPoint(1.0, x))
        applyNeg = featureSelectionModel.transform(applyNeg.map(lambda x: x.features)).map(lambda x: LabeledPoint(0.0, x))
        applyData = applyPos.union(applyNeg)

    # Train model
    t1 = time.time()
    trainingFunction = getTrainModelFunc(modelName)
    model = trainingFunction(trainingData)
    diff = time.time() - t1
    print "GEQE: Time to train model", diff

    # Apply model
    t1 = time.time()
    predictions_Tree = model.predict(applyData.map(lambda x: x.features))
    tAndP = applyData.map(lambda x: x.label).zip(predictions_Tree)
    diff = time.time() - t1
    results = tAndP.collect()
    print "GEQE: Time to apply model", diff
    return (results, nInApply, nOutApply)
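# ---------------------------------------------------------------------
# Hedged usage sketch (not part of the original source): shows the data
# contract locationTest() assumes. lPolygon must be a list of three
# polygon lists -- [posTrain, posTest, negTest] -- and a DataFrame with
# columns (key, lat, lon, vector, size, binSize) must already be
# registered as the temp table 'records'. File paths are illustrative
# placeholders.
#
#   lPolygon = shapeReader.readInShapeJson('inputFiles/threeRegions.json')
#   records = sqlContext.parquetFile('/data/geqe/scoreVectors.parquet')
#   records.registerTempTable('records')
#   (results, nPos, nNeg) = locationTest(sc, sqlContext, lPolygon, [],
#                                        modelName='random forest',
#                                        num_features=200)
#   # results is a list of (trueLabel, prediction) pairs
#   nHit = sum(1 for (t, p) in results if round(p) == t)
#   print "accuracy:", nHit / float(len(results))
# ---------------------------------------------------------------------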
############# ############# ############# ############# #############
# filterData
#   by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
# (Needed by the preview-file writer below.)
import os
import codecs
import traceback
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD-CODED INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/": 2}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'", default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 8", default=8)
    parser.add_argument("--stopWordsFile", help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt", default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma separated list of stop words to include on this run", default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
    jobNm = args.jobNm
    # NOTE: conf, inputFile, nDataType, and inputPartitions are used below but
    # their setup is elided in this snippet; conf is presumably the usual
    # SparkConf().setAppName(...) object, and the input parameters come from
    # the DATA_SETS entries above.
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    # Read in data; coalesce to limit the number of jobs and avoid shuffling
    # issues later in the job
    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc, sqlContext, inputFile, nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons), returnType=BooleanType())

    # Keep only records inside both the region and the time window of interest
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")

    # Write the filtered rows to a local tab-separated preview file
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'):
        os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/' + jobNm, encoding="utf-8", mode="wb")
    for row in rows:
        try:
            buffer = [row.lat, row.lon, row.user, row.dt.date(), row.text, row.dt]
            buffer = map(lambda x: unicode(x).replace(u'\t', u' ').replace(u'\n', u' '), buffer)
            fOut.write(u'\t'.join(buffer) + u'\n')
        except:
            traceback.print_exc()
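# ---------------------------------------------------------------------
# Hedged sketch (not from the original source): fspLib.inROI lives in
# ./lib/ and is not shown here. A point-in-polygon test of the kind it
# presumably performs is the classic even-odd ray-casting rule below;
# the real implementation may differ (e.g. bounding-box prefilters, or
# unwrapping the broadcast variable's .value).
def _pointInPolygonSketch(lat, lon, polygon):
    """polygon: list of (lat, lon) vertex pairs. Returns True if the point
    lies inside, by counting edge crossings of a ray cast in the
    +longitude direction (PNPOLY)."""
    inside = False
    j = len(polygon) - 1
    for i in range(len(polygon)):
        latI, lonI = polygon[i]
        latJ, lonJ = polygon[j]
        # Does edge (j, i) straddle the test latitude, and does the ray
        # from (lat, lon) cross it?
        if (latI > lat) != (latJ > lat):
            lonCross = (lonJ - lonI) * (lat - latI) / (latJ - latI) + lonI
            if lon < lonCross:
                inside = not inside
        j = i
    return inside

# Example: a unit square around the origin.
# _pointInPolygonSketch(0.5, 0.5, [(0, 0), (0, 1), (1, 1), (1, 0)])  # -> True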
# (Imports assumed from the surrounding module, shown here for completeness.)
import time
from pyspark.sql.types import BooleanType
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
import fspLib
import clustering
import aggregatedComparison


def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    # Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0
    t0 = time.time()

    # Read in data and filter out entries with no valid words
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Find the word document frequency for the corpus.
    # This is used for an idf score used in feature vector formation.
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples. The training data has two
    # parts; the application data is the third:
    #   i.)   in the region and in the time window
    #   ii.)  in the region but outside the time window
    #   iii.) out of region -- the data the model is applied to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat, lon, date: fspLib.inEOI(lat, lon, date, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat, lon, dt: fspLib.outEOI(lat, lon, dt, bc_lTargetPolygons), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2 - t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in-region data
    t1 = time.time()
    groupedIn = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    # Downsample the background so training sees ~10x as many negative points
    # as positive ones; the remainder becomes the apply set.
    scaleFactor = 10. * nSignal / nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
        mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=2000,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply model to out-of-region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2 - t1
    print "Time to aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:", nGoodTweets
    print "< total process time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
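# ---------------------------------------------------------------------
# Hedged driver sketch (not part of the original source): how this
# event-similarity run() might be invoked. Paths, job name, and stop
# words are illustrative placeholders.
#
#   conf = SparkConf().setAppName('findSimilarEvent')
#   sc = SparkContext(conf=conf)
#   sqlContext = SQLContext(sc)
#   lPolygon = shapeReader.readInShapeJson('inputFiles/myEvent.json')
#   resultSet = run('myEventJob', sc, sqlContext,
#                   'hdfs:///data/geqe/scoreVectors/', lPolygon,
#                   'inputFiles/dictFile.txt',
#                   sNum=30, modelSavePath=None, strStop='amp,rt')
#
# Note (assumption, not in the original): randomSplit() requires
# nonnegative weights, so the code above implicitly assumes the
# background set is at least 10x the signal set; otherwise
# 1 - scaleFactor goes negative.
# ---------------------------------------------------------------------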
############# ############# ############# ############# #############
# filterData
#   by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD-CODED INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'", default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 8", default=8)
# (Imports assumed from the surrounding module, shown here for completeness.)
import time
from pyspark.sql.types import BooleanType
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
import fspLib
import clustering
import aggregatedComparison


def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    # Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0
    t0 = time.time()

    # Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Find the word document frequency for the corpus.
    # This is used for an idf score used in feature vector formation.
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples. The training data has two
    # parts: points inside the r.o.i. and a sample of the area outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)").cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in-region data and a sample of out-of-region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    # Downsample the background so training sees ~10x as many negative points
    # as positive ones; the remainder becomes the apply set.
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply, groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:", nGoodTweets
    print "< total process time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
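# ---------------------------------------------------------------------
# Hedged sketch (not part of the original source): reloading a model
# persisted by model_Tree.save() above. RandomForestModel.load is the
# standard MLlib counterpart to save(); the path and input RDD are
# illustrative placeholders.
#
#   from pyspark.mllib.tree import RandomForestModel
#   model = RandomForestModel.load(sc, modelSavePath + jobNm)
#   scores = model.predict(newPoints.map(lambda x: x.features))
# ---------------------------------------------------------------------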