Example no. 1
############# ############# ############# ############# #############
Example no. 2
def locationTest(sc,
                 sqlContext,
                 lPolygon,
                 lStop,
                 modelName='random forest',
                 num_features=-1):
    #Partition data into 4 parts: train (positive examples), train (negative examples), test (pos), test (neg)
    t1 = time.time()
    lAllPoly = list(lPolygon[0])  # copy; extending in place would also mutate lPolygon[0] before it is broadcast below
    lAllPoly.extend(lPolygon[1])
    lAllPoly.extend(lPolygon[2])
    bc_AllPoly = sc.broadcast(lAllPoly)
    bc_PosTrainPoly = sc.broadcast(lPolygon[0])
    bc_PosTestPoly = sc.broadcast(lPolygon[1])
    bc_NegTestPoly = sc.broadcast(lPolygon[2])
    sqlContext.registerFunction(
        "posTrain",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTrainPoly),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "negTrain",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_AllPoly),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "posTest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTestPoly),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "negTest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_NegTestPoly),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * FROM records WHERE posTrain(records.lat, records.lon)"
    ).cache()
    dfn1 = sqlContext.sql(
        "SELECT * FROM records WHERE NOT negTrain(records.lat, records.lon)"
    ).cache()
    dap = sqlContext.sql(
        "SELECT * FROM records WHERE posTest(records.lat, records.lon)").cache(
        )
    dan = sqlContext.sql(
        "SELECT * FROM records WHERE negTest(records.lat, records.lon)").cache(
        )
    nInTrain = df1.count()
    nOutTrain = dfn1.count()
    nInApply = dap.count()
    nOutApply = dan.count()
    diff = time.time() - t1
    print "GEQE: Time to partition data by region", diff
    print "GEQE: Positive training points:", nInTrain, ".  Negative training points:", nOutTrain
    print "GEQE: Positive test points:", nInApply, ".  Negative test points:", nOutApply

    #Map data for training
    t1 = time.time()
    trainIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    trainOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(0.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    scaleFactor = (10. * nInTrain) / float(nOutTrain)
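    # downsample the negatives (without replacement) to roughly 10x the positive count before unioning with the positives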
    mlTrain = trainIn.union(trainOut.sample(False, scaleFactor))
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    applyPos = dap.map(lambda x: LabeledPoint(1.0, x.vector)).cache()
    applyNeg = dan.map(lambda x: LabeledPoint(0.0, x.vector)).cache()
    diff = time.time() - t1
    print "GEQE: Time to prepare training data", diff

    # feature selection  if num_features > 0
    # use chi sq test to find most relevant features
    trainingData, applyData = None, None
    if num_features < 1:
        trainingData = mlTrain.map(lambda x: x[1][0])
        applyData = applyPos.union(applyNeg)
    else:
        # use chi sq feature selection
        print 'Selecting top ', num_features, ' features...'
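        # ChiSqSelector keeps the num_features features with the highest chi-squared statistic against the labels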
        featureSelectionModel = ChiSqSelector(num_features).fit(
            mlTrain.map(lambda x: x[1][0]))
        print 'Features selected.  Transforming training data'
        posTrain = mlTrain.filter(lambda x: x[1][0].label == 1.0).map(
            lambda x: x[1][0].features)
        posTrain = featureSelectionModel.transform(posTrain).map(
            lambda x: LabeledPoint(1.0, x))
        negTrain = mlTrain.filter(lambda x: x[1][0].label == 0.0).map(
            lambda x: x[1][0].features)
        negTrain = featureSelectionModel.transform(negTrain).map(
            lambda x: LabeledPoint(0.0, x))
        trainingData = posTrain.union(negTrain)

        # transform apply data
        print 'Transforming apply data'
        applyPos = featureSelectionModel.transform(
            applyPos.map(lambda x: x.features)).map(
                lambda x: LabeledPoint(1.0, x))
        applyNeg = featureSelectionModel.transform(
            applyNeg.map(lambda x: x.features)).map(
                lambda x: LabeledPoint(0.0, x))
        applyData = applyPos.union(applyNeg)

    #train model
    t1 = time.time()
    trainingFunction = getTrainModelFunc(modelName)
    model = trainingFunction(trainingData)
    diff = time.time() - t1
    print "GEQE: Time to train model", diff

    #apply model
    t1 = time.time()
    predictions_Tree = model.predict(applyData.map(lambda x: x.features))
    tAndP = applyData.map(lambda x: x.label).zip(predictions_Tree)
    diff = time.time() - t1
    results = tAndP.collect()
    print "GEQE: Time to apply model", diff
    return (results, nInApply, nOutApply)
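
The tuple returned above holds the collected (true label, prediction) pairs plus the two test counts. A minimal, hypothetical scoring helper (not part of the original module), assuming a 0.5 decision threshold on the model scores, could summarize those pairs like this:

def scoreLocationTest(results, threshold=0.5):
    # results: list of (label, prediction) pairs as returned by locationTest
    tp = sum(1 for label, pred in results if label == 1.0 and pred >= threshold)
    fp = sum(1 for label, pred in results if label == 0.0 and pred >= threshold)
    fn = sum(1 for label, pred in results if label == 1.0 and pred < threshold)
    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    return precision, recall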
Example no. 3
############# ############# ############# ############# #############
Example no. 4
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/":2}

LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath",help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'",default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 3",default = 8)
    parser.add_argument("--stopWordsFile",help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt",default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma seperated list of stop words to add include on this run",default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
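    # A hypothetical invocation of this filter script (the script name and paths are placeholders;
    # the arguments follow the argparse definitions above):
    #   spark-submit filterData.py inputFiles/myRegion.json hdfs:///geqe/filtered -jobNm myFilterJob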
Example no. 5
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)


    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc,sqlContext,inputFile,nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,dt: fspLib.inEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")


    #Split data into 2 RDDs depending on being in or out of the region of interest
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'): os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/'+jobNm, encoding="utf-8",mode="wb")
    for row in rows:
        try:
            buffer =  [row.lat,row.lon,row.user,row.dt.date(),row.text,row.dt]
            buffer = map(lambda x: unicode(x).replace(u'\t',u' ').replace(u'\n',u' '),buffer)
            fOut.write(u'\t'.join(buffer)+u'\n')
        except:
            traceback.print_exc()
Example no. 6
def run(jobNm,sc,sqlContext,inputFile,lPolygon,dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop !='' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.]*7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2-t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)


    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data has 2 parts; the application data is prepared as well
    # i.)  In both the region, and in the time window
    # ii.) In the region, but outside the time window
    # iii.) Out of region, data to apply model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,date: fspLib.inEOI(lat,lon,date,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat,lon,dt: fspLib.outEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 =  sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2-t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn  = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10.*nSignal/nBack
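    # randomSplit sends a ~(10x signal)-sized fraction of the background to training (groupedUse) and holds out the rest for scoring (mlApply)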
    (mlApply, groupedUse) = groupedOut.randomSplit([1-scaleFactor,scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2-t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]), categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath+"/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2-t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2-t1
    print "Time aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2-t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
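
A minimal usage sketch for the run() job above, assuming a Spark 1.x driver; the application name, shape file, input, and dictionary paths are placeholders rather than values from the original project:

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import shapeReader

if __name__ == "__main__":
    conf = SparkConf().setAppName("geqeEventModel")       # placeholder app name
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # read the region/event polygons the same way the filter scripts do
    lPolygon = shapeReader.readInShapeJson("inputFiles/myEvent.json")   # placeholder path
    resultSet = run("myEventJob", sc, sqlContext,
                    "hdfs:///geqe/filteredData",          # placeholder input path
                    lPolygon,
                    "inputFiles/dictionary.tsv",          # placeholder dictionary path
                    nDataType=0,
                    sNum=30,
                    writeFileOutput=True)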
Example no. 7
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe Data Filter'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum",
        type=int,
        help=
Example no. 8
def locationTest(sc, sqlContext, lPolygon, lStop,modelName='random forest',num_features=-1):
    #Partition data into 4 parts: train (positive examples), train (negative examples), test (pos), test (neg)
    t1 = time.time()
    lAllPoly = list(lPolygon[0])  # copy; extending in place would also mutate lPolygon[0] before it is broadcast below
    lAllPoly.extend(lPolygon[1])
    lAllPoly.extend(lPolygon[2])
    bc_AllPoly = sc.broadcast(lAllPoly)
    bc_PosTrainPoly = sc.broadcast(lPolygon[0])
    bc_PosTestPoly  = sc.broadcast(lPolygon[1])
    bc_NegTestPoly  = sc.broadcast(lPolygon[2])
    sqlContext.registerFunction("posTrain", lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTrainPoly), returnType=BooleanType())
    sqlContext.registerFunction("negTrain", lambda lat, lon: fspLib.inROI(lat, lon, bc_AllPoly), returnType=BooleanType())
    sqlContext.registerFunction("posTest",  lambda lat, lon: fspLib.inROI(lat, lon, bc_PosTestPoly), returnType=BooleanType())
    sqlContext.registerFunction("negTest",  lambda lat, lon: fspLib.inROI(lat, lon, bc_NegTestPoly), returnType=BooleanType())
    df1  = sqlContext.sql("SELECT * FROM records WHERE posTrain(records.lat, records.lon)").cache()
    dfn1 = sqlContext.sql("SELECT * FROM records WHERE NOT negTrain(records.lat, records.lon)").cache()
    dap  = sqlContext.sql("SELECT * FROM records WHERE posTest(records.lat, records.lon)").cache()
    dan  = sqlContext.sql("SELECT * FROM records WHERE negTest(records.lat, records.lon)").cache()
    nInTrain = df1.count()
    nOutTrain = dfn1.count()
    nInApply = dap.count()
    nOutApply = dan.count()
    diff = time.time() - t1
    print "GEQE: Time to partition data by region", diff
    print "GEQE: Positive training points:", nInTrain, ".  Negative training points:", nOutTrain
    print "GEQE: Positive test points:", nInApply, ".  Negative test points:", nOutApply

    #Map data for training
    t1 = time.time()
    trainIn  = df1.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    trainOut = dfn1.map(lambda x: (x.key, [LabeledPoint(0.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    scaleFactor = (10.*nInTrain)/float(nOutTrain)
    mlTrain = trainIn.union(trainOut.sample(False, scaleFactor))
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    applyPos = dap.map(lambda x: LabeledPoint(1.0, x.vector)).cache()
    applyNeg = dan.map(lambda x: LabeledPoint(0.0, x.vector)).cache()
    diff = time.time() - t1
    print "GEQE: Time to prepare training data", diff

    # feature selection  if num_features > 0
    # use chi sq test to find most relevant features
    trainingData,applyData = None, None
    if num_features < 1:
        trainingData = mlTrain.map(lambda x: x[1][0])
        applyData = applyPos.union(applyNeg)
    else:
        # use chi sq feature selection
        print 'Selecting top ',num_features,' features...'
        featureSelectionModel = ChiSqSelector(num_features).fit(mlTrain.map(lambda x: x[1][0]))
        print 'Features selected.  Transforming training data'
        posTrain = mlTrain.filter(lambda x: x[1][0].label == 1.0).map(lambda x: x[1][0].features)
        posTrain = featureSelectionModel.transform( posTrain ).map( lambda x: LabeledPoint(1.0,x) )
        negTrain = mlTrain.filter(lambda x: x[1][0].label == 0.0).map(lambda x: x[1][0].features)
        negTrain = featureSelectionModel.transform( negTrain ).map( lambda x: LabeledPoint(0.0,x) )
        trainingData = posTrain.union(negTrain)

        # transform apply data
        print 'Transforming apply data'
        applyPos = featureSelectionModel.transform( applyPos.map(lambda x: x.features) ).map(lambda x: LabeledPoint(1.0,x))
        applyNeg = featureSelectionModel.transform( applyNeg.map(lambda x: x.features) ).map(lambda x: LabeledPoint(0.0,x))
        applyData = applyPos.union(applyNeg)


    #train model
    t1 = time.time()
    trainingFunction = getTrainModelFunc(modelName)
    model = trainingFunction(trainingData)
    diff = time.time() - t1
    print "GEQE: Time to train model", diff

    #apply model
    t1 = time.time()
    predictions_Tree = model.predict(applyData.map(lambda x: x.features))
    tAndP = applyData.map(lambda x: x.label).zip(predictions_Tree)
    diff = time.time() - t1
    results = tAndP.collect()
    print "GEQE: Time to apply model", diff
    return (results, nInApply, nOutApply)
Example no. 9
    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(
        inputFile) if 0 == nDataType else csvToDataFrame(
            sc, sqlContext, inputFile, nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inEventOfInterest",
        lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons),
        returnType=BooleanType())
    data = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)"
    )

    #Split data into 2 RDDs depending on being in or out of the region of interest
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'):
        os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/' + jobNm,
                       encoding="utf-8",
Example no. 10
def run(jobNm,
        sc,
        sqlContext,
        inputFile,
        lPolygon,
        dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = [
        "Initial Read", "Calculate IDF", "Partition for M.L.",
        "Create Training Vector", "Train Model", "Apply Model",
        "Prepare Output Data"
    ]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data has 2 parts: inside the r.o.i., and a sample of the areas outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data, and sample of out region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply,
     groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet