Example #1
# Imports this snippet needs (fspLib, shapeReader, and csvToDataFrame are
# project-specific helpers assumed to be importable)
import codecs
import os
import traceback

from pyspark.sql import SQLContext
from pyspark.sql.types import BooleanType

import fspLib
import shapeReader


# The indented body below is a function fragment; this signature is
# reconstructed from the variables the body uses (names are assumptions).
def preview(jobNm, sc, inputFile, shapeFile, nDataType=0, inputPartitions=-1):
    sqlContext = SQLContext(sc)


    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data and repartition to limit the number of tasks and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc,sqlContext,inputFile,nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,dt: fspLib.inEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")


    #Collect the filtered rows and write them to a local tab-separated preview file
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'): os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/'+jobNm, encoding="utf-8",mode="wb")
    for row in rows:
        try:
            fields = [row.lat, row.lon, row.user, row.dt.date(), row.text, row.dt]
            fields = map(lambda x: unicode(x).replace(u'\t', u' ').replace(u'\n', u' '), fields)
            fOut.write(u'\t'.join(fields) + u'\n')
        except Exception:
            traceback.print_exc()
            print "Error writing record: " + str(row)
    fOut.close()
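For reference, here is a minimal, self-contained sketch of the broadcast-plus-UDF pattern used above. A hypothetical bounding-box test stands in for the shapeFile polygons, and all names and values are illustrative; it assumes Spark 1.x and Python 2, matching the code in this example.

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType

sc = SparkContext(appName="udfSketch")
sqlContext = SQLContext(sc)

# Hypothetical stand-in for the polygon list: a simple lat/lon bounding box
bc_box = sc.broadcast({"latMin": 40.0, "latMax": 41.0,
                       "lonMin": -74.5, "lonMax": -73.5})

def inBox(lat, lon, bc):
    # Read the broadcast value on the executor and test containment
    b = bc.value
    return b["latMin"] <= lat <= b["latMax"] and b["lonMin"] <= lon <= b["lonMax"]

df = sqlContext.createDataFrame([Row(lat=40.7, lon=-74.0),
                                 Row(lat=10.0, lon=0.0)])
df.registerTempTable("pts")
sqlContext.registerFunction("inRegionOfInterest",
                            lambda lat, lon: inBox(lat, lon, bc_box),
                            returnType=BooleanType())
# Only the first point falls inside the box, so this prints 1
print sqlContext.sql("SELECT * FROM pts WHERE inRegionOfInterest(pts.lat, pts.lon)").count()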
Example #2
# Imports run() needs (aggregatedComparison, clustering, fspLib, and plotting
# are project modules)
import time

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.sql.types import BooleanType

import aggregatedComparison
import clustering
import fspLib


def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop !='' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.]*7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    # The SQL queries below select from a table named 'records'; register it
    # here in case loadPoint does not already do so.
    records.registerTempTable('records')
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2-t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)


    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
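    # (For reference, the standard formulation is idf(t) = log(N / df(t)),
    # where N is the corpus size and df(t) is the number of documents
    # containing term t.)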
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        # each line is tab-separated, with the term in the first column
        terms = line.rstrip('\n').split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    if hasattr(fDict, 'close'):
        fDict.close()

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples.
    # The training data has two parts, and we also prepare the application data:
    # i.)   in the region and in the time window
    # ii.)  in the region but outside the time window
    # iii.) out of region; data to apply the model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,date: fspLib.inEOI(lat,lon,date,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat,lon,dt: fspLib.outEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2-t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn  = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
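    # Worked example with hypothetical counts: nSignal = 500 and nBack = 100000
    # give scaleFactor = 10.*500/100000 = 0.05, so randomSplit below keeps ~5%
    # of the background (~10x the signal count) for training and leaves the
    # remaining ~95% to be scored by the model.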
    scaleFactor = 10.*nSignal/nBack
    # Guard: randomSplit weights must be non-negative, so cap the factor at 1
    # (only matters when the background is less than 10x the signal).
    if scaleFactor > 1.:
        scaleFactor = 1.
    (mlApply, groupedUse) = groupedOut.randomSplit([1-scaleFactor, scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2-t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]), categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath+"/"
        model_Tree.save(sc, modelSavePath + jobNm)
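        # The saved model can be reloaded in a later job with
        # pyspark.mllib.tree.RandomForestModel.load(sc, modelSavePath + jobNm)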
    t2 = time.time()
    diff = t2-t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()  # force evaluation so the timing below is meaningful
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2-t1
    print "Time aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2-t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
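A hedged usage sketch for run(). All paths, the job name, and the polygon file are hypothetical; shapeReader is the project module used in Example #1 to build the polygon list.

from pyspark import SparkContext
from pyspark.sql import SQLContext

import shapeReader

sc = SparkContext(appName="eventModel")
sqlContext = SQLContext(sc)

# Build the polygon list from a region definition file (hypothetical path)
lPolygon = shapeReader.readInShapeJson("regions.json")

resultSet = run("demoJob", sc, sqlContext,
                "hdfs:///data/tweets.parquet",   # hypothetical input path
                lPolygon,
                "hdfs:///data/dictionary.tsv",   # hypothetical dict path
                sNum=30,
                modelSavePath="hdfs:///models/")  # hypothetical save path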