Example #1
0
def initialFilter(sc, sqlContext, inputFile, nDataType, inputPartitions, bUseStopFilter, bc_lStopWords):
    """Load the input records and keep only rows whose text has a scorable word.

    Args:
        sc: Spark context; only forwarded to ``csvToDataFrame``.
        sqlContext: SQL context used to read, register and query the data.
        inputFile: Path of the input data.
        nDataType: ``0`` reads ``inputFile`` as parquet; any other value is
            forwarded to ``csvToDataFrame`` together with the path.
        inputPartitions: Number of partitions to repartition to, or ``-1``
            to keep the partitioning produced by the reader.
        bUseStopFilter: Forwarded unchanged to ``fspLib.hasScorableWord``.
        bc_lStopWords: Forwarded unchanged to ``fspLib.hasScorableWord``
            (presumably a broadcast stop-word list -- confirm with caller).

    Returns:
        The result of selecting every row of the ``records`` temp table for
        which the ``hasScorableWord`` UDF holds on the ``text`` column.

    Side effects:
        Registers the temp table ``records`` and the UDF ``hasScorableWord``
        on ``sqlContext``, and marks the loaded data as cached.
    """
    if nDataType == 0:
        loaded = sqlContext.read.parquet(inputFile)
    else:
        loaded = csvToDataFrame(sc, sqlContext, inputFile, nDataType)

    if inputPartitions != -1:
        loaded = loaded.repartition(inputPartitions)

    loaded.cache()
    loaded.registerTempTable('records')

    def _has_scorable_word(text):
        # Closes over the filter flag and stop words so SQL only passes the text.
        return fspLib.hasScorableWord(text, bUseStopFilter, bc_lStopWords)

    sqlContext.registerFunction("hasScorableWord", _has_scorable_word, returnType=BooleanType())
    return sqlContext.sql("SELECT * from records WHERE hasScorableWord(records.text) ")
def main():
    """Filter raw tweets, bin them, and write the bins to Elasticsearch.

    Steps, in order:
      1. Create the Spark context, load the stop words and broadcast the two
         dates bounding the time window (2015-12-01 and 2016-01-01).
      2. Register the SQL UDFs ``hasScorableWord`` and ``inDateWindow``.
      3. Read the raw data with ``to_parquet.csvToDataFrame`` and keep the
         rows for which both UDFs hold.
      4. Key every row with ``aggregatedComparison.groupString``, group by
         that key, keep the groups accepted by
         ``aggregatedComparison.hasMinUsers(..., 4)`` and convert each group
         with ``create_bin``.
      5. Save the result to the ``g_trainer/points`` resource through the
         elasticsearch-hadoop output format.

    Takes no arguments and returns nothing; progress is reported through
    ``big_print``.
    """
    conf = SparkConf().setAppName("jag - preprocess twitter")
    sc = SparkContext(conf=conf)

    bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt',
                                          '')
    dt_low = datetime.date(2015, 12, 1)
    bc_low = sc.broadcast(dt_low)
    dt_high = datetime.date(2016, 1, 1)
    bc_high = sc.broadcast(dt_high)

    sqlContext = SQLContext(sc)
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, True, bc_lStopWords),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inDateWindow",
        lambda dt: in_time_window(dt, bc_low, bc_high),
        returnType=BooleanType())

    # NOTE(review): "..." looks like a placeholder input path -- confirm.
    raw = to_parquet.csvToDataFrame(sc, sqlContext, "...", 66).cache()
    n_raw = raw.count()
    big_print("Read in " + str(n_raw) + " tweets")

    raw.registerTempTable("raw")
    # The UDF name must match the registration above exactly; the previous
    # spelling "inDateWindwo" referred to a function that was never registered.
    sqlCommand = "SELECT * from raw WHERE hasScorableWord(text) AND inDateWindow(dt)"
    df = sqlContext.sql(sqlCommand).cache()
    n_df = df.count()
    big_print("Binning " + str(n_df) + " entries with scorable words")

    # Go through df.rdd explicitly: DataFrame.map only exists in Spark 1.x
    # (where it delegates to the underlying RDD), while .rdd works on 1.3+.
    binned = (
        df.rdd
        .map(lambda x: (aggregatedComparison.groupString(x, True, 0.01), x))
        .groupByKey()
        .filter(lambda x: aggregatedComparison.hasMinUsers(x[1], 4))
        .map(create_bin)
        .cache()
    )
    n_binned = binned.count()

    big_print("Writing " + str(n_binned) + " to ES")
    es_write_conf = {
        "es.nodes": "localhost",  # or ES url
        "es.port": "9200",
        "es.resource": "g_trainer/points"
    }

    # The output format ignores the path, hence the '-' dummy value.
    binned.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
Example #3
0
def initialFilter(sc, sqlContext, inputFile, nDataType, inputPartitions,
                  bUseStopFilter, bc_lStopWords):
    """Read the input data and drop rows without a scorable word.

    The data is read as parquet when ``nDataType`` is 0 and through
    ``csvToDataFrame`` otherwise, optionally repartitioned, cached and
    exposed as the temp table ``records``. A ``hasScorableWord`` UDF that
    wraps ``fspLib.hasScorableWord`` is then registered on ``sqlContext``
    and used to filter on the ``text`` column.

    Args:
        sc: Spark context, handed on to ``csvToDataFrame``.
        sqlContext: SQL context that reads, registers and queries the data.
        inputFile: Location of the input data.
        nDataType: Reader selector; 0 selects the parquet reader.
        inputPartitions: Target partition count; -1 skips repartitioning.
        bUseStopFilter: Second argument given to ``fspLib.hasScorableWord``.
        bc_lStopWords: Third argument given to ``fspLib.hasScorableWord``.

    Returns:
        The rows of ``records`` for which ``hasScorableWord(text)`` holds.
    """
    # Data type 0 is read as parquet; everything else goes to csvToDataFrame.
    if 0 == nDataType:
        frame = sqlContext.read.parquet(inputFile)
    else:
        frame = csvToDataFrame(sc, sqlContext, inputFile, nDataType)

    # -1 is the "leave partitioning alone" sentinel.
    wants_repartition = inputPartitions != -1
    if wants_repartition:
        frame = frame.repartition(inputPartitions)

    frame.cache()
    frame.registerTempTable('records')

    def scorable(text):
        return fspLib.hasScorableWord(text, bUseStopFilter, bc_lStopWords)

    sqlContext.registerFunction("hasScorableWord", scorable,
                                returnType=BooleanType())
    query = "SELECT * from records WHERE hasScorableWord(records.text) "
    filtered = sqlContext.sql(query)
    return filtered
Example #4
0

    #Declare Spark Context
    conf = SparkConf().setAppName(jobNm)
    conf.set('spark.driver.maxResultSize','0')
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)


    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc,sqlContext,inputFile,nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,dt: fspLib.inEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")


    #Split data into 2 DDSs depending on being in or out of region of interest
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'): os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/'+jobNm, encoding="utf-8",mode="wb")
    for row in rows:
        try:
Example #5
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
# Each key is an input path and each value is the data-type code for that path.
DATA_SETS = {"/data/ingest/twitter/success/":2}

# Date bounds, written as plain decimal integers: a leading zero makes an
# octal literal in Python 2 (so 08 and 09 are errors) and is a SyntaxError
# in Python 3.
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath",help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'",default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 3",default = 8)
    parser.add_argument("--stopWordsFile",help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt",default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma seperated list of stop words to add include on this run",default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
Example #6
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
# Each key is an input path and each value is the data-type code for that path.
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
# Date bounds, written as plain decimal integers: a leading zero makes an
# octal literal in Python 2 (so 08 and 09 are errors) and is a SyntaxError
# in Python 3.
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe Data Filter'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum",
        type=int,
        help=
Example #7
0
    nDataType = args.datTyp

    #Declare Spark Context
    conf = SparkConf().setAppName(jobNm)
    conf.set('spark.driver.maxResultSize', '0')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(
        inputFile) if 0 == nDataType else csvToDataFrame(
            sc, sqlContext, inputFile, nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inEventOfInterest",
        lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons),
        returnType=BooleanType())
    data = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)"
    )