def initialFilter(sc, sqlContext, inputFile, nDataType, inputPartitions,
                  bUseStopFilter, bc_lStopWords):
    # Read parquet directly for data type 0, otherwise convert the CSV input to a DataFrame
    records = (sqlContext.read.parquet(inputFile) if 0 == nDataType
               else csvToDataFrame(sc, sqlContext, inputFile, nDataType))
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    # Expose the stop-word filter as a SQL UDF and keep only rows that contain a scorable word
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, bUseStopFilter, bc_lStopWords),
        returnType=BooleanType())
    records = sqlContext.sql(
        "SELECT * from records WHERE hasScorableWord(records.text)")
    return records
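# Illustrative usage sketch (not from the source): how initialFilter might be
# invoked for a parquet input (nDataType == 0) with default partitioning. The
# input path and app name below are placeholders; load_stopwords is the same
# helper used in main() further down.
#
#   conf = SparkConf().setAppName("initialFilter example")
#   sc = SparkContext(conf=conf)
#   sqlContext = SQLContext(sc)
#   bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt', '')
#   records = initialFilter(sc, sqlContext, 'hdfs:///path/to/tweets.parquet',
#                           0, -1, True, bc_lStopWords)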
def main():
    conf = SparkConf().setAppName("jag - preprocess twitter")
    sc = SparkContext(conf=conf)
    bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt', '')

    # Broadcast the date window used to filter tweets
    dt_low = datetime.date(2015, 12, 1)
    bc_low = sc.broadcast(dt_low)
    dt_high = datetime.date(2016, 1, 1)
    bc_high = sc.broadcast(dt_high)

    sqlContext = SQLContext(sc)
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, True, bc_lStopWords),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inDateWindow",
        lambda dt: in_time_window(dt, bc_low, bc_high),
        returnType=BooleanType())

    raw = to_parquet.csvToDataFrame(sc, sqlContext, "...", 66).cache()
    n_raw = raw.count()
    big_print("Read in " + str(n_raw) + " tweets")

    raw.registerTempTable("raw")
    sqlCommand = "SELECT * from raw WHERE hasScorableWord(text) AND inDateWindow(dt)"
    df = sqlContext.sql(sqlCommand).cache()
    n_df = df.count()
    big_print("Binning " + str(n_df) + " entries with scorable words")

    # Group tweets into geographic bins and keep bins with at least 4 unique users
    binned = df.map(lambda x: (aggregatedComparison.groupString(x, True, 0.01), x))\
        .groupByKey()\
        .filter(lambda x: aggregatedComparison.hasMinUsers(x[1], 4))\
        .map(lambda x: create_bin(x))\
        .cache()
    n_binned = binned.count()
    big_print("Writing " + str(n_binned) + " to ES")

    es_write_conf = {
        "es.nodes": "localhost",  # or ES url
        "es.port": "9200",
        "es.resource": "g_trainer/points"
    }

    binned.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
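# Side note (assumption, not stated in the source): elasticsearch-hadoop's
# EsOutputFormat consumes a pair RDD whose values are plain dicts, so
# create_bin is expected to return tuples shaped roughly like the illustrative
# record below. The field names here are hypothetical.
#
#   example = sc.parallelize([('ignored-key', {'lat': 38.89, 'lon': -77.03, 'nUser': 5})])
#   example.saveAsNewAPIHadoopFile(
#       path='-',
#       outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
#       keyClass="org.apache.hadoop.io.NullWritable",
#       valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
#       conf=es_write_conf)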
# Declare Spark Context
conf = SparkConf().setAppName(jobNm)
conf.set('spark.driver.maxResultSize', '0')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Create polygon list and broadcast variable based on it
lPolygon = shapeReader.readInShapeJson(shapeFile)
bc_lTargetPolygons = sc.broadcast(lPolygon)

# Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job
records = (sqlContext.parquetFile(inputFile) if 0 == nDataType
           else csvToDataFrame(sc, sqlContext, inputFile, nDataType))
if inputPartitions != -1:
    records = records.repartition(inputPartitions)
records.cache()
records.registerTempTable('records')
sqlContext.registerFunction(
    "inRegionOfInterest",
    lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
    returnType=BooleanType())
sqlContext.registerFunction(
    "inEventOfInterest",
    lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons),
    returnType=BooleanType())
data = sqlContext.sql(
    "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")

# Split data into two RDDs depending on being in or out of the region of interest
rows = data.collect()
if not os.path.isdir('previewTrainingFiles'):
    os.mkdir('previewTrainingFiles')
fOut = codecs.open('previewTrainingFiles/' + jobNm, encoding="utf-8", mode="wb")
for row in rows:
    try:
############# ############# ############# ############# #############
# filterData
#   by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/": 2}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'", default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = 3", default=8)
    parser.add_argument("--stopWordsFile", help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt", default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma separated list of stop words to include on this run", default='')
    args = parser.parse_args()

    shapeFile = args.shapeFile
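# Example invocation (illustrative only; the script name and file paths are
# assumptions based on the "filterData" header above):
#
#   spark-submit filterData.py shapeFiles/region.json output/filtered \
#       -jobNm "my filter run" -cNum 8 -sCustStop "rt,amp"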
############# ############# ############# ############# #############
# filterData
#   by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = 'Geqe Data Filter'", default='Geqe data filter.')
    parser.add_argument(
        "-cNum", type=int,
        help="Number of processes to coalesce initial input data to, default = 3",
        default=8)
nDataType = args.datTyp

# Declare Spark Context
conf = SparkConf().setAppName(jobNm)
conf.set('spark.driver.maxResultSize', '0')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Create polygon list and broadcast variable based on it
lPolygon = shapeReader.readInShapeJson(shapeFile)
bc_lTargetPolygons = sc.broadcast(lPolygon)

# Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job
records = (sqlContext.parquetFile(inputFile) if 0 == nDataType
           else csvToDataFrame(sc, sqlContext, inputFile, nDataType))
if inputPartitions != -1:
    records = records.repartition(inputPartitions)
records.cache()
records.registerTempTable('records')
sqlContext.registerFunction(
    "inRegionOfInterest",
    lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
    returnType=BooleanType())
sqlContext.registerFunction(
    "inEventOfInterest",
    lambda lat, lon, dt: fspLib.inEOI(lat, lon, dt, bc_lTargetPolygons),
    returnType=BooleanType())
data = sqlContext.sql(
    "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")