def main():
    conf = SparkConf().setAppName("jag - preprocess twitter")
    sc = SparkContext(conf=conf)

    # Load the stop word list and broadcast the date window used to filter tweets
    bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt', '')
    dt_low = datetime.date(2015, 12, 1)
    bc_low = sc.broadcast(dt_low)
    dt_high = datetime.date(2016, 1, 1)
    bc_high = sc.broadcast(dt_high)

    # Register SQL UDFs for filtering by scorable words and by date window
    sqlContext = SQLContext(sc)
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, True, bc_lStopWords),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inDateWindow",
        lambda dt: in_time_window(dt, bc_low, bc_high),
        returnType=BooleanType())

    # Read the raw tweets into a DataFrame
    raw = to_parquet.csvToDataFrame(sc, sqlContext, "...", 66).cache()
    n_raw = raw.count()
    big_print("Read in " + str(n_raw) + " tweets")

    # Keep only tweets that contain a scorable word and fall inside the date window
    raw.registerTempTable("raw")
    sqlCommand = "SELECT * from raw WHERE hasScorableWord(text) AND inDateWindow(dt)"
    df = sqlContext.sql(sqlCommand).cache()
    n_df = df.count()
    big_print("Binning " + str(n_df) + " entries with scorable words")

    # Group records by their aggregatedComparison.groupString key (bin size 0.01)
    # and keep only groups with at least 4 distinct users
    binned = df.map(lambda x: (aggregatedComparison.groupString(x, True, 0.01), x))\
        .groupByKey()\
        .filter(lambda x: aggregatedComparison.hasMinUsers(x[1], 4))\
        .map(lambda x: create_bin(x))\
        .cache()
    n_binned = binned.count()
    big_print("Writing " + str(n_binned) + " to ES")

    # Write the binned records to Elasticsearch via the ES-Hadoop output format
    es_write_conf = {
        "es.nodes": "localhost",  # or ES url
        "es.port": "9200",
        "es.resource": "g_trainer/points"
    }
    binned.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
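# Hypothetical sketch (an assumption, not taken from this repo) of the
# in_time_window helper that the "inDateWindow" UDF above depends on: parse the
# record's date string and test it against the broadcast [low, high) window.
# The "YYYY-MM-DD..." prefix format of the dt field is also an assumption.
def in_time_window(dt, bc_low, bc_high):
    try:
        d = datetime.datetime.strptime(dt[:10], "%Y-%m-%d").date()
    except (TypeError, ValueError):
        # Records with a missing or malformed date are excluded
        return False
    return bc_low.value <= d < bc_high.value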
jobNm = args.jobNm
outPart = args.outPart
nDataType = args.datTyp
sCustStop = args.sCustStop
fBinSize = args.binSize
nMinClusterUnique = args.nMinClusterUnique
bUseStopFilter = args.bUseStopFilter
stopWordsPath = args.stopWordsFile
bUseDate = args.bUseDate in ("True", "true")

conf = SparkConf().setAppName(jobNm)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Read in the stop word list early to get notified of issues early in the process
bc_lStopWords = fspLib.load_stopwords(sc, stopWordsPath, sCustStop)

t0 = time.time()

# Read in data and filter out entries with no valid words
t1 = time.time()
records = aggregatedComparison.initialFilter(sc, sqlContext, inputFile, nDataType,
                                             inputPartitions, bUseStopFilter,
                                             bc_lStopWords).cache()
nGoodTweets = records.count()
t2 = time.time()
print "Number of good tweets:", nGoodTweets
diff = t2 - t1
print "Time to read in and filter nonscorable words:", diff

# Find the word document frequency for the corpus
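# A minimal, hypothetical sketch of the document-frequency step the comment above
# introduces (the actual implementation lives in the repo's library code and may
# differ; the "text" field name is an assumption): for each word, count the
# fraction of records in which it appears at least once.
nDocs = float(nGoodTweets)
docFreq = records.flatMap(lambda rec: set(rec.text.lower().split())) \
                 .map(lambda word: (word, 1)) \
                 .reduceByKey(lambda a, b: a + b) \
                 .mapValues(lambda count: count / nDocs)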
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"/data/ingest/twitter/success/": 2}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe data filter.'",
                        default='Geqe data filter.')
    parser.add_argument("-cNum", type=int,
                        help="Number of processes to coalesce initial input data to, default = 8",
                        default=8)
    parser.add_argument("--stopWordsFile",
                        help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt",
                        default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop",
                        help="Comma separated list of stop words to include on this run",
                        default='')
    args = parser.parse_args()

    shapeFile = args.shapeFile
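    # Hypothetical example invocation of this filter script (the file name, master,
    # and paths below are assumptions, not taken from the source):
    #   spark-submit --master yarn filterData.py /path/to/region.shp \
    #       hdfs:///user/me/filteredTweets -jobNm "geqe filter" -cNum 8 -sCustStop "rt,amp"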
    )
    args = parser.parse_args()

    inputPartitions = args.partitions
    jobNm = args.jobNm
    bUseStopFilter = args.bUseStopFilter
    stopWordsPath = args.stopWordsFile
    sCustStop = args.sCustStop
    datasets = args.datasets.split(",")

    conf = SparkConf().setAppName(jobNm)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Read in the stop word list early to get notified of issues early in the process
    bc_lStopWords = fspLib.load_stopwords(sc, stopWordsPath, sCustStop)

    t0 = time.time()

    # Union all of the input parquet data sets into a single DataFrame
    records = None
    for file in datasets:
        print "reading file: ", file
        if records is None:
            records = sqlContext.parquetFile(file)
        else:
            newRec = sqlContext.parquetFile(file)
            records = records.unionAll(newRec)

    if inputPartitions > 0:
        records = records.repartition(inputPartitions)
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe data filter.'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum", type=int,
        help=