def main():
    """Filter raw tweets, bin them, and write the bins to Elasticsearch.

    Steps, in order:
      1. Start a Spark context and load the stop-word list as a broadcast.
      2. Register two SQL UDFs: ``hasScorableWord`` (delegates to
         ``fspLib.hasScorableWord``) and ``inDateWindow`` (delegates to
         ``in_time_window`` with the broadcast 2015-12-01 / 2016-01-01 dates).
      3. Read the input with ``to_parquet.csvToDataFrame`` and keep only the
         rows for which both UDFs return true.
      4. Key every row with ``aggregatedComparison.groupString``, group by
         that key, keep groups passing ``aggregatedComparison.hasMinUsers``
         with a threshold of 4, and convert each group with ``create_bin``.
      5. Save the result through the elasticsearch-hadoop output format to
         the ``g_trainer/points`` resource on localhost:9200.

    Each intermediate count is reported through ``big_print``.
    """
    conf = SparkConf().setAppName("jag - preprocess twitter")
    sc = SparkContext(conf=conf)

    bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt',
                                          '')
    # Bounds of the date window, broadcast so the UDF below can read them on
    # the executors. NOTE(review): whether the bounds are inclusive is decided
    # by in_time_window, which is not visible here.
    dt_low = datetime.date(2015, 12, 1)
    bc_low = sc.broadcast(dt_low)
    dt_high = datetime.date(2016, 1, 1)
    bc_high = sc.broadcast(dt_high)

    sqlContext = SQLContext(sc)
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, True, bc_lStopWords),
        returnType=BooleanType())
    sqlContext.registerFunction("inDateWindow",
                                lambda dt: in_time_window(dt, bc_low, bc_high),
                                returnType=BooleanType())

    # NOTE(review): 66 is presumably the data-type code understood by
    # csvToDataFrame -- confirm against to_parquet.
    raw = to_parquet.csvToDataFrame(sc, sqlContext, "...", 66).cache()
    n_raw = raw.count()
    big_print("Read in " + str(n_raw) + " tweets")

    raw.registerTempTable("raw")
    # The UDF name must match the one registered above; the previous spelling
    # ("inDateWindwo") referenced a function that was never registered.
    sqlCommand = "SELECT * from raw WHERE hasScorableWord(text) AND inDateWindow(dt)"
    df = sqlContext.sql(sqlCommand).cache()
    n_df = df.count()
    big_print("Binning " + str(n_df) + " entries with scorable words")

    binned = df.map(lambda x: (aggregatedComparison.groupString(x, True, 0.01), x))\
                .groupByKey()\
                .filter(lambda x: aggregatedComparison.hasMinUsers(x[1], 4))\
                .map(create_bin)\
                .cache()
    n_binned = binned.count()

    big_print("Writing " + str(n_binned) + " to ES")
    es_write_conf = {
        "es.nodes": "localhost",  # or ES url
        "es.port": "9200",
        "es.resource": "g_trainer/points"
    }

    # EsOutputFormat ignores the path argument, hence the '-' placeholder.
    binned.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
Exemple #2
0
    # Copy the parsed command-line options into local names.
    jobNm = args.jobNm
    outPart = args.outPart
    nDataType = args.datTyp
    sCustStop = args.sCustStop
    fBinSize = args.binSize
    nMinClusterUnique = args.nMinClusterUnique
    bUseStopFilter = args.bUseStopFilter
    stopWordsPath = args.stopWordsFile
    # bUseDate is compared as a string: only the exact values "True" and
    # "true" switch it on; anything else leaves it False.
    bUseDate = args.bUseDate == "True" or args.bUseDate == "true"

    # Spark entry points; the application name comes from the -jobNm option.
    conf = SparkConf().setAppName(jobNm)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Read in stop word list early to get notified of issues early in process
    bc_lStopWords = fspLib.load_stopwords(sc, stopWordsPath, sCustStop)

    # t0 is not read anywhere in this excerpt; presumably it marks the start
    # of the whole job for a total-time report later on -- confirm.
    t0 = time.time()
    # Read in data and filter out entries with no valid words.
    # inputFile and inputPartitions are assigned before this excerpt begins.
    t1 = time.time()
    records = aggregatedComparison.initialFilter(sc, sqlContext, inputFile,
                                                 nDataType, inputPartitions,
                                                 bUseStopFilter,
                                                 bc_lStopWords).cache()
    # count() triggers the read/filter, so t2 - t1 covers that work.
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff

    # Find the word document frequency for the corpus
Exemple #3
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
# Maps each input path to an integer data-type code.
# NOTE(review): presumably the code is what csvToDataFrame expects -- confirm.
DATA_SETS = {"/data/ingest/twitter/success/": 2}

# Date bounds. Written without leading zeros: "03"/"01" are octal literals in
# Python 2 and a SyntaxError in Python 3; plain decimals mean the same in both.
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    # Command-line interface: two required positionals (shape file, output
    # path) plus optional job name, coalesce count and stop-word settings.
    # Help texts use argparse's %(default)s placeholder so the documented
    # default always matches the real one (the -cNum help used to say 3 while
    # the default was 8, and the -jobNm help quoted a different string).
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm", help="Application name, default = '%(default)s'", default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help="Number of processes to coalesce initial input data to, default = %(default)s", default=8)
    parser.add_argument("--stopWordsFile", help="File path to a stop words list. One word per line. default=%(default)s", default="inputFiles/stopWordList.txt")
    parser.add_argument("-sCustStop", help="Comma seperated list of stop words to add include on this run", default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
Exemple #4
0
    )
    args = parser.parse_args()

    inputPartitions = args.partitions
    jobNm = args.jobNm
    bUseStopFilter = args.bUseStopFilter
    stopWordsPath = args.stopWordsFile
    sCustStop = args.sCustStop
    datasets = args.datasets.split(",")

    conf = SparkConf().setAppName(jobNm)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Read in stop word list early to get notified of issues early in process
    bc_lStopWords = fspLib.load_stopwords(sc, stopWordsPath, sCustStop)

    t0 = time.time()

    records = None
    for file in datasets:
        print "reading file: ", file
        if records == None:
            records = sqlContext.parquetFile(file)
        else:
            newRec = sqlContext.parquetFile(file)
            records = records.unionAll(newRec)

    if inputPartitions > 0:
        records = records.repartition(inputPartitions)
Exemple #5
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
# Maps each input path to an integer data-type code.
# NOTE(review): presumably the code is what csvToDataFrame expects -- confirm.
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
# Date bounds. Written without leading zeros: "03"/"01" are octal literals in
# Python 2 and a SyntaxError in Python 3; plain decimals mean the same in both.
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)
if __name__ == "__main__":
    # Command-line interface: shape file and output path are required
    # positionals; the remaining options are optional.
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    # %(default)s is filled in by argparse, so the help text always shows the
    # real default (the hand-written text quoted a different string).
    parser.add_argument("-jobNm",
                        help="Application name, default = '%(default)s'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum",
        type=int,
        help=