Esempio n. 1
0
def initialFilter(sc, sqlContext, inputFile, nDataType, inputPartitions, bUseStopFilter, bc_lStopWords):
    """Load the input data set and keep only records whose text has a scorable word.

    nDataType == 0 selects parquet input; any other value routes through
    csvToDataFrame. inputPartitions == -1 leaves the partitioning untouched.
    """
    # Read parquet directly, or convert the CSV-style input into a DataFrame.
    if nDataType == 0:
        records = sqlContext.read.parquet(inputFile)
    else:
        records = csvToDataFrame(sc, sqlContext, inputFile, nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    # SQL UDF wrapping fspLib.hasScorableWord with the stop-word settings bound in.
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, bUseStopFilter, bc_lStopWords),
        returnType=BooleanType())
    return sqlContext.sql("SELECT * from records WHERE hasScorableWord(records.text) ")
Esempio n. 2
0
def main():
    """Preprocess tweets: keep those with scorable text inside the date
    window, bin them, and write the bins to Elasticsearch."""
    conf = SparkConf().setAppName("jag - preprocess twitter")
    sc = SparkContext(conf=conf)

    bc_lStopWords = fspLib.load_stopwords(sc, 'inputFiles/stopWordList.txt',
                                          '')
    # Broadcast the date-window bounds so the UDF can read them on executors.
    dt_low = datetime.date(2015, 12, 1)
    bc_low = sc.broadcast(dt_low)
    dt_high = datetime.date(2016, 1, 1)
    bc_high = sc.broadcast(dt_high)

    sqlContext = SQLContext(sc)
    sqlContext.registerFunction(
        "hasScorableWord",
        lambda text: fspLib.hasScorableWord(text, True, bc_lStopWords),
        returnType=BooleanType())
    sqlContext.registerFunction("inDateWindow",
                                lambda dt: in_time_window(dt, bc_low, bc_high),
                                returnType=BooleanType())

    raw = to_parquet.csvToDataFrame(sc, sqlContext, "...", 66).cache()
    n_raw = raw.count()
    big_print("Read in " + str(n_raw) + " tweets")

    raw.registerTempTable("raw")
    # BUG FIX: the UDF is registered above as "inDateWindow"; the query
    # previously called the misspelled "inDateWindwo", which would fail at
    # SQL analysis time.
    sqlCommand = "SELECT * from raw WHERE hasScorableWord(text) AND inDateWindow(dt)"
    df = sqlContext.sql(sqlCommand).cache()
    n_df = df.count()
    big_print("Binning " + str(n_df) + " entries with scorable words")

    # Group tweets by spatial bin string and drop bins with fewer than 4 users.
    binned = df.map(lambda x: (aggregatedComparison.groupString(x, True, 0.01), x))\
                .groupByKey()\
                .filter(lambda x: aggregatedComparison.hasMinUsers(x[1], 4))\
                .map(lambda x: create_bin(x))\
                .cache()
    n_binned = binned.count()

    # BUG FIX: missing space in the log message ("...4to ES").
    big_print("Writing " + str(n_binned) + " to ES")
    es_write_conf = {
        "es.nodes": "localhost",  # or ES url
        "es.port": "9200",
        "es.resource": "g_trainer/points"
    }

    binned.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
Esempio n. 3
0
def initialFilter(sc, sqlContext, inputFile, nDataType, inputPartitions,
                  bUseStopFilter, bc_lStopWords):
    """Return the input records filtered to those containing a scorable word."""
    # nDataType 0 -> parquet input; anything else -> CSV conversion helper.
    records = (sqlContext.read.parquet(inputFile)
               if nDataType == 0
               else csvToDataFrame(sc, sqlContext, inputFile, nDataType))
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')

    def _scorable(text):
        # Delegate to fspLib with the stop-word configuration captured here.
        return fspLib.hasScorableWord(text, bUseStopFilter, bc_lStopWords)

    sqlContext.registerFunction("hasScorableWord", _scorable,
                                returnType=BooleanType())
    return sqlContext.sql(
        "SELECT * from records WHERE hasScorableWord(records.text) ")
Esempio n. 4
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES (path -> numeric type code)
DATA_SETS = {"/data/ingest/twitter/success/": 2}

# BUG FIX: date(2006,03,21) / date(3000,01,01) used leading-zero integer
# literals, which are a SyntaxError in Python 3 (octal notation in Python 2).
LOWER_TIME = date(2006, 3, 21)   # earliest accepted record date
UPPER_TIME = date(3000, 1, 1)    # effectively "no upper bound"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe Data Filter'",
                        default='Geqe data filter.')
    parser.add_argument("-cNum",
                        type=int,
                        help="Number of processes to coalesce initial input data to, default = 3",
                        default=8)
    parser.add_argument("--stopWordsFile",
                        help="File path to a stop words list. One word per line. default=inputFiles/stopWordList.txt",
                        default="inputFiles/stopWordList.txt")
    # typo fix in user-facing help text: "seperated" -> "separated"
    parser.add_argument("-sCustStop",
                        help="Comma separated list of stop words to add include on this run",
                        default='')
    args = parser.parse_args()
    shapeFile = args.shapeFile
Esempio n. 5
0
############# ############# ############# ############# #############
# filterData
# by JAG3
#
############# ############# ############# ############# #############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader
# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES (path -> numeric type code)
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
# BUG FIX: leading-zero month/day literals (03, 01) are a SyntaxError in
# Python 3; use plain decimal values instead.
LOWER_TIME = date(2006, 3, 21)  # earliest accepted record date
UPPER_TIME = date(3000, 1, 1)   # effectively "no upper bound"
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe Data Filter'",
                        default='Geqe data filter.')
    parser.add_argument(
        "-cNum",
        type=int,
        help=