Exemple #1
0
 def __init__(self, jars=None, properties=None):
     ''' Initialize Pig: register jars, then apply properties.

     jars       -- iterable of jar paths; each one is passed to
                   Pig.registerJar. None (the default) registers nothing.
     properties -- mapping of property name to value; each pair is passed
                   to Pig.set. None (the default) sets nothing.
     '''
     # None defaults replace the former [] / {} defaults, which were single
     # objects shared by every call of this method.
     if jars is None:
         jars = ()
     if properties is None:
         properties = {}
     for jar in jars:
         logger.debug(" >>> register jar: %s", jar)
         Pig.registerJar(jar)
     for key, value in properties.items():
         logger.debug(" >>> set property: %s => %s", key, value)
         Pig.set(key, value)
Exemple #2
0
def pig_init():
    """Apply the Pig settings shared by all runs.

    Registers the piggybank jar, defines short aliases for five piggybank
    UDFs, and sets pig.cachedbag.memusage to 0.6.
    """
    # The piggybank in use comes from
    # s3://elasticmapreduce/libs/pig/0.9.1/piggybank-0.9.1-amzn.jar
    Pig.registerJar('/usr/share/pig/contrib/piggybank/piggybank.jar')

    # (alias, UDF constructor) pairs, defined in exactly this order.
    udf_aliases = (
        ('DATE_TIME', 'org.apache.pig.piggybank.evaluation.datetime.DATE_TIME()'),
        ('EXTRACT', 'org.apache.pig.piggybank.evaluation.string.EXTRACT()'),
        ('FORMAT', 'org.apache.pig.piggybank.evaluation.string.FORMAT()'),
        ('FORMAT_DT', 'org.apache.pig.piggybank.evaluation.datetime.FORMAT_DT()'),
        ('REPLACE', 'org.apache.pig.piggybank.evaluation.string.REPLACE()'),
    )
    for alias, udf in udf_aliases:
        Pig.define(alias, udf)

    # The box is dedicated to Pig, so a large memory share is acceptable.
    Pig.set('pig.cachedbag.memusage', '0.6')
Exemple #3
0
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified =
FOREACH limited GENERATE rowkey,
                         colkey,
                         com.verbify.pig.TO_JSON(value);

STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs
# make the verbify UDF jar available to the scripts
Pig.registerJar(SCRIPT_ROOT + "verbify-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # Pick the relation-specific fragments: relations whose name contains
    # "inbox" get load_and_map_data + add_unread, all others get add_relname.
    if "inbox" in rel:
        rel_steps = [load_and_map_data, add_unread]
    else:
        rel_steps = [add_relname]

    # Assemble the script source from the fragments, in execution order.
    script = "".join(
        ["SET default_parallel 10;", load_rels]
        + rel_steps
        + [load_things, generate_rel_items, store_top_1000_per_rowkey]
    )
Exemple #4
0
# Exactly four arguments are expected after the script name.
if len(sys.argv) != 5:
    print("Usage: " + sys.argv[0] +
          " tenantCode start-date end-date parameters-file")
    print("Data format: yyyy/MM/dd")
    # Exit non-zero so the caller can tell a usage error from success.
    sys.exit(1)

tenantCode = sys.argv[1]
startDate = sys.argv[2]
endDate = sys.argv[3]
paramFile = sys.argv[4]

# Convert the date bounds with the project helper globalVars.dateToObjectId.
minObjectId = globalVars.dateToObjectId(startDate)
maxObjectId = globalVars.dateToObjectId(endDate)

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")

# Load the settings from the file named on the command line. The file name
# used to be hard-coded, which silently ignored the parameters-file argument.
propertiesfis = None
try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream(paramFile)
    props.load(propertiesfis)
except:
    # NOTE(review): the bare except is kept on purpose. Under Jython, Java
    # exceptions raised by java.io may not derive from Python's Exception;
    # confirm before narrowing this handler.
    print("Errore leggendo %s:  %s" % (paramFile, sys.exc_info()[0]))
    sys.exit(1)
finally:
    # Release the file handle whether or not loading succeeded.
    if propertiesfis is not None:
        propertiesfis.close()

# Pieces of a mongo invocation, built from the loaded properties.
# NOTE(review): mongo3 embeds the password in plain text; if these strings
# end up on a command line it will be visible in process listings.
mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
    'mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty(
    'mongoPwd') + ''' --authenticationDatabase admin  --quiet --eval "'''
Exemple #5
0
#!/usr/bin/python
from __future__ import with_statement
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
import globalVars
from org.apache.pig.scripting import Pig
sys.path.append('../lib/jyson-1.0.2.jar')
from com.xhaus.jyson import JysonCodec as json
from subprocess import call
import java.util as util
import java.io as javaio
import csv

# Jars registered with Pig, in this order.
for jar_path in ("/usr/hdp/current/phoenix-client/phoenix-client.jar",
                 "../lib/piggybankExtended.jar"):
    Pig.registerJar(jar_path)

# Jars for AVRO support (currently disabled)
#Pig.registerJar("/usr/hdp/current/pig-client/lib/avro-1.7.5.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/json-simple-1.1.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-core-asl-1.9.13.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-mapper-asl-1.9.13.jar")

# Exactly one argument is expected after the script name: the parameters file.
if len(sys.argv) != 2:
    print("Usage: " + sys.argv[0] + " parameters-file")
    sys.exit(1)

paramFile = sys.argv[1]

try:
    props = util.Properties()
Exemple #6
0
# Exactly four arguments are expected after the script name.
if len(sys.argv) != 5:
    print("Usage: " + sys.argv[0] +
          " tenantCode start-date end-date parameters-file")
    print("Data format: yyyy/MM/dd")
    # Exit non-zero so the caller can tell a usage error from success.
    sys.exit(1)

tenantCode = sys.argv[1]
startDate = sys.argv[2]
endDate = sys.argv[3]
paramFile = sys.argv[4]

# Convert the date bounds with the project helper globalVars.dateToObjectId.
minObjectId = globalVars.dateToObjectId(startDate)
maxObjectId = globalVars.dateToObjectId(endDate)

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")
Pig.registerJar("/usr/hdp/current/pig-client/piggybank.jar")

# Load the settings from the file named on the command line.
propertiesfis = None
try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream(paramFile)
    props.load(propertiesfis)
except:
    # NOTE(review): the bare except is kept on purpose. Under Jython, Java
    # exceptions raised by java.io may not derive from Python's Exception;
    # confirm before narrowing this handler.
    # The message now names the file that was actually opened instead of a
    # hard-coded file name.
    print("Errore leggendo %s:  %s" % (paramFile, sys.exc_info()[0]))
    sys.exit(1)
finally:
    # Release the file handle whether or not loading succeeded.
    if propertiesfis is not None:
        propertiesfis.close()

mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified =
FOREACH limited GENERATE rowkey,
                         colkey,
                         com.reddit.pig.TO_JSON(value);

STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs
# make the reddit UDF jar available to the scripts
Pig.registerJar(SCRIPT_ROOT + "reddit-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # Pick the relation-specific fragments: relations whose name contains
    # "inbox" get load_and_map_data + add_unread, all others get add_relname.
    if "inbox" in rel:
        rel_steps = [load_and_map_data, add_unread]
    else:
        rel_steps = [add_relname]

    # Assemble the script source from the fragments, in execution order.
    script = "".join(
        ["SET default_parallel 10;", load_rels]
        + rel_steps
        + [load_things, generate_rel_items, store_top_1000_per_rowkey]
    )