def __init__(self, jars=None, properties=None):
    """Initialize Pig: register extra jars and apply Pig properties.

    jars: iterable of jar paths to pass to Pig.registerJar (default: none).
    properties: mapping of Pig property name -> value for Pig.set
        (default: none).
    """
    # Use the None sentinel instead of mutable default arguments ([] / {}),
    # which are shared across calls. Behavior for callers is unchanged.
    if jars is None:
        jars = []
    if properties is None:
        properties = {}
    for jar in jars:
        logger.debug(" >>> register jar: %s", jar)
        Pig.registerJar(jar)
    for key in properties:
        logger.debug(" >>> set property: %s => %s", key, properties[key])
        Pig.set(key, properties[key])
def pig_init():
    """Apply the Pig configuration shared by every run."""
    # Piggybank build taken from
    # s3://elasticmapreduce/libs/pig/0.9.1/piggybank-0.9.1-amzn.jar
    Pig.registerJar('/usr/share/pig/contrib/piggybank/piggybank.jar')
    # UDF aliases -> implementing piggybank classes, registered in order.
    udf_aliases = (
        ('DATE_TIME', 'org.apache.pig.piggybank.evaluation.datetime.DATE_TIME()'),
        ('EXTRACT', 'org.apache.pig.piggybank.evaluation.string.EXTRACT()'),
        ('FORMAT', 'org.apache.pig.piggybank.evaluation.string.FORMAT()'),
        ('FORMAT_DT', 'org.apache.pig.piggybank.evaluation.datetime.FORMAT_DT()'),
        ('REPLACE', 'org.apache.pig.piggybank.evaluation.string.REPLACE()'),
    )
    for alias, impl in udf_aliases:
        Pig.define(alias, impl)
    # The box is dedicated to pig, so let bags use a generous memory share.
    Pig.set('pig.cachedbag.memusage', '0.6')
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};
jsonified = FOREACH limited GENERATE rowkey, colkey, com.verbify.pig.TO_JSON(value);
STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs

# register the verbify udfs (provides com.verbify.pig.TO_JSON used above)
Pig.registerJar(SCRIPT_ROOT + "verbify-pig-udfs.jar")

# process rels
# NOTE(review): relations presumably maps rel name -> (column family,
# thing2 type); it is defined earlier in the file — confirm there.
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script by concatenating Pig Latin fragments
    # (string templates defined earlier in this file)
    script = "SET default_parallel 10;"
    script += load_rels
    # "inbox" rels get the unread-flag fragments; everything else gets a
    # relname fragment (fragment bodies not visible here — verify intent)
    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
    script += load_things
    script += generate_rel_items
    script += store_top_1000_per_rowkey
if len(sys.argv) != 5: print "Usage: " + sys.argv[ 0] + " tenantCode start-date end-date parameters-file" print "Data format: yyyy/MM/dd" sys.exit() tenantCode = sys.argv[1] startDate = sys.argv[2] endDate = sys.argv[3] paramFile = sys.argv[4] minObjectId = globalVars.dateToObjectId(startDate) maxObjectId = globalVars.dateToObjectId(endDate) Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar") Pig.registerJar("../lib/yucca-phoenix-pig.jar") try: props = util.Properties() propertiesfis = javaio.FileInputStream("mongo_parameters_prod.txt") props.load(propertiesfis) except: print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0] sys.exit(1) mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty( 'mongoPort') + "/DB_SUPPORT" mongo2 = " -u " + props.getProperty('mongoUsr') mongo3 = " -p " + props.getProperty( 'mongoPwd') + ''' --authenticationDatabase admin --quiet --eval "'''
#!/usr/bin/python from __future__ import with_statement import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__')))) import globalVars from org.apache.pig.scripting import Pig sys.path.append('../lib/jyson-1.0.2.jar') from com.xhaus.jyson import JysonCodec as json from subprocess import call import java.util as util import java.io as javaio import csv Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar") Pig.registerJar("../lib/piggybankExtended.jar") #Jar to use AVRO #Pig.registerJar("/usr/hdp/current/pig-client/lib/avro-1.7.5.jar") #Pig.registerJar("/usr/hdp/current/pig-client/lib/json-simple-1.1.jar") #Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-core-asl-1.9.13.jar") #Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-mapper-asl-1.9.13.jar") if len(sys.argv) != 2: print "Usage: " + sys.argv[0] + " parameters-file" sys.exit(1) paramFile = sys.argv[1] try: props = util.Properties()
if len(sys.argv) != 5: print "Usage: " + sys.argv[ 0] + " tenantCode start-date end-date parameters-file" print "Data format: yyyy/MM/dd" sys.exit() tenantCode = sys.argv[1] startDate = sys.argv[2] endDate = sys.argv[3] paramFile = sys.argv[4] minObjectId = globalVars.dateToObjectId(startDate) maxObjectId = globalVars.dateToObjectId(endDate) Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar") Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar") Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar") Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar") Pig.registerJar("../lib/yucca-phoenix-pig.jar") Pig.registerJar("/usr/hdp/current/pig-client/piggybank.jar") try: props = util.Properties() propertiesfis = javaio.FileInputStream(paramFile) props.load(propertiesfis) except: print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0] sys.exit(1) mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};
jsonified = FOREACH limited GENERATE rowkey, colkey, com.reddit.pig.TO_JSON(value);
STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs

# register the reddit udfs (provides com.reddit.pig.TO_JSON used above)
Pig.registerJar(SCRIPT_ROOT + "reddit-pig-udfs.jar")

# process rels
# NOTE(review): relations presumably maps rel name -> (column family,
# thing2 type); it is defined earlier in the file — confirm there.
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script by concatenating Pig Latin fragments
    # (string templates defined earlier in this file)
    script = "SET default_parallel 10;"
    script += load_rels
    # "inbox" rels get the unread-flag fragments; everything else gets a
    # relname fragment (fragment bodies not visible here — verify intent)
    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
    script += load_things
    script += generate_rel_items
    script += store_top_1000_per_rowkey