def __init__(self, jars=None, properties=None):
    '''
    Initialize Pig.

    Every element of ``jars`` is passed to ``Pig.registerJar`` and every
    key/value pair of ``properties`` is passed to ``Pig.set``. Each call is
    preceded by a debug log line.

    jars       -- iterable of jars to register; None (the default) or an
                  empty iterable registers nothing.
    properties -- mapping of property name to value; None (the default) or
                  an empty mapping sets nothing.
    '''
    # None sentinels replace the former mutable [] / {} defaults, which
    # Python evaluates once and shares between all calls.
    if jars is None:
        jars = ()
    if properties is None:
        properties = {}

    for jar in jars:
        logger.debug(" >>> register jar: %s", jar)
        Pig.registerJar(jar)

    # Iterate keys and index the mapping, so any object supporting
    # iteration and [] lookup keeps working as before.
    for key in properties:
        value = properties[key]
        logger.debug(" >>> set property: %s => %s", key, value)
        Pig.set(key, value)
def pig_init():
    """Apply the Pig settings shared by all runs.

    Registers the piggybank jar, defines aliases for five piggybank
    functions, and sets the 'pig.cachedbag.memusage' property.
    """
    # The piggybank in use is the one from
    # s3://elasticmapreduce/libs/pig/0.9.1/piggybank-0.9.1-amzn.jar
    Pig.registerJar('/usr/share/pig/contrib/piggybank/piggybank.jar')

    # (alias, implementing class) pairs, defined in this order.
    piggybank_functions = (
        ('DATE_TIME', 'org.apache.pig.piggybank.evaluation.datetime.DATE_TIME()'),
        ('EXTRACT', 'org.apache.pig.piggybank.evaluation.string.EXTRACT()'),
        ('FORMAT', 'org.apache.pig.piggybank.evaluation.string.FORMAT()'),
        ('FORMAT_DT', 'org.apache.pig.piggybank.evaluation.datetime.FORMAT_DT()'),
        ('REPLACE', 'org.apache.pig.piggybank.evaluation.string.REPLACE()'),
    )
    for alias, implementation in piggybank_functions:
        Pig.define(alias, implementation)

    # The box is dedicated to Pig, so let cached bags use a lot of memory.
    Pig.set('pig.cachedbag.memusage', '0.6')