Example #1
 def __init__(self, params):
     # BIND and RUN
     self.params = params
     self.set_param_defaults()
     Pig.fs("rmr " + self.params['output_name'])
     generator = PigScriptGenerator.PigScriptGenerator(self.params)
     full_script = generator.generate()
     
     P = Pig.compile( full_script )
     
     results = P.bind({
                           'output':self.params['output_name'],
                           }).runSingle()
     
     if results.isSuccessful() :
         print 'Pig job succeeded'
     else :
         raise Exception('Pig job failed')
     result_iter = results.result("final_set").iterator()
     
     #This takes care of turning our iter into something we can use.
     self.make_dict_from_results(result_iter)
     
     send_to_grapht = raw_input('do you want to send this data to grapht?')
     if send_to_grapht not in ('y', 'yes', '1'): 
         sys.exit()
     connector = GraphtConnector('grapht.shuttercorp.net')
     metric = self.params['output_name']
     connector.record_data_points(metric, self.result)
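
The make_dict_from_results helper called above is not shown in the example; a minimal sketch of what it might look like, assuming each tuple of 'final_set' is a (key, value) pair (that schema is an assumption, not stated by the example):
 def make_dict_from_results(self, result_iter):
     # Hypothetical sketch, not part of the original example: assumes each
     # output tuple of 'final_set' is (key, value).
     self.result = {}
     while result_iter.hasNext():
         t = result_iter.next()
         self.result[str(t.get(0))] = str(t.get(1))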
     
Example #2
 def __init__ (self, jars = [], properties = {}):
     ''' Initialize Pig. '''
     for jar in jars:
         logger.debug (" >>> register jar: %s", jar)
         Pig.registerJar (jar)
     for key in properties:
         logger.debug (" >>> set property: %s => %s", key, properties[key])
         Pig.set (key, properties [key]) 
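
A hedged usage sketch of the initializer above; the wrapper class name and the jar/property values are illustrative, not taken from the example:
# Hypothetical caller; 'PigRunner', the jar path, and the property are placeholders.
runner = PigRunner(jars=["/usr/lib/pig/piggybank.jar"],
                   properties={"pig.tmpfilecompression": "true"})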
Example #3
    def run(self):
        print "%s: %s" % (self.script_name, self.description)
        stats = self.bound_script.runSingle()

        if stats.isSuccessful():
            Pig.fs("touchz %s" % self.flag_file_path)
        else:
            raise Exception("\nScript %s failed! Error should be logged above.\n" % self.script_name +
                            "Once you have fixed the problem, you can restart the workflow at this step " +
                            "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
Example #4
def runbidi(src, fdest):
	P = Pig.compileFromFile('src/main/pig/bidi.pig')

	cntsbase = 'counts'
	Pig.fs('rmr ' + cntsbase)

	for count in range(10):
		dest = fdest + 'gm%04d' % count
		Pig.fs('rmr ' + dest)
		cnts = cntsbase
		params = {'src':src, 'dest':dest, 'cnts':cnts}
		bound = P.bind(params)
		job = bound.runSingle()

		if not job.isSuccessful():
			raise Exception('failed')

		src = dest

		iter = job.result('S').iterator()
		if iter.hasNext():
			Pig.fs('rmr ' + cnts)
		else:
			Pig.fs('mv ' + dest + ' ' + fdest)
			print 'ALL DONE!'
			break
Example #5
 def run (self, params, script_name, script_file, elements = []):
     ''' Execute pig. '''
     pig = Pig.compileFromFile (script_name, script_file)
     bound = pig.bind (params) 
     futures = bound.run () if isinstance (params, list) else bound.runSingle ()
     self.handle_future (futures, elements)
     self.complete ()
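
The run method above picks runSingle() for a single parameter dict and run() for a list of dicts; a hedged sketch of both call shapes (the runner instance, script names, and parameters are placeholders):
# Hypothetical calls to the run() method shown above.
runner.run({'INPUT': '/data/day1', 'OUTPUT': '/out/day1'}, 'daily', 'daily_report.pig')
runner.run([{'INPUT': '/data/day1', 'OUTPUT': '/out/day1'},
            {'INPUT': '/data/day2', 'OUTPUT': '/out/day2'}], 'daily', 'daily_report.pig')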
Example #6
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    P = Pig.compileFromFile("../pigscripts/#{script_name}.pig")
    bound = P.bind()
    bound.runSingle()
Example #7
def main(argv=None):
#Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
#but it just doesn't work. I'm not sure whether the fix has been applied in my version; I can get a test .py
#that only has two lines, import sys and print sys.argv, to work. Here is the issue:
#https://issues.apache.org/jira/browse/PIG-2548
#    if argv is None:
#        argv = sys.argv
#    if len(argv) != 3:
#        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
#        return 1
#
#    profile_file = argv[1]
#    timeframe = argv[2]
    
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if not (timeframe == 'daily' or timeframe == 'weekly' or timeframe == 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    #Load the config
    profile = {}
    execfile(profile_file, {'timeframe':timeframe}, profile)

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
Example #8
def import_logs(profile):
    """ Import all the log files for a given day and processed them putting each in a log dir.
        If the profile is a list there are multiple files otherwise only a single one.
        The files are combined when running web_load.pig
    """
    #Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    #Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Example #9
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    # compile the pig code
    for i in range(10):
        print 'Run %s started!' % i
        P = Pig.compileFromFile("../pigscripts/avg_songs_per_split_counter.pig")

        bound = P.bind({"ITERATION_NUM":i})

        ps = bound.runSingle()
        print 'Run %s done!' % i

        result = ps.result("avg_split_song_count")
        for r in result.iterator():
            print r

        if int(r.get(1).toString()) >= 5:
            print 'Good enough! Quitting time!'
            break
Example #10
    print 'LOG: Elapsed %f' % (endTime - startTime)
    # Remove the guardFile
    fs.delete( guardFile, True )
    System.exit(0)

if fs.exists( parsedDir ):

    # parsed-captures
    if ( not fs.exists( parsedCaptures) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( parsedCaptures ).getModificationTime() ):
        print 'LOG: Graph parsed-captures create'
        fs.delete( parsedCaptures, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(parsedCaptures),
                   'JOBNAME': str(collection) + ' parsed-captures' }
        job = Pig.compileFromFile( 'pig/parsed-captures.pig' ).bind( params )
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if ( not fs.exists( linkGraph ) or
         fs.getFileStatus( parsedDir ).getModificationTime() > fs.getFileStatus( linkGraph ).getModificationTime() ):
        print 'LOG: Graph link-graph create'
        fs.delete( linkGraph, True )
        params = { 'INPUT'  : str(parsedDir),
                   'OUTPUT' : str(linkGraph),
                   'JOBNAME': str(collection) + ' link-graph' }
Example #11
try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream(paramFile)
    props.load(propertiesfis)
except:
    print "Errore leggendo " + paramFile + ": ", sys.exc_info()[0]
    sys.exit(1)

mongoConn = (props.getProperty('mongoHost') + ":" +
             props.getProperty('mongoPort') + "/DB_SUPPORT" + " -u " +
             props.getProperty('mongoUsr') + " -p " +
             props.getProperty('mongoPwd') +
             " --authenticationDatabase admin --quiet ")
mongoParam = ''' --eval "''' + "var param1='" + tenantCode + "' " + ''' " '''

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")

#lastId = '000000000000000000000000'
#if mode in ["APPEND", "append"]:
# read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
#    callResult, output = getstatusoutput('mongo ' + mongoConn + mongoParam + ' read_mongo_lastIdDatalake2Speed.js')
#    if callResult == 0:
#        print "Last id read successfully"
#        lastId = output
#    else:
#        print "Error while reading last id"
#        sys.exit(1)
print("mongo " + mongoConn + mongoParam +
      " ../list_tenant_defaults.js > tenant." + str(pid) + ".json")
callResult = call("mongo " + mongoConn + mongoParam +
Example #12
        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/07/" + str(i) 
        params.append(par) 
    
    
    prefix = "2013080"
    for i in range(1, 10):
        day = prefix + str(i)
        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/08/0" + str(i) 
        params.append(par) 
    
    
    Pig.registerUDF("attribute_click.py", "myfuncs")

#     ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');

    script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;

job_views = FOREACH job_view_events GENERATE 
  (int)header.memberId   AS memberId,
  (long)header.time      AS time,
  trackingCode,
  (int)trackingInfo#'0'  AS jobId;
Example #13
    vertica.accert_table_exists(table_name)
    table_size = vertica.get_table_size(table_name)
    logger.info(table_name + " table  size is " + str(table_size) + " bytes")

    output_dir = "/user/mykhail.martsynyuk/vertica/export/" + table_name
    #prepare hdfs structure
    logger.info("Move folder " + output_dir + " to backup")
    hdfs.move_folder_to_backup(output_dir)
    logger.info("Remove " + output_dir)
    hdfs.remove_folder(output_dir)

    params.append({'out': output_dir, 'table': table_name})

P = Pig.compile("""
register /usr/lib/pig/lib/pig-vertica.jar;
register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar;
A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123');
STORE A INTO '$out';
""")

bound = P.bind(params)
stats_list = bound.run()

i = 0
for stats in stats_list:
    if stats.isSuccessful():
        logger.info("SUCCESS: Table: " + params[i]["table"] +
                    "; Number jobs: " + str(stats.getNumberJobs()) +
                    "; Time to run: " + str(stats.getDuration()) +
                    "; Files written: " + str(stats.getOutputLocations()))
    else:
        logger.info("FAIL: Table: " + params[i]["table"] + "; ERRORS: " +
"""

Example #14
if aggregateMethod == "avg":
	pigScript += """
rankedTriples = FOREACH objGroup GENERATE 
		$0,$1,$2,
		AVG({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;"""
elif aggregateMethod == "max":
	pigScript += """
rankedTriples = FOREACH objGroup GENERATE 
		$0,$1,$2,
		MAX({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;"""
elif aggregateMethod == "min":
	pigScript += """
rankedTriples = FOREACH objGroup GENERATE 
		$0,$1,$2,
		MIN({($4 is null? 1F: $4),($6 is null? 1F: $6)}) AS ranking;"""
else: 
	pigScript += """
WRONGGGG. how to aggregate?!"""

pigScript += """

rmf $outputFile
STORE rankedTriples INTO '$outputFile' USING PigStorage();
"""


P = Pig.compile(pigScript)
stats = P.bind().runSingle()
Example #15
    vertica.accert_table_exists(table_name)
    table_size = vertica.get_table_size(table_name)
    logger.info(table_name + " table  size is " + str(table_size) + " bytes")
    
    output_dir = "/user/mykhail.martsynyuk/vertica/export/"+table_name
    #prepare hdfs structure
    logger.info("Move folder "+output_dir+" to backup")
    hdfs.move_folder_to_backup(output_dir)
    logger.info("Remove "+output_dir)
    hdfs.remove_folder(output_dir)
    
    params.append({'out':output_dir, 'table':table_name})
    
P = Pig.compile("""
register /usr/lib/pig/lib/pig-vertica.jar;
register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar;
A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123');
STORE A INTO '$out';
""")

bound = P.bind(params)
stats_list = bound.run()

i = 0
for stats in stats_list:
    if stats.isSuccessful():
        logger.info("SUCCESS: Table: "+params[i]["table"]+"; Number jobs: "+str(stats.getNumberJobs())+ "; Time to run: "+str(stats.getDuration())+"; Files written: "+str(stats.getOutputLocations()))
    else:
        logger.info("FAIL: Table: "+params[i]["table"]+"; ERRORS: "+stats.getAllErrorMessages())
    i+=1
    
    # Next is example of how to get script output:
Example #16
def runPigScript(pigScript,params):
    P = Pig.compileFromFile(pigScript)
    bound = P.bind(params)
    stat=bound.runSingle()
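
A hedged usage sketch of the helper above; the script path and parameters are placeholders, not from the example:
# Hypothetical invocation of runPigScript.
runPigScript('daily_agg.pig', {'INPUT': '/data/events', 'OUTPUT': '/out/daily_agg'})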
Example #17
            postString += "Result = FOREACH Result GENERATE * AS (" + fsDic[
                'genSchema'] + ");\n"

            # postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n"
            # postString += "DESCRIBE Result;\n"

    # A1BResult = JOIN BResult BY UserId, CResult By UserId;
    pigString += postString
    pigString += """
    DUMP Result;
    DESCRIBE Result;
    """

    print(pigString)

    # with open('cyygeneratedPig.pig','w') as outFile:
    #    outFile.write(pigString)
    if USE_PIG:
        P = Pig.compile(pigString)
        # P = Pig.compileFromFile('pig_bcd_bc.pig')

        # run the pig script

        if True:
            result = P.bind().runSingle()

            if result.isSuccessful():
                print 'run success'
            else:
                raise Exception('run failed')
Example #18
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register 'centroid.py' using jython as centroid; 
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, centroid.get_closest_centroid(gpa, '$centroids') as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with the last iteration
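
The script above registers 'centroid.py' as a Jython UDF but the UDF itself is not shown; a minimal sketch of what get_closest_centroid might look like (an assumption, not the original file):
# centroid.py -- hypothetical sketch of the Jython UDF registered above.
# Pig exposes the outputSchema decorator to scripts registered "using jython".
@outputSchema('centroid:double')
def get_closest_centroid(gpa, centroids_str):
    # $centroids is bound to a ':'-separated string of centroid values.
    centroids = [float(c) for c in centroids_str.split(':')]
    return min(centroids, key=lambda c: abs(c - gpa))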
Example #19
#!/usr/bin/python 
# -*- coding: utf-8 -*-
# explicitly import Pig class 
from org.apache.pig.scripting import Pig 

# COMPILE: compile method returns a Pig object that represents the pipeline
P = Pig.compile('''Arcs = LOAD '$docs_in'  USING PigStorage('\t') AS (url: chararray, pagerank: float, links:{ link: ( url: chararray ) } );   
        outlinkPageRank =  FOREACH Arcs    GENERATE   pagerank / COUNT ( links ) AS pagerank, FLATTEN ( links ) AS to_url;
        newPageRank = FOREACH   ( COGROUP outlinkPageRank BY to_url, Arcs BY url INNER )   GENERATE  
        FLATTEN (Arcs.url),
        ( 1.0 - 0.85 ) + 0.85 * SUM ( outlinkPageRank.pagerank ) AS pagerank,
	 FLATTEN (Arcs.links) AS links;
	dump newPageRank;
	STORE newPageRank INTO '$docs_out';''')
params = {'docs_in': 'urls2.txt' }
for i in range(1):
   out = "out/pagerank_data_" + str(i + 1)
   params["docs_out"] = out
   Pig.fs("rmr " + out)	
   stats = P.bind(params).runSingle()
   if not stats.isSuccessful():
      raise Exception('failed')
   params["docs_in"] = out
Example #20
    if i != k - 1:
        initial_centroids = initial_centroids + ":"
# initial_centroids = "37.475097, -122.155599:37.486098,-122.195388:37.4985769, -122.2195727:37.4608874, -122.143838:37.453407, -122.182255"

# initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
# last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]
print last_centroids
print initial_centroids


P = Pig.compile(
    """register Find.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw_data = load 'MP_match.txt' as (latitude:double, longitude:double, status:chararray);
                   centroided = foreach raw_data generate status, latitude, longitude, find_centroid(latitude, longitude) as centroid;
                   grouped = group centroided by centroid;
                   store grouped into 'grouped';
                   result = foreach grouped generate group, AVG(centroided.latitude), AVG(centroided.longitude);
                   store result into 'output';
                """
)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({"centroids": initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = []
Example #21
def run_pagerank():
    """
    Calculates pageranks for Twitter users.

    Three main steps:
        1. Preprocessing: Process input data to:
             a) Count the total number of users.
             b) Prepare initial pagerank values for all users.
        2. Iterative: Calculate new pageranks for each user based on the previous pageranks of the
                      users' followers.
        3. Postprocessing: Find the top pagerank users and join to a separate dataset to find their names.
    """
    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(PREPROCESS_SCRIPT)
    preprocess_bound = preprocess.bind({
        "INPUT_PATH": FOLLOWER_GRAPH_INPUT,
        "PAGERANKS_OUTPUT_PATH": PREPROCESS_PAGERANKS,
        "NUM_USERS_OUTPUT_PATH": PREPROCESS_NUM_USERS
    })
    preprocess_stats = preprocess_bound.runSingle()
    num_users = int(str(preprocess_stats.result("num_users").iterator().next().get(0)))
    convergence_threshold = CONVERGENCE_THRESHOLD / num_users


    # Iteration step:
    iteration = Pig.compileFromFile(PAGERANK_ITERATE_SCRIPT)
    for i in range(MAX_NUM_ITERATIONS):
        print "Starting iteration step: %s" % str(i + 1)

        # Append the iteration number to the input/output stems
        iteration_input = PREPROCESS_PAGERANKS if i == 0 else (ITERATION_PAGERANKS_PREFIX + str(i-1))
        iteration_pageranks_output = ITERATION_PAGERANKS_PREFIX + str(i)
        iteration_max_diff_output = ITERATION_MAX_DIFF_PREFIX + str(i)

        iteration_bound = iteration.bind({
            "INPUT_PATH": iteration_input,
            "DAMPING_FACTOR": DAMPING_FACTOR,
            "NUM_USERS": num_users,
            "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
            "MAX_DIFF_OUTPUT_PATH": iteration_max_diff_output
        })
        iteration_stats = iteration_bound.runSingle()

        # If we're below the convergence_threshold break out of the loop.
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            print "Max diff %s under convergence threshold. Stopping." % max_diff
            break
        elif i == MAX_NUM_ITERATIONS-1:
            print "Max diff %s above convergence threshold but hit max number of iterations.  Stopping." \
                    % max_diff
        else:
            print "Max diff %s above convergence threshold. Continuing." % max_diff

    iteration_pagerank_result = ITERATION_PAGERANKS_PREFIX + str(i)

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess = Pig.compileFromFile(POSTPROCESS_SCRIPT)
    postprocess_bound = postprocess.bind({
        "PAGERANKS_INPUT_PATH": iteration_pagerank_result,
        "USERNAMES_INPUT_PATH": USERNAMES_INPUT,
        "TOP_N": NUM_TOP_USERS,
        "OUTPUT_BUCKET": OUTPUT_BUCKET
    })
    postprocess_stats = postprocess_bound.runSingle()
Example #22
#!/usr/bin/python
import sys
from org.apache.pig.scripting import Pig
from bidipig import runbidi

# make minhash clusters
minhash = Pig.compileFromFile('src/main/pig/minhash.pig')

osrc = src = sys.argv[1]
destminhash = sys.argv[2] + '-minhash'
dest = sys.argv[2] + '-jaccard'
minjaccard = 80

bound = minhash.bind()

job = bound.runSingle()

if not job.isSuccessful():
	raise Exception('failed in minhash')
# output is pairs and scores

# make transitive closure of clusters
src = dest
dest = sys.argv[2] + '-bidi'
runbidi(src, dest)

# join with original data
join = Pig.compileFromFile('src/main/pig/join.pig')

src = osrc
keys = dest
Example #23
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS          = 10e-6      # maximum distance between consecutive weights for convergence

pig_script   = sys.argv[1] # pig script to run iteratively
data_dir     = sys.argv[2] # directory where intermediate weights will be written
features     = sys.argv[3] # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4] # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir    
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights   = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))    
    w0_fields.append({"name":"w%s" % i,"type":25,"schema":None}) # See Pig's DataType.java

path = tempfile.mkdtemp()
w0   = open("%s/part-r-00000" % path, 'w')
w0.write("\t".join(weights)+"\n")
w0.close()
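
The initial weights are only written to a local temp file above; a hedged sketch (an assumption, the next step is not shown in the excerpt) of pushing them into data_dir, where the cleaned-up weight-* directories live:
# Hypothetical continuation: copy the locally written initial weights into HDFS.
# The 'weight-0' name is an assumption based on the "rmr %s/weight-*" cleanup above.
Pig.fs("copyFromLocal %s %s/weight-0" % (path, data_dir))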
Example #24
#!/usr/bin/python

from org.apache.pig.scripting import Pig
import time

P = Pig.compile("""

InOut = LOAD '$in_links' using PigStorage('\t') as (home_url: chararray, links:{ link: ( url: chararray ) } );

InPagerank = LOAD '$in_pagerank' using PigStorage('\t') as (home_url: chararray, rank : float);

InData = JOIN InPagerank by home_url, InOut by home_url;

Data = FOREACH InData GENERATE InOut::home_url as url, InPagerank::rank as rank, InOut::links as links;

outbound_pagerank = FOREACH Data GENERATE rank/COUNT(links) AS pagerank_transfer, FLATTEN (links) AS outbound_links;

new_pagerank = FOREACH (GROUP outbound_pagerank BY outbound_links) GENERATE group AS url, 0.15 + 0.85 * SUM(outbound_pagerank.pagerank_transfer) AS pagerank;

STORE new_pagerank INTO '$out' USING PigStorage('\t');

""")

params = {
    'in_links': './data/output_links.txt',
    'in_pagerank': './data/output_rank.txt'
}
times = []

for i in range(30):
    print("Iteration " + str(i))
Example #25
    def run_pagerank(self):
        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
        """
        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(self.preprocessing_script)
        preprocess_params = {
            "INPUT_PATH": self.edges_input,
            "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
            "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
        }
        preprocess_bound = preprocess.bind(preprocess_params)
        preprocess_stats = preprocess_bound.runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes = long(
            str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(self.convergence_threshold * num_nodes *
                                     num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (
            num_nodes, convergence_threshold)

        # Iteration step:
        iteration = Pig.compileFromFile(self.iteration_script)
        for i in range(self.max_num_iterations):
            print "Starting iteration step: %s" % str(i + 1)

            # Append the iteration number to the input/output stems
            iteration_input = self.preprocess_pageranks if i == 0 else (
                self.iteration_pageranks_prefix + str(i - 1))
            iteration_pageranks_output = self.iteration_pageranks_prefix + str(
                i)
            iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(
                i)

            iteration_bound = iteration.bind({
                "INPUT_PATH":
                iteration_input,
                "DAMPING_FACTOR":
                self.damping_factor,
                "NUM_NODES":
                num_nodes,
                "PAGERANKS_OUTPUT_PATH":
                iteration_pageranks_output,
                "AGG_RANK_CHANGE_OUTPUT_PATH":
                iteration_rank_changes_output
            })
            iteration_stats = iteration_bound.runSingle()

            # If we're below the convergence threshold break out of the loop.
            aggregate_rank_change = long(
                str(
                    iteration_stats.result(
                        "aggregate_rank_change").iterator().next().get(0)))
            if aggregate_rank_change < convergence_threshold:
                print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                       % (aggregate_rank_change, convergence_threshold)
                break
            elif i == self.max_num_iterations - 1:
                print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                      ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                       "Stopping."
            else:
                print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                       % (aggregate_rank_change, convergence_threshold)

        iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

        # Postprocessing step:
        print "Starting postprocessing step."
        postprocess = Pig.compileFromFile(self.postprocessing_script)
        postprocess_params = {
            "PAGERANKS_INPUT_PATH": iteration_pagerank_result
        }
        if self.output_path is not None:  # otherwise, the script outputs to the default location,
            # which is a special directory in s3://mortar-example-output-data
            # permissioned for your Mortar account.
            postprocess_params["OUTPUT_PATH"] = self.output_path
        postprocess_bound = postprocess.bind(postprocess_params)
        postprocess_stats = postprocess_bound.runSingle()
Example #26
from org.apache.pig.scripting import Pig
from pagerank_lib import Pagerank

if __name__ == "__main__":
    params = Pig.getParameters()

    try:
        input_path = params["INPUT_PATH"]
        output_path = params["OUTPUT_PATH"]
        tmp_output_dir = params["TMP_OUTPUT_DIR"]
    except:
        print "Usage: mortar baconbits:[local_]run pagerank " + "-p INPUT_PATH=<...> -p OUTPUT_PATH=<...> -p TMP_OUTPUT_DIR=<...> "

    damping_factor = 0.85
    if "DAMPING_FACTOR" in params:
        damping_factor = float(params["DAMPING_FACTOR"])

    convergence_threshold = 0.001
    if "CONVERGENCE_THRESHOLD" in params:
        convergence_threshold = float(params["CONVERGENCE_THRESHOLD"])

    max_num_iterations = 10
    if "MAX_NUM_ITERATIONS" in params:
        max_num_iterations = int(params["MAX_NUM_ITERATIONS"])

    id_name_map = None
    if "ID_NAME_MAP" in params:
        id_name_map = params["ID_NAME_MAP"]

    Pagerank.run_pagerank(
        input_path,
Example #27
    for i in range(10, 32):
        day = prefix + str(i)
        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/07/" + str(i)
        params.append(par)

    prefix = "2013080"
    for i in range(1, 10):
        day = prefix + str(i)
        par = {}
        par['DATA_DATE'] = day
        par['REPORT_DATE'] = "2013/08/0" + str(i)
        params.append(par)

    Pig.registerUDF("attribute_click.py", "myfuncs")

    script = """
%declare OUTPUT '/user/haliu'
        applypair = LOAD '/data/tracking/JobApplyClickEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');
        applypair = foreach applypair generate header.memberId as memberId, jobId, header.time as time;
        
        member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
        
        applypair = join applypair by memberId, member by memberId parallel 2000;
        applypair = foreach applypair generate applypair::memberId as memberId, applypair::jobId as jobId, applypair::time as time; 
        
        applypair = distinct applypair parallel 1;  
        store applypair into '$OUTPUT/JYMBII-batch/history/positive/$REPORT_DATE' USING BinaryJSON('memberId'); 
    """
Example #28
if __name__ == '__main__':
    from org.apache.pig.scripting import Pig
    import sys
    P = Pig.compileFromFile('/home/course/lian9478/task3.pig')
    params = {}
    for i in range(int(sys.argv[1])):
        if i == 0:
            out = '/home/course/lian9478/HW4-old_twitter_account_rank.csv'
        else:
            out = "out/pagerank_data_" + str(i + 1)
        params['doc_in'] = out
        params['doc_out'] = "out/pagerank_data_" + str(i + 2)
        bound = P.bind(params)
        bound.runSingle()
        #this is to do it one by one instead of parallel
        #so you can call this driver like this
        #pig -x local -embedded jython driver.py 20
Example #29
_out     = _in  + '_counts_m' + _min_count
_out_nc  = _out + '/count'
_out_v   = _out + '/vocab'
_out_nf  = _out + '/nfollow'
_out_np  = _out + '/nprecede'
_out_nfp = _out + '/nfollowerprecede'
_out_njc = _out + '/countsjoined'

##
# start actual pig jobs
#
from org.apache.pig.scripting import Pig 

# if output path does not exist, create it
if Pig.fs('-test -d ' + _out):
	Pig.fs('mkdir ' + _out)

##
# CountJob
#
# if the output path of the count job already exists, skip it; otherwise run the job
##
if not Pig.fs('-test -d ' + _out_nc):
	print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
	result = Pig.compile(_header + """
	count_ngrams( '${in}', '${out}', '${min_count}' );
	""").bind({'in':_in, 'out':_out_nc, 'min_count': _min_count, 'n':'count-ngrams'}).runSingle()
	# check the result
	if not result.isSuccessful():
Example #30
#!/usr/bin/python
from __future__ import with_statement
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
import globalVars
from org.apache.pig.scripting import Pig
sys.path.append('../lib/jyson-1.0.2.jar')
from com.xhaus.jyson import JysonCodec as json
from subprocess import call
import java.util as util
import java.io as javaio
import csv

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/piggybankExtended.jar")

#Jar to use AVRO
#Pig.registerJar("/usr/hdp/current/pig-client/lib/avro-1.7.5.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/json-simple-1.1.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-core-asl-1.9.13.jar")
#Pig.registerJar("/usr/hdp/current/pig-client/lib/jackson-mapper-asl-1.9.13.jar")

if len(sys.argv) != 2:
    print "Usage: " + sys.argv[0] + " parameters-file"
    sys.exit(1)

paramFile = sys.argv[1]

try:
    props = util.Properties()
Example #31
import sys
from org.apache.pig.scripting import Pig

load = Pig.compileFromFile(sys.argv[1])
iteration = Pig.compileFromFile('iteration.pig')
store = Pig.compileFromFile('store.pig')

print '*** Loading input ***'
load_stats = load.bind({'EDGES_OUT': 'edges0.tmp'}).runSingle()
if not load_stats.isSuccessful():
    raise Exception('Load failed')

i = 1
stable_inerations = 0
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
Example #32
            postString += "Result = FOREACH Result GENERATE " + fsDic['genFields'] + ";\n"
            postString += "Result = FOREACH Result GENERATE * AS (" + fsDic['genSchema'] + ");\n"

            # postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n"
            # postString += "DESCRIBE Result;\n"

    # A1BResult = JOIN BResult BY UserId, CResult By UserId;
    pigString += postString
    pigString += """
    DUMP Result;
    DESCRIBE Result;
    """

    print(pigString)

    # with open('cyygeneratedPig.pig','w') as outFile:
    #    outFile.write(pigString)
    if USE_PIG:
        P = Pig.compile(pigString)
        # P = Pig.compileFromFile('pig_bcd_bc.pig')

    # run the pig script

        if True:
            result = P.bind().runSingle()

            if result.isSuccessful():
                print 'run success'
            else:
                raise Exception('run failed')
Example #33
    System.exit(0)

if fs.exists(parsedDir):

    # parsed-captures
    if (not fs.exists(parsedCaptures)
            or fs.getFileStatus(parsedDir).getModificationTime() >
            fs.getFileStatus(parsedCaptures).getModificationTime()):
        print 'LOG: Graph parsed-captures create'
        fs.delete(parsedCaptures, True)
        params = {
            'INPUT': str(parsedDir),
            'OUTPUT': str(parsedCaptures),
            'JOBNAME': str(collection) + ' parsed-captures'
        }
        job = Pig.compileFromFile('pig/parsed-captures.pig').bind(params)
        result = job.runSingle(props)
        if not result.isSuccessful():
            print '\nERROR: Pig job parsed-captures for ' + collection
            System.exit(1)
    else:
        print 'LOG: Graph parsed-captures up-to-date'

    # link-graph
    if (not fs.exists(linkGraph)
            or fs.getFileStatus(parsedDir).getModificationTime() >
            fs.getFileStatus(linkGraph).getModificationTime()):
        print 'LOG: Graph link-graph create'
        fs.delete(linkGraph, True)
        params = {
            'INPUT': str(parsedDir),
Example #34
    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig").bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocesing step:
        print "Starting postprocessing step."

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "ID_NAME_MAP_INPUT_PATH" : id_name_map,
                "OUTPUT_PATH"            : output_path
            }).runSingle()
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "OUTPUT_PATH"            : output_path
            }).runSingle()

        Pig.fs("rmr %s" % preprocess_dir)
        Pig.fs("rmr %s" % iteration_dir)
Example #35
    params = [] 
    
    for i in range(bucket):
        par = {} 
        par['INDEX'] = str(i)
        params.append(par) 

    
    script = """
        data = load '/user/hgui/JYMBII-batch/TMP/history' USING BinaryJSON(); 

        mem = foreach data generate memberId;
        mem = distinct mem parallel 100;

        mem = sample mem 0.5; 

        data = join data by memberId, mem by memberId; 

        data = distinct data parallel 1; 

        data = foreach data generate data::label as label, data::class as class, data::memberId as memberId, data::jobId as jobId, data::score as score; 
        store data into '/user/hgui/JYMBII-batch/TMP/bucket-$INDEX';

    """

    prog = Pig.compile(script)

    for para in params:
        bound = prog.bind(para) 
        stats = bound.runSingle()
Example #36
def main():
    filename = "studenttab10k"
    k = 4
    tolerance = 0.01

    MAX_SCORE = 4
    MIN_SCORE = 0
    MAX_ITERATION = 100

    # initial centroid, equally divide the space
    initial_centroids = ""
    last_centroids = [None] * k
    for i in range(k):
        last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    # Compile Pig script. Register the same script since it contains the Jython UDF.
    # $centroids is the only binding parameter. It will be bound to different parameter with the
    # estimation for centroid from the last round. Then we calculate the average of the new clusters
    # to get the new centroid estimation, and store into "output"
    P = Pig.compile("""register 'kmeans.py' using jython as util;
                       raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double);
                       centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid;
                       grouped = group centroided by centroid;
                       result = foreach grouped generate group, AVG(centroided.gpa);
                       store result into 'output';
                    """)

    converged = False
    iter_num = 0
    while iter_num < MAX_ITERATION:
        # Binding parameter centroids to current centroids
        Q = P.bind({'centroids': initial_centroids})

        # Run Pig script
        results = Q.runSingle()

        # Check the result of the Pig script
        if not results.isSuccessful():
            raise Exception("Pig job failed")

        # Get the new centroids from the output
        iter = results.result("result").iterator()
        centroids = [None] * k
        distance_move = 0

        # Calculate the moving distance with last iteration
        for i in range(k):
            tuple = iter.next()
            centroids[i] = float(str(tuple.get(1)))
            distance_move = distance_move + fabs(last_centroids[i] -
                                                 centroids[i])
        distance_move = distance_move / k
        Pig.fs("rmr output")
        print("iteration " + str(iter_num))
        print("average distance moved: " + str(distance_move))

        # Converge
        if distance_move < tolerance:
            sys.stdout.write("k-means converged at centroids: [")
            sys.stdout.write(",".join(str(v) for v in centroids))
            sys.stdout.write("]\n")
            converged = True
            break

        # Not converged; use the new centroids as the initial centroids for the next iteration
        last_centroids = centroids[:]
        initial_centroids = ""
        for i in range(k):
            initial_centroids = initial_centroids + str(last_centroids[i])
            if i != k - 1:
                initial_centroids = initial_centroids + ":"
        iter_num += 1

    # Did not converge after MAX_ITERATION
    if not converged:
        print("did not converge after " + str(iter_num) + " iterations")
        sys.stdout.write("last centroids: [")
        sys.stdout.write(",".join(str(v) for v in last_centroids))
        sys.stdout.write("]\n")
Example #37
def run_script():
    import os
    from org.apache.pig.scripting import Pig

    nodes_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-nodes"
    edges_input = "s3n://jpacker-dev/amazon_products/fixtures/cathedral-edges"

    preprocess_vector_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/vector"
    preprocess_matrix_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/matrix"
    preprocess_num_vertices_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/preprocess/num_vertices"
    iteration_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/iteration_"
    max_diff_output_stem = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/max_diff_"
    postprocess_pageranks_output = "s3n://jpacker-dev/amazon_products/fixtures/cathedral_pagerank/pageranks"

    damping_factor = 0.85

    preprocess = Pig.compileFromFile("../pigscripts/pagerank_preprocess.pig")
    preprocess_bound = preprocess.bind({ 
        "EDGES_INPUT_PATH": edges_input,
        "VECTOR_OUTPUT_PATH": preprocess_vector_output,
        "MATRIX_OUTPUT_PATH": preprocess_matrix_output,
        "NUM_VERTICES_OUTPUT_PATH": preprocess_num_vertices_output,
        "DAMPING_FACTOR": damping_factor
    })
    preprocess_stats = preprocess_bound.runSingle()

    num_vertices = int(str(preprocess_stats.result("num_vertices_copy").iterator().next().get(0)))

    iteration = Pig.compileFromFile("../pigscripts/pagerank_iterate.pig")
    max_num_iterations = 7
    num_iterations = 0
    convergence_threshold = 0.15 / float(num_vertices)

    for i in range(1, max_num_iterations + 1):
        iteration_vector_input = preprocess_vector_output if i == 1 else (iteration_output_stem + str(i-1))
        iteration_matrix_input = preprocess_matrix_output

        iteration_output = iteration_output_stem + str(i)
        max_diff_output = max_diff_output_stem + str(i)

        iteration_bound = iteration.bind({
            "VECTOR_INPUT_PATH": iteration_vector_input,
            "MATRIX_INPUT_PATH": iteration_matrix_input,
            "ITERATION_OUTPUT_PATH": iteration_output,
            "MAX_DIFF_OUTPUT_PATH": max_diff_output,
            "NUM_VERTICES": num_vertices,
            "DAMPING_FACTOR": damping_factor
        })
        iteration_stats = iteration_bound.runSingle()

        num_iterations += 1
        max_diff = float(str(iteration_stats.result("max_diff").iterator().next().get(0)))
        if max_diff < convergence_threshold:
            break

    result_vector = iteration_output_stem + str(num_iterations)

    postprocess = Pig.compileFromFile("../pigscripts/pagerank_postprocess.pig")
    postprocess_bound = postprocess.bind({
        "NODES_INPUT_PATH": nodes_input,
        "RESULT_VECTOR": result_vector,
        "OUTPUT_PATH": postprocess_pageranks_output
    })
    postprocess_bound.runSingle()
Example #38
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with the last iteration
Example #39
#!/usr/bin/python
import time
import sys 
from org.apache.pig.scripting import Pig

if __name__ == '__main__':
    P = Pig.compileFromFile("""calvisit.pig""")

    defaulttime = time.time()
    deadlinesec = defaulttime - 1800
    deadline = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(deadlinesec))
    
    if len(sys.argv) > 1:
        deadline = sys.argv[1]

    Q = P.bind({'deadline':deadline, 'input':'input', 'result':'result', 'inputtmp':'inputtmp'})
    results = Q.runSingle()
    if not results.isSuccessful():
       raise Exception("Pig job failed")
    else:
       print results
Example #40
from math                     import ceil, log
from org.apache.pig.scripting import Pig

if __name__ == "__main__":
    params        = Pig.getParameters()
    graph         = params["GRAPH"]
    seed_vertices = params["SEED_VERTICES"]
    tmp_dir       = params["TMP_DIR"]
    output_path   = params["OUTPUT_PATH"]
    nhood_size    = int(params["NEIGHBORHOOD_SIZE"])

    preprocess_graph        = "%s/preprocess/graph"        % tmp_dir
    preprocess_num_vertices = "%s/preprocess/num_vertices" % tmp_dir
    iteration_verts_prefix  = "%s/iteration/vertices_"     % tmp_dir

    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH"         : graph,
        "GRAPH_OUTPUT_PATH"        : preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations   = nhood_size - 1
    num_vertices     = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i+1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH"  : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)),
Example #41
    def run_pagerank(self):
        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Find the top pagerank nodes and join to a separate dataset to find their names.
        """
        # Preprocessing step:
        print "Starting preprocessing step."
        preprocess = Pig.compileFromFile(self.preprocessing_script)
        preprocess_params = {
            "INPUT_PATH": self.edges_input,
            "PAGERANKS_OUTPUT_PATH": self.preprocess_pageranks,
            "NUM_NODES_OUTPUT_PATH": self.preprocess_num_nodes
        }
        preprocess_bound = preprocess.bind(preprocess_params)
        preprocess_stats = preprocess_bound.runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes = long(str(preprocess_stats.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(self.convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        iteration = Pig.compileFromFile(self.iteration_script)
        for i in range(self.max_num_iterations):
            print "Starting iteration step: %s" % str(i + 1)

            # Append the iteration number to the input/output stems
            iteration_input = self.preprocess_pageranks if i == 0 else (self.iteration_pageranks_prefix + str(i-1))
            iteration_pageranks_output = self.iteration_pageranks_prefix + str(i)
            iteration_rank_changes_output = self.iteration_rank_changes_prefix + str(i)

            iteration_bound = iteration.bind({
                "INPUT_PATH": iteration_input,
                "DAMPING_FACTOR": self.damping_factor,
                "NUM_NODES": num_nodes,
                "PAGERANKS_OUTPUT_PATH": iteration_pageranks_output,
                "AGG_RANK_CHANGE_OUTPUT_PATH": iteration_rank_changes_output
            })
            iteration_stats = iteration_bound.runSingle()

            # If we're below the convergence threshold break out of the loop.
            aggregate_rank_change = long(str(iteration_stats.result("aggregate_rank_change").iterator().next().get(0)))
            if aggregate_rank_change < convergence_threshold:
                print "Sum of ordering-rank changes %d under convergence threshold %d. Stopping." \
                       % (aggregate_rank_change, convergence_threshold)
                break
            elif i == self.max_num_iterations-1:
                print ("Sum of ordering-rank changes %d " % aggregate_rank_change) + \
                      ("above convergence threshold %d but hit max number of iterations. " % convergence_threshold) + \
                       "Stopping."
            else:
                print "Sum of ordering-rank changes %d above convergence threshold %d. Continuing." \
                       % (aggregate_rank_change, convergence_threshold)

        iteration_pagerank_result = self.iteration_pageranks_prefix + str(i)

        # Postprocessing step:
        print "Starting postprocessing step."
        postprocess = Pig.compileFromFile(self.postprocessing_script)
        postprocess_params = { "PAGERANKS_INPUT_PATH": iteration_pagerank_result }
        if self.output_path is not None: # otherwise, the script outputs to the default location,
                                         # which is a special directory in s3://mortar-example-output-data
                                         # permissioned for your Mortar account.
            postprocess_params["OUTPUT_PATH"] = self.output_path
        postprocess_bound = postprocess.bind(postprocess_params)
        postprocess_stats = postprocess_bound.runSingle()
Example #42
MIN_SCORE = 0
MAX_ITERATION = 5

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with the last iteration
Example #43
0
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids, equally dividing the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroids for this iteration, calculate the moving distance from the last iteration
Example #44
0
    def iterate_until_convergence(script_path,
                                  iteration_dir,
                                  param_generator_func,
                                  metric_name,
                                  metric_type,
                                  metric_alias,
                                  metric_threshold,
                                  max_num_iterations):

    """
    Utility for running a pigscript which outputs data in the same schema as its input iteratively,
    with the output of the previous run being the input of the next run. Stops when some convergence
    metric has been reached or if a maximum number of iterations has been reached.

    Example usage:

    iteration_result = IterationUtils.iterate_until_convergence(
            "../pigscripts/pagerank_iterate.pig", # the pigscript to iterate
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations)                   # or if this many iterations have been performed

    iteration_result is a dictionary with the number of iterations run ("num_iterations")
    and the reason the iteration stopped ("stop_reason": "CONVERGED" or "MAX_ITERATIONS").

    Example iteration_param_func:

    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = preprocess_dir + "/pageranks"
        else:
            iteration_input = it_dir + "/" + str(it_num-1) + "/pageranks"

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : it_dir + "/" + str(it_num) + "/pageranks"
            "AGG_RANK_CHANGE_OUTPUT_PATH" : it_dir + "/" + str(it_num) + "/rank_changes"
        }
    )
    """

        script = Pig.compileFromFile(script_path)
        for i in range(1, max_num_iterations+1):
            print "Starting iteration step: %d" % i

            iteration    = script.bind(param_generator_func(i, iteration_dir)).runSingle()
            metric_value = metric_type(str(iteration.result(metric_alias).iterator().next().get(0)))

            if metric_value < metric_threshold:
                print "%s %s under convergence threshold %s. Stopping." \
                       % (metric_name, str(metric_value), str(metric_threshold))
                return { "num_iterations": i, "stop_reason": "CONVERGED" }
            elif i == max_num_iterations:
                print "%s %s above convergence threshold %s but hit max number of iterations. Stopping" \
                       % (metric_name, str(metric_value), str(metric_threshold))
                return { "num_iterations": i, "stop_reason": "MAX_ITERATIONS" }
            else:
                print "%s %s above convergence threshold %s. Continuing." \
                       % (metric_name, str(metric_value), str(metric_threshold))
Example #45
0
    
    %declare DIR 'impression-inter-neg'
    data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
    data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500;
    data = foreach data generate score::memberId as memberId, score::jobId as jobId, score::score as score;
    data = distinct data parallel 1; 
    store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId'); 
   ---------------------------------------------------------------------------------------------------------
    
    %declare DIR 'positive'
    data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
    data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500;
    data = foreach data generate score::memberId as memberId, score::jobId as jobId, score::score as score; 
    data = distinct data parallel 1; 
    store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId'); 
   
   ---------------------------------------------------------------------------------------------------------

    %declare DIR 'view'
    data = load '/user/hgui/JYMBII-batch/history/$DIR/2013/$MM/$DD' USING BinaryJSON();
    data = join score by (memberId, jobId), data by (memberId, jobId) parallel 500; 
    data = foreach data generate score::memberId as memberId, data::time as time, score::jobId as jobId, score::score as score; 
    data = distinct data parallel 1; 
    store data into '/user/hgui/JYMBII-batch/history/$DIR/tmp-2013/$MM/$DD' USING BinaryJSON('memberId'); 
    """

    prog = Pig.compile(script)
    for para in params:
        bound = prog.bind(para)
        stats = bound.runSingle()
Example #46
0
if len(sys.argv) != 5:
    print "Usage: " + sys.argv[
        0] + " tenantCode start-date end-date parameters-file"
    print "Data format: yyyy/MM/dd"
    sys.exit()

tenantCode = sys.argv[1]
startDate = sys.argv[2]
endDate = sys.argv[3]
paramFile = sys.argv[4]

minObjectId = globalVars.dateToObjectId(startDate)
maxObjectId = globalVars.dateToObjectId(endDate)

Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")

try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream("mongo_parameters_prod.txt")
    props.load(propertiesfis)
except:
    print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0]
    sys.exit(1)

mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
    'mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty(
    'mongoPwd') + ''' --authenticationDatabase admin  --quiet --eval "'''
Example #47
0
MIN_SCORE = 0
MAX_ITERATION = 5

# initial centroids, equally dividing the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroids for this iteration, calculate the moving distance from the last iteration
Example #48
0
 def __init__(self, script_name, description, script_path, script_params, checkpoint_path):
     self.script_name = script_name
     self.description = description
     self.bound_script = Pig.compileFromFile(script_path).bind(script_params)
     self.flag_file_path = "%s/%s.success" % (checkpoint_path, os.path.splitext(script_name)[0])
Example #49
0
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified =
FOREACH limited GENERATE rowkey,
                         colkey,
                         com.reddit.pig.TO_JSON(value);

STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs
# register the reddit udfs
Pig.registerJar(SCRIPT_ROOT + "reddit-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script
    script = "SET default_parallel 10;"
    script += load_rels
    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
    script += load_things
    script += generate_rel_items
    script += store_top_1000_per_rowkey
Example #50
0
import random
import sys
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]  # pig script to run iteratively
data_dir = sys.argv[2]  # directory where intermediate weights will be written
features = sys.argv[3]  # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({
        "name": "w%s" % i,
        "type": 25,
        "schema": None
    })  # See Pig's DataType.java

path = tempfile.mkdtemp()
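# (Hedged sketch, not part of the original example: one way the iterative fitting loop
#  implied by EPS could continue. The parameter names bound into the pig script and the
#  "new_weights" alias are assumptions, not taken from the original.)
script = Pig.compileFromFile(pig_script)
prev_weights = [float(w) for w in weights]
for i in range(100):                                   # safety cap on iterations
    out_dir = "%s/weight-%d" % (data_dir, i)
    stats = script.bind({"input": features,
                         "weights": ",".join(map(str, prev_weights)),
                         "output": out_dir}).runSingle()
    if not stats.isSuccessful():
        raise Exception("weight update iteration %d failed" % i)
    it = stats.result("new_weights").iterator()        # read back the updated weights
    new_weights = []
    while it.hasNext():
        new_weights.append(float(str(it.next().get(0))))
    deltas = [abs(a - b) for a, b in zip(new_weights, prev_weights)]
    if max(deltas) < EPS:                              # consecutive weights close enough
        break
    prev_weights = new_weights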
Example #51
0
 def run(self):
     print project_name + ": " + self.action
     compiled = Pig.compileFromFile(self.script)
     bound = compiled.bind(self.params)
     return bound.runSingle()
    script = get_script(aggregateby=aggregateby)
    if mode == 'master':
        logging.warning('unsubstituted script:\n')
        logging.warning(script)
        logging.warning('substituted script:\n')
        logging.warning(PrintScript(args=args, sub=True))
        args['mode'] = 'pigmode'
        run_bash('pig %s %s' %
                 (self_name(), ' '.join('='.join(map(str, [k, v]))
                                        for k, v in args.items())))
        #run_bash('pig %s mode=pigmode output=%s dryrun=%s ldadata=%s titledata=%s aggregateby=%s'%(self_name(), output, str(dryrun), ldadata, titledata, aggvalue))

    elif mode == 'pigmode':
        if not dryrun:
            from org.apache.pig.scripting import Pig
            P = Pig.compile(script)
            result = P.bind({
                'output': output,
                'selfname': self_name(),
                'ldadata': ldadata,
                'titledata': titledata,
                'mylibloc': mylibloc
            }).runSingle()

            if result.isSuccessful():
                print 'Pig job succeeded!'
            else:
                raise Exception('Pig job failed!')
        else:
            print 'unsubstituted:'
            print PrintScript(args=args, sub=False)
Example #53
0
#     last_centroids[i] = (MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE),MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE))
#     initial_centroids = initial_centroids + str(last_centroids[i])
#     if i!=k-1:
#         initial_centroids = initial_centroids + ":"

initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
last_centroids = [(-120.0, -120.0), (-60.0, -60.0), (0.0, 0.0), (60.0, 60.0),
                  (120.0, 120.0)]

print initial_centroids

P = Pig.compile("""register /Users/yun_shen/Desktop/spams/pigudf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw_data = load '1.log' as (spam_id:chararray, longitude:double, latitude:double);
                   raw = filter raw_data by longitude is not null and latitude is not null;
                   centroided = foreach raw generate spam_id, longitude, latitude, find_centroid(longitude, latitude) as centroid;
                   grouped = group centroided by centroid parallel 4;
                   store grouped into 'grouped';
                   result = foreach grouped generate group, AVG(centroided.longitude), AVG(centroided.latitude);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = []
    x = 0.0
Example #54
0
    sorted = ORDER non_null BY value DESC;
    limited = LIMIT sorted 1000;
    GENERATE group AS rowkey, FLATTEN(limited.(colkey, value));
};

jsonified =
FOREACH limited GENERATE rowkey,
                         colkey,
                         com.verbify.pig.TO_JSON(value);

STORE jsonified INTO '$OUTPUT' USING PigStorage();
"""

###### run the jobs
# register the verbify udfs
Pig.registerJar(SCRIPT_ROOT + "verbify-pig-udfs.jar")

# process rels
for rel, (cf, thing2_type) in relations.iteritems():
    # build source for a script
    script = "SET default_parallel 10;"
    script += load_rels
    if "inbox" in rel:
        script += load_and_map_data
        script += add_unread
    else:
        script += add_relname
    script += load_things
    script += generate_rel_items
    script += store_top_1000_per_rowkey
Example #55
0
if len(sys.argv) != 5:
    print "Usage: " + sys.argv[
        0] + " tenantCode start-date end-date parameters-file"
    print "Data format: yyyy/MM/dd"
    sys.exit()

tenantCode = sys.argv[1]
startDate = sys.argv[2]
endDate = sys.argv[3]
paramFile = sys.argv[4]

minObjectId = globalVars.dateToObjectId(startDate)
maxObjectId = globalVars.dateToObjectId(endDate)

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
Pig.registerJar("../lib/yucca-phoenix-pig.jar")
Pig.registerJar("/usr/hdp/current/pig-client/piggybank.jar")

try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream(paramFile)
    props.load(propertiesfis)
except:
    print "Errore leggendo mongo_parameters_prod.txt: ", sys.exc_info()[0]
    sys.exit(1)

mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
Example #56
0
#!/usr/bin/python

# explicitly import Pig class
from org.apache.pig.scripting import Pig

# COMPILE: compile method returns a Pig object that represents the pipeline
P = Pig.compile(
    """a = load '$input' using PigStorage() as (name:chararray, age:int, gpa:double);
    a1 = filter a by age > 18;
    a2 = foreach a1 generate name, ROUND(gpa) as gpa;
    b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);
    c = join a2 by name, b by name;
    d = group c by registration;
    e = foreach d generate group, AVG(c.gpa) as gpa;
    f = order e by gpa desc;
    store f into '$output';
""")

results = P.bind({'input': 'studenttab10k', 'output': 'output'}).runSingle()

if not results.isSuccessful():
    raise Exception("Pig job failed")
iter = results.result("f").iterator()
while iter.hasNext():
    tuple = iter.next()
    print tuple
Example #57
0
import java.io as javaio

# load the mongo connection parameters; bail out if the properties file cannot be read
try:
    props = util.Properties()
    propertiesfis = javaio.FileInputStream("mongo_parameters_prod.txt")
    props.load(propertiesfis)
except:
    print "Error reading mongo_parameters_prod.txt: ", sys.exc_info()[0]
    sys.exit(1)

mongo1 = props.getProperty('mongoHost') + ":" + props.getProperty(
    'mongoPort') + "/DB_SUPPORT"
mongo2 = " -u " + props.getProperty('mongoUsr')
mongo3 = " -p " + props.getProperty(
    'mongoPwd') + ''' --authenticationDatabase admin  --quiet --eval "'''

# var param1=438; var param2=1; var param3='datalake'" delete_dataset.js
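# (Hedged sketch, not in the original: the mongo1/mongo2/mongo3 fragments above appear
#  meant to be concatenated into a single mongo shell command together with --eval
#  parameters like those in the comment; the values and .js name below are illustrative.)
#   mongo_cmd = ("mongo " + mongo1 + mongo2 + mongo3 +
#                "var param1=438; var param2=1; var param3='datalake'" +
#                '" delete_dataset.js')
#   os.system(mongo_cmd)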

Pig.registerJar("../lib/mongo-java-driver-3.4.0.jar")
Pig.registerJar("../lib/mongo-hadoop-core-1.5.2.jar")
Pig.registerJar("../lib/mongo-hadoop-pig-1.5.2.jar")
Pig.registerJar("/usr/hdp/current/phoenix-client/phoenix-client.jar")
#Pig.registerJar("../lib/yucca-phoenix-pig.jar")

if mode in ["APPEND", "append"]:
    # read from metadata source (mongoDB) lastIdDatalake2Speed for tenant
    readLastIdJob = Pig.compileFromFile(
        """read_mongo_lastIdDatalake2Speed.pig""")
    results = readLastIdJob.bind({'tenantCode': tenantCode}).runSingle()
    if results.isSuccessful():
        print "Pig job succeeded"
        iter = results.result("lastId").iterator()
        if iter.hasNext():
            lastId = iter.next()