def __init__(self, params):
    # BIND and RUN
    self.params = params
    self.set_param_defaults()

    Pig.fs("rmr " + self.params['output_name'])

    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()

    P = Pig.compile(full_script)

    results = P.bind({
        'output': self.params['output_name'],
    }).runSingle()

    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        raise Exception('Pig job failed')

    result_iter = results.result("final_set").iterator()

    # Turn the result iterator into something we can use.
    self.make_dict_from_results(result_iter)

    send_to_grapht = raw_input('do you want to send this data to grapht? ')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()
    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
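# Hedged sketch (not in the original excerpt): one plausible shape for the
# make_dict_from_results helper used above, assuming each tuple of the
# 'final_set' relation is a (key, value) pair; the field layout is an assumption.
def make_dict_from_results(self, result_iter):
    self.result = {}
    while result_iter.hasNext():
        t = result_iter.next()  # an org.apache.pig.data.Tuple
        self.result[str(t.get(0))] = t.get(1)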
""" if aggregateMethod == "avg": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, AVG({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;""" elif aggregateMethod == "max": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, MAX({($4 is null? 0F: $4),($6 is null? 0F: $6)}) AS ranking;""" elif aggregateMethod == "min": pigScript += """ rankedTriples = FOREACH objGroup GENERATE $0,$1,$2, MIN({($4 is null? 1F: $4),($6 is null? 1F: $6)}) AS ranking;""" else: pigScript += """ WRONGGGG. how to aggregate?!""" pigScript += """ rmf $outputFile STORE rankedTriples INTO '$outputFile' USING PigStorage(); """ P = Pig.compile(pigScript) stats = P.bind().runSingle()
vertica.accert_table_exists(table_name)
table_size = vertica.get_table_size(table_name)
logger.info(table_name + " table size is " + str(table_size) + " bytes")

output_dir = "/user/mykhail.martsynyuk/vertica/export/" + table_name

# Prepare the HDFS structure.
logger.info("Move folder " + output_dir + " to backup")
hdfs.move_folder_to_backup(output_dir)
logger.info("Remove " + output_dir)
hdfs.remove_folder(output_dir)

params.append({'out': output_dir, 'table': table_name})

P = Pig.compile("""
register /usr/lib/pig/lib/pig-vertica.jar
register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar
A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123');
STORE A INTO '$out';
""")
bound = P.bind(params)
stats_list = bound.run()

i = 0
for stats in stats_list:
    if stats.isSuccessful():
        logger.info("SUCCESS: Table: " + params[i]["table"]
                    + "; Number jobs: " + str(stats.getNumberJobs())
                    + "; Time to run: " + str(stats.getDuration())
                    + "; Files written: " + str(stats.getOutputLocations()))
    else:
        logger.info("FAIL: Table: " + params[i]["table"]
                    + "; ERRORS: " + stats.getAllErrorMessages())
    i += 1

# Next is an example of how to get script output:
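# Hedged sketch (not in the original excerpt): reading tuples back from one of
# the runs above through the alias that was STOREd ('A' in this script).
for stats in stats_list:
    if stats.isSuccessful():
        it = stats.result("A").iterator()
        while it.hasNext():
            logger.info(str(it.next()))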
MIN_SCORE = 0
MAX_ITERATION = 5

# Initial centroids: equally divide the score space.
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # Get the new centroids for this iteration and calculate how far they
    # moved relative to the previous iteration.
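    # Hedged sketch (not in the original excerpt): one plausible continuation of
    # the truncated loop, following the standard embedded-Pig k-means pattern.
    # 'tolerance' is assumed to be defined earlier in the script, and 'fabs' to
    # be imported via 'from math import fabs'.
    i = 0
    while iter.hasNext():
        t = iter.next()
        centroids[i] = float(str(t.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
        i = i + 1
    distance_move = distance_move / k
    if distance_move < tolerance:  # 'tolerance' assumed defined earlier
        print "converged at iteration " + str(iter_num)
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ":".join([str(c) for c in centroids])
    iter_num += 1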
    if i != k - 1:
        initial_centroids = initial_centroids + ":"
# initial_centroids = "37.475097, -122.155599:37.486098,-122.195388:37.4985769, -122.2195727:37.4608874, -122.143838:37.453407, -122.182255"
# initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
# last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]
print last_centroids
print initial_centroids

P = Pig.compile("""register Find.jar
DEFINE find_centroid FindCentroid('$centroids');
raw_data = load 'MP_match.txt' as (latitude:double, longitude:double, status:chararray);
centroided = foreach raw_data generate status, latitude, longitude, find_centroid(latitude, longitude) as centroid;
grouped = group centroided by centroid;
store grouped into 'grouped';
result = foreach grouped generate group, AVG(centroided.latitude), AVG(centroided.longitude);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({"centroids": initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = []
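    # Hedged sketch (not in the original excerpt): a plausible 2-D continuation,
    # reading per-cluster (latitude, longitude) means and accumulating
    # coordinate-wise movement against the previous centroids. 'tolerance' is
    # assumed defined earlier; 'fabs' comes from 'from math import fabs'.
    distance_move = 0.0
    i = 0
    while iter.hasNext():
        t = iter.next()
        lat = float(str(t.get(1)))
        lon = float(str(t.get(2)))
        centroids.append((lat, lon))
        distance_move += fabs(last_centroids[i][0] - lat) + fabs(last_centroids[i][1] - lon)
        i += 1
    if distance_move / k < tolerance:  # 'tolerance' assumed defined earlier
        converged = True
        break
    last_centroids = centroids
    initial_centroids = ":".join(["%f,%f" % c for c in centroids])
    iter_num += 1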
# ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');
script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;
job_views = FOREACH job_view_events GENERATE (int)header.memberId AS memberId, (long)header.time AS time, trackingCode, (int)trackingInfo#'0' AS jobId;
job_views = join job_views by memberId, member by memberId;
job_views = foreach job_views generate job_views::memberId as memberId, job_views::time as time, job_views::jobId as jobId;
job_views = filter job_views by memberId > 0;
job_views = distinct job_views parallel 1;

STORE job_views INTO '$OUTPUT/JYMBII-batch/history/view/$REPORT_DATE' USING BinaryJSON('memberId');
"""

prog = Pig.compile(script)
for para in params:
    bound = prog.bind(para)
    stats = bound.runSingle()
    print "******************************** Finish Current Data " + para['DATE'] + " *************************************************"
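# Hedged sketch (not in the original excerpt): the 'params' list iterated above
# is presumably one bind dictionary per day, with the keys used by the script
# and the print statement; the concrete dates are illustrative assumptions.
params = [{'DATE': d, 'DATA_DATE': d, 'REPORT_DATE': d}
          for d in ['2013-09-01', '2013-09-02']]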
MIN_SCORE = 0
MAX_ITERATION = 100

# Initial centroids: equally divide the score space.
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar;
DEFINE find_centroid FindCentroid('$centroids');
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
grouped = group centroided by centroid parallel 2;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'kmoutput';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # Get the new centroids for this iteration and calculate how far they
    # moved relative to the previous iteration.
postString += "Result = FOREACH Result GENERATE " + fsDic['genFields'] + ";\n"
postString += "Result = FOREACH Result GENERATE * AS (" + fsDic['genSchema'] + ");\n"
# postString += "Result = FOREACH Result GENERATE " + currentAction + "Result::UserId AS UserId, *;\n"
# postString += "DESCRIBE Result;\n"
# A1BResult = JOIN BResult BY UserId, CResult By UserId;

pigString += postString
pigString += """
DUMP Result;
DESCRIBE Result;
"""
print(pigString)

# with open('cyygeneratedPig.pig', 'w') as outFile:
#     outFile.write(pigString)

if USE_PIG:
    P = Pig.compile(pigString)
    # P = Pig.compileFromFile('pig_bcd_bc.pig')

    # Run the pig script.
    result = P.bind().runSingle()
    if result.isSuccessful():
        print 'run success'
    else:
        raise Exception('run failed')
def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"):
    """
    Calculates pageranks for a directed graph of nodes and edges.

    Three main steps:
    1. Preprocessing: process the input data to:
       a) count the total number of nodes;
       b) prepare initial pagerank values for all nodes.
    2. Iteration: calculate new pageranks for each node based on the previous
       pageranks of the nodes with edges going into the given node.
    3. Postprocessing: order nodes by pagerank.
       Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
       to get human-readable names.
    """

    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
        "INPUT_PATH"            : edges_input,
        "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update the convergence threshold based on the size of the graph (number of nodes).
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        iteration_script,               # the pigscript to iterate
        iteration_dir,                  # temporary iteration outputs are stored here
        iteration_param_func,           # takes an iteration number, returns a Pig parameter dictionary
        "Sum of ordering-rank changes", # name of the convergence metric
        int,                            # Python type of the convergence metric
        "aggregate_rank_change",        # alias in the pigscript where the metric is stored
        convergence_threshold,          # stop when the metric drops below this
        max_num_iterations              # or when this many iterations have been performed
    )

    # Postprocessing step:
    print "Starting postprocessing step."
    postprocess_script = """
        pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double);
        pageranks = FILTER pageranks BY pagerank IS NOT NULL;
    """

    if id_name_map:
        postprocess_script += """
            id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
            with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
            ordered = ORDER with_names BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """
        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "ID_NAME_MAP_INPUT_PATH" : id_name_map,
            "OUTPUT_PATH"            : output_path
        }).runSingle()
    else:
        postprocess_script += """
            ordered = ORDER pageranks BY pagerank DESC;
            rmf $OUTPUT_PATH;
            STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
        """
        postprocess = Pig.compile(postprocess_script).bind({
            "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "OUTPUT_PATH"          : output_path
        }).runSingle()

    Pig.fs("rmr %s" % preprocess_dir)
    Pig.fs("rmr %s" % iteration_dir)
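# Hedged sketch (not the actual IterationUtils source): the contract implied by
# the call site in run_pagerank above. It runs the given script once per
# iteration, reads the convergence metric from the named alias, and stops when
# the metric drops below the threshold or the iteration budget is exhausted.
def iterate_until_convergence(script_path, iteration_dir, param_func,
                              metric_name, metric_type, metric_alias,
                              threshold, max_iterations):
    script = Pig.compileFromFile(script_path)
    for it_num in range(1, max_iterations + 1):
        stats = script.bind(param_func(it_num, iteration_dir)).runSingle()
        metric = metric_type(str(stats.result(metric_alias).iterator().next().get(0)))
        print "%s after iteration %d: %s" % (metric_name, it_num, str(metric))
        if metric < threshold:
            break
    return {"num_iterations": it_num}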
MIN_SCORE = 0
MAX_ITERATION = 100

# Initial centroids: equally divide the score space.
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k - 1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register 'centroid.py' using jython as centroid;
raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
centroided = foreach raw generate gpa, centroid.get_closest_centroid(gpa, '$centroids') as centroid;
grouped = group centroided by centroid parallel 2;
result = foreach grouped generate group, AVG(centroided.gpa);
store result into 'kmoutput';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # Get the new centroids for this iteration and calculate how far they
    # moved relative to the previous iteration.
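# Hedged sketch (not in the original excerpt): one plausible centroid.py for the
# Jython UDF registered above. It parses the ':'-separated centroid string and
# returns the centroid closest to the given gpa.
from pig_util import outputSchema

@outputSchema('centroid:double')
def get_closest_centroid(gpa, centroids_str):
    centroids = [float(c) for c in centroids_str.split(':')]
    return min(centroids, key=lambda c: abs(c - gpa))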
# initial_centroids = initial_centroids + str(last_centroids[i])
# if i != k - 1:
#     initial_centroids = initial_centroids + ":"
initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
last_centroids = [(-120.0, -120.0), (-60.0, -60.0), (0.0, 0.0), (60.0, 60.0), (120.0, 120.0)]
print initial_centroids

P = Pig.compile("""register /Users/yun_shen/Desktop/spams/pigudf.jar
DEFINE find_centroid FindCentroid('$centroids');
raw_data = load '1.log' as (spam_id:chararray, longitude:double, latitude:double);
raw = filter raw_data by longitude is not null and latitude is not null;
centroided = foreach raw generate spam_id, longitude, latitude, find_centroid(longitude, latitude) as centroid;
grouped = group centroided by centroid parallel 4;
store grouped into 'grouped';
result = foreach grouped generate group, AVG(centroided.longitude), AVG(centroided.latitude);
store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = []
    x = 0.0
from org.apache.pig.scripting import Pig

# If the output path does not exist, create it.
# (Pig.fs returns the shell exit code: nonzero means '-test -d' failed,
# i.e. the directory does not exist.)
if Pig.fs('-test -d ' + _out):
    Pig.fs('mkdir ' + _out)

##
# CountJob
#
# If the output path of the CountJob already exists, skip it;
# otherwise run the job.
##
if not Pig.fs('-test -d ' + _out_nc):
    print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
    result = Pig.compile(_header + """
        count_ngrams(
            '${in}',
            '${out}',
            '${min_count}'
        );
    """).bind({'in': _in, 'out': _out_nc, 'min_count': _min_count, 'n': 'count-ngrams'}).runSingle()

    # Check the result.
    if not result.isSuccessful():
        raise Exception("Pig job failed")

##
# ExtractVocabularyJob
#
# If the output path of the job already exists, skip it;
# otherwise run the job.
##
if not Pig.fs('-test -d ' + _out_v):
    print '\nPath ("%s") already exists, skipping job.\n' % _out_v
else:
    result = Pig.compile(_header + """
P = Pig.compile("""
previous_pagerank = LOAD '$docs_in' AS (
    url: $inputType,
    pagerank: float,
    links: { link: ( url: $inputType ) }
);

/** Creates: <http://rdf.chemspider.com/3442>, 1.0, {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)} */
outbound_pagerank = FOREACH previous_pagerank GENERATE
    pagerank / COUNT(links) AS pagerank,
    FLATTEN(links) AS to_url;

/** Creates:
    1.0, <http://bla>
    1.0, <http://www.w3.org/2004/02/skos/core#exactMatch> */
cogrpd = cogroup outbound_pagerank by to_url, previous_pagerank by url;

/** Creates: <http://rdf.chemspider.com/3442>, {}, {(<http://rdf.chemspider.com/3442>, 1.0, {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)})} */
new_pagerank = FOREACH cogrpd GENERATE
    group AS url,
    (1 - $d) + $d * SUM(outbound_pagerank.pagerank) AS pagerank,
    FLATTEN(previous_pagerank.links) AS links,
    FLATTEN(previous_pagerank.pagerank) AS previous_pagerank;

STORE new_pagerank INTO '$docs_out';

nonulls = filter new_pagerank by previous_pagerank is not null and pagerank is not null;
pagerank_diff = FOREACH nonulls GENERATE ABS(previous_pagerank - pagerank);
grpall = group pagerank_diff all;
max_diff = foreach grpall generate MAX(pagerank_diff);
STORE max_diff INTO '$max_diff';
""")
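# Hedged sketch (not in the original excerpt): binding and running the compiled
# script. The parameter names come from the script above; the concrete values
# here are illustrative assumptions.
stats = P.bind({
    'docs_in'   : 'pagerank/docs_in',
    'docs_out'  : 'pagerank/docs_out',
    'max_diff'  : 'pagerank/max_diff',
    'inputType' : 'chararray',
    'd'         : '0.85',
}).runSingle()
if not stats.isSuccessful():
    raise Exception('Pig job failed')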