def __init__(self, params):
    # BIND and RUN
    self.params = params
    self.set_param_defaults()

    Pig.fs("rmr " + self.params['output_name'])

    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()

    P = Pig.compile(full_script)
    results = P.bind({
        'output': self.params['output_name'],
    }).runSingle()

    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        raise Exception('Pig job failed')

    result_iter = results.result("final_set").iterator()

    # This takes care of turning our iterator into something we can use.
    self.make_dict_from_results(result_iter)

    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()

    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
def run(self):
    print "%s: %s" % (self.script_name, self.description)
    stats = self.bound_script.runSingle()
    if stats.isSuccessful():
        Pig.fs("touchz %s" % self.flag_file_path)
    else:
        raise Exception(
            "\nScript %s failed! Error should be logged above.\n" % self.script_name +
            "Once you have fixed the problem, you can restart the workflow at this step " +
            "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
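A minimal sketch of the checkpoint test this flag file enables, assuming a driver that walks the workflow scripts in order; the should_run helper and its arguments are assumptions, not part of the original class:

def should_run(script_name, flag_file_path, checkpoint):
    # Rerun a step if the user restarted the workflow at it via
    # -p CHECKPOINT=<name>, or if it never wrote its success flag.
    if checkpoint == script_name:
        return True
    # Pig.fs returns 0 when '-test -e' finds the path, non-zero otherwise.
    return Pig.fs("-test -e %s" % flag_file_path) != 0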
def runbidi(src, fdest):
    P = Pig.compileFromFile('src/main/pig/bidi.pig')

    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)

    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase
        params = {'src': src, 'dest': dest, 'cnts': cnts}
        bound = P.bind(params)
        job = bound.runSingle()

        if not job.isSuccessful():
            raise Exception('bidi.pig run failed')

        src = dest
        iter = job.result('S').iterator()
        if iter.hasNext():
            # More work to do: drop the counts and run another pass.
            Pig.fs('rmr ' + cnts)
        else:
            # Converged: promote the last output and stop.
            Pig.fs('mv ' + dest + ' ' + fdest)
            print 'ALL DONE!'
            break
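A hypothetical call, with both HDFS paths invented for illustration; runbidi chains up to ten passes of bidi.pig, feeding each pass's output in as the next pass's input until the 'S' relation comes back empty:

runbidi('data/graph/edges', 'data/graph/bidi/')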
def import_logs(profile):
    """
    Import all the log files for a given day and process them, putting each
    in a log dir. If the profile is a list, there are multiple files;
    otherwise only a single one. The files are combined when running
    web_load.pig.
    """
    # Clean up any leftover files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))

    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()

    # Check for load errors. Binding a list of parameter dictionaries runs
    # multiple jobs and returns a list of stats objects; a single binding
    # returns one SimplePigStats.
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
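A sketch of the profile list import_logs appears to expect: one dict per log file, each carrying at least the TMPDIR and NAME keys read by the cleanup loop, plus whatever parameters web_import.pig binds. Every key and path here beyond TMPDIR and NAME is a hypothetical example:

logs = [
    {'TMPDIR': '/tmp/weblogs', 'NAME': 'www1-access.log',
     'LOGFILE': '/var/log/httpd/www1-access.log'},   # LOGFILE is hypothetical
    {'TMPDIR': '/tmp/weblogs', 'NAME': 'www2-access.log',
     'LOGFILE': '/var/log/httpd/www2-access.log'},
]
import_logs(logs)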
def main(argv=None):
    # Ideally I want to use arguments, i.e.
    #   pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily
    # however it just doesn't work. I'm not sure why: the fix has been applied
    # in my version, and I can get it to work with a test .py that only has two
    # lines, import sys, and print sys.argv. Here is the case:
    # https://issues.apache.org/jira/browse/PIG-2548
    #
    # if argv is None:
    #     argv = sys.argv
    # if len(argv) != 3:
    #     print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
    #     return 1
    #
    # profile_file = argv[1]
    # timeframe = argv[2]
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']
    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    # Load the config
    profile = {}
    execfile(profile_file, {'timeframe': timeframe}, profile)

    # Clean up incomplete runs and create the report dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    # Start pig processing
    pig_init()
    if timeframe == 'daily':
        # Clean up incomplete runs and create the log dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])

    # The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()

    if not results.isSuccessful():
        raise Exception("Pig job failed")

    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0

    # Get the new centroids of this iteration and calculate the distance
    # moved since the last iteration.
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k

    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))

    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1
print "Graph Sampler: starting preprocessing step." preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({ "GRAPH_INPUT_PATH" : graph, "GRAPH_OUTPUT_PATH" : preprocess_graph, "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices }).runSingle() iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig") num_iterations = nhood_size - 1 num_vertices = long(str(preprocessing.result("num_vertices").iterator().next().get(0))) print "Graph Sampler: scheduling %d iterations" % num_iterations for i in range(num_iterations): print "Graph Sampler: starting iteration step %d" % (i+1) iteration = iteration_script.bind({ "VERTICES_INPUT_PATH" : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)), "GRAPH_INPUT_PATH" : preprocess_graph, "VERTICES_OUTPUT_PATH" : iteration_verts_prefix + str(i) }).runSingle() iteration_result = iteration_verts_prefix + str(i) print "Graph Sampler: starting postprocessing step." postprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_postprocess.pig").bind({ "GRAPH_INPUT_PATH" : graph, "VERTICES_INPUT_PATH" : iteration_result, "SAMPLE_OUTPUT_PATH" : output_path, }).runSingle() print "Graph Sampler: deleting temporary output directory" Pig.fs("rmr " + tmp_dir)
def main():
    filename = "studenttab10k"
    k = 4
    tolerance = 0.01

    MAX_SCORE = 4
    MIN_SCORE = 0
    MAX_ITERATION = 100

    # Initial centroids: divide the score space equally.
    initial_centroids = ""
    last_centroids = [None] * k
    for i in range(k):
        last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    # Compile the Pig script. Register this same script, since it contains the
    # Jython UDF. $centroids is the only binding parameter: each round it is
    # bound to the centroid estimates from the previous round. We then average
    # each new cluster to get the new centroid estimates and store them into
    # "output".
    P = Pig.compile("""register 'kmeans.py' using jython as util;
                       raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double);
                       centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid;
                       grouped = group centroided by centroid;
                       result = foreach grouped generate group, AVG(centroided.gpa);
                       store result into 'output';
                    """)

    converged = False
    iter_num = 0
    while iter_num < MAX_ITERATION:
        # Bind the $centroids parameter to the current centroids.
        Q = P.bind({'centroids': initial_centroids})

        # Run the Pig script.
        results = Q.runSingle()

        # Check the result of the Pig script.
        if not results.isSuccessful():
            raise Exception("Pig job failed")

        # Get the new centroids from the output.
        iter = results.result("result").iterator()
        centroids = [None] * k
        distance_move = 0

        # Calculate the distance moved since the last iteration.
        for i in range(k):
            tuple = iter.next()
            centroids[i] = float(str(tuple.get(1)))
            distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
        distance_move = distance_move / k

        Pig.fs("rmr output")
        print("iteration " + str(iter_num))
        print("average distance moved: " + str(distance_move))

        # Converged?
        if distance_move < tolerance:
            sys.stdout.write("k-means converged at centroids: [")
            sys.stdout.write(",".join(str(v) for v in centroids))
            sys.stdout.write("]\n")
            converged = True
            break

        # Not converged: use the new centroids as the initial centroids for
        # the next iteration.
        last_centroids = centroids[:]
        initial_centroids = ""
        for i in range(k):
            initial_centroids = initial_centroids + str(last_centroids[i])
            if i != k - 1:
                initial_centroids = initial_centroids + ":"
        iter_num += 1

    # Did not converge after MAX_ITERATION rounds.
    if not converged:
        print("did not converge after " + str(iter_num) + " iterations")
        sys.stdout.write("last centroids: [")
        sys.stdout.write(",".join(str(v) for v in last_centroids))
        sys.stdout.write("]\n")
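The script registers itself ('kmeans.py') as a Jython UDF library, so the same file must also define findCentroid. A minimal sketch of what that UDF could look like; the source does not show the actual implementation, so treat this as an assumption:

from math import fabs

# @outputSchema is supplied by Pig's Jython UDF environment at register time.
@outputSchema("centroid:double")
def findCentroid(initial_centroid, value):
    # initial_centroid is the colon-separated string bound to $centroids;
    # return the member closest to this record's gpa.
    closest, min_distance = None, float("inf")
    for centroid in initial_centroid.split(":"):
        distance = fabs(float(centroid) - value)
        if distance < min_distance:
            min_distance, closest = distance, float(centroid)
    return closest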
def run_pagerank(edges_input,
                 output_path,
                 tmp_output_dir,
                 damping_factor=0.85,
                 convergence_threshold=0.0001,
                 max_num_iterations=10,
                 id_name_map=None,
                 preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                 iteration_script="../pigscripts/pagerank_iterate.pig"):
    """
    Calculates pageranks for a directed graph of nodes and edges.

    Three main steps:
    1. Preprocessing: Process input data to:
       a) Count the total number of nodes.
       b) Prepare initial pagerank values for all nodes.
    2. Iteration: Calculate new pageranks for each node based on the previous
       pageranks of the nodes with edges going into the given node.
    3. Postprocessing: Order nodes by pagerank.
       Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
       to get human-readable names.
    """

    preprocess_dir = "%s/preprocess" % tmp_output_dir
    iteration_dir = "%s/iteration" % tmp_output_dir

    # Preprocessing step:
    print "Starting preprocessing step."
    preprocess = Pig.compileFromFile(preprocessing_script).bind({
        "INPUT_PATH"            : edges_input,
        "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
        "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
    }).runSingle()

    # Update the convergence threshold based on the size of the graph
    # (number of nodes).
    num_nodes = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
    convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
    print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold)

    # Iteration step:
    def iteration_param_func(it_num, it_dir):
        if it_num == 1:
            iteration_input = "%s/pageranks" % preprocess_dir
        else:
            iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

        return {
            "INPUT_PATH"                  : iteration_input,
            "DAMPING_FACTOR"              : damping_factor,
            "NUM_NODES"                   : num_nodes,
            "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks" % (it_dir, it_num),
            "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
        }

    iteration_result = IterationUtils.iterate_until_convergence(
        iteration_script,               # the pigscript to iterate
        iteration_dir,                  # temporary iteration outputs will be stored here
        iteration_param_func,           # takes an iteration #, returns a Pig parameter dictionary
        "Sum of ordering-rank changes", # name of the convergence metric
        int,                            # Python type of the convergence metric
        "aggregate_rank_change",        # alias in the pigscript where the metric is stored
        convergence_threshold,          # stop when the metric drops below this
        max_num_iterations              # or when this many iterations have been performed
    )

    # Postprocessing step:
    print "Starting postprocessing step."
postprocess_script = """ pageranks = LOAD '$PAGERANKS_INPUT_PATH' USING PigStorage() AS (id: int, pagerank: double); pageranks = FILTER pageranks BY pagerank IS NOT NULL; """ if id_name_map: postprocess_script += """ id_name_map = LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray); with_names = FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank; ordered = ORDER with_names BY pagerank DESC; rmf $OUTPUT_PATH; STORE ordered INTO '$OUTPUT_PATH' USING PigStorage(); """ postprocess = Pig.compile(postprocess_script).bind({ "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]), "ID_NAME_MAP_INPUT_PATH" : id_name_map, "OUTPUT_PATH" : output_path }).runSingle() else: postprocess_script += """ ordered = ORDER pageranks BY pagerank DESC; rmf $OUTPUT_PATH; STORE ordered INTO '$OUTPUT_PATH' USING PigStorage(); """ postprocess = Pig.compile(postprocess_script).bind({ "PAGERANKS_INPUT_PATH" : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]), "OUTPUT_PATH" : output_path }).runSingle() Pig.fs("rmr %s" % preprocess_dir) Pig.fs("rmr %s" % iteration_dir)
from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Clean up the data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({"name": "w%s" % i, "type": 25, "schema": None})  # see Pig's DataType.java

path = tempfile.mkdtemp()
w0 = open("%s/part-r-00000" % path, 'w')
w0.write("\t".join(weights) + "\n")
w0.close()
# Fragment of a 2-D k-means driver: the first block runs inside the
# per-cluster loop; the rest runs once per pass of the outer while loop.
        distance_move = distance_move + sqrt(x_move + y_move)
        print distance_move
        new_centroid = (x, y)
        centroids.append(new_centroid)
        initial_centroids = initial_centroids + str(x) + "," + str(y)
        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    iter_num = iter_num + 1
    distance_move = distance_move / k
    if distance_move > tolerance:
        # Not converged yet: clear the outputs so the next pass can rerun.
        Pig.fs("rmr grouped")
        Pig.fs("rmr output")

    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))

    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids
    print last_centroids
    print initial_centroids
_out = _in + '_counts_m' + _min_count
_out_nc = _out + '/count'
_out_v = _out + '/vocab'
_out_nf = _out + '/nfollow'
_out_np = _out + '/nprecede'
_out_nfp = _out + '/nfollowerprecede'
_out_njc = _out + '/countsjoined'

##
# start actual pig jobs
#

from org.apache.pig.scripting import Pig

# If the output path does not exist, create it. Pig.fs returns 0 (falsy)
# when '-test -d' finds the directory, non-zero when it is missing.
if Pig.fs('-test -d ' + _out):
    Pig.fs('mkdir ' + _out)

##
# CountJob
#
# If the output path of the count job already exists, skip it;
# otherwise run the job.
##
if not Pig.fs('-test -d ' + _out_nc):
    print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
    result = Pig.compile(_header + """
        count_ngrams( '${in}', '${out}', '${min_count}' );
    """).bind({'in': _in, 'out': _out_nc, 'min_count': _min_count,
               'n': 'count-ngrams'}).runSingle()

    # check the result
    if not result.isSuccessful():
        raise Exception('count-ngrams job failed')
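The snippet prepends an undefined _header to every script, presumably to import the count_ngrams macro and set shared properties. A guess at its shape; the macro file name is hypothetical:

_header = """
IMPORT 'ngram_macros.pig';  -- defines count_ngrams (hypothetical file name)
SET job.name '${n}';
"""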
from org.apache.pig.scripting import Pig

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Clean up the data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
weights = []
for _ in xrange(int(num_features)):
    weights.append(str(random.random()))

fd, path = tempfile.mkstemp()
f = open(path, 'w')
f.write("\t".join(weights) + "\n")
f.close()
os.close(fd)

copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'

    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')

    # The convergence relation carries (max_iter, conv_iter, change_count).
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))

    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
    edges_in = edges_out

    print "Decision change count: " + str(change_count)
    if change_count == 0:
        stable_iterations += 1
    else:
        stable_iterations = 0

    print "Stable iterations: " + str(stable_iterations)
    print "Convergence iterations: " + str(conv_iter)
    print "Max iterations: " + str(max_iter)

    if stable_iterations >= conv_iter:
        print "Stopping due to convergence"
        break
    if i >= max_iter:
        print "Stopping due to max iterations reached"
        break

    i += 1
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Explicitly import the Pig class.
from org.apache.pig.scripting import Pig

# COMPILE: the compile method returns a Pig object that represents the pipeline.
P = Pig.compile('''
Arcs = LOAD '$docs_in' USING PigStorage('\t')
       AS (url: chararray, pagerank: float, links: { link: (url: chararray) });

outlinkPageRank = FOREACH Arcs
                  GENERATE pagerank / COUNT(links) AS pagerank, FLATTEN(links) AS to_url;

newPageRank = FOREACH (COGROUP outlinkPageRank BY to_url, Arcs BY url INNER)
              GENERATE FLATTEN(Arcs.url),
                       (1.0 - 0.85) + 0.85 * SUM(outlinkPageRank.pagerank) AS pagerank,
                       FLATTEN(Arcs.links) AS links;

dump newPageRank;
STORE newPageRank INTO '$docs_out';
''')

params = {'docs_in': 'urls2.txt'}
for i in range(1):
    out = "out/pagerank_data_" + str(i + 1)
    params["docs_out"] = out
    Pig.fs("rmr " + out)
    stats = P.bind(params).runSingle()
    if not stats.isSuccessful():
        raise Exception('failed')
    params["docs_in"] = out
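As written, range(1) performs a single PageRank pass; because the loop re-points docs_in at the previous pass's output, widening the range chains passes. A sketch of that multi-pass variant (the iteration count of 10 is an arbitrary choice for illustration):

# Hypothetical multi-pass driver: each pass reads the ranks the previous
# pass stored.
params = {'docs_in': 'urls2.txt'}
for i in range(10):
    out = "out/pagerank_data_" + str(i + 1)
    params["docs_out"] = out
    Pig.fs("rmr " + out)
    stats = P.bind(params).runSingle()
    if not stats.isSuccessful():
        raise Exception('failed')
    params["docs_in"] = out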
hdfsInputDir = sys.argv[2]
print initialCentroidsFile
k = 4
numOfCentroids = int(sys.argv[3])
numOfReducer = str(sys.argv[4])
# numOfMapper = str(sys.argv[5])
tolerance = 0.01
MAX_ITERATION = int(sys.argv[5])
initial_centroids = ""

# Only a local path is supported currently.
hdfsCentroidFilePath = "/tmp/" + initialCentroidsFile
cachedCentroidFilePath = hdfsCentroidFilePath + "#" + initialCentroidsFile
Pig.fs("rm -r " + hdfsCentroidFilePath)
Pig.fs("put " + initialCentroidsFile + " " + hdfsCentroidFilePath)

# print initial_centroids
pigScript = ("""SET default_parallel """ + numOfReducer + """;
SET pig.noSplitCombination true;
-- set mapred.child.java.opts '-Xmx900m';
set mapred.map.tasks.speculative.execution false;
SET mapred.cache.files """ + cachedCentroidFilePath + """;
register pig-kmeans-udf-yarn.jar;
-- DEFINE find_centroid FindCentroid('$centroids');
raw = load '""" + hdfsInputDir + """' using BinaryDataLoader('$centroids','""" + str(numOfCentroids) + """') as (datapoints);
-- line below may be the bottleneck
datapointbag = foreach raw generate FLATTEN(datapoints) as datapointInString:chararray;
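The fragment reads initialCentroidsFile without defining it, so it was presumably taken from sys.argv[1] earlier. A hypothetical invocation, with argument meanings inferred from the sys.argv indices above:

# Hypothetical command line (script and file names invented):
#
#   pig kmeans_yarn.py initial_centroids.txt /data/points 4 8 100
#
# sys.argv[1] -> local file with the initial centroids (initialCentroidsFile)
# sys.argv[2] -> HDFS input directory
# sys.argv[3] -> number of centroids
# sys.argv[4] -> number of reducers (default_parallel)
# sys.argv[5] -> maximum number of iterations
initialCentroidsFile = sys.argv[1]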
# Fragment of a 2-D k-means driver: the first block runs inside the
# per-cluster loop; the rest runs once per pass of the outer while loop.
        distance_move = distance_move + sqrt(x_move + y_move)
        print distance_move
        new_centroid = (x, y)
        centroids.append(new_centroid)
        initial_centroids = initial_centroids + str(x) + "," + str(y)
        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    iter_num = iter_num + 1
    distance_move = distance_move / k
    if distance_move > tolerance:
        # Not converged yet: clear the outputs so the next pass can rerun.
        Pig.fs("rmr grouped")
        Pig.fs("rmr output")

    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))

    if distance_move <= tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids
    print last_centroids
    print initial_centroids
import tempfile

from org.apache.pig.scripting import Pig

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Clean up the data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
weights = []
for _ in xrange(int(num_features)):
    weights.append(str(random.random()))

fd, path = tempfile.mkstemp()
f = open(path, 'w')
f.write("\t".join(weights) + "\n")
f.close()
os.close(fd)

copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
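This preamble stops right before the iteration loop it sets up. A minimal sketch of how such a driver might continue, assuming the bound pig script reads weights from $input and stores an updated weight vector under an alias named 'weights' into $output; every parameter and alias name here is an assumption:

# Hypothetical continuation: push the initial weights to HDFS, then rerun the
# pig script until consecutive weight vectors differ by less than EPS.
Pig.fs(copyFromLocal)
script = Pig.compileFromFile(pig_script)

i = 0
while True:
    results = script.bind({
        'input'    : '%s/weight-%d' % (data_dir, i),
        'output'   : '%s/weight-%d' % (data_dir, i + 1),
        'features' : features,
    }).runSingle()
    if not results.isSuccessful():
        raise Exception('weight update iteration %d failed' % i)

    # Read the new weight vector back and compare it with the previous one.
    old_w = [float(x) for x in weights]
    row = results.result('weights').iterator().next()
    new_w = [float(str(row.get(j))) for j in xrange(int(num_features))]
    if max(abs(a - b) for a, b in zip(old_w, new_w)) < EPS:
        break
    weights = [str(x) for x in new_w]
    i += 1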
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()

    if not results.isSuccessful():
        raise Exception("Pig job failed")

    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0

    # Get the new centroids of this iteration and calculate the distance
    # moved since the last iteration.
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k

    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))

    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'

    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')

    # The convergence relation carries (max_iter, conv_iter, change_count).
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))

    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
    edges_in = edges_out

    print "Decision change count: " + str(change_count)
    if change_count == 0:
        stable_iterations += 1
    else:
        stable_iterations = 0

    print "Stable iterations: " + str(stable_iterations)
    print "Convergence iterations: " + str(conv_iter)
    print "Max iterations: " + str(max_iter)

    if stable_iterations >= conv_iter:
        print "Stopping due to convergence"
        break
    if i >= max_iter:
        print "Stopping due to max iterations reached"
        break

    i += 1
from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]    # pig script to run iteratively
data_dir = sys.argv[2]      # directory where intermediate weights will be written
features = sys.argv[3]      # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Clean up the data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({"name": "w%s" % i, "type": 25, "schema": None})  # see Pig's DataType.java

path = tempfile.mkdtemp()
if len(sys.argv) > 2:
    docs_in = sys.argv[2]
if len(sys.argv) > 3:
    start_at = int(sys.argv[3])
else:
    start_at = 0

out_dir = "%s/tmp/%s" % (dataset, basename(preprocessedGraph))
inputType = "chararray"  # use long if we have hashed urls

for i in range(20):
    if i < start_at:
        continue
    docs_out = out_dir + "pagerank_data_" + str(i + 1)
    max_diff = out_dir + "max_diff_" + str(i + 1)
    Pig.fs("rmr " + docs_out)
    Pig.fs("rmr " + max_diff)

    stats = P.bind().runSingle()
    if not stats.isSuccessful():
        raise Exception('failed')

    max_diff_value = float(str(stats.result("max_diff").iterator().next().get(0)))
    print "  max_diff_value = " + str(max_diff_value)
    if max_diff_value < 0.01:
        print "done at iteration " + str(i) + ". Cleaning output"
        break

    # max_diff of previous iterations is never used, so clean it up
    Pig.fs("rmr " + max_diff)
    if i > 1:  # never for the 1st iteration! (otherwise we delete the original input)
        Pig.fs("rmr " + docs_in)