def __init__(self, params):
    # BIND and RUN
    self.params = params
    self.set_param_defaults()
    Pig.fs("rmr " + self.params['output_name'])
    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    full_script = generator.generate()

    P = Pig.compile(full_script)

    # bind the output parameter and run the pipeline once
    results = P.bind({
        'output': self.params['output_name'],
    }).runSingle()

    if results.isSuccessful():
        print 'Pig job succeeded'
    else:
        # string exceptions are invalid; raise a real Exception
        raise Exception('Pig job failed')
    result_iter = results.result("final_set").iterator()

    # This takes care of turning our iter into something we can use.
    self.make_dict_from_results(result_iter)

    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()
    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)

def run(self):
    print "%s: %s" % (self.script_name, self.description)
    stats = self.bound_script.runSingle()

    if stats.isSuccessful():
        Pig.fs("touchz %s" % self.flag_file_path)
    else:
        raise Exception("\nScript %s failed! Error should be logged above.\n" % self.script_name +
                        "Once you have fixed the problem, you can restart the workflow at this step " +
                        "using the argument \"-p CHECKPOINT=%s\"" % self.script_name)
Example #3
def runbidi(src, fdest):
    P = Pig.compileFromFile('src/main/pig/bidi.pig')

    cntsbase = 'counts'
    Pig.fs('rmr ' + cntsbase)

    for count in range(10):
        dest = fdest + 'gm%04d' % count
        Pig.fs('rmr ' + dest)
        cnts = cntsbase
        params = {'src': src, 'dest': dest, 'cnts': cnts}
        bound = P.bind(params)
        job = bound.runSingle()

        if not job.isSuccessful():
            raise Exception('failed')

        # feed this round's output into the next round
        src = dest

        iter = job.result('S').iterator()
        if iter.hasNext():
            # counts remain: clean up and iterate again
            Pig.fs('rmr ' + cnts)
        else:
            # converged: publish the final output and stop
            Pig.fs('mv ' + dest + ' ' + fdest)
            print 'ALL DONE!'
            break
Example #4
def import_logs(profile):
    """ Import all the log files for a given day and processed them putting each in a log dir.
        If the profile is a list there are multiple files otherwise only a single one.
        The files are combined when running web_load.pig
    """
    #Clean up any left over files from the last run
    for logfile in profile:
        Pig.fs('rmr %s/%s' % (logfile['TMPDIR'], logfile['NAME']))
    pload = Pig.compileFromFile('web_import.pig')
    bload = pload.bind(profile)
    load = bload.run()
    #Check for load errors
    if isinstance(load, org.apache.pig.tools.pigstats.SimplePigStats):
        if not load.isSuccessful():
            print 'Error in web log load, %s' % load.getErrorMessage()
            sys.exit(1)
    else:
        for run in load:
            if not run.isSuccessful():
                print 'Error in web log load, %s' % run.getErrorMessage()
                sys.exit(1)
Example #5
def main(argv=None):
# Ideally I want to use arguments, i.e. 'pig -l /var/log/pig web_process.py /etc/rgpig/www.iresis.com.py daily',
# but it just doesn't work. I'm not sure whether the fix has been applied in my version; I can
# get it to work with a test .py that only has two lines: import sys, and print sys.argv.
# The issue is https://issues.apache.org/jira/browse/PIG-2548
#    if argv is None:
#        argv = sys.argv
#    if len(argv) != 3:
#        print "Usage: " + argv[0] + " <profile config> <daily|weekly|monthly>"
#        return 1
#
#    profile_file = argv[1]
#    timeframe = argv[2]
    
    profile_file = os.environ['config_file']
    timeframe = os.environ['timeframe']

    if timeframe not in ('daily', 'weekly', 'monthly'):
        print 'The time frame must be either daily, weekly or monthly.'
        return 1

    #Load the config
    profile = {}
    execfile(profile_file, {'timeframe':timeframe}, profile)
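    # execfile runs the config as Python 2 code: 'timeframe' is visible to it through
    # the globals dict, and the names the config defines are collected into 'profile'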

    #Clean up incomplete runs and create dir
    Pig.fs('rmr ' + profile['REPORTDIR'])
    Pig.fs('mkdir ' + profile['REPORTDIR'])

    #Start pig processing
    pig_init()
    if timeframe == 'daily':
        #Clean up incomplete runs and create dir
        Pig.fs('rmr %s' % profile['LOGDIR'])
        Pig.fs('mkdir %s' % profile['LOGDIR'])
        import_logs(profile['logs'])
    #The web_load.pig script is run by the processing scripts
    pstats = Pig.compileFromFile('web_%s.pig' % timeframe)
    bstats = pstats.bind(profile)
    stats = bstats.run()
    if isinstance(stats, org.apache.pig.tools.pigstats.SimplePigStats):
        if not stats.isSuccessful():
            print 'Error in web log stats, %s' % stats.getErrorMessage()
            sys.exit(1)
    else:
        for run in stats:
            if not run.isSuccessful():
                print 'Error in web log stats, %s' % run.getErrorMessage()
                sys.exit(1)
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean, not a status string
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance from the last iteration
    for i in range(k):
        tuple = iter.next()
        # result fields are Java objects: go through str() before converting to a number
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1
    print "Graph Sampler: starting preprocessing step."
    preprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_preprocess.pig").bind({
        "GRAPH_INPUT_PATH"         : graph,
        "GRAPH_OUTPUT_PATH"        : preprocess_graph,
        "NUM_VERTICES_OUTPUT_PATH" : preprocess_num_vertices
    }).runSingle()

    iteration_script = Pig.compileFromFile("../pigscripts/graph_sampler_iterate.pig")
    num_iterations   = nhood_size - 1
    num_vertices     = long(str(preprocessing.result("num_vertices").iterator().next().get(0)))

    print "Graph Sampler: scheduling %d iterations" % num_iterations
    for i in range(num_iterations):
        print "Graph Sampler: starting iteration step %d" % (i+1)
        iteration = iteration_script.bind({
            "VERTICES_INPUT_PATH"  : seed_vertices if i == 0 else (iteration_verts_prefix + str(i-1)),
            "GRAPH_INPUT_PATH"     : preprocess_graph,
            "VERTICES_OUTPUT_PATH" : iteration_verts_prefix + str(i)
        }).runSingle()
    # the final iteration's output feeds the postprocessing step
    iteration_result = iteration_verts_prefix + str(i)

    print "Graph Sampler: starting postprocessing step."
    postprocessing = Pig.compileFromFile("../pigscripts/graph_sampler_postprocess.pig").bind({
        "GRAPH_INPUT_PATH"     : graph,
        "VERTICES_INPUT_PATH"  : iteration_result,
        "SAMPLE_OUTPUT_PATH"   : output_path,
    }).runSingle()

    print "Graph Sampler: deleting temporary output directory"
    Pig.fs("rmr " + tmp_dir)
def main():
    filename = "studenttab10k"
    k = 4
    tolerance = 0.01

    MAX_SCORE = 4
    MIN_SCORE = 0
    MAX_ITERATION = 100

    # initial centroid, equally divide the space
    initial_centroids = ""
    last_centroids = [None] * k
    for i in range(k):
        last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    # Compile the Pig script. Register the same script, since it contains the Jython UDF.
    # $centroids is the only binding parameter. On each round it is bound to the centroid
    # estimates from the previous round; we then average each new cluster to get the new
    # centroid estimates and store them into "output".
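    # For example, with k = 4 over the GPA range [0, 4], the first binding built above is
    # '$centroids' = "0.0:1.0:2.0:3.0" (a colon-separated list of centroid guesses).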
    P = Pig.compile("""register 'kmeans.py' using jython as util;
                       raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double);
                       centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid;
                       grouped = group centroided by centroid;
                       result = foreach grouped generate group, AVG(centroided.gpa);
                       store result into 'output';
                    """)

    converged = False
    iter_num = 0
    while iter_num < MAX_ITERATION:
        # Binding parameter centroids to current centroids
        Q = P.bind({'centroids': initial_centroids})

        # Run Pig script
        results = Q.runSingle()

        # Check the result of the Pig script
        # isSuccessful() returns a boolean, not a status string
        if not results.isSuccessful():
            raise Exception("Pig job failed")

        # Get the new centroids from the output
        iter = results.result("result").iterator()
        centroids = [None] * k
        distance_move = 0

        # Calculate the moving distance with last iteration
        for i in range(k):
            tuple = iter.next()
            centroids[i] = float(str(tuple.get(1)))
            distance_move = distance_move + fabs(last_centroids[i] -
                                                 centroids[i])
        distance_move = distance_move / k
        Pig.fs("rmr output")
        print("iteration " + str(iter_num))
        print("average distance moved: " + str(distance_move))

        # Converged
        if distance_move < tolerance:
            sys.stdout.write("k-means converged at centroids: [")
            sys.stdout.write(",".join(str(v) for v in centroids))
            sys.stdout.write("]\n")
            converged = True
            break

        # Not converged; use the new centroids as the initial centroids for the next iteration
        last_centroids = centroids[:]
        initial_centroids = ""
        for i in range(k):
            initial_centroids = initial_centroids + str(last_centroids[i])
            if i != k - 1:
                initial_centroids = initial_centroids + ":"
        iter_num += 1

    # Did not converge after MAX_ITERATION
    if not converged:
        print("did not converge after " + str(iter_num) + " iterations")
        sys.stdout.write("last centroids: [")
        sys.stdout.write(",".join(str(v) for v in last_centroids))
        sys.stdout.write("]\n")
    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculates pageranks for directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print "Starting preprocessing step."
        # use the preprocessing_script parameter rather than re-hardcoding the path
        preprocess = Pig.compileFromFile(preprocessing_script).bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print "Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold) 

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            iteration_script,                     # the pigscript to iterate (the parameter, not a hardcoded path)
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocessing step:
        print "Starting postprocessing step."

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "ID_NAME_MAP_INPUT_PATH" : id_name_map,
                "OUTPUT_PATH"            : output_path
            }).runSingle()
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

            postprocess = Pig.compile(postprocess_script).bind({
                "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
                "OUTPUT_PATH"            : output_path
            }).runSingle()

        Pig.fs("rmr %s" % preprocess_dir)
        Pig.fs("rmr %s" % iteration_dir)
Example #10
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS          = 10e-6      # maximum distance between consecutive weights for convergence

pig_script   = sys.argv[1] # pig script to run iteratively
data_dir     = sys.argv[2] # directory where intermediate weights will be written
features     = sys.argv[3] # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4] # number of features
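# Launched through Pig's embedding, e.g. (driver filename hypothetical):
#   pig lr_driver.py <pig_script> <data_dir> <features> <num_features>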

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir    
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights   = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))    
    w0_fields.append({"name":"w%s" % i,"type":25,"schema":None}) # See Pig's DataType.java

path = tempfile.mkdtemp()
w0   = open("%s/part-r-00000" % path, 'w')
w0.write("\t".join(weights)+"\n")
w0.close()
Example #11
        distance_move = distance_move + sqrt(x_move + y_move)
        print distance_move

        new_centroid = (x, y)
        centroids.append(new_centroid)

        initial_centroids = initial_centroids + str(x) + "," + str(y)

        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    iter_num = iter_num + 1

    distance_move = distance_move / k
    if distance_move > tolerance:
        Pig.fs("rmr grouped")
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids

    print last_centroids
    print initial_centroids
_out     = _in  + '_counts_m' + _min_count
_out_nc  = _out + '/count'
_out_v   = _out + '/vocab'
_out_nf  = _out + '/nfollow'
_out_np  = _out + '/nprecede'
_out_nfp = _out + '/nfollowerprecede'
_out_njc = _out + '/countsjoined'

##
# start actual pig jobs
#
from org.apache.pig.scripting import Pig 

# if output path does not exist, create it
if Pig.fs('-test -d ' + _out):
	Pig.fs('mkdir ' + _out)
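# Pig.fs returns the fs command's exit code (0 on success), so a truthy value
# from '-test -d' means the directory does not exist yet.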

##
# CountJob
#
# if the output path of the count job already exists, skip it; otherwise run the job
##
if not Pig.fs('-test -d ' + _out_nc):
	print '\nPath ("%s") already exists, skipping job.\n' % _out_nc
else:
	result = Pig.compile(_header + """
	count_ngrams( '${in}', '${out}', '${min_count}' );
	""").bind({'in':_in, 'out':_out_nc, 'min_count': _min_count, 'n':'count-ngrams'}).runSingle()
	# check the result
	if not result.isSuccessful():
		raise Exception('CountJob failed')
Example #13
import os
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]  # pig script to run iteratively
data_dir = sys.argv[2]  # directory where intermediate weights will be written
features = sys.argv[3]  # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
weights = []
for _ in xrange(int(num_features)):
    weights.append(str(random.random()))

fd, path = tempfile.mkstemp()
f = open(path, 'w')
f.write("\t".join(weights) + "\n")
f.close()
os.close(fd)

copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
Example #14
while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({
        'EDGES_IN': edges_in,
        'EDGES_OUT': edges_out,
        'CONVERGENCE_OUT': 'convergence.tmp'
    })
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))
    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
    edges_in = edges_out
    print "Decision change count: " + str(change_count)
    if change_count == 0:
        stable_iterations += 1
    else:
        stable_iterations = 0
    print "Stable iterations: " + str(stable_iterations)
    print "Convergence iterations: " + str(conv_iter)
    print "Max iterations: " + str(max_iter)
    if stable_iterations >= conv_iter:
        print "Stopping due to convergence"
        break
    if i >= max_iter:
        print "Stopping due to max iterations reached"
Example #15
#!/usr/bin/python 
# -*- coding: utf-8 -*-
# explicitly import Pig class 
from org.apache.pig.scripting import Pig 

# COMPILE: compile method returns a Pig object that represents the pipeline
P = Pig.compile('''Arcs = LOAD '$docs_in' USING PigStorage('\t')
            AS (url: chararray, pagerank: float, links:{ link: ( url: chararray ) });
        outlinkPageRank = FOREACH Arcs GENERATE pagerank / COUNT(links) AS pagerank, FLATTEN(links) AS to_url;
        newPageRank = FOREACH (COGROUP outlinkPageRank BY to_url, Arcs BY url INNER) GENERATE
            FLATTEN(Arcs.url),
            (1.0 - 0.85) + 0.85 * SUM(outlinkPageRank.pagerank) AS pagerank,
            FLATTEN(Arcs.links) AS links;
        dump newPageRank;
        STORE newPageRank INTO '$docs_out';''')
params = {'docs_in': 'urls2.txt'}
for i in range(1):  # a single PageRank round; raise the range() bound to iterate further
    out = "out/pagerank_data_" + str(i + 1)
    params["docs_out"] = out
    Pig.fs("rmr " + out)
    stats = P.bind(params).runSingle()
    if not stats.isSuccessful():
        raise Exception('failed')
    params["docs_in"] = out
initialCentroidsFile = sys.argv[1]  # assumed argument position; the fragment uses this name without assigning it
hdfsInputDir = sys.argv[2]
print initialCentroidsFile
k = 4
numOfCentroids = int(sys.argv[3])
numOfReducer = str(sys.argv[4])
# numOfMapper = str(sys.argv[5])
tolerance = 0.01

MAX_ITERATION = int(sys.argv[5])

initial_centroids = ""

# only support local path currently
hdfsCentroidFilePath = "/tmp/"+initialCentroidsFile
cachedCentroidFilePath = hdfsCentroidFilePath+"#"+initialCentroidsFile
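# in mapred.cache.files, the '#' suffix names the symlink the task sees in its
# working directory (standard Hadoop distributed-cache syntax)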
Pig.fs("rm -r "+hdfsCentroidFilePath)
Pig.fs("put "+initialCentroidsFile+" " + hdfsCentroidFilePath)


#print initial_centroids

pigScript = ("""SET default_parallel """+numOfReducer+""";
                   SET pig.noSplitCombination true;
                   -- set mapred.child.java.opts '-Xmx900m';
                   set mapred.map.tasks.speculative.execution false;
                   SET mapred.cache.files """+cachedCentroidFilePath+""";
                   register pig-kmeans-udf-yarn.jar;
                   -- DEFINE find_centroid FindCentroid('$centroids');
                   raw = load '"""+hdfsInputDir+"""' using BinaryDataLoader('$centroids','"""+str(numOfCentroids)+"""') as (datapoints);
                   -- line below may be the bottleneck
                   datapointbag = foreach raw generate FLATTEN(datapoints) as datapointInString:chararray;
Example #17
        distance_move = distance_move + sqrt(x_move + y_move)
        print distance_move

        new_centroid = (x, y)
        centroids.append(new_centroid)

        initial_centroids = initial_centroids + str(x) + "," + str(y)

        if i != k - 1:
            initial_centroids = initial_centroids + ":"

    iter_num = iter_num + 1

    distance_move = distance_move / k
    if distance_move > tolerance:
        Pig.fs("rmr grouped")
        Pig.fs("rmr output")
    print ("iteration " + str(iter_num))
    print ("average distance moved: " + str(distance_move))
    if distance_move <= tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break

    last_centroids = centroids

    print last_centroids
    print initial_centroids
Example #18
import os
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig

EPS          = 10e-6      # maximum distance between consecutive weights for convergence

pig_script   = sys.argv[1] # pig script to run iteratively
data_dir     = sys.argv[2] # directory where intermediate weights will be written
features     = sys.argv[3] # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4] # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir    
Pig.fs(cmd)

#
# Initialize weights
#
weights = []
for _ in xrange(int(num_features)):
    weights.append(str(random.random()))

fd, path = tempfile.mkstemp()    
f = open(path, 'w')
f.write("\t".join(weights)+"\n")
f.close()
os.close(fd)

copyFromLocal = "copyFromLocal %s %s/%s" % (path, data_dir, "weight-0")
Example #19
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance from the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k - 1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1
Example #20
edges_in = 'edges' + str(i - 1) + '.tmp'
edges_out = ''

while True:
    print "*** Iteration " + str(i) + " ***"
    edges_out = 'edges' + str(i) + '.tmp'
    iteration_bound = iteration.bind({'EDGES_IN': edges_in, 'EDGES_OUT': edges_out, 
        'CONVERGENCE_OUT': 'convergence.tmp'})
    iteration_stats = iteration_bound.runSingle()
    if not iteration_stats.isSuccessful():
        raise Exception('Iteration failed')
    conv_result = iteration_stats.result('convergence').iterator().next()
    max_iter = int(str(conv_result.get(0)))
    conv_iter = int(str(conv_result.get(1)))
    change_count = int(str(conv_result.get(2)))
    Pig.fs('rm -r ' + 'convergence.tmp')
    Pig.fs('rm -r ' + edges_in)
    edges_in = edges_out
    print "Decision change count: " + str(change_count)
    if change_count == 0:
        stable_iterations += 1
    else:
        stable_iterations = 0
    print "Stable iterations: " + str(stable_iterations)
    print "Convergence iterations: " + str(conv_iter)
    print "Max iterations: " + str(max_iter)
    if stable_iterations >= conv_iter:
        print "Stopping due to convergence"
        break
    if i >= max_iter:
        print "Stopping due to max iterations reached"
Example #21
import sys
import random
import tempfile

from org.apache.pig.scripting import Pig
from org.codehaus.jackson.map import ObjectMapper

EPS = 10e-6  # maximum distance between consecutive weights for convergence

pig_script = sys.argv[1]  # pig script to run iteratively
data_dir = sys.argv[2]  # directory where intermediate weights will be written
features = sys.argv[3]  # location, inside data_dir, where the data to fit exists
num_features = sys.argv[4]  # number of features

#
# Cleanup data dir
#
cmd = "rmr %s/weight-*" % data_dir
Pig.fs(cmd)

#
# Initialize weights
#
w0_fields = []
weights = []
for i in xrange(int(num_features)):
    weights.append(str(random.random()))
    w0_fields.append({
        "name": "w%s" % i,
        "type": 25,
        "schema": None
    })  # See Pig's DataType.java

path = tempfile.mkdtemp()
# optional CLI overrides for the input path and the starting iteration
if len(sys.argv) > 2:
    docs_in = sys.argv[2]
if len(sys.argv) > 3:
    start_at = int(sys.argv[3])
else:
    start_at = 0


out_dir = "%s/tmp/%s" % (dataset, basename(preprocessedGraph))
inputType = "chararray" #use long if we have hashed urls
for i in range(20):
    if i < start_at:
        continue
    docs_out = out_dir + "pagerank_data_" + str(i + 1)
    max_diff = out_dir + "max_diff_" + str(i + 1)
    Pig.fs("rmr " + docs_out)
    Pig.fs("rmr " + max_diff)
    stats = P.bind().runSingle()
    if not stats.isSuccessful():
        raise 'failed'
    max_diff_value = float(str(stats.result("max_diff").iterator().next().get(0)))
    print "    max_diff_value = " + str(max_diff_value)
    if max_diff_value < 0.01:
        print "done at iteration " + str(i) + ". Cleaning output"
        break
    #max_diff of previous iterations never used, so clean it up
    Pig.fs("rmr " + max_diff) 
    if i > 1:
        # never for the 1st iteration! (otherwise we delete the original input)
        Pig.fs("rmr " + docs_in)