def __init__(self, params):
    """Generate, run and optionally publish the Pig job described by ``params``.

    Steps performed:
      1. store ``params`` and apply defaults via ``set_param_defaults()``;
      2. remove the previous output at ``params['output_name']`` with ``rmr``;
      3. build the script with ``PigScriptGenerator`` and run it once with
         ``output`` bound to ``params['output_name']``;
      4. hand the iterator over the ``final_set`` alias to
         ``make_dict_from_results()``;
      5. ask the user whether the data should be recorded in Grapht.

    Args:
        params: mapping of job settings; ``'output_name'`` is required and is
            used both as the output path and as the Grapht metric name.

    Raises:
        RuntimeError: if the Pig job does not finish successfully.
        SystemExit: if the user declines to send the data to Grapht.
    """
    # BIND and RUN
    self.params = params
    self.set_param_defaults()

    output_name = self.params['output_name']
    Pig.fs("rmr " + output_name)

    generator = PigScriptGenerator.PigScriptGenerator(self.params)
    compiled = Pig.compile(generator.generate())
    results = compiled.bind({'output': output_name}).runSingle()

    if not results.isSuccessful():
        # A bare string is not a valid exception object; raise a real
        # exception so callers can catch the failure.
        raise RuntimeError('Pig job failed')
    print('Pig job succeeded')

    # This takes care of turning our iterator into something we can use.
    # NOTE(review): presumably this populates self.result, which is read
    # below -- confirm in make_dict_from_results().
    self.make_dict_from_results(results.result("final_set").iterator())

    send_to_grapht = raw_input('do you want to send this data to grapht?')
    if send_to_grapht not in ('y', 'yes', '1'):
        sys.exit()
    connector = GraphtConnector('grapht.shuttercorp.net')
    metric = self.params['output_name']
    connector.record_data_points(metric, self.result)
     
"""

# Supported aggregations: method name -> (Pig function, literal used for nulls).
_AGGREGATES = {
    "avg": ("AVG", "0F"),
    "max": ("MAX", "0F"),
    "min": ("MIN", "1F"),
}

# Fail fast on an unknown method instead of appending junk text to the script
# and letting Pig report an unrelated parse error later.
if aggregateMethod not in _AGGREGATES:
    raise ValueError(
        "Unsupported aggregateMethod %r; expected one of: %s"
        % (aggregateMethod, ", ".join(sorted(_AGGREGATES))))

_aggFunc, _nullValue = _AGGREGATES[aggregateMethod]
# The generated statement is identical to the previously hand-written
# per-method variants; only the function name and the null default vary.
pigScript += """
rankedTriples = FOREACH objGroup GENERATE 
\t\t$0,$1,$2,
\t\t%s({($4 is null? %s: $4),($6 is null? %s: $6)}) AS ranking;""" % (
    _aggFunc, _nullValue, _nullValue)

pigScript += """

rmf $outputFile
STORE rankedTriples INTO '$outputFile' USING PigStorage();
"""


# bind() without arguments resolves $outputFile from the calling scope.
P = Pig.compile(pigScript)
stats = P.bind().runSingle()
Exemple #3
0
    vertica.accert_table_exists(table_name)
    table_size = vertica.get_table_size(table_name)
    logger.info(table_name + " table  size is " + str(table_size) + " bytes")
    
    output_dir = "/user/mykhail.martsynyuk/vertica/export/"+table_name
    #prepare hdfs structure
    logger.info("Move folder "+output_dir+" to backup")
    hdfs.move_folder_to_backup(output_dir)
    logger.info("Remove "+output_dir)
    hdfs.remove_folder(output_dir)
    
    params.append({'out':output_dir, 'table':table_name})
    
# Export script: loads $table from Vertica and stores it under $out.
# NOTE(review): the Vertica host, database, user and password are hard-coded
# in the script text below -- consider moving them to configuration.
P = Pig.compile("""
register /usr/lib/pig/lib/pig-vertica.jar
register /usr/lib/pig/lib/vertica-jdbc-7.0.1-0.jar
A = LOAD 'sql://{SELECT * FROM $table WHERE 1 = ?};{1}' USING com.vertica.pig.VerticaLoader('10.104.5.29','verticadst','5433','alfxplsit','xpl123');
STORE A INTO '$out';
""")

# One binding per entry in params; run() returns one stats object per binding.
bound = P.bind(params)
stats_list = bound.run()

# Pair every parameter set with its stats instead of keeping a manual counter.
for bound_params, stats in zip(params, stats_list):
    if stats.isSuccessful():
        logger.info("SUCCESS: Table: "+bound_params["table"]+"; Number jobs: "+str(stats.getNumberJobs())+ "; Time to run: "+str(stats.getDuration())+"; Files written: "+str(stats.getOutputLocations()))
    else:
        # getAllErrorMessages() is the embedded-mode aggregate and is not a
        # plain string, so concatenating it failed while reporting the error.
        # getErrorMessage() is the per-run message; str() guards against None.
        logger.info("FAIL: Table: "+bound_params["table"]+"; ERRORS: "+str(stats.getErrorMessage()))
    
    # Next is example of how to get script output:
MIN_SCORE = 0
MAX_ITERATION = 5

# Initial centroids: k values spread evenly from MIN_SCORE towards MAX_SCORE.
last_centroids = [MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) for i in range(k)]
# Colon-separated form that is bound to $centroids in the script below.
initial_centroids = ":".join(map(str, last_centroids))

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load '/user/hdfs/data/data1/student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean; the old comparison with the string
    # "FAILED" was never true, so failed jobs went unnoticed.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    # The name `iter` (which shadows the builtin) is kept on purpose: the
    # rest of the loop refers to it.
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with last iteration
    if i != k - 1:
        initial_centroids = initial_centroids + ":"
# initial_centroids = "37.475097, -122.155599:37.486098,-122.195388:37.4985769, -122.2195727:37.4608874, -122.143838:37.453407, -122.182255"

# initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
# last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]
print(last_centroids)
print(initial_centroids)


# Script run once per iteration; $centroids receives the current centroid
# string and the averaged coordinates per centroid are stored as `result`.
P = Pig.compile(
    """register Find.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw_data = load 'MP_match.txt' as (latitude:double, longitude:double, status:chararray);
                   centroided = foreach raw_data generate status, latitude, longitude, find_centroid(latitude, longitude) as centroid;
                   grouped = group centroided by centroid;
                   store grouped into 'grouped';
                   result = foreach grouped generate group, AVG(centroided.latitude), AVG(centroided.longitude);
                   store result into 'output';
                """
)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({"centroids": initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean; the old comparison with the string
    # "FAILED" was never true, so failed jobs went unnoticed.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    # The name `iter` (which shadows the builtin) is kept on purpose: the
    # rest of the loop refers to it.
    iter = results.result("result").iterator()
    centroids = []
Exemple #6
0
#     ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');

    script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;

job_views = FOREACH job_view_events GENERATE 
  (int)header.memberId   AS memberId,
  (long)header.time      AS time,
  trackingCode,
  (int)trackingInfo#'0'  AS jobId;

job_views = join job_views by memberId, member by memberId; 
job_views = foreach job_views generate job_views::memberId as memberId, job_views::time as time, job_views::jobId as jobId;
job_views = filter job_views by memberId > 0; 
 
job_views = distinct job_views parallel 1;
STORE job_views INTO '$OUTPUT/JYMBII-batch/history/view/$REPORT_DATE' USING BinaryJSON('memberId');

    """

    # Compile the script once, then bind and run it once per entry of `params`.
    prog = Pig.compile(script) 
    for para in params:
        bound = prog.bind(para) 
        # NOTE(review): `stats` is not inspected in this excerpt, so a failed
        # run is not reported here.
        stats = bound.runSingle()
        # NOTE(review): the script text uses $DATA_DATE and $REPORT_DATE while
        # this line reads para['DATE'] -- confirm each entry has a 'DATE' key.
        print "********************************Finish Current Data " + para['DATE'] + " *************************************************"
Exemple #7
0
MIN_SCORE = 0
MAX_ITERATION = 100

# Initial centroids: k values spread evenly from MIN_SCORE towards MAX_SCORE.
last_centroids = [MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) for i in range(k)]
# Colon-separated form that is bound to $centroids in the script below.
initial_centroids = ":".join(map(str, last_centroids))

P = Pig.compile("""register udf.jar;
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean; the old comparison with the string
    # "FAILED" was never true, so failed jobs went unnoticed.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    # The name `iter` (which shadows the builtin) is kept on purpose: the
    # rest of the loop refers to it.
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with last iteration
Exemple #8
0
            postString += "Result = FOREACH Result GENERATE " + fsDic['genFields'] + ";\n"
            postString += "Result = FOREACH Result GENERATE * AS (" + fsDic['genSchema'] + ");\n"

            # postString += "Result = FOREACH Result GENERATE " + currentAction+ "Result::UserId AS UserId, *;\n"
            # postString += "DESCRIBE Result;\n"

    # A1BResult = JOIN BResult BY UserId, CResult By UserId;
    pigString += postString
    pigString += """
    DUMP Result;
    DESCRIBE Result;
    """

    print(pigString)

    # with open('cyygeneratedPig.pig','w') as outFile:
    #    outFile.write(pigString)
    if USE_PIG:
        # Compile the generated script text; the alternative below loads a
        # ready-made script file instead.
        P = Pig.compile(pigString)
        # P = Pig.compileFromFile('pig_bcd_bc.pig')

        # Run the Pig script once with no explicit parameter bindings.
        result = P.bind().runSingle()

        if result.isSuccessful():
            print('run success')
        else:
            # A bare string is not a valid exception object; raise a real
            # exception so the failure is reported as intended.
            raise RuntimeError('run failed')
    def run_pagerank(edges_input,
                     output_path,
                     tmp_output_dir,
                     damping_factor=0.85,
                     convergence_threshold=0.0001,
                     max_num_iterations=10,
                     id_name_map=None,
                     preprocessing_script="../pigscripts/pagerank_preprocess.pig",
                     iteration_script="../pigscripts/pagerank_iterate.pig"
                    ):

        """
        Calculate pageranks for a directed graph of nodes and edges.

        Three main steps:
            1. Preprocessing: Process input data to:
                 a) Count the total number of nodes.
                 b) Prepare initial pagerank values for all nodes.
            2. Iteration: Calculate new pageranks for each node based on the previous pageranks of the
                          nodes with edges going into the given node.
            3. Postprocessing: Order nodes by pagerank
                               Optionally join (id, pagerank) pairs to a dataset of (id, name) pairs
                               to get human-readable names

        Parameters:
            edges_input           -- bound to INPUT_PATH of the preprocessing script
            output_path           -- location the ordered result is stored to (removed first with rmf)
            tmp_output_dir        -- parent of the "preprocess" and "iteration" working directories,
                                     both of which are removed with rmr at the end
            damping_factor        -- bound to DAMPING_FACTOR of the iteration script
            convergence_threshold -- multiplied by num_nodes squared and truncated to a long before use
            max_num_iterations    -- upper bound handed to IterationUtils.iterate_until_convergence
            id_name_map           -- optional path of (id, name) pairs; when given, names replace ids
                                     in the output
            preprocessing_script  -- path of the Pig script run for the preprocessing step
            iteration_script      -- path of the Pig script run for every iteration

        Returns nothing; the results are written to output_path.
        """

        preprocess_dir = "%s/preprocess" % tmp_output_dir
        iteration_dir  = "%s/iteration"  % tmp_output_dir

        # Preprocessing step:
        print("Starting preprocessing step.")
        # Use the caller-supplied script path; it was previously ignored in
        # favour of a hard-coded copy of the default.
        preprocess = Pig.compileFromFile(preprocessing_script).bind({
            "INPUT_PATH"            : edges_input,
            "PAGERANKS_OUTPUT_PATH" : "%s/pageranks" % preprocess_dir,
            "NUM_NODES_OUTPUT_PATH" : "%s/num_nodes" % preprocess_dir
        }).runSingle()

        # Update convergence threshold based on the size of the graph (number of nodes)
        num_nodes             = long(str(preprocess.result("num_nodes").iterator().next().get(0)))
        convergence_threshold = long(convergence_threshold * num_nodes * num_nodes)
        print("Calculated convergence threshold for %d nodes: %d" % (num_nodes, convergence_threshold))

        # Iteration step:
        def iteration_param_func(it_num, it_dir):
            # The first iteration reads the preprocessing output; every later
            # one reads the pageranks written by the previous iteration.
            if it_num == 1:
                iteration_input = "%s/pageranks" % preprocess_dir
            else:
                iteration_input = "%s/%d/pageranks" % (it_dir, it_num - 1)

            return {
                "INPUT_PATH"                  : iteration_input,
                "DAMPING_FACTOR"              : damping_factor,
                "NUM_NODES"                   : num_nodes,
                "PAGERANKS_OUTPUT_PATH"       : "%s/%d/pageranks"    % (it_dir, it_num),
                "AGG_RANK_CHANGE_OUTPUT_PATH" : "%s/%d/rank_changes" % (it_dir, it_num)
            }

        iteration_result = IterationUtils.iterate_until_convergence(
            iteration_script,                     # the pigscript to iterate (caller-supplied path)
            iteration_dir,                        # temporary iteration outputs will be stored here
            iteration_param_func,                 # takes iteration #, returns Pig parameter dictionary
            "Sum of ordering-rank changes",       # name of the convergence metric
            int,                                  # Python type of the convergence metric
            "aggregate_rank_change",              # alias in the pigscript where the metric is stored to
            convergence_threshold,                # stop when metric less than this
            max_num_iterations                    # or if this many iterations have been performed
        )

        # Postprocessing step:
        print("Starting postprocessing step.")

        postprocess_script = """
            pageranks   =   LOAD '$PAGERANKS_INPUT_PATH'   USING PigStorage() AS (id: int, pagerank: double);
            pageranks   =   FILTER pageranks BY pagerank IS NOT NULL;
        """

        # Parameters shared by both variants; the pageranks of the last
        # completed iteration are the input.
        postprocess_params = {
            "PAGERANKS_INPUT_PATH"   : "%s/%d/pageranks" % (iteration_dir, iteration_result["num_iterations"]),
            "OUTPUT_PATH"            : output_path
        }

        if id_name_map:
            postprocess_script += """
                id_name_map =   LOAD '$ID_NAME_MAP_INPUT_PATH' USING PigStorage() AS (id: int, name: chararray);
                with_names  =   FOREACH (JOIN id_name_map BY id, pageranks BY id) GENERATE name, pagerank;
                ordered     =   ORDER with_names BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """
            postprocess_params["ID_NAME_MAP_INPUT_PATH"] = id_name_map
        else:
            postprocess_script += """
                ordered     =   ORDER pageranks BY pagerank DESC;
                rmf $OUTPUT_PATH;
                STORE ordered INTO '$OUTPUT_PATH' USING PigStorage();
            """

        Pig.compile(postprocess_script).bind(postprocess_params).runSingle()

        # Remove the temporary working directories.
        Pig.fs("rmr %s" % preprocess_dir)
        Pig.fs("rmr %s" % iteration_dir)
MIN_SCORE = 0
MAX_ITERATION = 100

# Initial centroids: k values spread evenly from MIN_SCORE towards MAX_SCORE.
last_centroids = [MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) for i in range(k)]
# Colon-separated form that is bound to $centroids in the script below.
initial_centroids = ":".join(map(str, last_centroids))

P = Pig.compile("""register 'centroid.py' using jython as centroid; 
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, centroid.get_closest_centroid(gpa, '$centroids') as centroid;
                   grouped = group centroided by centroid parallel 2;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'kmoutput';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    # isSuccessful() returns a boolean; the old comparison with the string
    # "FAILED" was never true, so failed jobs went unnoticed.
    if not results.isSuccessful():
        raise RuntimeError("Pig job failed")
    # The name `iter` (which shadows the builtin) is kept on purpose: the
    # rest of the loop refers to it.
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get new centroid of this iteration, calculate the moving distance with last iteration
Exemple #11
0
#     initial_centroids = initial_centroids + str(last_centroids[i])
#     if i!=k-1:
#         initial_centroids = initial_centroids + ":"

# Fixed starting centroids as "x,y" pairs separated by ':' (bound to
# $centroids below), plus the same points as tuples.
initial_centroids = "-120.0,-120.0:-60.0,-60.0:0.0, 0.0:60.0,60.0:120.0,120.0"
last_centroids = [(-120.0,-120.0),(-60.0, -60.0),(0.0, 0.0),(60.0, 60.0),(120.0,120.0)]

print(initial_centroids)



# NOTE(review): the jar is registered from an absolute path under one user's
# home directory -- confirm this is intended outside that machine.
P = Pig.compile("""register /Users/yun_shen/Desktop/spams/pigudf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw_data = load '1.log' as (spam_id:chararray, longitude:double, latitude:double);
                   raw = filter raw_data by longitude is not null and latitude is not null;
                   centroided = foreach raw generate spam_id, longitude, latitude, find_centroid(longitude, latitude) as centroid;
                   grouped = group centroided by centroid parallel 4;
                   store grouped into 'grouped';
                   result = foreach grouped generate group, AVG(centroided.longitude), AVG(centroided.latitude);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
	Q = P.bind({'centroids':initial_centroids})
	results = Q.runSingle()
	# isSuccessful() returns a boolean; the old comparison with the string
	# "FAILED" was never true, so failed jobs went unnoticed.
	if not results.isSuccessful():
		raise RuntimeError("Pig job failed")
	# The name `iter` (which shadows the builtin) is kept on purpose: the
	# rest of the loop refers to it.
	iter = results.result("result").iterator()
	centroids = []
	x = 0.0
from org.apache.pig.scripting import Pig 

# If the output path does not exist, create it.  The test relies on Pig.fs()
# returning a non-zero (truthy) status when '-test -d' does not find the
# directory -- see the Pig embedding documentation.
if Pig.fs('-test -d ' + _out):
	Pig.fs('mkdir ' + _out)

##
# CountJob
#
# If the output path of the count job already exists, skip the job;
# otherwise run it.
##
if not Pig.fs('-test -d ' + _out_nc):
	print('\nPath ("%s") already exists, skipping job.\n' % _out_nc)
else:
	result = Pig.compile(_header + """
	count_ngrams( '${in}', '${out}', '${min_count}' );
	""").bind({'in':_in, 'out':_out_nc, 'min_count': _min_count, 'n':'count-ngrams'}).runSingle()
	# Check the result.  A bare string is not a valid exception object, so a
	# real exception type is raised.
	if not result.isSuccessful():
		raise RuntimeError("Pig job failed")


##
# ExtractVocabularyJob
#
# If the output path of the vocabulary job already exists, skip the job; otherwise run it.
##
if not Pig.fs('-test -d ' + _out_v):
	print '\nPath ("%s") already exists, skipping job.\n' % _out_v
else:
	result = Pig.compile(_header + """
Exemple #13
0
    #     ('date.range','start.date=$DATE;end.date=$DATE;error.on.missing=false');

    script = """
%declare OUTPUT '/user/haliu'
member = load '$OUTPUT/JYMBII-batch/MemberList' USING BinaryJSON();
events = LOAD '/data/tracking/PageViewEvent' USING LiAvroStorage('date.range','start.date=$DATA_DATE;end.date=$DATA_DATE;error.on.missing=false');

job_view_events = FILTER events BY requestHeader.pageKey == 'jobs_seeking_view_job' AND header.memberId > 0;

job_views = FOREACH job_view_events GENERATE 
  (int)header.memberId   AS memberId,
  (long)header.time      AS time,
  trackingCode,
  (int)trackingInfo#'0'  AS jobId;

job_views = join job_views by memberId, member by memberId; 
job_views = foreach job_views generate job_views::memberId as memberId, job_views::time as time, job_views::jobId as jobId;
job_views = filter job_views by memberId > 0; 
 
job_views = distinct job_views parallel 1;
STORE job_views INTO '$OUTPUT/JYMBII-batch/history/view/$REPORT_DATE' USING BinaryJSON('memberId');

    """

    # Compile the script once, then bind and run it once per entry of `params`.
    prog = Pig.compile(script)
    for para in params:
        bound = prog.bind(para)
        # NOTE(review): `stats` is not inspected in this excerpt, so a failed
        # run is not reported here.
        stats = bound.runSingle()
        # NOTE(review): the script text uses $DATA_DATE and $REPORT_DATE while
        # this line reads para['DATE'] -- confirm each entry has a 'DATE' key.
        print "********************************Finish Current Data " + para[
            'DATE'] + " *************************************************"
# Compile the script for one PageRank iteration.  Placeholders in the text:
#   $docs_in   - location the (url, pagerank, links) relation is loaded from
#   $inputType - Pig type used for the url fields
#   $d         - weight in "(1 - $d) + $d * SUM(...)"; presumably the damping
#                factor -- confirm against the caller
#   $docs_out  - location new_pagerank is stored to
#   $max_diff  - location of the largest |previous_pagerank - pagerank| value
P = Pig.compile("""
previous_pagerank = 
    LOAD '$docs_in'
    AS ( url: $inputType, pagerank: float, links:{ link: ( url: $inputType ) } );
/**
Creates 
 <http://rdf.chemspider.com/3442>, 1.0,  {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)}
*/

outbound_pagerank = 
    FOREACH previous_pagerank 
    GENERATE 
        pagerank / COUNT ( links ) AS pagerank, 
        FLATTEN ( links ) AS to_url; 
/**
Creates:
1.0, <http://bla>
1.0, <http://www.w3.org/2004/02/skos/core#exactMatch>
*/

cogrpd = cogroup outbound_pagerank by to_url, previous_pagerank by url;
/**
creates:
<http://rdf.chemspider.com/3442>, {}, {(<http://rdf.chemspider.com/3442>, 1.0, {(<http://www.w3.org/2004/02/skos/core#exactMatch>), (<http://bla>)})}
*/                   
                      
new_pagerank = 
    FOREACH 
       cogrpd
    GENERATE 
        group AS url, 
        ( 1 - $d ) + $d * SUM (outbound_pagerank.pagerank) AS pagerank, 
        FLATTEN ( previous_pagerank.links ) AS links,
        FLATTEN ( previous_pagerank.pagerank ) AS previous_pagerank;
STORE new_pagerank 
    INTO '$docs_out';
nonulls = filter new_pagerank by previous_pagerank is not null and pagerank is not null;
pagerank_diff = FOREACH nonulls GENERATE ABS ( previous_pagerank - pagerank );

grpall = group pagerank_diff all;
max_diff = foreach grpall generate MAX (pagerank_diff);

STORE max_diff INTO '$max_diff';

""")