import sys
import time

import hadoopy
from PIL import Image


def main(input_dir):
    output_dir = '/user/hadoop-trainer/output-ex1-0/%f' % (time.time())
    hadoopy.run_hadoop(input_dir, output_dir, 'mean_image.py')
    print('Output Dir:[%s]' % (output_dir))
    # Each key is an image size tuple (width, height); each value is the
    # raw 'L'-mode (grayscale) pixel data of the mean image for that size.
    for key, val in hadoopy.hdfs_cat_tb(output_dir):
        fn = 'output-ex1-0-mean-%s.png' % ('-'.join(map(str, key)))
        print('Saving %s' % (fn))
        # On newer Pillow, Image.frombytes replaces the deprecated fromstring.
        Image.fromstring('L', key, val).save(fn)


if __name__ == '__main__':
    main(sys.argv[1])
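# --- Hypothetical sketch of the 'mean_image.py' job launched above. ---
# The real mapper/reducer ships as a separate exercise file; this is a
# minimal guess based only on the output format consumed above: keys are
# (width, height) tuples and values are raw grayscale pixel bytes. The
# assumed input record layout in mapper() is not confirmed by the source.
import numpy as np
import hadoopy


def mapper(key, value):
    # Assumed input record: value = ((width, height), raw grayscale bytes).
    dims, data = value
    pixels = np.frombuffer(data, dtype=np.uint8).astype(np.float64)
    yield dims, (1, pixels)


def reducer(dims, values):
    count, total = 0, 0.0
    for n, pixels in values:
        count += n
        total = total + pixels
    # Mean image for this size, as raw bytes ready for Image.fromstring.
    yield dims, (total / count).astype(np.uint8).tobytes()


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)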
import hadoopy
import time
import sys
import re

# Set up our input/output dirs
try:
    run_count = int(sys.argv[1])
except IndexError:
    run_count = 0
input_dir = '/user/hadoop-trainer/input-ex0-%d' % (run_count)
output_dir = '/user/hadoop-trainer/output-ex0-%d/%f' % (run_count, time.time())

# Run job
print('Running input_dir[%s]' % (input_dir))
hadoopy.run_hadoop(input_dir, output_dir, 'wc.py')
print('Job Done. See your output (encoded in TypedBytes, so it will have binary stuff around it) [hadoop fs -cat %s/part-00000]' % (output_dir))

# Dump from TypedBytes format
count_thresh = [1, 5000, 5, 5][run_count]  # per-run threshold (run_count 0..3)
should_sort = True
word_re = r'^[a-zA-Z\-,\.]+$'
print("I'll read the data and dump it in a nicer form; look inside this file to see how to do this. Only counts >= %d that match the regex '%s' with length > 2 are output" % (count_thresh, word_re))
pairs = hadoopy.hdfs_cat_tb('%s/part-00000' % (output_dir))
if should_sort:
    print("I'll now sort the output ascending by word count")
    pairs = sorted(pairs, key=lambda x: x[1])
for key, val in pairs:
    if val >= count_thresh and re.search(word_re, key) and len(key) > 2:
        print('%s\t%s' % (key, val))
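# --- Hypothetical sketch of the 'wc.py' job launched above. ---
# The actual mapper/reducer is a separate exercise file; this is the
# classic hadoopy word-count shape that would produce the (word, count)
# TypedBytes pairs read back above.
import hadoopy


def mapper(key, value):
    # value is a line of text; emit (word, 1) for every whitespace token
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    # counts iterates over the 1s emitted for this word
    yield word, sum(counts)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)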
#!/usr/bin/env python
"""Runs the image_convert job

Usage: python run_image_convert.py <hdfs_input> <hdfs_output>
"""
import sys

import hadoopy

# Run job (map-only: reducer=None); print usage if the HDFS paths are missing
try:
    hadoopy.run_hadoop(sys.argv[1], sys.argv[2], 'image_convert.py', reducer=None)
except IndexError:
    print(__doc__)
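# --- Hypothetical sketch of the 'image_convert.py' map-only job above. ---
# The real conversion logic is in a separate file and may differ; this
# guess assumes each value arrives as raw encoded image bytes and
# re-encodes it as PNG, passing the key through unchanged.
import io

import hadoopy
from PIL import Image


def mapper(key, value):
    img = Image.open(io.BytesIO(value))
    out = io.BytesIO()
    img.save(out, 'PNG')
    yield key, out.getvalue()


if __name__ == '__main__':
    hadoopy.run(mapper)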