Example #1
import time

import hadoopy
import Image  # pre-Pillow PIL import; on Pillow: from PIL import Image (and frombytes instead of fromstring)


def main(input_dir):
    # Timestamped output directory so repeated runs don't collide
    output_dir = '/user/hadoop-trainer/output-ex1-0/%f' % (time.time())
    hadoopy.run_hadoop(input_dir, output_dir, 'mean_image.py')
    print('Output Dir:[%s]' % (output_dir))
    # Each output record is keyed by a (width, height) tuple; the value is raw grayscale bytes
    for key, val in hadoopy.hdfs_cat_tb(output_dir):
        fn = 'output-ex1-0-mean-%s.png' % ('-'.join(map(str, key)))
        print('Saving %s' % (fn))
        Image.fromstring('L', key, val).save(fn)
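The job script mean_image.py is not part of this listing. Below is a hypothetical sketch of what it might look like, inferred only from how the runner consumes the output: Image.fromstring('L', key, val) implies each output key is a (width, height) tuple and each value is raw 8-bit grayscale bytes. hadoopy.run is hadoopy's real entry point for job scripts; the assumed input format (values already paired as (size, bytes)) is a guess.

#!/usr/bin/env python
"""Hypothetical sketch of mean_image.py: average same-sized grayscale images."""
import numpy as np
import hadoopy


def mapper(key, value):
    # Assumed input: value = ((width, height), raw 8-bit grayscale bytes).
    # Re-key by size so only same-shaped images get averaged together.
    size, data = value
    yield size, data


def reducer(size, values):
    # Sum the pixel vectors in float to avoid uint8 overflow, then average.
    total, count = None, 0
    for data in values:
        vec = np.frombuffer(data, dtype=np.uint8).astype(np.float64)
        total = vec if total is None else total + vec
        count += 1
    # Emit raw bytes; the runner reshapes them via Image.fromstring('L', size, ...)
    yield size, (total / count).astype(np.uint8).tobytes()


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)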
Example #2
import hadoopy
import time
import sys
import re

# Set up the input/output dirs; an optional argv[1] selects which input set to run
try:
    run_count = int(sys.argv[1])
except IndexError:
    run_count = 0
input_dir = '/user/hadoop-trainer/input-ex0-%d' % (run_count)
output_dir = '/user/hadoop-trainer/output-ex0-%d/%f' % (run_count, time.time())

# Run job
print('Running input_dir[%s]' % (input_dir))
hadoopy.run_hadoop(input_dir, output_dir, 'wc.py')
print('Job Done. See your output (encoded in typedbytes, so it will have binary stuff around it) [hadoop fs -cat %s/part-00000]' % (output_dir))

# Dump from TypedBytes format
count_thresh = [1, 5000, 5, 5][run_count]  # per-run minimum word count, indexed by run_count
should_sort = True
print(r"I'll read the data and dump it in a nicer form; look inside this file to see how to do this. Only counts >= %d that match the regex '^[a-zA-Z\-,\.]+$' and have length > 2 are output" % (count_thresh))
pairs = hadoopy.hdfs_cat_tb('%s/part-00000' % (output_dir))
if should_sort:
    print("I'll now sort the output ascending by word count")
    pairs = sorted(pairs, key=lambda x: x[1])  # key= works on Python 2 and 3 (cmp is Python 2 only)
for key, val in pairs:
    if val >= count_thresh and re.search(r'^[a-zA-Z\-,\.]+$', key) and len(key) > 2:
        print('%s\t%s' % (key, val))
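The wc.py job script is also not shown in the listing. Here is a minimal hypothetical version in the standard hadoopy mapper/reducer style; hadoopy.run is the documented entry point, while the assumption that each input value is one line of text is illustrative.

#!/usr/bin/env python
"""Hypothetical sketch of wc.py: a classic word count."""
import hadoopy


def mapper(key, value):
    # Assumed input: value is one line of text; emit a count of 1 per word.
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    # Sum the partial counts for each word.
    yield word, sum(counts)


if __name__ == '__main__':
    # hadoopy.run wires the mapper/reducer into Hadoop Streaming via TypedBytes.
    hadoopy.run(mapper, reducer)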

Example #3
#!/usr/bin/env python
"""Runs the image_convert job
Usage:
python run_image_convert.py <hdfs_input> <hdfs_output>
"""
import sys

import hadoopy


# Run job; print the usage string if the HDFS paths are missing
try:
    # Map-only job: reducer=None writes the mapper output directly
    hadoopy.run_hadoop(sys.argv[1], sys.argv[2], "image_convert.py", reducer=None)
except IndexError:
    print(__doc__)
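Because the runner passes reducer=None, image_convert.py is a map-only job. It is not included here; a plausible sketch follows, assuming each input record is a (file name, encoded image bytes) pair and that the conversion target is grayscale PNG (both are assumptions, not taken from the original).

#!/usr/bin/env python
"""Hypothetical sketch of image_convert.py: a map-only image re-encoder."""
import io

import hadoopy
from PIL import Image


def mapper(name, image_data):
    # Assumed input: (file name, encoded image bytes).
    # Decode, convert to grayscale, and re-encode as PNG; with reducer=None
    # in the runner, these pairs are written straight to the job output.
    img = Image.open(io.BytesIO(image_data)).convert('L')
    out = io.BytesIO()
    img.save(out, 'PNG')
    yield name, out.getvalue()


if __name__ == '__main__':
    hadoopy.run(mapper)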