import pickle

import hadoopy
import numpy as np


def consolidate_canopy_clusters(output, cluster_pkl):
    # Pull the clustering output off of HDFS and order it by the first
    # element of each record's key.
    clusters = list(hadoopy.cat(output + '/p*'))
    clusters.sort(key=lambda x: x[0][0])
    # Pair each cluster's index with the canopy identifier carried in the
    # second element of its key, then invert the mapping.
    cluster_canopies = [(x, y[0][1]) for x, y in enumerate(clusters)]
    canopy_clusters = _map_cluster_canopies(cluster_canopies)
    # Decode the serialized cluster centers into a single float32 array.
    clusters = np.array([np.fromstring(x[1], dtype=np.float32)
                         for x in clusters])
    with open(cluster_pkl, 'w') as fp:
        pickle.dump((clusters, canopy_clusters), fp, 2)
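# _map_cluster_canopies isn't defined in this snippet. A minimal sketch of
# what it plausibly does, assuming it inverts the (cluster index, canopy id)
# pairs into a canopy id -> list of cluster indices mapping; this is an
# illustration, not the original implementation.
def _map_cluster_canopies(cluster_canopies):
    canopy_clusters = {}
    for cluster_ind, canopy in cluster_canopies:
        canopy_clusters.setdefault(canopy, []).append(cluster_ind)
    return canopy_clusters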
def _run(self, fn):
    in_path = self.data_path + fn
    out_path = self.data_path + 'out-' + fn
    # Copy the local input file into HDFS.
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    # Don't let the file split; CDH3 has a bug and will try to split gz's.
    hadoopy.launch_frozen(in_path, out_path, 'wc.py',
                          jobconfs='mapred.min.split.size=100000000')
    # Read back the word counts and spot-check a few known values.
    wc = dict(hadoopy.cat(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
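# The wc.py job launched above isn't shown here. A minimal sketch of a
# hadoopy word-count job following the library's mapper/reducer convention;
# the original script may differ in its input handling.
import hadoopy


def mapper(key, value):
    # Assumes each input value is a line of text.
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    yield word, sum(counts)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)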
import os
import StringIO

import cv
import Image
import hadoopy


def pil2cv(pil_im):
    # NOTE: the original snippet began mid-function; the header and the two
    # setup lines below are a reconstruction of the usual PIL -> IplImage
    # conversion that this tail implies.
    cv_im = cv.CreateImageHeader(pil_im.size, cv.IPL_DEPTH_8U, 3)
    cv.SetData(cv_im, pil_im.tostring())
    cv_im_cvt = cv.CreateImage(pil_im.size, 8, 3)
    cv.CvtColor(cv_im, cv_im_cvt, cv.CV_RGB2BGR)
    cv_im = cv_im_cvt
    return cv_im


def str2pil(image_data):
    return Image.open(StringIO.StringIO(image_data))


def cvcrop(cv_image, x, y, w, h):
    x, y, w, h = int(x), int(y), int(w), int(h)
    cropped = cv.CreateImage((w, h), 8, cv_image.channels)
    # Bug fix: the original referenced an undefined name `image` here.
    src_region = cv.GetSubRect(cv_image, (x, y, w, h))
    cv.Copy(src_region, cropped)
    return cropped


# key: image name
# value: (image, faces) where image is the input value and faces is
#     a list of ((x, y, w, h), n)
run_time = '1306607174.041919'
out_path = '/mnt/nfsdrives/shared/facefinder/run-%s/' % run_time
chip_out_path = '/mnt/nfsdrives/shared/facefinder/run-%s/chips' % run_time
os.makedirs(chip_out_path)
for image_name, (image, faces) in hadoopy.cat('/user/brandyn/tp/facefinder/run-%s' % run_time):
    image = pil2cv(str2pil(image))
    # Save a cropped chip for each detected face.
    for num, ((x, y, w, h), n) in enumerate(faces):
        cv.SaveImage('%s/%s-%d.jpg' % (chip_out_path, image_name, num),
                     cvcrop(image, x, y, w, h))
    # Draw a red box around each face on the full image and save it.
    for (x, y, w, h), n in faces:
        pt1 = (int(x), int(y))
        pt2 = (int(x + w), int(y + h))
        cv.Rectangle(image, pt1, pt2, cv.RGB(255, 0, 0), 3, 8, 0)
    cv.SaveImage('%s/%s.jpg' % (out_path, image_name), image)
def fetch_output():
    global faces_output
    faces_output = list(hadoopy.cat('faces/face_output'))
def test_err(self):
    self.assertRaises(IOError, hadoopy.ls, self.nonsense_path)
    # hadoopy.cat is lazy, so the error only surfaces once iteration starts.
    self.assertRaises(IOError, hadoopy.cat(self.nonsense_path).next)
def test_cat(self):
    cat_output = list(hadoopy.cat(self.file_path))
    line = (331, 'Title: Alice\'s Adventures in Wonderland')
    self.assertTrue(line in cat_output)
import os
import time

import hadoopy

out_path = '/mnt/nfsdrives/shared/tp/cluster/%f/' % time.time()
for group, (image_name, image_data) in hadoopy.cat('/user/brandyn/tp/image_cluster/run-15/samples'):
    # One directory per cluster group.
    group_path = '%s/%d/' % (out_path, int(group))
    try:
        os.makedirs(group_path)
    except OSError:
        pass  # Directory already exists.
    print(group_path + '%s.jpg' % image_name)
    with open(group_path + '%s.jpg' % image_name, 'w') as fp:
        fp.write(image_data)
import os

import Image
import hadoopy

FILE = '/tmp/bwhite/output/pets2006.video_frame_data.b/0.903472866947'
OUTPUT = 'out'
try:
    os.mkdir(OUTPUT)
except OSError:
    pass  # Directory already exists.
for name, data in hadoopy.cat(FILE):
    if name == '1-1-2241':
        print(name)
        # Reconstruct the grayscale 720x576 frame from its raw bytes.
        Image.fromstring('L', (720, 576), data).save(OUTPUT + '/' + name + '.jpg')
import pickle

import hadoopy
import numpy as np


def consolidate_clusters(output, cluster_pkl):
    # Pull the cluster centers off of HDFS, order them by key, and decode
    # the serialized float32 vectors into a single array.
    clusters = list(hadoopy.cat(output + '/p*'))
    clusters.sort(key=lambda x: x[0])
    clusters = np.array([np.fromstring(x[1], dtype=np.float32)
                         for x in clusters])
    with open(cluster_pkl, 'w') as fp:
        pickle.dump(clusters, fp, 2)