def runner():
    """The function that calls hadoopy.run"""
    iter = gopts.getintkey('iter')
    blocksize = gopts.getintkey('blocksize')
    reduce_schedule = gopts.getstrkey('reduce_schedule')
    mapper = NormalEquations(blocksize=blocksize, isreducer=False)
    reducer = NormalEquations(blocksize=blocksize, isreducer=True)
    hadoopy.run(mapper, reducer)
#!/usr/bin/env python
import hadoopy
import picarus_takeout
import os
import picarus
import zlib
import sys


class Mapper(picarus.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._model = zlib.decompress(open(os.environ['MODEL_FN']).read())
        self.job = picarus_takeout.ModelChain(self._model)

    def _map(self, row, input_binary):
        try:
            yield row, self.job.process_binary(input_binary)
        except:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'badRows')
        else:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'goodRows')


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'MODEL_FN'])
        try:
            histogram[clusterid] += 1
        except KeyError:
            histogram[clusterid] = 1

    def configure(self):
        self.clusters = self._load_clusters()

    def map(self, imageid, features):
        """
        Args:
            imageid: An ID that is directly passed to the output
            features: As a list of numpy arrays

        Yields:
            A tuple in the form of (key, value)
            key: imageid
            value: histogram as a dict of (dim, val) (int, int)
        """
        histogram = {}
        for feature in features:
            clusterid = self._nearest_cluster_id(self.clusters, feature)
            self._update_histogram(clusterid, histogram)
        yield imageid, histogram


if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
        inds = (confs >= 0).nonzero()[0]
        hadoopy.counter('STATS', 'num_pos', inds.size)
        hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
        hadoopy.counter('STATS', 'total', confs.size)
        if inds.size:
            self.pyramid[inds, cy, cx] += 1

    def close(self):
        yield 0, (self.pyramid, float(self.num_boxes))


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, pyramid_num_boxes):
        pyramid_out = 0
        num_boxes_out = 0
        for pyramid, num_boxes in pyramid_num_boxes:
            pyramid_out += pyramid
            num_boxes_out += num_boxes
        yield key, (pyramid_out, num_boxes_out)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
                          'mapred.compress.map.output=true',
                          'mapred.output.compress=true',
                          'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'])
    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key,))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.child.java.opts=-Xmx512M'])
    def reduce(self, image_hash, values):
        """
        Args:
            image_hash: (see mapper)
            values: Iterator of values (see mapper)

        Yields:
            A tuple in the form of (image_hash, value)
            image_hash: Image hash
            value: The provided value (not the prediction)
        """
        predictions = None
        out_val = None
        for value in values:
            if isinstance(value, dict):
                predictions = value
            else:
                out_val = value
        if predictions is None or out_val is None:
            hadoopy.counter('DATA_ERR', 'MISSING_PREDICTIONS_OR_DATA')
            return
        label, conf = predictions[self._class_name][0]
        if (self._class_thresh <= label * conf) == (self._output_class == 1):  # Both true or both false
            yield image_hash, out_val


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
        frame = data['frame']
        data['event'] = event_video[0]
        data['video'] = event_video[1]
        pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32)
        num_boxes = 0
        coord = lambda x: int(np.round(x * self.num_bins))
        for (_, box), confs in super(Mapper, self).map(None, frame):
            num_boxes += 1
            cy0, cx0, cy1, cx1 = map(coord, box)
            cy1 += 1
            cx1 += 1
            cell_value = 1. / ((cy1 - cy0) * (cx1 - cx0))
            inds = (confs >= 0).nonzero()[0]
            hadoopy.counter('STATS', 'num_pos', inds.size)
            hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
            hadoopy.counter('STATS', 'total', confs.size)
            if inds.size:
                pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size
        yield data, (pyramid, num_boxes * len(self.ids))


if __name__ == '__main__':
    hadoopy.run(Mapper,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
                          'mapred.compress.map.output=true',
                          'mapred.output.compress=true',
                          'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'])
class Reducer(object):

    def __init__(self):
        self.labels = file_parse.load(os.environ['LOCAL_LABELS_FN'])

    def reduce(self, class_name, label_values):
        """
        Args:
            class_name: (see mapper)
            label_values: Iterator of label_values (see mapper)

        Yields:
            A tuple in the form of (key, value)
            classifier_name: (see mapper)
            classifier: Serialized classifier
        """
        label_values = list(label_values)
        for classifier_name in self.labels['classes'][class_name]['classifiers']:
            print('Starting [%s,%s]' % (class_name, classifier_name))
            classifier_extra = self.labels['classifiers'][classifier_name].get('extra', '')
            classifier = classifiers.train(self.labels['classifiers'][classifier_name]['name'],
                                           classifier_extra, label_values)
            classifier_ser = classifiers.dumps(classifier_name, classifier_extra, classifier)
            yield ' '.join([class_name, classifier_name]), classifier_ser
            print('Ending [%s,%s,%d]' % (class_name, classifier_name, len(classifier_ser)))


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'gray'})
            image_color = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'bgr'})
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
            except TypeError:
                cur_cluster_sum = vec
        yield key, cur_cluster_sum.tostring()

    def close(self):
        super(Combiner, self).close()


class Reducer(profile.ProfileJob):

    def __init__(self):
        super(Reducer, self).__init__()

    def reduce(self, key, values):
        cur_cluster_sum = None
        for vec in values:
            vec = np.fromstring(vec, dtype=np.float32)
            try:
                cur_cluster_sum += vec
            except TypeError:
                cur_cluster_sum = vec
        center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
        yield key, center.tostring()

    def close(self):
        super(Reducer, self).close()


if __name__ == "__main__":
    if hadoopy.run(Mapper, Reducer, Combiner):
        hadoopy.print_doc_quit(__doc__)
import video_raw_features
import video_block_features
import sys
import hadoopy


class Mapper(object):

    def __init__(self):
        self.b = video_block_features.Mapper()
        self.r = video_raw_features.Mapper()

    def map(self, event_filename, video_data):
        hadoopy.counter('CombinedFeatures', 'DontHave')
        sys.stderr.write('%s\n' % str(event_filename))
        for event_filename, features in self.r.map(event_filename, video_data):
            sys.stderr.write('%s\n' % str(event_filename))
            for x in self.b.map(event_filename, features):
                yield x


if __name__ == '__main__':
    hadoopy.run(Mapper, video_block_features.Reducer)
#!/usr/bin/env python
import hadoopy
import vision_data
import os
import sys


class Mapper(object):

    def __init__(self):
        self.flickr = vision_data.Flickr()
        self.max_iters = int(os.environ.get('MAX_ITERS', 1))
        self.max_pages = int(os.environ.get('MAX_PAGES', 1))

    def map(self, num_kvs, query):
        sys.stderr.write('Flickr Query[%s]\n' % query)
        for num, kv in enumerate(self.flickr.image_class_meta_url(query)):
            yield kv
            if num >= num_kvs:
                break


def reducer(key, values):
    yield key, values.next()


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, jobconfs=['mapred.task.timeout=6000000'])
            self.map(key, feat)

    def _random_canopy(self, canopies):
        return np.array(random.sample(canopies, 1))

    def close(self):
        hadoopy.status('%f-%f' % (self.ftime, self.gtime))
        final_canopies = self._random_canopy(self.canopies)
        uncovered_points = True
        while uncovered_points:
            uncovered_points = False
            valid_canopies = []
            for x in self.canopies:
                nearest_dist = self.nn(x, final_canopies)[1]
                if nearest_dist > self.soft_dist:
                    uncovered_points = True
                if nearest_dist > self.hard_dist:
                    valid_canopies.append(x)
            if uncovered_points:
                canopy = self._random_canopy(valid_canopies)
                final_canopies = np.concatenate((final_canopies, canopy))
                self.canopies = valid_canopies
        for canopy in final_canopies:
            yield random.random(), canopy.tostring()
        hadoopy.counter('canopy_cluster', 'run_time', int(time.time() - self.start_time))


if __name__ == "__main__":
    if hadoopy.run(MapReduce, MapReduce):
        hadoopy.print_doc_quit(__doc__)
import hadoopy
import picarus


def mapper(key, value):
    """
    Args:
        key: image_hash
        value: record (see IO docs)

    Yields:
        A tuple in the form of (key, value)
        key: image_hash
        value: binary file data
    """
    try:
        fp = picarus.io._record_to_fp(value)
    except IOError:
        hadoopy.counter('INPUT_ERROR', 'REMOTE_READ_FAILED')
        return
    yield key, fp.read()


if __name__ == '__main__':
    hadoopy.run(mapper)
import os
import cPickle as pickle
import numpy as np
import hadoopy
from hadoopy.pickle import b64dec, b64enc
import simplejson as json


class Mapper(object):

    def __init__(self, io_method):
        self.in_func = {'b64': self.b64, 'json': self.json}[io_method]

    def b64(self, value):
        return np.fromstring(b64dec(value), dtype=np.float32)

    def json(self, value):
        return np.array(json.loads(value), dtype=np.float32)

    def map(self, key, value):
        yield json.dumps(self.in_func(value).tolist())


if __name__ == "__main__":
    try:
        io_method = os.environ["IO_METHOD"]
    except KeyError:
        hadoopy.print_doc_quit(__doc__)
    if hadoopy.run(Mapper(io_method)):
        hadoopy.print_doc_quit(__doc__)
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._feat = picarus.api.model_fromfile(os.environ['FEATURE_FN'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        yield row, picarus.api.np_tostring(self._feat(image))


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'FEATURE_FN'])
    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        dist = self._compute_face_distance(image)
        yield dist, (key, value)


def reducer(key, values):
    """Identity reducer"""
    for value in values:
        yield key, value


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, doc=__doc__)
    def close(self):
        self._compact_heap()
        return self.heap


class Reducer(object):

    def __init__(self, out_count=True):
        self.count = 0
        try:
            self.num_clusters = int(os.environ["NUM_CLUSTERS"])
        except KeyError:
            self.num_clusters = DEFAULT_NUM_CLUSTERS
        self.output = self.yield_count if out_count else self.yield_key

    def yield_count(self, key, value):
        return self.count, value

    def yield_key(self, key, value):
        return key, value

    def reduce(self, key, values):
        for value in values:
            if self.count < self.num_clusters:
                yield self.output(key, value)
                self.count += 1


if __name__ == "__main__":
    hadoopy.run(Mapper, Reducer, Reducer(False), doc=__doc__)
import hadoopy
import random
import os
try:
    import numpy as np
except ImportError:
    pass


class Mapper(object):

    def __init__(self):
        self.alpha = float(os.environ['ALPHA'])

    def map(self, k, v):
        out = random.random()
        if out < self.alpha:
            yield out, (k, v)


def reducer(out, kvs):
    # NOTE(brandyn): The reducer is so that readtb only has to read 1 file
    # and so that they are uniformly distributed
    for kv in kvs:
        yield kv


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, required_cmdenvs=['ALPHA'])
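# --- Hedged usage sketch (not part of the original job) ---
# A minimal driver for the sampling job above, shown only to illustrate how the
# ALPHA cmdenv is supplied. 'random_sample.py' and the in/out paths are
# placeholder names; hadoopy.launch_frozen and hadoopy.readtb are standard
# hadoopy calls, but the exact values here are illustrative assumptions.
import hadoopy

if __name__ == '__main__':
    hadoopy.launch_frozen('in_path', 'out_path', 'random_sample.py',
                          cmdenvs=['ALPHA=0.01'])  # keep roughly 1% of the input kvs
    sampled = list(hadoopy.readtb('out_path'))  # read the sampled (k, v) pairs back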
        clusters = self.clusters[cluster_ids]
        # Find NN using slow metric
        # Extends the array by 1 dim that has a 1. in it
        feat = np.fromstring(feat + '\x00\x00\x80?', dtype=np.float32)
        nearest_ind = self.nn(feat[0:-1], self.clusters)[0]
        try:
            self.out_sums[nearest_ind] += feat
        except KeyError:
            self.out_sums[nearest_ind] = feat

    def close(self):
        for nearest_ind, feat in self.out_sums.iteritems():
            yield nearest_ind, feat.tostring()


def reducer(key, values):
    cur_cluster_sum = None
    for vec in values:
        vec = np.fromstring(vec, dtype=np.float32)
        try:
            cur_cluster_sum += vec
        except TypeError:
            cur_cluster_sum = vec
    center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
    yield key, center.tostring()


if __name__ == "__main__":
    if hadoopy.run(Mapper, reducer):
        hadoopy.print_doc_quit(__doc__)
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import json
import picarus.api
import picarus_takeout


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        classifier = picarus.api.model_fromfile(os.environ['CLASSIFIER_FN'])
        if os.environ['CLASSIFIER_TYPE'] == 'sklearn_decision_func':
            self._classifier = lambda x: repr(float(classifier.decision_function(x).flat[0]))
        elif os.environ['CLASSIFIER_TYPE'] == 'class_distance_list':
            self._classifier = lambda x: json.dumps(classifier(x))
        else:
            raise ValueError('Unknown CLASSIFIER_TYPE=%s' % os.environ['CLASSIFIER_TYPE'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._classifier(feature)


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'CLASSIFIER_FN'])
        path = 'fixtures/haarcascade_frontalface_default.xml'
        if os.path.exists(path):
            self._cascade = cv2.CascadeClassifier(path)
        else:
            raise ValueError("Can't find .xml file!")

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'gray'})
            image_color = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'bgr'})
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self.max_side = int(os.environ['MAX_SIDE'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
            yield row, imfeat.image_tostring(imfeat.resize_image_max_side(image, self.max_side), 'jpg')
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'MAX_SIDE'])
    def map(self, key, value):
        data = value.split('\t')
        if len(data) < 3:
            return
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        if len(ngram) != self.expected_tokens:
            return
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
                 h * image_scale), n) for (x, y, w, h), n in faces]

    def _load_cv_image(self, value):
        return imfeat.convert_image(Image.open(StringIO.StringIO(value)),
                                    [('opencv', 'rgb', 8)])

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = self._detect_faces(image)
        if faces:
            yield key, (value, faces)


if __name__ == "__main__":
    hadoopy.run(Mapper, doc=__doc__)
    def map(self, node_num, data):
        sys.stderr.write('HadoopyRT: NodeNum[%d]\n' % (node_num,))
        flow_controller = hadoopy_rt.FlowControllerNode(self.job_id, self.redis_server, node_num)
        if 'files' in data:
            for f, d in data['files'].items():
                open(f, 'w').write(d)
            data['files'] = list(data['files'])  # Convert to list, removes memory burden
        launch_kw_args = dict((x, data[x]) for x in ['files', 'cmdenvs'] if x in data)
        try:
            launch_kw_args['cmdenvs'] = hadoopy._runner._listeq_to_dict(launch_kw_args['cmdenvs'])
        except KeyError:
            launch_kw_args['cmdenvs'] = {}
        launch_kw_args['cmdenvs']['hadoopy_rt_stream'] = str(node_num)
        launch_kw_args['cmdenvs']['hadoopy_rt_redis'] = self.redis_server
        open(data['script_name'], 'w').write(data['script_data'])
        while True:
            try:
                hadoopy_rt.launch_zmq(flow_controller, data['script_name'],
                                      outputs=data.get('outputs'), **launch_kw_args)
            except Exception, e:
                sys.stderr.write('%s\n' % str(e))
            ps = redis.StrictRedis().pubsub()
            ps.subscribe(data['script_name'])
            for x in ps.listen():
                if x['type'] == 'message':
                    open(data['script_name'], 'w').write(x['data'])
                    break


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['hadoopy_rt_redis', 'job_id'])
        self._assignments = self._load_assignments()

    def _load_assignments(self):
        out = {}  # [image_id] = list of clust_ids
        with open(os.environ['ASSIGNMENTS_FN']) as fp:
            for clust_ind, image_id in pickle.load(fp):
                out.setdefault(image_id, []).append(clust_ind)
        return out

    def map(self, image_id, image_data):
        """Take in an image, if it is one we want then output it

        Args:
            name: unique image id
            image_data: Binary image data

        Yields:
            A tuple in the form of (key, value)
            key: cluster ind
            value: (image_id, image_data)
        """
        try:
            for cluster_ind in self._assignments[image_id]:
                yield cluster_ind, (image_id, image_data)
        except KeyError:
            pass


if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def map(self, key, value):
        data = value.split('\t')
        if len(data) < 3:
            return
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        if len(ngram) != self.expected_tokens:
            return
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
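# --- Hedged worked example (not part of the original job) ---
# Traces one made-up 5-gram record through the key construction used by the
# ngram job above and states the combiner/reducer contract; the input line and
# counts are illustrative assumptions only.
line = 'one small step for man\t1969\t17'
data = line.split('\t')
ngram, year, count = data[0].split(), data[1], int(data[2])
pair = sorted([ngram[0], ngram[len(ngram) - 1]])  # ['man', 'one']
key = pair + [year]                               # mapper key: ['man', 'one', '1969'], count: 17
# combiner(key, [17, 3]) -> yields (['man', 'one', '1969'], 20)
# reducer(key, [20, 5])  -> yields ('man\tone\t1969', '25')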
#!/usr/bin/env python
import hadoopy
import numpy as np


def mapper(key, image_data):
    (tag, hash) = key
    print key
    if tag == 'frame':
        yield hash, image_data


if __name__ == '__main__':
    hadoopy.run(mapper)
    return thresh, score


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key,))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.child.java.opts=-Xmx512M'])
#!/usr/bin/python
import hadoopy


def mapper(row, column_families):
    yield 'num_rows', 1


def reducer(key, values):
    yield key, sum(values)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
    def _map(self, row, image_binary):
        try:
            image = Image.open(StringIO.StringIO(image_binary))
            if not hasattr(image, "_getexif"):
                yield row, json.dumps({})
            else:
                image_tags = image._getexif()
                if image_tags is None:
                    yield row, json.dumps({})
                else:
                    yield row, json.dumps(dict((name, base64.b64encode(image_tags[id])
                                                if isinstance(image_tags[id], str) else image_tags[id])
                                               for id, name in TAGS.items()
                                               if id in image_tags))
        except:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "badRows")
        else:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "goodRows")


if __name__ == "__main__":
    hadoopy.run(Mapper, required_cmdenvs=["HBASE_TABLE", "HBASE_OUTPUT_COLUMN"])
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Mapper(object):

    def __init__(self):
        super(Mapper, self).__init__()

    def map(self, key, value):
        for v in value.split():
            yield 1, (v, 1)  # Send all words to 1
            #if v[0] == '#':
            #    yield 2, (v, 1)  # Send all hashtags to 2


if __name__ == '__main__':
    hadoopy.run(Mapper)
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Updater(hadoopy_rt.Updater):

    def __init__(self):
        super(Updater, self).__init__()

    def update(self, key, value, slate):
        slate.set(value)


if __name__ == '__main__':
    hadoopy.run(Updater)
import hadoopy
import hadoopy_hbase
import os
import image_search
import numpy as np
import cPickle as pickle
import picarus.api


class Mapper(object):

    def __init__(self):
        self._hbase_input_column = os.environ['HBASE_INPUT_COLUMN'].split(':')
        self._hbase_output_row = os.environ['HBASE_OUTPUT_ROW']

    def map(self, row, columns):
        yield self._hbase_output_row, columns[self._hbase_input_column[0]][self._hbase_input_column[1]]


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])
        self._hbase = hadoopy_hbase.HBaseRowDict(os.environ['HBASE_OUTPUT_TABLE'],
                                                 os.environ['HBASE_OUTPUT_COLUMN'])

    def reduce(self, row, features):
        hasher = image_search.RRMedianHasher(self.hash_bits, normalize_features=False)
        self._hbase[row] = pickle.dumps(hasher.train([picarus.api.np_fromstring(x) for x in features]), -1)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                required_cmdenvs=['HASH_BITS', 'HBASE_INPUT_COLUMN', 'HBASE_OUTPUT_ROW',
                                  'HBASE_OUTPUT_TABLE', 'HBASE_OUTPUT_COLUMN'])
import hadoopy
import os
import random
import image_search


# TODO: Put this mapper in Hadoopy helper
class Mapper(object):

    def __init__(self):
        self.kv_prob = float(os.environ['KV_PROB'])

    def map(self, k, v):
        if random.random() < self.kv_prob:
            yield 0, (k, v)


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])

    def reduce(self, key, id_feats):
        yield key, image_search.RRMedianHasher(self.hash_bits, normalize_features=False).train([x for _, x in id_feats])


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_cmdenvs=['KV_PROB', 'HASH_BITS'])
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._hasher = picarus.api.model_fromfile(os.environ['HASHER_FN'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._hasher(feature).tostring()


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'HASHER_FN'])
data["video"] = event_video[1] pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32) num_boxes = 0 coord = lambda x: int(np.round(x * self.num_bins)) for (_, box), confs in super(Mapper, self).map(None, frame): num_boxes += 1 cy0, cx0, cy1, cx1 = map(coord, box) cy1 += 1 cx1 += 1 cell_value = 1.0 / ((cy1 - cy0) * (cx1 - cx0)) inds = (confs >= 0).nonzero()[0] hadoopy.counter("STATS", "num_pos", inds.size) hadoopy.counter("STATS", "num_neg", confs.size - inds.size) hadoopy.counter("STATS", "total", confs.size) if inds.size: pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size yield data, (pyramid, num_boxes * len(self.ids)) if __name__ == "__main__": hadoopy.run( Mapper, jobconfs=[ "mapred.task.timeout=6000000", "mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec", "mapred.compress.map.output=true", "mapred.output.compress=true", "mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec", ], )
            try:
                boxes = scale_boxes[scale]
            except KeyError:
                continue
            size_array = np.array([image.shape[0], image.shape[1], image.shape[0], image.shape[1]])
            for box, fn in boxes:
                box = np.round(size_array * box).astype(np.int)
                print(box)
                if self.type == 'image':
                    image_box = np.ascontiguousarray(image[box[0]:box[2], box[1]:box[3], :])
                    yield fn, imfeat.image_tostring(image_box, 'png')
                elif self.type == 'feature':
                    image_box = np.ascontiguousarray(image[box[0]:box[2], box[1]:box[3], :])
                    yield fn, feature.compute_patch(image_box)
                elif self.type == 'box':
                    image2 = image.copy()
                    cv2.rectangle(image2, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 4)
                    yield fn, imfeat.image_tostring(image2, 'jpg')
                else:
                    raise ValueError(self.type)


if __name__ == '__main__':
    hadoopy.run(Mapper)
import hadoopy
import numpy as np
import cPickle as pickle
import image_search


class Mapper(object):

    def __init__(self):
        self.hasher = pickle.load(open('hasher.pkl'))

    def map(self, name, feature):
        yield 0, (name, self.hasher(feature)[0])


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, name_hashes):
        names, hashes = zip(*name_hashes)
        yield np.array(names), np.ascontiguousarray(hashes)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_files=['hasher.pkl'])
        key: (x, y) bin coordinates between 0 and _bin_count-1
        values: an array of face patches (face_size, face_size), dtype=np.uint8, greyscale

    Yields:
        'mean_x_y.jpg', mean_face: the mean face, as jpeg data
        (x, y), count: the number of faces for bin x, y
    """
    def _image_to_str(img):
        out = StringIO.StringIO()
        img.save(out, 'JPEG')
        out.seek(0)
        return out.read()

    x, y = key
    sum_face = np.zeros((_face_size, _face_size))
    face_counter = 0
    for face in values:
        sum_face += face
        face_counter += 1
    mean_face = (sum_face / face_counter).astype('u1')
    yield ('mean_%d_%d.jpg' % (x, y), _image_to_str(Image.fromarray(mean_face)))
    yield (x, y), face_counter


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import hadoopy

if __name__ == "__main__":
    """
    SELECT (k, v)
        where k = target_column_1+target_column_2+...,+target_column_N,
        where v = target_column_1, ..., target_column_N
    FROM (input dataset)
    WHERE filter_column_1 (not) in [filter_vals_1] and
          filter_column_2 (not) in [filter_vals_2] and ...
    """
    from python_hiveish.mapreduce.mappers import select_where as mapper
    from python_hiveish.mapreduce.reducers import identity_reducer as reducer
    hadoopy.run(mapper, reducer, doc=__doc__)
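# --- Hedged sketch (hypothetical, not the python_hiveish implementation) ---
# The select_where mapper imported above is not shown in this file. This is a
# minimal sketch of a mapper with the SELECT/WHERE semantics described in the
# docstring, assuming each input value is a dict of column -> value; the
# target/filter column names and values below are placeholder assumptions.
def select_where_sketch(key, columns,
                        target_columns=('target_column_1', 'target_column_2'),
                        filter_column='filter_column_1',
                        filter_vals=('a', 'b'),
                        negate=False):
    in_filter = columns.get(filter_column) in filter_vals
    if in_filter != negate:  # WHERE filter_column (not) in [filter_vals]
        k = '+'.join(str(columns[c]) for c in target_columns)
        v = tuple(columns[c] for c in target_columns)
        yield k, v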