def runner():
    """The function that calls hadoopy.run"""
    iter = gopts.getintkey('iter')
    blocksize = gopts.getintkey('blocksize')
    reduce_schedule = gopts.getstrkey('reduce_schedule')
    mapper = NormalEquations(blocksize=blocksize, isreducer=False)
    reducer = NormalEquations(blocksize=blocksize, isreducer=True)
    hadoopy.run(mapper, reducer)
#!/usr/bin/env python
import hadoopy
import picarus_takeout
import os
import picarus
import zlib
import sys


class Mapper(picarus.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._model = zlib.decompress(open(os.environ['MODEL_FN']).read())
        self.job = picarus_takeout.ModelChain(self._model)

    def _map(self, row, input_binary):
        try:
            yield row, self.job.process_binary(input_binary)
        except:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'badRows')
        else:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'goodRows')


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'MODEL_FN'])
        try:
            histogram[clusterid] += 1
        except KeyError:
            histogram[clusterid] = 1

    def configure(self):
        self.clusters = self._load_clusters()

    def map(self, imageid, features):
        """
        Args:
            imageid: An ID that is directly passed to the output
            features: As a list of numpy arrays

        Yields:
            A tuple in the form of (key, value)
            key: imageid
            value: histogram as a dict of (dim, val) (int, int)
        """
        histogram = {}
        for feature in features:
            clusterid = self._nearest_cluster_id(self.clusters, feature)
            self._update_histogram(clusterid, histogram)
        yield imageid, histogram


if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
        inds = (confs >= 0).nonzero()[0]
        hadoopy.counter('STATS', 'num_pos', inds.size)
        hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
        hadoopy.counter('STATS', 'total', confs.size)
        if inds.size:
            self.pyramid[inds, cy, cx] += 1

    def close(self):
        yield 0, (self.pyramid, float(self.num_boxes))


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, pyramid_num_boxes):
        pyramid_out = 0
        num_boxes_out = 0
        for pyramid, num_boxes in pyramid_num_boxes:
            pyramid_out += pyramid
            num_boxes_out += num_boxes
        yield key, (pyramid_out, num_boxes_out)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
                          'mapred.compress.map.output=true',
                          'mapred.output.compress=true',
                          'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'])
    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key,))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.child.java.opts=-Xmx512M'])
    def reduce(self, image_hash, values):
        """
        Args:
            image_hash: (see mapper)
            values: Iterator of values (see mapper)

        Yields:
            A tuple in the form of (image_hash, value)
            image_hash: Image hash
            value: The provided value (not the prediction)
        """
        predictions = None
        out_val = None
        for value in values:
            if isinstance(value, dict):
                predictions = value
            else:
                out_val = value
        if predictions is None or out_val is None:
            hadoopy.counter('DATA_ERR', 'MISSING_PREDICTIONS_OR_DATA')
            return
        label, conf = predictions[self._class_name][0]
        if (self._class_thresh <= label * conf) == (self._output_class == 1):  # Both true or both false
            yield image_hash, out_val


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
        frame = data['frame']
        data['event'] = event_video[0]
        data['video'] = event_video[1]
        pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32)
        num_boxes = 0
        coord = lambda x: int(np.round(x * self.num_bins))
        for (_, box), confs in super(Mapper, self).map(None, frame):
            num_boxes += 1
            cy0, cx0, cy1, cx1 = map(coord, box)
            cy1 += 1
            cx1 += 1
            cell_value = 1. / ((cy1 - cy0) * (cx1 - cx0))
            inds = (confs >= 0).nonzero()[0]
            hadoopy.counter('STATS', 'num_pos', inds.size)
            hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
            hadoopy.counter('STATS', 'total', confs.size)
            if inds.size:
                pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size
        yield data, (pyramid, num_boxes * len(self.ids))


if __name__ == '__main__':
    hadoopy.run(Mapper,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
                          'mapred.compress.map.output=true',
                          'mapred.output.compress=true',
                          'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'])
class Reducer(object):

    def __init__(self):
        self.labels = file_parse.load(os.environ['LOCAL_LABELS_FN'])

    def reduce(self, class_name, label_values):
        """
        Args:
            class_name: (see mapper)
            label_values: Iterator of label_values (see mapper)

        Yields:
            A tuple in the form of (key, value)
            classifier_name: (see mapper)
            classifier: Serialized classifier
        """
        label_values = list(label_values)
        for classifier_name in self.labels['classes'][class_name]['classifiers']:
            print('Starting [%s,%s]' % (class_name, classifier_name))
            classifier_extra = self.labels['classifiers'][classifier_name].get('extra', '')
            classifier = classifiers.train(self.labels['classifiers'][classifier_name]['name'],
                                           classifier_extra, label_values)
            classifier_ser = classifiers.dumps(classifier_name, classifier_extra, classifier)
            yield ' '.join([class_name, classifier_name]), classifier_ser
            print('Ending [%s,%s,%d]' % (class_name, classifier_name, len(classifier_ser)))


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'gray'})
            image_color = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'bgr'})
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
            except TypeError:
                cur_cluster_sum = vec
        yield key, cur_cluster_sum.tostring()

    def close(self):
        super(Combiner, self).close()


class Reducer(profile.ProfileJob):

    def __init__(self):
        super(Reducer, self).__init__()

    def reduce(self, key, values):
        cur_cluster_sum = None
        for vec in values:
            vec = np.fromstring(vec, dtype=np.float32)
            try:
                cur_cluster_sum += vec
            except TypeError:
                cur_cluster_sum = vec
        center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
        yield key, center.tostring()

    def close(self):
        super(Reducer, self).close()


if __name__ == "__main__":
    if hadoopy.run(Mapper, Reducer, Combiner):
        hadoopy.print_doc_quit(__doc__)
import video_raw_features
import video_block_features
import sys
import hadoopy


class Mapper(object):

    def __init__(self):
        self.b = video_block_features.Mapper()
        self.r = video_raw_features.Mapper()

    def map(self, event_filename, video_data):
        hadoopy.counter('CombinedFeatures', 'DontHave')
        sys.stderr.write('%s\n' % str(event_filename))
        for event_filename, features in self.r.map(event_filename, video_data):
            sys.stderr.write('%s\n' % str(event_filename))
            for x in self.b.map(event_filename, features):
                yield x


if __name__ == '__main__':
    hadoopy.run(Mapper, video_block_features.Reducer)
#!/usr/bin/env python
import hadoopy
import vision_data
import os
import sys


class Mapper(object):

    def __init__(self):
        self.flickr = vision_data.Flickr()
        self.max_iters = int(os.environ.get('MAX_ITERS', 1))
        self.max_pages = int(os.environ.get('MAX_PAGES', 1))

    def map(self, num_kvs, query):
        sys.stderr.write('Flickr Query[%s]\n' % query)
        for num, kv in enumerate(self.flickr.image_class_meta_url(query)):
            yield kv
            if num >= num_kvs:
                break


def reducer(key, values):
    yield key, values.next()


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, jobconfs=['mapred.task.timeout=6000000'])
            self.map(key, feat)

    def _random_canopy(self, canopies):
        return np.array(random.sample(canopies, 1))

    def close(self):
        hadoopy.status('%f-%f' % (self.ftime, self.gtime))
        final_canopies = self._random_canopy(self.canopies)
        uncovered_points = True
        while uncovered_points:
            uncovered_points = False
            valid_canopies = []
            for x in self.canopies:
                nearest_dist = self.nn(x, final_canopies)[1]
                if nearest_dist > self.soft_dist:
                    uncovered_points = True
                if nearest_dist > self.hard_dist:
                    valid_canopies.append(x)
            if uncovered_points:
                canopy = self._random_canopy(valid_canopies)
                final_canopies = np.concatenate((final_canopies, canopy))
                self.canopies = valid_canopies
        for canopy in final_canopies:
            yield random.random(), canopy.tostring()
        hadoopy.counter('canopy_cluster', 'run_time', int(time.time() - self.start_time))


if __name__ == "__main__":
    if hadoopy.run(MapReduce, MapReduce):
        hadoopy.print_doc_quit(__doc__)
import hadoopy
import picarus


def mapper(key, value):
    """
    Args:
        key: image_hash
        value: record (see IO docs)

    Yields:
        A tuple in the form of (key, value)
        key: image_hash
        value: binary file data
    """
    try:
        fp = picarus.io._record_to_fp(value)
    except IOError:
        hadoopy.counter('INPUT_ERROR', 'REMOTE_READ_FAILED')
        return
    yield key, fp.read()


if __name__ == '__main__':
    hadoopy.run(mapper)
import os
import cPickle as pickle
import numpy as np
import hadoopy
from hadoopy.pickle import b64dec, b64enc
import simplejson as json


class Mapper(object):

    def __init__(self, io_method):
        self.in_func = {'b64': self.b64, 'json': self.json}[io_method]

    def b64(self, value):
        return np.fromstring(b64dec(value), dtype=np.float32)

    def json(self, value):
        return np.array(json.loads(value), dtype=np.float32)

    def map(self, key, value):
        yield json.dumps(self.in_func(value).tolist())


if __name__ == "__main__":
    try:
        io_method = os.environ["IO_METHOD"]
    except KeyError:
        hadoopy.print_doc_quit(__doc__)
    if hadoopy.run(Mapper(io_method)):
        hadoopy.print_doc_quit(__doc__)
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._feat = picarus.api.model_fromfile(os.environ['FEATURE_FN'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        yield row, picarus.api.np_tostring(self._feat(image))


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'FEATURE_FN'])
    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        dist = self._compute_face_distance(image)
        yield dist, (key, value)


def reducer(key, values):
    """Identity reducer"""
    for value in values:
        yield key, value


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, doc=__doc__)
    def close(self):
        self._compact_heap()
        return self.heap


class Reducer(object):

    def __init__(self, out_count=True):
        self.count = 0
        try:
            self.num_clusters = int(os.environ["NUM_CLUSTERS"])
        except KeyError:
            self.num_clusters = DEFAULT_NUM_CLUSTERS
        self.output = self.yield_count if out_count else self.yield_key

    def yield_count(self, key, value):
        return self.count, value

    def yield_key(self, key, value):
        return key, value

    def reduce(self, key, values):
        for value in values:
            if self.count < self.num_clusters:
                yield self.output(key, value)
                self.count += 1


if __name__ == "__main__":
    hadoopy.run(Mapper, Reducer, Reducer(False), doc=__doc__)
import hadoopy
import random
import os
try:
    import numpy as np
except ImportError:
    pass


class Mapper(object):

    def __init__(self):
        self.alpha = float(os.environ['ALPHA'])

    def map(self, k, v):
        out = random.random()
        if out < self.alpha:
            yield out, (k, v)


def reducer(out, kvs):
    # NOTE(brandyn): The reducer is so that readtb only has to read 1 file
    # and so that they are uniformly distributed
    for kv in kvs:
        yield kv


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, required_cmdenvs=['ALPHA'])
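# --- Hedged usage sketch (not part of the original job) ---
# A minimal driver for the sampling job above, shown only to illustrate how the
# ALPHA cmdenv is supplied. 'random_sample.py' and the in/out paths are
# placeholder names; hadoopy.launch_frozen and hadoopy.readtb are standard
# hadoopy calls, but the exact values here are illustrative assumptions.
import hadoopy

if __name__ == '__main__':
    hadoopy.launch_frozen('in_path', 'out_path', 'random_sample.py',
                          cmdenvs=['ALPHA=0.01'])  # keep roughly 1% of the input kvs
    sampled = list(hadoopy.readtb('out_path'))  # read the sampled (k, v) pairs back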
        clusters = self.clusters[cluster_ids]
        # Find NN using slow metric
        # Extends the array by 1 dim that has a 1. in it
        feat = np.fromstring(feat + '\x00\x00\x80?', dtype=np.float32)
        nearest_ind = self.nn(feat[0:-1], self.clusters)[0]
        try:
            self.out_sums[nearest_ind] += feat
        except KeyError:
            self.out_sums[nearest_ind] = feat

    def close(self):
        for nearest_ind, feat in self.out_sums.iteritems():
            yield nearest_ind, feat.tostring()


def reducer(key, values):
    cur_cluster_sum = None
    for vec in values:
        vec = np.fromstring(vec, dtype=np.float32)
        try:
            cur_cluster_sum += vec
        except TypeError:
            cur_cluster_sum = vec
    center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
    yield key, center.tostring()


if __name__ == "__main__":
    if hadoopy.run(Mapper, reducer):
        hadoopy.print_doc_quit(__doc__)
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import json
import picarus.api
import picarus_takeout


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        classifier = picarus.api.model_fromfile(os.environ['CLASSIFIER_FN'])
        if os.environ['CLASSIFIER_TYPE'] == 'sklearn_decision_func':
            self._classifier = lambda x: repr(float(classifier.decision_function(x).flat[0]))
        elif os.environ['CLASSIFIER_TYPE'] == 'class_distance_list':
            self._classifier = lambda x: json.dumps(classifier(x))
        else:
            raise ValueError('Unknown CLASSIFIER_TYPE=%s' % os.environ['CLASSIFIER_TYPE'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._classifier(feature)


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'CLASSIFIER_FN'])
        path = 'fixtures/haarcascade_frontalface_default.xml'
        if os.path.exists(path):
            self._cascade = cv2.CascadeClassifier(path)
        else:
            raise ValueError("Can't find .xml file!")

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'gray'})
            image_color = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'bgr'})
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self.max_side = int(os.environ['MAX_SIDE'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
            yield row, imfeat.image_tostring(imfeat.resize_image_max_side(image, self.max_side), 'jpg')
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'MAX_SIDE'])
    def map(self, key, value):
        data = value.split('\t')
        if len(data) < 3:
            return
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        if len(ngram) != self.expected_tokens:
            return
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
                 h * image_scale), n) for (x, y, w, h), n in faces]

    def _load_cv_image(self, value):
        return imfeat.convert_image(Image.open(StringIO.StringIO(value)),
                                    [('opencv', 'rgb', 8)])

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = self._detect_faces(image)
        if faces:
            yield key, (value, faces)


if __name__ == "__main__":
    hadoopy.run(Mapper, doc=__doc__)
    def map(self, node_num, data):
        sys.stderr.write('HadoopyRT: NodeNum[%d]\n' % (node_num,))
        flow_controller = hadoopy_rt.FlowControllerNode(self.job_id, self.redis_server, node_num)
        if 'files' in data:
            for f, d in data['files'].items():
                open(f, 'w').write(d)
            data['files'] = list(data['files'])  # Convert to list, removes memory burden
        launch_kw_args = dict((x, data[x]) for x in ['files', 'cmdenvs'] if x in data)
        try:
            launch_kw_args['cmdenvs'] = hadoopy._runner._listeq_to_dict(launch_kw_args['cmdenvs'])
        except KeyError:
            launch_kw_args['cmdenvs'] = {}
        launch_kw_args['cmdenvs']['hadoopy_rt_stream'] = str(node_num)
        launch_kw_args['cmdenvs']['hadoopy_rt_redis'] = self.redis_server
        open(data['script_name'], 'w').write(data['script_data'])
        while True:
            try:
                hadoopy_rt.launch_zmq(flow_controller, data['script_name'],
                                      outputs=data.get('outputs'), **launch_kw_args)
            except Exception, e:
                sys.stderr.write('%s\n' % str(e))
            ps = redis.StrictRedis().pubsub()
            ps.subscribe(data['script_name'])
            for x in ps.listen():
                if x['type'] == 'message':
                    open(data['script_name'], 'w').write(x['data'])
                    break


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['hadoopy_rt_redis', 'job_id'])
        self._assignments = self._load_assignments()

    def _load_assignments(self):
        out = {}  # [image_id] = list of clust_ids
        with open(os.environ['ASSIGNMENTS_FN']) as fp:
            for clust_ind, image_id in pickle.load(fp):
                out.setdefault(image_id, []).append(clust_ind)
        return out

    def map(self, image_id, image_data):
        """Take in an image, if it is one we want then output it

        Args:
            name: unique image id
            image_data: Binary image data

        Yields:
            A tuple in the form of (key, value)
            key: cluster ind
            value: (image_id, image_data)
        """
        try:
            for cluster_ind in self._assignments[image_id]:
                yield cluster_ind, (image_id, image_data)
        except KeyError:
            pass


if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def map(self, key, value):
        data = value.split('\t')
        if len(data) < 3:
            return
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        if len(ngram) != self.expected_tokens:
            return
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
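# --- Hedged worked example (not part of the original job) ---
# Traces one made-up 5-gram record through the key construction used by the
# ngram job above and states the combiner/reducer contract; the input line and
# counts are illustrative assumptions only.
line = 'one small step for man\t1969\t17'
data = line.split('\t')
ngram, year, count = data[0].split(), data[1], int(data[2])
pair = sorted([ngram[0], ngram[len(ngram) - 1]])  # ['man', 'one']
key = pair + [year]                               # mapper key: ['man', 'one', '1969'], count: 17
# combiner(key, [17, 3]) -> yields (['man', 'one', '1969'], 20)
# reducer(key, [20, 5])  -> yields ('man\tone\t1969', '25')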
#!/usr/bin/env python
import hadoopy
import numpy as np


def mapper(key, image_data):
    (tag, hash) = key
    print key
    if tag == 'frame':
        yield hash, image_data


if __name__ == '__main__':
    hadoopy.run(mapper)
    return thresh, score


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key,))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                jobconfs=['mapred.task.timeout=6000000',
                          'mapred.child.java.opts=-Xmx512M'])
#!/usr/bin/python
import hadoopy


def mapper(row, column_families):
    yield 'num_rows', 1


def reducer(key, values):
    yield key, sum(values)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
    def _map(self, row, image_binary):
        try:
            image = Image.open(StringIO.StringIO(image_binary))
            if not hasattr(image, "_getexif"):
                yield row, json.dumps({})
            else:
                image_tags = image._getexif()
                if image_tags is None:
                    yield row, json.dumps({})
                else:
                    yield row, json.dumps(dict((name, base64.b64encode(image_tags[id])
                                                if isinstance(image_tags[id], str) else image_tags[id])
                                               for id, name in TAGS.items()
                                               if id in image_tags))
        except:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "badRows")
        else:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "goodRows")


if __name__ == "__main__":
    hadoopy.run(Mapper, required_cmdenvs=["HBASE_TABLE", "HBASE_OUTPUT_COLUMN"])
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Mapper(object):

    def __init__(self):
        super(Mapper, self).__init__()

    def map(self, key, value):
        for v in value.split():
            yield 1, (v, 1)  # Send all words to 1
            #if v[0] == '#':
            #    yield 2, (v, 1)  # Send all hashtags to 2


if __name__ == '__main__':
    hadoopy.run(Mapper)
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Updater(hadoopy_rt.Updater):

    def __init__(self):
        super(Updater, self).__init__()

    def update(self, key, value, slate):
        slate.set(value)


if __name__ == '__main__':
    hadoopy.run(Updater)
import hadoopy
import hadoopy_hbase
import os
import image_search
import numpy as np
import cPickle as pickle
import picarus.api


class Mapper(object):

    def __init__(self):
        self._hbase_input_column = os.environ['HBASE_INPUT_COLUMN'].split(':')
        self._hbase_output_row = os.environ['HBASE_OUTPUT_ROW']

    def map(self, row, columns):
        yield self._hbase_output_row, columns[self._hbase_input_column[0]][self._hbase_input_column[1]]


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])
        self._hbase = hadoopy_hbase.HBaseRowDict(os.environ['HBASE_OUTPUT_TABLE'],
                                                 os.environ['HBASE_OUTPUT_COLUMN'])

    def reduce(self, row, features):
        hasher = image_search.RRMedianHasher(self.hash_bits, normalize_features=False)
        self._hbase[row] = pickle.dumps(hasher.train([picarus.api.np_fromstring(x) for x in features]), -1)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer,
                required_cmdenvs=['HASH_BITS', 'HBASE_INPUT_COLUMN', 'HBASE_OUTPUT_ROW',
                                  'HBASE_OUTPUT_TABLE', 'HBASE_OUTPUT_COLUMN'])
import hadoopy
import os
import random
import image_search


# TODO: Put this mapper in Hadoopy helper
class Mapper(object):

    def __init__(self):
        self.kv_prob = float(os.environ['KV_PROB'])

    def map(self, k, v):
        if random.random() < self.kv_prob:
            yield 0, (k, v)


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])

    def reduce(self, key, id_feats):
        yield key, image_search.RRMedianHasher(self.hash_bits, normalize_features=False).train([x for _, x in id_feats])


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_cmdenvs=['KV_PROB', 'HASH_BITS'])
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._hasher = picarus.api.model_fromfile(os.environ['HASHER_FN'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._hasher(feature).tostring()


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE',
                                          'HBASE_OUTPUT_COLUMN', 'HASHER_FN'])
data["video"] = event_video[1] pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32) num_boxes = 0 coord = lambda x: int(np.round(x * self.num_bins)) for (_, box), confs in super(Mapper, self).map(None, frame): num_boxes += 1 cy0, cx0, cy1, cx1 = map(coord, box) cy1 += 1 cx1 += 1 cell_value = 1.0 / ((cy1 - cy0) * (cx1 - cx0)) inds = (confs >= 0).nonzero()[0] hadoopy.counter("STATS", "num_pos", inds.size) hadoopy.counter("STATS", "num_neg", confs.size - inds.size) hadoopy.counter("STATS", "total", confs.size) if inds.size: pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size yield data, (pyramid, num_boxes * len(self.ids)) if __name__ == "__main__": hadoopy.run( Mapper, jobconfs=[ "mapred.task.timeout=6000000", "mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec", "mapred.compress.map.output=true", "mapred.output.compress=true", "mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec", ], )
            try:
                boxes = scale_boxes[scale]
            except KeyError:
                continue
            size_array = np.array([image.shape[0], image.shape[1], image.shape[0], image.shape[1]])
            for box, fn in boxes:
                box = np.round(size_array * box).astype(np.int)
                print(box)
                if self.type == 'image':
                    image_box = np.ascontiguousarray(image[box[0]:box[2], box[1]:box[3], :])
                    yield fn, imfeat.image_tostring(image_box, 'png')
                elif self.type == 'feature':
                    image_box = np.ascontiguousarray(image[box[0]:box[2], box[1]:box[3], :])
                    yield fn, feature.compute_patch(image_box)
                elif self.type == 'box':
                    image2 = image.copy()
                    cv2.rectangle(image2, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 4)
                    yield fn, imfeat.image_tostring(image2, 'jpg')
                else:
                    raise ValueError(self.type)


if __name__ == '__main__':
    hadoopy.run(Mapper)
import hadoopy
import numpy as np
import cPickle as pickle
import image_search


class Mapper(object):

    def __init__(self):
        self.hasher = pickle.load(open('hasher.pkl'))

    def map(self, name, feature):
        yield 0, (name, self.hasher(feature)[0])


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, name_hashes):
        names, hashes = zip(*name_hashes)
        yield np.array(names), np.ascontiguousarray(hashes)


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_files=['hasher.pkl'])
        key: (x, y) bin coordinates between 0 and _bin_count-1
        values: an array of face patches (face_size, face_size), dtype=np.uint8, greyscale

    Yields:
        'mean_x_y.jpg', mean_face: the mean face, as jpeg data
        (x, y), count: the number of faces for bin x, y
    """
    def _image_to_str(img):
        out = StringIO.StringIO()
        img.save(out, 'JPEG')
        out.seek(0)
        return out.read()

    x, y = key
    sum_face = np.zeros((_face_size, _face_size))
    face_counter = 0
    for face in values:
        sum_face += face
        face_counter += 1
    mean_face = (sum_face / face_counter).astype('u1')
    yield ('mean_%d_%d.jpg' % (x, y), _image_to_str(Image.fromarray(mean_face)))
    yield (x, y), face_counter


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import hadoopy

if __name__ == "__main__":
    """
    SELECT (k, v)
        where k = target_column_1+target_column_2+...,+target_column_N,
        where v = target_column_1, ..., target_column_N
    FROM (input dataset)
    WHERE filter_column_1 (not) in [filter_vals_1] and
          filter_column_2 (not) in [filter_vals_2] and ...
    """
    from python_hiveish.mapreduce.mappers import select_where as mapper
    from python_hiveish.mapreduce.reducers import identity_reducer as reducer
    hadoopy.run(mapper, reducer, doc=__doc__)
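# --- Hedged sketch (hypothetical, not the python_hiveish implementation) ---
# The select_where mapper imported above is not shown in this file. This is a
# minimal sketch of a mapper with the SELECT/WHERE semantics described in the
# docstring, assuming each input value is a dict of column -> value; the
# target/filter column names and values below are placeholder assumptions.
def select_where_sketch(key, columns,
                        target_columns=('target_column_1', 'target_column_2'),
                        filter_column='filter_column_1',
                        filter_vals=('a', 'b'),
                        negate=False):
    in_filter = columns.get(filter_column) in filter_vals
    if in_filter != negate:  # WHERE filter_column (not) in [filter_vals]
        k = '+'.join(str(columns[c]) for c in target_columns)
        v = tuple(columns[c] for c in target_columns)
        yield k, v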