Code example #1
def runner():
    """ The function that calls haoodpy.run """
    iter = gopts.getintkey('iter')
    blocksize = gopts.getintkey('blocksize')
    reduce_schedule = gopts.getstrkey('reduce_schedule')

    mapper = NormalEquations(blocksize=blocksize, isreducer=False)
    reducer = NormalEquations(blocksize=blocksize, isreducer=True)

    hadoopy.run(mapper, reducer)
Code example #2
File: normal.py  Project: dgleich/mrtsqr
def runner():
    """ The function that calls haoodpy.run """
    iter = gopts.getintkey('iter')
    blocksize = gopts.getintkey('blocksize')
    reduce_schedule = gopts.getstrkey('reduce_schedule')
    
    mapper = NormalEquations(blocksize=blocksize, isreducer=False)
    reducer = NormalEquations(blocksize=blocksize, isreducer=True)

    hadoopy.run(mapper, reducer)
Code example #3
#!/usr/bin/env python
import hadoopy
import picarus_takeout
import os
import picarus
import zlib
import sys


class Mapper(picarus.HBaseMapper):
    def __init__(self):
        super(Mapper, self).__init__()
        self._model = zlib.decompress(open(os.environ['MODEL_FN']).read())
        self.job = picarus_takeout.ModelChain(self._model)

    def _map(self, row, input_binary):
        try:
            yield row, self.job.process_binary(input_binary)
        except:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'badRows')
        else:
            sys.stdout.flush()
            hadoopy.counter('STATUS', 'goodRows')


if __name__ == '__main__':
    hadoopy.run(
        Mapper,
        required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'MODEL_FN'])
Code example #4
        try:
            histogram[clusterid] += 1
        except KeyError:
            histogram[clusterid] = 1

    def configure(self):
        self.clusters = self._load_clusters()

    def map(self, imageid, features):
        """

        Args:
            imageid: An ID that is directly passed to the output
            features: A list of numpy arrays

        Yields:
            A tuple in the form of (key, value)
            key: imageid
            value: histogram as a dict of (dim, val) (int, int)
        """
        histogram = {}
        for feature in features:
            clusterid = self._nearest_cluster_id(self.clusters, feature)
            self._update_histogram(clusterid, histogram)
        yield imageid, histogram


if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
Code example #5
            inds = (confs >= 0).nonzero()[0]
            hadoopy.counter('STATS', 'num_pos', inds.size)
            hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
            hadoopy.counter('STATS', 'total', confs.size)
            if inds.size:
                self.pyramid[inds, cy, cx] += 1

    def close(self):
        yield 0, (self.pyramid, float(self.num_boxes))


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, pyramid_num_boxes):
        pyramid_out = 0
        num_boxes_out = 0
        for pyramid, num_boxes in pyramid_num_boxes:
            pyramid_out += pyramid
            num_boxes_out += num_boxes
        yield key, (pyramid_out, num_boxes_out)

if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, jobconfs=['mapred.task.timeout=6000000',
                                           'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
                                           'mapred.compress.map.output=true',
                                           'mapred.output.compress=true',
                                           'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'])
Code example #6
    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key, ))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)


if __name__ == '__main__':
    hadoopy.run(Mapper,
                Reducer,
                jobconfs=[
                    'mapred.task.timeout=6000000',
                    'mapred.child.java.opts=-Xmx512M'
                ])
Code example #7
    def reduce(self, image_hash, values):
        """

        Args:
            image_hash: (see mapper)
            values: Iterator of values (see mapper)

        Yields:
            A tuple in the form of (image_hash, value)
            image_hash: Image hash
            value: The provided value (not the prediction)
        """
        predictions = None
        out_val = None
        for value in values:
            if isinstance(value, dict):
                predictions = value
            else:
                out_val = value
        if predictions is None or out_val is None:
            hadoopy.counter('DATA_ERR', 'MISSING_PREDICTIONS_OR_DATA')
            return
        label, conf = predictions[self._class_name][0]
        if (self._class_thresh <= label * conf) == (
                self._output_class == 1):  # Both true or both false
            yield image_hash, out_val


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
Code example #8
        frame = data['frame']
        data['event'] = event_video[0]
        data['video'] = event_video[1]
        pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32)
        num_boxes = 0
        coord = lambda x: int(np.round(x * self.num_bins))
        for (_, box), confs in super(Mapper, self).map(None, frame):
            num_boxes += 1
            cy0, cx0, cy1, cx1 = map(coord, box)
            cy1 += 1
            cx1 += 1
            cell_value = 1. / ((cy1 - cy0) * (cx1 - cx0))
            inds = (confs >= 0).nonzero()[0]
            hadoopy.counter('STATS', 'num_pos', inds.size)
            hadoopy.counter('STATS', 'num_neg', confs.size - inds.size)
            hadoopy.counter('STATS', 'total', confs.size)
            if inds.size:
                pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size
        yield data, (pyramid, num_boxes * len(self.ids))


if __name__ == '__main__':
    hadoopy.run(
        Mapper,
        jobconfs=[
            'mapred.task.timeout=6000000',
            'mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec',
            'mapred.compress.map.output=true', 'mapred.output.compress=true',
            'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'
        ])
Code example #9
class Reducer(object):

    def __init__(self):
        self.labels = file_parse.load(os.environ['LOCAL_LABELS_FN'])

    def reduce(self, class_name, label_values):
        """

        Args:
            class_name: (see mapper)
            label_values: Iterator of label_values (see mapper)

        Yields:
            A tuple in the form of (key, value)
            classifier_name: (see mapper)
            classifier: Serialized classifier
        """
        label_values = list(label_values)
        for classifier_name in self.labels['classes'][class_name]['classifiers']:
            print('Starting [%s,%s]' % (class_name, classifier_name))
            classifier_extra = self.labels['classifiers'][classifier_name].get('extra', '')
            classifier = classifiers.train(self.labels['classifiers'][classifier_name]['name'], classifier_extra, label_values)
            classifier_ser = classifiers.dumps(classifier_name, classifier_extra, classifier)
            yield ' '.join([class_name, classifier_name]), classifier_ser
            print('Ending [%s,%s,%d]' % (class_name, classifier_name, len(classifier_ser)))


if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer)
Code example #10
File: face_finder.py  Project: wsxiaoys/hadoopy
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {
                'type': 'numpy',
                'dtype': 'uint8',
                'mode': 'gray'
            })
            image_color = imfeat.image_fromstring(value, {
                'type': 'numpy',
                'dtype': 'uint8',
                'mode': 'bgr'
            })
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(
                image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
Code example #11
            except TypeError:
                cur_cluster_sum = vec
        yield key, cur_cluster_sum.tostring()
    
    def close(self):
        super(Combiner, self).close()


class Reducer(profile.ProfileJob):
    def __init__(self):
        super(Reducer, self).__init__()

    def reduce(self, key, values):
        cur_cluster_sum = None
        for vec in values:
            vec = np.fromstring(vec, dtype=np.float32)
            try:
                cur_cluster_sum += vec
            except TypeError:
                cur_cluster_sum = vec
        center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
        yield key, center.tostring()
    
    def close(self):
        super(Reducer, self).close()


if __name__ == "__main__":
    if hadoopy.run(Mapper, Reducer, Combiner):
        hadoopy.print_doc_quit(__doc__)
Code example #12
import video_raw_features
import video_block_features
import sys
import hadoopy


class Mapper(object):

    def __init__(self):
        self.b = video_block_features.Mapper()
        self.r = video_raw_features.Mapper()

    def map(self, event_filename, video_data):
        hadoopy.counter('CombinedFeatures', 'DontHave')
        sys.stderr.write('%s\n' % str(event_filename))
        for event_filename, features in self.r.map(event_filename, video_data):
            sys.stderr.write('%s\n' % str(event_filename))
            for x in self.b.map(event_filename, features):
                yield x


if __name__ == '__main__':
    hadoopy.run(Mapper, video_block_features.Reducer)
Code example #13
#!/usr/bin/env python
import hadoopy
import vision_data
import os
import sys


class Mapper(object):
    def __init__(self):
        self.flickr = vision_data.Flickr()
        self.max_iters = int(os.environ.get('MAX_ITERS', 1))
        self.max_pages = int(os.environ.get('MAX_PAGES', 1))

    def map(self, num_kvs, query):
        sys.stderr.write('Flickr Query[%s]\n' % query)
        for num, kv in enumerate(self.flickr.image_class_meta_url(query)):
            yield kv
            if num >= num_kvs:
                break


def reducer(key, values):
    yield key, values.next()


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, jobconfs=['mapred.task.timeout=6000000'])
Code example #14
            self.map(key, feat)

    def _random_canopy(self, canopies):
        return np.array(random.sample(canopies, 1))

    def close(self):
        hadoopy.status('%f-%f' % (self.ftime, self.gtime))
        final_canopies = self._random_canopy(self.canopies)
        uncovered_points = True
        while uncovered_points:
            uncovered_points = False
            valid_canopies = []
            for x in self.canopies:
                nearest_dist = self.nn(x, final_canopies)[1]
                if nearest_dist > self.soft_dist:
                    uncovered_points = True
                if nearest_dist > self.hard_dist:
                    valid_canopies.append(x)
            if uncovered_points:
                canopy = self._random_canopy(valid_canopies)
                final_canopies = np.concatenate((final_canopies, canopy))
                self.canopies = valid_canopies
        for canopy in final_canopies:
            yield random.random(), canopy.tostring()
        hadoopy.counter('canopy_cluster', 'run_time', int(time.time() - self.start_time))


if __name__ == "__main__":
    if hadoopy.run(MapReduce, MapReduce):
        hadoopy.print_doc_quit(__doc__)
Code example #15
import hadoopy
import picarus


def mapper(key, value):
    """

    Args:
        key: image_hash
        value: record (see IO docs)

    Yields:
        A tuple in the form of (key, value)
        key: image_hash
        value: binary file data
    """
    try:
        fp = picarus.io._record_to_fp(value)
    except IOError:
        hadoopy.counter('INPUT_ERROR', 'REMOTE_READ_FAILED')
        return
    yield key, fp.read()


if __name__ == '__main__':
    hadoopy.run(mapper)
Code example #16
File: dump_data.py  Project: bwhite/hadoop_clustering
import cPickle as pickle
import os

import numpy as np

import hadoopy
from hadoopy.pickle import b64dec, b64enc
import simplejson as json


class Mapper(object):
    def __init__(self, io_method):
        self.in_func = {'b64': self.b64, 'json': self.json}[io_method]

    def b64(self, value):
        return np.fromstring(b64dec(value), dtype=np.float32)

    def json(self, value):
        return np.array(json.loads(value), dtype=np.float32)
    
    def map(self, key, value):
        yield json.dumps(self.in_func(value).tolist())


if __name__ == "__main__":
    try:
        io_method = os.environ["IO_METHOD"]
    except KeyError:
        hadoopy.print_doc_quit(__doc__)
    if hadoopy.run(Mapper(io_method)):
        hadoopy.print_doc_quit(__doc__)
Code example #17
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):
    def __init__(self):
        super(Mapper, self).__init__()
        self._feat = picarus.api.model_fromfile(os.environ['FEATURE_FN'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        yield row, picarus.api.np_tostring(self._feat(image))


if __name__ == '__main__':
    hadoopy.run(Mapper,
                required_cmdenvs=[
                    'HBASE_INPUT_COLUMN', 'HBASE_TABLE', 'HBASE_OUTPUT_COLUMN',
                    'FEATURE_FN'
                ])
Code example #18
File: face_ranker.py  Project: jonstewart/picarus
    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        dist = self._compute_face_distance(image)
        yield dist, (key, value)


def reducer(key, values):
    """Identity reducer"""
    for value in values:
        yield key, value


if __name__ == "__main__":
    hadoopy.run(Mapper, reducer, doc=__doc__)
Code example #19
    def close(self):
        self._compact_heap()
        return self.heap


class Reducer(object):
    def __init__(self, out_count=True):
        self.count = 0
        try:
            self.num_clusters = int(os.environ["NUM_CLUSTERS"])
        except KeyError:
            self.num_clusters = DEFAULT_NUM_CLUSTERS
        self.output = self.yield_count if out_count else self.yield_key

    def yield_count(self, key, value):
        return self.count, value

    def yield_key(self, key, value):
        return key, value

    def reduce(self, key, values):
        for value in values:
            if self.count < self.num_clusters:
                yield self.output(key, value)
                self.count += 1


if __name__ == "__main__":
    hadoopy.run(Mapper, Reducer, Reducer(False), doc=__doc__)
Code example #20
import hadoopy
import random
import os
try:
    import numpy as np
except ImportError:
    pass


class Mapper(object):

    def __init__(self):
        self.alpha = float(os.environ['ALPHA'])

    def map(self, k, v):
        out = random.random()
        if out < self.alpha:
            yield out, (k, v)


def reducer(out, kvs):
    # NOTE(brandyn): The reducer is so that readtb only has to read 1 file
    # and so that they are uniformly distributed
    for kv in kvs:
        yield kv


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, required_cmdenvs=['ALPHA'])
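
The sampling job above is configured entirely through the ALPHA environment variable, so a driver has to supply it at launch time. Below is a minimal driver sketch (my own, not part of the project above), assuming hadoopy's standard helpers writetb, launch_frozen, and readtb; the HDFS paths and the script filename random_sample.py are illustrative placeholders.

import hadoopy

input_path = 'tmp/sample_demo_input'    # hypothetical HDFS path
output_path = 'tmp/sample_demo_output'  # hypothetical HDFS path

# Write a few (key, value) pairs as a SequenceFile the job can consume.
hadoopy.writetb(input_path, enumerate(['a', 'b', 'c', 'd']))

# Launch the job (assumed to be saved as random_sample.py), passing ALPHA
# through cmdenvs so that required_cmdenvs=['ALPHA'] is satisfied.
hadoopy.launch_frozen(input_path, output_path, 'random_sample.py',
                      cmdenvs=['ALPHA=0.5'])

# readtb lazily yields the sampled (key, value) pairs back to the driver,
# which is why the reducer above funnels everything into one output file.
for key, value in hadoopy.readtb(output_path):
    print key, value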
Code example #21
        clusters = self.clusters[cluster_ids]

        # Find NN using slow metric
        # Extends the array by 1 dim that has a 1. in it
        feat = np.fromstring(feat + '\x00\x00\x80?', dtype=np.float32)
        nearest_ind = self.nn(feat[0:-1], self.clusters)[0]
        try:
            self.out_sums[nearest_ind] += feat
        except KeyError:
            self.out_sums[nearest_ind] = feat

    def close(self):
        for nearest_ind, feat in self.out_sums.iteritems():
            yield nearest_ind, feat.tostring()

def reducer(key, values):
    cur_cluster_sum = None
    for vec in values:
        vec = np.fromstring(vec, dtype=np.float32)
        try:
            cur_cluster_sum += vec
        except TypeError:
            cur_cluster_sum = vec
    center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
    yield key, center.tostring()


if __name__ == "__main__":
    if hadoopy.run(Mapper, reducer):
        hadoopy.print_doc_quit(__doc__)
Code example #22
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import json
import picarus.api
import picarus_takeout


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        classifier = picarus.api.model_fromfile(os.environ['CLASSIFIER_FN'])
        if os.environ['CLASSIFIER_TYPE'] == 'sklearn_decision_func':
            self._classifier = lambda x: repr(float(classifier.decision_function(x).flat[0]))
        elif os.environ['CLASSIFIER_TYPE'] == 'class_distance_list':
            self._classifier = lambda x: json.dumps(classifier(x))
        else:
            raise ValueError('Unknown CLASSIFIER_TYPE=%s' % os.environ['CLASSIFIER_TYPE'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._classifier(feature)


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'CLASSIFIER_FN'])
Code example #23
File: face_finder.py  Project: Jeffliu/hadoopy
            path = ('fixtures/haarcascade_frontalface_default.xml')
            if os.path.exists(path):
                self._cascade = cv2.CascadeClassifier(path)
            else:
                raise ValueError("Can't find .xml file!")

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: (Image name, (x, y, w, h))
            value: face image (.png)
        """
        try:
            image = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'gray'})
            image_color = imfeat.image_fromstring(value, {'type': 'numpy', 'dtype': 'uint8', 'mode': 'bgr'})
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = _detect_faces(image, self._cascade)
        for x, y, w, h in faces:
            yield (key, (x, y, w, h)), imfeat.image_tostring(image_color[y:y + h, x:x + w, :], '.png')


if __name__ == "__main__":
    hadoopy.run(Mapper, required_files=['haarcascade_frontalface_default.xml'])
Code example #24
#!/usr/bin/env python
import hadoopy
import imfeat
import os
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self.max_side = int(os.environ['MAX_SIDE'])

    def _map(self, row, image_binary):
        try:
            image = imfeat.image_fromstring(image_binary)
            yield row, imfeat.image_tostring(imfeat.resize_image_max_side(image, self.max_side), 'jpg')
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'MAX_SIDE'])
Code example #25
    def map(self, key, value):
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
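
As a quick local sanity check (my own, not from the project), the key emitted by the mapper above is a [first_token, last_token, year] list and the values are partial counts, so the reducer collapses them into one tab-separated line per key:

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))

# Feeding a sample key and an iterator of partial counts:
print list(reducer(['cat', 'dog', '1999'], iter([3, 4, 5])))
# [('cat\tdog\t1999', '12')]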
Code example #26
File: face_finder.py  Project: wsxiaoys/hadoopy
                  h * image_scale), n) for (x, y, w, h), n in faces]

    def _load_cv_image(self, value):
        return imfeat.convert_image(Image.open(StringIO.StringIO(value)),
                                    [('opencv', 'rgb', 8)])

    def map(self, key, value):
        """
        Args:
            key: Image name
            value: Image as jpeg byte data

        Yields:
            A tuple in the form of (key, value)
            key: Image name
            value: (image, faces) where image is the input value and faces is
                a list of ((x, y, w, h), n)
        """
        try:
            image = self._load_cv_image(value)
        except:
            hadoopy.counter('DATA_ERRORS', 'ImageLoadError')
            return
        faces = self._detect_faces(image)
        if faces:
            yield key, (value, faces)


if __name__ == "__main__":
    hadoopy.run(Mapper, doc=__doc__)
Code example #27
File: hadoopy_rt_job.py  Project: bwhite/hadoopy_rt
    def map(self, node_num, data):
        sys.stderr.write('HadoopyRT: NodeNum[%d]\n' % (node_num,))
        flow_controller = hadoopy_rt.FlowControllerNode(self.job_id, self.redis_server, node_num)
        if 'files' in data:
            for f, d in data['files'].items():
                open(f, 'w').write(d)
            data['files'] = list(data['files'])  # Convert to list, removes memory burden
        launch_kw_args = dict((x, data[x]) for x in ['files', 'cmdenvs'] if x in data)
        try:
            launch_kw_args['cmdenvs'] = hadoopy._runner._listeq_to_dict(launch_kw_args['cmdenvs'])
        except KeyError:
            launch_kw_args['cmdenvs'] = {}
        launch_kw_args['cmdenvs']['hadoopy_rt_stream'] = str(node_num)
        launch_kw_args['cmdenvs']['hadoopy_rt_redis'] = self.redis_server
        open(data['script_name'], 'w').write(data['script_data'])
        while True:
            try:
                hadoopy_rt.launch_zmq(flow_controller, data['script_name'], outputs=data.get('outputs'), **launch_kw_args)
            except Exception, e:
                sys.stderr.write('%s\n' % str(e))
            ps = redis.StrictRedis().pubsub()
            ps.subscribe(data['script_name'])
            for x in ps.listen():
                if x['type'] == 'message':
                    open(data['script_name'], 'w').write(x['data'])
                    break

if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['hadoopy_rt_redis', 'job_id'])
Code example #28
File: filter_samples.py  Project: jonstewart/picarus
        self._assignments = self._load_assignments()

    def _load_assignments(self):
        out = {}  # [image_id] = list of clust_ids
        with open(os.environ['ASSIGNMENTS_FN']) as fp:
            for clust_ind, image_id in pickle.load(fp):
                out.setdefault(image_id, []).append(clust_ind)
        return out

    def map(self, image_id, image_data):
        """Take in an image, if it is one we want then output it

        Args:
            name: unique image id
            image_data: Binary image data

        Yields:
            A tuple in the form of (key, value)
            key: cluster ind
            value: (image_id, image_data)
        """
        try:
            for cluster_ind in self._assignments[image_id]:
                yield cluster_ind, (image_id, image_data)
        except KeyError:
            pass

if __name__ == "__main__":
    if hadoopy.run(Mapper):
        hadoopy.print_doc_quit(__doc__)
Code example #29
File: ngrams.py  Project: abeusher/python-ngrams
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
    
    def map(self, key, value):
        data = value.split('\t')
        
        if len(data) < 3:
            return
        
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        
        if len(ngram) != self.expected_tokens:
            return
        
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        
        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)
Code example #30
#!/usr/bin/env python
import hadoopy
import numpy as np


def mapper(key, image_data):
    (tag, hash) = key
    print key
    if tag == 'frame':
        yield hash, image_data


if __name__ == '__main__':
    hadoopy.run(mapper)
Code example #31
    return thresh, score


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, values):
        # Setup data
        # TODO(brandyn): Use multi-file join pattern
        data = [None, None, None]
        for input_type, value in values:
            data[input_type] = value
        if len([x for x in data if x is None]) != 0:
            raise ValueError('Reducer did not get all necessary parts!')
        exemplar, pos, neg = data
        # Compute threshold and output new exemplar
        try:
            thresh, score = fpr_threshold(pos, neg)
        except BadExemplar:
            print('Bad exemplar[%s]' % (key,))
            return
        print('Good exemplar[%s][%f]' % (key, thresh))
        key[2] = score
        yield key, (exemplar[0], exemplar[1] - thresh)
        

if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, jobconfs=['mapred.task.timeout=6000000', 'mapred.child.java.opts=-Xmx512M'])
Code example #32
#!/usr/bin/python
import hadoopy


def mapper(row, column_families):
    yield 'num_rows', 1

def reducer(key, values):
    yield key, sum(values)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
Code example #33
File: image_exif.py  Project: William-Wai/picarus
    def _map(self, row, image_binary):
        try:
            image = Image.open(StringIO.StringIO(image_binary))
            if not hasattr(image, "_getexif"):
                yield row, json.dumps({})
            else:
                image_tags = image._getexif()
                if image_tags is None:
                    yield row, json.dumps({})
                else:
                    yield row, json.dumps(
                        dict(
                            (name,
                             base64.b64encode(image_tags[id])
                             if isinstance(image_tags[id], str)
                             else image_tags[id])
                            for id, name in TAGS.items()
                            if id in image_tags
                        )
                    )
        except:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "badRows")
        else:
            sys.stdout.flush()
            hadoopy.counter("STATUS", "goodRows")


if __name__ == "__main__":
    hadoopy.run(Mapper, required_cmdenvs=["HBASE_TABLE", "HBASE_OUTPUT_COLUMN"])
Code example #34
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Mapper(object):

    def __init__(self):
        super(Mapper, self).__init__()

    def map(self, key, value):
        for v in value.split():
            yield 1, (v, 1)  # Send all words to 1
            #if v[0] == '#':
            #    yield 2, (v, 1)  # Send all hashtags to 2

if __name__ == '__main__':
    hadoopy.run(Mapper)
Code example #35
#!/usr/bin/env python
import hadoopy
import hadoopy_rt


class Updater(hadoopy_rt.Updater):

    def __init__(self):
        super(Updater, self).__init__()

    def update(self, key, value, slate):
        slate.set(value)


if __name__ == '__main__':
    hadoopy.run(Updater)
Code example #36
import hadoopy
import hadoopy_hbase
import os
import image_search
import numpy as np
import cPickle as pickle
import picarus.api


class Mapper(object):

    def __init__(self):
        self._hbase_input_column = os.environ['HBASE_INPUT_COLUMN'].split(':')
        self._hbase_output_row = os.environ['HBASE_OUTPUT_ROW']

    def map(self, row, columns):
        yield self._hbase_output_row, columns[self._hbase_input_column[0]][self._hbase_input_column[1]]


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])
        self._hbase = hadoopy_hbase.HBaseRowDict(os.environ['HBASE_OUTPUT_TABLE'], os.environ['HBASE_OUTPUT_COLUMN'])

    def reduce(self, row, features):
        self._hbase[row] = pickle.dumps(image_search.RRMedianHasher(self.hash_bits, normalize_features=False).train([picarus.api.np_fromstring(x) for x in features]), -1)

if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_cmdenvs=['HASH_BITS', 'HBASE_INPUT_COLUMN', 'HBASE_OUTPUT_ROW', 'HBASE_OUTPUT_TABLE', 'HBASE_OUTPUT_COLUMN'])
Code example #37
import hadoopy
import os
import random
import image_search


# TODO: Put this mapper in Hadoopy helper
class Mapper(object):

    def __init__(self):
        self.kv_prob = float(os.environ['KV_PROB'])

    def map(self, k, v):
        if random.random() < self.kv_prob:
            yield 0, (k, v)


class Reducer(object):

    def __init__(self):
        self.hash_bits = int(os.environ['HASH_BITS'])

    def reduce(self, key, id_feats):
        yield key, image_search.RRMedianHasher(self.hash_bits, normalize_features=False).train([x for _, x in id_feats])

if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_cmdenvs=['KV_PROB', 'HASH_BITS'])
Code example #38
#!/usr/bin/env python
import hadoopy
import os
import numpy as np
import picarus.api


class Mapper(picarus.api.HBaseMapper):

    def __init__(self):
        super(Mapper, self).__init__()
        self._hasher = picarus.api.model_fromfile(os.environ['HASHER_FN'])

    def _map(self, row, feature_binary):
        feature = picarus.api.np_fromstring(feature_binary)
        yield row, self._hasher(feature).tostring()


if __name__ == '__main__':
    hadoopy.run(Mapper, required_cmdenvs=['HBASE_INPUT_COLUMN', 'HBASE_TABLE', 'HBASE_OUTPUT_COLUMN', 'HASHER_FN'])
Code example #39
        data["video"] = event_video[1]
        pyramid = np.zeros((self.num_bins, self.num_bins), dtype=np.int32)
        num_boxes = 0
        coord = lambda x: int(np.round(x * self.num_bins))
        for (_, box), confs in super(Mapper, self).map(None, frame):
            num_boxes += 1
            cy0, cx0, cy1, cx1 = map(coord, box)
            cy1 += 1
            cx1 += 1
            cell_value = 1.0 / ((cy1 - cy0) * (cx1 - cx0))
            inds = (confs >= 0).nonzero()[0]
            hadoopy.counter("STATS", "num_pos", inds.size)
            hadoopy.counter("STATS", "num_neg", confs.size - inds.size)
            hadoopy.counter("STATS", "total", confs.size)
            if inds.size:
                pyramid[cy0:cy1, cx0:cx1] += cell_value * inds.size
        yield data, (pyramid, num_boxes * len(self.ids))


if __name__ == "__main__":
    hadoopy.run(
        Mapper,
        jobconfs=[
            "mapred.task.timeout=6000000",
            "mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec",
            "mapred.compress.map.output=true",
            "mapred.output.compress=true",
            "mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec",
        ],
    )
Code example #40
                try:
                    boxes = scale_boxes[scale]
                except KeyError:
                    continue
                size_array = np.array([
                    image.shape[0], image.shape[1], image.shape[0],
                    image.shape[1]
                ])
                for box, fn in boxes:
                    box = np.round(size_array * box).astype(np.int)
                    print(box)
                    if self.type == 'image':
                        image_box = np.ascontiguousarray(
                            image[box[0]:box[2], box[1]:box[3], :])
                        yield fn, imfeat.image_tostring(image_box, 'png')
                    elif self.type == 'feature':
                        image_box = np.ascontiguousarray(
                            image[box[0]:box[2], box[1]:box[3], :])
                        yield fn, feature.compute_patch(image_box)
                    elif self.type == 'box':
                        image2 = image.copy()
                        cv2.rectangle(image2, (box[1], box[0]),
                                      (box[3], box[2]), (0, 255, 0), 4)
                        yield fn, imfeat.image_tostring(image2, 'jpg')
                    else:
                        raise ValueError(self.type)


if __name__ == '__main__':
    hadoopy.run(Mapper)
Code example #41
import hadoopy
import numpy as np
import cPickle as pickle
import image_search


class Mapper(object):

    def __init__(self):
        self.hasher = pickle.load(open('hasher.pkl'))

    def map(self, name, feature):
        yield 0, (name, self.hasher(feature)[0])


class Reducer(object):

    def __init__(self):
        pass

    def reduce(self, key, name_hashes):
        names, hashes = zip(*name_hashes)
        yield np.array(names), np.ascontiguousarray(hashes)

if __name__ == '__main__':
    hadoopy.run(Mapper, Reducer, required_files=['hasher.pkl'])
Code example #42
File: face_job.py  Project: amiller/hadoopy-picnic
        key: (x, y) bin coordinates between 0 and _bin_count-1
        values: an array of face patches (face_size, face_size),
                dtype=np.uint8, greyscale
    Yields:
        'mean_x_y.jpg', mean_face: the mean face, as jpeg data
        (x,y), count: the number of faces for bin x,y
    """
    def _image_to_str(img):
        out = StringIO.StringIO()
        img.save(out, 'JPEG')
        out.seek(0)
        return out.read()

    x, y = key

    sum_face = np.zeros((_face_size, _face_size))
    face_counter = 0

    for face in values:
        sum_face += face
        face_counter += 1

    mean_face = (sum_face/face_counter).astype('u1')
    yield ('mean_%d_%d.jpg' % (x, y),
           _image_to_str(Image.fromarray(mean_face)))
    yield (x, y), face_counter


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)
Code example #43
import video_raw_features
import video_block_features
import sys
import hadoopy


class Mapper(object):
    def __init__(self):
        self.b = video_block_features.Mapper()
        self.r = video_raw_features.Mapper()

    def map(self, event_filename, video_data):
        hadoopy.counter('CombinedFeatures', 'DontHave')
        sys.stderr.write('%s\n' % str(event_filename))
        for event_filename, features in self.r.map(event_filename, video_data):
            sys.stderr.write('%s\n' % str(event_filename))
            for x in self.b.map(event_filename, features):
                yield x


if __name__ == '__main__':
    hadoopy.run(Mapper, video_block_features.Reducer)
Code example #44
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import hadoopy

if __name__ == "__main__":
    """
            SELECT (k, v)
                where k = target_column_1+target_column_2+...,+target_column_N,
                where v = target_column_1, ..., target_column_N
            FROM (input dataset)
            WHERE filter_column_1 (not) in [filter_vals_1] and filter_column_2 (not) in [filter_vals_2] and ...
    """
    from python_hiveish.mapreduce.mappers import select_where as mapper
    from python_hiveish.mapreduce.reducers import identity_reducer as reducer
    hadoopy.run(mapper, reducer, doc=__doc__)
Code example #45
            except TypeError:
                cur_cluster_sum = vec
        yield key, cur_cluster_sum.tostring()
    
    def close(self):
        super(Combiner, self).close()


class Reducer(profile.ProfileJob):
    def __init__(self):
        super(Reducer, self).__init__()

    def reduce(self, key, values):
        cur_cluster_sum = None
        for vec in values:
            vec = np.fromstring(vec, dtype=np.float32)
            try:
                cur_cluster_sum += vec
            except TypeError:
                cur_cluster_sum = vec
        center = cur_cluster_sum[0:-1] / cur_cluster_sum[-1]
        yield key, center.tostring()
    
    def close(self):
        super(Reducer, self).close()


if __name__ == "__main__":
    if hadoopy.run(Mapper, Reducer, Combiner):
        hadoopy.print_doc_quit(__doc__)
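
All of the examples above share the same hadoopy streaming contract: a mapper yields (key, value) tuples, a combiner and reducer each receive a key plus an iterator of values, and hadoopy.run dispatches to the appropriate stage based on how Hadoop Streaming invokes the script; the `if hadoopy.run(...): hadoopy.print_doc_quit(__doc__)` guard seen in several examples prints the usage docstring when the script is run outside of a task. A minimal self-contained sketch of that contract (my own, assuming nothing beyond hadoopy itself):

#!/usr/bin/env python
"""Minimal word-count sketch illustrating the hadoopy streaming contract."""
import hadoopy


def mapper(key, value):
    # Emit one (word, 1) pair per whitespace-separated token in the value.
    for word in value.split():
        yield word, 1


def combiner(key, values):
    # Pre-aggregate counts on the map side to cut shuffle volume.
    yield key, sum(values)


def reducer(key, values):
    # Sum the partial counts for each word.
    yield key, sum(values)


if __name__ == '__main__':
    if hadoopy.run(mapper, reducer, combiner):
        hadoopy.print_doc_quit(__doc__)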