Code example #1
File: compute_marginals.py Project: srush/tf-fork
def runner(job):  
  import dumbo
  dumbo.run(MarginalMap, sum_reduce)
Code example #2
#!/usr/bin/python
"""
select name,occupation from incomeTable;
"""


def mapper(key, value):
    name, age, sex, occupation, incomelevel = value.split(",")
    key = "-".join([name, occupation])
    yield key, 1


if __name__ == '__main__':
    import dumbo
    dumbo.run(mapper)
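
For context, a hedged trace of the mapper above on a single made-up CSV record (the field names follow the split in the code):

# Hypothetical record: name,age,sex,occupation,incomelevel
print(list(mapper(None, "alice,34,F,engineer,high")))
# -> [('alice-engineer', 1)]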
Code example #3
File: feature_parses.py Project: srush/tf-fork
def runner(job):  
  import dumbo
  dumbo.run(Mapper, reduce)
Code example #4
import json

EXCLUDE_COMMITS = True
EXCLUDE_TESTS = True


def mapper(key, value):
    msg = json.loads(value)
    user_id = msg['sender_id']
    content = msg['content']
    stream = msg['display_recipient']
    to_yield = content.count('!')
    if EXCLUDE_COMMITS:
        if stream == 'commits':
            to_yield = 0
    if EXCLUDE_TESTS:
        if stream == 'test-stream':
            to_yield = 0
    yield user_id, to_yield


def reducer(key, values):
    yield key, sum(values)


if __name__ == '__main__':
    import dumbo
    dumbo.run(mapper, reducer)
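
As a quick sanity check, both stages can be driven directly in Python; a minimal sketch with a made-up Zulip-style message (the field names follow the mapper above):

# Hypothetical message record:
line = json.dumps({"sender_id": 42, "content": "ship it!!",
                   "display_recipient": "general"})
print(list(mapper(None, line)))      # [(42, 2)]
print(list(reducer(42, [2, 0, 3])))  # [(42, 5)]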
Code example #5
File: mrblur.py Project: cpatrick/kwmapreduce
# Imports needed by this snippet (assumed from context; gauss_kern is
# defined elsewhere in the project).
import StringIO

from numpy import asarray, uint8
from scipy import signal
from PIL import Image

from dumbo import opt


def blur_image(im, n, ny=None):
    """ blurs the image by convolving with a gaussian kernel of typical
        size n. The optional keyword argument ny allows for a different
        size in the y direction.
    """
    g = gauss_kern(n, sizey=ny)
    improc = signal.convolve(im, g, mode='valid')
    return improc

def mapper(key, value):
    I = asarray(Image.open(value))
    r = I[:,:,0]
    g = I[:,:,1]
    b = I[:,:,2]
    # Weighted channel mix; the coefficients approximate Rec. 709 luma weights.
    gray = r*.222 + g*.7067 + b*.0713
    out = blur_image(gray, 20)
    im = Image.fromarray(uint8(out))
    outBuff = StringIO.StringIO()
    im.save(outBuff, format="JPEG")
    yield value, outBuff.getvalue()

@opt("getpath", "yes")
def reducer(key, values):
    for i in values:
        yield (key, key), i

if __name__ == "__main__":
    import dumbo
    dumbo.run(mapper, reducer, combiner=reducer)
Code example #6
File: testsvm.py Project: guokr321/CookBook
        yield TRUE_T_STR, true_cnt_t
        yield TRUE_F_STR, true_cnt_f
        yield FALSE_T_STR, false_cnt_t
        yield FALSE_F_STR, false_cnt_f


class Reducer():
    def __call__(self, key, values):
        """
        Reducer Program: statistical for elsvm

        Inputs:
            key: true_label or false_label
            values: cnt for label

        Outputs:
            the statistical result
        """

        if str(key) == TRUE_T_STR:
            yield TRUE_T_STR, sum(values)
        elif str(key) == TRUE_F_STR:
            yield TRUE_F_STR, sum(values)
        elif str(key) == FALSE_T_STR:
            yield FALSE_T_STR, sum(values)
        elif str(key) == FALSE_F_STR:
            yield FALSE_F_STR, sum(values)

if __name__ == "__main__":
    import dumbo
    dumbo.run(Mapper, Reducer)
Code example #7
File: top10.py Project: guokr321/CookBook
def main():
    dumbo.run(mapper, reducer, combiner=reducer)


class Solve_Reducer:
    """
    Solve the subproblem
    """
    def __init__(self):
        self.tau_vec = [0.5, 0.75, 0.95]
        self.ntau = len(self.tau_vec)

    def __call__(self, key, values):
        #SAb = np.array([v for v in values])

        data = []
        for v in values:
            data += v

        SAb = np.array(data)
        m, n = SAb.shape

        x = np.zeros((n - 1, self.ntau))
        for i in range(self.ntau):
            x[:, i] = quantreg_ipm(SAb[:, :n - 1], SAb[:, n - 1],
                                   self.tau_vec[i])

        key = [key, m]
        yield key, x.T.tolist()


if __name__ == '__main__':
    import dumbo
    dumbo.run(Unif_Samp_Mapper, Solve_Reducer)
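
A hedged note on the data flow (quantreg_ipm and Unif_Samp_Mapper are defined elsewhere in the project):

# Assumed shapes, inferred from the reducer above:
#   each incoming value is a list of rows of the sampled system [A | b];
#   SAb is m x n, with A = SAb[:, :n-1] and b = SAb[:, n-1];
#   one quantile regression is solved per tau in tau_vec = [0.5, 0.75, 0.95].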
Code example #9
File: row_sum.py Project: mesanders/mrmatrix
#!/usr/bin/env dumbo

def mapper(key,value): 
    """ Each record is a line of text. 
    key=<byte that the line starts in the file>
    value=<line of text>
    """
    valarray = [float(v) for v in value.split()]
    yield key, sum(valarray)

if __name__=='__main__':
    import dumbo
    import dumbo.lib
    dumbo.run(mapper,dumbo.lib.identityreducer)
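
A quick local check of the mapper (the identity reducer then passes each (key, sum) pair through unchanged):

# Hypothetical line of whitespace-separated numbers, keyed by byte offset 0:
print(list(mapper(0, "1.0 2.5 3.5")))  # [(0, 7.0)]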
Code example #10
File: matrix2seqfile.py Project: dgleich/mrtsqr
import sys

"""
Map lines of a matrix to a sequence file:
  Key=<lineno>, Value=[row_i]
"""
def mapper(key,value):
    valarray = [float(v) for v in value.split()]
    if len(valarray) == 0:
        return
    yield key, valarray
    
class Converter:
    def __init__(self,opts):
        pass
    def __call__(self,data):
        item = 0
        for key,value in data:
            for entry in value:
                print "%18.16e"%(entry), 
            print
            item += 1
    
if __name__ == '__main__':
    import dumbo
    import dumbo.lib
    dumbo.run(mapper,dumbo.lib.identityreducer)
Code example #11
File: flow.py Project: r0wb0t/dumbo
    def run_task(self):
        self.kwargs['iter'] = self.index

        dumbo.run(*self.args, **self.kwargs)
Code example #12
File: mr.py Project: erxiong/strategy_data
def run(self):
    import dumbo
    dumbo.run(self.mapper, self.reducer)
Code example #13
File: extop10.py Project: LiaoPengyu/CookBook
def main():
    dumbo.run(Mapper,reducer,combiner=reducer)
Code example #14
File: q2.py Project: sagar27/hadoop-mr
#!/usr/bin/python
"""
select name,occupation from incomeTable;
"""

def mapper(key,value):
    name,age,sex,occupation,incomelevel = value.split(",")
    key = "-".join([name,occupation])
    yield key,1

if __name__ == '__main__':
    import dumbo
    dumbo.run(mapper)
Code example #15
File: mr.py Project: Henry2012/recipes
"""
Creation: 2014-4-17
Revision: 2014-4-17
"""

import json
import sys
from dumbo import run

# Python 2 idiom: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

class Mapper(object):
    def __init__(self):
        pass

    def __call__(self, key, value):
        record = json.loads(value.strip())
        if "city" in record:
            yield record['city'], 1

class Reducer(object):
    def __init__(self):
        pass

    def __call__(self, key, values):
        yield key, sum(values)


if __name__ == '__main__':
    run(Mapper, Reducer)
Code example #16
File: findCommonFnds.py Project: zopemanish/hadoop
import os
import math

class Mapper:
    def __init__(self):
        pass
    def __call__(self, key, value):
        # Each input line is assumed to look like "person -> friend1,friend2,...".
        tempArr = value.replace(' ', '').split('->')
        tempArr1 = tempArr[1].replace(' ', '').split(',')
        for x in tempArr1:
            combinationsTempArr = [tempArr[0], x]
            combinationsTempArr.sort()
            yield (','.join(combinationsTempArr)), tempArr[1].replace((x + ','), '').replace((',' + x), '').replace(' ', '')


def reducer(key, values):
    tempValArr = []
    commonFnds = []
    for temp in values:
        tempArr = temp.split(",")
        for temp1 in tempArr:
            if(temp1 in tempValArr):
                commonFnds.append(temp1)
            else:    
                tempValArr.append(temp1)
    yield key, commonFnds

if __name__ == "__main__":
    import dumbo
    dumbo.run(Mapper, reducer, combiner=None)
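
To make the pairing scheme concrete, a hedged trace on a toy adjacency list (input lines are assumed to look like "A -> B,C,D"):

# Hypothetical trace, not part of the original project:
m = Mapper()
print(list(m(None, "A -> B,C,D")))
# -> [('A,B', 'C,D'), ('A,C', 'B,D'), ('A,D', 'B,C')]
print(list(reducer("A,B", ["C,D", "C"])))
# -> [('A,B', ['C'])]   # C is the friend A and B have in common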
Code example #17
            
            #print numpy.sum(var_beta[:, edges_indices_list].T)
            #print numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T)
            
            # term 6
            #corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)))
            #corpus_level_log_likelihood += numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]));
            corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)) + numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]), axis=1));
            corpus_level_log_likelihood += numpy.sum( - (var_beta[:, edges_indices_list]-1) * self.compute_dirichlet_expectation(var_beta[:, edges_indices_list]));
                    
        assert numpy.min(var_beta)>=0;

        # TODO: add in alpha updating
        # compute the sufficient statistics for alpha and update
        #alpha_sufficient_statistics = scipy.special.psi(self._gamma) - scipy.special.psi(numpy.sum(self._gamma, axis=1)[:, numpy.newaxis]);
        #alpha_sufficient_statistics = numpy.sum(alpha_sufficient_statistics, axis=0)[numpy.newaxis, :];
        #self.update_alpha(alpha_sufficient_statistics)
        
        #print numpy.sum(numpy.exp(self.E_log_beta), axis=1);
        
        return corpus_level_log_likelihood
        '''

        yield current_topic_index, " ".join(
            ["%f" % (item) for item in E_log_beta[0, :]])


if __name__ == '__main__':
    import dumbo
    dumbo.run(Mapper, Reducer, combiner=Combiner)
Code example #18
File: testelm.py Project: guokr321/CookBook
        yield TRUE_F_STR, true_cnt_f
        yield FALSE_T_STR, false_cnt_t
        yield FALSE_F_STR, false_cnt_f


class Reducer():
    def __call__(self, key, values):
        """
        Reducer Program: statistical for elsvm

        Inputs:
            key: true_label or false_label
            values: cnt for label

        Outputs:
            the statistical result
        """

        if str(key) == TRUE_T_STR:
            yield TRUE_T_STR, sum(values)
        elif str(key) == TRUE_F_STR:
            yield TRUE_F_STR, sum(values)
        elif str(key) == FALSE_T_STR:
            yield FALSE_T_STR, sum(values)
        elif str(key) == FALSE_F_STR:
            yield FALSE_F_STR, sum(values)


if __name__ == "__main__":
    import dumbo
    dumbo.run(Mapper, Reducer)
Code example #19
File: ngrams.py Project: zjfplayer2/python-ngrams
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)


def combiner(key, values):
    yield (key, sum(values))


def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    import dumbo
    # import pdb
    # pdb.set_trace()
    # dumbo.run(NgramMapper, reducer, combiner=combiner)
    dumbo.run(NgramMapper, reducer)
Code example #20
File: ngrams.py Project: abeusher/python-ngrams
        data = value.split('\t')
        
        if len(data) < 3:
            return
        
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])
        
        if len(ngram) != self.expected_tokens:
            return
        
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]
        
        yield (k, count)        

def combiner(key, values):
    yield (key, sum(values))

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    import dumbo
    # import pdb
    # pdb.set_trace()
    # dumbo.run(NgramMapper, reducer, combiner=combiner)
    dumbo.run(NgramMapper, reducer)
Code example #21
File: hitsbymonth.py Project: bbengfort/mapreduce
#!/usr/bin/env python

import re

logline = re.compile(r'^(local|remote) - - \[(.*)\] "(.*)" (\d+) (\d+)$', re.I)

def mapper(key, value):
    line = logline.match(value)
    if line:
        dt = line.groups()[1]
        yield dt.split('/')[1], 1

if __name__ == '__main__':
    import dumbo
    dumbo.run(mapper, dumbo.sumreducer, dumbo.sumreducer)
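
The regex keys each hit by the month token inside the timestamp; a hedged trace with a made-up Apache-style log line:

# Hypothetical log line matching the pattern above:
line = 'local - - [10/Oct/2000:13:55:36 -0700] "GET / HTTP/1.0" 200 2326'
print(list(mapper(None, line)))  # [('Oct', 1)]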
Code example #22
File: testsvm_step2.py Project: guokr321/CookBook
    def extend_point(self, point):
        """
        Extend the point array with one extra slot for a new value
        """
        point = np.resize(point, len(point) + 1)
        point[-1] = 1
        return point

    def __call__(self, data):
        """
        Mapper Program

            It will output the modified single line
        """

        for docID, doc in data:
            for term in doc.split("\n"):
                self.SEP = self.SEP if self.SEP is not None else get_sep(term)
                point = np.fromstring(term, dtype=np.float64, sep=self.SEP)
                label = int(point[-1])
                last_value = self.getDValue(point)
                point = self.extend_point(point)
                point[-1] = last_value
                point[-2] = float(label)
                output = ",".join([str(i) for i in point])
                yield output, "\t"

if __name__ == "__main__":
    import dumbo
    dumbo.run(Mapper)
Code example #23
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#!/usr/bin/env python
# the script we tell dumbo to run.

from zohmg.mapper import Mapper
from zohmg.reducer import Reducer
from zohmg.combiner import Combiner

from usermapper import map

# !@#
import dumbo
dumbo.run(Mapper(map), Reducer(), Combiner())

Code example #24
File: wordcount.py Project: CyaLiven/dumbo
"""
Counts how many times each word occurs.
"""

def mapper(key,value):
    for word in value.split(): yield word,1

def reducer(key,values):
    yield key,sum(values)

if __name__ == "__main__":
    import dumbo
    dumbo.run(mapper,reducer,reducer)
Code example #25
File: inferencer.py Project: Jessilee/PyLDA
            corpus_level_log_likelihood += (scipy.special.gammaln(numpy.sum(self._edge_prior[:, edges_indices_list])) - numpy.sum(scipy.special.gammaln(self._edge_prior[:, edges_indices_list]))) * self._number_of_topics;
            corpus_level_log_likelihood += numpy.sum(numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T));
            
            #print numpy.sum(var_beta[:, edges_indices_list].T)
            #print numpy.dot((self._edge_prior[:, edges_indices_list] - 1), var_beta[:, edges_indices_list].T)
            
            # term 6
            #corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)))
            #corpus_level_log_likelihood += numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]));
            corpus_level_log_likelihood += numpy.sum( - scipy.special.gammaln(numpy.sum(var_beta[:, edges_indices_list], axis=1)) + numpy.sum(scipy.special.gammaln(var_beta[:, edges_indices_list]), axis=1));
            corpus_level_log_likelihood += numpy.sum( - (var_beta[:, edges_indices_list]-1) * self.compute_dirichlet_expectation(var_beta[:, edges_indices_list]));
                    
        assert numpy.min(var_beta)>=0;

        # TODO: add in alpha updating
        # compute the sufficient statistics for alpha and update
        #alpha_sufficient_statistics = scipy.special.psi(self._gamma) - scipy.special.psi(numpy.sum(self._gamma, axis=1)[:, numpy.newaxis]);
        #alpha_sufficient_statistics = numpy.sum(alpha_sufficient_statistics, axis=0)[numpy.newaxis, :];
        #self.update_alpha(alpha_sufficient_statistics)
        
        #print numpy.sum(numpy.exp(self.E_log_beta), axis=1);
        
        return corpus_level_log_likelihood
        '''

        yield current_topic_index, " ".join(["%f" % (item) for item in E_log_beta[0, :]]);

if __name__ == '__main__':
    import dumbo;
    dumbo.run(Mapper, Reducer, combiner=Combiner);
Code example #26
import json


def mapper(key, value):
    msg = json.loads(value)
    user_id = msg['sender_id']
    name = msg['sender_full_name']
    wordcount = len(msg['content'].split())
    yield (user_id, name), wordcount


def reducer(key, values):
    yield key, sum(values)


if __name__ == '__main__':
    import dumbo
    dumbo.run(mapper, reducer, combiner=reducer)
Code example #27
File: columns2mseq.py Project: paulcon/mrtsqr
import sys

class Mapper:
    opts = [('addpath','yes')]

    def __call__(self,key,value):
        # This assumes the column index is a one-digit integer 
        # that occurs immediately before the . in the file name.
        # It also assumes that the first element in the column
        # file is a row index.
        path,_key = key
        ind = path.rfind('.')
        col = path[ind-1]

        # value[0] is the row index, value[1] is the element.
        yield value[0], (col,float(value[1]))

def reducer(key,values):
    values = sorted(values, key=lambda value: value[0])
    row = [float(v[1]) for v in values]
    yield key,row
    
if __name__ == '__main__':
    import dumbo
    import dumbo.lib
    
    dumbo.run(Mapper,reducer)
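
Because the mapper declares ('addpath', 'yes'), keys arrive as (path, key) pairs; a hedged trace of both stages:

# Hypothetical column file "cols3.txt": the column index 3 sits just before the dot.
m = Mapper()
print(list(m(("data/cols3.txt", 0), ["7", "1.5"])))
# -> [('7', ('3', 1.5))]
print(list(reducer("7", [("2", 0.5), ("1", 0.25)])))
# -> [('7', [0.25, 0.5])]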