def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster center
    definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)

    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j, ),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)

        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
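A minimal usage sketch for `estimate`, assuming a running Disco master and a `center` dict of clustering callables (such as the `mean_point_center` used in Disco's k-clustering example); the master address, input tag, and parameter values below are hypothetical:

# Hypothetical usage sketch: the master address, input tag and the
# mean_point_center definition are assumptions, not part of the snippet above.
from disco.core import Disco

master = Disco('http://localhost:8989')       # assumed master address
input = ['tag://kclustering:points']          # assumed DDFS tag with data points
centers = estimate(master, input, mean_point_center, k=10, iterations=5)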
def _assert_csv_reader(self, fields, values, expected):
    stream = StringIO.StringIO(values)
    params = Params()
    params.csv_fields = fields
    params.csv_dialect = csv.excel_tab
    actual = csv_reader(stream, None, None, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,
                 load_method=offdimetlmr, dimnames=repr([]),
                 go_live=1, profile=False):
    dim_job = master.new_job(
        name='dim',
        input=input,
        map_init=load_method.dim_map_init,
        map_reader=load_method.map_reader,
        map=load_method.dim_map_func,
        partition=load_method.dim_partition_func,
        combiner=load_method.dim_combiner_func,
        reduce=load_method.dim_reduce_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        profile=profile,
        status_interval=1000000,
        params=Params(count=0, dimnames=dimnames,
                      nr_maps=nr_maps, nr_reduces=nr_reduces))

    results = dim_job.wait()
    shelvedb_paths = []
    if results is not None:
        for key, value in result_iterator(results):
            shelvedb_paths.append(key)
        if go_live == 1:
            load_method.golive(config, shelvedb_paths)
class InitTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=10)
    sort = False

    def getdata(self, path):
        return 'skipthis\n' + ('%s\n' % path) * 10

    @staticmethod
    def map_init(input_iter, params):
        input_iter.next()
        params.x += 100

    @staticmethod
    def map(e, params):
        return [(e, int(e) + params.x)]

    @staticmethod
    def reduce_init(input_iter, params):
        params.y = 1000

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, int(v) + params.y)

    def runTest(self):
        results = list(self.results)
        for k, v in results:
            self.assertEquals(int(k) + 1110, int(v))
        self.assertEquals(len(results), 100)
def __init__(self, rule, settings, urls=None):
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    rule_params = dict(rule.params.__dict__)
    self.disco, self.ddfs = get_disco_handle(
        rule_params.get('server', settings.get('server')))
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for overridden worker class from settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            worker_mod, dot, worker_class = settings.get(
                'worker').rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name,
                       master=self.disco.master,
                       worker=worker)
    except Exception as e:
        log.warn(
            "Error instantiating worker: %s %s - loading default worker"
            % (settings.get('worker'), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile]
              [--partitions P] [--sched_max_cores C] [--status_interval I]
              [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [maybe_list(line.split())
                       for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name
def _assert_reduce(self, data, expected, **kwargs):
    # turn disco_debug on for more code coverage
    if kwargs is None:
        kwargs = dict()
    kwargs['disco_debug'] = True
    params = Params(**kwargs)
    actual = keyset_reduce(data, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
class Grep(Job):
    map = nop_map
    params = Params(pattern=None)

    def map_reader(fd, size, url, params):
        import re
        if params.pattern:
            pattern = re.compile(params.pattern)
            for line in fd:
                if pattern.match(line):
                    yield url, line
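A hedged usage sketch for the Grep job above; the input address and pattern are assumptions, not part of the snippet:

# Hypothetical usage sketch: the input url and pattern are assumptions.
from disco.core import Params, result_iterator

job = Grep().run(input=['http://example.com/logs/part-0'],
                 params=Params(pattern='ERROR'))
for url, line in result_iterator(job.wait()):
    print url, line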
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
def test_keyset_multiplier(self):
    params = Params()
    params.keysets = {
        'last_name_keyset': dict(
            key_parts=['_keyset', 'last_name'],
            value_parts=['count'],
        ),
        'first_name_keyset': dict(
            key_parts=['_keyset', 'first_name'],
            value_parts=['count'],
        )
    }
    data = [
        {'first_name': 'Willow', 'last_name': 'Harvey'},
        {'first_name': 'Noam', 'last_name': 'Clarke'},
    ]
    expected = [
        {'first_name': 'Willow', 'last_name': 'Harvey', '_keyset': 'first_name_keyset'},
        {'first_name': 'Willow', 'last_name': 'Harvey', '_keyset': 'last_name_keyset'},
        {'first_name': 'Noam', 'last_name': 'Clarke', '_keyset': 'first_name_keyset'},
        {'first_name': 'Noam', 'last_name': 'Clarke', '_keyset': 'last_name_keyset'},
    ]
    actual = keyset_multiplier(data, None, None, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
class LineChunker(Job):
    params = Params(ddfs_master=None, tag=None)

    def _map_input_stream(fd, size, url, params):
        from disco.ddfs import DDFS
        tag = params.tag or 'disco:chunks:%s' % Task.jobname
        master = params.ddfs_master or Task.master
        yield url, DDFS(master).chunk(tag, [url])
    map_input_stream = [_map_input_stream]

    def map(entry, params):
        yield entry
def __init__(self, master, name, index, method, arg, streams, reduce, **kwargs):
    super(DiscoDBIterator, self).__init__(name=name, master=master)
    self.input = [['%s!%s/%s' % (url, method, arg) if method else url
                   for url in urls]
                  for urls in index.ichunks]
    self.map_input_stream = [scheme_discodb.input_stream] + streams
    self.params = Params(**kwargs)
    if reduce:
        self.partitions = len(self.master.nodeinfo())
        self.reduce = reduce
def predict(master, input, center, centers, map_reader=chain_reader):
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()
def setUp(self):
    sys.stdout = self.capture_stdout = cStringIO.StringIO()
    self.params = Params()
    self.params.keysets = {
        'last_name_keyset': dict(
            key_parts=['_keyset', 'last_name'],
            value_parts=['count'],
        ),
        'first_name_keyset': dict(
            key_parts=['_keyset', 'first_name'],
            value_parts=['count'],
        )
    }
def __init__(self, master, name, dataset):
    super(Indexer, self).__init__(name=name, master=master)
    self.input = dataset.input
    self.map_input_stream = dataset.stream
    self.map_reader = dataset.parser
    self.map = dataset.demuxer
    self.partition = dataset.balancer
    self.profile = dataset.profile
    self.partitions = dataset.nr_ichunks
    self.required_files = dataset.required_files
    self.params = Params(n=0, unique_items=dataset.unique_items)
    if self.partitions:
        self.reduce = nop_reduce
        self.reduce_output_stream = [reduce_output_stream, discodb_output]
    else:
        self.map_output_stream = [map_output_stream, discodb_output]
class ParamsTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=5, f1=fun1, f2=fun2, now=datetime.now())
    sort = False

    def getdata(self, path):
        return '\n'.join([path] * 10)

    @staticmethod
    def map(e, params):
        return [(e, params.f1(int(e), params.x))]

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, params.f2(int(v)))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(fun2(int(k) + 5), int(v))
class PartialTestCase(DiscoJobTestFixture, DiscoTestCase):
    @property
    def inputs(self):
        return [str(x) for x in range(self.num_workers)]

    def getdata(self, path):
        return '1 _ 0 \n'

    map = partial(map, extra='a')
    combiner = partial(combiner, extra='b')
    reduce = partial(reduce, extra='c')
    map_init = partial(init, extra='d')
    reduce_init = partial(init, extra='e')
    map_reader = partial(reader, extra='f')
    map_writer = partial(writer, extra='g')
    reduce_reader = partial(reader, extra='h')
    reduce_writer = partial(writer, extra='i')
    params = Params(foo=partial(foo, extra='z'))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(k, '_fazbghczi')
def load_fact(master, input, config_path, nr_maps=1, nr_reduces=1,
              load_method=offdimetlmr, profile=False):
    #disco = Disco("disco://"+host)
    fact_starttime = time.time()
    fact_job = master.new_job(
        name='fact',
        input=input,
        map_init=load_method.fact_map_init,
        map_reader=load_method.map_reader,
        map=load_method.fact_map_func,
        combiner=load_method.fact_combiner_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        status_interval=1000000,
        profile=profile,
        params=Params(totalcopytime=0, nr_maps=nr_maps,
                      nr_reduces=nr_reduces))

    results = fact_job.wait()
    #results = fact_job.wait(show=True, poll_interval=100, timeout=10*3600)
    fact_endtime = time.time()
    print "Time of loading facts: %f seconds" % (fact_endtime - fact_starttime)
# Sketch of consensus ADMM on top of Disco; several names used below
# (argmin, fi, x, dot, pnorm, sqrt, RecordIter, n, eta_conv, eta_feas)
# are assumed to be supplied by the surrounding problem setup.
class ADMM(Job):
    def map_reader(fd, url, size, params):
        i = Task.id
        z = params.z
        yi = params.y[i] + params.rho * (params.x[i] - z)
        # pseudocode: (A, b) records are assumed to be parsed from fd
        for A, b in iter:
            # x-update: minimize the local objective fi plus the penalty terms
            xi = argmin(fi(x) + dot(yi, x - z)
                        + (params.rho / 2.) * dot(x - z, x - z))
            yield str(i), (xi, yi)

    def reduce(iter, params):
        zhat = 0.  # assumed initialization of the accumulator
        # count from 1 so that zhat / n averages over all records
        for n, (i, (xi, yi)) in enumerate(iter, 1):
            zhat += xi + yi / float(params.rho)
        yield zhat / n

# first run a job to put records into A, b format
# and also calculate a first z
if __name__ == '__main__':
    params = Params(rho=1., z=0., objective=None)  # objective is problem-specific
    while True:
        job = ADMM()            # pseudocode: the job still needs to be run with its input
        results = job.wait()
        z = params.z            # remember the previous consensus value
        params.z = list(RecordIter(job.results()))[0]
        # stop when both the dual and primal residuals are small enough
        if params.rho * sqrt(n) * pnorm(z - params.z, p=2) <= eta_conv:
            if sum(dot(xi - params.z, xi - params.z)
                   for xi, yi in RecordIter(results)) <= (eta_feas ** 2):
                break
def __init__(
        self,
        # name, on/off
        name='_unnamed_',
        run=True,
        # throttle
        min_blobs=1,
        max_blobs=sys.maxint,
        partitions=200,
        partition_function=crc_partition,
        scheduler=None,
        worker=None,
        time_delta=None,
        newest_first=True,
        # archive
        archive=False,
        archive_tag_prefix='processed',
        # nuke
        nuke=False,
        # map
        map_init_function=lambda x, y: x,
        map_function=keyset_map,
        map_input_stream=chunk_csv_stream,
        map_output_stream=(map_output_stream, disco_output_stream),
        # combine
        combiner_function=None,
        # reduce
        reduce_function=keyset_reduce,
        reduce_output_stream=(reduce_output_stream, disco_output_stream),
        # result
        # result_iterator_override -->
        #   see inferno.lib.disco_ext.sorted_iterator for signature
        result_iterator_override=None,
        result_processor=keyset_result,
        result_tag=None,
        result_tag_suffix=True,
        save=False,
        sort=True,
        sort_buffer_size='10%',
        sorted_results=True,
        # keysets
        keysets=None,
        key_parts=None,
        value_parts=None,
        column_mappings=None,
        table=None,
        keyset_parts_preprocess=None,
        parts_postprocess=None,
        # input
        day_range=0,
        day_offset=0,
        day_start=None,
        source_tags=None,
        source_urls=None,
        # other
        rule_init_function=None,
        rule_cleanup=None,
        parts_preprocess=None,
        field_transforms=None,
        required_files=None,
        required_modules=None,
        # notifications --> notify_addresses must be list of addresses
        notify_on_fail=False,
        notify_on_success=False,
        notify_addresses=None,
        **kwargs):
    self.qualified_name = name
    if kwargs:
        self.params = Params(**kwargs)
    else:
        self.params = Params()

    if not scheduler:
        scheduler = {'force_local': False, 'max_cores': 200}

    # name, on/off
    self.run = run
    self.name = name

    # throttle
    self.min_blobs = min_blobs
    self.max_blobs = max_blobs
    self.partitions = partitions
    self.partition_function = partition_function
    self.scheduler = scheduler
    self.time_delta = time_delta
    if self.time_delta is None:
        self.time_delta = {'minutes': 5}
    self.newest_first = newest_first
    self.worker = worker

    # archive
    self.archive = archive
    self.archive_tag_prefix = archive_tag_prefix

    # nuke
    self.nuke = nuke

    # map
    self.map_init_function = map_init_function
    self.map_function = map_function
    self.map_input_stream = map_input_stream
    self.map_output_stream = map_output_stream
    self.combiner_function = combiner_function

    # reduce
    self.reduce_function = reduce_function
    self.reduce_output_stream = reduce_output_stream

    # result
    self.result_processor = result_processor
    self.result_tag = result_tag
    self.result_tag_suffix = result_tag_suffix
    self.save = save
    self.sort = sort
    self.sort_buffer_size = sort_buffer_size
    if result_iterator_override:
        self.result_iterator = result_iterator_override
    elif self.sort and sorted_results:
        self.result_iterator = sorted_iterator
    else:
        self.result_iterator = result_iterator

    # input
    if isinstance(source_tags, basestring):
        source_tags = [source_tags]
    self.day_range = day_range
    self.day_offset = day_offset
    self.day_start = day_start
    self.source_tags = source_tags or []

    # keysets
    keyset_dict = {}
    if keysets:
        for keyset_name, keyset_obj in keysets.items():
            keyset_dict[keyset_name] = keyset_obj.as_dict()
    else:
        keyset_dict['_default'] = Keyset(key_parts,
                                         value_parts,
                                         column_mappings,
                                         table,
                                         keyset_parts_preprocess,
                                         parts_postprocess).as_dict()
    self.params.keysets = keyset_dict
    self.params.parts_preprocess = parts_preprocess or []
    self.params.field_transforms = field_transforms or dict()

    # other
    self.rule_init_function = rule_init_function
    self.rule_cleanup = rule_cleanup
    self.required_modules = required_modules or []
    self.required_files = required_files or []
    self.notify_on_fail = notify_on_fail
    self.notify_on_success = notify_on_success
    self.notify_addresses = notify_addresses or []
    self.source_urls = source_urls
def params_2(self):
    return Params(job=self.job_1.name)
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')
    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}
    # the number of times the classes have been observed. For
    # example, if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}
    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] + value[2]) - math.log(value[1] + value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
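A hedged end-to-end sketch of how the naive Bayes `estimate` and `predict` helpers above might be driven; the input tags and class labels are assumptions, not part of the snippets:

# Hypothetical usage sketch: the input tags and class labels ('male',
# 'female') are assumptions, not part of the snippets above.
from disco.core import result_iterator

loglikelihoods = estimate(input=['tag://naive_bayes:train'],
                          ys=['male', 'female'])
results = predict(input=['tag://naive_bayes:test'],
                  loglikelihoods=loglikelihoods,
                  ys=['male', 'female'])
for key, value in result_iterator(results):
    print key, value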
def dgemm(disco, transA, transB, m, n, k, alpha, A, B, beta, C,
          maxTotalBlocks=128):
    """
    Compute general matrix multiplication alpha*op(A)*op(B) + beta*C in
    double precision where op(X) = X or transpose(X).

    @param transA A boolean value for transposing matrix A or not.
    @param transB A boolean value for transposing matrix B or not.
    @param m Number of rows of matrix op(A) and C.
    @param n Number of columns of matrix op(B) and C.
    @param k Number of columns of matrix op(A) and rows of matrix op(B).
    @param alpha Scalar multiplier for the matrix product A*B.
    @param beta Scalar multiplier for matrix C.
    @param A MatrixWrapper object encapsulating matrix A.
    @param B MatrixWrapper object encapsulating matrix B.
    @param C MatrixWrapper object encapsulating matrix C. If there is no C
           term, then pass in an empty wrapper, MatrixWrapper(), as placeholder.
    @param disco A Disco instance.
    @param maxTotalBlocks Suggested number of matrix blocks to use for
           carrying out the multiplication. Ideally, this should equal to the
           number of cores available in the cluster. The actual number of
           blocks is selected based on the size of the matrix.
    @return MatrixWrapper object encapsulating the resulting matrix.
    """
    def _mapRowBlocks(e, params):
        from math import ceil
        from numpy import float64
        if type(e) == tuple:
            e = e[0]
        output = []
        elems = e.split(";")
        for elem in elems:
            i, j, val = map(float64, elem.split(","))
            if params.transA:
                i, j = j, i
            assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
            assert j < params.k, "col index %d exceeds matrix dimensions" % int(j)
            blockX = int(j / params.blockWidth)
            blockY = int(i / params.blockHeight)
            offsetY = ceil(params.blockHeight * blockY)
            val = params.alpha * val
            if val != 0.0:
                output += [(blockY*params.blocksPerRow+x,
                            "%s,%d,%d,%.14f" % (params.matrixId, int(i-offsetY), int(j), val))
                           for x in range(0, params.blocksPerRow)]
        return output

    def _mapColBlocks(e, params):
        from math import ceil
        from numpy import float64
        if type(e) == tuple:
            e = e[0]
        output = []
        elems = e.split(";")
        for elem in elems:
            i, j, val = map(float64, elem.split(","))
            if params.transB:
                i, j = j, i
            assert i < params.k, "row index %d exceeds matrix dimensions" % int(i)
            assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
            blockX = int(j / params.blockWidth)
            offsetX = ceil(params.blockWidth * blockX)
            if val != 0.0:
                output += [(y*params.blocksPerRow+blockX,
                            "%s,%d,%d,%.14f" % (params.matrixId, int(i), int(j-offsetX), val))
                           for y in range(0, params.blocksPerCol)]
        return output

    def _mapBlocks(e, params):
        from math import ceil
        from numpy import float64
        if type(e) == tuple:
            e = e[0]
        output = []
        elems = e.split(";")
        for elem in elems:
            i, j, val = map(float64, elem.split(","))
            assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
            assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
            blockX = int(j / params.blockWidth)
            blockY = int(i / params.blockHeight)
            offsetX = ceil(params.blockWidth * blockX)
            offsetY = ceil(params.blockHeight * blockY)
            val = params.beta * val
            if val != 0.0:
                output += [(blockY*params.blocksPerRow+blockX,
                            "%s,%d,%d,%.14f" % (params.matrixId, int(i-offsetY), int(j-offsetX), val))]
        return output

    def nop_map(e, params):
        return [e]

    def _reduceMultiplyAndAdd(iter, out, params):
        from numpy import float64
        rows = {}
        cols = {}
        vals = {}
        maxColIdx = {}
        maxRowIdx = {}
        for blockId, s in iter:
            blockId = int(blockId)
            matrixId, rowIdx, colIdx, val = s.split(",")
            rowIdx = int(rowIdx)
            colIdx = int(colIdx)
            val = float64(val)
            if not rows.has_key(blockId):
                rows[blockId] = {}
                cols[blockId] = {}
                vals[blockId] = {}
                maxColIdx[blockId] = {}
                maxRowIdx[blockId] = {}
            if not rows[blockId].has_key(matrixId):
                rows[blockId][matrixId] = []
                cols[blockId][matrixId] = []
                vals[blockId][matrixId] = []
                maxColIdx[blockId][matrixId] = 0
                maxRowIdx[blockId][matrixId] = 0
            rows[blockId][matrixId].append(rowIdx)
            cols[blockId][matrixId].append(colIdx)
            vals[blockId][matrixId].append(val)
            maxColIdx[blockId][matrixId] = max(maxColIdx[blockId][matrixId],
                                               cols[blockId][matrixId][-1])
            maxRowIdx[blockId][matrixId] = max(maxRowIdx[blockId][matrixId],
                                               rows[blockId][matrixId][-1])

        # initialize sparse matrices
        from math import ceil
        from scipy.sparse import coo_matrix
        for blockId in rows.keys():
            # compute the index offset in the original matrix
            blockY = blockId / params.blocksPerRow
            blockX = blockId % params.blocksPerRow
            offsetY = ceil(params.blockHeight * blockY)
            offsetX = ceil(params.blockWidth * blockX)

            # compute matrix product
            if not vals[blockId].has_key('A') or not vals[blockId].has_key('B'):
                # skip multiplication since either block A or B is empty
                if vals[blockId].has_key('C'):
                    # return beta*C
                    P = coo_matrix((vals[blockId]['C'],
                                    (rows[blockId]['C'], cols[blockId]['C'])),
                                   dtype=float64,
                                   dims=(maxRowIdx[blockId]['C']+1,
                                         maxColIdx[blockId]['C']+1))
                else:
                    P = None
            else:
                if vals[blockId].has_key('C'):
                    m = max(maxRowIdx[blockId]['A'], maxRowIdx[blockId]['C']) + 1
                    n = max(maxColIdx[blockId]['B'], maxColIdx[blockId]['C']) + 1
                    C = coo_matrix((vals[blockId]['C'],
                                    (rows[blockId]['C'], cols[blockId]['C'])),
                                   dtype=float64, dims=(m, n))
                else:
                    m = maxRowIdx[blockId]['A'] + 1
                    n = maxColIdx[blockId]['B'] + 1
                    C = coo_matrix(([], ([], [])), dtype=float64, dims=(m, n))
                A = coo_matrix((vals[blockId]['A'],
                                (rows[blockId]['A'], cols[blockId]['A'])),
                               dtype=float64,
                               dims=(m, max(maxColIdx[blockId]['A'],
                                            maxRowIdx[blockId]['B'])+1))
                B = coo_matrix((vals[blockId]['B'],
                                (rows[blockId]['B'], cols[blockId]['B'])),
                               dtype=float64,
                               dims=(max(maxColIdx[blockId]['A'],
                                         maxRowIdx[blockId]['B'])+1, n))
                P = (A * B + C).tocoo()

            # map block indices into original indices
            if P != None:
                start = 0
                while start < len(P.row):
                    end = min(start+params.elemsPerLine, len(P.row))
                    out.add(";".join(["%d,%d,%.14f" % (P.row[i]+offsetY,
                                                       P.col[i]+offsetX,
                                                       P.data[i])
                                      for i in range(start, end)]), "")
                    start = end

    # find the best way to partition matrix into blocks
    blocksPerRow, blocksPerCol = _partition(m, n, maxTotalBlocks)
    blockHeight = float(m) / blocksPerCol
    blockWidth = float(n) / blocksPerRow
    totalBlocks = blocksPerRow * blocksPerCol
    #print "%dx%d blocks used with block dimension %fx%f" % (blocksPerCol, blocksPerRow, blockHeight, blockWidth)

    params = Params(blocksPerRow=blocksPerRow, blocksPerCol=blocksPerCol,
                    blockHeight=blockHeight, blockWidth=blockWidth,
                    alpha=alpha, beta=beta, transA=transA, transB=transB,
                    m=m, k=k, n=n)
    params.elemsPerLine = 1000

    # map matrix A into row blocks
    params.matrixId = 'A'
    jobMapA = disco.new_job(input=A.urls,
                            name="dgemm_mapA",
                            map_reader=A.mapReader,
                            map=_mapRowBlocks,
                            params=params,
                            nr_reduces=totalBlocks)
    resA = jobMapA.wait(clean=False, poll_interval=2)

    # map matrix B into col blocks
    params.matrixId = 'B'
    jobMapB = disco.new_job(input=B.urls,
                            name="dgemm_mapB",
                            map_reader=B.mapReader,
                            map=_mapColBlocks,
                            params=params,
                            nr_reduces=totalBlocks)
    resB = jobMapB.wait(clean=False, poll_interval=2)

    # map matrix C into blocks
    if len(C.urls) == 0:
        # quick fix for disco bug
        resC = []
    else:
        params.matrixId = 'C'
        jobMapC = disco.new_job(input=C.urls,
                                name="dgemm_mapC",
                                map_reader=C.mapReader,
                                map=_mapBlocks,
                                params=params,
                                nr_reduces=totalBlocks)
        resC = jobMapC.wait(clean=False, poll_interval=2)

    # multiply the blocks
    res = disco.new_job(input=resA+resB+resC,
                        name="dgemm_reduce",
                        map_reader=chain_reader,
                        map=nop_map,
                        nr_reduces=totalBlocks,
                        reduce=_reduceMultiplyAndAdd,
                        params=params).wait(clean=False, poll_interval=2)

    # clean up
    jobMapA.purge()
    jobMapB.purge()
    if len(C.urls) > 0:
        # quick fix for disco bug
        jobMapC.purge()

    return MatrixWrapper(res, chain_reader)
def dgema(disco, transA, transB, m, n, alpha, A, B, beta, maxTotalBlocks=128):
    """
    Compute general matrix addition alpha*op(A) + beta*op(B) in double
    precision where op(X) = X or transpose(X).

    @param transA A boolean value for transposing matrix A or not.
    @param transB A boolean value for transposing matrix B or not.
    @param m Number of rows of matrix op(A).
    @param n Number of columns of matrix op(B).
    @param alpha Scalar multiplier for matrix A.
    @param beta Scalar multiplier for matrix B.
    @param A MatrixWrapper object encapsulating matrix A.
    @param B MatrixWrapper object encapsulating matrix B.
    @param disco A Disco instance.
    @param maxTotalBlocks Suggested number of matrix blocks to use for
           carrying out the addition. Ideally, this should equal to the number
           of cores available in the cluster. The actual number of blocks is
           selected based on the size of the matrix.
    @return MatrixWrapper object encapsulating the resulting matrix.
    """
    def _mapBlocks(e, params):
        from math import ceil
        from numpy import float64
        if type(e) == tuple:
            e = e[0]
        output = []
        elems = e.split(";")
        for elem in elems:
            i, j, val = map(float64, elem.split(","))
            if params.transpose:
                i, j = j, i
            assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
            assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
            blockX = int(j / params.blockWidth)
            blockY = int(i / params.blockHeight)
            offsetX = ceil(params.blockWidth * blockX)
            offsetY = ceil(params.blockHeight * blockY)
            val = params.scaling * val
            if val != 0.0:
                output += [(blockY*params.blocksPerRow+blockX,
                            "%d,%d,%.14f" % (int(i-offsetY), int(j-offsetX), val))]
        return output

    def nop_map(e, params):
        return [e]

    def _reduceAddBlocks(iter, out, params):
        from numpy import float64
        s = {}
        # add matrices
        for blockId, t in iter:
            blockId = int(blockId)
            rowIdx, colIdx, val = t.split(",")
            rowIdx = int(rowIdx)
            colIdx = int(colIdx)
            if not s.has_key(blockId):
                s[blockId] = {}
            if not s[blockId].has_key(rowIdx):
                s[blockId][rowIdx] = {}
            s[blockId][rowIdx][colIdx] = s[blockId][rowIdx].get(colIdx, 0) + float64(val)

        # output results
        from math import ceil
        from scipy.sparse import coo_matrix
        for blockId in s.keys():
            # compute the index offset in the original matrix
            offsetY = ceil(params.blockHeight * (blockId / params.blocksPerRow))
            offsetX = ceil(params.blockWidth * (blockId % params.blocksPerRow))
            # map block indices into original indices
            for rowIdx in s[blockId].keys():
                for colIdx in s[blockId][rowIdx].keys():
                    out.add("%d,%d,%.14f" % (rowIdx+offsetY, colIdx+offsetX,
                                             s[blockId][rowIdx][colIdx]), "")

    # find the best way to partition matrix to blocks
    blocksPerRow, blocksPerCol = _partition(m, n, maxTotalBlocks)
    blockHeight = float(m) / blocksPerCol
    blockWidth = float(n) / blocksPerRow
    totalBlocks = blocksPerRow * blocksPerCol

    # map and scale matrices
    params = Params(blocksPerRow=blocksPerRow, blocksPerCol=blocksPerCol,
                    blockHeight=blockHeight, blockWidth=blockWidth)
    params.transpose = transA
    params.scaling = alpha
    params.m = m
    params.n = n
    jobMapA = disco.new_job(input=A.urls,
                            name="dgema_mapA",
                            map_reader=A.mapReader,
                            map=_mapBlocks,
                            params=params,
                            nr_reduces=totalBlocks)
    resA = jobMapA.wait(clean=False, poll_interval=2)

    params.transpose = transB
    params.scaling = beta
    jobMapB = disco.new_job(input=B.urls,
                            name="dgema_mapB",
                            map_reader=B.mapReader,
                            map=_mapBlocks,
                            params=params,
                            nr_reduces=totalBlocks)
    resB = jobMapB.wait(clean=False, poll_interval=2)

    # add matrices
    res = disco.new_job(input=resA+resB,
                        name="dgema_reduce",
                        map_reader=chain_reader,
                        map=nop_map,
                        params=params,
                        reduce=_reduceAddBlocks,
                        nr_reduces=totalBlocks).wait(clean=False, poll_interval=2)

    # clean up
    jobMapA.purge()
    jobMapB.purge()

    return MatrixWrapper(res, chain_reader)