def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)
    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j,),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)
        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
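A minimal driver sketch for the estimator above, assuming a reachable Disco master and a `center` dict that supplies the callables `random_init_map` and `estimate_map` expect; the master URL, input tag, and `mean_point_center` name are placeholders, not part of the snippet:

from disco.core import Disco

# Hypothetical usage; all names below are illustrative.
master = Disco('disco://localhost')
centers = estimate(master,
                   input=['tag://kcluster:points'],
                   center=mean_point_center,  # assumed center-definition dict
                   k=10,
                   iterations=5)
for cluster_id, c in centers:
    print cluster_id, c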
class InitTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=10)
    sort = False

    def getdata(self, path):
        return 'skipthis\n' + ('%s\n' % path) * 10

    @staticmethod
    def map_init(input_iter, params):
        input_iter.next()
        params.x += 100

    @staticmethod
    def map(e, params):
        return [(e, int(e) + params.x)]

    @staticmethod
    def reduce_init(input_iter, params):
        params.y = 1000

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, int(v) + params.y)

    def runTest(self):
        results = list(self.results)
        for k, v in results:
            self.assertEquals(int(k) + 1110, int(v))
        self.assertEquals(len(results), 100)
def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,
                 load_method=offdimetlmr, dimnames=repr([]),
                 go_live=1, profile=False):
    dim_job = master.new_job(
        name='dim',
        input=input,
        map_init=load_method.dim_map_init,
        map_reader=load_method.map_reader,
        map=load_method.dim_map_func,
        partition=load_method.dim_partition_func,
        combiner=load_method.dim_combiner_func,
        reduce=load_method.dim_reduce_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        profile=profile,
        status_interval=1000000,
        params=Params(count=0, dimnames=dimnames,
                      nr_maps=nr_maps, nr_reduces=nr_reduces)
    )
    results = dim_job.wait()
    shelvedb_paths = []
    if results is not None:
        for key, value in result_iterator(results):
            shelvedb_paths.append(key)
        if go_live == 1:
            # Assumes the `config` module shipped via required_modules
            # is also importable in this driver process.
            load_method.golive(config, shelvedb_paths)
def __init__(self, rule, settings, urls=None):
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    rule_params = dict(rule.params.__dict__)
    self.disco, self.ddfs = get_disco_handle(
        rule_params.get('server', settings.get('server')))
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for an overridden worker class
        # from the settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            worker_mod, dot, worker_class = settings.get(
                'worker').rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name,
                       master=self.disco.master,
                       worker=worker)
    except Exception as e:
        log.warn(
            "Error instantiating worker: %s %s - loading default worker"
            % (settings.get('worker'), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile]
              [--partitions P] [--sched_max_cores C]
              [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [maybe_list(line.split())
                       for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name
def _assert_csv_reader(self, fields, values, expected):
    stream = StringIO.StringIO(values)
    params = Params()
    params.csv_fields = fields
    params.csv_dialect = csv.excel_tab
    actual = csv_reader(stream, None, None, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
def _assert_reduce(self, data, expected, **kwargs):
    # turn disco_debug on for more code coverage
    # (**kwargs is always a dict, so no None check is needed)
    kwargs['disco_debug'] = True
    params = Params(**kwargs)
    actual = keyset_reduce(data, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
class Grep(Job):
    map = nop_map
    params = Params(pattern=None)

    def map_reader(fd, size, url, params):
        import re
        if params.pattern:
            pattern = re.compile(params.pattern)
            for line in fd:
                if pattern.match(line):
                    yield url, line
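A rough usage sketch for Grep: the pattern travels in `params`, so each map task compiles it locally. The input tag is a placeholder, and this assumes `result_iterator` (from disco.core) is in scope:

# Hypothetical invocation; Grep inherits run()/wait() from disco.core.Job.
job = Grep().run(input=['tag://logs:raw'],
                 params=Params(pattern=r'ERROR'))
for url, line in result_iterator(job.wait()):
    print line,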
class LineChunker(Job):
    params = Params(ddfs_master=None, tag=None)

    def _map_input_stream(fd, size, url, params):
        from disco.ddfs import DDFS
        tag = params.tag or 'disco:chunks:%s' % Task.jobname
        master = params.ddfs_master or Task.master
        yield url, DDFS(master).chunk(tag, [url])
    map_input_stream = [_map_input_stream]

    def map(entry, params):
        yield entry
def __init__(self, master, name, index, method, arg, streams, reduce, **kwargs):
    super(DiscoDBIterator, self).__init__(name=name, master=master)
    self.input = [['%s!%s/%s' % (url, method, arg) if method else url
                   for url in urls]
                  for urls in index.ichunks]
    self.map_input_stream = [scheme_discodb.input_stream] + streams
    self.params = Params(**kwargs)
    if reduce:
        self.partitions = len(self.master.nodeinfo())
        self.reduce = reduce
def predict(master, input, center, centers, map_reader=chain_reader):
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)
    return job.wait()
def setUp(self):
    sys.stdout = self.capture_stdout = cStringIO.StringIO()
    self.params = Params()
    self.params.keysets = {
        'last_name_keyset': dict(
            key_parts=['_keyset', 'last_name'],
            value_parts=['count'],
        ),
        'first_name_keyset': dict(
            key_parts=['_keyset', 'first_name'],
            value_parts=['count'],
        ),
    }
def __init__(self, master, name, dataset):
    super(Indexer, self).__init__(name=name, master=master)
    self.input = dataset.input
    self.map_input_stream = dataset.stream
    self.map_reader = dataset.parser
    self.map = dataset.demuxer
    self.partition = dataset.balancer
    self.profile = dataset.profile
    self.partitions = dataset.nr_ichunks
    self.required_files = dataset.required_files
    self.params = Params(n=0, unique_items=dataset.unique_items)
    if self.partitions:
        self.reduce = nop_reduce
        self.reduce_output_stream = [reduce_output_stream, discodb_output]
    else:
        self.map_output_stream = [map_output_stream, discodb_output]
class ParamsTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=5, f1=fun1, f2=fun2, now=datetime.now())
    sort = False

    def getdata(self, path):
        return '\n'.join([path] * 10)

    @staticmethod
    def map(e, params):
        return [(e, params.f1(int(e), params.x))]

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, params.f2(int(v)))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(fun2(int(k) + 5), int(v))
def test_keyset_multiplier(self):
    params = Params()
    params.keysets = {
        'last_name_keyset': dict(
            key_parts=['_keyset', 'last_name'],
            value_parts=['count'],
        ),
        'first_name_keyset': dict(
            key_parts=['_keyset', 'first_name'],
            value_parts=['count'],
        ),
    }
    data = [
        {'first_name': 'Willow', 'last_name': 'Harvey'},
        {'first_name': 'Noam', 'last_name': 'Clarke'},
    ]
    expected = [
        {'first_name': 'Willow', 'last_name': 'Harvey', '_keyset': 'first_name_keyset'},
        {'first_name': 'Willow', 'last_name': 'Harvey', '_keyset': 'last_name_keyset'},
        {'first_name': 'Noam', 'last_name': 'Clarke', '_keyset': 'first_name_keyset'},
        {'first_name': 'Noam', 'last_name': 'Clarke', '_keyset': 'last_name_keyset'},
    ]
    actual = keyset_multiplier(data, None, None, params)
    ok_(isinstance(actual, types.GeneratorType))
    eq_(list(actual), expected)
class PartialTestCase(DiscoJobTestFixture, DiscoTestCase):
    @property
    def inputs(self):
        return [str(x) for x in range(self.num_workers)]

    def getdata(self, path):
        return '1 _ 0 \n'

    map = partial(map, extra='a')
    combiner = partial(combiner, extra='b')
    reduce = partial(reduce, extra='c')
    map_init = partial(init, extra='d')
    reduce_init = partial(init, extra='e')
    map_reader = partial(reader, extra='f')
    map_writer = partial(writer, extra='g')
    reduce_reader = partial(reader, extra='h')
    reduce_writer = partial(writer, extra='i')
    params = Params(foo=partial(foo, extra='z'))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(k, '_fazbghczi')
def load_fact(master, input, config_path, nr_maps=1, nr_reduces=1,
              load_method=offdimetlmr, profile=False):
    #disco = Disco("disco://" + host)
    fact_starttime = time.time()
    fact_job = master.new_job(
        name='fact',
        input=input,
        map_init=load_method.fact_map_init,
        map_reader=load_method.map_reader,
        map=load_method.fact_map_func,
        combiner=load_method.fact_combiner_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        status_interval=1000000,
        profile=profile,
        params=Params(totalcopytime=0, nr_maps=nr_maps,
                      nr_reduces=nr_reduces)
    )
    results = fact_job.wait()
    #results = fact_job.wait(show=True, poll_interval=100, timeout=10*3600)
    fact_endtime = time.time()
    print "Time of loading facts: %f seconds" % (fact_endtime - fact_starttime)
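A sketch of wiring the two ETLMR loaders together, assuming a running Disco master and a config module on disk; the master URL, input tags, and paths are placeholders:

from disco.core import Disco

# Hypothetical driver for load_one_dim/load_fact above.
master = Disco('disco://localhost')
load_one_dim(master, ['tag://etl:dimdata'], '/path/to/config.py',
             nr_maps=4, nr_reduces=4)
load_fact(master, ['tag://etl:factdata'], '/path/to/config.py',
          nr_maps=4, nr_reduces=4)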
def __init__(
        self,
        # name, on/off
        name='_unnamed_',
        run=True,
        # throttle
        min_blobs=1,
        max_blobs=sys.maxint,
        partitions=200,
        partition_function=crc_partition,
        scheduler=None,
        worker=None,
        time_delta=None,
        newest_first=True,
        # archive
        archive=False,
        archive_tag_prefix='processed',
        # nuke
        nuke=False,
        # map
        map_init_function=lambda x, y: x,
        map_function=keyset_map,
        map_input_stream=chunk_csv_stream,
        map_output_stream=(map_output_stream, disco_output_stream),
        # combine
        combiner_function=None,
        # reduce
        reduce_function=keyset_reduce,
        reduce_output_stream=(reduce_output_stream, disco_output_stream),
        # result
        # result_iterator_override -->
        #     see inferno.lib.disco_ext.sorted_iterator for signature
        result_iterator_override=None,
        result_processor=keyset_result,
        result_tag=None,
        result_tag_suffix=True,
        save=False,
        sort=True,
        sort_buffer_size='10%',
        sorted_results=True,
        # keysets
        keysets=None,
        key_parts=None,
        value_parts=None,
        column_mappings=None,
        table=None,
        keyset_parts_preprocess=None,
        parts_postprocess=None,
        # input
        day_range=0,
        day_offset=0,
        day_start=None,
        source_tags=None,
        source_urls=None,
        # other
        rule_init_function=None,
        rule_cleanup=None,
        parts_preprocess=None,
        field_transforms=None,
        required_files=None,
        required_modules=None,
        # notifications --> notify_addresses must be a list of addresses
        notify_on_fail=False,
        notify_on_success=False,
        notify_addresses=None,
        **kwargs):
    self.qualified_name = name
    if kwargs:
        self.params = Params(**kwargs)
    else:
        self.params = Params()
    if not scheduler:
        scheduler = {'force_local': False, 'max_cores': 200}

    # name, on/off
    self.run = run
    self.name = name

    # throttle
    self.min_blobs = min_blobs
    self.max_blobs = max_blobs
    self.partitions = partitions
    self.partition_function = partition_function
    self.scheduler = scheduler
    self.time_delta = time_delta
    if self.time_delta is None:
        self.time_delta = {'minutes': 5}
    self.newest_first = newest_first
    self.worker = worker

    # archive
    self.archive = archive
    self.archive_tag_prefix = archive_tag_prefix

    # nuke
    self.nuke = nuke

    # map
    self.map_init_function = map_init_function
    self.map_function = map_function
    self.map_input_stream = map_input_stream
    self.map_output_stream = map_output_stream
    self.combiner_function = combiner_function

    # reduce
    self.reduce_function = reduce_function
    self.reduce_output_stream = reduce_output_stream

    # result
    self.result_processor = result_processor
    self.result_tag = result_tag
    self.result_tag_suffix = result_tag_suffix
    self.save = save
    self.sort = sort
    self.sort_buffer_size = sort_buffer_size
    if result_iterator_override:
        self.result_iterator = result_iterator_override
    elif self.sort and sorted_results:
        self.result_iterator = sorted_iterator
    else:
        self.result_iterator = result_iterator

    # input
    if isinstance(source_tags, basestring):
        source_tags = [source_tags]
    self.day_range = day_range
    self.day_offset = day_offset
    self.day_start = day_start
    self.source_tags = source_tags or []

    # keysets
    keyset_dict = {}
    if keysets:
        for keyset_name, keyset_obj in keysets.items():
            keyset_dict[keyset_name] = keyset_obj.as_dict()
    else:
        keyset_dict['_default'] = Keyset(key_parts,
                                         value_parts,
                                         column_mappings,
                                         table,
                                         keyset_parts_preprocess,
                                         parts_postprocess).as_dict()
    self.params.keysets = keyset_dict
    self.params.parts_preprocess = parts_preprocess or []
    self.params.field_transforms = field_transforms or dict()

    # other
    self.rule_init_function = rule_init_function
    self.rule_cleanup = rule_cleanup
    self.required_modules = required_modules or []
    self.required_files = required_files or []
    self.notify_on_fail = notify_on_fail
    self.notify_on_success = notify_on_success
    self.notify_addresses = notify_addresses or []
    self.source_urls = source_urls
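For context, a hedged construction example for the rule constructor above, assuming it belongs to Inferno's rule class (named InfernoRule in that project); the tag and part names are invented for illustration:

# Hypothetical rule: reads CSV chunks from a DDFS source tag and counts
# (first_name, last_name) keys via the default keyset machinery.
rule = InfernoRule(
    name='name_counts',
    source_tags=['incoming:names'],
    key_parts=['first_name', 'last_name'],
    value_parts=['count'],
    partitions=8)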
def params_2(self):
    return Params(job=self.job_1.name)
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')
    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # counts of the individual features; e.g. for a feature like
    # tall/short, this dict holds the number of times 'tall' and
    # 'short' were observed in the training set
    items = {}
    # counts of the class labels given in `ys`; e.g. if the dataset
    # includes males and females, this dict maps 'male' and 'female'
    # to the number of times each was observed
    classes = {}
    # counts of each (class, feature) co-occurrence
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif l[0] in ys:
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value

    # counts[key] = [[c, i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if key in pairs:
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if y not in classes:
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
            total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if l[0] not in loglikelihoods:
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] + value[2]) - math.log(value[1] + value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
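The two naive Bayes halves compose: estimate (above) produces the log-likelihood table that predict (earlier in this section) consumes. A sketch with placeholder input tags and class labels:

# Hypothetical end-to-end run; tags and labels are illustrative.
loglikelihoods = estimate(input=['tag://nb:train'], ys=['male', 'female'])
results = predict(input=['tag://nb:test'],
                  loglikelihoods=loglikelihoods,
                  ys=['male', 'female'])
for key, value in result_iterator(results):
    print key, value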