def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Run `iterations` rounds of k-clustering optimization, using the
    cluster center definitions given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)

    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j, ),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)

        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
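The worker-side callables referenced above (random_init_map, estimate_map, estimate_combiner, estimate_reduce) are defined elsewhere in the same module. As a rough illustration of how the Params object is consumed on the worker side, a map function in this style might read params.centers along the lines of the sketch below; the point/center representation and the sqdist helper are assumptions for illustration, not the actual implementation.

def estimate_map_sketch(point, params):
    # Hypothetical sketch only: assign `point` to the nearest current center.
    # `params.centers` is the list of (id, center) pairs computed above.
    def sqdist(a, b):
        return sum((x - y) ** 2 for x, y in zip(a, b))
    best_id, _ = min(params.centers, key=lambda pair: sqdist(point, pair[1]))
    return [(best_id, point)]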
Example #2
class InitTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=10)
    sort = False

    def getdata(self, path):
        return 'skipthis\n' + ('%s\n' % path) * 10

    @staticmethod
    def map_init(input_iter, params):
        input_iter.next()
        params.x += 100

    @staticmethod
    def map(e, params):
        return [(e, int(e) + params.x)]

    @staticmethod
    def reduce_init(input_iter, params):
        params.y = 1000

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, int(v) + params.y)

    def runTest(self):
        results = list(self.results)
        for k, v in results:
            self.assertEquals(int(k) + 1110, int(v))
        self.assertEquals(len(results), 100)
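For reference, the expected offset of 1110 in runTest follows directly from the parameter mutations above: map_init bumps params.x from 10 to 110 before any map call, map emits int(e) + 110, reduce_init sets params.y = 1000 and reduce adds it, so every output value equals its key plus 110 + 1000 = 1110. The 100 results come from 10 inputs of 10 lines each, after map_init skips the leading 'skipthis' line.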
Example #3
def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,
                 load_method=offdimetlmr, dimnames=repr([]),
                 go_live=1, profile=False):
    dim_job = master.new_job(
        name='dim',
        input=input,
        map_init=load_method.dim_map_init,
        map_reader=load_method.map_reader,
        map=load_method.dim_map_func,
        partition=load_method.dim_partition_func,
        combiner=load_method.dim_combiner_func,
        reduce=load_method.dim_reduce_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        profile=profile,
        status_interval=1000000,
        params=Params(count=0, dimnames=dimnames,
                      nr_maps=nr_maps, nr_reduces=nr_reduces)
    )
    results = dim_job.wait()
    shelvedb_paths = []
    if results is not None:
        for key, value in result_iterator(results):
            shelvedb_paths.append(key)
        if go_live == 1:
            load_method.golive(config, shelvedb_paths)
Example #4
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(
            rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get(
                    'worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn(
                "Error instantiating worker: %s %s - loading default worker" %
                (settings.get('worker'), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
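For reference, the fallback branch above expects the 'worker' setting to be a dotted path to a worker class; a hypothetical value (the path below is an assumption, not taken from the original source) would be resolved like this:

worker_path = 'disco.worker.classic.worker.Worker'   # hypothetical setting value
worker_mod, _, worker_class = worker_path.rpartition('.')
# worker_mod == 'disco.worker.classic.worker', worker_class == 'Worker'
mod = __import__(worker_mod, {}, {}, worker_mod)
worker = getattr(mod, worker_class)()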
Example #5
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [
        maybe_list(line.split()) for line in fileinput.input(inputs)
    ]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name
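Since the command-line parameters are merged via params.__dict__.update(...) above, it is worth noting that Params is essentially a plain attribute container. A minimal sketch of the behaviour being relied on, assuming disco.core.Params simply stores its keyword arguments as instance attributes (as the other examples here suggest):

from disco.core import Params

p = Params(x=1)
p.__dict__.update(dict(x=3, y=2))
assert p.x == 3 and p.y == 2   # command-line params override the job defaults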
Example #6
    def _assert_csv_reader(self, fields, values, expected):
        stream = StringIO.StringIO(values)
        params = Params()
        params.csv_fields = fields
        params.csv_dialect = csv.excel_tab
        actual = csv_reader(stream, None, None, params)
        ok_(isinstance(actual, types.GeneratorType))
        eq_(list(actual), expected)
Example #7
    def _assert_reduce(self, data, expected, **kwargs):
        # turn disco_debug on for more code coverage
        if kwargs is None:
            kwargs = dict()
        kwargs['disco_debug'] = True
        params = Params(**kwargs)
        actual = keyset_reduce(data, params)
        ok_(isinstance(actual, types.GeneratorType))
        eq_(list(actual), expected)
Example #8
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
Example #9
class Grep(Job):
    map = nop_map
    params = Params(pattern=None)

    def map_reader(fd, size, url, params):
        import re
        if params.pattern:
            pattern = re.compile(params.pattern)
            for line in fd:
                if pattern.match(line):
                    yield url, line
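A hedged usage sketch for the Grep job above (the input tag and pattern below are placeholders, not values from the original source); the pattern travels to the workers inside Params and is read back as params.pattern in map_reader:

from disco.core import Params, result_iterator

job = Grep().run(input=['tag://data:logs'],          # hypothetical input tag
                 params=Params(pattern=r'ERROR'))    # hypothetical pattern
for url, line in result_iterator(job.wait()):
    print line,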
Example #10
class LineChunker(Job):
    params = Params(ddfs_master=None, tag=None)

    def _map_input_stream(fd, size, url, params):
        from disco.ddfs import DDFS
        tag = params.tag or 'disco:chunks:%s' % Task.jobname
        master = params.ddfs_master or Task.master
        yield url, DDFS(master).chunk(tag, [url])
    map_input_stream = [_map_input_stream]

    def map(entry, params):
        yield entry
Example #11
    def __init__(self, master, name, index, method, arg, streams, reduce,
                 **kwargs):
        super(DiscoDBIterator, self).__init__(name=name, master=master)
        self.input = [[
            '%s!%s/%s' % (url, method, arg) if method else url for url in urls
        ] for urls in index.ichunks]
        self.map_input_stream = [scheme_discodb.input_stream] + streams
        self.params = Params(**kwargs)

        if reduce:
            self.partitions = len(self.master.nodeinfo())
            self.reduce = reduce
Example #12
def predict(master, input, center, centers, map_reader=chain_reader):
    """
    Predict the closest cluster for each data point in `input`.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()
Example #13
    def setUp(self):
        sys.stdout = self.capture_stdout = cStringIO.StringIO()
        self.params = Params()
        self.params.keysets = {
            'last_name_keyset': dict(
                key_parts=['_keyset', 'last_name'],
                value_parts=['count'],
            ),
            'first_name_keyset': dict(
                key_parts=['_keyset', 'first_name'],
                value_parts=['count'],
            ),
        }
Example #14
    def __init__(self, master, name, dataset):
        super(Indexer, self).__init__(name=name, master=master)
        self.input = dataset.input
        self.map_input_stream = dataset.stream
        self.map_reader = dataset.parser
        self.map = dataset.demuxer
        self.partition = dataset.balancer
        self.profile = dataset.profile
        self.partitions = dataset.nr_ichunks
        self.required_files = dataset.required_files
        self.params = Params(n=0, unique_items=dataset.unique_items)

        if self.partitions:
            self.reduce = nop_reduce
            self.reduce_output_stream = [reduce_output_stream, discodb_output]
        else:
            self.map_output_stream = [map_output_stream, discodb_output]
Example #15
class ParamsTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=5, f1=fun1, f2=fun2, now=datetime.now())
    sort = False

    def getdata(self, path):
        return '\n'.join([path] * 10)

    @staticmethod
    def map(e, params):
        return [(e, params.f1(int(e), params.x))]

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, params.f2(int(v)))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(fun2(int(k) + 5), int(v))
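A brief reading of the assertion above: map emits (e, fun1(int(e), params.x)) with x=5, reduce then emits (k, fun2(int(v))), and runTest checks fun2(int(k) + 5) == int(v). For those two to agree, fun1 is presumably simple addition; the point of the test is that plain functions (f1, f2) and values like datetime.now() survive the round trip through Params to the workers.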
Example #16
    def test_keyset_multiplier(self):
        params = Params()
        params.keysets = {
            'last_name_keyset': dict(
                key_parts=['_keyset', 'last_name'],
                value_parts=['count'],
            ),
            'first_name_keyset': dict(
                key_parts=['_keyset', 'first_name'],
                value_parts=['count'],
            ),
        }
        data = [
            {'first_name': 'Willow', 'last_name': 'Harvey'},
            {'first_name': 'Noam', 'last_name': 'Clarke'},
        ]
        expected = [
            {'first_name': 'Willow', 'last_name': 'Harvey',
             '_keyset': 'first_name_keyset'},
            {'first_name': 'Willow', 'last_name': 'Harvey',
             '_keyset': 'last_name_keyset'},
            {'first_name': 'Noam', 'last_name': 'Clarke',
             '_keyset': 'first_name_keyset'},
            {'first_name': 'Noam', 'last_name': 'Clarke',
             '_keyset': 'last_name_keyset'},
        ]
        actual = keyset_multiplier(data, None, None, params)
        ok_(isinstance(actual, types.GeneratorType))
        eq_(list(actual), expected)
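For context, a minimal stand-in consistent with the expected output above (not inferno's actual keyset_multiplier): it yields one copy of each record per keyset, in sorted keyset order, tagging each copy with the keyset name under '_keyset'.

def keyset_multiplier_sketch(records, size, url, params):
    # Hypothetical illustration only, matching the test expectations above.
    for record in records:
        for keyset_name in sorted(params.keysets):
            tagged = dict(record)
            tagged['_keyset'] = keyset_name
            yield tagged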
Example #17
class PartialTestCase(DiscoJobTestFixture, DiscoTestCase):
    @property
    def inputs(self):
        return [str(x) for x in range(self.num_workers)]

    def getdata(self, path):
        return '1 _ 0 \n'

    map = partial(map, extra='a')
    combiner = partial(combiner, extra='b')
    reduce = partial(reduce, extra='c')
    map_init = partial(init, extra='d')
    reduce_init = partial(init, extra='e')
    map_reader = partial(reader, extra='f')
    map_writer = partial(writer, extra='g')
    reduce_reader = partial(reader, extra='h')
    reduce_writer = partial(writer, extra='i')
    params = Params(foo=partial(foo, extra='z'))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(k, '_fazbghczi')
Example #18
def load_fact(master, input, config_path, nr_maps=1, nr_reduces=1,
              load_method=offdimetlmr, profile=False):
    #disco = Disco("disco://"+host)
    fact_starttime = time.time()
    fact_job = master.new_job(
        name='fact',
        input=input,
        map_init=load_method.fact_map_init,
        map_reader=load_method.map_reader,
        map=load_method.fact_map_func,
        combiner=load_method.fact_combiner_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        status_interval=1000000,
        profile=profile,
        params=Params(totalcopytime=0, nr_maps=nr_maps,
                      nr_reduces=nr_reduces)
    )
    results = fact_job.wait()
    #results = fact_job.wait(show=True, poll_interval=100, timeout=10*3600)
    fact_endtime = time.time()
    print "Time of loading facts: %f seconds" % (fact_endtime - fact_starttime)
Example #19
    def __init__(
            self,
            # name, on/off
            name='_unnamed_',
            run=True,

            # throttle
            min_blobs=1,
            max_blobs=sys.maxint,
            partitions=200,
            partition_function=crc_partition,
            scheduler=None,
            worker=None,
            time_delta=None,
            newest_first=True,

            # archive
            archive=False,
            archive_tag_prefix='processed',

            # nuke
            nuke=False,

            # map
            map_init_function=lambda x, y: x,
            map_function=keyset_map,
            map_input_stream=chunk_csv_stream,
            map_output_stream=(map_output_stream, disco_output_stream),

            # combiner
            combiner_function=None,

            # reduce
            reduce_function=keyset_reduce,
            reduce_output_stream=(reduce_output_stream, disco_output_stream),

            # result
            # result_iterator_override -->
            #   see inferno.lib.disco_ext.sorted_iterator for signature
            result_iterator_override=None,
            result_processor=keyset_result,
            result_tag=None,
            result_tag_suffix=True,
            save=False,
            sort=True,
            sort_buffer_size='10%',
            sorted_results=True,

            # keysets
            keysets=None,
            key_parts=None,
            value_parts=None,
            column_mappings=None,
            table=None,
            keyset_parts_preprocess=None,
            parts_postprocess=None,

            # input
            day_range=0,
            day_offset=0,
            day_start=None,
            source_tags=None,
            source_urls=None,

            # other
            rule_init_function=None,
            rule_cleanup=None,
            parts_preprocess=None,
            field_transforms=None,
            required_files=None,
            required_modules=None,

            # notifications --> notify_addresses must be list of addresses
            notify_on_fail=False,
            notify_on_success=False,
            notify_addresses=None,
            **kwargs):

        self.qualified_name = name
        if kwargs:
            self.params = Params(**kwargs)
        else:
            self.params = Params()

        if not scheduler:
            scheduler = {'force_local': False, 'max_cores': 200}

        # name, on/off
        self.run = run
        self.name = name

        # throttle
        self.min_blobs = min_blobs
        self.max_blobs = max_blobs
        self.partitions = partitions
        self.partition_function = partition_function
        self.scheduler = scheduler
        self.time_delta = time_delta
        if self.time_delta is None:
            self.time_delta = {'minutes': 5}
        self.newest_first = newest_first
        self.worker = worker

        # archive
        self.archive = archive
        self.archive_tag_prefix = archive_tag_prefix

        # nuke
        self.nuke = nuke

        # map
        self.map_init_function = map_init_function
        self.map_function = map_function
        self.map_input_stream = map_input_stream
        self.map_output_stream = map_output_stream
        self.combiner_function = combiner_function

        # reduce
        self.reduce_function = reduce_function
        self.reduce_output_stream = reduce_output_stream

        # result
        self.result_processor = result_processor
        self.result_tag = result_tag
        self.result_tag_suffix = result_tag_suffix
        self.save = save
        self.sort = sort
        self.sort_buffer_size = sort_buffer_size
        if result_iterator_override:
            self.result_iterator = result_iterator_override
        elif self.sort and sorted_results:
            self.result_iterator = sorted_iterator
        else:
            self.result_iterator = result_iterator

        # input
        if isinstance(source_tags, basestring):
            source_tags = [source_tags]
        self.day_range = day_range
        self.day_offset = day_offset
        self.day_start = day_start
        self.source_tags = source_tags or []

        # keysets
        keyset_dict = {}
        if keysets:
            for keyset_name, keyset_obj in keysets.items():
                keyset_dict[keyset_name] = keyset_obj.as_dict()
        else:
            keyset_dict['_default'] = Keyset(key_parts, value_parts,
                                             column_mappings, table,
                                             keyset_parts_preprocess,
                                             parts_postprocess).as_dict()
        self.params.keysets = keyset_dict

        self.params.parts_preprocess = parts_preprocess or []
        self.params.field_transforms = field_transforms or dict()

        # other
        self.rule_init_function = rule_init_function
        self.rule_cleanup = rule_cleanup
        self.required_modules = required_modules or []
        self.required_files = required_files or []
        self.notify_on_fail = notify_on_fail
        self.notify_on_success = notify_on_success
        self.notify_addresses = notify_addresses or []
        self.source_urls = source_urls
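A hedged usage sketch for the constructor above (the class name InfernoRule and every argument value below are assumptions for illustration): keyword arguments that do not match a named parameter fall through **kwargs into rule.params, alongside the keysets, parts_preprocess and field_transforms attributes set on it here.

rule = InfernoRule(name='name_counts',                # hypothetical rule
                   source_tags=['incoming:names'],
                   key_parts=['first_name', 'last_name'],
                   value_parts=['count'],
                   min_count=5)                       # unknown kwarg -> rule.params
assert rule.params.min_count == 5
assert '_default' in rule.params.keysets              # built from key_parts/value_parts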
Example #20
    def params_2(self):
        return Params(job=self.job_1.name)
Example #21
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # the number of times each item (feature value) has been observed in the
    # training set; for example, if the feature is something like tall or
    # short, this dict holds how often tall and short were seen
    items = {}

    # the number of times each class label has been observed; for example,
    # if the data is labelled male/female, this dict holds the keys male and
    # female with their counts in the training set
    classes = {}

    # the number of times a class has been seen together with an item
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value

    # counts[key] = [n(y, i), n(not y, i), n(y, not i), n(not y, not i)],
    # where y is the class label and i the item (feature value) in `key`
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] +
                                         value[2]) - math.log(value[1] +
                                                              value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
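For reference, the loop above leaves two kinds of entries in loglikelihoods, both computed from the add-one-smoothed counts: for each class/item key y + splitter + i, loglikelihoods[key] = log n(y, i) - log n(not y, i); and for each class y, loglikelihoods[y] accumulates log n(y) - log n(not y) once per item, since value[0] + value[2] and value[1] + value[3] collapse to the smoothed class totals. The companion predict step presumably sums these per-class terms to score each class for a given set of observed items.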