def classic_iterator(urls,
                     reader=func.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    Lazily yield the records of *urls* as the classic map interface sees them.

    Every input produced by ``util.inputlist(urls)`` is announced to
    *notifier* and then read through a classic ``Worker`` configured with
    *reader* and *input_stream*.

    :type  reader: :func:`disco.classic.worker.func.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.classic.worker.func.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.classic.worker.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: passed through to the worker's input opener.

    :param ddfs: when truthy, used as ``DISCO_MASTER`` for the settings
                 that resolve *urls*; otherwise default settings are used.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    classic_worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()

    for source in util.inputlist(urls, settings=settings):
        notifier(source)
        opener = classic_worker.opener('map', 'in', params)
        for record in Input(source, open=opener):
            yield record
def proxy_url(path, node='x'):
    """
    Build the URL under which *path* on *node* can be fetched.

    When ``DISCO_PROXY`` is set, the URL goes through the proxy
    (``<scheme>://<netloc>/disco/node/<node>/<path>``); otherwise it points
    directly at *node* on ``DISCO_PORT``.
    """
    settings = DiscoSettings()
    port = settings['DISCO_PORT']
    proxy = settings['DISCO_PROXY']
    if not proxy:
        return 'http://%s:%s/%s' % (node, port, path)
    # only scheme and location of the proxy address are needed
    scheme, netloc, _ = urlsplit(proxy)
    return '%s://%s/disco/node/%s/%s' % (scheme, netloc, node, path)
def __init__(self,
             netlocstr='',
             id=-1,
             inputs=None,
             jobdict=None,
             jobname='',
             settings=DiscoSettings()):
    """
    Set up the task state and, if needed, load the job description.

    :param netlocstr: location string, parsed with ``util.netloc.parse``.
    :param id: task id; coerced to ``int``.
    :param inputs: stored as-is on the instance.
    :param jobdict: job description; when falsy it is unpacked from the
                    file at ``self.jobpack``.
    :param jobname: stored as-is on the instance.
    :param settings: settings object; note that the default instance is
                     created once, when the function is defined, and is
                     therefore shared by every call that omits it.
    """
    self.netloc = util.netloc.parse(netlocstr)
    self.id = int(id)
    self.inputs = inputs
    self.jobdict = jobdict
    self.jobname = jobname
    self.settings = settings
    self.blobs = []
    # the lower-cased class name doubles as the task mode
    self.mode = self.__class__.__name__.lower()
    # mode, task id, millisecond timestamp and pid make up the run id
    self.run_id = "%s:%d-%x-%x" % (self.mode,
                                   self.id,
                                   int(time.time() * 1000),
                                   os.getpid())

    set_mem_limit(self.settings['DISCO_WORKER_MAX_MEM'])

    if not jobdict:
        # Close the job pack explicitly instead of leaving the handle to
        # the garbage collector; try/finally keeps this valid on every
        # Python version the original code ran on.
        jobpack_file = open(self.jobpack)
        try:
            self.jobdict = JobDict.unpack(jobpack_file,
                                          globals=worker.__dict__)
        finally:
            jobpack_file.close()
        self.insert_globals(self.functions)
def _master(host_port):
    """
    Return the master address for a ``(host, port)`` pair.

    With host and port the result is an ``http://`` address, with only a
    host a ``disco://`` address, and without a host the enclosing
    ``master`` value or the ``DISCO_MASTER`` setting.
    """
    host, port = host_port
    if host and port:
        return 'http://{0}:{1}'.format(host, port)
    if host:
        return 'disco://{0}'.format(host)
    # no host given: fall back to the configured master
    return master or DiscoSettings()['DISCO_MASTER']
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY'], meth='GET', to_master=True):
    """
    Rewrite *url* so that it is fetched through *proxy*, if one is set.

    ``tag`` URLs and calls without a proxy return *url* unchanged.  With
    *to_master* the proxied URL is ``<proxy>/<path>``; otherwise it is
    ``<proxy>/proxy/<host>/<meth>/<path>``.

    The default *proxy* is read from ``DISCO_PROXY`` once, when the
    function is defined.
    """
    scheme, (host, port), path = urlsplit(url)
    if not proxy or scheme == "tag":
        return url
    if to_master:
        return '{0}/{1}'.format(proxy, path)
    return '{0}/proxy/{1}/{2}/{3}'.format(proxy, host, meth, path)
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY'], meth='GET', to_master=True):
    """
    Rewrite an internal ``http`` *url* so that it goes through *proxy*.

    Only ``http`` URLs without a dot anywhere in them are proxied; a dot
    is taken to mean an external resource.  With *to_master* the result is
    ``<proxy>/<path>``; otherwise ``<proxy>/proxy/<host>/<meth>/<path>``.
    Everything else is returned unchanged.

    The default *proxy* is read from ``DISCO_PROXY`` once, when the
    function is defined.
    """
    scheme, (host, port), path = urlsplit(url)
    # if the url contains a dot, it is an external resource, so do not proxy it
    should_proxy = proxy and scheme == "http" and '.' not in url
    if not should_proxy:
        return url
    if to_master:
        return '{0}/{1}'.format(proxy, path)
    return '{0}/proxy/{1}/{2}/{3}'.format(proxy, host, meth, path)
def inputexpand(input, label=None, settings=DiscoSettings()):
    """
    Expand one job input into a list (or iterable) of concrete inputs.

    * partitioned input (unless *label* is ``False``): the entries of each
      directory, as returned by ``parse_dir``, zipped together;
    * any other iterable: a single-element list holding its ``inputlist``;
    * a tag: the blobs of every tag found by ``DDFS.findtags``, chained;
    * anything else: ``[input]``.

    The default *settings* instance is created once, when the function is
    defined.
    """
    from disco.ddfs import DDFS, istag

    if ispartitioned(input) and label is not False:
        per_directory = (parse_dir(directory, label=label)
                         for directory in iterify(input))
        return zip(*per_directory)
    if isiterable(input):
        return [inputlist(input, label=label, settings=settings)]
    if istag(input):
        found = DDFS(settings=settings).findtags(input)
        return chainify(blobs for _name, _tags, blobs in found)
    return [input]
def submit(master, jobpack):
    """
    Post *jobpack* to ``/disco/job/new`` on *master* and print the reply.

    A status other than ``'ok'`` is reported through ``errmsg``.
    """
    from disco.settings import DiscoSettings
    from disco.core import Disco

    # NOTE(review): this instance is not used below; it is kept because
    # constructing it is part of the existing behaviour.
    settings = DiscoSettings()
    dmaster = Disco(master)
    # single-argument print produces the same text as the print statement
    print("Submitting job to  %s" % (master,))
    reply = dmaster.request('/disco/job/new', jobpack)
    status, response = json.loads(reply)
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print(response)
def urlresolve(url, settings=DiscoSettings()):
    """
    Resolve *url* into a plain ``scheme://netloc/path`` string.

    ``tag`` URLs are rewritten to the ``/ddfs/tag/<name>`` resource of
    their master and resolved again; the master is taken from the URL's
    location, falling back to ``settings['DISCO_MASTER']``.

    :param url: the url to resolve.
    :param settings: settings used for the master fallback.  The default
                     instance is created once, when the function is
                     defined.
    """
    def master(hostport):
        # explicit unpacking instead of a tuple parameter, which is
        # Python 2-only syntax
        host, port = hostport
        if not host:
            return settings['DISCO_MASTER']
        if not port:
            return 'disco://%s' % host
        return 'http://%s:%s' % (host, port)

    scheme, netloc, path = urlsplit(url)
    if scheme == 'tag':
        # keep using the caller's settings for the nested resolution
        return urlresolve('%s/ddfs/tag/%s' % (master(netloc), path),
                          settings=settings)
    return '%s://%s/%s' % (scheme, netloc, path)
def assert_url(url, master=None):
    """
    Return *url* if it has both a scheme and a network location.

    :param url: the url to validate.
    :param master: only used in the error message.
    :raises DiscoError: if *url* cannot be parsed, or lacks a scheme or a
                        network location.  The message shows *url*, the
                        ``DISCO_MASTER`` setting and *master*.
    """
    err = """url netloc/scheme: '{}' \nDiscoSettings()['DISCO_MASTER'] = {} os.environ['DISCO_MASTER'] = '{}'"""
    try:
        parsed = urlparse(url)
        valid = bool(parsed.scheme and parsed.netloc)
    except Exception:
        # any parsing failure is reported as an invalid url, as before
        valid = False
    # An explicit check rather than ``assert``: asserts are stripped when
    # Python runs with -O, which would silently disable the validation.
    if not valid:
        # exactly one argument per placeholder, in template order
        raise DiscoError(err.format(url,
                                    DiscoSettings()['DISCO_MASTER'] or 'None',
                                    master))
    return url
def sorted_iterator(urls,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    Return a ``SortedIterator`` over the input streams of *urls*.

    Each input from ``util.inputlist(urls)`` is announced to *notifier*
    and opened through a classic ``Worker`` configured with *reader* and
    *input_stream*; streams that evaluate as false are left out.

    :param params: passed through to the worker's input opener.
    :param ddfs: when truthy, used as ``DISCO_MASTER`` for the settings
                 that resolve *urls*.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    classic_worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()

    streams = []
    for source in util.inputlist(urls, settings=settings):
        notifier(source)
        stream = Input(source, open=classic_worker.opener('map', 'in', params))
        if not stream:
            continue
        streams.append(stream)
    return SortedIterator(streams)
def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    Lazily yield the records stored at *urls*, in either disco or ddfs.

    Every input is rewritten with ``proxy_url(..., to_master=False)``
    (element-wise for tuples and other sequences, keeping tuples as
    tuples), announced to *notifier*, and read through a
    ``StreamCombiner`` built from *input_stream* followed by *reader*.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: handed to the ``StreamCombiner``.
    :param ddfs: when truthy, used as ``DISCO_MASTER`` for the settings
                 that resolve *urls*.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner

    def open_combined(url):
        # the stream chain only depends on the arguments, not on the input
        streams = list(input_stream)
        if reader:
            streams.append(reader)
        return StreamCombiner(url, streams, params)

    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()

    for source in util.inputlist(urls, settings=settings):
        if isinstance(source, basestring):
            dest = proxy_url(source, to_master=False)
        else:
            proxied = [proxy_url(replica, to_master=False) for replica in source]
            dest = tuple(proxied) if isinstance(source, tuple) else proxied
        notifier(dest)
        for record in Input(dest, open=open_combined):
            yield record
def wait(self, name, poll_interval=2, timeout=None, clean=False, show=DiscoSettings()['DISCO_EVENTS']):
    """
    Block until the job *name* has finished. Returns a list of URLs to the
    results files which is typically processed with :func:`result_iterator`.

    :meth:`Disco.wait` polls the server for the job status every
    *poll_interval* seconds. It raises a :class:`disco.JobError` if the
    job hasn't finished in *timeout* seconds, if specified.

    :param clean: if set to `True`, calls :meth:`Disco.clean` when the job has
                  finished. The job is also cleaned when waiting ends
                  with an error.

                  Note that this only removes records from the master,
                  but not the actual result files. Once you are done
                  with the results, call::

                    disco.purge(disco.util.jobname(results[0]))

                  to delete the actual result files.

    :param show: enables console output of job events.
                 You can control this parameter also using the environment
                 variable ``DISCO_EVENTS``, which provides the default.
                 See ``DISCO_EVENTS`` in :mod:`disco.settings`.
                 (*Added in version 0.2.3*)
    """
    event_monitor = EventMonitor(Job(self, name=name),
                                 format=show,
                                 poll_interval=poll_interval)
    start_time = time.time()
    while True:
        event_monitor.refresh()
        still_running = False
        try:
            return self.check_results(name, start_time,
                                      timeout, poll_interval * 1000)
        except Continue:
            # not finished yet: poll again after the ``finally`` clause
            still_running = True
        finally:
            # ``finally`` runs on every pass through the loop, so the
            # cleanup must be skipped while the job is still running.
            if clean and not still_running:
                self.clean(name)
            event_monitor.refresh()
def urlsplit(url, localhost=None, settings=DiscoSettings()):
    """
    Split *url* into ``(scheme, netloc, path)``.

    ``disco://`` URLs are translated: when the location equals
    *localhost* they become ``file`` URLs under ``DDFS_ROOT`` (for the
    ``ddfs`` prefix) or ``DISCO_DATA``; otherwise they become ``http``
    URLs on ``DISCO_PORT``.  For a ``tag://`` URL without a path, the
    location string is used as the path.

    The default *settings* instance is created once, when the function is
    defined.
    """
    scheme, rest = schemesplit(url)
    if '/' in rest:
        locstr, path = rest.split('/', 1)
    else:
        locstr, path = rest, ''

    if scheme == 'disco':
        prefix, fname = path.split('/', 1)
        if locstr == localhost:
            scheme = 'file'
            root_key = 'DDFS_ROOT' if prefix == 'ddfs' else 'DISCO_DATA'
            path = os.path.join(settings[root_key], fname)
        else:
            scheme = 'http'
            locstr = '%s:%s' % (locstr, settings['DISCO_PORT'])

    if scheme == 'tag' and not path:
        path, locstr = locstr, ''
    return scheme, netloc.parse(locstr), path
def urlsplit(url, localhost=None, disco_port=None, **kwargs):
    """
    Split *url* into ``(scheme, netloc, path)``.

    For a ``tag://`` URL without a path, the location string is used as
    the path.  Other URLs are translated when their scheme is ``disco``
    or their port equals the disco port:

    * if *localhost* equals ``True`` or matches the location, the URL
      becomes a ``file`` URL whose path comes from
      ``localize(path, **kwargs)``;
    * otherwise a ``disco://`` URL becomes an ``http`` URL on the disco
      port.

    :param disco_port: port to compare against and to use for ``http``
                       URLs; defaults to the ``DISCO_PORT`` setting as a
                       string.
    """
    scheme, rest = schemesplit(url)
    if '/' in rest:
        locstr, path = rest.split('/', 1)
    else:
        locstr, path = rest, ''

    if scheme == 'tag':
        if not path:
            path, locstr = locstr, ''
        return scheme, netloc.parse(locstr), path

    disco_port = disco_port or str(DiscoSettings()['DISCO_PORT'])
    host, port = netloc.parse(locstr)
    served_by_disco = scheme == 'disco' or port == disco_port
    if served_by_disco:
        # ``== True`` is intentional: it is what the callers rely on
        is_local = localhost == True or locstr == localhost
        if is_local:
            scheme = 'file'
            locstr = ''
            path = localize(path, **kwargs)
        elif scheme == 'disco':
            scheme = 'http'
            locstr = '{0}:{1}'.format(host, disco_port)
    return scheme, netloc.parse(locstr), path
class DiscoTestCase(TestCase):
    """Test case base class with access to the configured Disco master."""

    # settings shared by all tests; created once, at class definition
    disco_settings = DiscoSettings()

    @property
    def disco_master_url(self):
        """The ``DISCO_MASTER`` value from the settings."""
        return self.disco_settings['DISCO_MASTER']

    @property
    def disco(self):
        """A new ``Disco`` client for the configured master."""
        return Disco(self.disco_master_url)

    def assertCommErrorCode(self, code, callable):
        """
        Assert that *callable* raises a ``CommError`` whose code is *code*.

        :raises AssertionError: if *callable* raises a different
                                exception, or returns without raising.
        """
        from disco.error import CommError
        try:
            ret = callable()
        except CommError as e:
            return self.assertEquals(code, e.code)
        except Exception as e:
            raise AssertionError('CommError not raised, got %s' % e)
        # returning normally must fail the assertion too, otherwise the
        # check would pass without ever seeing a CommError
        raise AssertionError('CommError not raised (expected %s), '
                             'returned %s' % (code, ret))
def Open(url, task=None):
    """
    Load the discodb that *url* points to, optionally calling a method on it.

    The part of the url after ``!`` selects a method and, after a ``/``,
    its argument: ``<path>!<method>[/<arg>]``.  For ``query`` and
    ``metaquery`` the argument is parsed with ``Q.urlscan``; any other
    method receives the argument as given, or no argument at all when it
    is missing or empty.

    :param url: location of the discodb file.
    :param task: if given, supplies ``disco_data`` and ``ddfs_data``;
                 otherwise they are read from ``DiscoSettings``.
    :return: the ``DiscoDB`` object, or the result of the selected method.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']

    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    # NOTE(review): the file object is never closed here; confirm whether
    # DiscoDB.load still needs it after returning before closing it.
    discodb = DiscoDB.load(open(util.localize(path,
                                              disco_data=disco_data,
                                              ddfs_data=ddfs_data)))
    if not rest:
        return discodb

    method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
    method = getattr(discodb, method_name)
    if method_name in ('metaquery', 'query'):
        return method(Q.urlscan(arg))
    # Pass the argument as a single value and drop it when absent.
    # Filtering ``arg`` itself failed on ``None`` and split a string
    # argument into separate characters.
    return method(*[a for a in (arg, ) if a])
class TestCase(unittest.TestCase):
    """Base class for Disco tests, with helpers built on the settings."""

    # settings shared by all tests; created once, at class definition
    settings = DiscoSettings()

    @property
    def ddfs(self):
        """A new ``DDFS`` client using the test settings."""
        return DDFS(settings=self.settings)

    @property
    def disco(self):
        """A new ``Disco`` client using the test settings."""
        return Disco(settings=self.settings)

    @property
    def nodes(self):
        """Map each non-blacklisted host to its ``max_workers``."""
        return dict((host, info['max_workers'])
                    for host, info in self.disco.nodeinfo().items()
                    if not info['blacklisted'])

    @property
    def num_workers(self):
        """Sum of ``max_workers`` over all nodes, blacklisted or not."""
        return sum(x['max_workers'] for x in self.disco.nodeinfo().values())

    @property
    def test_server_address(self):
        """``(host, port)`` from ``DISCO_TEST_HOST`` and ``DISCO_TEST_PORT``."""
        return (str(self.settings['DISCO_TEST_HOST']),
                int(self.settings['DISCO_TEST_PORT']))

    def assertAllEqual(self, results, answers):
        """Assert that *results* and *answers* are equal item by item."""
        # the longest-zip variant is used, so the sequences are walked to
        # the end of the longer one
        from disco.future import izip_longest as zip
        for result, answer in zip(results, answers):
            self.assertEquals(result, answer)

    def assertCommErrorCode(self, code, callable):
        """
        Assert that *callable* raises a ``CommError`` whose code is *code*.

        :raises AssertionError: if *callable* raises a different
                                exception, or returns without raising.
        """
        from disco.error import CommError
        try:
            ret = callable()
        except CommError as e:
            return self.assertEquals(code, e.code)
        except Exception as e:
            raise AssertionError('CommError not raised, got %s' % e)
        # returning normally must fail the assertion too, otherwise the
        # check would pass without ever seeing a CommError
        raise AssertionError('CommError not raised (expected %s), '
                             'returned %s' % (code, ret))
class TestCase(unittest.TestCase):
    """Base class for Disco tests: clients, assertions and a test server."""

    # settings shared by all tests; created once, at class definition
    settings = DiscoSettings()

    @property
    def ddfs(self):
        """A new ``DDFS`` client using the test settings."""
        return DDFS(settings=self.settings)

    @property
    def disco(self):
        """A new ``Disco`` client using the test settings."""
        return Disco(settings=self.settings)

    @property
    def nodes(self):
        """Map each non-blacklisted host to its ``max_workers``."""
        usable = {}
        for host, info in self.disco.nodeinfo().items():
            if not info['blacklisted']:
                usable[host] = info['max_workers']
        return usable

    @property
    def num_workers(self):
        """Sum of ``max_workers`` over all nodes, blacklisted or not."""
        node_infos = self.disco.nodeinfo().values()
        return sum(info['max_workers'] for info in node_infos)

    @property
    def test_server_address(self):
        """``(host, port)`` from ``DISCO_TEST_HOST`` and ``DISCO_TEST_PORT``."""
        host = str(self.settings['DISCO_TEST_HOST'])
        port = int(self.settings['DISCO_TEST_PORT'])
        return (host, port)

    def assertAllEqual(self, results, answers):
        """Assert that *results* and *answers* are equal item by item."""
        from disco.compat import zip_longest
        for result, answer in zip_longest(results, answers):
            self.assertEquals(result, answer)

    def assertCommErrorCode(self, code, callable):
        """
        Assert that *callable* raises a ``CommError`` whose code is *code*.

        Any other exception, or a normal return, is an assertion failure.
        """
        from disco.error import CommError
        try:
            ret = callable()
        except CommError as e:
            return self.assertEquals(code, e.code)
        except Exception as e:
            raise AssertionError('CommError not raised, got {0}'.format(e))
        else:
            message = 'CommError not raised (expected {0}), returned {1}'
            raise AssertionError(message.format(code, ret))

    def assertResults(self, job, answers):
        """Assert that the results of *job* equal *answers* item by item."""
        self.assertAllEqual(self.results(job), answers)

    def results(self, job, **kwargs):
        """Wait for *job* and return an iterator over its results."""
        return result_iterator(job.wait(), **kwargs)

    def run(self, result=None):
        """Run the test with a SIGINT handler bound to this test case."""
        self.is_running = True
        interrupt_handler = InterruptTest(self)
        signal.signal(signal.SIGINT, interrupt_handler)
        super(TestCase, self).run(result)
        self.is_running = False

    def setUp(self):
        """Start a test server if the test case defines ``serve``."""
        if not hasattr(self, 'serve'):
            return
        self.test_server = TestServer.create(self.test_server_address,
                                             self.serve)
        self.test_server.start()

    def tearDown(self):
        """Stop the test server; purge the job if ``DISCO_TEST_PURGE`` is set."""
        if hasattr(self, 'serve'):
            self.test_server.stop()
        has_job = hasattr(self, 'job')
        if has_job and self.settings['DISCO_TEST_PURGE']:
            self.job.purge()

    def skipTest(self, message):
        """Skip the test where ``unittest`` supports it, else do nothing."""
        # Workaround for python2.5 which doesn't have skipTest in unittests
        # make sure calls to skipTest are the last statement in a code branch
        # (until we drop 2.5 support)
        try:
            super(TestCase, self).skipTest(message)
        except AttributeError:
            pass
class JobDict(util.DefaultDict):
    """
    :meth:`Disco.new_job` and :meth:`Job.run`
    accept the same set of keyword arguments as specified below.

    .. note:: All arguments that are required are marked as such.
              All other arguments are optional.

    :type  input: **required**, list of inputs or list of list of inputs
    :param input: Each input must be specified in one of the following ways:

                   * ``http://www.example.com/data`` - any HTTP address
                   * ``disco://cnode03/bigtxt/file_name`` - Disco address.
                     Refers to ``cnode03:/var/disco/bigtxt/file_name``.
                     Currently this is an alias for
                     ``http://cnode03:[DISCO_PORT]/bigtxt/file_name``.
                   * ``dir://cnode03/jobname/`` - Result directory.
                     This format is used by Disco internally.
                   * ``/home/bob/bigfile.txt`` - a local file.
                     Note that the file must either exist on all the nodes
                     or you must make sure that the job is run only on the
                     nodes where the file exists.
                     Due to these restrictions, this form has only limited
                     use.
                   * ``raw://some_string`` - pseudo-address; instead of
                     fetching data from a remote source, use ``some_string``
                     in the address as data.
                     Useful for specifying dummy inputs for generator maps.
                   * ``tag://tagname`` - a tag stored in :ref:`DDFS`
                     (*Added in version 0.3*)

                  (*Added in version 0.3.2*)
                  Tags can be token protected. For the data in such
                  token-protected tags to be used as job inputs, the tags
                  should be resolved into the constituent urls or replica
                  sets (e.g. using util.urllist), and provided as the value
                  of the input parameter.

                  (*Added in version 0.2.2*):
                  An input entry can be a list of inputs:
                  This lets you specify redundant versions of an input file.
                  If a list of redundant inputs is specified,
                  the scheduler chooses the input that is located on the node
                  with the lowest load at the time of scheduling.
                  Redundant inputs are tried one by one until the task
                  succeeds.
                  Redundant inputs require that the *map* function is
                  specified.

    :type  map: :func:`disco.func.map`
    :param map: a :term:`pure function` that defines the map task.

    :type  map_init: :func:`disco.func.init`
    :param map_init: initialization function for the map task.
                     This function is called once before the task starts.

    :type  map_input_stream: list of :func:`disco.func.input_stream`
    :param map_input_stream: The given functions are chained together and the
                             final resulting :class:`disco.func.InputStream`
                             object is used to iterate over input entries.

                             (*Added in version 0.2.4*)

    :type  map_output_stream: list of :func:`disco.func.output_stream`
    :param map_output_stream: The given functions are chained together and the
                              :meth:`disco.func.OutputStream.add` method of
                              the last returned
                              :class:`disco.func.OutputStream` object is used
                              to serialize key, value pairs output by the map.

                              (*Added in version 0.2.4*)

    :type  map_reader: ``None`` or :func:`disco.func.input_stream`
    :param map_reader: Convenience function to define the last
                       :func:`disco.func.input_stream` function in the
                       *map_input_stream* chain.

                       Disco worker provides a convenience function
                       :func:`disco.func.re_reader` that can be used to create
                       a reader using regular expressions.

                       If you want to use outputs of an earlier job as inputs,
                       use :func:`disco.func.chain_reader` as the
                       *map_reader*.

                       Default is ``None``.

                       (*Changed after version 0.3.1*)
                       The default map_reader became ``None``.
                       See the note in :func:`disco.func.map_line_reader`
                       for information on how this might affect older jobs.

    :param map_writer: (*Deprecated in version 0.3*) This function comes in
                       handy e.g. when *reduce* is not specified and you want
                       *map* output in a specific format.
                       Another typical case is to use
                       :func:`disco.func.object_writer` as *map_writer* and
                       :func:`disco.func.object_reader` as *reduce_reader*
                       so you can produce arbitrary Python objects in *map*.

                       Remember to specify a *reduce_reader*
                       that can read the format produced by *map_writer*.
                       (*Added in version 0.2*)

    :type  reduce: :func:`disco.func.reduce`
    :param reduce: If no reduce function is specified, the job will quit after
                   the map phase has finished.

                   *Added in version 0.3.1*:
                   Reduce supports now an alternative signature,
                   :func:`disco.func.reduce2` which uses an iterator instead
                   of ``out.add()`` to output results.

                   *Changed in version 0.2*:
                   It is possible to define only *reduce* without *map*.
                   For more information, see the FAQ entry :ref:`reduceonly`.

    :type  reduce_init: :func:`disco.func.init`
    :param reduce_init: initialization function for the reduce task.
                        This function is called once before the task starts.

    :type  reduce_input_stream: list of :func:`disco.func.output_stream`
    :param reduce_input_stream: The given functions are chained together and
                                the last returned
                                :class:`disco.func.InputStream` object is
                                given to *reduce* as its first argument.

                                (*Added in version 0.2.4*)

    :type  reduce_output_stream: list of :func:`disco.func.output_stream`
    :param reduce_output_stream: The given functions are chained together and
                                 the last returned
                                 :class:`disco.func.OutputStream` object is
                                 given to *reduce* as its second argument.

                                 (*Added in version 0.2.4*)

    :type  reduce_reader: :func:`disco.func.input_stream`
    :param reduce_reader: This function needs to match with *map_writer*,
                          if *map* is specified.
                          If *map* is not specified,
                          you can read arbitrary inputs with this function,
                          similar to *map_reader*.
                          (*Added in version 0.2*)

                          Default is :func:`disco.func.chain_reader`.

    :param reduce_writer: (*Deprecated in version 0.3*) You can use this
                          function to output results in an arbitrary format
                          from your map/reduce job.
                          If you use :func:`result_iterator` to read results,
                          set its *reader* parameter to a function
                          that can read the format produced by
                          *reduce_writer*.
                          (*Added in version 0.2*)

    :type  combiner: :func:`disco.func.combiner`
    :param combiner: called after the partitioning function, for each
                     partition.

    :type  partition: :func:`disco.func.partition`
    :param partition: decides how the map output is distributed to reduce.

                      Default is :func:`disco.func.default_partition`.

    :type  partitions: int or None
    :param partitions: number of partitions, if any.

                       Default is ``1``.

    :type  merge_partitions: bool
    :param merge_partitions: whether or not to merge partitioned inputs during
                             reduce.

                             Default is ``False``.

    :type  nr_reduces: *Deprecated in version 0.3* integer
    :param nr_reduces: Use *partitions* instead.

    :type  scheduler: dict
    :param scheduler: options for the job scheduler.
                      The following keys are supported:

                       * *max_cores* - use this many cores at most
                         (applies to both map and reduce).

                         Default is ``2**31``.

                       * *force_local* - always run task on the node where
                         input data is located;
                         never use HTTP to access data remotely.

                       * *force_remote* - never run task on the node where
                         input data is located;
                         always use HTTP to access data remotely.

                      (*Added in version 0.2.4*)

    :type  sort: boolean
    :param sort: flag specifying whether the intermediate results,
                 that is, input to the reduce function, should be sorted.
                 Sorting is most useful in ensuring that the equal keys are
                 consequent in the input for the reduce function.

                 Other than ensuring that equal keys are grouped together,
                 sorting ensures that keys are returned in the ascending
                 order. No other assumptions should be made on the comparison
                 function.

                 The external program ``sort`` is used to sort the input on
                 disk. In-memory sort can easily be performed by the tasks
                 themselves.

                 Default is ``False``.

    :type  params: :class:`Params`
    :param params: object that is passed to worker tasks to store state
                   The object is serialized using the *pickle* module,
                   so it should be pickleable.

                   A convenience class :class:`Params` is provided that
                   provides an easy way to encapsulate a set of parameters.
                   :class:`Params` allows including
                   :term:`pure functions <pure function>` in the parameters.

    :param ext_params: if either map or reduce function is an external
                       program, typically specified using
                       :func:`disco.util.external`,
                       this object is used to deliver parameters to the
                       program.

                       The default C interface for external Disco functions
                       uses :mod:`netstring` to encode the parameter
                       dictionary. Hence the *ext_params* value must be a
                       dictionary of string ``(key, value)`` pairs.

                       However, if the external program doesn't use the
                       default C interface, it can receive parameters in any
                       format. In this case, the *ext_params* value can be an
                       arbitrary string which can be decoded by the program
                       properly.

                       For more information, see :ref:`discoext`.

    :type  required_files: list of paths or dict
    :param required_files: additional files that are required by the job.
                           Either a list of paths to files to include,
                           or a dictionary which contains items of the form
                           ``(filename, filecontents)``.

                           You can use this parameter to include custom
                           modules or shared libraries in the job.
                           (*Added in version 0.2.3*)

                           .. note::

                                All files will be saved in a flat directory
                                on the worker.
                                No subdirectories will be created.

                           .. note::

                                ``LD_LIBRARY_PATH`` is set so you can include
                                a shared library ``foo.so`` in
                                *required_files* and load it in the job
                                directly as
                                ``ctypes.cdll.LoadLibrary("foo.so")``.
                                For an example, see :ref:`discoext`.

    :param required_modules: required modules to send to the worker
                             (*Changed in version 0.2.3*):
                             Disco tries to guess which modules are needed
                             by your job functions automatically.
                             It sends any local dependencies
                             (i.e. modules not included in the
                             Python standard library) to nodes by default.

                             If guessing fails, or you have other
                             requirements, see :mod:`disco.modutil` for
                             options.

    :type  status_interval: integer
    :param status_interval: print "K items mapped / reduced"
                            for every Nth item.
                            Setting the value to 0 disables messages.

                            Increase this value, or set it to zero,
                            if you get "Message rate limit exceeded"
                            error due to system messages.
                            This might happen if your tasks are really fast.
                            Decrease the value if you want more messages or
                            you don't have that many data items.

                            Default is ``100000``.

    :type  profile: boolean
    :param profile: enable tasks profiling.
                    Retrieve profiling results with
                    :meth:`Disco.profile_stats`.

                    Default is ``False``.
    """
    # Default value for every known job argument.  This dict is shared by
    # all instances and must never be modified in place.
    defaults = {'input': (),
                'map': None,
                'map_init': func.noop,
                'map_reader': None,
                'map_input_stream': (func.map_input_stream, ),
                'map_output_stream': (func.map_output_stream,
                                      func.disco_output_stream),
                'combiner': None,
                'partition': func.default_partition,
                'reduce': None,
                'reduce_init': func.noop,
                'reduce_reader': func.chain_reader,
                'reduce_input_stream': (func.reduce_input_stream, ),
                'reduce_output_stream': (func.reduce_output_stream,
                                         func.disco_output_stream),
                'ext_map': False,
                'ext_reduce': False,
                'ext_params': None,
                'merge_partitions': False,
                'params': Params(),
                'partitions': 1,
                'prefix': '',
                'profile': False,
                'required_files': {},
                'required_modules': None,
                'scheduler': {'max_cores': '%d' % 2**31},
                'save': False,
                'sort': False,
                'status_interval': 100000,
                'username': DiscoSettings()['DISCO_JOB_OWNER'],
                'version': '.'.join(str(s) for s in sys.version_info[:2]),
                # deprecated
                'nr_reduces': 0,
                'map_writer': None,
                'mem_sort_limit': 0,
                'reduce_writer': None}
    # missing keys are looked up in ``defaults``
    default_factory = defaults.__getitem__

    # arguments that hold a single function
    functions = set(['map',
                     'map_init',
                     'map_reader',
                     'map_writer',
                     'combiner',
                     'partition',
                     'reduce',
                     'reduce_init',
                     'reduce_reader',
                     'reduce_writer'])

    # scheduler options transported as ``sched_<key>`` entries
    scheduler_keys = set(['force_local', 'force_remote', 'max_cores'])

    # arguments that hold a chain ("stack") of stream functions
    stacks = set(['map_input_stream',
                  'map_output_stream',
                  'reduce_input_stream',
                  'reduce_output_stream'])

    def __init__(self, *args, **kwargs):
        """
        Build the job dictionary and normalise its values.

        :raises DiscoError: if partitions are to be merged but there are
                            none, if ``max_cores`` is below 1, or if an
                            unknown job argument is given.
        """
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning)

        # -- required modules and files --
        if self['required_modules'] is None:
            functions = util.flatten(util.iterify(self[f])
                                     for f in chain(self.functions,
                                                    self.stacks))
            self['required_modules'] = find_modules([f for f in functions
                                                     if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        self['input'] = [list(util.iterify(url))
                         for i in self['input']
                         for url in util.urllist(i,
                                                 listdirs=bool(self['map']),
                                                 ddfs=ddfs)]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0

        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = 1 + max(id
                                         for dir in self['input']
                                         for id, url in util.read_index(dir[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        # work on a copy so that the class-level defaults stay untouched
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)

    def __contains__(self, key):
        """A key is "in" the job dict if it is a known job argument."""
        return key in self.defaults

    def pack(self):
        """Pack up the :class:`JobDict` for sending over the wire."""
        jobpack = {}

        if self['required_files']:
            if not isinstance(self['required_files'], dict):
                self['required_files'] = util.pack_files(
                    self['required_files'])
        else:
            # a fresh dict, so the shared default is not updated below
            self['required_files'] = {}

        self['required_files'].update(util.pack_files(
            o[1] for o in self['required_modules'] if util.iskv(o)))

        for key in self.defaults:
            if key in ('map', 'reduce'):
                # unspecified map / reduce are left out of the pack
                if self[key] is None:
                    continue
            if key == 'input':
                # replicas are joined by newlines, inputs by spaces
                jobpack['input'] = ' '.join(
                    '\n'.join(reversed(list(util.iterify(url))))
                    for url in self['input'])
            elif key == 'username':
                jobpack['username'] = str(self['username'])
            elif key in ('nr_reduces', 'prefix'):
                jobpack[key] = str(self[key])
            elif key == 'scheduler':
                scheduler = self['scheduler']
                for sched_key in scheduler:
                    jobpack['sched_%s' % sched_key] = str(scheduler[sched_key])
            elif key in self.stacks:
                jobpack[key] = util.pack_stack(self[key])
            else:
                jobpack[key] = util.pack(self[key])
        return encode_netstring_fd(jobpack)

    @classmethod
    def unpack(cls, jobpack, globals={}):
        """Unpack the previously packed :class:`JobDict`."""

        jobdict = cls.defaults.copy()
        jobdict.update(**decode_netstring_fd(jobpack))

        for key in cls.defaults:
            if key == 'input':
                jobdict['input'] = [i.split()
                                    for i in jobdict['input'].split(' ')]
            elif key == 'username':
                pass
            elif key == 'nr_reduces':
                jobdict[key] = int(jobdict[key])
            elif key == 'scheduler':
                # ``jobdict`` is only a shallow copy of ``cls.defaults``:
                # copy the scheduler dict before filling it in, otherwise
                # the options of this job would leak into the defaults of
                # every later JobDict.
                scheduler = dict(jobdict['scheduler'])
                for sched_key in cls.scheduler_keys:
                    packed_key = 'sched_%s' % sched_key
                    if packed_key in jobdict:
                        scheduler[sched_key] = jobdict.pop(packed_key)
                jobdict['scheduler'] = scheduler
            elif key == 'prefix':
                pass
            elif jobdict[key] is None:
                pass
            elif key in cls.stacks:
                jobdict[key] = util.unpack_stack(jobdict[key], globals=globals)
            else:
                jobdict[key] = util.unpack(jobdict[key], globals=globals)
        return cls(**jobdict)

    @property
    def input_is_partitioned(self):
        """
        ``True`` if every input url starts with ``dir://``.

        ``None`` when there are no inputs at all.
        """
        if self['input']:
            return all(url.startswith('dir://')
                       for urls in self['input']
                       for url in urls)
def __init__(self, master=None, settings=None):
    """
    Remember the settings and the master address.

    :param master: master address; defaults to the ``DISCO_MASTER``
                   setting when falsy.
    :param settings: settings object; a new ``DiscoSettings`` is created
                     when falsy.
    """
    if not settings:
        settings = DiscoSettings()
    self.settings = settings

    if not master:
        master = self.settings['DISCO_MASTER']
    self.master = master
import sys

from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings


def map(line, params):
    """Emit ``(word, 1)`` for every whitespace-separated word of *line*."""
    for token in line.split():
        yield token, 1


def reduce(iter, params):
    """Sum the counts of each word; the pairs are sorted before grouping."""
    from disco.util import kvgroup
    grouped = kvgroup(sorted(iter))
    for word, counts in grouped:
        yield word, sum(counts)


# Connect to the master named by the DISCO_MASTER setting.
disco = Disco(DiscoSettings()['DISCO_MASTER'])

# Single-argument print() emits the same text as the print statement.
print("Starting Disco job..")
print("Go to %s to see status of the job." % disco.master)

# Submit the job and block until it has finished.
job = disco.new_job(name="wordcount",
                    input=["http://discoproject.org/media/text/chekhov.txt"],
                    map=map,
                    reduce=reduce,
                    save=True)
results = job.wait()

print("Job done. Results:")
for word, count in result_iterator(results):
    print("%s %s" % (word, count))
import base64
import os
import random
import socket
import struct
import time

from disco.compat import BytesIO, file, httplib, basestring, str_to_bytes
from disco.error import CommError
from disco.settings import DiscoSettings
from disco.util import iterify, urlresolve, urlsplit

# sizes in bytes: 1 MiB and 10 MiB
BUFFER_SIZE = int(1024**2)
CHUNK_SIZE = int(10 * 1024**2)

settings = DiscoSettings()

# pycurl is skipped when the 'nocurl' flag is set (case-insensitive) in
# DISCO_FLAGS, or when the module cannot be imported.
nocurl = 'nocurl' in settings['DISCO_FLAGS'].lower().split()
try:
    import pycurl
except ImportError:
    nocurl = True

# Pick the connection class: plain httplib, or the pycurl-based one.
if not nocurl:
    from disco import comm_pycurl
    from disco.comm_pycurl import HTTPConnection
else:
    HTTPConnection = httplib.HTTPConnection


def isredirection(status):
    """Return True if *status*, written as text, starts with ``3``."""
    return str(status)[:1] == '3'


def issuccessful(status):
    """Return True if *status*, written as text, starts with ``2``."""
    return str(status)[:1] == '2'
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY']):
    """
    Rewrite *url* to ``<proxy>/disco/node/<host>/<path>`` if a proxy is set.

    Without a proxy, *url* is returned unchanged.  The default *proxy* is
    read from ``DISCO_PROXY`` once, when the function is defined.
    """
    if not proxy:
        return url
    scheme, (host, port), path = urlsplit(url)
    return '%s/disco/node/%s/%s' % (proxy, host, path)
def _master(host_port):
    """
    Return the master address for a ``(host, port)`` pair.

    :param host_port: two-item sequence ``(host, port)``.
    :return: ``http://host:port`` when both are set, ``disco://host``
             when only the host is set, and otherwise the enclosing
             ``master`` value or the ``DISCO_MASTER`` setting.
    """
    # Explicit unpacking replaces the tuple parameter, which is Python
    # 2-only syntax; callers still pass a single (host, port) argument.
    host, port = host_port
    if not host:
        return master or DiscoSettings()['DISCO_MASTER']
    if not port:
        return 'disco://%s' % host
    return 'http://%s:%s' % (host, port)
def __init__(self, master=None, proxy=None, settings=None):
    """
    Remember settings, proxy and master, and publish the master.

    A proxy, whether given or taken from ``DISCO_PROXY``, takes
    precedence over *master*; ``DISCO_MASTER`` is the last resort.  The
    resulting master is written back into the settings object, including
    one passed in by the caller.
    """
    if not settings:
        settings = DiscoSettings()
    self.settings = settings

    if not proxy:
        proxy = self.settings['DISCO_PROXY']
    self.proxy = proxy

    if self.proxy:
        self.master = self.proxy
    elif master:
        self.master = master
    else:
        self.master = self.settings['DISCO_MASTER']

    self.settings['DISCO_MASTER'] = self.master
def __init__(self, name=None, master=None, worker=None, settings=None):
    """
    Set up the job's name, Disco client, worker and settings.

    :param name: job name; defaults to the class name when falsy.
    :param master: a ``Disco`` instance, used as-is, or anything else,
                   which is passed to ``Disco(...)``.
    :param worker: worker object; ``self.Worker()`` is created when falsy.
    :param settings: settings object; a new ``DiscoSettings`` is created
                     when falsy.
    """
    from disco.core import Disco

    if not name:
        name = type(self).__name__
    self.name = name

    if isinstance(master, Disco):
        self.disco = master
    else:
        self.disco = Disco(master)

    if not worker:
        worker = self.Worker()
    self.worker = worker

    if not settings:
        settings = DiscoSettings()
    self.settings = settings