Example 1
def classic_iterator(urls,
                     reader=func.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    An iterator over records as seen by the classic map interface.

    :type  reader: :func:`disco.worker.classic.func.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.classic.func.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.worker.classic.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker
    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        notifier(input)
        for record in Input(input, open=worker.opener('map', 'in', params)):
            yield record
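A minimal usage sketch: the job name and master address below are illustrative, and this assumes classic_iterator is importable from disco.core as in recent Disco versions.

from disco.core import Disco, classic_iterator
from disco.settings import DiscoSettings

disco = Disco(DiscoSettings()['DISCO_MASTER'])
results = disco.wait('wordcount')  # result urls of a finished classic job
for key, value in classic_iterator(results):
    print key, value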
Example 2
def proxy_url(path, node='x'):
    settings = DiscoSettings()
    port, proxy = settings['DISCO_PORT'], settings['DISCO_PROXY']
    if proxy:
        scheme, netloc, x = urlsplit(proxy)
        return '%s://%s/disco/node/%s/%s' % (scheme, netloc, node, path)
    return 'http://%s:%s/%s' % (node, port, path)
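A sketch of the two shapes this returns, assuming the proxy_url above is in scope and 8989 is the configured DISCO_PORT (hosts and paths are illustrative):

# Without DISCO_PROXY, the path is served straight from the node:
#   proxy_url('ddfs/vol0/blob/ab/fname', node='cnode03')
#   -> 'http://cnode03:8989/ddfs/vol0/blob/ab/fname'
# With DISCO_PROXY='http://proxy:8999', the node is reached through it:
#   -> 'http://proxy:8999/disco/node/cnode03/ddfs/vol0/blob/ab/fname'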
Example 3
    def __init__(self,
                 netlocstr='',
                 id=-1,
                 inputs=None,
                 jobdict=None,
                 jobname='',
                 settings=DiscoSettings()):
        self.netloc   = util.netloc.parse(netlocstr)
        self.id       = int(id)
        self.inputs   = inputs
        self.jobdict  = jobdict
        self.jobname  = jobname
        self.settings = settings
        self.blobs    = []
        self.mode     = self.__class__.__name__.lower()
        self.run_id   = "%s:%d-%x-%x" % (self.mode,
                                         self.id,
                                         int(time.time() * 1000),
                                         os.getpid())

        set_mem_limit(self.settings['DISCO_WORKER_MAX_MEM'])

        if not jobdict:
            self.jobdict = JobDict.unpack(open(self.jobpack),
                                          globals=worker.__dict__)
        self.insert_globals(self.functions)
Example 4
File: util.py Project: yuj/disco
def _master(host_port):
    host, port = host_port
    if not host:
        return master or DiscoSettings()['DISCO_MASTER']
    if not port:
        return 'disco://{0}'.format(host)
    return 'http://{0}:{1}'.format(host, port)
Example 5
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY'], meth='GET', to_master=True):
    scheme, (host, port), path = urlsplit(url)
    if proxy and scheme != "tag":
        if to_master:
            return '{0}/{1}'.format(proxy, path)
        return '{0}/proxy/{1}/{2}/{3}'.format(proxy, host, meth, path)
    return url
Example 6
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY'], meth='GET', to_master=True):
    scheme, (host, port), path = urlsplit(url)
    # if the url contains a dot, it is an external resource, so do not proxy it
    if proxy and scheme == "http" and url.find('.') == -1:
        if to_master:
            return '{0}/{1}'.format(proxy, path)
        return '{0}/proxy/{1}/{2}/{3}'.format(proxy, host, meth, path)
    return url
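A trace of the three branches above, assuming DISCO_PROXY='http://proxy:8999' (hosts and paths are illustrative):

#   proxy_url('http://cnode03:8989/disco/ctrl/status')
#   -> 'http://proxy:8999/disco/ctrl/status'            # to the master
#   proxy_url('http://cnode03:8989/ddfs/data', to_master=False)
#   -> 'http://proxy:8999/proxy/cnode03/GET/ddfs/data'  # via the node proxy
#   proxy_url('http://www.example.com/file.txt')
#   -> unchanged; the dot marks it as an external resource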
Example 7
def inputexpand(input, label=None, settings=DiscoSettings()):
    from disco.ddfs import DDFS, istag
    if ispartitioned(input) and label is not False:
        return zip(*(parse_dir(i, label=label) for i in iterify(input)))
    if isiterable(input):
        return [inputlist(input, label=label, settings=settings)]
    if istag(input):
        ddfs = DDFS(settings=settings)
        return chainify(blobs for name, tags, blobs in ddfs.findtags(input))
    return [input]
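The three input shapes handled above, sketched with illustrative values:

#   inputexpand('dir://cnode03/jobname/')    # partitioned results: per-label
#                                            # urls from the index, zipped
#   inputexpand(['tag://a', 'http://b/x'])   # any other iterable is expanded
#                                            # recursively via inputlist()
#   inputexpand('tag://data:words')          # a DDFS tag expands to the
#                                            # blobs found behind it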
Example 8
def submit(master, jobpack):
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to ", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
Example 9
def urlresolve(url, settings=DiscoSettings()):
    scheme, netloc, path = urlsplit(url)
    if scheme == 'tag':
        def master((host, port)):
            if not host:
                return settings['DISCO_MASTER']
            if not port:
                return 'disco://%s' % host
            return 'http://%s:%s' % (host, port)
        return urlresolve('%s/ddfs/tag/%s' % (master(netloc), path))
    return '%s://%s/%s' % (scheme, netloc, path)
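A trace of the tag branch, assuming DISCO_MASTER='http://localhost:8989' in the settings (the tag name is illustrative):

#   urlresolve('tag://data:chekhov')
#   -> urlresolve('http://localhost:8989/ddfs/tag/data:chekhov')
#   -> 'http://localhost:8989/ddfs/tag/data:chekhov'
# Urls with any other scheme are simply reassembled from their parsed parts.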
Example 10
def assert_url(url, master=None):
    err = ("invalid url scheme/netloc: '{0}' ({1})\n"
           "DiscoSettings()['DISCO_MASTER'] = {2}\n"
           "master = {3}")
    try:
        assert urlparse(url).scheme
        assert urlparse(url).netloc
    except Exception as e:
        raise DiscoError(err.format(url,
                                    e,
                                    DiscoSettings()['DISCO_MASTER'] or 'None',
                                    master))
    return url
Example 11
def sorted_iterator(urls,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):

    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()

    inputs = []
    for input in util.inputlist(urls, settings=settings):
        notifier(input)
        instream = Input(input, open=worker.opener('map', 'in', params))
        if instream:
            inputs.append(instream)

    return SortedIterator(inputs)
Example 12
def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    An iterator over records stored in either disco or ddfs.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        if isinstance(input, basestring):
            dest = proxy_url(input, to_master=False)
        elif isinstance(input, tuple):
            dest = tuple([proxy_url(i, to_master=False) for i in input])
        else:
            dest = [proxy_url(i, to_master=False) for i in input]
        notifier(dest)

        def open(url):
            streams = [s for s in input_stream]
            if reader:
                streams += [reader]
            return StreamCombiner(url, streams, params)

        for record in Input(dest, open=open):
            yield record
Example 13
File: core.py Project: mshron/disco
    def wait(self,
             name,
             poll_interval=2,
             timeout=None,
             clean=False,
             show=DiscoSettings()['DISCO_EVENTS']):
        """
        Block until the job *name* has finished. Returns a list of URLs to the
        result files, which is typically processed with :func:`result_iterator`.

        :meth:`Disco.wait` polls the server for the job status every
        *poll_interval* seconds. It raises a :class:`disco.JobError` if the
        job hasn't finished in *timeout* seconds, if specified.

        :param clean: if set to `True`, calls :meth:`Disco.clean`
                      when the job has finished.

                      Note that this only removes records from the master,
                      but not the actual result files.
                      Once you are done with the results, call::

                        disco.purge(disco.util.jobname(results[0]))

                      to delete the actual result files.

        :param show: enables console output of job events.
                     You can control this parameter also using the environment
                     variable ``DISCO_EVENTS``, which provides the default.
                     See ``DISCO_EVENTS`` in :mod:`disco.settings`.
                     (*Added in version 0.2.3*)
        """
        event_monitor = EventMonitor(Job(self, name=name),
                                     format=show,
                                     poll_interval=poll_interval)
        start_time = time.time()
        while True:
            event_monitor.refresh()
            try:
                return self.check_results(name, start_time, timeout,
                                          poll_interval * 1000)
            except Continue:
                continue
            finally:
                if clean:
                    self.clean(name)
                event_monitor.refresh()
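The usual pattern built on wait(), assuming a running master and an already-submitted job named 'wordcount' (cf. the wordcount example below):

from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

disco = Disco(DiscoSettings()['DISCO_MASTER'])
results = disco.wait('wordcount', clean=True)  # blocks until the job is done
for word, count in result_iterator(results):
    print word, count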
Example 14
def urlsplit(url, localhost=None, settings=DiscoSettings()):
    scheme, rest = schemesplit(url)
    locstr, path = rest.split('/', 1) if '/' in rest else (rest, '')
    if scheme == 'disco':
        prefix, fname = path.split('/', 1)
        if locstr == localhost:
            scheme = 'file'
            if prefix == 'ddfs':
                path = os.path.join(settings['DDFS_ROOT'], fname)
            else:
                path = os.path.join(settings['DISCO_DATA'], fname)
        else:
            scheme = 'http'
            locstr = '%s:%s' % (locstr, settings['DISCO_PORT'])
    if scheme == 'tag':
        if not path:
            path, locstr = locstr, ''
    return scheme, netloc.parse(locstr), path
Example 15
File: util.py Project: yuj/disco
def urlsplit(url, localhost=None, disco_port=None, **kwargs):
    scheme, rest = schemesplit(url)
    locstr, path = rest.split('/', 1) if '/' in rest else (rest, '')
    if scheme == 'tag':
        if not path:
            path, locstr = locstr, ''
    else:
        disco_port = disco_port or str(DiscoSettings()['DISCO_PORT'])
        host, port = netloc.parse(locstr)
        if scheme == 'disco' or port == disco_port:
            if localhost == True or locstr == localhost:
                scheme = 'file'
                locstr = ''
                path = localize(path, **kwargs)
            elif scheme == 'disco':
                scheme = 'http'
                locstr = '{0}:{1}'.format(host, disco_port)
    return scheme, netloc.parse(locstr), path
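A trace of the translations above, assuming DISCO_PORT=8989 and a worker on host cnode03; paths are illustrative, and the file branch assumes localize() joins the path under the matching data root passed via kwargs:

#   urlsplit('disco://cnode04/data/x', localhost='cnode03')
#   -> ('http', ('cnode04', '8989'), 'data/x')
#   urlsplit('disco://cnode03/ddfs/vol0/blob/ab/f', localhost='cnode03',
#            ddfs_data='/srv/disco/ddfs')
#   -> ('file', ('', ''), '/srv/disco/ddfs/vol0/blob/ab/f')
#   urlsplit('tag://data:chekhov')
#   -> ('tag', ('', ''), 'data:chekhov')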
Example 16
class DiscoTestCase(TestCase):
    disco_settings = DiscoSettings()

    @property
    def disco_master_url(self):
        return self.disco_settings['DISCO_MASTER']

    @property
    def disco(self):
        return Disco(self.disco_master_url)

    def assertCommErrorCode(self, code, callable):
        from disco.error import CommError
        try:
            ret = callable()
        except CommError, e:
            return self.assertEquals(code, e.code)
        except Exception, e:
            raise AssertionError('CommError not raised, got %s' % e)
        raise AssertionError('CommError not raised (expected %s), '
                             'returned %s' % (code, ret))
Example 17
def Open(url, task=None):
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    discodb = DiscoDB.load(
        open(util.localize(path, disco_data=disco_data, ddfs_data=ddfs_data)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg))
        return method(*filter(None, arg))
    return discodb
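The url convention this opener implements, sketched with illustrative names: an optional '!' suffix after the blob path selects a DiscoDB method and its argument.

#   Open('disco://localhost/ddfs/vol0/blob/ab/mydb')
#   -> the loaded DiscoDB itself
#   Open('disco://localhost/ddfs/vol0/blob/ab/mydb!query/word')
#   -> discodb.query(Q.urlscan('word'))
#   Open('disco://localhost/ddfs/vol0/blob/ab/mydb!metaquery/word')
#   -> discodb.metaquery(Q.urlscan('word'))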
Example 18
class TestCase(unittest.TestCase):
    settings = DiscoSettings()

    @property
    def ddfs(self):
        return DDFS(settings=self.settings)

    @property
    def disco(self):
        return Disco(settings=self.settings)

    @property
    def nodes(self):
        return dict((host, info['max_workers'])
                    for host, info in self.disco.nodeinfo().items()
                    if not info['blacklisted'])

    @property
    def num_workers(self):
        return sum(x['max_workers'] for x in self.disco.nodeinfo().values())

    @property
    def test_server_address(self):
        return (str(self.settings['DISCO_TEST_HOST']),
                int(self.settings['DISCO_TEST_PORT']))

    def assertAllEqual(self, results, answers):
        from disco.future import izip_longest as zip
        for result, answer in zip(results, answers):
            self.assertEquals(result, answer)

    def assertCommErrorCode(self, code, callable):
        from disco.error import CommError
        try:
            ret = callable()
        except CommError, e:
            return self.assertEquals(code, e.code)
        except Exception, e:
            raise AssertionError('CommError not raised, got %s' % e)
        raise AssertionError('CommError not raised (expected %s), '
                             'returned %s' % (code, ret))
Example 19
File: test.py Project: yuj/disco
class TestCase(unittest.TestCase):
    settings = DiscoSettings()

    @property
    def ddfs(self):
        return DDFS(settings=self.settings)

    @property
    def disco(self):
        return Disco(settings=self.settings)

    @property
    def nodes(self):
        return dict((host, info['max_workers'])
                    for host, info in self.disco.nodeinfo().items()
                    if not info['blacklisted'])

    @property
    def num_workers(self):
        return sum(x['max_workers'] for x in self.disco.nodeinfo().values())

    @property
    def test_server_address(self):
        return (str(self.settings['DISCO_TEST_HOST']),
                int(self.settings['DISCO_TEST_PORT']))

    def assertAllEqual(self, results, answers):
        from disco.compat import zip_longest as zip
        for result, answer in zip(results, answers):
            self.assertEquals(result, answer)

    def assertCommErrorCode(self, code, callable):
        from disco.error import CommError
        try:
            ret = callable()
        except CommError as e:
            return self.assertEquals(code, e.code)
        except Exception as e:
            raise AssertionError('CommError not raised, got {0}'.format(e))
        raise AssertionError('CommError not raised (expected {0}), '
                             'returned {1}'.format(code, ret))

    def assertResults(self, job, answers):
        self.assertAllEqual(self.results(job), answers)

    def results(self, job, **kwargs):
        return result_iterator(job.wait(), **kwargs)

    def run(self, result=None):
        self.is_running = True
        signal.signal(signal.SIGINT, InterruptTest(self))
        super(TestCase, self).run(result)
        self.is_running = False

    def setUp(self):
        if hasattr(self, 'serve'):
            self.test_server = TestServer.create(self.test_server_address,
                                                 self.serve)
            self.test_server.start()

    def tearDown(self):
        if hasattr(self, 'serve'):
            self.test_server.stop()
        if hasattr(self, 'job') and self.settings['DISCO_TEST_PURGE']:
            self.job.purge()

    def skipTest(self, message):
        # Workaround for python2.5, which doesn't have skipTest in unittest.
        # Make sure calls to skipTest are the last statement in a code branch
        # (until we drop 2.5 support).
        try:
            super(TestCase, self).skipTest(message)
        except AttributeError:
            pass
Example 20
File: core.py Project: mshron/disco
class JobDict(util.DefaultDict):
    """
    :meth:`Disco.new_job` and :meth:`Job.run`
    accept the same set of keyword arguments as specified below.

    .. note:: All arguments that are required are marked as such.
              All other arguments are optional.

    :type  input: **required**, list of inputs or list of list of inputs
    :param input: Each input must be specified in one of the following ways:

                   * ``http://www.example.com/data`` - any HTTP address
                   * ``disco://cnode03/bigtxt/file_name`` - Disco address. Refers to ``cnode03:/var/disco/bigtxt/file_name``. Currently this is an alias for ``http://cnode03:[DISCO_PORT]/bigtxt/file_name``.
                   * ``dir://cnode03/jobname/`` - Result directory. This format is used by Disco internally.
                   * ``/home/bob/bigfile.txt`` - a local file. Note that the file must either exist on all the nodes or you must make sure that the job is run only on the nodes where the file exists. Due to these restrictions, this form has only limited use.
                   * ``raw://some_string`` - pseudo-address; instead of fetching data from a remote source, use ``some_string`` in the address as data. Useful for specifying dummy inputs for generator maps.
                   * ``tag://tagname`` - a tag stored in :ref:`DDFS` (*Added in version 0.3*)

                  (*Added in version 0.3.2*)
                  Tags can be token protected. For the data in such
                  token-protected tags to be used as job inputs, the
                  tags should be resolved into the constituent urls or
                  replica sets (e.g. using util.urllist), and provided
                  as the value of the input parameter.

                  (*Added in version 0.2.2*):
                  An input entry can be a list of inputs:
                  This lets you specify redundant versions of an input file.
                  If a list of redundant inputs is specified,
                  the scheduler chooses the input that is located on the node
                  with the lowest load at the time of scheduling.
                  Redundant inputs are tried one by one until the task succeeds.
                  Redundant inputs require that the *map* function is specified.

    :type  map: :func:`disco.func.map`
    :param map: a :term:`pure function` that defines the map task.

    :type  map_init: :func:`disco.func.init`
    :param map_init: initialization function for the map task.
                     This function is called once before the task starts.

    :type  map_input_stream: list of :func:`disco.func.input_stream`
    :param map_input_stream: The given functions are chained together and the final resulting
                             :class:`disco.func.InputStream` object is used
                             to iterate over input entries.

                             (*Added in version 0.2.4*)

    :type  map_output_stream: list of :func:`disco.func.output_stream`
    :param map_output_stream: The given functions are chained together and the
                              :meth:`disco.func.OutputStream.add` method of the last
                              returned :class:`disco.func.OutputStream` object is used
                              to serialize key, value pairs output by the map.
                              (*Added in version 0.2.4*)

    :type  map_reader: ``None`` or :func:`disco.func.input_stream`
    :param map_reader: Convenience function to define the last :func:`disco.func.input_stream`
                       function in the *map_input_stream* chain.

                       Disco worker provides a convenience function
                       :func:`disco.func.re_reader` that can be used to create
                       a reader using regular expressions.

                       If you want to use outputs of an earlier job as inputs,
                       use :func:`disco.func.chain_reader` as the *map_reader*.

                       Default is ``None``.

                       (*Changed after version 0.3.1*)
                       The default map_reader became ``None``.
                       See the note in :func:`disco.func.map_line_reader`
                       for information on how this might affect older jobs.

    :param map_writer: (*Deprecated in version 0.3*) This function comes in
                       handy e.g. when *reduce* is not
                       specified and you want *map* output in a specific format.
                       Another typical case is to use
                       :func:`disco.func.object_writer` as *map_writer* and
                       :func:`disco.func.object_reader` as *reduce_reader*
                       so you can produce arbitrary Python objects in *map*.

                       Remember to specify a *reduce_reader*
                       that can read the format produced by *map_writer*.
                       (*Added in version 0.2*)

    :type  reduce: :func:`disco.func.reduce`
    :param reduce: If no reduce function is specified, the job will quit after
                   the map phase has finished.

                   *Added in version 0.3.1*:
                   Reduce now supports an alternative signature,
                   :func:`disco.func.reduce2`, which uses an iterator
                   instead of ``out.add()`` to output results.

                   *Changed in version 0.2*:
                   It is possible to define only *reduce* without *map*.
                   For more information, see the FAQ entry :ref:`reduceonly`.

    :type  reduce_init: :func:`disco.func.init`
    :param reduce_init: initialization function for the reduce task.
                        This function is called once before the task starts.

    :type  reduce_input_stream: list of :func:`disco.func.input_stream`
    :param reduce_input_stream: The given functions are chained together and
                                the last returned :class:`disco.func.InputStream`
                                object is given to *reduce* as its first
                                argument. (*Added in version 0.2.4*)

    :type  reduce_output_stream: list of :func:`disco.func.output_stream`
    :param reduce_output_stream: The given functions are chained together and
                                 the last returned :class:`disco.func.OutputStream`
                                 object is given to *reduce* as its second
                                 argument. (*Added in version 0.2.4*)

    :type  reduce_reader: :func:`disco.func.input_stream`
    :param reduce_reader: This function needs to match with *map_writer*,
                          if *map* is specified.
                          If *map* is not specified,
                          you can read arbitrary inputs with this function,
                          similar to *map_reader*.
                          (*Added in version 0.2*)

                          Default is :func:`disco.func.chain_reader`.

    :param reduce_writer: (*Deprecated in version 0.3*) You can use this function to output results
                          in an arbitrary format from your map/reduce job.
                          If you use :func:`result_iterator` to read results,
                          set its *reader* parameter to a function
                          that can read the format produced by *reduce_writer*.
                          (*Added in version 0.2*)

    :type  combiner: :func:`disco.func.combiner`
    :param combiner: called after the partitioning function, for each partition.

    :type  partition: :func:`disco.func.partition`
    :param partition: decides how the map output is distributed to reduce.

                      Default is :func:`disco.func.default_partition`.

    :type  partitions: int or None
    :param partitions: number of partitions, if any.

                       Default is ``1``.

    :type  merge_partitions: bool
    :param merge_partitions: whether or not to merge partitioned inputs during reduce.

                             Default is ``False``.

    :type  nr_reduces: *Deprecated in version 0.3* integer
    :param nr_reduces: Use *partitions* instead.

    :type  scheduler: dict
    :param scheduler: options for the job scheduler.
                      The following keys are supported:

                       * *max_cores* - use this many cores at most
                                       (applies to both map and reduce).

                                       Default is ``2**31``.

                       * *force_local* - always run task on the node where
                                         input data is located;
                                         never use HTTP to access data remotely.

                       * *force_remote* - never run task on the node where input
                                          data is located;
                                          always use HTTP to access data remotely.

                      (*Added in version 0.2.4*)

    :type  sort: boolean
    :param sort: flag specifying whether the intermediate results,
                 that is, input to the reduce function, should be sorted.
                 Sorting is most useful for ensuring that equal keys are
                 consecutive in the input for the reduce function.

                 Other than ensuring that equal keys are grouped together,
                 sorting ensures that keys are returned in ascending order.
                 No other assumptions should be made about the comparison
                 function.

                 The external program ``sort`` is used to sort the input on disk.
                 In-memory sort can easily be performed by the tasks themselves.

                 Default is ``False``.

    :type  params: :class:`Params`
    :param params: object that is passed to worker tasks to store state.
                   The object is serialized using the *pickle* module,
                   so it should be pickleable.

                   A convenience class :class:`Params` is provided as an
                   easy way to encapsulate a set of parameters.
                   :class:`Params` allows including
                   :term:`pure functions <pure function>` in the parameters.

    :param ext_params: if either map or reduce function is an external program,
                       typically specified using :func:`disco.util.external`,
                       this object is used to deliver parameters to the program.

                       The default C interface for external Disco functions uses
                       :mod:`netstring` to encode the parameter dictionary.
                       Hence the *ext_params* value must be a dictionary
                       of string ``(key, value)`` pairs.

                       However, if the external program doesn't use the default
                       C interface, it can receive parameters in any format.
                       In this case, the *ext_params* value can be an arbitrary
                       string which can be decoded by the program properly.

                       For more information, see :ref:`discoext`.

    :type  required_files: list of paths or dict
    :param required_files: additional files that are required by the job.
                           Either a list of paths to files to include,
                           or a dictionary which contains items of the form
                           ``(filename, filecontents)``.

                           You can use this parameter to include custom modules
                           or shared libraries in the job.
                           (*Added in version 0.2.3*)

                           .. note::

                                All files will be saved in a flat directory
                                on the worker.
                                No subdirectories will be created.

                           .. note::

                                ``LD_LIBRARY_PATH`` is set so you can include
                                a shared library ``foo.so`` in *required_files*
                                and load it in the job directly as
                                ``ctypes.cdll.LoadLibrary("foo.so")``.
                                For an example, see :ref:`discoext`.

    :param required_modules: required modules to send to the worker
                             (*Changed in version 0.2.3*):
                             Disco tries to guess which modules are needed
                             by your job functions automatically.
                             It sends any local dependencies
                             (i.e. modules not included in the
                             Python standard library) to nodes by default.

                             If guessing fails, or you have other requirements,
                             see :mod:`disco.modutil` for options.


    :type  status_interval: integer
    :param status_interval: print "K items mapped / reduced"
                            for every Nth item.
                            Setting the value to 0 disables messages.

                            Increase this value, or set it to zero,
                            if you get "Message rate limit exceeded"
                            error due to system messages.
                            This might happen if your tasks are really fast.
                            Decrease the value if you want more messages or
                            you don't have that many data items.

                            Default is ``100000``.

    :type  profile: boolean
    :param profile: enable tasks profiling.
                    Retrieve profiling results with :meth:`Disco.profile_stats`.

                    Default is ``False``.
    """
    defaults = {'input': (),
                'map': None,
                'map_init': func.noop,
                'map_reader': None,
                'map_input_stream': (func.map_input_stream, ),
                'map_output_stream': (func.map_output_stream,
                                      func.disco_output_stream),
                'combiner': None,
                'partition': func.default_partition,
                'reduce': None,
                'reduce_init': func.noop,
                'reduce_reader': func.chain_reader,
                'reduce_input_stream': (func.reduce_input_stream, ),
                'reduce_output_stream': (func.reduce_output_stream,
                                         func.disco_output_stream),
                'ext_map': False,
                'ext_reduce': False,
                'ext_params': None,
                'merge_partitions': False,
                'params': Params(),
                'partitions': 1,
                'prefix': '',
                'profile': False,
                'required_files': {},
                'required_modules': None,
                'scheduler': {'max_cores': '%d' % 2**31},
                'save': False,
                'sort': False,
                'status_interval': 100000,
                'username': DiscoSettings()['DISCO_JOB_OWNER'],
                'version': '.'.join(str(s) for s in sys.version_info[:2]),
                # deprecated
                'nr_reduces': 0,
                'map_writer': None,
                'mem_sort_limit': 0,
                'reduce_writer': None}
    default_factory = defaults.__getitem__

    functions = set([
        'map', 'map_init', 'map_reader', 'map_writer', 'combiner', 'partition',
        'reduce', 'reduce_init', 'reduce_reader', 'reduce_writer'
    ])

    scheduler_keys = set(['force_local', 'force_remote', 'max_cores'])

    stacks = set([
        'map_input_stream', 'map_output_stream', 'reduce_input_stream',
        'reduce_output_stream'
    ])

    def __init__(self, *args, **kwargs):
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning)

        # -- required modules and files --
        if self['required_modules'] is None:
            functions = util.flatten(
                util.iterify(self[f])
                for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules(
                [f for f in functions if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        self['input'] = [
            list(util.iterify(url)) for i in self['input']
            for url in util.urllist(i, listdirs=bool(self['map']), ddfs=ddfs)
        ]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = 1 + max(
                id for dir in self['input']
                for id, url in util.read_index(dir[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)

    def __contains__(self, key):
        return key in self.defaults

    def pack(self):
        """Pack up the :class:`JobDict` for sending over the wire."""
        jobpack = {}

        if self['required_files']:
            if not isinstance(self['required_files'], dict):
                self['required_files'] = util.pack_files(
                    self['required_files'])
        else:
            self['required_files'] = {}

        self['required_files'].update(
            util.pack_files(o[1] for o in self['required_modules']
                            if util.iskv(o)))

        for key in self.defaults:
            if key in ('map', 'reduce'):
                if self[key] is None:
                    continue
            if key == 'input':
                jobpack['input'] = ' '.join(
                    '\n'.join(reversed(list(util.iterify(url))))
                    for url in self['input'])
            elif key == 'username':
                jobpack['username'] = str(self['username'])
            elif key in ('nr_reduces', 'prefix'):
                jobpack[key] = str(self[key])
            elif key == 'scheduler':
                scheduler = self['scheduler']
                for key in scheduler:
                    jobpack['sched_%s' % key] = str(scheduler[key])
            elif key in self.stacks:
                jobpack[key] = util.pack_stack(self[key])
            else:
                jobpack[key] = util.pack(self[key])
        return encode_netstring_fd(jobpack)

    @classmethod
    def unpack(cls, jobpack, globals={}):
        """Unpack the previously packed :class:`JobDict`."""

        jobdict = cls.defaults.copy()
        jobdict.update(**decode_netstring_fd(jobpack))

        for key in cls.defaults:
            if key == 'input':
                jobdict['input'] = [
                    i.split() for i in jobdict['input'].split(' ')
                ]
            elif key == 'username':
                pass
            elif key == 'nr_reduces':
                jobdict[key] = int(jobdict[key])
            elif key == 'scheduler':
                for key in cls.scheduler_keys:
                    if 'sched_%s' % key in jobdict:
                        jobdict['scheduler'][key] = jobdict.pop('sched_%s' %
                                                                key)
            elif key == 'prefix':
                pass
            elif jobdict[key] is None:
                pass
            elif key in cls.stacks:
                jobdict[key] = util.unpack_stack(jobdict[key], globals=globals)
            else:
                jobdict[key] = util.unpack(jobdict[key], globals=globals)
        return cls(**jobdict)

    @property
    def input_is_partitioned(self):
        if self['input']:
            return all(
                url.startswith('dir://') for urls in self['input']
                for url in urls)
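The keyword arguments documented above are the ones accepted by Disco.new_job and Job.run; a minimal sketch, assuming a running master (cf. the wordcount example below):

from disco.core import Disco
from disco.settings import DiscoSettings

def fun_map(line, params):
    for word in line.split():
        yield word, 1

disco = Disco(DiscoSettings()['DISCO_MASTER'])
results = disco.new_job(name='count',
                        input=['raw://hello world hello'],
                        map=fun_map,
                        partitions=1).wait()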
Example 21
    def __init__(self, master=None, settings=None):
        self.settings = settings or DiscoSettings()
        self.master = master or self.settings['DISCO_MASTER']
Example 22
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings


def map(line, params):
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(
    name="wordcount",
    input=["http://discoproject.org/media/text/chekhov.txt"],
    map=map,
    reduce=reduce,
    save=True).wait()
print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
Example 23
import os, random, struct, time, socket, base64
from disco.compat import BytesIO, file, httplib, basestring, str_to_bytes
from disco.error import CommError
from disco.settings import DiscoSettings
from disco.util import iterify, urlresolve, urlsplit

BUFFER_SIZE = int(1024**2)
CHUNK_SIZE = int(10 * 1024**2)

settings = DiscoSettings()
nocurl = 'nocurl' in settings['DISCO_FLAGS'].lower().split()

try:
    import pycurl
except ImportError:
    nocurl = True

if nocurl:
    HTTPConnection = httplib.HTTPConnection
else:
    from disco import comm_pycurl
    from disco.comm_pycurl import HTTPConnection


def isredirection(status):
    return str(status).startswith('3')


def issuccessful(status):
    return str(status).startswith('2')
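Both helpers only inspect the leading digit of the status code, so int and string codes work alike; a quick sketch:

assert issuccessful(200) and issuccessful('204')
assert isredirection(302)
assert not issuccessful(404)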
Example 24
def proxy_url(url, proxy=DiscoSettings()['DISCO_PROXY']):
    if proxy:
        scheme, (host, port), path = urlsplit(url)
        return '%s/disco/node/%s/%s' % (proxy, host, path)
    return url
Example 25
def _master((host, port)):
    if not host:
        return master or DiscoSettings()['DISCO_MASTER']
    if not port:
        return 'disco://%s' % host
    return 'http://%s:%s' % (host, port)
Example 26
    def __init__(self, master=None, proxy=None, settings=None):
        self.settings = settings or DiscoSettings()
        self.proxy = proxy or self.settings['DISCO_PROXY']
        self.master = self.proxy or master or self.settings['DISCO_MASTER']
        self.settings['DISCO_MASTER'] = self.master
Example 27
    def __init__(self, name=None, master=None, worker=None, settings=None):
        from disco.core import Disco
        self.name = name or type(self).__name__
        self.disco = master if isinstance(master, Disco) else Disco(master)
        self.worker = worker or self.Worker()
        self.settings = settings or DiscoSettings()