Example #1
    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ''
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            raise DataError(
                "Truncated input: "
                "Expected {0} bytes, got {1}".format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError(
                "Truncated input: "
                "Expected a value of {0} bytes (offset {1} bytes)".format(
                    llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg
Example #2
 def _push(self,
           source_target,
           replicas=None,
           forceon=[],
           exclude=[],
           **kwargs):
     source, target = source_target
     qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                         ('include', ','.join(forceon)),
                                         ('replicas', replicas)) if v])
     urls = self._download('{0}/ddfs/new_blob/{1}?{2}'.format(
         self.master, target, qs))
     try:
         return [
             json.loads(bytes_to_str(url)) for url in self._upload(
                 urls, source, to_master=False, **kwargs)
         ]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         if hasattr(source, "seek"):
             source.seek(
                 0)  # source will be read again; seek to the beginning
         else:
             print("{0} is not seekable, retrying".format(source))
         return self._push((source, target),
                           replicas=replicas,
                           forceon=forceon,
                           exclude=exclude + [host],
                           **kwargs)
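The query string above is built with urlencode, dropping any empty parameter. A minimal standalone sketch of that step (the node name, empty forceon list and replica count are made-up values; urlencode is taken from urllib.parse here):

    from urllib.parse import urlencode

    exclude, forceon, replicas = ['node01'], [], 3
    qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                        ('include', ','.join(forceon)),
                                        ('replicas', replicas)) if v])
    print(qs)  # exclude=node01&replicas=3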
Example #3
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader
    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    out_fd.close()
    if worker:
        worker.send(
            'MSG',
            "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', ("Finished sorting"))
    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        yield k, bytes_to_str(decode(str_to_bytes(pickle_loads(v))))
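As the comment in disk_sort notes, pickle protocol 0 output is printable ASCII, so the 0xFF and 0x00 bytes are safe record separators. A stdlib-only sketch of that framing with a made-up key/value pair (disco's encode() wrapper is omitted here):

    import pickle

    key, value = b'alice', {'clicks': 3}
    # 0xFF terminates the key, 0x00 terminates the record, so a plain
    # byte-wise sort orders the records by key.
    record = key + b'\xff' + pickle.dumps(value, 0) + b'\x00'
    print(record)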
Example #4
 def t_read_until(self, delim, spent=0, bytes=''):
     while not bytes.endswith(delim):
         spent += self.select(spent)
         read_bytes = os.read(self.fd, 1)
         raise_if_empty(read_bytes)
         bytes += bytes_to_str(read_bytes)
     return spent, bytes
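The t_read_until helper above reads one byte at a time until a delimiter shows up. The same loop as a self-contained sketch against an os.pipe (no Disco connection object and no select() timeout handling):

    import os

    def read_until(fd, delim):
        buf = ''
        while not buf.endswith(delim):
            chunk = os.read(fd, 1)
            if not chunk:
                raise EOFError('fd closed before the delimiter was seen')
            buf += chunk.decode('ascii')
        return buf

    r, w = os.pipe()
    os.write(w, b'hello\n')
    print(read_until(r, '\n'))  # 'hello\n'
    os.close(r)
    os.close(w)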
Example #5
File: util.py Project: yuj/disco
def read_index(dir):
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        yield bytes_to_str(line).split()
Example #6
File: util.py Project: dangra/disco
def read_index(dir):
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        yield bytes_to_str(line).split()
Example #7
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader
    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    out_fd.close()
    if worker:
        worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', ("Finished sorting"))
    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        yield k, bytes_to_str(decode(str_to_bytes(pickle_loads(v))))
Example #8
def cat(program, *urls):
    """Usage: [url ...]

    Concatenate the contents of all url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from subprocess import call
    from disco.comm import download
    from disco.util import deref, urlresolve, proxy_url
    from disco.compat import bytes_to_str

    ignore_missing = program.options.ignore_missing
    tags, urls     = program.separate_tags(*urls)

    def curl(replicas):
        for replica in replicas:
            try:
                return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                          to_master=False))
            except Exception as e:
                sys.stderr.write("{0}\n".format(e))
        if not ignore_missing:
            raise Exception("Failed downloading all replicas: {0}".format(replicas))
        return ''

    for replicas in deref(chain(urls, program.blobs(*tags))):
        sys.stdout.write(bytes_to_str(curl(replicas)))
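curl() above is a first-replica-that-works loop. A hypothetical, Disco-free version of the same pattern, with fetch standing in for download():

    import sys

    def first_success(replicas, fetch, ignore_missing=False):
        # Try each replica in turn; the first successful fetch wins.
        for replica in replicas:
            try:
                return fetch(replica)
            except Exception as e:
                sys.stderr.write('{0}\n'.format(e))
        if not ignore_missing:
            raise Exception('Failed downloading all replicas: {0}'.format(replicas))
        return ''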
Example #9
def cat(program, *urls):
    """Usage: [url ...]

    Concatenate the contents of all url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from subprocess import call
    from disco.comm import download
    from disco.util import deref, urlresolve, proxy_url
    from disco.compat import bytes_to_str

    ignore_missing = program.options.ignore_missing
    tags, urls = program.separate_tags(*urls)

    def curl(replicas):
        for replica in replicas:
            try:
                return download(
                    proxy_url(urlresolve(replica, master=program.ddfs.master),
                              to_master=False))
            except Exception as e:
                sys.stderr.write("{0}\n".format(e))
        if not ignore_missing:
            raise Exception(
                "Failed downloading all replicas: {0}".format(replicas))
        return ''

    for replicas in deref(chain(urls, program.blobs(*tags))):
        sys.stdout.write(bytes_to_str(curl(replicas)))
Example #10
File: func.py Project: dangra/disco
    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ''
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            raise DataError("Truncated input: "
                            "Expected {0} bytes, got {1}"
                            .format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError("Truncated input: "
                            "Expected a value of {0} bytes (offset {1} bytes)"
                            .format(llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg
Example #11
 def _download(self, url, data=None, token=None, method="GET", to_master=True):
     byts = download(
         self._resolve(proxy_url(url, proxy=self.proxy, meth=method, to_master=to_master)),
         data=data,
         method=method,
         token=self._token(url, token, method),
     )
     return json.loads(bytes_to_str(byts))
Example #12
 def t_read(self, nbytes, spent=0, bytes=''):
     while True:
         spent += self.select(spent)
         read_bytes = os.read(self.fd, nbytes - len(bytes))
         raise_if_empty(read_bytes)
         bytes += bytes_to_str(read_bytes)
         if nbytes <= len(bytes):
             return spent, bytes
Example #13
 def test_large(self):
     self.job = LargeOOBJob().run(
         input=['raw://{0}'.format(i) for i in range(self.num_workers)])
     self.assertResults(self.job, [])
     self.assertEquals(
         sorted((key, bytes_to_str(self.job.oob_get(key)))
                for key in self.job.oob_list()),
         sorted(('{0}-{1}'.format(i, j), 'val:{0}-{1}'.format(i, j))
                for i in range(self.num_workers) for j in range(10)))
Example #14
 def test_large(self):
     self.job = LargeOOBJob().run(input=['raw://{0}'.format(i)
                                         for i in range(self.num_workers)])
     self.assertResults(self.job, [])
     self.assertEquals(sorted((key, bytes_to_str(self.job.oob_get(key)))
                              for key in self.job.oob_list()),
                       sorted(('{0}-{1}'.format(i, j), 'val:{0}-{1}'.format(i, j))
                              for i in range(self.num_workers)
                              for j in range(10)))
Example #15
def read_index(dir):
    # We might be given replicas of dirs; choose the first.
    if isiterable(dir): dir = dir[0]
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        label, url, size = bytes_to_str(line).split()
        yield int(label), url, int(size)
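Each index line is expected to carry a label, a URL and a size; a quick check against a fabricated line:

    line = b'0 http://node01:8989/ddfs/blob/sample-blob 4096\n'
    label, url, size = line.decode('utf-8').split()
    print(int(label), url, int(size))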
Example #17
def input_stream(fd, sze, url, params):
    """Opens a StringIO whose data is everything after the url scheme.

    For example, `raw://hello_world` would return `hello_world` when read by the task.
    """
    from disco.compat import StringIO, bytes_to_str
    from disco.util import schemesplit
    scheme, string = schemesplit(url)
    ascii = bytes_to_str(string)
    return (StringIO(ascii), len(ascii), url)
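A stdlib-only illustration of the raw:// handling the docstring describes (str.partition stands in for disco.util.schemesplit):

    from io import StringIO

    url = 'raw://hello_world'
    scheme, _, payload = url.partition('://')
    fd = StringIO(payload)
    print(scheme, fd.read())  # raw hello_world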
Example #18
 def _download(self,
               url,
               data=None,
               token=None,
               method='GET',
               to_master=True):
     byts = download(self._resolve(
         proxy_url(url, proxy=self.proxy, meth=method,
                   to_master=to_master)),
                     data=data,
                     method=method,
                     token=self._token(url, token, method))
     return json.loads(bytes_to_str(byts))
Example #19
def parse_message(msg):
    msg = bytes_to_str(msg)
    try:
        type, payload = msg.split('>', 1)
        payload = payload.strip()
        if type == '**<MSG':
            Worker.send('MSG', payload)
        elif type == '**<ERR':
            Worker.send('FATAL', payload)
        else:
            raise Exception
    except:
        # let master handle erroneous output
        sys.stderr.write(msg)
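The messages parse_message handles start with a '**<MSG' or '**<ERR' prefix terminated by '>'. A standalone look at that framing with made-up payloads and no Worker involved:

    for raw in (b'**<MSG> everything is fine', b'**<ERR> something broke'):
        kind, payload = raw.decode('utf-8').split('>', 1)
        print(kind, '->', payload.strip())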
Example #21
File: ddfs.py Project: wquan/disco
 def _push(self, source_target, replicas=None, exclude=[], **kwargs):
     source, target = source_target
     qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                         ('replicas', replicas)) if v])
     urls = self._download('{0}/ddfs/new_blob/{1}?{2}'
                           .format(self.master, target, qs))
     try:
         return [json.loads(bytes_to_str(url))
                 for url in self._upload(urls, source, to_master=False, **kwargs)]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         return self._push((source, target),
                           replicas=replicas,
                           exclude=exclude + [host],
                           **kwargs)
Example #22
File: ddfs.py Project: yuj/disco
 def _push(self, source_target, replicas=None, exclude=[], **kwargs):
     source, target = source_target
     qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                         ('replicas', replicas)) if v])
     urls = self._download('{0}/ddfs/new_blob/{1}?{2}'.format(
         self.master, target, qs))
     try:
         return [
             json.loads(bytes_to_str(url)) for url in self._upload(
                 urls, source, to_master=False, **kwargs)
         ]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         return self._push((source, target),
                           replicas=replicas,
                           exclude=exclude + [host],
                           **kwargs)
Example #23
 def _push(self, source_target, replicas=None, forceon=[], exclude=[], **kwargs):
     source, target = source_target
     qs = urlencode(
         [
             (k, v)
             for k, v in (("exclude", ",".join(exclude)), ("include", ",".join(forceon)), ("replicas", replicas))
             if v
         ]
     )
     urls = self._download("{0}/ddfs/new_blob/{1}?{2}".format(self.master, target, qs))
     try:
         return [json.loads(bytes_to_str(url)) for url in self._upload(urls, source, to_master=False, **kwargs)]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         if hasattr(source, "seek"):
             source.seek(0)  # source will be read again; seek to the beginning
         else:
             print("{0} is not seekable, retrying".format(source))
         return self._push((source, target), replicas=replicas, forceon=forceon, exclude=exclude + [host], **kwargs)
Example #24
File: func.py Project: dangra/disco
def old_netstr_reader(fd, size, fname, head=b''):
    """
    Reader for Disco's default/internal key-value format.

    Reads output of a map / reduce job as the input for a new job.
    Specify this function as your :func:`map_reader`
    to use the output of a previous job as input to another job.
    """
    if size is None:
        raise ValueError("Content-length must be defined")

    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ''
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            raise DataError("Truncated input: "
                            "Expected {0} bytes, got {1}"
                            .format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError("Truncated input: "
                            "Expected a value of {0} bytes (offset {1} bytes)"
                            .format(llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg

    data = bytes_to_str(head + fd.read(8192))
    tot = idx = 0
    while tot < size:
        key = val = ''
        idx, data, tot, key = read_netstr(idx, data, tot)
        idx, data, tot, val = read_netstr(idx, data, tot)
        yield key, val
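old_netstr_reader consumes records laid out as a decimal length, a space, the payload and one trailing separator byte. A minimal writer sketch that produces input the reader above should be able to parse (the '\n' separator and the sample pairs are assumptions for illustration):

    from io import BytesIO

    def write_netstr(out, s):
        # '<length> <payload>\n' -- the reader skips exactly one byte after the payload
        out.write('{0} {1}\n'.format(len(s), s).encode('ascii'))

    buf = BytesIO()
    for k, v in [('key1', 'value1'), ('key2', 'value2')]:
        write_netstr(buf, k)
        write_netstr(buf, v)
    size = buf.tell()
    buf.seek(0)
    # list(old_netstr_reader(buf, size, '<memory>')) should yield
    # [('key1', 'value1'), ('key2', 'value2')]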
Example #25
def schemesplit(url):
    return bytes_to_str(url).split('://', 1) if '://' in bytes_to_str(url) else ('', url)
Example #26
 def jobenvs(self):
     dict_offset, envs_offset, home_offset, data_offset = self.offsets(self.jobfile)
     self.jobfile.seek(envs_offset)
     return json.loads(bytes_to_str(self.jobfile.read(home_offset - envs_offset)))
Example #27
 def map(k_v, params):
     yield bytes_to_str(k_v[0]), k_v[1]
Example #28
 def t_read(self, nbytes, spent=0, bytes=''):
     while True:
         spent += self.select(spent)
         bytes += bytes_to_str(os.read(self.fd, nbytes - len(bytes)))
         if nbytes <= len(bytes):
             return spent, bytes
Example #29
 def map(interface, state, label, inp):
     out = interface.output(0)
     for e in inp:
         out.add(int(e), (bytes_to_str(e)).strip())
Example #30
 def map(e, params):
     yield bytes_to_str(e), ''
Example #31
 def jobenvs(self):
     dict_offset, envs_offset, home_offset, data_offset = self.offsets(
         self.jobfile)
     self.jobfile.seek(envs_offset)
     return json.loads(
         bytes_to_str(self.jobfile.read(home_offset - envs_offset)))
Example #32
def old_netstr_reader(fd, size, fname, head=b''):
    """
    Reader for Disco's default/internal key-value format.

    Reads output of a map / reduce job as the input for a new job.
    Specify this function as your :func:`map_reader`
    to use the output of a previous job as input to another job.
    """
    if size is None:
        raise ValueError("Content-length must be defined")

    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ''
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            raise DataError(
                "Truncated input: "
                "Expected {0} bytes, got {1}".format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError(
                "Truncated input: "
                "Expected a value of {0} bytes (offset {1} bytes)".format(
                    llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg

    data = bytes_to_str(head + fd.read(8192))
    tot = idx = 0
    while tot < size:
        key = val = ''
        idx, data, tot, key = read_netstr(idx, data, tot)
        idx, data, tot, val = read_netstr(idx, data, tot)
        yield key, val
Example #33
 def map(k_v, params):
     yield bytes_to_str(k_v[0]), k_v[1]
Example #34
 def map(e, params):
     return [(w, 1) for w in re.sub(r'\W', ' ', bytes_to_str(e)).lower().split()]
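This is the classic word-count tokenizer; run standalone on a made-up line it behaves like this:

    import re

    line = b'Hello, hello world!'
    print([(w, 1) for w in re.sub(r'\W', ' ', line.decode('utf-8')).lower().split()])
    # [('hello', 1), ('hello', 1), ('world', 1)]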
Example #35
 def map(string, params):
     return shuffled((base64.encodestring(str_to_bytes(c)), b'') for c in bytes_to_str(string * 10))
Example #36
 def reduce(iter, params):
     for k, vs in kvgroup(iter):
         yield bytes_to_str(base64.decodestring(k)), len(list(vs))
Example #37
def map_input_stream2(stream, size, url, params):
    return StringIO('b' + bytes_to_str(stream.read()))
Example #38
 def map(e, params):
     return [(w, 1)
             for w in re.sub(r'\W', ' ', bytes_to_str(e)).lower().split()]
Example #39
def schemesplit(url):
    return bytes_to_str(url).split('://', 1) if '://' in bytes_to_str(url) else ('', url)
Example #40
 def reduce(interface, state, label, inp):
     for rec in sorted(inp):
         state.append((int(rec), (bytes_to_str(rec).strip())))
Example #41
def Map(interface, state, label, inp):
    out = interface.output(0)
    for i in inp:
        for k, v in shuffled((base64.encodestring(str_to_bytes(c)), b'') for c in bytes_to_str(str_to_bytes(i) * 10)):
            out.add(k, v)
Example #42
def Map(interface, state, label, inp):
    out = interface.output(0)
    for i in inp:
        for k, v in shuffled((base64.encodestring(str_to_bytes(c)), b'')
                             for c in bytes_to_str(str_to_bytes(i) * 10)):
            out.add(k, v)
Example #43
 def reduce(iter, params):
     for k, vs in kvgroup(iter):
         yield bytes_to_str(base64.decodestring(k)), len(list(vs))
Example #44
 def map(e, params):
     x = bytes_to_str(load_oob(Task.master, params['job'], e))
     assert x == 'value:{0}'.format(e)
     yield 'good', ''
Example #45
 def map(e, params):
     k = bytes_to_str(e)
     v = str_to_bytes('value:{0}'.format(k))
     put(k, v)
     yield k, v
Example #46
 def map(e, params):
     k = bytes_to_str(e)
     v = str_to_bytes('value:{0}'.format(k))
     put(k, v)
     yield k, v
Example #47
 def map(e, params):
     x = bytes_to_str(load_oob(Task.master, params['job'], e))
     assert x == 'value:{0}'.format(e)
     yield 'good', ''
Example #48
 def t_read_until(self, delim, spent=0, bytes=''):
     while not bytes.endswith(delim):
         spent += self.select(spent)
         bytes += bytes_to_str(os.read(self.fd, 1))
     return spent, bytes
Example #49
 def map(string, params):
     return shuffled((base64.encodestring(str_to_bytes(c)), b'')
                     for c in bytes_to_str(string * 10))
Example #50
 def map(e, params):
     yield bytes_to_str(e), ''
Example #51
 def map(e, params):
     x, y = [float(x) for x in bytes_to_str(e).split('|')]
     yield mod1.plusceil(x, y) + math.ceil(1.5), ''
Example #52
 def safe_name(cls, name):
     return unsafe_re.sub('_', bytes_to_str(name))
Example #53
 def t_read(self, nbytes, spent=0, bytes=''):
     while True:
         spent += self.select(spent)
         bytes += bytes_to_str(os.read(self.fd, nbytes - len(bytes)))
         if nbytes <= len(bytes):
             return spent, bytes
Example #54
 def t_read_until(self, delim, spent=0, bytes=''):
     while not bytes.endswith(delim):
         spent += self.select(spent)
         bytes += bytes_to_str(os.read(self.fd, 1))
     return spent, bytes
Example #55
def map_input_stream1(stream, size, url, params):
    return StringIO("a" + bytes_to_str(stream.read()))
Example #56
 def safe_name(cls, name):
     return unsafe_re.sub("_", bytes_to_str(name))
Example #57
 def map(e, params):
     yield int(e), (bytes_to_str(e)).strip()