Example #1
0
def unix_sort(filename, sort_buffer_size='10%'):
    """Sort *filename* in place via the external unix ``sort`` utility.

    The child process runs with ``LC_ALL=C`` so collation is plain byte
    order. Raises DataError when *filename* is not a regular file or when
    the sort subprocess exits non-zero.
    """
    import subprocess, os.path
    if not os.path.isfile(filename):
        raise DataError("Invalid sort input file {0}".format(filename), filename)
    try:
        sort_env = dict(os.environ)
        sort_env['LC_ALL'] = 'C'  # locale-independent byte ordering
        command, use_shell = sort_cmd(filename, sort_buffer_size)
        subprocess.check_call(command, env=sort_env, shell=use_shell)
    except subprocess.CalledProcessError as err:
        raise DataError("Sorting {0} failed: {1}".format(filename, err), filename)
 def swap(self, error=None):
     """Switch to the next replica url, resuming after the last seen record.

     On DataError from opening the replica, recurses with the formatted
     traceback; once the url iterator is exhausted a DataError is raised,
     embedding the last traceback when one is available.
     """
     try:
         from itertools import dropwhile

         def _resume(records, first):
             # drop (index, record) pairs consumed before the swap
             return dropwhile(lambda pair: pair[0] < first, enumerate(records))

         self.iter = _resume(self.open(next(self.urls)), self.last + 1)
     except DataError:
         self.swap(traceback.format_exc())
     except StopIteration:
         if error:
             raise DataError("Exhausted all available replicas, "
                             "last error was:\n\n{0}".format(error), self.input)
         raise DataError("Exhausted all available replicas", self.input)
Example #3
0
    def read_netstr(idx, data, tot):
        """Parse one length-prefixed record from *data* starting at *idx*.

        Closure over ``fd`` (the input file), ``fname`` and ``size`` from
        the enclosing reader. *data* is the current string buffer, *idx*
        the parse position within it, and *tot* the running count of bytes
        consumed from the stream. Returns the updated
        ``(idx, data, tot, msg)`` where *msg* is the payload of the record
        just parsed. Raises DataError on corrupt or truncated input.
        """
        ldata = len(data)
        i = 0
        lenstr = ''
        # refill so the buffer holds at least a full length prefix
        # (up to 10 digits plus the separating space)
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        # the length field is terminated by a space within the next 11 bytes
        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)
        else:
            # slice includes the trailing space; int() below tolerates it,
            # and tot is advanced by len(lenstr) to account for it
            lenstr = data[idx:i + 1]
            idx = i + 1

        # NOTE(review): compares ldata against i + 1 rather than a payload
        # bound -- looks like a truncated-length-field guard; verify intent
        if ldata < i + 1:
            raise DataError(
                "Truncated input: "
                "Expected {0} bytes, got {1}".format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError(
                "Corrupted input: "
                "Could not parse a value length at {0} bytes.".format(tot),
                fname)

        tot += len(lenstr)

        # refill if the payload plus its one-byte terminator is not
        # fully buffered yet
        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError(
                "Truncated input: "
                "Expected a value of {0} bytes (offset {1} bytes)".format(
                    llen + 1, tot), fname)

        # account for the payload and its trailing terminator byte
        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg
Example #4
0
def delimited_reader(fd,
                     size,
                     fname,
                     delimiter,
                     line_terminator='\n',
                     output_tail=False,
                     read_buffer_size=8192):
    tail = []
    tot = 0
    while True:
        if size:
            r = fd.read(min(read_buffer_size, size - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        split_lines = r.split(line_terminator)
        if len(split_lines) > 1:
            tail.append(split_lines[0])
            split_lines[0] = ''.join(tail)
            tail = []
        if split_lines[-1] != '':
            tail.append(split_lines[-1])
        for line in split_lines[:-1]:
            yield line.split(delimiter)
        if not len(r) or (size != None and tot >= size):
            if size != None and tot < size:
                raise DataError("Truncated input: "\
                "Expected %d bytes, got %d" % (size, tot), fname)
            break
    if len(tail) > 0:
        if output_tail:
            yield tail
        else:
            print "Couldn't match the last %d bytes in %s. "\
            "Some bytes may be missing from input." % (sum((len(chunk) for chunk in tail)), fname)
Example #5
0
def ensure_free_space(fname):
    """Raise DataError when the filesystem holding *fname* is low on space.

    Free space is computed from ``os.statvfs`` as the block size times the
    number of blocks available to unprivileged users, and compared against
    the module-level ``MIN_DISK_SPACE`` threshold (bytes).
    """
    stats = os.statvfs(fname)
    free = stats.f_bsize * stats.f_bavail
    if free < MIN_DISK_SPACE:
        # BUG FIX: fname was previously passed to str.format() (a misplaced
        # closing paren) instead of being DataError's second argument, so
        # the error lost its file/url field.
        raise DataError(
            "Only {0} KB disk space available. Task failed.".format(free / 1024),
            fname)
Example #6
0
def sort_reader(fd, fname, read_buffer_size=8192):
    """Iterate (key, value) pairs from a sorted byte stream.

    Records are terminated by ``b'\\x00'``; within a record, key and value
    are separated by a single ``b'\\xff'`` byte. Raises DataError when an
    unterminated record grows past *read_buffer_size* or when unparsed
    bytes remain at end-of-stream.
    """
    pending = b""
    while True:
        chunk = fd.read(read_buffer_size)
        if not len(chunk):
            break
        if len(pending) > read_buffer_size:
            # a single record should never span more than one full buffer
            raise DataError("Could not parse the sorted file.", fname)
        pending += chunk
        records = pending.split(b"\x00")
        # the piece after the last terminator carries over to the next round
        pending = records[-1]
        for record in records[:-1]:
            key, value = record.split(b"\xff")
            yield key, value

    if len(pending):
        raise DataError("Could not parse the tail of the sorted file.", fname)
Example #7
0
    def get_input(cls, id):
        """Look up the replicas of input *id* via the master's INPUT query.

        Raises Wait while the input is still busy and DataError when it has
        failed; otherwise returns a list of (replica-id, url-string) pairs.
        """
        done, inputs = cls.send('INPUT', ['include', [id]])
        _id, status, replicas = inputs[0]
        if status == 'busy':
            raise Wait
        if status == 'failed':
            raise DataError("Can't handle broken input", id)
        return [(rid, str(url)) for rid, url in replicas]
Example #8
0
File: task_io.py Project: saa/disco
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Yields unpickled records from *stream*. A first byte below 128 marks
    the legacy netstring format, which is handed over wholesale to
    ``old_netstr_reader``. Otherwise each hunk starts with a 13-byte
    little-endian header ``<B compressed-flag, I crc32, Q hunk-size>``
    followed by the (optionally zlib-compressed) pickled payload.
    Corrupt or truncated hunks raise DataError unless *ignore_corrupt*
    is set, in which case whatever data was recovered is still unpickled.
    """
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct, gzip, zlib
    offset = 0  # byte position of the current hunk, for error messages
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            # legacy format: no hunk framing
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except Exception:
            # BUG FIX: was a bare ``except:``, which also swallowed
            # SystemExit/KeyboardInterrupt; a short read of the 13-byte
            # header raises struct.error, which this still converts.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        data = b''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                break
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
Example #9
0
def data_err(message, url):
    """Signal a transient data failure by raising DataError.

    The master reacts by retrying the task on another node; if several
    different nodes report the same failure, the job is terminated, so this
    should only be used for errors that are likely temporary. Typically
    called by map readers that momentarily cannot access an input file.
    """
    raise DataError(message, url)
Example #10
0
File: util.py Project: yuj/disco
def data_err(message, url):
    """
    .. deprecated:: 0.4
                    raise :class:`disco.error.DataError` instead.

    Raise a :class:`disco.error.DataError` for *message* and *url*.
    Intended only for errors that are likely transient, e.g. a map reader
    temporarily unable to access its input file.
    """
    raise DataError(message, url)
Example #11
0
    def swap(self):
        """Switch to the next replica url, resuming after the last record.

        Recurses on DataError; raises DataError once every replica has
        been tried.
        """
        try:

            def resume(records, first):
                from itertools import dropwhile
                # drop (index, record) pairs that precede the resume point
                return dropwhile(lambda pair: pair[0] < first, enumerate(records))

            self.iter = resume(self.open(self.urls.next()), self.last + 1)
        except DataError:
            self.swap()
        except StopIteration:
            raise DataError("Exhausted all available replicas", self.input)
Example #12
0
def unix_sort(filename, sort_buffer_size='10%'):
    """Sort *filename* in place with the unix ``sort`` command.

    Records are NUL-terminated and keyed on the first '\\xff'-separated
    field; ``LC_ALL=C`` forces byte-order collation. Raises DataError when
    the subprocess exits non-zero.
    """
    import subprocess
    try:
        sort_env = os.environ.copy()
        sort_env['LC_ALL'] = 'C'
        argv = ['sort', '-z',
                '-t', '\xff',
                '-k', '1,1',
                '-T', '.',
                '-S', sort_buffer_size,
                '-o', filename,
                filename]
        subprocess.check_call(argv, env=sort_env)
    except subprocess.CalledProcessError as e:
        raise DataError("Sorting %s failed: %s" % (filename, e), filename)
Example #13
0
 def disk_sort(self, filename):
     """Sort *filename* in place with unix ``sort``.

     Records are NUL-terminated, keyed on the first '\\xff'-separated
     field; raises DataError when the subprocess fails.
     """
     Status("Sorting %s..." % filename)
     argv = ['sort', '-z',
             '-t', '\xff',
             '-k', '1,1',
             '-T', '.',
             '-S', self.sort_buffer_size,
             '-o', filename,
             filename]
     try:
         subprocess.check_call(argv)
     except subprocess.CalledProcessError as e:
         raise DataError("Sorting %s failed: %s" % (filename, e), filename)
Example #14
0
def ensure_file(fname, data = None, timeout = 60, mode = 500):
    """Atomically create *fname* containing *data* unless it already exists.

    Writes to ``fname + ".partial"`` with O_EXCL and renames into place, so
    concurrent writers cannot corrupt the file. *data* may be bytes or a
    callable producing bytes (evaluated lazily, only when a write is
    needed). Waits up to *timeout* seconds (1s polls) for a competing
    writer's ``.partial`` file to go away.

    Returns True when this call created the file, False when it already
    existed. Raises DataError on a short write, on any unexpected OSError,
    or when the timeout is exhausted.
    """
    while timeout > 0:
        if os.path.exists(fname):
            return False
        try:
            fd = os.open(fname + ".partial",
                os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
            if callable(data):
                data = data()
            n = os.write(fd, data)
            if n != len(data):
                raise DataError("Writing file failed (only wrote %d/%d bytes)."
                                " Out of disk space?" % (n, len(data)), fname)
            os.close(fd)
            os.rename(fname + ".partial", fname)
            return True
        except OSError as x:
            # EEXIST means another writer holds the .partial file: poll
            if x.errno == errno.EEXIST:
                time.sleep(1)
                timeout -= 1
            else:
                raise DataError("Writing external file failed", fname)
    # BUG FIX: previously fell off the loop returning None on timeout;
    # now consistent with the sibling implementation, which raises.
    raise DataError("Timeout in writing external file", fname)
Example #15
0
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Python 2 variant. Yields unpickled records from *stream*. A first byte
    below 128 marks the legacy netstring format, which is handed over
    wholesale to ``old_netstr_reader``. Otherwise each hunk starts with a
    13-byte little-endian header ``<B compressed-flag, I crc32, Q size>``
    followed by the (optionally zlib-compressed) pickled payload. Corrupt
    hunks raise DataError unless *ignore_corrupt* is set, in which case
    whatever data was recovered is still unpickled.
    """
    import struct, cStringIO, gzip, cPickle, zlib
    offset = 0  # byte position of the current hunk, for error messages
    while True:
        header = stream.read(1)
        if not header:
            return
        if ord(header[0]) < 128:
            # legacy format: no hunk framing, hand the stream over
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except:
            # short read of the 13-byte header (struct.error)
            raise DataError("Truncated data at %d bytes" % offset, url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        data = ''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error), e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes %d-%d: %s" %
                    (offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = cStringIO.StringIO(data)
        # unpickle records until the hunk buffer is exhausted
        while True:
            try:
                yield cPickle.load(hunk)
            except EOFError:
                break
Example #16
0
        try:
            fd = os.open(fname + ".partial",
                         os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
            if callable(data):
                data = data()
            os.write(fd, data)
            os.close(fd)
            os.rename(fname + ".partial", fname)
            return True
        except OSError, x:
            if x.errno == errno.EEXIST:
                time.sleep(1)
                timeout -= 1
            else:
                raise DataError("Writing external file failed", fname)
    raise DataError("Timeout in writing external file", fname)


def write_files(files, path):
    """Write the ``{filename: data}`` mapping *files* under *path*.

    Rejects any filename whose absolute location is not directly inside
    *path* (e.g. names containing '/' or '..') by raising ValueError, so
    callers cannot write outside the target directory.
    """
    if files:
        path = os.path.abspath(path)
        ensure_path(path)
    for fname, data in files.iteritems():
        # make sure that no files are written outside the given path
        target = os.path.abspath(os.path.join(path, fname))
        if os.path.dirname(target) != path:
            raise ValueError("Unsafe filename %s" % fname)
        ensure_file(path + "/" + fname, data=data)

Example #17
0
def re_reader(item_re_str,
              fd,
              size,
              fname,
              output_tail=False,
              read_buffer_size=8192):
    """
    A map reader that uses an arbitrary regular expression to parse the input
    stream.

    :param item_re_str: regular expression for matching input items

    The reader works as follows:

     1. X bytes is read from *fd* and appended to an internal buffer *buf*.
     2. ``m = regexp.match(buf)`` is executed.
     3. If *buf* produces a match, ``m.groups()`` is yielded, which contains an
        input entry for the map function. Step 2. is executed for the remaining
        part of *buf*. If no match is made, go to step 1.
     4. If *fd* is exhausted before *size* bytes have been read,
        and *size* tests ``True``,
        a :class:`disco.error.DataError` is raised.
     5. When *fd* is exhausted but *buf* contains unmatched bytes, two modes are
        available: If ``output_tail=True``, the remaining *buf* is yielded as is.
        Otherwise, a message is sent that warns about trailing bytes.
        The remaining *buf* is discarded.

    Note that :func:`re_reader` fails if the input streams contains unmatched
    bytes between matched entries.
    Make sure that your *item_re_str* is constructed so that it covers all
    bytes in the input stream.

    :func:`re_reader` provides an easy way to construct parsers for textual
    input streams.
    For instance, the following reader produces full HTML
    documents as input entries::

        def html_reader(fd, size, fname):
            for x in re_reader("<HTML>(.*?)</HTML>", fd, size, fname):
                yield x[0]

    """
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0
    while True:
        if size:
            r = fd.read(min(read_buffer_size, size - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r

        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)

        if not len(r) or (size != None and tot >= size):
            if size != None and tot < size:
                raise DataError("Truncated input: "\
                "Expected %d bytes, got %d" % (size, tot), fname)
            if len(buf):
                if output_tail:
                    yield [buf]
                else:
                    print "Couldn't match the last %d bytes in %s. "\
                    "Some bytes may be missing from input." % (len(buf), fname)
            break
 def corrupt_reader(fd, size, url, params):
     """Test reader: yields 'hello' then 'there', raising DataError between
     the two records when the url mentions corruption."""
     yield 'hello'
     if url.find('corrupt') != -1:
         raise DataError("Corrupt!", url)
     yield 'there'