Example #1
def safe_append(instream, outfile, timeout = 60):
        outstream = file(outfile, "a")
        while timeout > 0:
                try:
                        fcntl.flock(outstream, fcntl.LOCK_EX | fcntl.LOCK_NB)
                        try:
                                while True:
                                        buf = instream.read(8192)
                                        if not buf:
                                                instream.close()
                                                outstream.close()
                                                return
                                        outstream.write(buf)
                        except Exception, x:
                        # the output file is in an inconsistent state;
                        # we must crash the job
                                err("Updating file %s failed: %s" %\
                                        (outfile, x))
                except IOError, x:
                        # the Python docs guide us to check for both the
                        # EWOULDBLOCK (11) and EACCES (13) errors
                        if x.errno == 11 or x.errno == 13:
                                time.sleep(1)
                                timeout -= 1
                        else:
                                raise
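A minimal usage sketch: the instream argument only needs file-like read() and close() methods, so any buffer works. The StringIO source and the path below are illustrative assumptions, not part of the original module.

from cStringIO import StringIO

# several tasks may call this concurrently; flock() serializes the appends
safe_append(StringIO("part-0 done\n"), "/tmp/job-index.txt", timeout = 30)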
Example #2
def netstr_reader(fd, content_len, fname):
        # yields (key, value) pairs from a stream of "<len> <payload>"
        # records (length field of up to 10 digits, one separator space,
        # one trailing byte per record) until content_len bytes are read
        if content_len is None:
                err("Content-length must be defined for netstr_reader")
        def read_netstr(idx, data, tot):
                ldata = len(data)
                i = 0
                lenstr = ""
                if ldata - idx < 11:
                        data = data[idx:] + fd.read(8192)
                        ldata = len(data)
                        idx = 0

                i = data.find(" ", idx, idx + 11)
                if i == -1:
                        err("Corrupted input (%s). Could not "\
                               "parse a value length at %d bytes."\
                                        % (fname, tot))
                else:
                        lenstr = data[idx:i + 1]
                        idx = i + 1

                if ldata < i + 1:
                        data_err("Truncated input (%s). "\
                                "Expected %d bytes, got %d" %\
                                (fname, content_len, tot), fname)
                
                try:
                        llen = int(lenstr)
                except ValueError:
                        err("Corrupted input (%s). Could not "\
                                "parse a value length at %d bytes."\
                                        % (fname, tot))

                tot += len(lenstr)

                if ldata - idx < llen + 1:
                        data = data[idx:] + fd.read(llen + 8193)
                        ldata = len(data)
                        idx = 0

                msg = data[idx:idx + llen]
                
                if idx + llen + 1 > ldata:
                        data_err("Truncated input (%s). "\
                                "Expected a value of %d bytes "\
                                "(offset %u bytes)" %\
                                (fname, llen + 1, tot), fname)

                tot += llen + 1
                idx += llen + 1
                return idx, data, tot, msg
        
        data = fd.read(8192)
        tot = idx = 0
        while tot < content_len:
                key = val = ""
                idx, data, tot, key = read_netstr(idx, data, tot)
                idx, data, tot, val = read_netstr(idx, data, tot)
                yield key, val
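A hypothetical round trip, assuming the "<len> <payload>\n" record layout implied by the parser above (one separator space and one trailing byte per record); encode_netstr is not part of the original source.

from cStringIO import StringIO

def encode_netstr(s):
        return "%d %s\n" % (len(s), s)

payload = encode_netstr("pi") + encode_netstr("3.14")
for key, val in netstr_reader(StringIO(payload), len(payload), "<test>"):
        print key, val          # -> pi 3.14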
Example #3
def op_map(job):
        global job_name
        
        job_input = this_inputs()
        msg("Received a new map job!")
        
        if len(job_input) != 1:
                err("Map can only handle one input. Got: %s" % 
                        " ".join(job_input))

        nr_reduces = int(job['nr_reduces'])
        required_modules = job['required_modules'].split()
        fun_map_reader.func_code = marshal.loads(job['map_reader'])
        fun_map_writer.func_code = marshal.loads(job['map_writer'])
        fun_partition.func_code = marshal.loads(job['partition'])
        for m in required_modules:
                fun_map_reader.func_globals.setdefault(m, __import__(m))
                fun_partition.func_globals.setdefault(m, __import__(m))
        
        if 'ext_map' in job:
                if 'ext_params' in job:
                        map_params = job['ext_params']
                else:
                        map_params = "0\n"
                external.prepare(job['ext_map'],
                        map_params, EXT_MAP % job_name)
                fun_map.func_code = external.ext_map.func_code
        else:
                map_params = cPickle.loads(job['params'])        
                fun_map.func_code = marshal.loads(job['map'])
        
        for m in required_modules:
                fun_map.func_globals.setdefault(m, __import__(m))

        if 'map_init' in job:
                fun_init.func_code = marshal.loads(job['map_init'])

        if 'combiner' in job:
                fun_combiner.func_code = marshal.loads(job['combiner'])
                for m in required_modules:
                        fun_combiner.func_globals.setdefault(m, __import__(m))
                partitions = [MapOutput(i, map_params, fun_combiner)\
                        for i in range(nr_reduces)]
        else:
                partitions = [MapOutput(i, map_params) for i in range(nr_reduces)]
        
        run_map(job_input[0], partitions, map_params)
        for p in partitions:
                p.close()
        if 'chunked' in job:
                merge_chunks(partitions)
                out = "chunk://%s/%s/map-chunk-%d" %\
                        (this_host(), job_name, this_partition())
        else:
                out = partitions[0].disco_address()
        
        external.close_ext()
        msg("%d %s" % (this_partition(), out), "OUT")
Example #4
def write_files(ext_data, path):
        path = os.path.abspath(path)
        # the trailing slash makes ensure_path() create path itself
        ensure_path(path + "/", False)
        for fname, data in ext_data.iteritems():
                # make sure that no files are written outside the given path
                p = os.path.abspath(os.path.join(path, fname))
                if os.path.dirname(p) == path:
                        ensure_file(path + "/" + fname, data = data)
                else:
                        err("Unsafe filename %s" % fname)
Example #5
def op_map(job):
    msg("Received a new map job!")

    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"

        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)

    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)
Example #6
def map_input_stream(stream, size, url, params):
    m = re.match(r"(\w+)://", url)
    if m:
        scheme = m.group(1)
        try:
            mod = __import__("disco.schemes.scheme_%s" % scheme,
                    fromlist = ["scheme_%s" % scheme])
        except Exception:
            err("Unknown scheme %s in %s" % (scheme, url))
    else:
        from disco.schemes import scheme_file as mod
        url = "file://" + url
    mod.input_stream.func_globals.setdefault("Task", Task)
    return mod.input_stream(stream, size, url, params)
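For reference, a scheme module only needs a module-level input_stream() with this signature. A hypothetical scheme_raw sketch (the module name, URL scheme, and the (stream, size, url) return convention are assumptions based on connect_input in Example #10):

# disco/schemes/scheme_raw.py (hypothetical)
from cStringIO import StringIO

def input_stream(stream, size, url, params):
    # serve the bytes after "raw://" directly as the task input
    payload = url[len("raw://"):]
    return StringIO(payload), len(payload), url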
Example #8
def ensure_path(path, check_exists = True):
        if check_exists and os.path.exists(path):
                err("File exists: %s" % path)
        if os.path.isfile(path):
                os.remove(path)
        dirpath, fname = os.path.split(path)
        try:
                os.makedirs(dirpath)
        except OSError, x:
                if x.errno == errno.EEXIST:
                        # "File exists" is ok; it may happen
                        # if two tasks are racing to create
                        # the directory
                        pass
                else:
                        raise
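A usage sketch: with check_exists=False the call acts as a race-tolerant "mkdir -p" for the file's parent directory (the path below is hypothetical).

ensure_path("/tmp/job/partitions/part-0", check_exists = False)
# creates /tmp/job/partitions if needed; the file itself is not created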
Example #9
def _safe_fileop(op, mode, outfile, timeout):
    outstream = file(outfile, mode)
    while timeout > 0:
        try:
            fcntl.flock(outstream, fcntl.LOCK_EX | fcntl.LOCK_NB)
            try:
                r = op(outstream)
                outstream.close()
                return r
            except Exception, x:
            # the output file is in an inconsistent state;
            # we must crash the job
                err("Updating file %s failed: %s" %\
                    (outfile, x))
        except IOError, x:
            # the Python / BSD docs guide us to check for these errors
            if x.errno in (errno.EACCES, errno.EAGAIN, errno.EWOULDBLOCK):
                time.sleep(0.1)
                timeout -= 0.1
            else:
                raise
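Example #1's safe_append can be expressed as a thin wrapper over this helper; the wrapper body below is an assumed reconstruction, not the original source.

def safe_append(instream, outfile, timeout = 60):
    def op(outstream):
        # copy the input in 8 KB chunks, as in Example #1
        for buf in iter(lambda: instream.read(8192), ""):
            outstream.write(buf)
        instream.close()
    return _safe_fileop(op, "a", outfile, timeout)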
Example #10
    def download_and_sort(self, params):
        dlname = Task.path("REDUCE_DL", Task.id)
        msg("Reduce will be downloaded to %s" % dlname)
        out_fd = AtomicFile(dlname, "w")
        for url in self.inputs:
            fd, sze, url = connect_input(url, params)
            for k, v in fun_reader(fd, sze, url):
                if " " in k:
                    err("Spaces are not allowed in keys "\
                        "with external sort.")
                if "\0" in v:
                    err("Zero bytes are not allowed in "\
                        "values with external sort. "\
                        "Consider using base64 encoding.")
                out_fd.write("%s %s\0" % (k, v))
        out_fd.close()
        msg("Reduce input downloaded ok")

        msg("Starting external sort")
        sortname = Task.path("REDUCE_SORTED", Task.id)
        ensure_path(os.path.dirname(sortname))
        cmd = ["sort", "-n", "-k", "1,1", "-z",\
            "-t", " ", "-o", sortname, dlname]

        proc = subprocess.Popen(cmd)
        ret = proc.wait()
        if ret:
            err("Sorting %s to %s failed (%d)" %\
                (dlname, sortname, ret))

        msg("External sort done: %s" % sortname)
        return self.multi_file_iterator([sortname], params, reader =\
            lambda fd, sze, url:\
                re_reader("(?s)(.*?) (.*?)\000", fd, sze, url))
Example #11
        def download_and_sort(self):
                dlname = REDUCE_DL % (job_name, this_partition())
                ensure_path(dlname, False)
                msg("Reduce will be downloaded to %s" % dlname)
                out_fd = file(dlname + ".partial", "w")
                for fname in self.inputs:
                        sze, fd = connect_input(fname)
                        for k, v in fun_reduce_reader(fd, sze, fname):
                                if " " in k:
                                        err("Spaces are not allowed in keys "\
                                            "with external sort.")
                                if "\0" in v:
                                        err("Zero bytes are not allowed in "\
                                            "values with external sort. "\
                                            "Consider using base64 encoding.")
                                out_fd.write("%s %s\0" % (k, v))
                out_fd.close()
                os.rename(dlname + ".partial", dlname)
                msg("Reduce input downloaded ok")

                msg("Starting external sort")
                sortname = REDUCE_SORTED % (job_name, this_partition())
                ensure_path(sortname, False)
                cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",\
                        "-t", " ", "-o", sortname, dlname]

                proc = subprocess.Popen(cmd)
                ret = proc.wait()
                if ret:
                        err("Sorting %s to %s failed (%d)" %\
                                (dlname, sortname, ret))
                
                msg("External sort done: %s" % sortname)
                return self.multi_file_iterator([sortname], reader =\
                        lambda fd, sze, fname:\
                                re_reader("(.*?) (.*?)\000", fd, sze, fname))
Example #12
def op_map(job):
        job_input = this_inputs()
        msg("Received a new map job!")
        
        if len(job_input) != 1:
                err("Map can only handle one input. Got: %s" % 
                        " ".join(job_input))

        nr_reduces = int(job['nr_reduces'])
        nr_part = max(1, nr_reduces)
        fun_map_reader.func_code = marshal.loads(job['map_reader'])
        fun_map_writer.func_code = marshal.loads(job['map_writer'])
        fun_partition.func_code = marshal.loads(job['partition'])

        if 'map_init' in job:
                fun_init.func_code = marshal.loads(job['map_init'])
        
        if 'required_files' in job:
                write_files(marshal.loads(job['required_files']), REQ_FILES)
                sys.path.insert(0, REQ_FILES)

        req_mod = job['required_modules'].split()
        import_modules(req_mod, [fun_map_reader, fun_map_writer,
            fun_partition, fun_map, fun_combiner, fun_init])

        if 'ext_map' in job:
                if 'ext_params' in job:
                        map_params = job['ext_params']
                else:
                        map_params = "0\n"
                external.prepare(job['ext_map'], map_params, EXT_MAP)
                fun_map.func_code = external.ext_map.func_code
        else:
                map_params = cPickle.loads(job['params'])        
                fun_map.func_code = marshal.loads(job['map'])
        

        if 'combiner' in job:
                fun_combiner.func_code = marshal.loads(job['combiner'])
                partitions = [MapOutput(i, map_params, fun_combiner)\
                        for i in range(nr_part)]
        else:
                partitions = [MapOutput(i, map_params) for i in range(nr_part)]
        
        run_map(job_input[0], partitions, map_params)
        external.close_ext()
        
        for p in partitions:
                p.close()

        if nr_reduces:
                merge_partitions(partitions)
                n = os.path.basename(PART_OUTPUT % 0)
                msg("dir://%s/%s%s:%d" % (this_host(), JOB_HOME, n,
                        len(partitions) - 1), "OUT")
        else:
                res = [os.path.basename(p.fname) for p in partitions]
                index = cStringIO.StringIO("\n".join(res) + "\n")
                safe_append(index, MAP_INDEX)
                msg("dir://%s/%smap-index.txt" %\
                        (this_host(), JOB_HOME), "OUT")