Esempio n. 1
0
        def __init__(self, input_files, do_sort, mem_sort_limit):
                """Resolve the input addresses and set up self.iterator.

                input_files    -- iterable of input addresses. Entries that
                                  start with "dir://" are expanded with
                                  parse_dir() for this partition; all other
                                  entries are used as they are.
                do_sort        -- when true, the entries are sorted with
                                  num_cmp before they are iterated.
                mem_sort_limit -- total input size in bytes (as reported by
                                  connect_input()) above which sorting is
                                  handed to download_and_sort() instead of
                                  being done in memory.
                """
                partition = this_partition()
                self.inputs = []
                for address in input_files:
                        if address.startswith("dir://"):
                                try:
                                        self.inputs += parse_dir(address,
                                                part_id = partition)
                                except (KeyboardInterrupt, SystemExit):
                                        # An interrupt or interpreter exit is
                                        # not a data error; let it through.
                                        raise
                                except:
                                        # parse_dir() is not visible here, so
                                        # every other failure is still
                                        # reported as a data error.
                                        data_err("Couldn't resolve address %s"\
                                                % address, address)
                        else:
                                self.inputs.append(address)

                self.line_count = 0
                if do_sort:
                        # Only the sizes are needed here; the handles returned
                        # by connect_input() are not used.
                        total_size = 0
                        for address in self.inputs:
                                size, _ = connect_input(address)
                                total_size += size

                        msg("Reduce[%d] input is %.2fMB" %\
                                (partition, total_size / 1024.0**2))

                        if total_size > mem_sort_limit:
                                self.iterator = self.download_and_sort()
                        else:
                                msg("Sorting in memory")
                                m = list(self.multi_file_iterator(self.inputs, False))
                                m.sort(num_cmp)
                                self.iterator = self.list_iterator(m)
                else:
                        self.iterator = self.multi_file_iterator(self.inputs)
Esempio n. 2
0
File: func.py Progetto: davin/disco
def re_reader(item_re_str, fd, content_len, fname, output_tail = False, read_buffer_size=8192):
    """Generate the items that item_re_str matches in the stream fd.

    The pattern is applied repeatedly with re.match() to the head of an
    internal buffer that is refilled from fd, so it must match from the
    start of the remaining data.

    item_re_str      -- regular expression describing one item.
    fd               -- object with a read(n) method.
    content_len      -- expected number of bytes, or None when unknown.
    fname            -- name of the input, used in messages only.
    output_tail      -- when true, unmatched data left at the end of the
                        stream is yielded as a one-element list.
    read_buffer_size -- maximum number of bytes requested per read.

    Yields the groups() tuple of every match. A stream shorter than
    content_len is reported through data_err().
    """
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0
    while True:
        if content_len:
            r = fd.read(min(read_buffer_size, content_len - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r

        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)

        # Compare against content_len only when it is known. Without the
        # None check an unknown length ended the loop after the first
        # buffer on Python 2 (any int >= None is true there) and raises
        # TypeError on Python 3.
        if not r or (content_len is not None and tot >= content_len):
            if content_len is not None and tot < content_len:
                data_err("Truncated input (%s). "\
                     "Expected %d bytes, got %d" %\
                     (fname, content_len, tot), fname)
            if buf:
                if output_tail:
                    yield [buf]
                else:
                    msg("Couldn't match the last %d "\
                        "bytes in %s. Some bytes may be "\
                        "missing from input." %\
                        (len(buf), fname))
            break
Esempio n. 3
0
def get(key, job = None):
        """Fetch the out-of-band value stored under key.

        key -- OOB key to look up.
        job -- name of the job that owns the key; this_name() is used
               when it is empty.

        Returns whatever load_oob() returns. A comm.CommException is
        reported through data_err() together with the master URL and the
        HTTP status carried by the exception.
        """
        # Bound before the try so the error message below can refer to it;
        # the handler used to format an undefined name and died with a
        # NameError instead of reporting the failure.
        url = "http://" + this_master()
        try:
                job = job or this_name()
                return load_oob(url, job, key)
        except comm.CommException as x:
                data_err("OOB key (%s) not found at %s: HTTP status '%s'" %\
                        (key, url, x.http_code), key)
Esempio n. 4
0
def open_local(input, fname):
        """Open the local file fname for reading.

        input -- address of the input that fname belongs to; used in the
                 error report only.
        fname -- path of the file to open.

        Returns (size, f): the size of the file in bytes and the open
        file object. Failures are reported through data_err().
        """
        try:
                # Stat first so that no open handle is left behind when
                # the size cannot be determined.
                sze = os.stat(fname).st_size
                f = open(fname)
                return sze, f
        except Exception:
                # Unlike a bare except this lets KeyboardInterrupt and
                # SystemExit through.
                data_err("Can't access a local input file (%s): %s"\
                                % (input, fname), input)
Esempio n. 5
0
def open_remote(input, ext_host, ext_file, is_chunk):
        """Open ext_file on ext_host over a pooled HTTP connection.

        input    -- address of the input; used in the error report only.
        ext_host -- host to connect to; also the key of the connection in
                    http_pool.
        ext_file -- path requested from the host.
        is_chunk -- when true, two 8-byte offsets for this partition are
                    first read from the head of ext_file with a Range
                    request, and only the byte range they delimit is
                    fetched.

        Returns (size, fd). fd is the HTTP response, or an empty StringIO
        when the partition's range is empty. size is the Content-Length
        header converted to int, or the header value unchanged when it is
        missing or empty. Failures other than a dropped pooled connection
        are reported through data_err().
        """
        try:
                # We can't open a new HTTP connection for each intermediate
                # result -- this would result to M * R TCP connections where
                # M is the number of maps and R the number of reduces. Instead,
                # we pool connections and reuse them whenever possible. HTTP
                # 1.1 defaults to keep-alive anyway.
                if ext_host in http_pool:
                        http = http_pool[ext_host]
                        # Read and discard a response that is still attached
                        # to the pooled connection (presumably required
                        # before a new request can be sent on it).
                        if http._HTTPConnection__response:
                                http._HTTPConnection__response.read()
                else:
                        http = httplib.HTTPConnection(ext_host)
                        http_pool[ext_host] = http

                if is_chunk:
                        # Offsets are 8 bytes each; this partition's start
                        # and end offsets are two consecutive entries.
                        pos = this_partition() * 8
                        rge = "bytes=%d-%d" % (pos, pos + 15)
                        http.request("GET", ext_file, None, {"Range": rge})
                        fd = http.getresponse()
                        if fd.status != 206:
                                # String exceptions no longer exist in
                                # Python 2.6+; raise a real exception.
                                raise IOError("HTTP error %d" % fd.status)
                        start, end = struct.unpack("QQ", fd.read())
                        if start == end:
                                return 0, cStringIO.StringIO()

                        rge = "bytes=%d-%d" % (start, end - 1)
                        http.request("GET", ext_file, None, {"Range": rge})
                        fd = http.getresponse()
                        if fd.status != 206:
                                raise IOError("HTTP error %d" % fd.status)
                else:
                        http.request("GET", ext_file, "")
                        fd = http.getresponse()
                        if fd.status != 200:
                                raise IOError("HTTP error %d" % fd.status)
                sze = fd.getheader("content-length")
                if sze:
                        sze = int(sze)
                return sze, fd

        except httplib.BadStatusLine:
                # BadStatusLine is caused by a closed connection. Re-open a new
                # connection by deleting this connection from the pool and
                # calling this function again. Note that this might result in
                # endless recursion if something went seriously wrong.
                http.close()
                del http_pool[ext_host]
                return open_remote(input, ext_host, ext_file, is_chunk)
        except Exception:
                # Unlike a bare except this lets KeyboardInterrupt and
                # SystemExit through.
                data_err("Can't access an external input file (%s/%s): %s"\
                                % (ext_host, ext_file, input), input)
Esempio n. 6
0
def get(key, job = None):
        """Fetch the out-of-band value stored under key.

        key -- OOB key to look up.
        job -- name of the job that owns the key; the module-level
               job_name is used when it is empty.

        Returns the body of the response. A response whose "status"
        header does not start with "200" is reported through data_err().
        """
        c = urllib.urlopen(OOB_URL % (job or job_name, key))
        try:
                if "status" in c.headers and not c.headers["status"].startswith("200"):
                        data_err("OOB <%s> key (%s) not found" % (c.headers["status"], key), key)
                else:
                        return c.read()
        finally:
                # Close the connection on every path; it used to stay open
                # when the key was missing or when read() failed.
                c.close()
Esempio n. 7
0
def open_local(input, fname, is_chunk):
        """Open the local file fname, positioned at this partition's data.

        input    -- address of the input; used in the error report only.
        fname    -- path of the file to open.
        is_chunk -- when true, two 8-byte offsets (start, end) are read
                    at position this_partition() * 8 and the file is
                    positioned at start.

        Returns (size, f): for a chunk, size is end - start; otherwise it
        is the size of the whole file. Failures are reported through
        data_err().
        """
        try:
                f = open(fname)
                try:
                        if is_chunk:
                                f.seek(this_partition() * 8)
                                start, end = struct.unpack("QQ", f.read(16))
                                sze = end - start
                                f.seek(start)
                        else:
                                sze = os.stat(fname).st_size
                        return sze, f
                except:
                        # Do not leak the handle when the header or the
                        # size cannot be read; the error is re-raised and
                        # handled below.
                        f.close()
                        raise
        except Exception:
                # Unlike a bare except this lets KeyboardInterrupt and
                # SystemExit through.
                data_err("Can't access a local input file: %s"\
                                % input, input)
Esempio n. 8
0
        def read_netstr(idx, data, tot):
                """Read one length-prefixed value from data, starting at idx.

                The value is encoded as a decimal length, a space, the value
                itself and one more trailing character. fd, fname and
                content_len are not defined in this function; they come from
                an enclosing scope that is not visible here.

                idx  -- position in data where the length field starts.
                data -- buffered input; refilled from fd when it runs short.
                tot  -- running count that is advanced past the consumed
                        length field and value and is used in messages.

                Returns (idx, data, tot, msg): the position after the value,
                the (possibly replaced) buffer, the updated count and the
                value itself.

                NOTE(review): err() and data_err() are assumed not to
                return -- TODO confirm.
                """
                ldata = len(data)
                i = 0
                lenstr = ""
                # Fewer than 11 characters left after idx: drop the consumed
                # prefix and append up to 8192 more characters from fd.
                if ldata - idx < 11:
                        data = data[idx:] + fd.read(8192)
                        ldata = len(data)
                        idx = 0

                # The space that ends the length field is searched for only
                # within data[idx:idx + 11].
                i = data.find(" ", idx, idx + 11)
                if i == -1:
                        err("Corrupted input (%s). Could not "\
                               "parse a value length at %d bytes."\
                                        % (fname, tot))
                else:
                        # lenstr includes the terminating space; int() below
                        # ignores surrounding whitespace.
                        lenstr = data[idx:i + 1]
                        idx = i + 1

                if ldata < i + 1:
                        data_err("Truncated input (%s). "\
                                "Expected %d bytes, got %d" %\
                                (fname, content_len, tot), fname)
                
                try:
                        llen = int(lenstr)
                except ValueError:
                        err("Corrupted input (%s). Could not "\
                                "parse a value length at %d bytes."\
                                        % (fname, tot))

                # Account for the length field (digits plus the space).
                tot += len(lenstr)

                # The value plus its trailing character (llen + 1) must be
                # buffered; otherwise read at least that much more from fd.
                if ldata - idx < llen + 1:
                        data = data[idx:] + fd.read(llen + 8193)
                        ldata = len(data)
                        idx = 0

                # NOTE(review): this local hides any outer name msg inside
                # this function.
                msg = data[idx:idx + llen]
                
                # Still short after the refill: the input ended early.
                if idx + llen + 1 > ldata:
                        data_err("Truncated input (%s). "\
                                "Expected a value of %d bytes "\
                                "(offset %u bytes)" %\
                                (fname, llen + 1, tot), fname)

                # Step over the value and its trailing character.
                tot += llen + 1
                idx += llen + 1
                return idx, data, tot, msg
Esempio n. 9
0
def merge_chunks(partitions):
        """Merge the partition files into one chunk file and delete them.

        The chunk starts with len(partitions) + 1 packed 8-byte offsets:
        the position where each partition's data starts, followed by the
        position where the last partition ends. The contents of the
        partition files follow in order.

        partitions -- sequence of objects with an fname attribute naming
                      the partition file.

        The chunk is built under a ".partial" name and renamed into place
        once it is complete. A failing concatenation is reported through
        data_err().
        """
        mapout = CHUNK_OUTPUT % (job_name, this_partition())
        partial = mapout + ".partial"
        fnames = [p.fname for p in partitions]

        f = open(partial, "wb")
        try:
                offset = (len(fnames) + 1) * 8
                for fname in fnames:
                        f.write(struct.pack("Q", offset))
                        offset += os.stat(fname).st_size
                f.write(struct.pack("Q", offset))
        finally:
                # Close the header file even when a partition is missing.
                f.close()

        # Without file arguments cat would read stdin, so skip it when
        # there is nothing to append.
        if fnames:
                out = open(partial, "ab")
                try:
                        try:
                                # An argument list instead of a shell
                                # string: file names containing spaces or
                                # shell metacharacters are passed through
                                # untouched.
                                failed = subprocess.call(["cat"] + fnames,
                                        stdout = out)
                        except OSError:
                                # cat could not be started at all.
                                failed = True
                finally:
                        out.close()
                if failed:
                        data_err("Couldn't create a chunk", mapout)

        os.rename(partial, mapout)
        for fname in fnames:
                os.remove(fname)
Esempio n. 10
0
def ensure_file(fname, data = None, timeout = 60, mode = 500):
    """Create fname with the given contents unless it already exists.

    The file is written under fname + ".partial", which is created
    exclusively, and renamed to fname when complete. While another
    writer holds the ".partial" file, the attempt is repeated once a
    second.

    fname   -- path of the file to create.
    data    -- contents to write, or a callable returning them; the
               callable is invoked only after the ".partial" file has
               been created.
    timeout -- number of one-second retries allowed while the
               ".partial" file exists.
    mode    -- permission bits passed to os.open().
               NOTE(review): the default 500 is decimal (0764 octal);
               0500 may have been intended -- confirm before changing.

    Returns True when this call created the file and False when the
    file already existed. Other OSErrors are reported through
    data_err().
    """
    partial = fname + ".partial"
    while timeout > 0:
        if os.path.exists(fname):
            return False
        try:
            fd = os.open(partial,
                os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
            complete = False
            try:
                try:
                    if callable(data):
                        data = data()
                    os.write(fd, data)
                finally:
                    # Never leak the descriptor, whatever went wrong.
                    os.close(fd)
                os.rename(partial, fname)
                complete = True
            finally:
                if not complete:
                    # This call created the partial file, so it is ours
                    # to remove. Leaving it behind would make every
                    # later caller wait until its timeout runs out.
                    try:
                        os.remove(partial)
                    except OSError:
                        pass
            return True
        except OSError as x:
            if x.errno == errno.EEXIST:
                time.sleep(1)
                timeout -= 1
            else:
                data_err("Writing external file %s failed"\
                     % fname, fname)
Esempio n. 11
0
                                                return
                                        outstream.write(buf)
                        except Exception, x:
                                # output file is inconsistent state
                                # we must crash the job
                                err("Updating file %s failed: %s" %\
                                        (outfile, x))
                except IOError, x:
                        # Python doc guides us to check both the
                        # EWOULDBLOCK (11) and EACCES (13) errors
                        if x.errno == 11 or x.errno == 13:
                                time.sleep(1)
                                timeout -= 1
                        else:
                                raise
        data_err("Timeout when updating file %s" % outfile, outfile)


def ensure_file(fname, data = None, timeout = 60, mode = 500):
        while timeout > 0:
                if os.path.exists(fname):
                        return False
                try:
                        fd = os.open(fname + ".partial",
                                os.O_CREAT | os.O_EXCL | os.O_WRONLY, mode)
                        if type(data) == str:
                               os.write(fd, data)
                        else:
                               os.write(fd, data())
                        os.close(fd)
                        os.rename(fname + ".partial", fname)
Esempio n. 12
0
def open_remote(input, ext_host, ext_file):
        """Open http://<ext_host><ext_file> with comm.open_remote().

        input    -- address of the input; passed to data_err() to identify
                    the failing input.
        ext_host -- host part of the URL.
        ext_file -- path part of the URL.

        Returns whatever comm.open_remote() returns. Any Exception is
        reported through data_err() with the exception text in the
        message.
        """
        try:
                return comm.open_remote("http://%s%s" % (ext_host, ext_file))
        except Exception as x:
                # The second argument identifies the failing input, as in
                # the other data_err() calls; the exception itself is
                # already part of the message.
                data_err("Can't access an external input file (%s%s): %s"\
                         % (ext_host, ext_file, x), input)