Beispiel #1
0
 def test_urlsplit(self):
     """Check urlsplit on plain http URLs and on the disco:// scheme.

     A disco:// URL is expected to come back as an http URL whose port is
     the configured ``DISCO_PORT``.
     """
     port = self.disco_settings['DISCO_PORT']
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(urlsplit('http://host/path'),
                      ('http', ('host', ''), 'path'))
     self.assertEqual(urlsplit('http://host:port/path'),
                      ('http', ('host', 'port'), 'path'))
     self.assertEqual(urlsplit('disco://master/long/path'),
                      ('http', ('master', '%s' % port), 'long/path'))
Beispiel #2
0
def open(url, task=None):
    """Open ``url`` through ``comm.open_url`` after splitting and re-joining it.

    With a ``task``, the URL is split using that task's host, port and data
    directories; without one it is split with ``localhost=None``.
    """
    if task:
        split_options = dict(localhost=task.host,
                             disco_port=task.disco_port,
                             disco_data=task.disco_data,
                             ddfs_data=task.ddfs_data)
    else:
        split_options = dict(localhost=None)
    scheme, netloc, path = util.urlsplit(url, **split_options)
    rejoined = util.urljoin((scheme, netloc, path))
    return comm.open_url(rejoined)
Beispiel #3
0
def process_restrict(interface, state, label, inp, task, label_fn, ffuncs,
                     ghfuncs, deffuncs, agg_fn, wide=False, need_agg=False):
    """Pin the input to its local replica and forward or aggregate its records.

    The replica of ``inp.input`` whose host equals ``task.host`` becomes the
    input; an ``Exception`` is raised when no replica matches.  With
    ``need_agg`` set and ``wide`` unset, the ``(label, key)`` pairs produced
    by ``agg_fn`` are written with an empty value; otherwise every
    ``(key, value)`` record is written to the output chosen by ``label_fn``.
    """
    from disco import util

    # inp contains a set of replicas, let's force local #HACK
    for _, replica_url in inp.input.replicas:
        _, (replica_host, _), _ = util.urlsplit(replica_url)
        if replica_host == task.host:
            inp.input = replica_url
            break
    else:
        # the loop ran out without finding a replica on this host
        raise Exception("Input %s not processed, no LOCAL resource found."
                        % str(inp.input))

    if not need_agg or wide:
        # plain pass-through, routed by label_fn
        for key, value in inp:
            interface.output(label_fn(key)).add(key, value)
        return

    # opportunistically aggregate in this stage
    for out_label, key in agg_fn(inp, label_fn, ffuncs, ghfuncs, deffuncs):
        interface.output(out_label).add(key, ())
Beispiel #4
0
    def map_input_stream(stream, size, url, params):
        """Serve the URL's netloc string as input, asserting it is not this host."""
        from disco.func import string_input_stream
        from disco.util import urlsplit

        _, location, _ = urlsplit(url)
        # the input must have been scheduled away from the node named in the URL
        assert location.host != Task.host
        payload = str(location)
        return string_input_stream(payload, size, url, params)
Beispiel #5
0
def request(method, url, data=None, headers={}, sleep=0):
    """Send an HTTP request, retrying on failure and following redirects.

    ``url`` is resolved with ``urlresolve`` and split with ``urlsplit``; the
    request goes to ``/<path>`` on the resulting netloc.

    * No status, or a status ``isunavailable`` accepts: sleep a random
      ``1..2**sleep`` seconds and retry with ``sleep + 1``; once ``sleep``
      reaches 9 a ``CommError`` is raised instead.
    * A redirection: the request is repeated against the ``Location`` target.
    * Any other non-successful status: ``CommError`` carrying the body.

    Returns the response object on success.  ``headers`` is only passed
    through, never modified, so the shared default dict is safe here.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)

    if not status or isunavailable(status):
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method, url, data=data, headers=headers, sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        if not loc:
            # A redirect without a Location header cannot be followed; this
            # used to crash with AttributeError on ``None.startswith``.
            raise CommError("Redirect response has no Location header",
                            url, status)
        return request(method,
                       loc if loc.startswith('http:') else resolveuri(url, loc),
                       data=data,
                       headers=headers,
                       sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
Beispiel #6
0
def process_restrict(interface, state, label, inp, task, label_fn, ffuncs,
                     ghfuncs, deffuncs, agg_fn, wide=False, need_agg=False,
                     distinct=False, limit=sys.maxint):
    """Pin the input to its local replica, then aggregate or de-duplicate it.

    The replica of ``inp.input`` whose host equals ``task.host`` becomes the
    input; ``util.DataError`` is raised when there is none.  With ``need_agg``
    set and ``wide`` unset, the ``(label, key)`` pairs produced by ``agg_fn``
    are written out with an empty value.  Otherwise, with ``distinct`` set,
    one record per run of consecutive equal keys is written out with an empty
    value, capped at ``limit`` runs.

    NOTE(review): the listing stops at the final ``else:``; the non-distinct
    branch is not visible here.
    """
    from disco import util
    from itertools import groupby, islice
    empty = ()

    # inp contains a set of replicas, let's force local #HACK
    input_processed = False
    for i, inp_url in inp.input.replicas:
        scheme, (netloc, port), rest = util.urlsplit(inp_url)
        if netloc == task.host:
            # found the replica stored on this host; read from it
            input_processed = True
            inp.input = inp_url
            break

    if not input_processed:
        raise util.DataError("Input %s not processed, no LOCAL resource found."
                             % str(inp.input), '')

    # opportunistically aggregate, distinct and limit in this stage
    if need_agg and not wide:
        for out_label, key in agg_fn(inp, label_fn, ffuncs, ghfuncs, deffuncs):
            interface.output(out_label).add(key, empty)
    else:
        if distinct:
            # groupby only merges adjacent equal keys; islice caps the number
            # of groups at ``limit``
            for uniqkey, _ in islice(groupby(inp, lambda (k, v): tuple(k)), 0, limit):
                label = label_fn(uniqkey)
                interface.output(label).add(uniqkey, empty)
        else:
Beispiel #7
0
def imp_process(data):
    """Store the host of ``data["url"]``, minus a leading ``www.``, as ``data["site_id"]``."""
    from disco.util import urlsplit

    _scheme, (hostname, _port), _path = urlsplit(data["url"])
    # drop the four-character "www." prefix when present
    data["site_id"] = hostname[4:] if hostname.startswith("www.") else hostname
Beispiel #8
0
def imp_process(data):
    """Store the host of ``data['url']``, minus a leading ``www.``, as ``data['site_id']``."""
    from disco.util import urlsplit

    _scheme, (hostname, _port), _path = urlsplit(data['url'])
    # drop the four-character 'www.' prefix when present
    data['site_id'] = hostname[4:] if hostname.startswith('www.') else hostname
Beispiel #9
0
def stat_input_stream(fd, size, url, params):
    """Yield one ``('', stats)`` record describing the marble behind ``url``.

    ``stats['_']`` is the marble's row count; for every field that has an
    index database, ``stats[field]`` is that index's ``ms_entries`` divided by
    the row count.  Errors are printed and re-raised; the marble is always
    closed.
    """
    from disco import util
    from hustle.core.marble import MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        raise e

    marble = None
    try:
        local_path = util.localize(rest, disco_data=params._task.disco_data,
                                   ddfs_data=params._task.ddfs_data)
        marble = MarbleStream(local_path)
        total = marble.number_rows
        denominator = float(total)
        stats = {'_': total}
        for field, (_, index_db, _, _, _) in marble.dbs.iteritems():
            if index_db:
                entries = index_db.stat(marble.txn)['ms_entries']
                stats[field] = entries / denominator
        yield '', stats
    except Exception as e:
        print("Gibbers: %s" % e)
        raise e
    finally:
        if marble:
            marble.close()
Beispiel #10
0
 def map_input_stream(stream, size, url, params):
     """Assert the URL's netloc is this task's netloc, then open the remote data."""
     from disco.util import urlsplit
     from disco import comm
     _, location, path = urlsplit(url)
     # test that scheduler preserved data locality
     msg("NODE %s GOT URL %s" % (Task.netloc, url))
     assert location == Task.netloc
     target = "http://%s/%s" % (path, location)
     return comm.open_remote(target)
Beispiel #11
0
def request(method, url, data=None, headers={}, sleep=0):
    """Issue one HTTP request for ``url`` and read the response.

    The URL is passed through ``urlresolve`` and ``urlsplit``; the request is
    sent to ``/<path>`` on the resulting netloc.  Any ``httplib`` or socket
    error is re-raised as ``CommError``.

    NOTE(review): nothing is returned and ``sleep`` is unused in the lines
    shown; the listing may be cut short.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
    except (httplib.HTTPException, httplib.socket.error), e:
        # surface transport failures as the project's own error type
        raise CommError("Request failed: %s" % e, url)
Beispiel #12
0
def request(method, url, data=None, headers={}, sleep=0):
    """Issue one HTTP request for ``url`` and record its status and reason.

    The URL is passed through ``urlresolve`` and ``urlsplit``; the request is
    sent to ``/<path>`` on the resulting netloc.  On ``httplib.HTTPException``
    the status is set to ``None`` and the error text is kept in ``errmsg``.

    NOTE(review): nothing is returned and ``sleep`` is unused in the lines
    shown; the listing may be cut short.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        # no status is available when the exchange itself failed
        status = None
        errmsg = str(e) or repr(e)
Beispiel #13
0
def input_stream(fd, size, url, params):
    """
    Open the data behind ``url``: from local disk when the URL's host is this
    task's host, otherwise over http.

    The first path segment selects the local root: ``ddfs`` maps to the
    ``DDFS_ROOT`` setting, anything else to ``DISCO_DATA``.
    """
    scheme, netloc, rest = urlsplit(url)
    prefix, fname = rest.split('/', 1)
    if not (netloc[0] == Task.netloc[0]):
        # the data lives on another node; fetch it remotely
        return comm.open_remote('http://%s/%s/%s' % (netloc, prefix, fname))
    root_setting = 'DDFS_ROOT' if prefix == 'ddfs' else 'DISCO_DATA'
    local_path = os.path.join(Task.settings[root_setting], fname)
    return comm.open_local(local_path)
Beispiel #14
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Yield ``(row, ())`` records selected from the marble behind ``url``.

    Each where clause whose ``_name`` matches the marble contributes a row
    selection: the clause's own result for a non-partition ``Expr``, the whole
    table otherwise.  For every selection the columns listed in
    ``key_names[index]`` are read (``None`` columns yield ``None``) and zipped
    into rows, prefixed with the clause index when ``gen_where_index`` is set.
    The marble is always closed.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        raise e

    marble_path = util.localize(rest, disco_data=params._task.disco_data,
                                ddfs_data=params._task.ddfs_data)

    marble = None
    try:
        marble = MarbleStream(marble_path)
        selections = {}

        for position, clause in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if clause._name == marble.marble._name:
                if type(clause) is Expr and not clause.is_partition:
                    matched = clause(marble)
                    selections[position] = (matched, len(matched))
                else:
                    # the table itself or a partition expression: take every row
                    selections[position] = (marble.iter_all(), marble.number_rows)

        for position, (rows, count) in selections.iteritems():
            columns = []
            if gen_where_index:
                columns.append(repeat(position, count))
            for col in key_names[position]:
                if col is not None:
                    columns.append(marble.mget(col, rows))
                else:
                    columns.append(repeat(None, count))

            for row in izip(*columns):
                yield row, ()

    finally:
        if marble:
            marble.close()
Beispiel #15
0
 def _push(self, source_target, replicas=None, exclude=[], **kwargs):
     """Upload ``source`` as a new blob named ``target`` and return the decoded replies.

     ``source_target`` is a ``(source, target)`` pair.  Upload URLs are
     requested from the master, honouring ``replicas`` and the ``exclude``
     host list.  When an upload fails with ``CommError`` the failing host is
     added to the exclusions and the push is retried.

     ``exclude`` is never modified in place, so the shared default is safe.
     """
     source, target = source_target
     qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                         ('replicas', replicas)) if v])
     urls = self._download('{0}/ddfs/new_blob/{1}?{2}'
                           .format(self.master, target, qs))
     try:
         return [json.loads(bytes_to_str(url))
                 for url in self._upload(urls, source, to_master=False, **kwargs)]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         # The failed attempt may already have consumed part of ``source``;
         # rewind it so the retry uploads the blob from the start.
         if hasattr(source, 'seek'):
             source.seek(0)
         return self._push((source, target),
                           replicas=replicas,
                           exclude=exclude + [host],
                           **kwargs)
Beispiel #16
0
def input_stream(fd, size, url, params):
    """Load a local discodb for ``url`` and return ``(stream, size, url)``.

    Text after ``!`` in the path selects what is returned: ``query`` runs
    ``params.discodb_query`` when present, else the query parsed from the
    URL; any other name is called as a no-argument method on the discodb.
    Without a ``!`` part the discodb itself is returned.

    Raises ``core.DiscoError`` when the URL's host is not this task's host.
    """
    scheme, netloc, rest = util.urlsplit(url)

    if netloc[0] == Task.netloc[0]:
        path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
        Task.discodb = DiscoDB.load(open(os.path.join(Task.root, path)))

        if rest:
            # A bare method such as ``db!keys`` has no '/' part; splitting
            # unconditionally raised ValueError on it.
            method, arg = rest.split('/', 1) if '/' in rest else (rest, None)
            if method == 'query':
                if hasattr(params, 'discodb_query'):
                    return Task.discodb.query(params.discodb_query), size, url
                return Task.discodb.query(Q.urlscan(arg)), size, url
            return getattr(Task.discodb, method)(), size, url
        return Task.discodb, size, url
    raise core.DiscoError("Scheme 'discodb' can only be used with force_local=True")
Beispiel #17
0
def process_stat(interface, state, label, inp, task):
    """Pin the input to its local replica and copy every record to output 0.

    Raises ``Exception`` when no replica of ``inp.input`` is hosted on
    ``task.host``.
    """
    from disco import util

    # inp contains a set of replicas, let's force local #HACK
    for _, replica_url in inp.input.replicas:
        _, (replica_host, _), _ = util.urlsplit(replica_url)
        if replica_host == task.host:
            inp.input = replica_url
            break
    else:
        # the loop ran out without finding a replica on this host
        raise Exception("Input %s not processed, no LOCAL resource found."
                        % str(inp.input))

    sink = interface.output
    for key, value in inp:
        sink(0).add(key, value)
Beispiel #18
0
 def _push(self, source_target, replicas=None, forceon=[], exclude=[], **kwargs):
     """Upload ``source`` as a new blob named ``target`` and return the decoded replies.

     Upload URLs are requested from the master with the ``exclude`` and
     ``forceon`` host lists and the ``replicas`` count.  On ``CommError`` the
     source is rewound when it is seekable, the failing host is excluded and
     the push is retried.
     """
     source, target = source_target
     candidates = (
         ("exclude", ",".join(exclude)),
         ("include", ",".join(forceon)),
         ("replicas", replicas),
     )
     # only parameters with a truthy value make it into the query string
     qs = urlencode([pair for pair in candidates if pair[1]])
     blob_url = "{0}/ddfs/new_blob/{1}?{2}".format(self.master, target, qs)
     urls = self._download(blob_url)
     try:
         uploaded = self._upload(urls, source, to_master=False, **kwargs)
         return [json.loads(bytes_to_str(url)) for url in uploaded]
     except CommError as e:
         _, (failed_host, _), _ = urlsplit(e.url)
         if hasattr(source, "seek"):
             source.seek(0)  # source will be read again; seek to the beginning
         else:
             print("{0} is not seekable, retrying".format(source))
         return self._push(
             (source, target),
             replicas=replicas,
             forceon=forceon,
             exclude=exclude + [failed_host],
             **kwargs
         )
Beispiel #19
0
def read(interface, state, label, inp):
    """Read the CSV table named by each input URL and hash-partition its rows.

    The URL's netloc has the form ``<file>?<column>``.  The first CSV row
    supplies the table name (its first cell).  Every later row is written
    under the key ``<table>?<column>`` to the output whose label is the MD5
    of the row's join-column value modulo 160.
    """
    from disco import util

    for e in inp:
        scheme, netloc, _ = util.urlsplit(e)
        file_name, join_column = str(netloc).split("?")
        # ``with`` closes the table file; it used to stay open for every input
        with open(PREFIX + file_name, "r") as table_file:
            col = int(join_column)
            reader = csv.reader(table_file)
            header = next(reader, None)
            if header is None:
                # empty file: nothing to emit
                continue
            full_name = header[0] + "?" + str(col)
            for row in reader:
                digest = hashlib.md5(str_to_bytes(row[col])).hexdigest()
                bucket = int(digest, 16) % 160
                interface.output(bucket).add(full_name, row)
Beispiel #20
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index, key_names):
    """Yield ``(record, ())`` for every row selected from the marble behind ``url``.

    Each where clause whose ``_name`` matches the marble contributes a row-id
    selection: the clause's own result for a non-partition ``Expr``, the whole
    table otherwise.  A record holds one value per entry of
    ``key_names[index]`` (falsy column names give ``None``), prefixed with the
    clause index when ``gen_where_index`` is set.  The marble is always closed.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        print("Error handling hustle_input_stream for %s. %s" % (url, e))
        raise e

    marble_path = util.localize(rest, disco_data=params._task.disco_data,
                                ddfs_data=params._task.ddfs_data)

    marble = None
    try:
        marble = MarbleStream(marble_path)
        selections = {}
        for position, clause in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if clause._name == marble.marble._name:
                if type(clause) is Expr and not clause.is_partition:
                    selections[position] = clause(marble)
                else:
                    # the table itself or a partition expression: take every row
                    selections[position] = marble.iter_all()

        for position, row_ids in selections.iteritems():
            lead = [position] if gen_where_index else []
            for row_id in row_ids:
                values = list(lead)
                for col in key_names[position]:
                    values.append(marble.get(col, row_id) if col else None)
                yield tuple(values), ()
    finally:
        if marble:
            marble.close()
Beispiel #21
0
def process_restrict(interface, state, label, inp, task, label_fn):
    """Pin the input to its local replica and route each record by ``label_fn``.

    Raises ``Exception`` when no replica of ``inp.input`` is hosted on
    ``task.host``.
    """
    from disco import util

    # inp contains a set of replicas, let's force local #HACK
    for _, replica_url in inp.input.replicas:
        _, (replica_host, _), _ = util.urlsplit(replica_url)
        if replica_host == task.host:
            inp.input = replica_url
            break
    else:
        # the loop ran out without finding a replica on this host
        raise Exception("Input %s not processed, no LOCAL resource found." %
                        str(inp.input))

    for key, value in inp:
        interface.output(label_fn(key)).add(key, value)
Beispiel #22
0
 def test_urlsplit(self):
     """Check urlsplit on http, disco:// (remote and local) and tag:// URLs.

     Local disco:// URLs are expected to come back as ``file`` URLs rooted in
     the configured ``DDFS_ROOT`` / ``DISCO_DATA`` directories.
     """
     port = self.disco_settings['DISCO_PORT']
     ddfs = self.disco_settings['DDFS_ROOT']
     data = self.disco_settings['DISCO_DATA']
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(urlsplit('http://host/path'),
                      ('http', ('host', ''), 'path'))
     self.assertEqual(urlsplit('http://host:port/path'),
                      ('http', ('host', 'port'), 'path'))
     self.assertEqual(urlsplit('disco://master/long/path'),
                      ('http', ('master', '%s' % port), 'long/path'))
     self.assertEqual(urlsplit('disco://localhost/ddfs/path',
                               localhost='localhost'),
                      ('file', ('localhost', ''), os.path.join(ddfs, 'path')))
     self.assertEqual(urlsplit('disco://localhost/data/path',
                               localhost='localhost'),
                      ('file', ('localhost', ''), os.path.join(data, 'path')))
     self.assertEqual(urlsplit('tag://tag', ''),
                      ('tag', ('', ''), 'tag'))
     self.assertEqual(urlsplit('tag://host/tag', ''),
                      ('tag', ('host', ''), 'tag'))
     self.assertEqual(urlsplit('tag://host:port/tag', ''),
                      ('tag', ('host', 'port'), 'tag'))
Beispiel #23
0
def input_stream(fd, size, url, params):
    """Load the discodb behind ``url`` and return ``(stream, size, url)``.

    The discodb is read from local disk when the URL's host is this task's
    host, otherwise downloaded.  Text after ``!`` in the path names a discodb
    method; ``query`` and ``metaquery`` receive the query parsed from the
    remaining URL text.  Without a ``!`` part the discodb itself is returned.
    """
    import os
    from disco import util
    from disco.comm import download
    from discodb import DiscoDB, Q
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')

    if netloc[0] == Task.netloc[0]:
        discodb = DiscoDB.load(open(os.path.join(Task.root, path)))
    else:
        discodb = DiscoDB.loads(download('disco://%s/%s' % (netloc, path)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg)), size, url
        # ``arg`` is None for a bare method such as ``db!keys``; handing it to
        # filter() raised TypeError, so call the method without arguments.
        # NOTE(review): a non-empty ``arg`` is a string and is still unpacked
        # character by character, as before; confirm that is intended.
        args = filter(None, arg) if arg else ()
        return method(*args), size, url
    return discodb, size, url
Beispiel #24
0
def Open(url, task=None):
    """Load the discodb behind ``url`` and optionally call a method on it.

    Data directories come from ``task`` when given, otherwise from
    ``DiscoSettings``.  Text after ``!`` in the path names a discodb method;
    ``query`` and ``metaquery`` receive the query parsed from the remaining
    URL text.  Without a ``!`` part the discodb itself is returned.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    discodb = DiscoDB.load(
        open(util.localize(path, disco_data=disco_data, ddfs_data=ddfs_data)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg))
        # ``arg`` is None for a bare method such as ``db!keys``; handing it to
        # filter() raised TypeError, so call the method without arguments.
        # NOTE(review): a non-empty ``arg`` is a string and is still unpacked
        # character by character, as before; confirm that is intended.
        args = filter(None, arg) if arg else ()
        return method(*args)
    return discodb
Beispiel #25
0
def filename_input_stream(fd, size, url, params):
    """Yield ``(url, local file path)`` for this map job's input.

    For a ``file`` scheme the path is ``/`` plus the URL's path part; for any
    other scheme it is ``util.localize`` applied with the worker task's data
    directories.

    Raises ``util.DataError`` when the URL cannot be split.
    """
    from disco import util
    from disco.worker.classic import worker

    try:
        scheme, netloc, rest = util.urlsplit(url)
        netloc = "%s:%s" % netloc if netloc[1] else netloc[0]
    except Exception as e:
        # The message used to name hustle_input_stream (a copy-paste slip),
        # which pointed debugging at the wrong function.
        msg = "Error handling filename_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    if scheme == 'file':
        yield url, "/%s" % rest
    else:
        fle = util.localize(rest,
                            disco_data=worker.Task.disco_data,
                            ddfs_data=worker.Task.ddfs_data)

        yield url, fle
Beispiel #26
0
def filename_input_stream(fd, size, url, params):
    """Yield ``(url, local file path)`` for this map job's input.

    For a ``file`` scheme the path is ``/`` plus the URL's path part; for any
    other scheme it is ``util.localize`` applied with the worker task's data
    directories.

    Raises ``util.DataError`` when the URL cannot be split.
    """
    from disco import util
    from disco.worker.classic import worker

    try:
        scheme, netloc, rest = util.urlsplit(url)
        netloc = "%s:%s" % netloc if netloc[1] else netloc[0]
    except Exception as e:
        # The message used to name hustle_input_stream (a copy-paste slip),
        # which pointed debugging at the wrong function.
        msg = "Error handling filename_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    if scheme == 'file':
        yield url, "/%s" % rest
    else:
        fle = util.localize(rest,
                            disco_data=worker.Task.disco_data,
                            ddfs_data=worker.Task.ddfs_data)

        yield url, fle
Beispiel #27
0
def request(method, url, data=None, headers={}, sleep=0):
    """Issue one HTTP request for ``url`` and record its status and reason.

    The URL is passed through ``urlresolve`` and ``urlsplit``; the request is
    sent to ``/<path>`` on the resulting netloc.  A non-``None`` body is first
    converted to ``bytearray``.  On ``httplib.HTTPException`` the status is
    set to ``None`` and the error text is kept in ``errmsg``.

    NOTE(review): nothing is returned and ``sleep`` is unused in the lines
    shown; the listing may be cut short.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))

    # This fixes a problem with Unicode errors in Python 2.7
    # works in Python 2.6 as well, but not earlier versions
    try:
        if data is not None:
            data = bytearray(data)
    except NameError:
        # In Python < 2.6, bytearray doesn't exist
        pass

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        # no status is available when the exchange itself failed
        status = None
        errmsg = str(e) or repr(e)
Beispiel #28
0
def Open(url, task=None):
    """Load the discodb behind ``url`` and optionally call a method on it.

    Data directories come from ``task`` when given, otherwise from
    ``DiscoSettings``.  Text after ``!`` in the path names a discodb method;
    ``query`` and ``metaquery`` receive the query parsed from the remaining
    URL text.  Without a ``!`` part the discodb itself is returned.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    discodb = DiscoDB.load(open(util.localize(path, disco_data=disco_data,
                                ddfs_data=ddfs_data)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg))
        # ``arg`` is None for a bare method such as ``db!keys``; handing it to
        # filter() raised TypeError, so call the method without arguments.
        # NOTE(review): a non-empty ``arg`` is a string and is still unpacked
        # character by character, as before; confirm that is intended.
        args = filter(None, arg) if arg else ()
        return method(*args)
    return discodb
Beispiel #29
0
 def test_urlsplit(self):
     """Check urlsplit on http, disco:// (remote and local) and tag:// URLs.

     Local disco:// URLs are expected to come back as ``file`` URLs with an
     empty netloc, rooted in the data directory passed to urlsplit.
     """
     port = self.settings['DISCO_PORT']
     ddfs = self.settings['DDFS_DATA']
     data = self.settings['DISCO_DATA']
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(urlsplit('http://host/path'),
                      ('http', ('host', ''), 'path'))
     self.assertEqual(urlsplit('http://host:port/path'),
                      ('http', ('host', 'port'), 'path'))
     self.assertEqual(urlsplit('disco://master/long/path'),
                      ('http', ('master', '{0}'.format(port)), 'long/path'))
     self.assertEqual(urlsplit('disco://localhost/ddfs/path',
                               localhost='localhost',
                               ddfs_data=ddfs),
                      ('file', ('', ''), os.path.join(ddfs, 'path')))
     self.assertEqual(urlsplit('disco://localhost/data/path',
                               localhost='localhost',
                               disco_data=data),
                      ('file', ('', ''), os.path.join(data, 'path')))
     self.assertEqual(urlsplit('tag://tag', ''),
                      ('tag', ('', ''), 'tag'))
     self.assertEqual(urlsplit('tag://host/tag', ''),
                      ('tag', ('host', ''), 'tag'))
     self.assertEqual(urlsplit('tag://host:port/tag', ''),
                      ('tag', ('host', 'port'), 'tag'))
Beispiel #30
0
def process_restrict(interface,
                     state,
                     label,
                     inp,
                     task,
                     label_fn,
                     ffuncs,
                     ghfuncs,
                     deffuncs,
                     agg_fn,
                     wide=False,
                     need_agg=False):
    """Pin the input to its local replica and forward or aggregate its records.

    The replica of ``inp.input`` whose host equals ``task.host`` becomes the
    input; ``util.DataError`` is raised when no replica matches.  With
    ``need_agg`` set and ``wide`` unset, the ``(label, key)`` pairs produced
    by ``agg_fn`` are written with an empty value; otherwise every
    ``(key, value)`` record is written to the output chosen by ``label_fn``.
    """
    from disco import util

    # inp contains a set of replicas, let's force local #HACK
    for _, replica_url in inp.input.replicas:
        _, (replica_host, _), _ = util.urlsplit(replica_url)
        if replica_host == task.host:
            inp.input = replica_url
            break
    else:
        # the loop ran out without finding a replica on this host
        raise util.DataError(
            "Input %s not processed, no LOCAL resource found." %
            str(inp.input), '')

    if not need_agg or wide:
        # plain pass-through, routed by label_fn
        for key, value in inp:
            interface.output(label_fn(key)).add(key, value)
        return

    # opportunistically aggregate in this stage
    for out_label, key in agg_fn(inp, label_fn, ffuncs, ghfuncs, deffuncs):
        interface.output(out_label).add(key, ())
Beispiel #31
0
def download(url,
             data=None,
             redir=False,
             offset=0,
             method=None,
             sleep=0,
             header=None):
    """Fetch ``url`` over HTTP and return ``(status, body)``.

    ``data`` is sent as the request body; the method defaults to POST when a
    body is given and GET otherwise.  ``offset`` adds a ``Range`` header:
    a tuple means ``first-last``, a number means ``first-``.  A 302 response
    is followed.  When ``header`` is a dict it is updated with the response
    headers.  ``redir`` and ``sleep`` are only passed along on redirects.

    Raises ``CommError`` on ``httplib`` or socket errors.
    """
    header = header if header is not None else {}

    from disco.util import urlsplit
    try:
        scheme, netloc, path = urlsplit(url)
        http = httplib.HTTPConnection(str(netloc))
        try:
            h = {}
            if offset:
                # isinstance also accepts tuple subclasses
                if isinstance(offset, tuple):
                    offs = 'bytes=%d-%d' % offset
                else:
                    offs = 'bytes=%d-' % offset
                h = {'Range': offs}
            if not method:
                method = 'POST' if data is not None else 'GET'
            http.request(method, '/%s' % path, data, headers=h)
            fd = http.getresponse()
            if fd.status != 302:
                header.update(fd.getheaders())
                return fd.status, fd.read()
            loc = fd.getheader('location')
        finally:
            # the connection used to be left open; the body (if any) has been
            # read by the time this runs
            http.close()

        # 302: work out the redirect target and start over
        if loc.startswith('http://'):
            url = loc
        elif loc.startswith('/'):
            url = 'http://%s%s' % (netloc, loc)
        else:
            url = '%s/%s' % (url, loc)
        return download(url, data, redir, offset, method, sleep, header)
    except (httplib.HTTPException, httplib.socket.error) as e:
        raise CommError("Transfer %s failed: %s" % (url, e), url)
Beispiel #32
0
 def _push(self,
           source_target,
           replicas=None,
           forceon=[],
           exclude=[],
           **kwargs):
     """Upload ``source`` as a new blob named ``target`` and return the decoded replies.

     ``source_target`` is a ``(source, target)`` pair.  Upload URLs are
     requested from the master with the ``exclude`` and ``forceon`` host lists
     and the ``replicas`` count.  When an upload fails with ``CommError`` the
     failing host is added to the exclusions and the push is retried.

     Neither list argument is modified in place, so the shared defaults are
     safe.
     """
     source, target = source_target
     qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                         ('include', ','.join(forceon)),
                                         ('replicas', replicas)) if v])
     urls = self._download('{0}/ddfs/new_blob/{1}?{2}'.format(
         self.master, target, qs))
     try:
         return [
             json.loads(bytes_to_str(url)) for url in self._upload(
                 urls, source, to_master=False, **kwargs)
         ]
     except CommError as e:
         scheme, (host, port), path = urlsplit(e.url)
         # The failed attempt may already have consumed part of ``source``;
         # rewind it so the retry uploads the blob from the start.
         if hasattr(source, 'seek'):
             source.seek(0)
         return self._push((source, target),
                           replicas=replicas,
                           forceon=forceon,
                           exclude=exclude + [host],
                           **kwargs)
Beispiel #33
0
def request(method, url, data=None, headers={}, sleep=0):
    """Send an HTTP request, retrying on failure and following redirects.

    ``url`` is resolved with ``urlresolve`` and split with ``urlsplit``; the
    request goes to ``/<path>`` on the resulting netloc.

    * No status, or a status ``isunavailable`` accepts: sleep a random
      ``1..2**sleep`` seconds and retry with ``sleep + 1``; once ``sleep``
      reaches 9 a ``CommError`` is raised instead.
    * A redirection: the request is repeated against the ``Location`` target.
    * Any other non-successful status: ``CommError`` carrying the body.

    Returns the response object on success.  ``headers`` is only passed
    through, never modified, so the shared default dict is safe here.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)

    if not status or isunavailable(status):
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method,
                       url,
                       data=data,
                       headers=headers,
                       sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        if not loc:
            # A redirect without a Location header cannot be followed; this
            # used to crash with AttributeError on ``None.startswith``.
            raise CommError("Redirect response has no Location header",
                            url, status)
        return request(
            method,
            loc if loc.startswith('http:') else resolveuri(url, loc),
            data=data,
            headers=headers,
            sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
Beispiel #34
0
def relativizetag(tag, parent):
    """Return ``tag`` as a tag URL, borrowing ``parent``'s netloc when it has none."""
    _, own_loc, name = urlsplit(canonizetag(tag))
    _, parent_loc, _ = urlsplit(canonizetag(parent))
    location = own_loc if own_loc else parent_loc
    return urljoin(('tag', location, name))
Beispiel #35
0
def tagname(tag):
    """Return the name part of ``tag`` after canonising and splitting it."""
    canonical = canonizetag(tag)
    _, _, name = urlsplit(canonical)
    return name
Beispiel #36
0
 def answers(self):
     """Yield ``(netloc as string, '')`` for every URL in ``self.input``."""
     for source_url in self.input:
         _, location, _ = urlsplit(source_url)
         yield str(location), ''
Beispiel #37
0
def resolveuri(baseuri, uri):
    """Resolve ``uri`` against ``baseuri``.

    A ``uri`` starting with ``/`` replaces the path of ``baseuri``; anything
    else is appended to ``baseuri`` after a ``/``.
    """
    if not uri.startswith('/'):
        return '%s/%s' % (baseuri, uri)
    scheme, netloc, _ = urlsplit(baseuri)
    return '%s://%s%s' % (scheme, netloc, uri)
Beispiel #38
0
 def assertResults(self, job, input):
     """Assert the job's sorted results equal ``(netloc string, '')`` for each input URL."""
     actual = sorted(self.results(job))
     expected = sorted((str(urlsplit(i)[1]), '') for i in input)
     self.assertAllEqual(actual, expected)
Beispiel #39
0
def tagname(tag):
    """Return the name part of ``tag`` after canonising and splitting it."""
    canonical = canonizetag(tag)
    _, _, name = urlsplit(canonical)
    return name
Beispiel #40
0
def hustle_input_stream(fd, size, url, params, wheres, gen_where_index,
                        key_names, limit):
    """Yield ``(row, ())`` records selected from the marble behind ``url``.

    Each where clause whose ``_name`` matches the marble contributes a row
    selection: the clause's own result for a non-partition ``Expr``, the whole
    table otherwise.  When ``limit`` differs from ``sys.maxint`` only the
    first ``limit`` row ids of each selection are kept.  For every selection
    the ``(column, column_fn)`` pairs in ``key_names[index]`` are read
    (``None`` columns yield ``None``; a ``column_fn`` is mapped over the
    values) and zipped into rows, prefixed with the clause index when
    ``gen_where_index`` is set.

    Raises ``util.DataError`` when the URL cannot be split.  The marble is
    always closed.
    """
    from disco import util
    from hustle.core.marble import Expr, MarbleStream
    from itertools import izip, repeat, islice, imap
    from sys import maxint
    from pyebset import BitSet

    try:
        _, _, rest = util.urlsplit(url)
    except Exception as e:
        msg = "Error handling hustle_input_stream for %s. %s" % (url, e)
        raise util.DataError(msg, url)

    marble_path = util.localize(rest,
                                disco_data=params._task.disco_data,
                                ddfs_data=params._task.ddfs_data)

    marble = None
    try:
        marble = MarbleStream(marble_path)
        selections = {}
        limited = limit != maxint

        for position, clause in enumerate(wheres):
            # do not process where clauses that have nothing to do with this marble
            if clause._name == marble.marble._name:
                is_filter = type(clause) is Expr and not clause.is_partition
                # a filter clause selects its own rows; the table itself or a
                # partition expression selects every row
                rows = clause(marble) if is_filter else marble.iter_all()
                if limited:
                    capped = BitSet()
                    for row_id in islice(rows, 0, limit):
                        capped.set(row_id)
                    selections[position] = (capped, len(capped))
                elif is_filter:
                    selections[position] = (rows, len(rows))
                else:
                    selections[position] = (rows, marble.number_rows)

        for position, (rows, count) in selections.iteritems():
            columns = [repeat(position, count)] if gen_where_index else []
            for col, column_fn in key_names[position]:
                if col is None:
                    columns.append(repeat(None, count))
                elif column_fn is None:
                    columns.append(marble.mget(col, rows))
                else:
                    columns.append(imap(column_fn, marble.mget(col, rows)))

            for row in izip(*columns):
                yield row, ()
    finally:
        if marble:
            marble.close()
Beispiel #41
0
            return '%s/proxy/%s/%s/%s' % (self.proxy, host, method, path)
        return url

    def _push(self, (source, target), replicas=None, exclude=[], **kwargs):
        qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                            ('replicas', replicas)) if v])
        urls = self._download('%s/ddfs/new_blob/%s?%s' %
                              (self.master, target, qs))

        try:
            return [
                json.loads(url)
                for url in self._upload(urls, source, **kwargs)
            ]
        except CommError, e:
            scheme, (host, port), path = urlsplit(e.url)
            return self._push((source, target),
                              replicas=replicas,
                              exclude=exclude + [host],
                              **kwargs)

    def _tagattr(self, tag, attr):
        """Return the resolved URL of ``tag`` with ``/<attr>`` appended."""
        base = self._resolve(canonizetag(tag))
        return '%s/%s' % (base, attr)

    def _token(self, token, method):
        if token is None:
            if method == 'GET':
                return self.settings['DDFS_READ_TOKEN']
            return self.settings['DDFS_WRITE_TOKEN']
        return token
Beispiel #42
0
 def map_input_stream(stream, size, url, params):
     """Log the URL received, assert ``Task.host <= host`` and open the remote input."""
     _, (node, _), test_server = urlsplit(url)
     # test that scheduler observed the blacklist
     print("NODE {0} GOT URL {1}".format(Task.host, url))
     assert Task.host <= node
     remote = "http://{0}/{1}".format(test_server, node)
     return open_remote(remote)
Beispiel #43
0
 def _maybe_proxy(self, url, method='GET'):
     if self.proxy:
         scheme, (host, port), path = urlsplit(url)
         return '%s/proxy/%s/%s/%s' % (self.proxy, host, method, path)
     return url
Beispiel #44
0
 def map_input_stream(stream, size, url, params):
     """Serve the URL's netloc string as input, asserting it names this host."""
     _, location, _ = urlsplit(url)
     assert location.host == Task.host
     payload = str(location)
     return string_input_stream(payload, size, url, params)
Beispiel #45
0
 def map_input_stream(stream, size, url, params):
     """Log the URL received, assert its host is this task's host and open the remote input."""
     _, (node, _), test_server = urlsplit(url)
     # test that scheduler preserved data locality
     msg("NODE {0} GOT URL {1}".format(Task.host, url))
     assert Task.host == node
     remote = "http://{0}/{1}".format(test_server, node)
     return open_remote(remote)
Beispiel #46
0
 def answers(self):
     """Yield ``(netloc as string, '')`` for every URL in ``self.input``."""
     for source_url in self.input:
         _, location, _ = urlsplit(source_url)
         yield str(location), ''
Beispiel #47
0
 def map_input_stream(stream, size, url, params):
     """Log the URL received, assert ``Task.host <= host`` and open the remote input."""
     _, (node, _), test_server = urlsplit(url)
     # test that scheduler observed the blacklist
     msg("NODE {0} GOT URL {1}".format(Task.host, url))
     assert Task.host <= node
     remote = "http://{0}/{1}".format(test_server, node)
     return open_remote(remote)
Beispiel #48
0
 def open_url(self, url):
     """Open ``url`` locally when it has no scheme or a ``file`` scheme, else remotely."""
     scheme, netloc, rest = util.urlsplit(url, localhost=self.host)
     if scheme and not scheme == 'file':
         return comm.open_remote('%s://%s/%s' % (scheme, netloc, rest))
     return comm.open_local(rest)
Beispiel #49
0
def resolveuri(baseuri, uri):
    """Resolve ``uri`` against ``baseuri``.

    A ``uri`` starting with ``/`` replaces the path of ``baseuri``; anything
    else is appended to ``baseuri`` after a ``/``.
    """
    if not uri.startswith('/'):
        return '{0}/{1}'.format(baseuri, uri)
    scheme, netloc, _ = urlsplit(baseuri)
    return '{0}://{1}{2}'.format(scheme, netloc, uri)
Beispiel #50
0
def relativizetag(tag, parent):
    """Return ``tag`` as a tag URL, borrowing ``parent``'s netloc when it has none."""
    _, own_loc, name = urlsplit(canonizetag(tag))
    _, parent_loc, _ = urlsplit(canonizetag(parent))
    location = own_loc if own_loc else parent_loc
    return urljoin(("tag", location, name))
Beispiel #51
0
 def map_input_stream(stream, size, url, params):
     """Serve the URL's netloc string as input, asserting it is not this host."""
     from disco.func import string_input_stream
     from disco.util import urlsplit
     _, location, _ = urlsplit(url)
     # the input must have been scheduled away from the node named in the URL
     assert location.host != Task.host
     payload = str(location)
     return string_input_stream(payload, size, url, params)