Example #1
0
 def test_urlresolve(self):
     # Exercises urlresolve on a tag URL, once with master=None and once
     # with an explicit master argument.
     url = 'tag://0'
     hostname = gethostname()
     # With master=None the resolved URL is expected to name the local
     # hostname on port 8989.
     self.assertEqual(urlresolve(url, master=None),
                      'http://{}:8989/ddfs/tag/0'.format(hostname))
     # master='disco-master' is expected to be rejected with DiscoError.
     self.assertRaises(DiscoError, urlresolve, url, master='disco-master')
Example #2
0
 def curl(replicas):
     for replica in replicas:
         try:
             return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                       to_master=False))
         except Exception, e:
             sys.stderr.write("%s\n" % e)
Example #3
0
def request(method, url, data=None, headers={}, sleep=0):
    """Send an HTTP request to `url`, retrying and following redirections.

    `url` is resolved with `urlresolve` and split into a network location
    and a path.  When the connection fails, or the status satisfies
    `isunavailable`, the request is retried after a random pause of
    between 1 and 2**sleep seconds, with `sleep` incremented on each
    retry; once `sleep` has reached 9 a `CommError` is raised instead.
    Redirections are followed with the same method, body and headers.

    Returns the response object when the status satisfies `issuccessful`.
    Raises `CommError` when retries are exhausted, when a redirection
    carries no Location header, or when the status is not successful.
    """
    # NOTE: `headers` defaults to a shared dict; it is only passed along
    # here, never modified.
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)

    if not status or isunavailable(status):
        # status is None when no response was obtained at all.
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method, url, data=data, headers=headers, sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        if not loc:
            # Without a target there is nothing to follow; report it as a
            # communication error rather than failing on loc.startswith.
            raise CommError("Redirection without a Location header",
                            url,
                            status)
        return request(method,
                       loc if loc.startswith('http:') else resolveuri(url, loc),
                       data=data,
                       headers=headers,
                       sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
Example #4
0
 def curl(replicas):
     for replica in replicas:
         try:
             return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                       to_master=False))
         except Exception, e:
             sys.stderr.write("%s\n" % e)
Example #5
0
def request(method, url, data=None, headers={}, sleep=0):
    """Send `method` to the resolved `url` and fetch the response.

    HTTP and socket errors raised while connecting, sending or reading
    the response are re-raised as `CommError`.

    NOTE(review): the visible body ends after the error handling and does
    not return `response` or use `sleep` -- confirm whether this is an
    excerpt of a longer function.
    """
    scheme, netloc, path = urlsplit(urlresolve(url))
    host = str(netloc)
    target = '/%s' % path

    try:
        conn = HTTPConnection(host)
        conn.request(method, target, body=data, headers=headers)
        response = conn.getresponse()
    except (httplib.HTTPException, httplib.socket.error), e:
        raise CommError("Request failed: %s" % e, url)
Example #6
0
 def __init__(self, urls, source):
     """Prepare a PUT of `source` to every URL in `urls`.

     Each URL is resolved with `urlresolve` and a connection is prepared
     for it; the (url, connection) pairs are kept in `self.pending` and
     every connection's handle is added to a `pycurl.CurlMulti`.
     """
     from disco.util import urlresolve
     self.multi = pycurl.CurlMulti()
     prepared = []
     for url in urls:
         conn = HTTPConnection('').prepare('PUT', urlresolve(url), body=source)
         prepared.append((url, conn))
     self.pending = prepared
     # Handles are registered only once every connection has been prepared.
     for _, conn in self.pending:
         self.multi.add_handle(conn.handle)
Example #7
0
File: ddfscli.py Project: yuj/disco
 def curl(replicas):
     """Return the download of the first replica that can be fetched.

     Each replica is resolved against `program.ddfs.master`, passed
     through `proxy_url` and downloaded; an exception for one replica is
     written to stderr and the next replica is tried.

     If no replica succeeds, an empty string is returned when
     `ignore_missing` is true; otherwise an `Exception` listing the
     replicas is raised.
     """
     for candidate in replicas:
         try:
             resolved = urlresolve(candidate, master=program.ddfs.master)
             location = proxy_url(resolved, to_master=False)
             return download(location)
         except Exception as err:
             sys.stderr.write("{0}\n".format(err))
     if ignore_missing:
         return ''
     raise Exception("Failed downloading all replicas: {0}".format(replicas))
Example #8
0
 def curl(replicas):
     """Download the first replica that can be fetched.

     Replicas are tried in order.  Each one is resolved against
     `program.ddfs.master`, passed through `proxy_url` and downloaded;
     a failure is reported on stderr before moving on to the next.

     When all replicas fail, returns '' if `ignore_missing` is true and
     raises an `Exception` naming the replicas otherwise.
     """
     for candidate in replicas:
         try:
             resolved = urlresolve(candidate, master=program.ddfs.master)
             location = proxy_url(resolved, to_master=False)
             return download(location)
         except Exception as err:
             sys.stderr.write("{0}\n".format(err))
     if ignore_missing:
         return ''
     raise Exception("Failed downloading all replicas: {0}".format(replicas))
Example #9
0
 def __init__(self, urls, source, token=None):
     """Prepare a PUT of `source` to every URL in `urls`.

     The headers returned by `self.auth_header(token)` are sent with each
     request.  Each URL is resolved with `urlresolve`; the
     (url, connection) pairs are kept in `self.pending` and every
     connection's handle is added to a `pycurl.CurlMulti`.
     """
     from disco.util import urlresolve
     self.multi = pycurl.CurlMulti()
     headers = self.auth_header(token)
     prepared = []
     for url in urls:
         conn = HTTPConnection('').prepare('PUT',
                                           urlresolve(url),
                                           body=source,
                                           headers=headers)
         prepared.append((url, conn))
     self.pending = prepared
     # Handles are registered only once every connection has been prepared.
     for _, conn in self.pending:
         self.multi.add_handle(conn.handle)
Example #10
0
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        status = None
        errmsg = str(e) or repr(e)
Example #11
0
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        status = None
        errmsg = str(e) or repr(e)
Example #12
0
    def get(self, uri):
        """Returns the `Document` with the specified `uri`.

        The dump name, start offset and size come from `self.get_pos(uri)`.
        Only that byte range of the dump is requested over HTTP, and the
        first record parsed from the response is returned.

        Raises `DocumentNotFound` if looking up or resolving the dump's
        blob URI raises `KeyError`.
        """
        name, startpos, size = self.get_pos(uri)
        try:
            blob_uri = self.__dump_name_to_blob_uri(name)
            dump_uri = urlresolve(blob_uri)
        except KeyError:
            raise DocumentNotFound("couldn't find doc with dump name '%s'" % name)

        # HTTP byte ranges are inclusive at both ends.
        last_byte = startpos + size - 1
        req = urllib2.Request(dump_uri)
        req.add_header("Range", "bytes=%d-%d" % (startpos, last_byte))
        return WARCParser(urllib2.urlopen(req)).next()
Example #13
0
def download(url, **kwargs):
    """Download `url` and return the response body, retrying on 503.

    The URL is resolved with `urlresolve` on every attempt and fetched
    with `real_download`, which receives `kwargs`.  After a 503 reply the
    function pauses for 2**sleep seconds, where `sleep` is taken from
    `kwargs` (default 0), increments it and tries again.

    Raises `CommError` once a 503 arrives with `sleep` equal to 9, or
    when the final status code does not start with '2'.
    """
    while True:
        code, body = real_download(urlresolve(url), **kwargs)
        if code != 503:
            break
        attempt = kwargs.get('sleep', 0)
        if attempt == 9:
            raise CommError("Too many 503 replies", url)
        time.sleep(2**attempt)
        # The incremented counter is also passed on to real_download.
        kwargs['sleep'] = attempt + 1
    if not str(code).startswith('2'):
        raise CommError(body, url, code)
    return body
Example #14
0
    def test_add_dump(self):
        # Adding a dump should list it by name and make its contents
        # retrievable over HTTP.
        self.docset.add_dump("d1", dump1)

        # check that it's in list of dumps
        self.assertTrue("d1" in self.docset.dump_names())

        # check accessible over http
        from disco.util import urlresolve
        import urllib2

        uri = list(self.docset.dump_uris())[0]
        httpuri = urlresolve(uri)
        response = urllib2.urlopen(httpuri)
        try:
            d = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        self.assertEqual(d, fixtures.warc_file1)
Example #15
0
 def index(self):
     """Return the docset index, loading it from DDFS on first use.

     If the index tag holds no blobs, the index is an empty dict at
     version 0.  Otherwise the blob whose dump name compares highest is
     downloaded and unpickled, and its name becomes the index version.

     Raises `EOFError`, naming the blob and tag, if unpickling hits
     end-of-file.
     """
     # Lazily load index data from DDFS.
     if self.__index is None:
         blobs = [uri for (uri,) in self.ddfs.blobs(self.ddfs_index_tag)]
         if not blobs:
             self.__index = {}
             self.__index_version = 0
         else:
             # Find blob with highest version number.
             # NOTE(review): the comparison uses whatever
             # __blob_uri_to_dump_name returns; if that is a string the
             # order is lexicographic ('10' < '9') -- confirm the names
             # are zero-padded.
             ver, discouri = max((self.__blob_uri_to_dump_name(uri), uri)
                                 for uri in blobs)
             uri = urlresolve(discouri)
             response = urllib2.urlopen(uri)
             try:
                 data = response.read()
             finally:
                 # Close the connection whether or not the read succeeds.
                 response.close()
             try:
                 # NOTE(review): pickle.loads can execute arbitrary code;
                 # this is only safe if the blob contents are trusted.
                 self.__index = pickle.loads(data)
                 self.__index_version = int(ver)
             except EOFError:
                 raise EOFError("EOF reading docset index at %s in tag %s" % \
                                    (uri, self.ddfs_index_tag))
     return self.__index
Example #16
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    # Fall back to the chain reader when no reader option was given.
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # One list of resolved, proxied replica URLs per replica set; the
    # non-tag urls come first, followed by the blobs of the tags.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # A single parenthesised argument prints the same under the
        # Python 2 print statement and the Python 3 print function.
        print('\t'.join('%s' % (e, ) for e in iterify(record)).rstrip())
Example #17
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    # Use the chain reader unless a reader option was supplied.
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Resolve and proxy every replica of every replica set; non-tag urls
    # are listed before the blobs reachable from the tags.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # Parenthesised single-argument form: valid as both the Python 2
        # print statement and the Python 3 print function.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
Example #18
0
File: comm.py Project: mazdak/disco
def request(method, url, data=None, headers={}, sleep=0):
    scheme, netloc, path = urlsplit(urlresolve(url))

    # This fixes a problem with Unicode errors in Python 2.7
    # works in Python 2.6 as well, but not earlier versions
    try:
        if data is not None:
            data = bytearray(data)
    except NameError:
        # In Python < 2.6, bytearray doesn't exist
        pass

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/%s' % path, body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException, e:
        status = None
        errmsg = str(e) or repr(e)
Example #19
0
def request(method, url, data=None, headers={}, sleep=0):
    """Send an HTTP request to `url`, retrying and following redirections.

    `url` is resolved with `urlresolve` and split into a network location
    and a path.  When the connection fails, or the status satisfies
    `isunavailable`, the request is retried after a random pause of
    between 1 and 2**sleep seconds, with `sleep` incremented on each
    retry; once `sleep` has reached 9 a `CommError` is raised instead.
    Redirections are followed with the same method, body and headers.

    Returns the response object when the status satisfies `issuccessful`.
    Raises `CommError` when retries are exhausted, when a redirection
    carries no Location header, or when the status is not successful.
    """
    # NOTE: `headers` defaults to a shared dict; it is only passed along
    # here, never modified.
    scheme, netloc, path = urlsplit(urlresolve(url))

    try:
        conn = HTTPConnection(str(netloc))
        conn.request(method, '/{0}'.format(path), body=data, headers=headers)
        response = conn.getresponse()
        status = response.status
        errmsg = response.reason
    except httplib.HTTPException as e:
        status = None
        errmsg = str(e) or repr(e)
    except (httplib.socket.error, socket.error) as e:
        status = None
        errmsg = e if isinstance(e, basestring) else str(e) or repr(e)

    if not status or isunavailable(status):
        # status is None when no response was obtained at all.
        if sleep == 9:
            raise CommError(errmsg, url, status)
        time.sleep(random.randint(1, 2**sleep))
        return request(method,
                       url,
                       data=data,
                       headers=headers,
                       sleep=sleep + 1)
    elif isredirection(status):
        loc = response.getheader('location')
        if not loc:
            # Without a target there is nothing to follow; report it as a
            # communication error rather than failing on loc.startswith.
            raise CommError(
                "Redirection without a Location header",
                url,
                status)
        return request(
            method,
            loc if loc.startswith('http:') else resolveuri(url, loc),
            data=data,
            headers=headers,
            sleep=sleep)
    elif not issuccessful(status):
        raise CommError(response.read(), url, status)
    return response
Example #20
0
def open_remote(url, token=None):
    """Open a `Connection` to the resolved `url`.

    Returns a 3-tuple: the connection, its `len()`, and its `url`
    attribute.
    """
    connection = Connection(urlresolve(url), token)
    size = len(connection)
    return connection, size, connection.url
Example #21
0
def open_remote(url, token=None):
    """Return a `Connection` built from the resolved `url` and `token`."""
    resolved = urlresolve(url)
    return Connection(resolved, token)
Example #22
0
def open_remote(url):
    """Open a `Conn` to the resolved `url`.

    Returns a 3-tuple: the connection, the result of its `length()`
    method, and its `url` attribute.
    """
    connection = Conn(urlresolve(url))
    size = connection.length()
    return connection, size, connection.url
Example #23
0
 def indexurl(self, indexspec):
     """Return the URL for `indexspec`.

     A spec that parses with a network location is handed to
     `urlresolve`.  Any other spec is treated as a name under
     '/indices/' on this object's `netloc`, served over http.
     """
     if not urlparse.urlparse(indexspec).netloc:
         path = '/indices/%s' % indexspec
         return urlparse.urlunparse(('http', self.netloc, path, '', '', ''))
     return urlresolve(indexspec)
Example #24
0
 def _resolve(self, url):
     """Resolve `url` with `urlresolve`, passing this object's settings."""
     settings = self.settings
     return urlresolve(url, settings=settings)
Example #25
0
 def indexurl(self, indexspec):
     """Map `indexspec` to a URL.

     If the spec parses with a network location it is resolved with
     `urlresolve`; otherwise it is taken as a name under '/indices/' on
     this object's `netloc`, using the http scheme.
     """
     if not urlparse.urlparse(indexspec).netloc:
         path = '/indices/%s' % indexspec
         return urlparse.urlunparse(('http', self.netloc, path, '', '', ''))
     return urlresolve(indexspec)
Example #26
0
 def _resolve(self, url):
     """Return `url` resolved by `urlresolve` using `self.settings`."""
     settings = self.settings
     return urlresolve(url, settings=settings)
Example #27
0
 def _resolve(self, url):
     """Resolve `url` with `urlresolve` against this object's master."""
     master = self.master
     return urlresolve(url, master=master)
Example #28
0
def open_remote(url, token=None):
    """Resolve `url` and return a `Connection` for it, passing `token`."""
    resolved = urlresolve(url)
    return Connection(resolved, token)
Example #29
0
 def indexurl(self, indexspec):
     """Return the URL that `indexspec` refers to.

     Specs that parse with a network location go through `urlresolve`.
     Everything else becomes an http URL on this object's `netloc` with
     the spec appended to "/indices/".
     """
     if not urlparse.urlparse(indexspec).netloc:
         path = "/indices/%s" % indexspec
         return urlparse.urlunparse(("http", self.netloc, path, "", "", ""))
     return urlresolve(indexspec)
Example #30
0
 def _resolve(self, url):
     """Return `url` resolved by `urlresolve` using `self.master`."""
     master = self.master
     return urlresolve(url, master=master)
Example #31
0
def open_remote(url):
    """Open a `Connection` to the resolved `url`.

    Returns a 3-tuple: the connection, its `len()`, and its `url`
    attribute.
    """
    connection = Connection(urlresolve(url))
    size = len(connection)
    return connection, size, connection.url