Example #1
0
def read_index(dir):
    """Yield the whitespace-separated fields of every line in the index at *dir*.

    The index is fetched through the proxy; a ``.gz`` suffix on *dir* means
    the stream is gzip-compressed and is unwrapped transparently.
    """
    from disco.comm import open_url
    stream = open_url(proxy_url(dir))
    if dir.endswith(".gz"):
        stream = gzip.GzipFile(fileobj=stream)
    for entry in stream:
        yield entry.split()
Example #2
0
def read_index(dir):
    """Generate the split fields of each index line at *dir*.

    Fetches the index via the proxy (not through the master); gzip-compressed
    indexes (``.gz`` suffix) are decompressed on the fly, and each raw line is
    decoded to text before splitting.
    """
    from disco.comm import open_url
    index = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        index = gzip.GzipFile(fileobj=index)
    for raw_line in index:
        yield bytes_to_str(raw_line).split()
Example #3
0
def read_index(dir):
    """Iterate over the index at *dir*, yielding each line split on whitespace.

    The url is proxied (``to_master=False``); a ``.gz``-suffixed index is
    read through ``gzip.GzipFile``. Lines are converted from bytes to str
    before splitting.
    """
    from disco.comm import open_url
    source = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        source = gzip.GzipFile(fileobj=source)
    for chunk in source:
        fields = bytes_to_str(chunk).split()
        yield fields
Example #4
0
def read_index(dir):
    """Yield the split fields of every line of the (possibly gzipped) index at *dir*."""
    from disco.comm import open_url
    handle = open_url(proxy_url(dir))
    # A ".gz" suffix marks a compressed index; decompress while streaming.
    if dir.endswith(".gz"):
        handle = gzip.GzipFile(fileobj=handle)
    for record in handle:
        yield record.split()
Example #5
0
def open(url, task=None):
    """Resolve *url* and open it via ``comm.open_url``.

    When *task* is given, its host/port/data directories are used to resolve
    task-relative urls; otherwise the url is split with no local host.
    """
    split_kwargs = {'localhost': None}
    if task:
        split_kwargs = {'localhost': task.host,
                        'disco_port': task.disco_port,
                        'disco_data': task.disco_data,
                        'ddfs_data': task.ddfs_data}
    scheme, host, path = util.urlsplit(url, **split_kwargs)
    return comm.open_url(util.urljoin((scheme, host, path)))
Example #6
0
def read_index(dir):
    """Yield ``(label, url, size)`` triples from the index at *dir*.

    *dir* may be an iterable of replica urls, in which case only the first
    replica is used. ``label`` and ``size`` are converted to int.
    """
    # We might be given replicas of dirs; choose the first.
    if isiterable(dir):
        dir = dir[0]
    from disco.comm import open_url
    stream = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        stream = gzip.GzipFile(fileobj=stream)
    for raw in stream:
        label, url, size = bytes_to_str(raw).split()
        yield int(label), url, int(size)
Example #7
0
def read_index(dir):
    """Read an index and yield one ``(int(label), url, int(size))`` per line.

    Accepts either a single dir url or an iterable of replicas (first one
    wins); handles gzip-compressed indexes transparently.
    """
    # We might be given replicas of dirs; choose the first.
    if isiterable(dir): dir = dir[0]
    from disco.comm import open_url
    index_file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        index_file = gzip.GzipFile(fileobj=index_file)
    for entry in index_file:
        lbl, loc, sz = bytes_to_str(entry).split()
        yield int(lbl), loc, int(sz)
Example #8
0
def open(url, task=None):
    """Split *url* (task-aware when *task* is provided), rejoin it, and open it."""
    if not task:
        parts = util.urlsplit(url, localhost=None)
    else:
        parts = util.urlsplit(url,
                              localhost=task.host,
                              disco_port=task.disco_port,
                              disco_data=task.disco_data,
                              ddfs_data=task.ddfs_data)
    scheme, netloc, path = parts
    resolved = util.urljoin((scheme, netloc, path))
    return comm.open_url(resolved)
Example #9
0
def input_stream(fd, size, url, params):
    """Opens the url locally on the node.

    Resolves an ``s3://bucket/key`` *url* with the AWS credentials found in
    *params*, and returns ``(stream, size, url)`` for the key. A missing or
    empty key degrades to an empty stream instead of raising.
    """
    assert url.startswith('s3://')
    bucketname, keyname = url[5:].split('/', 1)
    access_key = params.get('aws_access_key_id')
    secret_key = params.get('aws_secret_access_key')
    s3 = connect_s3(access_key, secret_key)
    bucket = s3.get_bucket(bucketname, validate=False)
    key = bucket.get_key(keyname)
    # Bug fix: get_key() returns None when the key does not exist, so
    # touching key.size unguarded raised AttributeError. Treat a missing
    # key the same as an empty one.
    if key is not None and key.size:
        url = key.generate_url(24 * 3600)  # pre-signed URL valid for 24 hours
        return open_url(url), key.size, url
    else:
        return StringIO(), 0, url
Example #10
0
 def concat_input(cls, task, output_label, replicas):
     """Concatenate the inputs (first replica of each) into a single
     AtomicFile and return its ``(path, size)``."""
     CHUNK = 1024 * 1024
     out = AtomicFile(task.output_path(output_label))
     for reps in replicas:
         # Use only the first replica for now, since a set of one
         # is the most common case.
         # TODO: handle falling back to alternative replicas.
         src = open_url(reps[0])
         while True:
             data = src.read(CHUNK)
             if not data:
                 break
             out.write(data)
         src.close()
     out.close()
     return out.path, out.size()
Example #11
0
 def concat_input(cls, task, output_label, replicas):
     """Stream every input into one output file, 1 MiB at a time.

     Returns ``(path, size)`` of the finished AtomicFile.
     """
     sink = AtomicFile(task.output_path(output_label))
     block_size = 1024 * 1024
     for reps in replicas:
         # Use only the first replica for now, since a set of one
         # is the most common case.
         # TODO: handle falling back to alternative replicas.
         source = open_url(reps[0])
         block = source.read(block_size)
         while block:
             sink.write(block)
             block = source.read(block_size)
         source.close()
     sink.close()
     return sink.path, sink.size()
def input_stream(fd, size, url, params):
    """Opens the url locally on the node."""
    from disco.comm import open_url as fetch
    return fetch(url)
Example #13
0
def open(url, task=None):
    """Rewrite an hdfs url to its WebHDFS REST ``OPEN`` endpoint and fetch it.

    *task* is accepted for interface compatibility but unused.
    """
    _scheme, address = schemesplit(url)
    namenode_port, rest = schemesplit(address)
    endpoint = 'http://{0}/webhdfs/v1/{1}?op=OPEN'.format(namenode_port, rest)
    return comm.open_url(endpoint)
Example #14
0
def open(url, task=None):
    """Open *url* with ``comm.open_url``; *task* is accepted but ignored."""
    stream = comm.open_url(url)
    return stream
def open(url, task=None):
    """Open an hdfs url through the namenode's WebHDFS ``OPEN`` operation.

    The hdfs scheme and the embedded namenode address are peeled off with
    two successive ``schemesplit`` calls; *task* is unused.
    """
    _hdfs_scheme, remainder = schemesplit(url)
    host_port, path = schemesplit(remainder)
    return comm.open_url('http://' + host_port + '/webhdfs/v1/' + path + '?op=OPEN')
Example #16
0
def open(url, task=None):
    """Delegate straight to ``comm.open_url``; the *task* argument is unused."""
    return comm.open_url(url)