def read_index(dir):
    from disco.comm import open_url
    file = open_url(proxy_url(dir))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        yield line.split()
def read_index(dir):
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        yield bytes_to_str(line).split()
def open(url, task=None):
    if task:
        # Resolve the URL relative to the node the task is running on.
        scheme, netloc, path = util.urlsplit(url,
                                             localhost=task.host,
                                             disco_port=task.disco_port,
                                             disco_data=task.disco_data,
                                             ddfs_data=task.ddfs_data)
    else:
        scheme, netloc, path = util.urlsplit(url, localhost=None)
    return comm.open_url(util.urljoin((scheme, netloc, path)))
def read_index(dir):
    # We might be given replicas of dirs; choose the first.
    if isiterable(dir):
        dir = dir[0]
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        label, url, size = bytes_to_str(line).split()
        yield int(label), url, int(size)
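A minimal usage sketch for the labelled read_index variant above, assuming it can be called directly; the index URL and the selected label are hypothetical placeholders:

# Hypothetical index URL; keep only the inputs tagged with label 0.
urls_for_label_0 = [url
                    for label, url, size in read_index('http://master:8989/job/partition-index.txt.gz')
                    if label == 0]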
def input_stream(fd, size, url, params):
    """Opens the url locally on the node."""
    assert url.startswith('s3://')
    bucketname, keyname = url[5:].split('/', 1)
    access_key = params.get('aws_access_key_id')
    secret_key = params.get('aws_secret_access_key')
    s3 = connect_s3(access_key, secret_key)
    bucket = s3.get_bucket(bucketname, validate=False)
    key = bucket.get_key(keyname)
    if key.size:
        # Presign the key (valid for 24 hours) so it can be streamed over plain HTTP.
        url = key.generate_url(24 * 3600)
        return open_url(url), key.size, url
    else:
        # Empty object: return an empty stream.
        return StringIO(), 0, url
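A minimal usage sketch for the S3 reader above, assuming params behaves like a dict and the call is made outside Disco's input-stream chain; the bucket, key, and credentials are placeholders:

# Placeholder credentials and object name, for illustration only.
params = {'aws_access_key_id': 'EXAMPLE_KEY_ID',
          'aws_secret_access_key': 'EXAMPLE_SECRET'}
stream, size, resolved_url = input_stream(None, None, 's3://example-bucket/logs/part-0.gz', params)
first_chunk = stream.read(4096)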
def concat_input(cls, task, output_label, replicas):
    output = AtomicFile(task.output_path(output_label))
    BUFFER_SIZE = 1024 * 1024
    for reps in replicas:
        # Use only the first replica for now, since a set of one
        # is the most common case.
        # TODO: handle falling back to alternative replicas.
        inp = open_url(reps[0])
        buf = inp.read(BUFFER_SIZE)
        while len(buf) > 0:
            output.write(buf)
            buf = inp.read(BUFFER_SIZE)
        inp.close()
    output.close()
    return output.path, output.size()
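The TODO above leaves replica fallback unhandled; below is a hedged sketch of one way it could look. The helper name open_first_available is hypothetical, and it only assumes that open_url raises an exception when a replica cannot be fetched:

def open_first_available(reps):
    # Try each replica in turn instead of always using reps[0].
    last_error = ValueError("no replicas given")
    for rep in reps:
        try:
            return open_url(rep)
        except Exception as err:
            last_error = err
    raise last_error

concat_input could then call open_first_available(reps) in place of open_url(reps[0]).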
def input_stream(fd, size, url, params):
    """Opens the url locally on the node."""
    from disco.comm import open_url
    return open_url(url)
def open(url, task=None):
    # Rewrite the hdfs:// URL into a WebHDFS REST OPEN request and fetch it over HTTP.
    _hdfs_scheme, address = schemesplit(url)
    namenode_port, rest = schemesplit(address)
    http_url = 'http://' + namenode_port + '/webhdfs/v1/' + rest + '?op=OPEN'
    return comm.open_url(http_url)
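A hedged trace of the rewrite above, assuming schemesplit splits its argument once on '://'; the namenode host, port, and file path are hypothetical:

# url      = 'hdfs://namenode.example.com:50070://user/alice/data.txt'
# address  = 'namenode.example.com:50070://user/alice/data.txt'
# http_url = 'http://namenode.example.com:50070/webhdfs/v1/user/alice/data.txt?op=OPEN'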
def open(url, task=None):
    return comm.open_url(url)