def __init__(self, **kwargs):
    kwargs2 = {k: v for k, v in kwargs.items()
               if k in ['host', 'port', 'user', 'ticket_cache', 'token', 'pars']}
    HDFileSystem.__init__(self, connect=True, **kwargs2)
def __init__(self, pipeline_options):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__(pipeline_options)
    self._hdfs_client = HDFileSystem()
def __init__(self):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    self._use_log()
    try:
        # the first CLI argument is expected to be a dict literal
        self.args_dict = eval(sys.argv[1])
        if not isinstance(self.args_dict, dict):
            raise ValueError('args must be like key-value ')
    except Exception as e:
        self.args_dict = {}
        logging.warning('get args failed:{}'.format(e))
    self.proxies = self.args_dict.get('proxies')  # proxy configuration
    self.hdfs = self.args_dict.get('hdfs', {})  # hdfs configuration
    # if either of these two arguments is missing, raise and do not run
    if not self.hdfs or not self.proxies:
        raise ValueError('args not have hdfs or proxies')
    self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
    self.service_args = self.args_dict.get('service_args', {})  # PhantomJS proxy configuration
    self.aliyun_log = self.args_dict.get('aliyun_log', {})
    # Aliyun log configuration; still needs checking whether a missing argument raises
    self.alilog = AliyunLog(
        '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
        endp=self.aliyun_log.get('endpoint', endpoint),
        accid=self.aliyun_log.get('accessKeyId', accessKeyId),
        acckey=self.aliyun_log.get('accessKey', accessKey),
        proj=self.aliyun_log.get('project', project),
        logst=self.aliyun_log.get('logstore', logstore))
    try:
        self.HDFS = HDFileSystem(host=self.hdfs.get('ip', '192.168.100.178'),
                                 port=self.hdfs.get('port', 8020))
    except:
        pass
def test_batch_mode(self):
    self._hbaseTestingUtility.getMiniHBaseCluster().waitForActiveAndReadyMaster(60000)
    dfs = self._hbaseTestingUtility.getDFSCluster().getFileSystem()
    hdfs = HDFileSystem(host='localhost', port=dfs.getUri().getPort(),
                        pars={'dfs.client.read.shortcircuit': 'false'})
    hdfs.put('test_resources/batch_test', '/batch_test')
    self.engine_config['mode'] = 'batch'
    engine = DataTransformationEngine(self.sc, self.engine_config)
    engine.start()
    # wait up to 10s for the server to start
    time.sleep(5)
    expected = b"{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '1', 'data': '1', 'column': " \
               b"'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '2', 'data': '2', " \
               b"'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '3', " \
               b"'data': '3', 'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', " \
               b"'row': '4', 'data': '4', 'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': " \
               b"'dummy', 'row': '5', 'data': '5', 'column': 'count'}\n "
    with hdfs.open('/result/part-00000', replication=1) as f:
        # compare the file contents against the expected bytes
        # (assertEqual, not assertTrue, which would treat `expected` as a message)
        self.assertEqual(f.read(), expected)
def read_block_from_hdfs(filename, offset, length, host=None, port=None,
                         delimiter=None):
    from locket import lock_file
    with lock_file('.lock'):
        hdfs = HDFileSystem(host=host, port=port)
        bytes = hdfs.read_block(filename, offset, length, delimiter)
    return bytes
def write_hdfs_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port))

    path = urlparse(path).path
    writer = None
    with hdfs.open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except:
                # stream exhausted: close the ORC writer if one was created
                if writer is not None:
                    writer.close()
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # build the ORC schema from the first record batch
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            for batch in buf_reader:
                df = batch.to_pandas()
                writer.writerows(df.itertuples(False, None))
def main():
    import argparse
    import os
    parser = argparse.ArgumentParser(prog='chipper.py',
                                     description='Extract trips for a video')
    parser.add_argument("dataset_path", default="dataset_path", action="store",
                        type=str, help="Path to the dataset in hdfs.")
    parser.add_argument('string_to_match', type=str,
                        help='string to match <str> in filename')
    args = parser.parse_args()

    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(host='namenode', port=8020)
    filenames = hdfs.glob(args.dataset_path)

    def get_info(filename):
        bname = os.path.basename(filename)
        return bname.split('-')[0], int(bname.split('-')[1])

    filenames = sorted(filenames, key=get_info)
    filenames_filtered = [filename for filename in filenames
                          if args.string_to_match in filename]
    fp = FrameProducer(filenames_filtered, hdfs.open)
    chipper = Chipper(fp)
    count = 0
    for _ in chipper:
        count += 1
    logger.warn('Total Chips: {}'.format(count))
def __init__(self):
    self.hdfs = HDFileSystem(host='localhost', port=9000)
    self.dest_path_tweet = '/user/BigData/tweet_data'
    self.dest_path_rss = '/user/BigData/rss_data'
    self.dest_path_corona = '/user/BigData/corona_data'
    self.destination_path = ""
    self.import_path = '../import_data'
    self.import_path_tweet = os.path.join(self.import_path, 'tweets')
    self.import_path_rss = os.path.join(self.import_path, 'rss')
    self.import_path_corona = os.path.join(self.import_path, 'corona')
    self.hdfs_types = {
        'tweet': self.dest_path_tweet,
        'rss': self.dest_path_rss,
        'corona': self.dest_path_corona
    }
    self.import_types = {
        'tweet': self.import_path_tweet,
        'rss': self.import_path_rss,
        'corona': self.import_path_corona
    }
    self.temp_types = {
        'tweet': "TempData/temp_tweet.csv",
        "rss": "TempData/temp_rss.csv",
        "corona": "TempData/temp_corona.csv"
    }
def run2():
    # hdfs = HDFileSystem('cdh5namenode', '8020')
    hdfs = HDFileSystem(host='192.168.100.178', port=8020)
    hdfs.mkdir('/user/cloudera/tmp/trave_hotel/test/')
    # client = Client("http://192.168.100.78:50070")  # 50070: Hadoop default namenode web port
    with hdfs.open("/user/cloudera/tmp/trave_hotel/elong_comment.txt", 'ab') as f:
        # the file is opened in binary append mode, so write bytes rather than str
        f.write(b'hello' + b'\n')
def test_connection_error():
    with pytest.raises(RuntimeError) as ctx:
        hdfs = HDFileSystem(host='localhost', port=9999, connect=False)
        hdfs.connect()

    # error message is long and with java exceptions, so here we just check
    # that important part of error is present
    msg = 'Caused by: HdfsNetworkConnectException: Connect to "localhost:9999"'
    assert msg in str(ctx.value)
def test_connection_error():
    with pytest.raises(ConnectionError) as ctx:
        hdfs = HDFileSystem(host='localhost', port=9999, connect=False)
        hdfs.connect()

    # error message is long and with java exceptions, so here we just check
    # that important part of error is present
    msg = 'Caused by: HdfsNetworkConnectException: Connect to "localhost:9999"'
    assert msg in str(ctx.value)
def read_hdfs_bytes(vineyard_socket, path, proc_num, proc_index):
    if proc_index:
        return
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})

    header_row = False
    fragments = urlparse(path).fragment.split('&')
    path = urlparse(path).path
    for frag in fragments:
        try:
            k, v = frag.split('=')
        except:
            pass
        else:
            if k == 'header_row':
                header_row = (v.upper() == 'TRUE')
                if header_row:
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            elif k == 'delimiter':
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")

    offset = 0
    length = 1024 * 1024

    if header_row:
        header_line = hdfs.read_block(path, 0, 1, b'\n')
        builder['header_line'] = header_line.decode('unicode_escape')
        offset = len(header_line) - 1

    stream = builder.seal(client)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret))

    writer = stream.open_writer(client)
    while True:
        buf = hdfs.read_block(path, offset, length, b'\n')
        size = len(buf)
        if not size:
            break
        offset += size
        chunk = writer.next(size)
        buf_writer = pa.FixedSizeBufferWriter(chunk)
        buf_writer.write(buf)
        buf_writer.close()
    writer.finish()
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
def main():
    start = int(time.time() * 1000)
    hdfs = HDFileSystem(host='10.200.10.1', port=10500)
    '''with hdfs.open('/user/kingofshadows/myfile.txt', 'wb') as f:
        f.write('Hello, world!')

    with hdfs.open('/user/kingofshadows/myfile.txt', 'rb') as f:
        print(f.read())

    hdfs.ls('/')'''
    hdfs.put('/test-files/10653954_330928323779441_1670298960_n.jpg', 'img1.jpg')
    print(str(int(time.time() * 1000) - start) + ' milliseconds.')
def remove_hdfs_blockchain(self):
    try:
        from hdfs3 import HDFileSystem
        hdfs = HDFileSystem(host=self.config['HDFS_HOST'],
                            port=self.config['HDFS_PORT'])
        for file in hdfs.ls(self.config['HDFS_PATH']):
            hdfs.rm(file)
    except ImportError:
        logging.error("hdfs3 module not found")
    except:
        logging.error("Error occurred in connecting to hadoop")
def make_hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
def read_block_from_hdfs(host, port, filename, offset, length, delimiter):
    from hdfs3 import HDFileSystem
    if sys.version_info[0] == 2:
        # on Python 2, serialize access to libhdfs3 with a file lock
        from locket import lock_file
        with lock_file('.lock'):
            hdfs = HDFileSystem(host=host, port=port)
            bytes = hdfs.read_block(filename, offset, length, delimiter)
    else:
        hdfs = HDFileSystem(host=host, port=port)
        bytes = hdfs.read_block(filename, offset, length, delimiter)
    return bytes
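# The two read_block_from_hdfs helpers above rely on HDFileSystem.read_block, which
# returns the bytes of a file between offset and offset + length, optionally extended
# so the block ends on a delimiter. A minimal sketch of that behaviour, assuming a
# reachable cluster; the namenode address and file contents below are hypothetical:
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='namenode', port=8020)  # placeholder cluster address
with hdfs.open('/tmp/demo.csv', 'wb') as f:
    f.write(b'a,1\nb,2\nc,3\n')

# Ask for roughly the first 5 bytes, but let the block run on to the next newline,
# so only complete lines come back: b'a,1\nb,2\n' (trailing delimiter included).
chunk = hdfs.read_block('/tmp/demo.csv', 0, 5, delimiter=b'\n')
print(chunk)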
def load_2_hdfs(self):
    news_list = self.s['news_list_file']
    hdfs_news_list = self.s['hdfs'] % (os.path.split(news_list)[1])
    news_info = self.s['news_info_file']
    hdfs_news_info = self.s['hdfs'] % (os.path.split(news_info)[1])
    try:
        hdfs = HDFileSystem(host='192.168.100.178', port=8020)
        hdfs.put(news_list, hdfs_news_list)
        hdfs.put(news_info, hdfs_news_info)
    except:
        print('HDFS cluster is down')
def load_in_hdfs(self, filename):
    """
    Push a file to the HDFS cluster.
    :param filename: file to upload
    :return:
    """
    hdfs = HDFileSystem(host='192.168.100.178', port=8020)
    try:
        file_path = os.path.join(os.path.abspath('DATA'), filename)
        hdfs_path = os.path.join('/user/spider/TAPD_TASK', filename)
        hdfs.put(file_path, hdfs_path)
    except Exception as e:
        print('HDFS cluster is down', e)
def read_hive_orc(vineyard_socket, path, proc_num, proc_index):
    # This method is to read the data files of a specific hive table
    # that is stored as orc format in HDFS.
    #
    # In general, the data files of a hive table are stored at the hive
    # space in the HDFS with the table name as the directory,
    # e.g.,
    #
    # .. code:: python
    #
    #    '/user/hive/warehouse/sometable'
    #
    # To read the entire table, simply use 'hive://user/hive/warehouse/sometable'
    # as the path.
    #
    # In case the table is partitioned, use the sub-directory of a specific partition
    # to read only the data from that partition. For example, sometable is partitioned
    # by column date, we can read the data in a given date by giving path as
    #
    # .. code:: python
    #
    #    'hive://user/hive/warehouse/sometable/date=20201112'
    #
    if proc_index:
        raise ValueError('Parallel reading ORC hasn\'t been supported yet')
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)

    stream = builder.seal(client)
    client.persist(stream)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret), flush=True)

    writer = stream.open_writer(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})

    paths = hdfs.glob(urlparse(path).path)
    files = []
    for sub in paths:
        if hdfs.isfile(sub):
            files.append(sub)
        else:
            files += hdfs.glob(sub)
    for filepath in files:
        read_hdfs_orc(filepath, hdfs, writer)
    writer.finish()
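# A minimal sketch of how a hive:// style path (as described in the comments of
# read_hive_orc above) splits into the namenode address and the on-HDFS directory.
# The host, port and table name below are made up for illustration only:
from urllib.parse import urlparse

path = 'hive://namenode:9000/user/hive/warehouse/sometable/date=20201112'
parsed = urlparse(path)
host, port = parsed.netloc.split(':')   # 'namenode', '9000'
hdfs_dir = parsed.path                  # '/user/hive/warehouse/sometable/date=20201112'
print(host, int(port), hdfs_dir)
# an HDFileSystem(host=host, port=int(port)) client would then glob hdfs_dir to
# collect the ORC part files, which is exactly what read_hive_orc does above.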
def hdfs():
    hdfs = HDFileSystem(host="localhost", port=8020)
    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
    hdfs.mkdir("/tmp/test")

    yield hdfs

    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
def read_from_hdfs(self, block_hash):
    try:
        from hdfs3 import HDFileSystem
        hdfs = HDFileSystem(host=self.writer_service.config['HDFS_HOST'],
                            port=self.writer_service.config['HDFS_PORT'])
        block_file_path = self.writer_service.config['HDFS_PATH'] + block_hash + ".json"
        with hdfs.open(block_file_path) as read_file:
            data = json.load(read_file)
            return data
    except ImportError:
        logging.error("hdfs3 module not found")
    except:
        logging.error("Error in connecting to hadoop")
def make_hdfs():
    from hdfs3 import HDFileSystem
    # from .hdfs import DaskHDFileSystem
    basedir = '/tmp/test-distributed'
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists(basedir):
        hdfs.rm(basedir)
    hdfs.mkdir(basedir)

    try:
        yield hdfs, basedir
    finally:
        if hdfs.exists(basedir):
            hdfs.rm(basedir)
def test_simple_pars(no_conf):
    hdfs = HDFileSystem('blah', 1, autoconf=False, connect=False)
    assert hdfs.conf['host'] == 'blah'
    assert hdfs.conf['port'] == 1

    hdfs = HDFileSystem('blah', 1, autoconf=True, connect=False)
    assert hdfs.conf['host'] == 'blah'
    assert hdfs.conf['port'] == 1

    hdfs = HDFileSystem('blah', 1, autoconf=True, connect=False,
                        pars={'port': 2})
    assert hdfs.conf['port'] == 1

    hdfs = HDFileSystem(autoconf=True, connect=False)
    assert hdfs.conf['host'] == conf_defaults['host']
    assert hdfs.conf['port'] == conf_defaults['port']

    with pytest.raises(Exception):
        HDFileSystem(autoconf=False, connect=True)

    hdfs = HDFileSystem(host='blah', autoconf=True, connect=False)
    assert hdfs.conf['host'] == 'blah'

    hdfs = HDFileSystem(connect=False, pars={'port': 1})
    assert hdfs.conf['port'] == 1

    hdfs = HDFileSystem(connect=False, pars={'port': 1}, port=2)
    assert hdfs.conf['port'] == 2
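# The test above exercises hdfs3's configuration precedence: explicit keyword (or
# positional) arguments win over the ``pars`` dict, which in turn wins over
# autoconfigured defaults. A minimal sketch mirroring that behaviour (no connection
# is made; the values are illustrative only):
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='blah', connect=False, pars={'port': 9000}, port=8020)
assert hdfs.conf['port'] == 8020   # the explicit port keyword overrides pars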
def __init__(self, topics):
    # self.consumer = KafkaConsumer(
    #     topics,
    #     bootstrap_servers=['localhost:9092'],
    #     auto_offset_reset='earliest',
    #     enable_auto_commit=True,
    #     group_id='my-group',
    #     value_deserializer=lambda x: loads(x.decode('utf-8')))
    # self.HDFS = pa.HDFS.connect(host='localhost', port=9000)
    self.hdfs = HDFileSystem(host='localhost', port=9000)
    self.destination_path = ""
    self.dest_path_tweet = '/user/BigData/tweet_data'
    self.dest_path_rss = '/user/BigData/rss_data'
    self.dest_path_corona = '/user/BigData/corona_data'
    self.temp_path = 'temp_data'
def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()
def _read_avro(path, executor=None, hdfs=None, lazy=True, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = walk_glob(hdfs, path)

    blockss = []
    for fn in filenames:
        with hdfs.open(fn, 'rb') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'].decode())

        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])  # TODO: why is filenames used twice?

    lazy_values = [do(avro_body)(b, header) for blocks in blockss
                   for b in blocks]
    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(lazy_values)
        raise gen.Return(futures)
def hdfs_connect_kerberos(hdfs_name_services: str, hdfs_replication: str,
                          user: str, hdfs_host_services: str,
                          hdfs_kbr5_user_keytab_path: str,
                          hdfs_krb5_username: str):
    host = hdfs_name_services
    print("Using KerberosClient...")
    conf = HDFSWrapper.create_hdfs3_conf(True, hdfs_name_services,
                                         hdfs_replication, hdfs_host_services)
    try:
        ticket_cache = HDFSWrapper.get_ticket_cache()
        if ticket_cache is not None:
            hdfs_client = HDFileSystem(host=host, port=None, user=user,
                                       pars=conf, ticket_cache=ticket_cache)
        else:
            hdfs_client = HDFSWrapper.renew_ticket_cache(
                conf, hdfs_name_services, user, hdfs_kbr5_user_keytab_path,
                hdfs_krb5_username,
                message="ERROR: Problems to generate Ticket Cache!")
    except:
        hdfs_client = HDFSWrapper.renew_ticket_cache(
            conf, hdfs_name_services, user, hdfs_kbr5_user_keytab_path,
            hdfs_krb5_username,
            message="ERROR: Problems to renew Ticket Cache!")

    return HDFSWrapper(hdfs_client)
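# The Kerberos helpers in this listing (hdfs_connect_kerberos / renew_ticket_cache)
# ultimately hand hdfs3 a ticket cache plus a configuration dict for an HA
# nameservice. A minimal, hypothetical sketch of that call; the nameservice name,
# namenode addresses and ticket-cache path are placeholders, not values from the
# code above, and this only connects against a real Kerberized cluster:
from hdfs3 import HDFileSystem

conf = {
    'hadoop.security.authentication': 'kerberos',
    'dfs.nameservices': 'nameservice1',
    'dfs.ha.namenodes.nameservice1': 'nn1,nn2',
    'dfs.namenode.rpc-address.nameservice1.nn1': 'namenode1:8020',
    'dfs.namenode.rpc-address.nameservice1.nn2': 'namenode2:8020',
}
hdfs = HDFileSystem(host='nameservice1', port=None, pars=conf,
                    ticket_cache='/tmp/krb5cc_1000')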
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
              header=True, names=None, **kwargs):
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    blockss = [read_binary(fn, executor, hdfs, lazy=True,
                           delimiter=lineterminator)
               for fn in filenames]
    if names is None and header:
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs)
             for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
def connect(self, namenode_host=default_namenode_host,
            namenode_port=default_namenode_port):
    """Connect to HDFS and return an HDFileSystem client."""
    hdfs = HDFileSystem(namenode_host, namenode_port)
    return hdfs
def open_files(path, hdfs=None, lazy=None, **auth):
    if lazy is not None:
        raise DeprecationWarning("Lazy keyword has been deprecated. "
                                 "Now always lazy")
    hdfs = hdfs or HDFileSystem(**auth)
    filenames = sorted(hdfs.glob(path))
    myopen = delayed(hdfs_open_file)
    return [myopen(fn, auth) for fn in filenames]
def test_hdfs_csv(self):
    '''
    HDFS3 Configuration: https://hdfs3.readthedocs.io/en/latest/hdfs.html
    '''
    dfs = HDFileSystem(host="localhost", port=8020, user="******")
    dfs.mkdir("/temp")
    dfs.put("tests/data/data.csv", "/temp/data.csv")
    with dfs.open("/temp/data.csv") as f:
        df = pandas.read_csv(f, nrows=10)
        print("Data shape from HDFS: " + str(df.shape))
    dfs.rm("/temp/data.csv")
def test_token_and_ticket_cache_in_same_time():
    ticket_cache = "/tmp/krb5cc_0"
    token = "abc"

    with pytest.raises(RuntimeError) as ctx:
        HDFileSystem(connect=False, ticket_cache=ticket_cache, token=token)

    msg = "It is not possible to use ticket_cache and token at same time"
    assert msg in str(ctx.value)
def renew_ticket_cache(conf: dict, hdfs_name_services: str, user: str,
                       hdfs_kbr5_user_keytab_path: str,
                       hdfs_krb5_username: str, message: str = ""):
    hdfs_host = hdfs_name_services
    status = HDFSWrapperNativeClient.generate_ticket_cache(
        hdfs_kbr5_user_keytab_path, hdfs_krb5_username)
    if status:
        ticket_cache = HDFSWrapperNativeClient.get_ticket_cache()
        return HDFileSystem(host=hdfs_host, port=None, user=user, pars=conf,
                            ticket_cache=ticket_cache)
    else:
        RequestResult.ofError(message)
        return None
def get_block_locations(self, paths):
    offsets = []
    lengths = []
    machines = []
    for path in paths:
        if path.startswith('hdfs://'):
            path = path[len('hdfs://'):]
        out = HDFileSystem.get_block_locations(self, path)
        offsets.append([o['offset'] for o in out])
        lengths.append([o['length'] for o in out])
        machines.append([o['hosts'] for o in out])
    return offsets, lengths, machines
from hdfs3 import HDFileSystem

test_host = 'localhost'
test_port = 9000


def hdfs_exists(hdfs_client):
    path = '/tmp/test'
    if hdfs_client.exists(path):
        hdfs_client.rm(path)
    hdfs_client.makedirs(path)


def hdfs_write_read(hdfs_client):
    data = b"hello" * 20
    file_a = '/tmp/test/file_a'
    with hdfs_client.open(file_a, 'wb', replication=1) as f:
        f.write(data)
    with hdfs_client.open(file_a, 'rb') as f:
        out = f.read(len(data))
        assert out == data


def hdfs_readline(hdfs_client):
    file_b = '/tmp/test/file_b'
    with hdfs_client.open(file_b, 'wb') as f:
        f.write(b"hello\nhadoop")
    with hdfs_client.open(file_b, 'rb') as f:
        # split the whole contents on newlines; expect the two lines written above
        lines = f.read().split(b"\n")
        assert len(lines) == 2


if __name__ == "__main__":
    hdfs_client = HDFileSystem(host=test_host, port=test_port)
    hdfs_exists(hdfs_client)
    hdfs_write_read(hdfs_client)
    hdfs_readline(hdfs_client)
    hdfs_client.disconnect()
def open_file_write_direct(path, hdfs=None, **kwargs):
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    return hdfs.open(path, 'wb')
def hdfs_open_file(path, auth):
    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
def hdfs_open_file(path, auth):
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """

  def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()

  @classmethod
  def scheme(cls):
    return 'hdfs'

  @staticmethod
  def _parse_url(url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
    m = _URL_RE.match(url)
    if m is None:
      raise ValueError('Could not parse url: %s' % url)
    return m.group(1)

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
    basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(basepath, *paths)

  def _join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, url):
    rel_path = self._parse_url(url)
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + head, tail

  def mkdirs(self, url):
    path = self._parse_url(url)
    if self._exists(path):
      raise IOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def match(self, url_patterns, limits=None):
    if limits is None:
      limits = [None] * len(url_patterns)

    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' % (
              len(url_patterns), len(limits)))

    # TODO(udim): Update client to allow batched results.
    def _match(path_pattern, limit):
      """Find all matching paths to the pattern provided."""
      file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
      metadata_list = [FileMetadata(file_info['name'], file_info['size'])
                       for file_info in file_infos]
      return MatchResult(path_pattern, metadata_list)

    exceptions = {}
    result = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        path_pattern = self._parse_url(url_pattern)
        result.append(_match(path_pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url_pattern] = e

    if exceptions:
      raise BeamIOError('Match operation failed', exceptions)
    return result

  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res

  def create(self, url, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(self, path, mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'wb', mime_type, compression_type)

  def open(self, url, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(self, path, mime_type='application/octet-stream',
            compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    Will overwrite files and directories in destination_file_names.

    Raises ``BeamIOError`` if any error occurred.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' % (
              len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not self._hdfs_client.isdir(source):
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join(destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(self._join(path, file),
                     self._join(destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        if not self._hdfs_client.mv(rel_source, rel_destination):
          raise BeamIOError(
              'libhdfs error in renaming %s to %s' % (source, destination))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.exists(path)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        path = self._parse_url(url)
        self._hdfs_client.rm(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
from hdfs3 import HDFileSystem
import os

os.environ['HADOOP_USER_NAME'] = 'hadoop'
hdfs = HDFileSystem(host='trevally.amer.nevint.com', port=9000)
print(hdfs.ls('/user/hadoop'))
def __init__(self, **kwargs):
    kwargs2 = {k: v for k, v in kwargs.items()
               if k in ['host', 'port', 'user', 'ticket_cache', 'token', 'pars']}
    HDFileSystem.__init__(self, connect=True, **kwargs2)
def glob(self, path):
    if path.startswith('hdfs://'):
        path = path[len('hdfs://'):]
    return sorted(HDFileSystem.glob(self, path))
import pytest
from tornado import gen

from dask.imperative import Value
from distributed.utils_test import gen_cluster, cluster, loop, make_hdfs
from distributed.utils import get_ip
from distributed.hdfs import (read_bytes, get_block_locations, write_bytes,
                              _read_csv, read_csv)
from distributed import Executor
from distributed.executor import _wait, Future

pytest.importorskip('hdfs3')
from hdfs3 import HDFileSystem

try:
    hdfs = HDFileSystem(host='localhost', port=8020)
    hdfs.df()
    del hdfs
except:
    pytestmark = pytest.mark.skipif('True')

ip = get_ip()


def test_get_block_locations():
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)  # todo: reduce block size to speed up test
        fn_1 = '/tmp/test/file1'
        fn_2 = '/tmp/test/file2'
def open(self, path, mode='rb'):
    mode = mode.rstrip('b')
    return open(path, mode)

def put(self, src, dst):
    return shutil.copy(src, dst)


if __name__ == "__main__":
    # load the hdfs node info
    f = open('hdfs.yml', 'r')
    data = yaml.load(f)
    f.close()
    hdfs_nn = data['hdfs_nn']

    hdfs = HDFileSystem(host=hdfs_nn, port=data['hdfs_port'])
    tfs = TransparentFileSystem(hdfs)
    print(hdfs.exists('/tmp'))
    # print(hdfs.hoge('/tmp'))
    print(tfs.exists('/tmp'))
    # print(tfs.hoge('/tmp'))

    # tfs_local = TransparentFileSystem()
    # print(tfs_local.glob('/var/tmp'))

    print('test')
    print(tfs.glob('/tmp'))
    # tfs.hoge()

    tfs_local = TransparentFileSystem()
    # print(tfs_local.glob('/home/vagrant/work/data/*'))
def open(self, path, mode='rb', **kwargs):
    if path.startswith('hdfs://'):
        path = path[len('hdfs://'):]
    return HDFileSystem.open(self, path, mode, **kwargs)