Example #1
 def __init__(self, **kwargs):
     kwargs2 = {
         k: v
         for k, v in kwargs.items()
         if k in ['host', 'port', 'user', 'ticket_cache', 'token', 'pars']
     }
     HDFileSystem.__init__(self, connect=True, **kwargs2)
Example #2
    def __init__(self, pipeline_options):
        """Initializes a connection to HDFS.

        Connection configuration is done using :doc:`hdfs`.
        """
        super(HadoopFileSystem, self).__init__(pipeline_options)
        self._hdfs_client = HDFileSystem()
Example #3
 def __init__(self):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
     self._use_log()
     try:
         self.args_dict = eval(' '.join(sys.argv[1:]))  # command line passes a dict literal
         if not isinstance(self.args_dict, dict):
             raise ValueError('args must be a dict of key-value pairs')
     except Exception as e:
         self.args_dict = {}
         logging.warning('get args failed:{}'.format(e))
     self.proxies = self.args_dict.get('proxies')  # proxy configuration
     self.hdfs = self.args_dict.get('hdfs', {})  # HDFS configuration
     # raise immediately and do not run if either of these two parameters is missing
     if not self.hdfs or not self.proxies:
         raise ValueError('args not have hdfs or proxies')
     self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
     self.service_args = self.args_dict.get('service_args',
                                            {})  # PhantomJS proxy configuration
     self.aliyun_log = self.args_dict.get('aliyun_log', {})
     self.alilog = AliyunLog(
         '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
         endp=self.aliyun_log.get('endpoint', endpoint),
         accid=self.aliyun_log.get('accessKeyId', accessKeyId),
         acckey=self.aliyun_log.get('accessKey', accessKey),
         proj=self.aliyun_log.get('project', project),
         logst=self.aliyun_log.get('logstore',
                                   logstore))  # Aliyun Log config; need to verify whether omitting this parameter raises an error
     try:
         self.HDFS = HDFileSystem(host=self.hdfs.get(
             'ip', '192.168.100.178'),
                                  port=self.hdfs.get('port', 8020))
     except Exception:
         pass  # ignore HDFS connection failures; self.HDFS stays unset
Example #4
    def test_batch_mode(self):
        self._hbaseTestingUtility.getMiniHBaseCluster(
        ).waitForActiveAndReadyMaster(60000)

        dfs = self._hbaseTestingUtility.getDFSCluster().getFileSystem()
        hdfs = HDFileSystem(host='localhost',
                            port=dfs.getUri().getPort(),
                            pars={'dfs.client.read.shortcircuit': 'false'})

        hdfs.put('test_resources/batch_test', '/batch_test')

        self.engine_config['mode'] = 'batch'
        engine = DataTransformationEngine(self.sc, self.engine_config)
        engine.start()
        # wait up to 10s for the server to start
        time.sleep(5)

        expected = b"{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '1', 'data': '1', 'column': " \
                   b"'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '2', 'data': '2', " \
                   b"'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', 'row': '3', " \
                   b"'data': '3', 'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': 'dummy', " \
                   b"'row': '4', 'data': '4', 'column': 'count'}\n{'table_name': 'dummy_table', 'column_family': " \
                   b"'dummy', 'row': '5', 'data': '5', 'column': 'count'}\n "

        with hdfs.open('/result/part-00000', replication=1) as f:
            self.assertEqual(f.read(), expected)
Example #5
def read_block_from_hdfs(filename, offset, length, host=None, port=None,
        delimiter=None):
    from locket import lock_file
    with lock_file('.lock'):
        hdfs = HDFileSystem(host=host, port=port)
        bytes = hdfs.read_block(filename, offset, length, delimiter)
    return bytes
Example #6
def write_hdfs_orc(vineyard_socket, stream_id, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    streams = client.get(stream_id)
    if len(streams) != proc_num or streams[proc_index] is None:
        raise ValueError(
            f'Fetch stream error with proc_num={proc_num},proc_index={proc_index}'
        )
    instream = streams[proc_index]
    reader = instream.open_reader(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port))
    path = urlparse(path).path

    writer = None
    with hdfs.open(path, 'wb') as f:
        while True:
            try:
                buf = reader.next()
            except Exception:
                # end of stream: close the ORC writer (if any batch was written) and stop
                if writer is not None:
                    writer.close()
                break
            buf_reader = pa.ipc.open_stream(buf)
            if writer is None:
                # build the ORC schema from the Arrow schema of the first batch
                schema = {}
                for field in buf_reader.schema:
                    schema[field.name] = orc_type(field.type)
                writer = pyorc.Writer(f, pyorc.Struct(**schema))
            for batch in buf_reader:
                df = batch.to_pandas()
                writer.writerows(df.itertuples(False, None))
Example #7
def main():
    import argparse
    import os
    parser = argparse.ArgumentParser(prog='chipper.py', description='Extract trips for a video')
    parser.add_argument("dataset_path", default="dataset_path", action="store", type=str,
                        help="Path to the dataset in hdfs.")

    parser.add_argument('string_to_match', type=str, help='string to match <str> in filename')
    args = parser.parse_args()

    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(host='namenode', port=8020)
    filenames = hdfs.glob(args.dataset_path)

    def get_info(filename):
        bname = os.path.basename(filename)
        return bname.split('-')[0], int(bname.split('-')[1])
    filenames = sorted(filenames, key=get_info)
    filenames_filtered = [filename for filename in filenames if args.string_to_match in filename]

    fp = FrameProducer(filenames_filtered, hdfs.open)

    chipper = Chipper(fp)
    count = 0
    for _ in chipper:
        count += 1
    logger.warning('Total Chips: {}'.format(count))
Example #8
    def __init__(self):
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.destination_path = ""
        self.import_path = '../import_data'
        self.import_path_tweet = os.path.join(self.import_path, 'tweets')
        self.import_path_rss = os.path.join(self.import_path, 'rss')
        self.import_path_corona = os.path.join(self.import_path, 'corona')

        self.hdfs_types = {
            'tweet': self.dest_path_tweet,
            'rss': self.dest_path_rss,
            'corona': self.dest_path_corona
        }
        self.import_types = {
            'tweet': self.import_path_tweet,
            'rss': self.import_path_rss,
            'corona': self.import_path_corona
        }
        self.temp_types = {
            'tweet': "TempData/temp_tweet.csv",
            "rss": "TempData/temp_rss.csv",
            "corona": "TempData/temp_corona.csv"
        }
Example #9
def run2():
    # hdfs = HDFileSystem('cdh5namenode', '8020')
    hdfs = HDFileSystem(host='192.168.100.178', port=8020)
    hdfs.mkdir('/user/cloudera/tmp/trave_hotel/test/')
    # client = Client("http://192.168.100.78:50070")  # 50070: Hadoop's default namenode web port
    with hdfs.open("/user/cloudera/tmp/trave_hotel/elong_comment.txt", 'ab') as f:
        f.write(b'hello\n')  # binary append mode, so write bytes
Example #10
def test_connection_error():
    with pytest.raises(RuntimeError) as ctx:
        hdfs = HDFileSystem(host='localhost', port=9999, connect=False)
        hdfs.connect()
    # error message is long and with java exceptions, so here we just check
    # that important part of error is present
    msg = 'Caused by: HdfsNetworkConnectException: Connect to "localhost:9999"'
    assert msg in str(ctx.value)
Example #11
def test_connection_error():
    with pytest.raises(ConnectionError) as ctx:
        hdfs = HDFileSystem(host='localhost', port=9999, connect=False)
        hdfs.connect()
    # error message is long and with java exceptions, so here we just check
    # that important part of error is present
    msg = 'Caused by: HdfsNetworkConnectException: Connect to "localhost:9999"'
    assert msg in str(ctx.value)
Example #12
def read_hdfs_bytes(vineyard_socket, path, proc_num, proc_index):      
    if proc_index:
        return  
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host, port=int(port), pars={"dfs.client.read.shortcircuit": "false"})

    header_row = False
    fragments = urlparse(path).fragment.split('&')
    
    path = urlparse(path).path

    for frag in fragments:
        try:
            k, v = frag.split('=')
        except ValueError:
            pass  # skip fragments that are not key=value pairs
        else:
            if k == 'header_row':
                header_row = (v.upper() == 'TRUE')
                if header_row:
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            elif k == 'delimiter':
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")

    offset = 0
    length = 1024 * 1024

    if header_row:
        header_line = hdfs.read_block(path, 0, 1, b'\n')
        builder['header_line'] = header_line.decode('unicode_escape')
        offset = len(header_line)-1

    stream = builder.seal(client)
    
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret))

    writer = stream.open_writer(client)

    while True:
        buf = hdfs.read_block(path, offset, length, b'\n')
        size = len(buf)
        if not size:
            break
        offset += size
        chunk = writer.next(size)
        buf_writer = pa.FixedSizeBufferWriter(chunk)
        buf_writer.write(buf)
        buf_writer.close()

    writer.finish()
Example #13
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
Example #14
def main():
	start = int(time.time() * 1000)
	hdfs = HDFileSystem(host='10.200.10.1', port=10500)
	'''with hdfs.open('/user/kingofshadows/myfile.txt', 'wb') as f:
		f.write('Hello, world!')
	with hdfs.open('/user/kingofshadows/myfile.txt', 'rb') as f:
		print(f.read())
	hdfs.ls('/')'''
	hdfs.put('/test-files/10653954_330928323779441_1670298960_n.jpg', 'img1.jpg')
	print(str(int(time.time()*1000) - start) + ' milliseconds.')
Example #15
 def remove_hdfs_blockchain(self):
     try:
         from hdfs3 import HDFileSystem
         hdfs = HDFileSystem(host=self.config['HDFS_HOST'],
                             port=self.config['HDFS_PORT'])
         for file in hdfs.ls(self.config['HDFS_PATH']):
             hdfs.rm(file)
     except ImportError:
         logging.error("hdfs3 module not found")
     except Exception:
         logging.error("Error occurred while connecting to Hadoop")
Example #16
def make_hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
Example #17
def read_block_from_hdfs(filename,
                         offset,
                         length,
                         host=None,
                         port=None,
                         delimiter=None):
    from locket import lock_file
    with lock_file('.lock'):
        hdfs = HDFileSystem(host=host, port=port)
        bytes = hdfs.read_block(filename, offset, length, delimiter)
    return bytes
Example #18
def read_block_from_hdfs(host, port, filename, offset, length, delimiter):
    from hdfs3 import HDFileSystem
    if sys.version_info[0] == 2:
        from locket import lock_file
        with lock_file('.lock'):
            hdfs = HDFileSystem(host=host, port=port)
            bytes = hdfs.read_block(filename, offset, length, delimiter)
    else:
        hdfs = HDFileSystem(host=host, port=port)
        bytes = hdfs.read_block(filename, offset, length, delimiter)
    return bytes
Example #19
 def load_2_hdfs(self):
     news_list = self.s['news_list_file']
     hdfs_news_list = self.s['hdfs'] % (os.path.split(news_list)[1])
     news_info = self.s['news_info_file']
     hdfs_news_info = self.s['hdfs'] % (os.path.split(news_info)[1])
     try:
         hdfs = HDFileSystem(host='192.168.100.178', port=8020)
         hdfs.put(news_list, hdfs_news_list)
         hdfs.put(news_info, hdfs_news_info)
     except Exception:
         print('HDFS cluster is unreachable')
Example #20
 def load_in_hdfs(self, filename):
     """
     集群操作
     :param filename: 需要推送的文件
     :return:
     """
     hdfs = HDFileSystem(host='192.168.100.178', port=8020)
     try:
         file_path = os.path.join(os.path.abspath('DATA'), filename)
         hdfs_path = os.path.join('/user/spider/TAPD_TASK', filename)
         hdfs.put(file_path, hdfs_path)
     except Exception as e:
         print('HDFS cluster is unreachable', e)
Example #21
def read_hive_orc(vineyard_socket, path, proc_num, proc_index):
    # This method is to read the data files of a specific hive table
    # that is stored as orc format in HDFS.
    #
    # In general, the data files of a hive table are stored at the hive
    # space in the HDFS with the table name as the directory,
    # e.g.,
    #
    # .. code:: python
    #
    #    '/user/hive/warehouse/sometable'
    #
    # To read the entire table, simply use 'hive://user/hive/warehouse/sometable'
    # as the path.
    #
    # In case the table is partitioned, use the sub-directory of a specific partition
    # to read only the data from that partition. For example, sometable is partitioned
    # by column date, we can read the data in a given date by giving path as
    #
    # .. code:: python
    #
    #    'hive://user/hive/warehouse/sometable/date=20201112'
    #
    if proc_index:
        raise ValueError('Parallel reading of ORC is not supported yet')
    client = vineyard.connect(vineyard_socket)
    builder = DataframeStreamBuilder(client)
    stream = builder.seal(client)
    client.persist(stream)
    ret = {'type': 'return'}
    ret['content'] = repr(stream.id)
    print(json.dumps(ret), flush=True)

    writer = stream.open_writer(client)
    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host,
                        port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})

    paths = hdfs.glob(urlparse(path).path)
    files = []
    for sub in paths:
        if hdfs.isfile(sub):
            files.append(sub)
        else:
            files += hdfs.glob(sub)

    for filepath in files:
        read_hdfs_orc(filepath, hdfs, writer)

    writer.finish()
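
As a rough sketch of the path handling described in the comments above (the namenode host, port, and table name here are made up), the hive:// URL is split into a namenode address and an HDFS directory to glob:

from urllib.parse import urlparse

# hypothetical partitioned-table URL in the form the comments above describe
url = urlparse('hive://warehouse-nn:8020/user/hive/warehouse/sometable/date=20201112')
host, port = url.netloc.split(':')    # namenode address, e.g. ('warehouse-nn', '8020')
partition_dir = url.path              # '/user/hive/warehouse/sometable/date=20201112'
# read_hive_orc() then globs partition_dir on HDFS and streams each ORC part file it finds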
Example #22
def hdfs():
    hdfs = HDFileSystem(host="localhost", port=8020)
    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
    hdfs.mkdir("/tmp/test")

    yield hdfs

    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
Example #23
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
Example #24
 def read_from_hdfs(self, block_hash):
     try:
         from hdfs3 import HDFileSystem
         hdfs = HDFileSystem(host=self.writer_service.config['HDFS_HOST'],
                             port=self.writer_service.config['HDFS_PORT'])
         block_file_path = self.writer_service.config[
             'HDFS_PATH'] + block_hash + ".json"
         with hdfs.open(block_file_path) as read_file:
             data = json.load(read_file)
         return data
     except ImportError:
         logging.error("hdfs3 module not found")
     except Exception:
         logging.error("Error in connecting to Hadoop")
Example #25
def make_hdfs():
    from hdfs3 import HDFileSystem
    # from .hdfs import DaskHDFileSystem
    basedir = '/tmp/test-distributed'
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists(basedir):
        hdfs.rm(basedir)
    hdfs.mkdir(basedir)

    try:
        yield hdfs, basedir
    finally:
        if hdfs.exists(basedir):
            hdfs.rm(basedir)
Example #26
def test_simple_pars(no_conf):
    hdfs = HDFileSystem('blah', 1, autoconf=False, connect=False)
    assert hdfs.conf['host'] == 'blah'
    assert hdfs.conf['port'] == 1
    hdfs = HDFileSystem('blah', 1, autoconf=True, connect=False)
    assert hdfs.conf['host'] == 'blah'
    assert hdfs.conf['port'] == 1
    hdfs = HDFileSystem('blah',
                        1,
                        autoconf=True,
                        connect=False,
                        pars={'port': 2})
    assert hdfs.conf['port'] == 1

    hdfs = HDFileSystem(autoconf=True, connect=False)
    assert hdfs.conf['host'] == conf_defaults['host']
    assert hdfs.conf['port'] == conf_defaults['port']
    with pytest.raises(Exception):
        HDFileSystem(autoconf=False, connect=True)
    hdfs = HDFileSystem(host='blah', autoconf=True, connect=False)
    assert hdfs.conf['host'] == 'blah'
    hdfs = HDFileSystem(connect=False, pars={'port': 1})
    assert hdfs.conf['port'] == 1
    hdfs = HDFileSystem(connect=False, pars={'port': 1}, port=2)
    assert hdfs.conf['port'] == 2
Example #27
def make_hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
Example #28
    def __init__(self, topics):
        # self.consumer = KafkaConsumer(
        #     topics,
        #     bootstrap_servers=['localhost:9092'],
        #     auto_offset_reset='earliest',
        #     enable_auto_commit=True,
        #     group_id='my-group',
        #     value_deserializer=lambda x: loads(x.decode('utf-8')))

        # self.HDFS = pa.HDFS.connect(host='localhost', port=9000)
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.destination_path = ""
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.temp_path = 'temp_data'
Example #29
  def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()
Example #30
def _read_avro(path, executor=None, hdfs=None, lazy=True, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = walk_glob(hdfs, path)

    blockss = []
    for fn in filenames:
        with hdfs.open(fn, 'rb') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'].decode())

        # read this file's blocks, using its own Avro sync marker as the block delimiter
        blockss.append(
            read_bytes(fn,
                       executor,
                       hdfs,
                       lazy=True,
                       delimiter=header['sync'],
                       not_zero=True))

    lazy_values = [
        do(avro_body)(b, header) for blocks in blockss for b in blocks
    ]

    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(lazy_values)
        raise gen.Return(futures)
Example #31
    def hdfs_connect_kerberos(hdfs_name_services: str, hdfs_replication: str,
                              user: str, hdfs_host_services: str,
                              hdfs_kbr5_user_keytab_path: str,
                              hdfs_krb5_username: str):
        host = hdfs_name_services
        print("Usando KerberosClient...")
        conf = HDFSWrapper.create_hdfs3_conf(True, hdfs_name_services,
                                             hdfs_replication,
                                             hdfs_host_services)
        try:
            ticket_cache = HDFSWrapper.get_ticket_cache()
            if ticket_cache is not None:
                hdfs_client = HDFileSystem(host=host,
                                           port=None,
                                           user=user,
                                           pars=conf,
                                           ticket_cache=ticket_cache)
            else:
                hdfs_client = HDFSWrapper.renew_ticket_cache(
                    conf,
                    hdfs_name_services,
                    user,
                    hdfs_kbr5_user_keytab_path,
                    hdfs_krb5_username,
                    message="ERROR: Problems to generate Ticket Cache!")
        except:
            hdfs_client = HDFSWrapper.renew_ticket_cache(
                conf,
                hdfs_name_services,
                user,
                hdfs_kbr5_user_keytab_path,
                hdfs_krb5_username,
                message="ERROR: Problems to renew Ticket Cache!")

        return HDFSWrapper(hdfs_client)
Example #32
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
        header=True, names=None, **kwargs):
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    blockss = [read_binary(fn, executor, hdfs, lazy=True, delimiter=lineterminator)
               for fn in filenames]
    if names is None and header:
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs) for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
Example #33
 def connect(self,
             namenode_host=default_namenode_host,
             namenode_port=default_namenode_port):
     """
     hdfs
     """
     hdfs = HDFileSystem(namenode_host, namenode_port)
     return hdfs
Example #34
def open_files(path, hdfs=None, lazy=None, **auth):
    if lazy is not None:
        raise DeprecationWarning("Lazy keyword has been deprecated. "
                                 "Now always lazy")
    hdfs = hdfs or HDFileSystem(**auth)
    filenames = sorted(hdfs.glob(path))
    myopen = delayed(hdfs_open_file)
    return [myopen(fn, auth) for fn in filenames]
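
A minimal usage sketch of open_files() above; the namenode address and glob pattern are hypothetical, dask is assumed to be installed, and each returned element is a dask delayed object that only opens its HDFS file in 'rb' mode when computed:

from dask import compute

# **auth is forwarded to HDFileSystem, so host/port are valid keyword arguments here
delayed_files = open_files('/data/logs/*.csv', host='namenode', port=8020)
handles = compute(*delayed_files)   # connects to HDFS and opens every matched file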
Example #35
def make_hdfs():
    from hdfs3 import HDFileSystem
    # from .hdfs import DaskHDFileSystem
    basedir = '/tmp/test-distributed'
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists(basedir):
        hdfs.rm(basedir)
    hdfs.mkdir(basedir)

    try:
        yield hdfs, basedir
    finally:
        if hdfs.exists(basedir):
            hdfs.rm(basedir)
Example #36
    def test_hdfs_csv(self):
        '''
        HDFS3 Configuration:
          https://hdfs3.readthedocs.io/en/latest/hdfs.html
        '''
        dfs = HDFileSystem(host="localhost", port=8020, user="******")
        dfs.mkdir("/temp")
        dfs.put("tests/data/data.csv", "/temp/data.csv")

        with dfs.open("/temp/data.csv") as f:
            df = pandas.read_csv(f, nrows=10)
            print("Data shape from HDFS: " + str(df.shape))
        
        dfs.rm("/temp/data.csv")
Example #37
def test_token_and_ticket_cache_in_same_time():
    ticket_cache = "/tmp/krb5cc_0"
    token = "abc"

    with pytest.raises(RuntimeError) as ctx:
        HDFileSystem(connect=False, ticket_cache=ticket_cache, token=token)

    msg = "It is not possible to use ticket_cache and token at same time"
    assert msg in str(ctx.value)
Example #38
 def renew_ticket_cache(conf: dict, hdfs_name_services: str, user: str, hdfs_kbr5_user_keytab_path: str, hdfs_krb5_username: str, message: str=""):
     hdfs_host = hdfs_name_services
     status = HDFSWrapperNativeClient.generate_ticket_cache(hdfs_kbr5_user_keytab_path, hdfs_krb5_username)
     if status:
         ticket_cache = HDFSWrapperNativeClient.get_ticket_cache()
         return HDFileSystem(host=hdfs_host, port=None, user=user, pars=conf, ticket_cache=ticket_cache)
     else:
         RequestResult.ofError(message)
         return None
Example #39
 def get_block_locations(self, paths):
     offsets = []
     lengths = []
     machines = []
     for path in paths:
         if path.startswith('hdfs://'):
             path = path[len('hdfs://'):]
         out = HDFileSystem.get_block_locations(self, path)
         offsets.append([o['offset'] for o in out])
         lengths.append([o['length'] for o in out])
         machines.append([o['hosts'] for o in out])
     return offsets, lengths, machines
Example #40
test_host = 'localhost'
test_port = 9000

def hdfs_exists(hdfs_client):
	path = '/tmp/test'
	if hdfs_client.exists(path):
		hdfs_client.rm(path)
	hdfs_client.makedirs(path)

def hdfs_write_read(hdfs_client):
	data = b"hello" * 20
	file_a = '/tmp/test/file_a'
	with hdfs_client.open(file_a, 'wb', replication=1) as f:
		f.write(data)
	with hdfs_client.open(file_a, 'rb') as f:
		out = f.read(len(data))
		assert out == data

def hdfs_readline(hdfs_client):
	file_b = '/tmp/test/file_b'
	with hdfs_client.open(file_b, 'wb') as f:
		f.write(b"hello\nhadoop")
	with hdfs_client.open(file_b, 'rb') as f:
		lines = f.read().split(b'\n')
		assert len(lines) == 2

if __name__ == "__main__":
	hdfs_client = HDFileSystem(host=test_host, port=test_port)
	hdfs_exists(hdfs_client)
	hdfs_write_read(hdfs_client)
	hdfs_readline(hdfs_client)
	hdfs_client.disconnect()
Example #41
def open_file_write_direct(path, hdfs=None, **kwargs):
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    return hdfs.open(path, 'wb')
Example #42
def hdfs_open_file(path, auth):
    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
Example #43
def hdfs_open_file(path, auth):
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
Example #44
class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """

  def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()

  @classmethod
  def scheme(cls):
    return 'hdfs'

  @staticmethod
  def _parse_url(url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
    m = _URL_RE.match(url)
    if m is None:
      raise ValueError('Could not parse url: %s' % url)
    return m.group(1)

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
    basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(basepath, *paths)

  def _join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, url):
    rel_path = self._parse_url(url)
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + head, tail

  def mkdirs(self, url):
    path = self._parse_url(url)
    if self._exists(path):
      raise IOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def match(self, url_patterns, limits=None):
    if limits is None:
      limits = [None] * len(url_patterns)

    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' % (
              len(url_patterns), len(limits)))

    # TODO(udim): Update client to allow batched results.
    def _match(path_pattern, limit):
      """Find all matching paths to the pattern provided."""
      file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
      metadata_list = [FileMetadata(file_info['name'], file_info['size'])
                       for file_info in file_infos]
      return MatchResult(path_pattern, metadata_list)

    exceptions = {}
    result = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        path_pattern = self._parse_url(url_pattern)
        result.append(_match(path_pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url_pattern] = e

    if exceptions:
      raise BeamIOError('Match operation failed', exceptions)
    return result

  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res

  def create(self, url, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(self, path, mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'wb', mime_type, compression_type)

  def open(self, url, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python file-like object.
    """
    path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(self, path, mime_type='application/octet-stream',
            compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    Will overwrite files and directories in destination_file_names.

    Raises ``BeamIOError`` if any error occurred.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' % (
              len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not self._hdfs_client.isdir(source):
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join(destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(self._join(path, file),
                     self._join(destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        if not self._hdfs_client.mv(rel_source, rel_destination):
          raise BeamIOError(
              'libhdfs error in renaming %s to %s' % (source, destination))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.exists(path)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        path = self._parse_url(url)
        self._hdfs_client.rm(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
Example #45
from hdfs3 import HDFileSystem
import os

os.environ['HADOOP_USER_NAME'] = 'hadoop'

hdfs = HDFileSystem(host='trevally.amer.nevint.com', port=9000)

print(hdfs.ls('/user/hadoop'))
Example #46
 def __init__(self, **kwargs):
     kwargs2 = {k: v for k, v in kwargs.items()
                if k in ['host', 'port', 'user', 'ticket_cache',
                         'token', 'pars']}
     HDFileSystem.__init__(self, connect=True, **kwargs2)
Example #47
 def glob(self, path):
     if path.startswith('hdfs://'):
         path = path[len('hdfs://'):]
     return sorted(HDFileSystem.glob(self, path))
Example #48
from tornado import gen

from dask.imperative import Value

from distributed.utils_test import gen_cluster, cluster, loop, make_hdfs
from distributed.utils import get_ip
from distributed.hdfs import (read_bytes, get_block_locations, write_bytes,
        _read_csv, read_csv)
from distributed import Executor
from distributed.executor import _wait, Future


pytest.importorskip('hdfs3')
from hdfs3 import HDFileSystem
try:
    hdfs = HDFileSystem(host='localhost', port=8020)
    hdfs.df()
    del hdfs
except:
    pytestmark = pytest.mark.skipif('True')


ip = get_ip()


def test_get_block_locations():
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)  # todo: reduce block size to speed up test
        fn_1 = '/tmp/test/file1'
        fn_2 = '/tmp/test/file2'
Example #49
    def open(self, path, mode='rb'):
        mode = mode.rstrip('b')
        return open(path, mode)

    def put(self, src, dst):
        return shutil.copy(src, dst)

if __name__ == "__main__":
    # load the hdfs node info
    f = open('hdfs.yml', 'r')
    data = yaml.load(f)
    f.close()

    hdfs_nn = data['hdfs_nn']
    hdfs = HDFileSystem(host=hdfs_nn, port=data['hdfs_port'])

    tfs = TransparentFileSystem(hdfs)
    print hdfs.exists('/tmp')
    # print hdfs.hoge('/tmp')
    print tfs.exists('/tmp')
    # print tfs.hoge('/tmp')

    # tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/var/tmp')

    print 'test'
    print tfs.glob('/tmp')
    # tfs.hoge()
    tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/home/vagrant/work/data/*')
Example #50
 def open(self, path, mode='rb', **kwargs):
     if path.startswith('hdfs://'):
         path = path[len('hdfs://'):]
     return HDFileSystem.open(self, path, mode, **kwargs)