def test_table_manipulations(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))

    node.query("RENAME TABLE hdfs_test TO hdfs_renamed")
    assert node.query("SELECT count(*) FROM hdfs_renamed FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("RENAME TABLE hdfs_renamed TO hdfs_test")
    assert node.query("CHECK TABLE hdfs_test FORMAT Values") == "(1)"

    node.query("DETACH TABLE hdfs_test")
    node.query("ATTACH TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("TRUNCATE TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
def test_move_partition_to_another_disk(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdfs'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2
def test_simple_insert_select(cluster, min_rows_for_wide_part, files_per_part):
    create_table(cluster, "hdfs_test",
                 additional_settings="min_rows_for_wide_part={}".format(min_rows_for_wide_part))

    node = cluster.instances["node"]

    values1 = generate_values('2020-01-03', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values1))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values") == values1

    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print(hdfs_objects)
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part

    values2 = generate_values('2020-01-04', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values2))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values") == values1 + "," + values2

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part * 2

    assert node.query("SELECT count(*) FROM hdfs_test WHERE id = 1 FORMAT Values") == "(2)"
def wait_for_delete_hdfs_objects(cluster, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir('/clickhouse'))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir('/clickhouse')) == expected
def wait_for_hdfs_objects(cluster, fp, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir(fp))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir(fp)) == expected
def test_move_replace_partition_to_another_table(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-05', 4096, -1)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-06', 4096, -1)))
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4

    create_table(cluster, "hdfs_clone")

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-03' TO TABLE hdfs_clone")
    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-05' TO TABLE hdfs_clone")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"
    assert node.query("SELECT sum(id) FROM hdfs_clone FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_clone FORMAT Values") == "(8192)"

    # Number of objects in HDFS should be unchanged.
    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4

    # Add new partitions to the source table, but with different values, and replace them from the copied table.
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096, -1)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-05', 4096)))
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 6

    node.query("ALTER TABLE hdfs_test REPLACE PARTITION '2020-01-03' FROM hdfs_clone")
    node.query("ALTER TABLE hdfs_test REPLACE PARTITION '2020-01-05' FROM hdfs_clone")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"
    assert node.query("SELECT sum(id) FROM hdfs_clone FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_clone FORMAT Values") == "(8192)"

    # Wait for outdated partitions deletion.
    print(1)
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4)

    node.query("DROP TABLE hdfs_clone NO DELAY")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    # Data should remain in HDFS.
    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4
def test_attach_detach_partition(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test DETACH PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(4096)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test ATTACH PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test DROP PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(4096)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query("ALTER TABLE hdfs_test DETACH PARTITION '2020-01-04'")
    node.query("ALTER TABLE hdfs_test DROP DETACHED PARTITION '2020-01-04'",
               settings={"allow_drop_detached": 1})
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
def drop_table(cluster):
    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print('Number of hdfs objects to delete:', len(hdfs_objects), sep=' ')

    node.query("DROP TABLE IF EXISTS hdfs_test SYNC")

    try:
        wait_for_delete_hdfs_objects(cluster, 0)
    finally:
        hdfs_objects = fs.listdir('/clickhouse')
        if len(hdfs_objects) == 0:
            return
        print("Manually removing extra objects to prevent tests cascade failing: ", hdfs_objects)
        for path in hdfs_objects:
            fs.delete(path)
def start():
    # Connect to MongoDB, query the tokens, and look up the latest data on etherscan by contractAddress
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # Connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)
    # Return this user's home directory
    print(client2.get_home_directory())
    # Return the active NameNode
    print(client2.get_active_namenode())
    # List all files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    # Read a single file
    client2.mkdirs("/user/leon")
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # Print the file contents
    for r in inputfile:
        line = r.decode('utf-8')  # open() yields bytes; decode to a UTF-8 string
        print(line)
def showDirs(client_: HdfsClient, path_: str, maxDepth_: int, list_: list, depth_: int = 0):
    if depth_ > maxDepth_:
        return
    for _sub in client_.listdir(path_):
        _filePath = path_ + _sub
        _fileStatus = client_.get_file_status(_filePath)
        # print('_fileStatus = ' + str(_fileStatus))
        if _fileStatus.type == "DIRECTORY":
            if _fileStatus.permission in ('711', '700', '733', '770'):
                print("| " * depth_ + "+--" + _sub + " permission is " + _fileStatus.permission)
            else:
                print("| " * depth_ + "+--" + _sub)
            list_.append(_filePath)
            showDirs(client_, _filePath + '/', maxDepth_, list_, depth_ + 1)
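# A minimal usage sketch for showDirs above (not part of the original snippet). The NameNode
# address 'namenode:50070' and the starting path '/user/' are placeholder assumptions.
from pyhdfs import HdfsClient

client = HdfsClient(hosts='namenode:50070')
visited_dirs = []                            # collects every directory path that was printed
showDirs(client, '/user/', 2, visited_dirs)  # walk at most two directory levels below /user/
print(visited_dirs)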
def assert_objects_count(started_cluster, objects_count, path='data/'):
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    hdfs_objects = fs.listdir('/clickhouse')
    assert objects_count == len(hdfs_objects)
class HDFSClientUtilityTest(unittest.TestCase):
    '''Unit test for hdfsClientUtility.py'''

    def setUp(self):
        self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
        self.hdfs_config = None
        try:
            with open(self.hdfs_file_path, 'r') as file:
                self.hdfs_config = json.load(file)
        except Exception as exception:
            print(exception)

        self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'),
                                      user_name=self.hdfs_config['userName'])

    def get_random_name(self, length):
        return ''.join(random.sample(string.ascii_letters + string.digits, length))

    def test_copy_file_run(self):
        '''test copyFileToHdfs'''
        file_name = self.get_random_name(8)
        file_content = 'hello world!'

        with open('./{}'.format(file_name), 'w') as file:
            file.write(file_content)

        result = copyFileToHdfs('./{}'.format(file_name),
                                '/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                self.hdfs_client)
        self.assertTrue(result)

        file_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(file_name, file_list)

        hdfs_file_name = self.get_random_name(8)
        self.hdfs_client.copy_to_local('/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                       './{}'.format(hdfs_file_name))
        self.assertTrue(os.path.exists('./{}'.format(hdfs_file_name)))

        with open('./{}'.format(hdfs_file_name), 'r') as file:
            content = file.readline()
            self.assertEqual(file_content, content)

        # clean up
        os.remove('./{}'.format(file_name))
        os.remove('./{}'.format(hdfs_file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], file_name))

    def test_copy_directory_run(self):
        '''test copyDirectoryToHdfs'''
        directory_name = self.get_random_name(8)
        file_name_list = [self.get_random_name(8), self.get_random_name(8)]
        file_content = 'hello world!'

        os.makedirs('./{}'.format(directory_name))
        for file_name in file_name_list:
            with open('./{0}/{1}'.format(directory_name, file_name), 'w') as file:
                file.write(file_content)

        result = copyDirectoryToHdfs('./{}'.format(directory_name),
                                     '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name),
                                     self.hdfs_client)
        self.assertTrue(result)

        directory_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(directory_name, directory_list)

        sub_file_list = self.hdfs_client.listdir('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        for file_name in file_name_list:
            self.assertIn(file_name, sub_file_list)
            # clean up
            self.hdfs_client.delete('/{0}/{1}/{2}'.format(self.hdfs_config['userName'], directory_name, file_name))

        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        shutil.rmtree('./{}'.format(directory_name))
class hdfs(object):
    # Defaults to port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost = self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        return self.hdfs.listdir(rule)

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update({"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        return self.hdfs.exists(path)

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
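# A minimal usage sketch for the hdfs wrapper above (not part of the original source). The
# NameNode address 'namenode:50070' and the '/user/demo' paths are placeholder assumptions,
# and only methods that do not depend on TaskJobDao are exercised.
store = hdfs({'url': 'namenode:50070', 'dbname': '/user/demo'})
store.hmkdirs('/user/demo')
store.save_to_hdfs2('/user/demo/events.log', 'first line\n')   # creates the file on the first call
store.save_to_hdfs2('/user/demo/events.log', 'second line\n')  # appends on subsequent calls
print(store.listdir('/user/demo'))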
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect to the Hadoop HDFS file system from Python to upload and download files.
'''

# Read a file from the HDFS file system
# HDFS address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')
print(client.listdir("/repo/"))

res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() returns bytes; convert to a decoded string
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')  # Only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1)  # Create a new file and write the string to it

# Upload a local file to HDFS
# client = HdfsClient(hosts='hacker:50070', user_name='root')
# Absolute path of the local file; the HDFS target directory must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
def assert_objects_count(cluster, objects_count, path='data/'):
    fs = HdfsClient(hosts='localhost')
    hdfs_objects = fs.listdir('/clickhouse')
    assert objects_count == len(hdfs_objects)
from pyhdfs import HdfsClient

client = HdfsClient(hosts='hadoop1:50070')
print(client.listdir('/user/hive/warehouse/repdata1'))
###############################################################################################
#
#   Python analyze HDFS
#
###############################################################################################

import re
from pyhdfs import HdfsClient

client = HdfsClient(hosts='dzaratsian-nifi4.field.hortonworks.com:50070')

root_path = '/topics/minifitest/2017/01/'

days = client.listdir(root_path)
days_path = [root_path + str(path) for path in days]

hours_path = []
for day in days_path:
    hours = client.listdir(day)
    for hour in hours:
        path = day + '/' + str(hour)
        # print(path)
        hours_path.append(path)

minutes_path = []
for path in hours_path:
    minutes = client.listdir(path)
    if len(minutes) != 60:
        print('[ INFO ] Incomplete minutes (less than 60) in path: ' + str(path) + ' (Count = ' + str(len(minutes)) + ')')