def test_table_manipulations(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))

    node.query("RENAME TABLE hdfs_test TO hdfs_renamed")
    assert node.query("SELECT count(*) FROM hdfs_renamed FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("RENAME TABLE hdfs_renamed TO hdfs_test")
    assert node.query("CHECK TABLE hdfs_test FORMAT Values") == "(1)"

    node.query("DETACH TABLE hdfs_test")
    node.query("ATTACH TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("TRUNCATE TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
def test_move_partition_to_another_disk(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdfs'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2
def test_simple_insert_select(cluster, min_rows_for_wide_part, files_per_part):
    create_table(cluster, "hdfs_test",
                 additional_settings="min_rows_for_wide_part={}".format(min_rows_for_wide_part))

    node = cluster.instances["node"]

    values1 = generate_values('2020-01-03', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values1))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values") == values1

    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print(hdfs_objects)
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part

    values2 = generate_values('2020-01-04', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values2))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values") == values1 + "," + values2

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part * 2

    assert node.query("SELECT count(*) FROM hdfs_test WHERE id = 1 FORMAT Values") == "(2)"
def wait_for_delete_hdfs_objects(cluster, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir('/clickhouse'))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir('/clickhouse')) == expected
def wait_for_hdfs_objects(cluster, fp, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir(fp))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir(fp)) == expected
def test_move_replace_partition_to_another_table(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-05', 4096, -1)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-06', 4096, -1)))
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4

    create_table(cluster, "hdfs_clone")

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-03' TO TABLE hdfs_clone")
    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-05' TO TABLE hdfs_clone")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"
    assert node.query("SELECT sum(id) FROM hdfs_clone FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_clone FORMAT Values") == "(8192)"

    # Number of objects in HDFS should be unchanged.
    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4

    # Add new partitions to the source table, but with different values, and replace them from the copied table.
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096, -1)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-05', 4096)))
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 6

    node.query("ALTER TABLE hdfs_test REPLACE PARTITION '2020-01-03' FROM hdfs_clone")
    node.query("ALTER TABLE hdfs_test REPLACE PARTITION '2020-01-05' FROM hdfs_clone")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"
    assert node.query("SELECT sum(id) FROM hdfs_clone FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_clone FORMAT Values") == "(8192)"

    # Wait for outdated partitions deletion.
    print(1)
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4)

    node.query("DROP TABLE hdfs_clone NO DELAY")
    assert node.query("SELECT sum(id) FROM hdfs_test FORMAT Values") == "(0)"
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(16384)"

    # Data should remain in HDFS.
    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4
def test_attach_detach_partition(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test DETACH PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(4096)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test ATTACH PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test DROP PARTITION '2020-01-03'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(4096)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query("ALTER TABLE hdfs_test DETACH PARTITION '2020-01-04'")
    node.query("ALTER TABLE hdfs_test DROP DETACHED PARTITION '2020-01-04'",
               settings={"allow_drop_detached": 1})
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
def drop_table(cluster):
    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print('Number of hdfs objects to delete:', len(hdfs_objects), sep=' ')

    node.query("DROP TABLE IF EXISTS hdfs_test SYNC")

    try:
        wait_for_delete_hdfs_objects(cluster, 0)
    finally:
        hdfs_objects = fs.listdir('/clickhouse')
        if len(hdfs_objects) == 0:
            return
        print("Manually removing extra objects to prevent tests cascade failing: ", hdfs_objects)
        for path in hdfs_objects:
            fs.delete(path)
def start():
    # Connect to MongoDB, query the tokens, and look up the latest data on etherscan by contractAddress
    client = MongoCluster().connect()
    db = client.get_database('gse-transaction')
    collection = db.get_collection('mrout_6000001-6001000')
    # collection.insert_one()

    # Connect to HDFS and read files
    from pyhdfs import HdfsClient
    client2 = HdfsClient(hosts='%s:50070' % hdfs_ip, max_tries=10)
    # Return this user's home directory
    print(client2.get_home_directory())
    # Return the active NameNode
    print(client2.get_active_namenode())
    # List all files under the given directory
    print(client2.listdir("/user/leon/mrout_3_6000001-6001000/"))
    # Read a single file
    client2.mkdirs("/user/leon")
    inputfile = client2.open('/user/leon/mrout_3_6000001-6001000/part-00000')
    # Print the file contents
    for r in inputfile:
        line = r.decode('utf-8')  # open() yields bytes; decode to a UTF-8 string
        print(line)
def showDirs(client_: HdfsClient, path_: str, maxDepth_: int, list_: list, depth_: int = 0):
    if depth_ > maxDepth_:
        return
    for _sub in client_.listdir(path_):
        _filePath = path_ + _sub
        _fileStatus = client_.get_file_status(_filePath)
        # print('_fileStatus = ' + str(_fileStatus))
        if _fileStatus.type == "DIRECTORY":
            if _fileStatus.permission in ('711', '700', '733', '770'):
                print("| " * depth_ + "+--" + _sub + " permission is " + _fileStatus.permission)
            else:
                print("| " * depth_ + "+--" + _sub)
            list_.append(_filePath)
            showDirs(client_, _filePath + '/', maxDepth_, list_, depth_ + 1)
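# A minimal usage sketch for showDirs above (not part of the original snippet). The NameNode
# address 'namenode:50070' and the starting path '/user/' are placeholder assumptions.
from pyhdfs import HdfsClient

client = HdfsClient(hosts='namenode:50070')
visited_dirs = []                            # collects every directory path that was printed
showDirs(client, '/user/', 2, visited_dirs)  # walk at most two directory levels below /user/
print(visited_dirs)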
def assert_objects_count(started_cluster, objects_count, path='data/'):
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    hdfs_objects = fs.listdir('/clickhouse')
    assert objects_count == len(hdfs_objects)
class HDFSClientUtilityTest(unittest.TestCase):
    '''Unit test for hdfsClientUtility.py'''

    def setUp(self):
        self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
        self.hdfs_config = None
        try:
            with open(self.hdfs_file_path, 'r') as file:
                self.hdfs_config = json.load(file)
        except Exception as exception:
            print(exception)

        self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'),
                                      user_name=self.hdfs_config['userName'])

    def get_random_name(self, length):
        return ''.join(random.sample(string.ascii_letters + string.digits, length))

    def test_copy_file_run(self):
        '''test copyFileToHdfs'''
        file_name = self.get_random_name(8)
        file_content = 'hello world!'

        with open('./{}'.format(file_name), 'w') as file:
            file.write(file_content)

        result = copyFileToHdfs('./{}'.format(file_name),
                                '/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                self.hdfs_client)
        self.assertTrue(result)

        file_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(file_name, file_list)

        hdfs_file_name = self.get_random_name(8)
        self.hdfs_client.copy_to_local('/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
                                       './{}'.format(hdfs_file_name))
        self.assertTrue(os.path.exists('./{}'.format(hdfs_file_name)))

        with open('./{}'.format(hdfs_file_name), 'r') as file:
            content = file.readline()
            self.assertEqual(file_content, content)

        # clean up
        os.remove('./{}'.format(file_name))
        os.remove('./{}'.format(hdfs_file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], file_name))

    def test_copy_directory_run(self):
        '''test copyDirectoryToHdfs'''
        directory_name = self.get_random_name(8)
        file_name_list = [self.get_random_name(8), self.get_random_name(8)]
        file_content = 'hello world!'

        os.makedirs('./{}'.format(directory_name))
        for file_name in file_name_list:
            with open('./{0}/{1}'.format(directory_name, file_name), 'w') as file:
                file.write(file_content)

        result = copyDirectoryToHdfs('./{}'.format(directory_name),
                                     '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name),
                                     self.hdfs_client)
        self.assertTrue(result)

        directory_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName']))
        self.assertIn(directory_name, directory_list)

        sub_file_list = self.hdfs_client.listdir('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        for file_name in file_name_list:
            self.assertIn(file_name, sub_file_list)
            # clean up
            self.hdfs_client.delete('/{0}/{1}/{2}'.format(self.hdfs_config['userName'], directory_name, file_name))

        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name))
        shutil.rmtree('./{}'.format(directory_name))
class hdfs(object):
    # Defaults to port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost = self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    def createTableByTaskJobId(self, taskJobId, tableName=None, jobTemplateFieldList=None, data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        return self.hdfs.listdir(rule)

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get("task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update({"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        return self.hdfs.exists(path)

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
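# A minimal usage sketch for the hdfs wrapper above (not part of the original source). The
# NameNode address 'namenode:50070' and the '/user/demo' paths are placeholder assumptions,
# and only methods that do not depend on TaskJobDao are exercised.
store = hdfs({'url': 'namenode:50070', 'dbname': '/user/demo'})
store.hmkdirs('/user/demo')
store.save_to_hdfs2('/user/demo/events.log', 'first line\n')   # creates the file on the first call
store.save_to_hdfs2('/user/demo/events.log', 'second line\n')  # appends on subsequent calls
print(store.listdir('/user/demo'))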
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect to the Hadoop HDFS file system from Python to upload and download files.
'''

# Read a file from the HDFS file system
# HDFS address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')
print(client.listdir("/repo/"))

res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() returns bytes; convert to a decoded string
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')  # Only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1)  # Create a new file and write the string to it

# Upload a local file to HDFS
# client = HdfsClient(hosts='hacker:50070', user_name='root')
# Absolute path of the local file; the HDFS target directory must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
def assert_objects_count(cluster, objects_count, path='data/'):
    fs = HdfsClient(hosts='localhost')
    hdfs_objects = fs.listdir('/clickhouse')
    assert objects_count == len(hdfs_objects)
from pyhdfs import HdfsClient

client = HdfsClient(hosts='hadoop1:50070')
print(client.listdir('/user/hive/warehouse/repdata1'))
###############################################################################################
#
#   Python analyze HDFS
#
###############################################################################################

import re
from pyhdfs import HdfsClient

client = HdfsClient(hosts='dzaratsian-nifi4.field.hortonworks.com:50070')

root_path = '/topics/minifitest/2017/01/'

days = client.listdir(root_path)
days_path = [root_path + str(path) for path in days]

hours_path = []
for day in days_path:
    hours = client.listdir(day)
    for hour in hours:
        path = day + '/' + str(hour)
        # print(path)
        hours_path.append(path)

minutes_path = []
for path in hours_path:
    minutes = client.listdir(path)
    if len(minutes) != 60:
        print('[ INFO ] Incomplete minutes (less than 60) in path: ' + str(path) + ' (Count = ' + str(len(minutes)) + ')')