def get_hdfs_client(env="local"):
    master, slave = get_env(env)
    try:
        client = Client(master)
        client.list("/")
    except HdfsError:
        client = Client(slave)
    return client

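# A minimal usage sketch of get_hdfs_client above, assuming get_env() returns a pair of
# WebHDFS URLs (active namenode first, standby second). The get_env stub and the URLs
# below are placeholders, not part of the original code.
from hdfs.client import Client
from hdfs.util import HdfsError


def get_env(env="local"):
    # hypothetical stub; the original presumably reads this from deployment config
    return "http://namenode-active:50070", "http://namenode-standby:50070"


client = get_hdfs_client("local")
print(client.list("/"))
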
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)
    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs
    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs

def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)
    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs
    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs

def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip
    hdfs = Client("http://%s:50070" % nn_ip)
    assert hdfs
    root_dirs = hdfs.list("/")
    assert "tmp" in root_dirs
    assert "user" in root_dirs
    users_dirs = hdfs.list("/user")
    assert project.settings["USERNAME"] in users_dirs

def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path + '/' + date_dir):
            with client.read(dir_path + '/' + date_dir + '/' + log_file) as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)

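# A quick illustration of the log format that read() above assumes: each line is a set of
# key=value pairs joined by '&', and only the values end up in the DataFrame. The sample
# line and the column names below are made up for illustration.
import pandas as pd

sample_line = "uid=42&action=click&ts=1601020304\n"
row = [field.split('=')[1] for field in sample_line.strip().split('&')]
print(row)  # ['42', 'click', '1601020304']
print(pd.DataFrame([row], columns=["uid", "action", "ts"]))
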
def get(self, request):
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    try:
        # connect to HDFS and read the file
        cli = Client(settings.HDFS_HOST)
        fileName = cli.list(_hdfsPath)[1]
        _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
        try:
            with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                datas = f.read()
        # normalize line endings and split the text into a list of rows
        datas = re.sub("\r\n", "\n", datas)
        logger.debug(datas)
        datas = datas.strip('"').split('\n')
        content = []
        for i in datas:
            content.append(i.strip('"').split(","))
    except HdfsError:
        return Response(data={"error": "File not found or unsupported file encoding"},
                        status=status.HTTP_400_BAD_REQUEST)
    return Response(data={"data": content}, status=status.HTTP_200_OK)

def download_parquet_from_hdfs_dir(parquet_dir, local_dir, hdfs_ip, hdfs_port=50070):
    """
    Download all parquet files under an HDFS directory to a local directory.
    :param parquet_dir: HDFS directory containing the parquet files, e.g. '/data/'
    :param local_dir: local target directory, e.g. '/data_gen/'
    :param hdfs_ip: WebHDFS host
    :param hdfs_port: WebHDFS port
    :return:
    """
    import os
    from hdfs.client import Client

    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)

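# A usage sketch for download_parquet_from_hdfs_dir above; the host and both directories
# are placeholders, not paths from the original code.
download_parquet_from_hdfs_dir(
    parquet_dir='/data/events',        # hypothetical HDFS directory holding *.parquet part files
    local_dir='/tmp/events_parquet',   # hypothetical local target directory
    hdfs_ip='10.0.0.1',
    hdfs_port=50070,
)
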
def run_hdfs_test(conf: ConfigData):
    # the_date = conf.test_date()  # "20181101"
    client = Client(conf.hdfs_ip())  # e.g. "http://10.2.201.197:50070"
    # root_path = conf.unzip_dir(is_baoli)  # 'D:/DATA/UNZIP/'
    # dest_dir = conf.hdfs_dir_syb(is_baoli)
    # file_pre = conf.file_pre1()  # "t1_trxrecord_"
    # file_ext = conf.file_ext2()  # "_V2.csv"
    # client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    dat = client.list('/', status=False)
    print(dat)

def read_accesslog_from_hdfs(self):
    # the real-time log stream is flushed to a file once every five click records
    client = Client("http://localhost:50070")
    file_names = client.list("/hadoop_file")
    ss = ""
    for file_name in file_names:
        with client.read("/hadoop_file/" + file_name, encoding="utf-8") as reader:
            for line in reader:
                # skip test records
                if line.startswith("filed1"):
                    continue
                ss += line

def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
            # read the CSV from HDFS and write a local copy with the same name
            with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
                with client.read(csv_path, encoding='GB2312') as hdfs:
                    for line in hdfs:
                        local.write(line.strip('\n'))

def get_child(client: Client, path: str, f_type: int = 3):
    # f_type: 1 = file, 2 = dir, 3 = any
    a_list = []
    # path = str(pathlib.PosixPath(path).expanduser())
    is_dir = MyHdfsFile.isdir(client, path)
    if is_dir:
        names = client.list(path)
        for a_name in names:
            a_file = str(pathlib.PurePosixPath(path).joinpath(a_name))
            if f_type == 3:
                a_list.append(a_file)
            elif MyHdfsFile.is_exist(client, a_file, f_type=f_type):
                a_list.append(a_file)
    return a_list

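# get_child above depends on a MyHdfsFile helper that is not shown. Below is a minimal
# sketch of the two methods it uses, built on client.status() and assuming f_type follows
# the 1 = file, 2 = dir convention noted in the function; the helper itself is hypothetical.
import pathlib
from hdfs.client import Client


class MyHdfsFile:
    # hypothetical helper; only the pieces used by get_child are sketched here

    @staticmethod
    def isdir(client: Client, path: str) -> bool:
        st = client.status(path, strict=False)
        return bool(st) and st['type'] == 'DIRECTORY'

    @staticmethod
    def is_exist(client: Client, path: str, f_type: int = 3) -> bool:
        st = client.status(path, strict=False)
        if not st:
            return False
        if f_type == 1:
            return st['type'] == 'FILE'
        if f_type == 2:
            return st['type'] == 'DIRECTORY'
        return True
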
def generate_files(
        date,
        path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
):
    year, month, day = date.split('-')
    if date in QuerierParallel.files:
        return
    else:
        client = Client(QuerierParallel.master_hdfs_path,
                        root="/", timeout=100, session=False)
        QuerierParallel.files.update(
            {date: client.list(path % (year, month, day))})

def mv_local_to_hdfs(filename):
    '''
    Move the finished file to HDFS.
    '''
    now_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    file_index = int(now_time[11:13])
    if file_index == 0:
        file_path_all = getYesterday()
    else:
        file_path_all = now_time[0:10]
    client = Client("http://master:50070")
    if file_path_all not in client.list('/traffLog'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffLog/' + file_path_all)
    local_path = get_path_or_buf(filename)
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put ' + local_path + ' /traffLog/' + file_path_all)

def mv_local_to_hdfs():
    '''
    Move the finished files to HDFS.
    '''
    now_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    file_index = int(now_time[11:13])
    if file_index == 0:
        file_path_all = getYesterday()
    else:
        file_path_all = now_time[0:10]
    client = Client("http://master:50070")
    if file_path_all not in client.list('/traffFile'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffFile/' + file_path_all)
    local_path = '/usr/local/bro/spool/worker-1/extract_files/*'
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put ' + local_path + ' /traffFile/' + file_path_all)

class HdfsClient(object):

    def __init__(self, url=None):
        self.url = url
        self.client = Client(url=url)

    def ls(self, path):
        return self.client.list(path)

    def isFile(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == FILE
        else:
            return False

    def mkdir(self, path):
        self.client.makedirs(path, permission=777)

    def isDirectory(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == DIRECTORY
        else:
            return False

    def upload(self, localSourcePath, remoteDistPath):
        self.client.upload(remoteDistPath, localSourcePath, overwrite=True)

    def dowload(self, remoteSourcePath, localDistPath):
        self.client.download(remoteSourcePath, localDistPath, overwrite=True)

    def put(self, localSourcePath, remoteDistPath):
        with open(localSourcePath, "r") as reader, self.client.write(remoteDistPath) as writer:
            data = reader.read(FILE_SIZE)
            while data != "":
                writer.write(data)
                data = reader.read(FILE_SIZE)

    def get(self, remoteSourcePath, localDistPath):
        with self.client.read(remoteSourcePath, chunk_size=FILE_SIZE) as reader, open(
                localDistPath, "a+") as writer:
            for chunk in reader:
                writer.write(chunk)

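# The HdfsClient wrapper above relies on module-level constants that are not shown.
# Below is a minimal sketch of plausible definitions plus a usage example; the constant
# values and the namenode URL are assumptions, not taken from the original code.
TYPE = 'type'            # key of the FileStatus dict returned by client.status()
FILE = 'FILE'            # WebHDFS FileStatus type for regular files
DIRECTORY = 'DIRECTORY'  # WebHDFS FileStatus type for directories
FILE_SIZE = 64 * 1024    # chunk size in bytes used by put()/get()

hdfs = HdfsClient("http://namenode:50070")  # placeholder URL
print(hdfs.ls("/"))
print(hdfs.isDirectory("/tmp"))
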
def list_files_in_flume_directory(hdfs_address):
    '''
    Description:
        Check the files in the /flume HDFS directory.
    Parameters:
        - hdfs_address: Hadoop master node IP address
    Returns:
        The name of the (single) file in the /flume directory.
    '''
    # connect to hdfs
    client = Client('http://' + hdfs_address)
    # list all entries in the directory
    files = client.list('/flume')
    files_utf8_encode = [f.encode('utf-8') for f in files]
    if len(files_utf8_encode) > 1:
        print('Please check the data pipeline, the number of files should be 1')
    else:
        print(files_utf8_encode[0])
    return files_utf8_encode[0]

def get(self, request): """ 计算结果下载hdfs 文件 :param request: :return: """ hdfsPath = request.GET.get("hdfsPath") logger.debug("请求文件:{0}".format(hdfsPath)) localPath = os.path.join(settings.BASE_DIR, 'media', 'hdfsFile') logger.debug("本地存储路径:{0}".format(localPath)) # 链接HDFS下载文件 cli = Client(settings.HDFS_HOST) logger.debug("HDFS连接{0}".format(cli)) try: fileName = cli.list(hdfsPath)[1] # print("filename:", fileName) path = os.path.join(hdfsPath, fileName) logger.debug(path, localPath) cli.download(hdfs_path=path, local_path=localPath, overwrite=True) except HdfsError: return Response(data={"error": "文件未找到"}, status=status.HTTP_404_NOT_FOUND) return Response(data={"fileName": fileName}, status=status.HTTP_200_OK)
def get_hadoop_connection(cls, host):
    try:
        client = Client(host, root='/', timeout=10000)
        client.list('/')
    except Exception as e:
        try:
            log_handler.log.info('get query data error from hadoop 01 -----{}'.format(e))
            host = host.replace('01', '02')
            client = Client(host, root='/', timeout=10000)
            client.list('/')
        except Exception as e:
            try:
                log_handler.log.info('get query data error from hadoop 02 -----{}'.format(e))
                host = host.replace('02', '03')
                client = Client(host, root='/', timeout=10000)
                client.list('/')
            except Exception as e:
                client = None
                log_handler.log.info('get query data error from hadoop -----{}'.format(e))
    return client

""" ' a study.__init__.py module ' __author__ = 'steven' import os import time from hdfs.client import Client client = Client("http://127.0.0.1:50070", root="/", timeout=100) print(client.makedirs("/test/")) print(client.status("/test/")) print(client.list("/test/")) print(client.delete("/test/", True)) upload_filename = client.upload( "/test/" + str(int(round(time.time() * 1000))) + ".pdf", "test.pdf") print(upload_filename) download_path = os.path.join(os.path.abspath('.'), 'download/hdfs/') if not os.path.exists(download_path): os.makedirs(download_path, True) else: print(download_path, ' is existed.') print( client.download( upload_filename, download_path + str(int(round(time.time() * 1000))) + ".pdf")) print(client.delete(upload_filename)) print(client.delete(upload_filename))
def resutlApp(dict_parameters):
    # step 1: collect the dynamic parameters
    print('hello world')
    # arguments = parse_arguments(sys.argv[1:])
    # width = arguments['width'][0]
    # height = arguments['height'][0]
    # model_path = arguments['model_path'][0]
    # label_path = arguments['label_path'][0]
    # testDataset = arguments['testDataset'][0]
    print("receiving parameters...")
    print(type(dict_parameters), dict_parameters)
    print("\n" * 3)
    network = dict_parameters.get('network')
    hdfs_label_ip = dict_parameters.get('hdfslabel')
    img_src = dict_parameters.get('img_src')
    width = dict_parameters.get("width")
    height = dict_parameters.get("height")
    testDataset = dict_parameters.get("testDataset")
    model_path = dict_parameters.get("model_path")
    label_path = dict_parameters.get("label_path")
    print(network, img_src, width, height, model_path, label_path, testDataset)
    print("model path is ", model_path)

    # step 2: preprocess the test data
    X, ImgFiles = preprocessImageFolder(testDataset, width, height)

    # step 3: fetch the predicted class label names
    print('hdfs web interface address is:', hdfs_label_ip)
    client = Client(hdfs_label_ip)
    # client = hdfs.Client("http://172.10.236.21:50070")
    types = client.list(label_path)

    # step 4: load the model
    tf.logging.info("loading model...")
    print("model loading...", model_path)
    keras.backend.clear_session()
    model = load_model(model_path)
    print(model.summary())
    np.set_printoptions(precision=2, suppress=True)

    # step 5: run the prediction
    print("model predicting...")
    if 'fcn' in network:
        try:
            print("test in FCN network structure")
            classes = np.argmax(np.squeeze(model.predict(X)), axis=1)
        except Exception as e:
            print("unexpected error: {}".format(e))
    else:
        try:
            print("test in Dense network structure")
            classes = np.argmax(model.predict(X), axis=1)
        except Exception as e:
            print("unexpected error: {}".format(e))
    # print('classes: ', classes)

    # step 6: build the result to return
    back_testResult = {}
    print('types: ', types)
    print("predicted results for your folder are:\n")
    for i, index in enumerate(classes):
        key = str(ImgFiles[i])
        value = str(types[index])
        back_testResult.setdefault(key, value)
        # plt.imshow(ImgFiles[i])
        # plt.title(str(types[index]))
        # plt.show()
    print('callback type ', type(back_testResult))
    # return 'abc'
    return str(back_testResult)

def test_hdfs(self):
    client = Client('http://172.16.2.41:50070', proxy='hive', root='/')
    print(client.list('/tmp/jiajie'))
    with client.read('/tmp/jiajie/birth_names.txt', length=10) as reader:
        data = reader.read()

# set up the connection
client = Client("http://192.168.56.20:50070", root="/", session=False)

# list -- list the children of a path
# print(client.list("/"))

# status -- get detailed information about a path
# print(client.status("/", strict=True))

# makedirs -- create a directory
# print(client.makedirs("/hello"))

# rename -- rename a path
# print(client.rename("/hello", "/helloWorld"))

# delete -- delete a path
# print(client.delete("helloWorld"))

# upload -- upload data
# client.upload("/", r"C:\Users\Administrator\Desktop\斗破\斗破苍穹.txt")

# download -- download data
# client.download("/斗破苍穹.txt", r"C:\Users\Administrator\Desktop")

# read -- read a file
# with client.read("/斗破苍穹.txt", encoding='GBK') as f:
#     print(f.read())

print(client.list("/"))

    loss = percent(nnDF)
    print('loss: ', loss)
    # coefficient of determination ---- 1.0 is the best
    score = model.score(testD, testL)
    print('score: ', score)


if __name__ == '__main__':
    filepath = '/sdbadmin/hadoop/input'
    try:
        client = Client('http://192.168.111.130:50070')
    except Exception as e:
        print(e)
    dirs = client.list(filepath)
    # mirror the HDFS files locally
    print('there are %d shares' % (len(dirs)))
    '''
    try:
        for i in range(len(dirs)):
            client.download(filepath + '/' + dirs[i], '/opt/share_code_data/' + dirs[i])
    except Exception as e:
        print(e)
    '''
    min_max_scaler = preprocessing.MinMaxScaler()
    DD = pd.DataFrame([])
    for i in range(len(dirs)):
        df = pd.read_csv('/opt/share_code_data/' + dirs[i], index_col=0)
        if len(DD) == len(df) or len(DD) == 0 and len(df) != 0:
            trun = min_max_scaler.fit_transform(

def yesterday():
    return today() - datetime.timedelta(days=1)


# main entry point
if __name__ == '__main__':
    print "Monitoring HDFS......"
    yesterday_datetime_format = yesterday()
    for table in CHECK_TABLE:
        is_success = False
        has_data = False
        content = ""
        try:
            path = ROOT_DIR + table + "/" + str(yesterday_datetime_format)
            client_list = client.list(path, True)
            for i in range(0, len(client_list)):
                if (client_list[i][0].startswith('part-')) and (int(
                        client_list[i][1].get("length")) > 0):
                    has_data = True
                elif client_list[i][0].__eq__("_SUCCESS"):
                    is_success = True
        except Exception, e:
            content = "Exception info: " + str(e) + "<br>" + \
                      str("HDFS path: ") + path
        if (content == "") and (not is_success):
            content = "Exception info: " + table + " related job failed" + "<br>" + \
                      str("HDFS path: ") + path

# command line: pip install hdfs
# Permission denied: '/usr/local/anaconda3/lib/python3.7/site-packages/docopt.py'
# try another way like below and it works:
# pip install hdfs --target=/users/home/xzh216/pythonPackage
import sys
sys.path.append("/users/home/xzh216/pythonPackage/")

import hdfs
from hdfs.client import Client

client = Client("http://node0:9870/")
# echo $HADOOP_HOME
# cat /opt/hadoop/hadoop/etc/hadoop/core-site.xml ---> node0
# node0 is the namenode hostname; WebHDFS listens on 50070 for Hadoop 2.x and 9870 for 3.x
# print("hdfs:", client.list(hdfs_path="/", status=True))
fileList = client.list("/data/ghcnd/daily/")

# start from an empty DataFrame and union each daily file into it
result = spark.createDataFrame(sc.emptyRDD(), schema_Daily)
path_pre = "hdfs:///data/ghcnd/daily/"
for file in fileList:
    daily_temp = (
        spark.read.format("com.databricks.spark.csv")
        .option("header", "false")
        .option("inferSchema", "false")
        .schema(schema_Daily)
        .load(path_pre + file)
    )
    result = result.union(daily_temp)
result.rdd.getNumPartitions()  # 258

from hdfs.client import Client client = Client("http://192.168.1.197:50070", root="/", timeout=100, session=False) print client.list("/topics") # with client.read("/topics") as reader: # print reader.read()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# hdfs = HDFileSystem(host='hdfs-bizaistca.corp.microsoft.com', port=8020, user='******')
from hdfs.client import Client

hdfs_path = '/user/hadoop/fanyuguang/input/'
local_path = '.'

client = Client("hdfs-bizaistca.corp.microsoft.com:8020/", root="/", timeout=10000, session=False)
result = client.list(hdfs_path, status=False)
print(result)
# client.download(hdfs_path, local_path, overwrite=False)

class HDFSClient:

    def __init__(self, url, root=None, user=None, proxy=None, timeout=None, session=None):
        """
        Connect to HDFS.
        url: hostname or IP address of the HDFS namenode, including the port
        root: root path, used as a prefix for all HDFS paths passed to the client
        user: if given, use InsecureClient (Base Client) with this user; otherwise Client uses the default user dr.who
        proxy: user to proxy as
        timeout: connection timeout, forwarded to the request handler
        session: requests.Session instance used to issue all requests
        """
        if user:
            self.client = InsecureClient(url, user=user)
        else:
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=session)

    def list_hdfs_file(self, hdfs_path, status=False):
        """
        List the files in a directory.
        status: also return the FileStatus attributes of each file or directory
        return: a list; with status=True each element is a tuple of name and attributes
        """
        return self.client.list(hdfs_path, status=status)

    def walk_hdfs_file(self, hdfs_path, depth=0, status=False, ignore_missing=False, allow_dir_changes=False):
        """
        Depth-first traversal of the remote filesystem.
        hdfs_path: starting path. If it does not exist, an HdfsError is raised; if it points to a file, the returned generator is empty
        depth: maximum depth to explore, 0 for no limit
        status: also yield the FileStatus of each file or folder
        ignore_missing: ignore missing nested folders instead of raising an exception
        allow_dir_changes: allow changes to directory listings to affect the walk
        return: a generator with the same semantics as Python's os.walk
        """
        return self.client.walk(hdfs_path, depth=depth, status=status,
                                ignore_missing=ignore_missing, allow_dir_changes=allow_dir_changes)

    def delete_hdfs_file(self, hdfs_path, recursive=False, skip_trash=False):
        """
        Delete a file.
        recursive: delete files or directories recursively; by default this method raises an HdfsError when trying to delete a non-empty directory
        skip_trash: when False, deleted paths are moved to the appropriate trash folder instead of being removed
        return: True if the deletion succeeded, False if no file or directory previously existed at hdfs_path
        """
        return self.client.delete(hdfs_path, recursive=recursive, skip_trash=skip_trash)

    def download_hdfs_file(self, hdfs_path, local_path, overwrite=True, n_threads=1, temp_dir=None, **kwargs):
        """
        Download a file.
        hdfs_path: path of the file or folder to download from HDFS; for a folder, all files below it are downloaded
        local_path: local path; if it already exists and is a directory, the files are downloaded into it
        overwrite: overwrite any existing file or directory
        n_threads: number of threads used for parallelization; 0 (or a negative value) uses as many threads as there are files
        temp_dir: directory the files are first downloaded to when overwrite=True and the final destination already exists; swapped in once the download succeeds
        **kwargs: keyword arguments forwarded to read(); if no chunk_size is passed, a default of 64 kB is used
        return: on success, the local download path
        """
        res = self.client.download(hdfs_path, local_path, overwrite=overwrite,
                                   n_threads=n_threads, temp_dir=temp_dir, **kwargs)
        return res

    def upload_hdfs_file(self, hdfs_path, local_path, n_threads=1, temp_dir=None,
                         chunk_size=65536, progress=None, cleanup=True, **kwargs):
        """
        Upload a file.
        hdfs_path: target HDFS path; if it already exists and is a directory, the files are uploaded into it
        local_path: local path of the file or folder; for a folder, all files inside it are uploaded (note that this means folders without files are not created remotely)
        cleanup: delete any uploaded files if an error occurs during the upload
        return: status code, remote upload path, error message
        """
        try:
            res = self.client.upload(hdfs_path, local_path, n_threads=n_threads, temp_dir=temp_dir,
                                     chunk_size=chunk_size, progress=progress, cleanup=cleanup,
                                     overwrite=True)
            return 0, res, ''
        except HdfsError as e:
            return 1, '', str(e)

    def makedirs(self, hdfs_path, permission=None):
        """
        Create a directory, recursively if needed.
        permission: octal permission to set on the newly created directories; only applied to directories that do not yet exist
        return: None
        """
        self.client.makedirs(hdfs_path, permission=permission)

    def parts(self, hdfs_path, parts=None, status=False):
        """
        hdfs_path: remote path. The directory should contain at most one part file per partition (otherwise one is chosen arbitrarily)
        parts: list of part file numbers, or total number of part files to select. If a number, that many partitions are picked at random; by default all part files are returned. If parts is a list and one of the parts is missing or too many samples are demanded, an HdfsError is raised
        status: also return the FileStatus of each file
        return: a dictionary of the part files corresponding to the path
        """
        return self.client.parts(hdfs_path, parts=parts, status=status)

    def read_hdfs_file(self, **kwds):
        """
        Read the contents of a file. This method must be used inside a with block so the connection gets closed:
        >>> with client.read('foo') as reader:
        >>>     content = reader.read()
        hdfs_path: HDFS path
        offset: starting byte position
        length: number of bytes to process; None reads the whole file
        buffer_size: buffer size in bytes used to transfer the data; defaults to the value set in the HDFS configuration
        encoding: encoding used to decode the response; raw data is returned by default
        chunk_size: if set to a positive number, the context manager returns a generator yielding every chunk_size bytes instead of a file-like object (unless a delimiter is also set)
        delimiter: if set, the context manager returns a generator yielding each time the delimiter is encountered; requires an encoding
        progress: callback used to track progress, called every chunk_size bytes (not available if the chunk size is unset). It is passed two arguments, the path of the file being read and the number of bytes transferred so far; on completion it is called once more with -1 as the second argument
        """
        return self.client.read(**kwds)

    def write_hdfs_file(self, hdfs_path, data=None, overwrite=False, permission=None, blocksize=None,
                        replication=None, buffersize=None, append=False, encoding=None):
        """
        Create a file on HDFS.
        data: contents to write. Can be a string, a generator or a file object. The last two options allow streaming uploads (i.e. without loading the whole contents into memory). If None, this method returns a file-like object and should be called inside a with block (see the example below)
        permission: octal permission to set on the newly created file
        append: append to the file instead of creating a new one
        encoding: encoding used to serialize the written data

        >>> from json import dump, dumps
        >>> records = [
        >>>     {'name': 'foo', 'weight': 1},
        >>>     {'name': 'bar', 'weight': 2},
        >>> ]
        >>> # As a context manager:
        >>> with client.write('data/records.jsonl', encoding='utf-8') as writer:
        >>>     dump(records, writer)
        >>> # Or, passing in a generator directly:
        >>> client.write('data/records.jsonl', data=dumps(records), encoding='utf-8')
        """
        self.client.write(hdfs_path, data=data, overwrite=overwrite, permission=permission,
                          blocksize=blocksize, replication=replication, buffersize=buffersize,
                          append=append, encoding=encoding)

    def rename_or_move(self, hdfs_src_path, hdfs_dst_path):
        """
        Move a file or directory.
        hdfs_src_path: source path
        hdfs_dst_path: destination path. If it already exists and is a directory, the source is moved into it; if it exists and is a file, or if a parent destination directory is missing, an HdfsError is raised
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)

    def set_owner(self, hdfs_path, owner=None, group=None):
        """
        Change the owner of a file. At least one of owner and group must be specified.
        owner: optional, new owner of the file
        group: optional, new owning group of the file
        """
        self.client.set_owner(hdfs_path, owner=owner, group=group)

    def set_permission(self, hdfs_path, permission):
        """
        Change the permissions of a file.
        permission: new octal permission string for the file
        """
        self.client.set_permission(hdfs_path, permission)

    def set_replication(self, hdfs_path, replication):
        """
        Set the replication of a file.
        replication: number of replicas
        """
        self.client.set_replication(hdfs_path, replication)

    def set_times(self, hdfs_path, access_time=None, modification_time=None):
        """
        Change the timestamps of a file.
        """
        self.client.set_times(hdfs_path, access_time=access_time, modification_time=modification_time)

    def status_hdfs_file(self, hdfs_path, strict=True):
        """
        Get the FileStatus of a file.
        strict: if False, return None instead of raising an exception when the path does not exist
        """
        return self.client.status(hdfs_path, strict=strict)

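# A brief usage sketch of the HDFSClient wrapper above. The imports match the hdfs
# (HdfsCLI) package it builds on, while the namenode URL, user and paths are placeholders.
from hdfs.client import Client, InsecureClient
from hdfs.util import HdfsError

hdfs = HDFSClient("http://namenode:50070", user="hadoop")
print(hdfs.list_hdfs_file("/tmp"))
code, remote_path, err = hdfs.upload_hdfs_file("/tmp/demo", "./demo.csv")
if code == 0:
    print("uploaded to", remote_path)
    hdfs.download_hdfs_file("/tmp/demo/demo.csv", "./demo_copy.csv")
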
from hdfs.client import Client


def con(host):
    conn = Client(host)
    return conn


def ls(host, remote_path):
    conn = con(host)
    list_path = conn.list(remote_path)
    return list_path


def down(host, remote_path, local_path):
    conn = con(host)
    conn.download(remote_path, local_path)


if __name__ == "__main__":
    hdfs_host = "http://192.168.175.231:9870/"
    client = Client(hdfs_host)
    print(client.list("/test/parquet/ns=37082600014020001005/d=20200801"))
    print(len(client.list("/test/parquet/ns=37082600014020001005/d=20200801")))
    # client.download('/test/parquet/ns=37082600014020001005',
    #                 'D:\\PycharmWorkSpace\\demo\\hdfs')
    # local_file_path = "D:\\PycharmWorkSpace\\demo\\hdfs\\ns=37082600014020001005"
    # columns = ['id', 't', 'v', 'vt', 'c', 'u', 's', 'dqsj']
    # df = pa.read_parquet(local_file_path, engine='auto', columns=columns)
    # print(df['dqsj'])

from hdfs.client import Client
import os, tarfile

client = Client("http://10.239.1.57:50070")

# list() returns the entries under the given HDFS path; it takes two arguments
print("contents of the HDFS directory:", client.list(hdfs_path="/test1", status=True))

# read() reads a file, similar to `hdfs dfs -cat hdfs_path`; parameters:
#   hdfs_path    HDFS path
#   offset       position to start reading from
#   length       number of bytes to read
#   buffer_size  buffer size; if unset, the HDFS default (100 MB) is used -- for large files a big enough buffer speeds up sort and shuffle
#   encoding     encoding to use
#   chunk_size   yield chunks of this many bytes; must be used together with encoding
#   delimiter    delimiter to split on; must be set together with encoding
#   progress     progress callback, invoked once per chunk_size read
print(
    client.read(
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv"))

# download
print(
    "download result for part.csv:",
    client.download(
        hdfs_path="/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        local_path="/home/yyj2020/test",
        overwrite=True))

def test_request(self):
    client = Client(url="http://10.0.137.24:50070")
    print client.list("/user/cabbage", status=True)
    print client.status("/user/cabbage")

"--hdfsManager", help="HDFS manager", required=True) args = parser.parse_args() sc = SparkContext(appName="EFFECT-LOAD-TO-ES") conf = SparkConf() hdfs_client = Client(args.hdfsManager) hdfsRelativeFilname = args.input if hdfsRelativeFilname.startswith("hdfs://"): idx = hdfsRelativeFilname.find("/", 8) if idx != -1: hdfsRelativeFilname = hdfsRelativeFilname[idx:] if args.doctype is None: document_types = hdfs_client.list(args.input, False) else: document_types = args.doctype.split(",") create_index = True for doc_type in document_types: doc_type = doc_type.strip() input_rdd = sc.sequenceFile(args.input + "/" + doc_type) #.partitionBy(args.partitions) if doc_type == 'topic' or doc_type == 'post': es_write_conf = { "es.nodes": args.host, "es.port": args.port, "es.nodes.discover": "false",