def get(self, request):
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    try:
        # Connect to HDFS and read the file
        cli = Client(settings.HDFS_HOST)
        fileName = cli.list(_hdfsPath)[1]
        _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
        try:
            with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                datas = f.read()
        # Normalize line endings, then split the string into rows and columns
        datas = re.sub("\r\n", "\n", datas)  # re.sub returns a new string; assign it back
        logger.debug(datas)
        datas = datas.strip('"').split('\n')
        content = []
        for i in datas:
            content.append(i.strip('"').split(","))
    except HdfsError:
        return Response(data={"error": "File not found or the file encoding is not supported"},
                        status=status.HTTP_400_BAD_REQUEST)
    return Response(data={"data": content}, status=status.HTTP_200_OK)
def get(self, request):
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    obj = DataSource.objects.get(format_filename=_hdfsName)
    try:
        # Connect to HDFS and read the file
        cli = Client(settings.HDFS_HOST)
        try:
            with cli.read(_hdfsPath, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, encoding="utf8") as f:
                datas = f.read()
    except HdfsError:
        return Response(data={"error": "File not found or the file encoding is not supported"},
                        status=status.HTTP_400_BAD_REQUEST)
    response = HttpResponse(content_type='text/csv')  # 'csv/plain' is not a valid MIME type
    response['Content-Disposition'] = 'attachment; filename={0}'.format(obj.file_name)
    response.write(datas)
    return response
def get_data(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split() for line in reader]
    print("data", data[0:2])
    return data
def download_parquet_from_hdfs_dir(parquet_dir, local_dir, hdfs_ip, hdfs_port=50070):
    """
    Batch-download the parquet files in an HDFS directory to local_dir.
    :param parquet_dir: HDFS directory containing the parquet files, e.g. '/data'
    :param local_dir: local target directory, e.g. '/data_gen'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    import os
    from hdfs.client import Client

    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)
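# A hypothetical invocation of the helper above; the namenode IP and the directories are
# placeholders, not values from the original code.
if __name__ == '__main__':
    download_parquet_from_hdfs_dir(
        parquet_dir='/data/events',     # HDFS directory holding part-*.parquet files
        local_dir='./data_gen/events',  # created on the first download if it does not exist
        hdfs_ip='192.168.0.1',
        hdfs_port=50070,                # default WebHDFS port
    )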
def get_data_hdfs(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split(',') for line in reader]
    print("data", data[0:5])
    df = pd.DataFrame(data[1:], columns=data[0])
    return df
def hdfs_file2points(path):
    client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
    points = []
    # Decode as utf-8 so the lines can be stripped and split as text
    with client.read(path, encoding='utf-8') as f:
        for line in f:
            info = line.strip('\n').split('\t')
            points.append([float(info[0]), float(info[1])])
    return points
def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
            # Read the CSV from HDFS and write it to a local file of the same name
            with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
                with client.read(csv_path, encoding='GB2312') as hdfs:
                    for line in hdfs:
                        local.write(line.strip('\n'))
def read_accesslog_from_hdfs(self):
    # The real-time log stream is flushed to storage once every 5 click records
    client = Client("http://localhost:50070")
    file_names = client.list("/hadoop_file")
    ss = ""
    for file_name in file_names:
        with client.read("/hadoop_file/" + file_name, encoding="utf-8") as reader:
            for line in reader:
                # Skip test data
                if line.startswith("filed1"):
                    continue
                ss += line
def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path + '/' + date_dir):
            # Decode as utf-8 so the lines can be stripped and split as text
            with client.read(dir_path + '/' + date_dir + '/' + log_file, encoding='utf-8') as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)
def read_corpus():
    qList = []
    # Keyword list for the questions
    qList_kw = []
    aList = []
    lines = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        for line in reader:
            lines.append(line.strip())
    for t in lines:
        # Each line of q_a.csv holds "question,answer"
        q, a = t.split(',', 1)
        qList.append(q)
        qList_kw.append(seg.cut(q))
        aList.append(a)
    return qList_kw, qList, aList
class HdfsClient(object):

    def __init__(self, url=None):
        self.url = url
        self.client = Client(url=url)

    def ls(self, path):
        return self.client.list(path)

    def isFile(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == FILE
        else:
            return False

    def mkdir(self, path):
        self.client.makedirs(path, permission=777)

    def isDirectory(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == DIRECTORY
        else:
            return False

    def upload(self, localSourcePath, remoteDistPath):
        self.client.upload(remoteDistPath, localSourcePath, overwrite=True)

    def download(self, remoteSourcePath, localDistPath):
        self.client.download(remoteSourcePath, localDistPath, overwrite=True)

    def put(self, localSourcePath, remoteDistPath):
        with open(localSourcePath, "r") as reader, self.client.write(remoteDistPath) as writer:
            data = reader.read(FILE_SIZE)
            while data != "":
                writer.write(data)
                data = reader.read(FILE_SIZE)

    def get(self, remoteSourcePath, localDistPath):
        # chunk_size makes the reader yield raw byte chunks, so write in binary mode
        with self.client.read(remoteSourcePath, chunk_size=FILE_SIZE) as reader, \
                open(localDistPath, "ab") as writer:
            for chunk in reader:
                writer.write(chunk)
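# The wrapper above relies on a few module-level constants that are not shown in the snippet;
# the definitions below are an assumption (matching the FileStatus fields returned by WebHDFS),
# followed by a short usage sketch with placeholder paths.
TYPE = 'type'            # key of the file-type field in the FileStatus dict
FILE = 'FILE'            # FileStatus type value for regular files
DIRECTORY = 'DIRECTORY'  # FileStatus type value for directories
FILE_SIZE = 64 * 1024    # transfer chunk size in bytes used by put()/get()

hdfs = HdfsClient("http://localhost:50070")
hdfs.mkdir("/tmp/demo")
hdfs.upload("./notes.txt", "/tmp/demo")                   # delegates to Client.upload(overwrite=True)
print(hdfs.isFile("/tmp/demo/notes.txt"))                 # True once the upload has finished
hdfs.download("/tmp/demo/notes.txt", "./notes_copy.txt")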
def download_parquet_from_hdfs(parquet_path, local_path, hdfs_ip, hdfs_port=50070):
    """
    Download a parquet file from HDFS to local_path.
    :param parquet_path: '/data/a.parquet'
    :param local_path: '/data_gen/b.parquet'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    from hdfs.client import Client

    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    with client.read(parquet_path) as reader:
        data = reader.read()
    with open(local_path, 'wb') as f:
        f.write(data)
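# download_parquet_from_hdfs() reads the whole file into memory before writing it out.
# A streaming sketch under the same Client API (an alternative, not part of the original code):
# with chunk_size set, client.read() yields byte chunks, so large files never have to fit in
# memory at once.
from hdfs.client import Client

def download_parquet_from_hdfs_streaming(parquet_path, local_path, hdfs_ip, hdfs_port=50070,
                                         chunk_size=8 * 1024 * 1024):
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    with client.read(parquet_path, chunk_size=chunk_size) as chunks, open(local_path, 'wb') as f:
        for chunk in chunks:
            f.write(chunk)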
import pandas as pd

client = Client("http://10.103.0.11:9870", root='/')

files = []
path = '/lr/pos'
dirt = []
for a, b, c in client.walk(path):
    root = a
    dirt.append(b)
    files = c

col = ['id', 'mmsi', 'latitude', 'longitude', 'course', 'speed', 'lasttm', 'day']

for file in dirt[0]:
    print(file)
    res = []
    with client.read(path + '/' + file + '/' + 'part-00000') as read:
        for line in read:
            data = str(line).split('[')[1].split(']')[0]
            l1 = data.split(',')
            l1[2] = float(l1[2])
            l1[3] = float(l1[3])
            l1[4] = float(l1[4])
            l1[5] = float(l1[5])
            l1[6] = int(l1[6])
            res.append(l1)
    df = pd.DataFrame(res, columns=col)
    name = str(file).split('.')[0]
    df.to_excel('../posData/' + name + '.xlsx')
import os
import json
import time
import re

import hdfs
from hdfs.client import Client

client = Client("http://IP:50070")
data = time.strftime("%Y-%m-%d")
filepath = "/nginx_log/www_test1-%s.log" % data
dirname = "/nginx_log/"
TARGETPATH = r'D:\targeFile'

with client.read(filepath, encoding='utf-8') as f:
    for l in f:
        d = json.loads(l)
        lt = []
        s = d.get('status')
        if s == "200":
            for y in d:
                if type(d[y]) == dict:
                    for k in d[y]:
                        lt.append(d[y][k])
                elif type(d[y]) == list:
                    for i in d[y]:
                        lt.append(i)
def test_hdfs(self):
    client = Client('http://172.16.2.41:50070', proxy='hive', root='/')
    print(client.list('/tmp/jiajie'))
    with client.read('/tmp/jiajie/birth_names.txt', length=10) as reader:
        data = reader.read()
from hdfs.client import Client
import time

client = Client("http://10.10.10.103:50070")
with client.read('/user/lifeng/test/honeybee/hello') as fs:
    content = fs.read()
class HDFSClient:

    def __init__(self, url, root=None, user=None, proxy=None, timeout=None, session=None):
        """
        Connect to HDFS.
        url: hostname or IP address and port of the HDFS namenode
        root: root path, used as a prefix for all HDFS paths passed to the client
        user: if given, use InsecureClient (a Client subclass) with this user; otherwise the
              Client uses the default user dr.who
        proxy: user to proxy as
        timeout: connection timeout, forwarded to the request handler
        session: requests.Session instance used to issue all requests
        """
        if user:
            self.client = InsecureClient(url, user=user)
        else:
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=session)

    def list_hdfs_file(self, hdfs_path, status=False):
        """
        List the files in a directory.
        status: also return the FileStatus attributes of each file or directory
        return: a list; with status=True each element is a tuple of name and FileStatus
        """
        return self.client.list(hdfs_path, status=status)

    def walk_hdfs_file(self, hdfs_path, depth=0, status=False, ignore_missing=False, allow_dir_changes=False):
        """
        Depth-first traversal of the remote filesystem.
        hdfs_path: starting path. Raises HdfsError if the path does not exist; if it points
                   to a file, the returned generator is empty
        depth: maximum depth to explore, 0 for no limit
        status: also return the FileStatus of each file or folder
        ignore_missing: ignore missing nested folders instead of raising an exception
        allow_dir_changes: allow changes to the directory listings to affect the traversal
        return: a generator, analogous to Python's os.walk
        """
        return self.client.walk(hdfs_path, depth=depth, status=status,
                                ignore_missing=ignore_missing,
                                allow_dir_changes=allow_dir_changes)

    def delete_hdfs_file(self, hdfs_path, recursive=False, skip_trash=False):
        """
        Delete a file.
        recursive: delete files or directories recursively; by default this method raises
                   HdfsError when trying to delete a non-empty directory
        skip_trash: when False, deleted paths are moved to the trash folder instead of being removed
        return: True if the deletion succeeded, False if no file or directory existed at hdfs_path
        """
        return self.client.delete(hdfs_path, recursive=recursive, skip_trash=skip_trash)

    def download_hdfs_file(self, hdfs_path, local_path, overwrite=True, n_threads=1, temp_dir=None, **kwargs):
        """
        Download a file.
        hdfs_path: path of the file or folder to download from HDFS; for a folder, all files
                   under it are downloaded
        local_path: local path; if it already exists and is a directory, the files are
                    downloaded into it
        overwrite: overwrite any existing file or directory
        n_threads: number of threads used for parallelization; 0 (or a negative value) uses
                   as many threads as there are files
        temp_dir: directory the files are first downloaded to when overwrite=True and the
                  final target path already exists; it is swapped in once the download succeeds
        **kwargs: keyword arguments forwarded to read(); if no chunk_size is passed, a
                  default of 64 kB is used
        return: the local download path on success
        """
        return self.client.download(hdfs_path, local_path, overwrite=overwrite,
                                    n_threads=n_threads, temp_dir=temp_dir, **kwargs)

    def upload_hdfs_file(self, hdfs_path, local_path, n_threads=1, temp_dir=None,
                         chunk_size=65536, progress=None, cleanup=True, **kwargs):
        """
        Upload a file.
        hdfs_path: target HDFS path; if it already exists and is a directory, the files are
                   uploaded into it
        local_path: local path of the file or folder; for a folder, all files inside it are
                    uploaded (note that this means folders without files are not created remotely)
        cleanup: delete any uploaded files if an error occurs during the upload
        return: a tuple of status code, remote upload path, and error message
        """
        try:
            res = self.client.upload(hdfs_path, local_path, n_threads=n_threads,
                                     temp_dir=temp_dir, chunk_size=chunk_size,
                                     progress=progress, cleanup=cleanup, overwrite=True)
            return 0, res, ''
        except HdfsError as e:
            return 1, '', str(e)

    def makedirs(self, hdfs_path, permission=None):
        """
        Create a directory, recursively if needed.
        permission: octal permission to set on the newly created directories; only applied
                    to directories that do not exist yet
        return: None
        """
        self.client.makedirs(hdfs_path, permission=permission)

    def parts(self, hdfs_path, parts=None, status=False):
        """
        hdfs_path: remote path. The directory should contain at most one part file per
                   partition (otherwise one is chosen arbitrarily)
        parts: list of part-file numbers or total number of part files to select. If a number,
               that many partitions are picked at random; by default all part files are
               returned. If parts is a list and one of the parts is missing or too many
               samples are demanded, an HdfsError is raised
        status: also return the FileStatus of each file
        return: a dictionary of the part files corresponding to the path
        """
        return self.client.parts(hdfs_path, parts=parts, status=status)

    def read_hdfs_file(self, **kwds):
        """
        Read the contents of a file. This method must be used inside a with block so the
        connection gets closed:

        >>> with client.read('foo') as reader:
        >>>     content = reader.read()

        hdfs_path: HDFS path
        offset: starting byte position
        length: number of bytes to process; None reads the whole file
        buffer_size: size (in bytes) of the buffer used to transfer the data; defaults to the
                     value set in the HDFS configuration
        encoding: encoding used to decode the request; by default the raw data is returned
        chunk_size: if set to a positive number, the context manager returns a generator
                    yielding every chunk_size bytes instead of a file-like object (unless a
                    delimiter is also set)
        delimiter: if set, the context manager returns a generator yielding each time the
                   delimiter is encountered; this parameter requires an encoding
        progress: callback function to track progress, called every chunk_size bytes (not
                  available if no chunk size is specified). It is passed two arguments, the
                  path of the file being read and the number of bytes transferred so far;
                  on completion it is called once more with -1 as the second argument
        """
        return self.client.read(**kwds)

    def write_hdfs_file(self, hdfs_path, data=None, overwrite=False, permission=None,
                        blocksize=None, replication=None, buffersize=None, append=False,
                        encoding=None):
        """
        Create a file on HDFS.
        data: contents of the file to write. Can be a string, a generator or a file object.
              The last two options allow streaming uploads (i.e. without loading the whole
              contents into memory). If None, this method returns a file-like object that
              should be used inside a with block (see the example below)
        permission: octal permission to set on the newly created file
        append: append to the file instead of creating a new one
        encoding: encoding used to serialize the data that is written

        >>> from json import dump, dumps
        >>> records = [
        >>>     {'name': 'foo', 'weight': 1},
        >>>     {'name': 'bar', 'weight': 2},
        >>> ]
        >>> # As a context manager:
        >>> with client.write('data/records.jsonl', encoding='utf-8') as writer:
        >>>     dump(records, writer)
        >>> # Or, passing in a generator directly:
        >>> client.write('data/records.jsonl', data=dumps(records), encoding='utf-8')
        """
        return self.client.write(hdfs_path, data=data, overwrite=overwrite,
                                 permission=permission, blocksize=blocksize,
                                 replication=replication, buffersize=buffersize,
                                 append=append, encoding=encoding)

    def rename_or_move(self, hdfs_src_path, hdfs_dst_path):
        """
        Move a file or directory.
        hdfs_src_path: source path
        hdfs_dst_path: destination path. If the path already exists and is a directory, the
                       source is moved into it. If the path exists and is a file, or if the
                       parent destination directory is missing, an HdfsError is raised
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)

    def set_owner(self, hdfs_path, owner=None, group=None):
        """
        Change the owner of a file. At least one of owner and group must be given.
        owner: optional, new owner of the file
        group: optional, new owning group of the file
        """
        self.client.set_owner(hdfs_path, owner=owner, group=group)

    def set_permission(self, hdfs_path, permission):
        """
        Change file permissions.
        permission: new octal permission string for the file
        """
        self.client.set_permission(hdfs_path, permission)

    def set_replication(self, hdfs_path, replication):
        """
        Set the replication of a file.
        replication: number of replicas
        """
        self.client.set_replication(hdfs_path, replication)

    def set_times(self, hdfs_path, access_time=None, modification_time=None):
        """
        Change the timestamps of a file.
        """
        self.client.set_times(hdfs_path, access_time=access_time, modification_time=modification_time)

    def status_hdfs_file(self, hdfs_path, strict=True):
        """
        Get the FileStatus of a file.
        strict: if False, return None instead of raising an exception when the path does not exist
        """
        return self.client.status(hdfs_path, strict=strict)
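# A minimal usage sketch of the HDFSClient wrapper above; the namenode address and the
# /tmp/demo paths are placeholders, not values taken from the original code.
hdfs_client = HDFSClient("http://namenode:50070")

# Upload a local file, then list the directory and read the file back.
code, remote_path, err = hdfs_client.upload_hdfs_file("/tmp/demo/sample.csv", "./sample.csv")
if code == 0:
    print(hdfs_client.list_hdfs_file("/tmp/demo", status=True))
    # read_hdfs_file() forwards its keyword arguments to hdfs.Client.read(), so it has to be
    # used as a context manager.
    with hdfs_client.read_hdfs_file(hdfs_path="/tmp/demo/sample.csv", encoding="utf-8") as reader:
        print(reader.read())
else:
    print("upload failed:", err)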
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from hdfs.client import Client


# Read an HDFS file and return its lines as a list
def read_hdfs_file(client, filename):
    lines = []
    with client.read(filename, encoding='utf-8', delimiter='\n') as reader:
        for line in reader:
            lines.append(line.strip())
    return lines


if __name__ == '__main__':
    client = Client("http://127.0.0.1:50070/")
    print("Directories under /user:", client.list(hdfs_path="/user", status=True))
    with client.read("/user/hive/warehouse/test.db/t_tmp1/tmp1.txt", length=200, encoding='utf-8') as obj:
        for i in obj:
            print(i)
def Convert_file(self, p_srcfileType, p_srcfilename, p_dstfileType, p_dstfilename):
    try:
        if p_srcfileType.upper() == "MEM":
            m_srcFSType = "MEM"
            if self.g_MemoryFSHandler is None:
                self.g_MemoryFSHandler = fs.open_fs('mem://')
                m_srcFS = self.g_MemoryFSHandler
            else:
                m_srcFS = self.g_MemoryFSHandler
            m_srcFileName = p_srcfilename
        elif p_srcfileType.upper() == "FS":
            m_srcFSType = "FS"
            m_srcFS = fs.open_fs('./')
            m_srcFileName = p_srcfilename
        elif p_srcfileType.upper() == "HDFS":
            m_srcFSType = "HDFS"
            m_srcFullFileName = p_srcfilename
            m_Protocal = m_srcFullFileName.split("://")[0]
            m_NodePort = m_srcFullFileName[len(m_Protocal) + 3:].split("/")[0]
            m_WebFSURL = m_Protocal + "://" + m_NodePort
            m_WebFSDir, m_srcFileName = os.path.split(m_srcFullFileName[len(m_WebFSURL):])
            m_srcFS = Client(m_WebFSURL, m_WebFSDir, proxy=None, session=None)
        else:
            m_srcFS = None
            m_srcFileName = None
            m_srcFSType = "Not Supported"

        if p_dstfileType.upper() == "MEM":
            m_dstFSType = "MEM"
            if self.g_MemoryFSHandler is None:
                self.g_MemoryFSHandler = fs.open_fs('mem://')
                m_dstFS = self.g_MemoryFSHandler
            else:
                m_dstFS = self.g_MemoryFSHandler
            m_dstFileName = p_dstfilename
        elif p_dstfileType.upper() == "FS":
            m_dstFSType = "FS"
            m_dstFS = fs.open_fs('./')
            m_dstFileName = p_dstfilename
        elif p_dstfileType.upper() == "HDFS":
            m_dstFSType = "HDFS"
            m_dstFullFileName = p_dstfilename
            m_Protocal = m_dstFullFileName.split("://")[0]
            m_NodePort = m_dstFullFileName[len(m_Protocal) + 3:].split("/")[0]
            m_WebFSURL = m_Protocal + "://" + m_NodePort
            m_WebFSDir, m_dstFileName = os.path.split(m_dstFullFileName[len(m_WebFSURL):])
            m_dstFS = Client(m_WebFSURL, m_WebFSDir, proxy=None, session=None)
        else:
            m_dstFS = None
            m_dstFileName = None
            m_dstFSType = "Not Supported"  # must match the check below

        if m_srcFSType == "Not Supported" or m_dstFSType == "Not Supported":
            raise SQLCliException("Not supported convert. From [" + p_srcfileType + "] to [" + p_dstfileType + "]")

        if m_srcFSType in ('MEM', 'FS') and m_dstFSType in ('MEM', 'FS'):
            with m_srcFS.openbin(m_srcFileName, "r") as m_reader, m_dstFS.openbin(m_dstFileName, "w") as m_writer:
                while True:
                    m_Contents = m_reader.read(8192)
                    if len(m_Contents) == 0:
                        break
                    m_writer.write(m_Contents)
        if m_srcFSType == "HDFS" and m_dstFSType in ('MEM', 'FS'):
            # hdfs Client.read() takes no mode argument; its second parameter is the offset
            with m_srcFS.read(m_srcFileName) as m_reader, m_dstFS.openbin(m_dstFileName, "w") as m_writer:
                while True:
                    m_Contents = m_reader.read(8192)
                    if len(m_Contents) == 0:
                        break
                    m_writer.write(m_Contents)
        # For writes to HDFS, commit every 80 MB to avoid running out of memory
        if m_srcFSType in ('MEM', 'FS') and m_dstFSType == "HDFS":
            bHeaderWrite = True
            with m_srcFS.openbin(m_srcFileName, "r") as m_reader:
                while True:
                    m_Contents = m_reader.read(8192 * 10240)
                    if len(m_Contents) == 0:
                        break
                    if bHeaderWrite:
                        with m_dstFS.write(m_dstFileName, overwrite=True) as m_writer:
                            m_writer.write(m_Contents)
                        bHeaderWrite = False
                    else:
                        with m_dstFS.write(m_dstFileName, append=True) as m_writer:
                            m_writer.write(m_Contents)
        if m_srcFSType == "HDFS" and m_dstFSType == "HDFS":
            bHeaderWrite = True
            with m_srcFS.read(m_srcFileName) as m_reader:
                while True:
                    m_Contents = m_reader.read(8192 * 10240)
                    if len(m_Contents) == 0:
                        break
                    if bHeaderWrite:
                        with m_dstFS.write(m_dstFileName, overwrite=True) as m_writer:
                            m_writer.write(m_Contents)
                        bHeaderWrite = False
                    else:
                        with m_dstFS.write(m_dstFileName, append=True) as m_writer:
                            m_writer.write(m_Contents)
    except HdfsError as he:
        # HdfsError carries the whole stack trace, so by default only report its first line
        if "SQLCLI_DEBUG" in os.environ:
            raise SQLCliException(he.message)
        else:
            raise SQLCliException(he.message.split('\n')[0])
lines = []  # accumulates the parsed rows
lines1 = []
lines2 = []
lines3 = []
lines4 = []
client = Client("http://222.27.166.215:50070")
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
g = 0
h = 0

############ bar chart ###############
with client.read("/home/spark-test/picture_data/part-00000", encoding='utf-8') as reader:
    for line in reader:
        line = line.replace("'", "")
        line = line.replace("(", "")
        line = line.replace(")", "")
        lines.append(line.split(","))

df = pd.DataFrame(data=lines)
plt.figure(figsize=(10, 15))
df[1] = df[1].astype(int)
for i in range(0, len(df)):
    if float(df.iloc[i][0]) < 4:
        g = g + df.iloc[i][1]
    if float(df.iloc[i][0]) >= 4 and float(df.iloc[i][0]) < 5:
        a = a + df.iloc[i][1]
    if float(df.iloc[i][0]) >= 5 and float(df.iloc[i][0]) < 6:
        b = b + int(df.iloc[i][1])
from component.spark.ProcessDriver import ProcessDriver
from hdfs.client import Client
import xml.etree.ElementTree as ET

client = Client("http://172.18.130.100:50070")
with client.read("/liupei/test/template.xml") as fs:
    list = []
    key = ""
    value = ""
    # tree = ET.parse("/home/liupei/test/template.xml")
    tree = ET.parse(fs)
    root = tree.getroot()
    appName = root.attrib["appName"]
    # print(appName)
    for childs in root:
        map = {}
        for child in childs:
            if child.tag == "key":
                key = child.text
            elif child.tag == "value":
                value = child.text
        map[key] = value
        list.append(map)
    # print(list)
    pd = ProcessDriver(appName, list)
    pd.start()
from ProcessDriver import ProcessDriver
from hdfs.client import Client
import xml.etree.ElementTree as ET
import sys

client = Client(sys.argv[1])
with client.read(sys.argv[2]) as fs:
    list = []
    key = ""
    value = ""
    tree = ET.parse(fs)
    root = tree.getroot()
    appName = root.attrib["appName"]
    for childs in root:
        map = {}
        for child in childs:
            if child.tag == "key":
                key = child.text
            elif child.tag == "value":
                value = child.text
        map[key] = value
        list.append(map)
    type = list[0]["type"]
    pd = ProcessDriver(appName, list)
    if type == "core":
        pd.startCore()
    elif type == "sql":
        pd.startSQL()
    elif type == "gpsql":
        pd.startgpSQL()
from hdfs.client import Client

client = Client("http://host6.cloud.sinocbd.com:50070/")  # 50070 is Hadoop's default namenode web port
dir(client)
# Methods used here:
# walk(): like os.walk, yields (path, directory names, file names) tuples, one level per iteration.
# read(): like file.read; according to the official docs, client.read must be used inside a with block:
# path = []
# for i in client.walk('/tempfiles/temp', depth=1):
#     for item in i:
#         path.append(item)
#         print(item)
# print(path)
with client.read('/tempfiles/1.csv', encoding='gbk') as fs:
    content = fs.read()
    print(content)
# list() returns the information of all files under the given HDFS path; it takes two arguments
print("Directories in HDFS:", client.list(hdfs_path="/test1", status=True))

# read() reads a file, similar to `hdfs dfs -cat hdfs_path`. Its parameters:
#   hdfs_path    HDFS path
#   offset       position to start reading from
#   length       number of bytes to read
#   buffer_size  transfer buffer size; if unset, HDFS's default (100 MB) is used -- for large
#                files a sufficiently large buffer speeds up sort and shuffle
#   encoding     encoding to decode with
#   chunk_size   if set, a generator yielding chunk_size bytes is returned
#   delimiter    delimiter to split on; must be used together with encoding
#   progress     progress callback, invoked once per chunk_size read
# read() must be used as a context manager:
with client.read(
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        encoding="utf-8") as reader:
    print(reader.read())

# Download
print(
    "Download result for part.csv:",
    client.download(
        hdfs_path="/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        local_path="/home/yyj2020/test",
        overwrite=True))

# Pack into a tarball
with tarfile.open("/home/yyj2020/test/tartest.tar.gz", "w:gz") as tar:
    tar.add("/home/yyj2020/test/files",
            arcname=os.path.basename("/home/yyj2020/test/files"))
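# A short sketch of the chunked and delimited read modes described above; the HDFS address
# and the /test1/big.csv path are placeholders.
from hdfs.client import Client

client = Client("http://localhost:50070")

def on_progress(hdfs_path, bytes_transferred):
    # Called every chunk_size bytes; bytes_transferred is -1 once the transfer completes.
    print(hdfs_path, bytes_transferred)

# chunk_size: iterate over raw 64 KB chunks instead of getting a file-like object.
with client.read("/test1/big.csv", chunk_size=65536, progress=on_progress) as chunks:
    total = sum(len(chunk) for chunk in chunks)
print("bytes read:", total)

# delimiter (requires encoding): iterate record by record without loading the whole file.
with client.read("/test1/big.csv", encoding="utf-8", delimiter="\n") as records:
    for record in records:
        pass  # process each decoded line here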
def test_something(self):
    client = Client("http://172.18.130.100:50070")
    with client.read("/liupei/test/template.xml") as fs:
        content = fs.read()
        print(content)