def download_parquet_from_hdfs_dir(parquet_dir, local_dir, hdfs_ip, hdfs_port=50070):
    """
    Download every ``.parquet`` file listed directly under an HDFS directory.

    Each matching file is read fully into memory and then written to
    ``local_dir`` under the same file name.

    :param parquet_dir: HDFS directory that holds the parquet files
    :param local_dir: local directory that receives the files (created on
        demand, the first time a file has to be written)
    :param hdfs_ip: host name or IP used to build the client URL
    :param hdfs_port: port used to build the client URL (default 50070)
    :return: None
    """
    import os
    import posixpath
    from hdfs.client import Client

    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if not p.endswith('.parquet'):
            continue
        # HDFS paths always use '/', whatever the local OS separator is.
        remote_path = posixpath.join(parquet_dir, p)
        print(f'downloading {remote_path}')
        with client.read(remote_path) as reader:
            data = reader.read()
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(local_dir, exist_ok=True)
        with open(os.path.join(local_dir, p), 'wb') as f:
            f.write(data)
def get(self, request):
    """
    Return a preview of a file stored under ``/datahoop/<hdfsName>``.

    The ``hdfsName`` query parameter selects the HDFS directory; the second
    entry of its listing is read (first 2000 bytes, decoded as GBK with a
    UTF-8 retry on ``UnicodeDecodeError``) and split into lines and
    comma-separated fields.

    :param request: incoming request; ``request.GET`` supplies ``hdfsName``
    :return: a 200 response carrying ``{"data": rows}``, or a 400 response
        when an ``HdfsError`` is raised while talking to HDFS
    """
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    try:
        # Connect to HDFS and read the file.
        cli = Client(settings.HDFS_HOST)
        # NOTE(review): index 1 presumably skips a marker entry that sorts
        # first in the listing - confirm against how these directories are written.
        fileName = cli.list(_hdfsPath)[1]
        _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
        try:
            with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                datas = f.read()
        # Text -> list of rows. Strings are immutable, so the normalised
        # text has to be re-bound; otherwise every field at the end of a
        # CRLF-terminated line keeps a trailing '\r'.
        datas = re.sub("\r\n", "\n", datas)
        logger.debug(datas)
        content = [row.strip('"').split(",") for row in datas.strip('"').split('\n')]
    except HdfsError:
        return Response(data={"error": "文件未找到或文件编码格式不符合"}, status=status.HTTP_400_BAD_REQUEST)
    return Response(data={"data": content}, status=status.HTTP_200_OK)
def get_data(file_path):
    """
    Read a UTF-8 text file from HDFS and split every line on whitespace.

    The first two parsed rows are printed as a quick sanity check.

    :param file_path: HDFS path of the file to read
    :return: list with one list of tokens per line
    """
    hdfs_client = Client("http://192.168.0.201:50070", root='/')
    rows = []
    with hdfs_client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as lines:
        for raw_line in lines:
            rows.append(raw_line.strip().split())
    print("data", rows[0:2])
    return rows
def get(self, request):
    """
    Send the HDFS file ``/datahoop/<hdfsName>`` back as an attachment.

    The file is decoded as GBK, with a UTF-8 retry on ``UnicodeDecodeError``.
    The attachment name comes from the ``DataSource`` row whose
    ``format_filename`` equals the ``hdfsName`` query parameter.

    :param request: incoming request; ``request.GET`` supplies ``hdfsName``
    :return: an ``HttpResponse`` carrying the file text, or a 400 response
        when an ``HdfsError`` is raised while reading
    """
    hdfs_name = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    hdfs_path = os.path.join("/datahoop/", hdfs_name)
    source = DataSource.objects.get(format_filename=hdfs_name)
    try:
        # Connect to HDFS and read the whole file.
        hdfs = Client(settings.HDFS_HOST)
        try:
            with hdfs.read(hdfs_path, encoding="gbk") as stream:
                payload = stream.read()
        except UnicodeDecodeError:
            with hdfs.read(hdfs_path, encoding="utf8") as stream:
                payload = stream.read()
    except HdfsError:
        return Response(
            data={"error": "文件未找到或文件编码格式不符合"},
            status=status.HTTP_400_BAD_REQUEST,
        )
    attachment = HttpResponse(content_type='csv/plain')
    disposition = 'attachment; filename={0}'.format(source.file_name)
    attachment['Content-Disposition'] = disposition
    attachment.write(payload)
    return attachment
def test_hdfs_files():
    """Check that the Spark directory and its tarball are present on HDFS."""
    head_node_ip = utils.get_test_project().cluster.head.ip
    client = Client('http://%s:50070' % head_node_ip)
    top_level = client.list('/')
    assert 'spark' in top_level
    spark_entries = client.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_entries
def make_directory(hdfs_address, directory_path, directory_name):
    '''
    Description:
        Create a directory on HDFS.
    Parameters:
        -hdfs_address: address appended to 'http://' to form the client URL
        -directory_path: parent path; it is concatenated with
         directory_name as-is, so no separator is inserted between them
        -directory_name: name of the directory to create
    Returns:
        None
    '''
    hdfs_client = Client('http://' + hdfs_address)
    target = directory_path + directory_name
    hdfs_client.makedirs(target)
def dataframe_write_to_hdfs(hdfs_path, dataframe):
    """
    Write a DataFrame to HDFS as tab-separated UTF-8 text.

    Neither the header row nor the index is written, and an existing file
    at ``hdfs_path`` is overwritten.

    :param hdfs_path: destination path on HDFS
    :param dataframe: object exposing ``to_csv`` (e.g. a pandas DataFrame)
    :return: None
    """
    hdfs_client = Client("http://192.168.0.201:50070", root='/')
    tsv_text = dataframe.to_csv(header=False, index=False, sep="\t")
    hdfs_client.write(hdfs_path, tsv_text, encoding='utf-8', overwrite=True)
def put_to_hdfs(result_file):
    """
    Upload ``result_file`` into ``/tmp`` on HDFS.

    An existing ``/tmp/result.csv`` is deleted before the upload.

    :param result_file: local path of the file to upload
    :return: None
    """
    hdfs_client = Client("http://192.168.53.30:50070")
    existing = hdfs_client.status('/tmp/result.csv', strict=False)
    if existing:
        hdfs_client.delete('/tmp/result.csv')
    # The upload happens whether or not something had to be removed first.
    hdfs_client.upload('/tmp', result_file)
def test_hdfs_dirs():
    """Check that the hive/impala user directories and hive's warehouse exist."""
    head_node_ip = utils.get_test_project().cluster.head.ip
    client = Client('http://%s:50070' % head_node_ip)
    user_entries = client.list('/user')
    for expected in ('hive', 'impala'):
        assert expected in user_entries
    hive_entries = client.list('/user/hive')
    assert 'warehouse' in hive_entries
def get_data_hdfs(file_path):
    """
    Load a comma-separated UTF-8 text file from HDFS into a DataFrame.

    The first line supplies the column names; every later line is one row.
    The first five parsed lines are printed as a quick sanity check.

    :param file_path: HDFS path of the file to read
    :return: ``pandas.DataFrame`` built from the parsed lines
    """
    hdfs_client = Client("http://192.168.0.201:50070", root='/')
    rows = []
    with hdfs_client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as lines:
        for raw_line in lines:
            rows.append(raw_line.strip().split(','))
    print("data", rows[0:5])
    records = rows[1:]
    header = rows[0]
    return pd.DataFrame(records, columns=header)
def save_page_hdfs(ipPort, file_path, contents):
    """Save a web page's source to HDFS.

    :param ipPort: HDFS connection address passed to the client
    :param file_path: destination file path
    :param contents: page content as text; written UTF-8 encoded
    :return: None
    """
    hdfs_client = Client(ipPort)
    with hdfs_client.write(file_path) as sink:
        encoded = bytes(contents, encoding='utf8')
        sink.write(encoded)
def hdfs_file2points(path):
    """
    Parse a tab-separated HDFS file into a list of ``[float, float]`` pairs.

    Only the first two tab-separated fields of each line are used.

    :param path: HDFS path of the file to read
    :return: list of two-element lists of floats
    """
    hdfs_client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
    coords = []
    with hdfs_client.read(path) as stream:
        for record in stream:
            fields = record.strip('\n').split('\t')
            first, second = float(fields[0]), float(fields[1])
            coords.append([first, second])
        # Explicit close retained; leaving the with-block ends the read as well.
        stream.close()
    return coords
def run_hdfs_test(conf: ConfigData):
    """
    Smoke-test the HDFS connection: list the root directory and print it.

    :param conf: configuration object; ``conf.hdfs_ip()`` supplies the
        client URL (e.g. "http://10.2.201.197:50070")
    :return: None
    """
    hdfs_client = Client(conf.hdfs_ip())
    root_entries = hdfs_client.list('/', status=False)
    print(root_entries)
def do():
    """
    Copy the ``.csv`` file(s) listed under the HDFS location ``csv_path``
    into the local file ``./异常临界值local.csv``.

    Both sides use the GB2312 encoding. Each line is written with its
    leading/trailing ``'\\n'`` characters stripped, and every matching
    remote file reopens the local file in ``'w'`` mode, so the local file
    ends up holding the last matching file only.

    Side effect: the module-level ``csv_path`` is left pointing at the last
    file that was read.

    :return: None
    """
    global csv_path
    client = Client(hdfshost)
    # Keep the directory prefix: every remote path has to be built from it.
    # Appending to the already-updated global would glue file names together
    # as soon as a second .csv file is listed.
    base_path = csv_path
    file_list = client.list(base_path)
    print(file_list)
    for name in file_list:
        if not name.endswith(".csv"):
            continue
        # The global is still updated so that code reading it afterwards
        # keeps seeing the path of the file that was read.
        csv_path = base_path + name
        # Read the csv and write it to the local file.
        with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
            with client.read(csv_path, encoding='GB2312') as hdfs:
                for line in hdfs:
                    local.write(line.strip('\n'))
def test_hdfs_dirs():
    """Check that /tmp, /user and the configured user's directory exist on HDFS."""
    project = utils.get_test_project()
    client = Client("http://%s:50070" % project.cluster.head.ip)
    assert client
    top_level = client.list("/")
    for expected in ("tmp", "user"):
        assert expected in top_level
    home_dirs = client.list("/user")
    assert project.settings["USERNAME"] in home_dirs
def test_hdfs_dirs():
    """Check that /tmp, /user and the configured user's directory exist on HDFS."""
    project = utils.get_test_project()
    client = Client('http://%s:50070' % project.cluster.head.ip)
    assert client
    top_level = client.list('/')
    for expected in ('tmp', 'user'):
        assert expected in top_level
    home_dirs = client.list('/user')
    assert project.settings['USERNAME'] in home_dirs
def read_accesslog_from_hdfs(self):
    """
    Concatenate the lines of every file under ``/hadoop_file`` on HDFS,
    leaving out lines that start with ``filed1``.

    NOTE(review): the accumulated text is neither returned nor stored in
    the code visible here - confirm whether the rest of the method is missing.
    """
    # The real-time log stream is persisted once every 5 click records.
    hdfs_client = Client("http://localhost:50070")
    names = hdfs_client.list("/hadoop_file")
    ss = ""
    for name in names:
        with hdfs_client.read("/hadoop_file/" + name, encoding="utf-8") as reader:
            # Drop the test data rows.
            kept = [line for line in reader if not line.startswith("filed1")]
        ss += "".join(kept)
def generate_files(
    date,
    path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
):
    """
    Cache the HDFS listing for one day's directory in ``QuerierParallel.files``.

    Nothing is fetched when ``date`` is already a key of the cache.

    :param date: day formatted as ``year-month-day`` (split on '-')
    :param path: %-template that receives (year, month, day)
    :return: None
    """
    year, month, day = date.split('-')
    if date in QuerierParallel.files:
        return
    hdfs_client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
    day_dir = path % (year, month, day)
    QuerierParallel.files.update({date: hdfs_client.list(day_dir)})
def read(dir_path, header):
    """
    Collect ``key=value&key=value`` log lines from a two-level HDFS tree.

    Every entry listed under ``dir_path`` is treated as a directory of log
    files. From each non-empty line only the part after the first ``=`` of
    every ``&``-separated field is kept.

    :param dir_path: HDFS directory whose entries are listed
    :param header: column names for the resulting frame
    :return: ``pandas.DataFrame`` with one row per non-empty log line
    """
    hdfs_client = Client("http://127.0.0.1:50070")
    records = []
    for date_dir in hdfs_client.list(dir_path):
        date_path = dir_path + '/' + date_dir
        for log_file in hdfs_client.list(date_path):
            with hdfs_client.read(date_path + '/' + log_file) as stream:
                for raw in stream:
                    pairs = raw.strip().split('&')
                    if pairs == ['']:
                        # blank line
                        continue
                    records.append([pair.split('=')[1] for pair in pairs])
    return pd.DataFrame(records, columns=header)
def mv_local_to_hdfs():
    '''
    Move the finished files to HDFS.

    The target is /traffFile/<day>; <day> is the result of getYesterday()
    when the local hour is 0 and today's date (YYYY-MM-DD) otherwise. The
    day directory is created first if it is not listed yet.
    '''
    now = time.localtime(time.time())
    if now.tm_hour == 0:
        day_dir = getYesterday()
    else:
        day_dir = time.strftime('%Y-%m-%d', now)
    hadoop_fs = '/home/hadoop/hadoop-2.5.2/bin/hadoop fs'
    hdfs_client = Client("http://master:50070")
    if day_dir not in hdfs_client.list('/traffFile'):
        os.system(hadoop_fs + ' -mkdir /traffFile/' + day_dir)
    # The trailing '*' is expanded by the shell that os.system starts.
    sources = '/usr/local/bro/spool/worker-1/extract_files/*'
    os.system(hadoop_fs + ' -put ' + sources + ' /traffFile/' + day_dir)
def mv_local_to_hdfs(filename):
    '''
    Move the finished file to HDFS.

    The target is /traffLog/<day>; <day> is the result of getYesterday()
    when the local hour is 0 and today's date (YYYY-MM-DD) otherwise. The
    day directory is created first if it is not listed yet. The local
    source path is whatever get_path_or_buf(filename) returns.
    '''
    now = time.localtime(time.time())
    if now.tm_hour == 0:
        day_dir = getYesterday()
    else:
        day_dir = time.strftime('%Y-%m-%d', now)
    hadoop_fs = '/home/hadoop/hadoop-2.5.2/bin/hadoop fs'
    hdfs_client = Client("http://master:50070")
    if day_dir not in hdfs_client.list('/traffLog'):
        os.system(hadoop_fs + ' -mkdir /traffLog/' + day_dir)
    source = get_path_or_buf(filename)
    os.system(hadoop_fs + ' -put ' + source + ' /traffLog/' + day_dir)
def read_corpus():
    """
    Load the question/answer corpus from ``/corpus/q_a.csv`` on HDFS.

    Each line is parsed as a CSV record whose first column is the question
    and whose second column is the answer.

    :return: tuple ``(qList_kw, qList, aList)`` - the segmented questions
        (output of ``seg.cut``), the raw questions, and the answers
    """
    import csv

    qList = []
    # keyword list of each question
    qList_kw = []
    aList = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        lines = [line.strip() for line in reader]
    # Split every line into columns first: indexing the raw string would
    # yield its first two characters instead of the question and answer.
    for t in csv.reader(lines):
        if len(t) < 2:
            # blank or incomplete row - no question/answer pair to index
            continue
        qList.append(t[0])
        qList_kw.append(seg.cut(t[0]))
        aList.append(t[1])
    return qList_kw, qList, aList
def main():
    """Entry point: parse the command line and run the selected HDFS command.

    With no command flag set, an interactive shell is started with the
    client bound to ``CLIENT`` (IPython when importable, else ``code``).
    """
    # Parse arguments before anything else so bad input is reported quickly.
    args = docopt(__doc__, version=__version__)

    # Logging.
    hdfs_logger = lg.getLogger('hdfs')
    hdfs_logger.setLevel(lg.DEBUG)
    file_handler = Config().get_file_handler('hdfs')
    if file_handler:
        hdfs_logger.addHandler(file_handler)

    # Client and argument normalisation.
    client = Client.from_alias(args['--alias'])
    rpath = args['RPATH'] or '.'
    for option in ('--depth', '--threads'):
        try:
            args[option] = int(args[option])
        except ValueError:
            raise HdfsError('Invalid `%s` option: %r.', option, args[option])

    # Command dispatch; the first flag that is set wins.
    if args['--log']:
        if not file_handler:
            raise HdfsError('No log file active.')
        sys.stdout.write('%s\n' % (file_handler.baseFilename, ))
        return
    if args['--write']:
        reader = (line for line in sys.stdin)
        # doesn't work with stdin, why?
        client.write(rpath, reader, overwrite=args['--overwrite'])
        return
    if args['--read']:
        size = client.status(rpath)['length']
        read(client.read(rpath), size, '%s\t' % (rpath, ))
        return
    if args['--download']:
        client.download(
            rpath,
            args['LPATH'],
            overwrite=args['--overwrite'],
            n_threads=args['--threads']
        )
        return
    if args['--upload']:
        client.upload(
            rpath,
            args['LPATH'],
            overwrite=args['--overwrite'],
            n_threads=args['--threads']
        )
        return
    if args['--list']:
        list_infos(client, rpath, args['--depth'], args['--json'], args['--path'])
        return

    # No command flag: interactive shell.
    banner = (
        'Interactive HDFS python shell.\n'
        'Client for %r is available as `CLIENT`.'
        % (client.url, )
    )
    namespace = {'CLIENT': client}
    try:
        from IPython import embed
    except ImportError:
        from code import interact
        interact(banner=banner, local=namespace)
    else:
        embed(banner1=banner, user_ns=namespace)
def run_hive(conf: ConfigData, the_date: str):
    """
    Load one day's HDFS file into its Hive table with ``LOAD DATA INPATH``.

    The file path is ``<conf.get_hdfs_path()>/<date>/<conf.get_file_name(date)>``.
    The statement is executed only when ``MyHdfsFile.isfile`` reports that
    the file exists. The cursor and the connection are closed even when a
    step raises.

    :param conf: configuration object supplying the HDFS and Hive settings
    :param the_date: date of the batch; normalised with
        ``StrTool.get_the_date_str`` (e.g. "20181101")
    :return: None
    """
    a_client = Client(conf.hdfs_ip())  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    try:
        cur = conn.cursor()
        try:
            print("Start\n")
            the_date = StrTool.get_the_date_str(the_date)
            root_path = str(
                pathlib.PurePosixPath(conf.get_hdfs_path()).joinpath(the_date))
            file_name = str(
                pathlib.PurePosixPath(root_path).joinpath(
                    conf.get_file_name(the_date)))
            table_name = conf.get_table_name()
            if MyHdfsFile.isfile(a_client, file_name):
                sql = 'LOAD DATA INPATH \'' + file_name + '\' INTO TABLE ' + table_name
                print("OK" + " " + sql + "\n")
                cur.execute(sql)
        finally:
            # Release the cursor even if a lookup or the statement failed.
            cur.close()
    finally:
        conn.close()
def generate_temp_files(need_certificate=NEED_CERTIFICATE):
    """
    Download ``HDFS.REMOTE_PATH`` to ``HDFS.LOCAL_PATH`` from the first
    node in ``HDFS.NODES`` that serves it.

    :param need_certificate: when truthy, the download runs inside a
        ``krbcontext`` (keytab login) using ``KerberosClient``; otherwise
        the plain ``Client`` is used
    :return: None
    :raises Exception: when the download failed on every node
    """

    def _download_from_first_working_node(client_factory):
        """Try each node in order and return after the first successful download."""
        for node in HDFS.NODES:
            try:
                hdfs_client = client_factory(node)
                hdfs_client.download(HDFS.REMOTE_PATH, HDFS.LOCAL_PATH,
                                     n_threads=HDFS.THREAD_NUM)
            except Exception as err:
                # Best effort: record the failure and move on to the next node.
                logging.info(err)
            else:
                return
        logging.error("Failed to download remote HDFS file.")
        raise Exception("Failed to download remote HDFS file.")

    if need_certificate:
        with krbcontext(using_keytab=True, keytab_file=KEYTAB_PATH, principal=PRINCIPAL):
            _download_from_first_working_node(KerberosClient)
    else:
        _download_from_first_working_node(Client)
def get_client(host, use_kerberos):
    """
    Build an HDFS client for ``host``.

    :param host: URL handed to the client constructor
    :param use_kerberos: truthy to get a ``KerberosClient``, falsy for the
        plain ``Client``
    :return: the constructed client
    """
    # Imports stay local so that only the selected client module is loaded.
    if not use_kerberos:
        from hdfs.client import Client
        return Client(host)
    from hdfs.ext.kerberos import KerberosClient
    return KerberosClient(host)
def run_hive(conf: ConfigData, the_date: str):
    """
    Load up to four daily HDFS files into their Hive tables.

    For each of the config keys ``file_ext3`` .. ``file_ext6`` the path
    ``<hdfs_dir_zc>/<date><ext>`` is built. When ``MyHdfsFile.isfile``
    reports that it exists, a ``LOAD DATA INPATH`` statement loads it into
    the table named by the matching ``hive_table3`` .. ``hive_table6`` key.
    The cursor and the connection are closed even when a step raises.

    :param conf: configuration object supplying the HDFS and Hive settings
    :param the_date: date of the batch; normalised with
        ``StrTool.get_the_date_str`` (e.g. "20181101")
    :return: None
    """
    client = Client(conf.hdfs_ip())  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    try:
        cur = conn.cursor()
        try:
            the_date = StrTool.get_the_date_str(the_date)
            root_path = conf.get_data("hdfs_dir_zc")  # e.g. "/data/posflow/allinpay_utf8_zc/"
            # File-name suffixes, e.g. "_loginfo_rsp.txt" -> 20181101_loginfo_rsp.txt
            ext_keys = ("file_ext3", "file_ext4", "file_ext5", "file_ext6")
            table_keys = ("hive_table3", "hive_table4", "hive_table5", "hive_table6")
            file_exts = [conf.get_data(key) for key in ext_keys]
            print("Start\n")
            file_paths = [
                str(pathlib.PurePosixPath(root_path).joinpath(the_date + ext))
                for ext in file_exts
            ]
            for file_path, table_key in zip(file_paths, table_keys):
                if MyHdfsFile.isfile(client, file_path):
                    sql = 'LOAD DATA INPATH \'' + file_path + '\' INTO TABLE ' + conf.get_data(table_key)
                    print("OK" + " " + sql + "\n")
                    cur.execute(sql)
        finally:
            # Release the cursor even if a lookup or a statement failed.
            cur.close()
    finally:
        conn.close()
def HDFS_cd(self, hdfs_path):
    """
    Change the current directory; in practice this simply reconnects.

    The new directory is ``hdfs_path`` joined onto the current one and
    converted to a POSIX-style path. A fresh client is then created with
    that directory as its second positional argument.

    :param hdfs_path: path to join onto the current directory
    """
    joined = os.path.join(self.__m_HDFS_WebFSDir__, hdfs_path)
    self.__m_HDFS_WebFSDir__ = Path(joined).as_posix()
    self.__m_HDFS_Handler__ = Client(
        self.__m_HDFS_WebFSURL__,
        self.__m_HDFS_WebFSDir__,
        session=None,
    )
def _get_client(self, addr, port):
    """
    Return an HDFS client for ``addr:port``.

    When the module-level ``CACHE`` flag is truthy, clients are kept in
    ``self._clients`` keyed by ``addr`` and reused on later calls; the port
    is not part of the key. Otherwise a new client is built on every call.

    :param addr: host used in the URL and as the cache key
    :param port: port used in the URL
    :return: the cached or newly created client
    """
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if CACHE and addr in self._clients:
        return self._clients.get(addr)
    cli = Client('http://%s:%s' % (str(addr), str(port)))
    if CACHE:
        self._clients.update({addr: cli})
    return cli
def connect(self, host, port):
    """
    Create the HDFS client for ``http://host:port`` and store it on ``self.client``.

    :param host: host part of the connection URL
    :param port: port part of the connection URL
    :return: True when the client was created; False when creating it
        raised, in which case the failure is printed
    """
    conn_url = "http://{}:{}".format(host, port)
    try:
        self.client = Client(conn_url)
        return True
    # `except ... as` and print(...) are valid on both Python 2.7 and 3;
    # the older comma / print-statement forms are syntax errors on Python 3.
    except Exception as e:
        print("Connect Failed:{}-{}".format(Exception, e))
        return False
def safe_make_dir(client: Client, to_file: str):
    """
    Make sure the parent directories of ``to_file`` exist on HDFS.

    Starting from the first path component, each following component
    except the last one (the file name) is appended in turn. A path that
    has no status is created with ``permission=777``. The walk stops as
    soon as an existing path is not a directory. The first component
    itself is never checked or created.

    :param client: HDFS client providing ``status`` and ``makedirs``
    :param to_file: POSIX-style path of the file about to be written
    :return: None
    """
    parts = pathlib.PurePosixPath(to_file).parts
    if len(parts) < 2:
        return
    current = parts[0]
    for segment in parts[1:-1]:
        current = pathlib.PurePosixPath(current).joinpath(pathlib.PurePosixPath(segment))
        current_str = str(current)
        info = client.status(current_str, strict=False)
        if info is None:
            client.makedirs(current_str, permission=777)
        elif info['type'].lower() != 'directory':
            # Something that is not a directory is in the way: give up.
            return
    return
def download_parquet_from_hdfs(parquet_path, local_path, hdfs_ip, hdfs_port=50070):
    """
    Download one parquet file from HDFS to ``local_path``.

    The whole file is read into memory before it is written out.

    :param parquet_path: HDFS path of the file, e.g. '/data/a.parquet'
    :param local_path: local destination file, e.g. '/data_gen/b.parquet'
    :param hdfs_ip: host name or IP used to build the client URL
    :param hdfs_port: port used to build the client URL (default 50070)
    :return: None
    """
    from hdfs.client import Client

    hdfs_client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    with hdfs_client.read(parquet_path) as remote:
        payload = remote.read()
    with open(local_path, 'wb') as out:
        out.write(payload)
def test_autoload_client_from_path(self):
    """A client class written to a file listed in ``autoload.paths`` can be
    built by name through ``Client.from_options``."""
    with temppath() as module_path:
        self._write_client_module(module_path, 'PathClient')
        with temppath() as config_path:
            cfg = Config(config_path)
            cfg.add_section(cfg.global_section)
            cfg.set(cfg.global_section, 'autoload.paths', module_path)
            cfg._autoload()
            loaded = Client.from_options({'url': ''}, 'PathClient')
            eq_(loaded.one, 1)
def is_exist(client: Client, path: str, f_type: int=3):
    """
    Tell whether ``path`` exists on HDFS as one of the requested kinds.

    :param client: HDFS client providing ``status``
    :param path: HDFS path to look up
    :param f_type: bit mask - 1 accepts a file, 2 accepts a directory,
        3 (default) accepts either
    :return: True when the path exists and its type is accepted, else False
    """
    # 1 file, 2 dir, 3 any
    info = client.status(path, strict=False)
    if info is None:
        return False
    wants_file = (f_type & 1) > 0
    wants_dir = (f_type & 2) > 0
    if wants_file and info['type'].lower() == 'file':
        return True
    return wants_dir and info['type'].lower() == 'directory'
def test_autoload_client_from_module(self):
    """A client class from a module listed in ``autoload.modules`` can be
    built by name through ``Client.from_options``."""
    with temppath() as module_dpath:
        os.mkdir(module_dpath)
        # Make the temporary directory importable for the duration of the test.
        sys.path.append(module_dpath)
        module_file = osp.join(module_dpath, 'mclient.py')
        self._write_client_module(module_file, 'ModuleClient')
        try:
            with temppath() as config_path:
                cfg = Config(config_path)
                cfg.add_section(cfg.global_section)
                cfg.set(cfg.global_section, 'autoload.modules', 'mclient')
                cfg._autoload()
                loaded = Client.from_options({'url': ''}, 'ModuleClient')
                eq_(loaded.one, 1)
        finally:
            sys.path.remove(module_dpath)
def main():
    """Entry point: parse the command line and run the selected HDFS command.

    With no command flag set, ``infos`` is called for the remote path.
    """
    # Parse arguments before anything else so bad input is reported quickly.
    args = docopt(__doc__, version=__version__)

    # Logging.
    hdfs_logger = lg.getLogger('hdfs')
    hdfs_logger.setLevel(lg.DEBUG)
    file_handler = Config().get_file_handler('hdfs')
    if file_handler:
        hdfs_logger.addHandler(file_handler)

    # Client and argument normalisation.
    client = Client.from_alias(args['--alias'])
    rpath = args['RPATH'] or ''
    for option in ('--depth', '--threads'):
        try:
            args[option] = int(args[option])
        except ValueError:
            raise HdfsError('Invalid `%s` option: %r.', option, args[option])

    # Command dispatch; the first flag that is set wins.
    if args['--log']:
        if not file_handler:
            raise HdfsError('No log file active.')
        sys.stdout.write('%s\n' % (file_handler.baseFilename, ))
        return
    if args['--write']:
        reader = (line for line in sys.stdin)
        # doesn't work with stdin, why?
        client.write(rpath, reader, overwrite=args['--overwrite'])
        return
    if args['--read']:
        size = client.status(rpath)['length']
        read(client.read(rpath), size, '%s\t' % (rpath, ))
        return
    if args['--download']:
        client.download(
            rpath,
            args['LPATH'],
            overwrite=args['--overwrite'],
            n_threads=args['--threads']
        )
        return
    infos(client, rpath, args['--depth'], args['--json'], args['--path'])